From f133739cd00992a0a56b6689ef9b5ae2457510ac Mon Sep 17 00:00:00 2001
From: fp64 <106717720+fp64@users.noreply.github.com>
Date: Thu, 29 Jun 2023 16:43:21 -0400
Subject: [PATCH] Replace some signed division in SoftGPU

This also adds a few bitwise operations to Vec4 and further SIMDifies it.
Also fixes an unrelated warning.
---
 Common/MemoryUtil.cpp     |  2 +-
 GPU/Math3D.h              | 67 +++++++++++++++++++++++++++++++++++++++
 GPU/Software/Lighting.cpp |  2 +-
 GPU/Software/Sampler.cpp  | 30 +++++++++---------
 4 files changed, 84 insertions(+), 17 deletions(-)

diff --git a/Common/MemoryUtil.cpp b/Common/MemoryUtil.cpp
index 4ea9ce5d54..68f0d2b085 100644
--- a/Common/MemoryUtil.cpp
+++ b/Common/MemoryUtil.cpp
@@ -258,7 +258,7 @@ void *AllocateAlignedMemory(size_t size, size_t alignment) {
 #endif
 #endif
 
-	_assert_msg_(ptr != nullptr, "Failed to allocate aligned memory of size %lu", size);
+	_assert_msg_(ptr != nullptr, "Failed to allocate aligned memory of size %llu", (unsigned long long)size);
 	return ptr;
 }
 
diff --git a/GPU/Math3D.h b/GPU/Math3D.h
index 9470c0e4fd..0fb061f00a 100644
--- a/GPU/Math3D.h
+++ b/GPU/Math3D.h
@@ -665,6 +665,20 @@ public:
 	{
 		return Vec4(x | other.x, y | other.y, z | other.z, w | other.w);
 	}
+	Vec4 operator & (const Vec4 &other) const
+	{
+		return Vec4(x & other.x, y & other.y, z & other.z, w & other.w);
+	}
+	Vec4 operator << (const int amount) const
+	{
+		// NOTE: x*(1<<amount), etc., might be safer, since
+		// left-shifting negative values is technically undefined behavior.
+		return Vec4(x << amount, y << amount, z << amount, w << amount);
+	}
+	Vec4 operator >> (const int amount) const
+	{
+		return Vec4(x >> amount, y >> amount, z >> amount, w >> amount);
+	}
 	template<typename V>
 	Vec4 operator * (const V& f) const
 	{
@@ -1363,6 +1377,59 @@ inline Vec3<float> Vec3<float>::operator * (const float &other) const {
 	return Vec3<float>(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(other)));
 }
 
+// Vec4<int> operation
+template<>
+inline Vec4<int> Vec4<int>::operator + (const Vec4 &other) const {
+	return Vec4<int>(_mm_add_epi32(SAFE_M128I(ivec), SAFE_M128I(other.ivec)));
+}
+
+template<>
+inline Vec4<int> Vec4<int>::operator * (const Vec4 &other) const {
+	__m128i a = SAFE_M128I(ivec);
+	__m128i b = SAFE_M128I(other.ivec);
+#if _M_SSE >= 0x401
+	return Vec4<int>(_mm_mullo_epi32(a, b));
+#else
+	// This is what clang does. Seems about as good
+	// as it gets.
+	__m128i m02 = _mm_mul_epu32(a, b);
+	__m128i m13 = _mm_mul_epu32(       // 0xF5 -> [1, 1, 3, 3]
+		_mm_shuffle_epi32(a, 0xF5),
+		_mm_shuffle_epi32(b, 0xF5));
+	__m128i ret = _mm_unpacklo_epi32(  // 0xE8 -> [0, 2, 2, 3]
+		_mm_shuffle_epi32(m02, 0xE8),
+		_mm_shuffle_epi32(m13, 0xE8));
+	return Vec4<int>(ret);
+#endif
+}
+
+template<> template<>
+inline Vec4<int> Vec4<int>::operator * (const int &other) const {
+	return (*this) * Vec4<int>(_mm_set1_epi32(other));
+}
+
+template<>
+inline Vec4<int> Vec4<int>::operator | (const Vec4 &other) const {
+	return Vec4<int>(_mm_or_si128(SAFE_M128I(ivec), SAFE_M128I(other.ivec)));
+}
+
+template<>
+inline Vec4<int> Vec4<int>::operator & (const Vec4 &other) const {
+	return Vec4<int>(_mm_and_si128(SAFE_M128I(ivec), SAFE_M128I(other.ivec)));
+}
+
+// NOTE: modern GCC, clang, and MSVC are all ok with
+// non-compile-time-const amount for _mm_slli_epi32/_mm_srli_epi32.
+template<>
+inline Vec4<int> Vec4<int>::operator << (const int amount) const {
+	return Vec4<int>(_mm_slli_epi32(SAFE_M128I(ivec), amount));
+}
+
+template<>
+inline Vec4<int> Vec4<int>::operator >> (const int amount) const {
+	return Vec4<int>(_mm_srli_epi32(SAFE_M128I(ivec), amount));
+}
+
 // Vec4<int> operation
 template<>
 inline void Vec4<int>::operator += (const Vec4 &other) {
diff --git a/GPU/Software/Lighting.cpp b/GPU/Software/Lighting.cpp
index 72065c80cc..3ba37a04dd 100644
--- a/GPU/Software/Lighting.cpp
+++ b/GPU/Software/Lighting.cpp
@@ -283,7 +283,7 @@ static void ProcessSIMD(VertexData &vertex, const WorldCoords &worldpos, const W
 
 	Vec4<int> mec = Vec4<int>::FromRGBA(gstate.getMaterialEmissive());
 	Vec4<int> mac = state.colorForAmbient ? colorFactor : state.material.ambientColorFactor;
-	Vec4<int> ambient = (mac * state.baseAmbientColorFactor) / 1024;
+	Vec4<int> ambient = (mac * state.baseAmbientColorFactor) >> 10;
 
 	Vec4<int> final_color = mec + ambient;
 	Vec4<int> specular_color = Vec4<int>::AssignToAll(0);
diff --git a/GPU/Software/Sampler.cpp b/GPU/Software/Sampler.cpp
index cb6a60b41e..8f0f317210 100644
--- a/GPU/Software/Sampler.cpp
+++ b/GPU/Software/Sampler.cpp
@@ -276,13 +276,13 @@ static inline int GetPixelDataOffset(uint32_t row_pitch_pixels, uint32_t u, uint
 	if (!swizzled)
 		return (v * (row_pitch_pixels * texel_size_bits >> 3)) + (u * texel_size_bits >> 3);
 
-	const int tile_size_bits = 32;
-	const int tiles_in_block_horizontal = 4;
-	const int tiles_in_block_vertical = 8;
+	const uint32_t tile_size_bits = 32;
+	const uint32_t tiles_in_block_horizontal = 4;
+	const uint32_t tiles_in_block_vertical = 8;
 
-	constexpr int texels_per_tile = tile_size_bits / texel_size_bits;
-	int tile_u = u / texels_per_tile;
-	int tile_idx = (v % tiles_in_block_vertical) * (tiles_in_block_horizontal) +
+	constexpr uint32_t texels_per_tile = tile_size_bits / texel_size_bits;
+	uint32_t tile_u = u / texels_per_tile;
+	uint32_t tile_idx = (v % tiles_in_block_vertical) * (tiles_in_block_horizontal) +
 	               // TODO: not sure if the *texel_size_bits/8 factor is correct
 	               (v / tiles_in_block_vertical) * ((row_pitch_pixels*texel_size_bits/(tile_size_bits))*tiles_in_block_vertical) +
 	               (tile_u % tiles_in_block_horizontal) +
@@ -408,22 +408,22 @@ inline static Nearest4 SOFTRAST_CALL SampleNearest(const int u[N], const int v[N
 
 	case GE_TFMT_DXT1:
 		for (int i = 0; i < N; ++i) {
-			const DXT1Block *block = (const DXT1Block *)srcptr + (v[i] / 4) * (texbufw / 4) + (u[i] / 4);
-			res.v[i] = GetDXT1Texel(block, u[i] % 4, v[i] % 4);
+			const DXT1Block *block = (const DXT1Block *)srcptr + (v[i] >> 2) * (texbufw >> 2) + (u[i] >> 2);
+			res.v[i] = GetDXT1Texel(block, u[i] & 3, v[i] & 3);
 		}
 		return res;
 
 	case GE_TFMT_DXT3:
 		for (int i = 0; i < N; ++i) {
-			const DXT3Block *block = (const DXT3Block *)srcptr + (v[i] / 4) * (texbufw / 4) + (u[i] / 4);
-			res.v[i] = GetDXT3Texel(block, u[i] % 4, v[i] % 4);
+			const DXT3Block *block = (const DXT3Block *)srcptr + (v[i] >> 2) * (texbufw >> 2) + (u[i] >> 2);
+			res.v[i] = GetDXT3Texel(block, u[i] & 3, v[i] & 3);
 		}
 		return res;
 
 	case GE_TFMT_DXT5:
 		for (int i = 0; i < N; ++i) {
-			const DXT5Block *block = (const DXT5Block *)srcptr + (v[i] / 4) * (texbufw / 4) + (u[i] / 4);
-			res.v[i] = GetDXT5Texel(block, u[i] % 4, v[i] % 4);
+			const DXT5Block *block = (const DXT5Block *)srcptr + (v[i] >> 2) * (texbufw >> 2) + (u[i] >> 2);
+			res.v[i] = GetDXT5Texel(block, u[i] & 3, v[i] & 3);
 		}
 		return res;
 
@@ -613,7 +613,7 @@ static Vec4IntResult SOFTRAST_CALL SampleNearest(float s, float t, Vec4IntArg pr
 	if (levelFrac) {
 		GetTexelCoordinates(level + 1, s, t, u, v, samplerID);
 		Vec4<int> c1 = Vec4<int>::FromRGBA(SampleNearest<1>(&u, &v, tptr[1], bufw[1], level + 1, samplerID).v[0]);
-		c0 = (c1 * levelFrac + c0 * (16 - levelFrac)) / 16;
+		c0 = (c1 * levelFrac + c0 * (16 - levelFrac)) >> 4;
 	}
 
 	return GetTextureFunctionOutput(prim_color, ToVec4IntArg(c0), samplerID);
@@ -748,7 +748,7 @@ static Vec4IntResult SOFTRAST_CALL SampleLinearLevel(float s, float t, const u8
 	Vec4<int> texcolor_br = Vec4<int>::FromRGBA(c.v[3]);
 	Vec4<int> top = texcolor_tl * (0x10 - frac_u) + texcolor_tr * frac_u;
 	Vec4<int> bot = texcolor_bl * (0x10 - frac_u) + texcolor_br * frac_u;
-	return ToVec4IntResult((top * (0x10 - frac_v) + bot * frac_v) / (16 * 16));
+	return ToVec4IntResult((top * (0x10 - frac_v) + bot * frac_v) >> 8);
 #endif
 }
 
@@ -756,7 +756,7 @@ static Vec4IntResult SOFTRAST_CALL SampleLinear(float s, float t, Vec4IntArg pri
 	Vec4<int> c0 = SampleLinearLevel(s, t, tptr, bufw, texlevel, samplerID);
 	if (levelFrac) {
 		const Vec4<int> c1 = SampleLinearLevel(s, t, tptr + 1, bufw + 1, texlevel + 1, samplerID);
-		c0 = (c1 * levelFrac + c0 * (16 - levelFrac)) / 16;
+		c0 = (c1 * levelFrac + c0 * (16 - levelFrac)) >> 4;
 	}
 	return GetTextureFunctionOutput(prim_color, ToVec4IntArg(c0), samplerID);
 }
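
Why the "/ 16"-style divisions above can become shifts: the blended color sums are non-negative (colors in 0..255 scaled by 4-bit or 10-bit fractions), and for non-negative integers a signed division by a power of two and an arithmetic right shift produce the same value. They differ only for negative dividends, which is exactly the case that forces compilers to emit extra sign-fixup code around a signed "/ 16". A minimal standalone C++ sketch of that equivalence (not part of the patch, purely for illustration):

    #include <cassert>
    #include <cstdio>

    int main() {
        // For non-negative values the two forms agree.
        for (int x = 0; x <= 4096; ++x)
            assert(x / 16 == (x >> 4));

        // For negative values they differ: division truncates toward zero,
        // an arithmetic shift rounds toward negative infinity
        // (on the usual two's-complement targets).
        printf("%d %d\n", -7 / 16, -7 >> 4);  // prints "0 -1"
        return 0;
    }

Because the inputs here are known non-negative, swapping "/ 16", "/ (16 * 16)" and "/ 1024" for ">> 4", ">> 8" and ">> 10" does not change any result, and the new SSE specialization can do it with a single _mm_srli_epi32.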
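The SSE2 fallback in the new Vec4<int> multiply is worth a closer look: _mm_mullo_epi32 only exists from SSE4.1 on, so below that the patch builds a lane-wise 32x32-to-low-32 multiply from two _mm_mul_epu32 calls (which only multiply lanes 0 and 2 into 64-bit products) plus shuffles that gather the even- and odd-lane results back together. A self-contained check of that construction (assuming an x86 host with SSE2; the helper name mullo_epi32_sse2 and the test values are illustrative, not from the patch):

    #include <emmintrin.h>
    #include <cassert>
    #include <cstdint>

    // Lane-wise low 32 bits of a 32x32 multiply using only SSE2.
    static __m128i mullo_epi32_sse2(__m128i a, __m128i b) {
        __m128i m02 = _mm_mul_epu32(a, b);                       // products of lanes 0 and 2
        __m128i m13 = _mm_mul_epu32(_mm_shuffle_epi32(a, 0xF5),  // 0xF5 -> [1, 1, 3, 3]
                                    _mm_shuffle_epi32(b, 0xF5));
        return _mm_unpacklo_epi32(_mm_shuffle_epi32(m02, 0xE8),  // 0xE8 -> [0, 2, 2, 3]
                                  _mm_shuffle_epi32(m13, 0xE8));
    }

    int main() {
        int32_t av[4] = { 3, -40, 123456, 0x10 };
        int32_t bv[4] = { 7, 25, -789, 11 };
        int32_t r[4];
        __m128i a = _mm_loadu_si128((const __m128i *)av);
        __m128i b = _mm_loadu_si128((const __m128i *)bv);
        _mm_storeu_si128((__m128i *)r, mullo_epi32_sse2(a, b));
        for (int i = 0; i < 4; ++i)
            assert(r[i] == av[i] * bv[i]);  // low 32 bits match the scalar product
        return 0;
    }

The unsigned multiply is fine here because the low 32 bits of a product do not depend on signedness, which is also why the same fallback works for the signed Vec4<int> lanes.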