From f133739cd00992a0a56b6689ef9b5ae2457510ac Mon Sep 17 00:00:00 2001
From: fp64 <106717720+fp64@users.noreply.github.com>
Date: Thu, 29 Jun 2023 16:43:21 -0400
Subject: [PATCH] Replace some signed division in SoftGPU

This also adds a few bitwise operations to Vec4 and further SIMDifies it.
Also fixes an unrelated warning.
---
 Common/MemoryUtil.cpp     |  2 +-
 GPU/Math3D.h              | 67 +++++++++++++++++++++++++++++++++++++++
 GPU/Software/Lighting.cpp |  2 +-
 GPU/Software/Sampler.cpp  | 30 +++++++++---------
 4 files changed, 84 insertions(+), 17 deletions(-)

diff --git a/Common/MemoryUtil.cpp b/Common/MemoryUtil.cpp
index 4ea9ce5d54..68f0d2b085 100644
--- a/Common/MemoryUtil.cpp
+++ b/Common/MemoryUtil.cpp
@@ -258,7 +258,7 @@ void *AllocateAlignedMemory(size_t size, size_t alignment) {
 #endif
 #endif
 
-	_assert_msg_(ptr != nullptr, "Failed to allocate aligned memory of size %lu", size);
+	_assert_msg_(ptr != nullptr, "Failed to allocate aligned memory of size %llu", (unsigned long long)size);
 	return ptr;
 }
 
diff --git a/GPU/Math3D.h b/GPU/Math3D.h
index 9470c0e4fd..0fb061f00a 100644
--- a/GPU/Math3D.h
+++ b/GPU/Math3D.h
@@ -665,6 +665,20 @@ public:
 	{
 		return Vec4(x | other.x, y | other.y, z | other.z, w | other.w);
 	}
+	Vec4 operator & (const Vec4 &other) const
+	{
+		return Vec4(x & other.x, y & other.y, z & other.z, w & other.w);
+	}
+	Vec4 operator << (const int amount) const
+	{
+		// NOTE: x*(1<<amount), etc., might be safer, since
+		// left-shifting negative values is technically undefined behavior.
+		return Vec4(x << amount, y << amount, z << amount, w << amount);
+	}
+	Vec4 operator >> (const int amount) const
+	{
+		return Vec4(x >> amount, y >> amount, z >> amount, w >> amount);
+	}
 	template<typename V>
 	Vec4 operator * (const V& f) const
 	{
@@ -1363,6 +1377,59 @@ inline Vec3<float> Vec3<float>::operator * (const float &other) const {
 	return Vec3<float>(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(other)));
 }
 
+// Vec4<int> operation
+template<>
+inline Vec4<int> Vec4<int>::operator + (const Vec4 &other) const {
+	return Vec4<int>(_mm_add_epi32(SAFE_M128I(ivec), SAFE_M128I(other.ivec)));
+}
+
+template<>
+inline Vec4<int> Vec4<int>::operator * (const Vec4 &other) const {
+	__m128i a = SAFE_M128I(ivec);
+	__m128i b = SAFE_M128I(other.ivec);
+#if _M_SSE >= 0x401
+	return Vec4<int>(_mm_mullo_epi32(a, b));
+#else
+	// This is what clang does. Seems about as good
+	// as it gets.
+	__m128i m02 = _mm_mul_epu32(a, b);
+	__m128i m13 = _mm_mul_epu32(       // 0xF5 -> [1, 1, 3, 3]
+		_mm_shuffle_epi32(a, 0xF5),
+		_mm_shuffle_epi32(b, 0xF5));
+	__m128i ret = _mm_unpacklo_epi32(  // 0xE8 -> [0, 2, 2, 3]
+		_mm_shuffle_epi32(m02, 0xE8),
+		_mm_shuffle_epi32(m13, 0xE8));
+	return Vec4<int>(ret);
+#endif
+}
+
+template<> template<>
+inline Vec4<int> Vec4<int>::operator * (const int &other) const {
+	return (*this) * Vec4<int>(_mm_set1_epi32(other));
+}
+
+template<>
+inline Vec4<int> Vec4<int>::operator | (const Vec4 &other) const {
+	return Vec4<int>(_mm_or_si128(SAFE_M128I(ivec), SAFE_M128I(other.ivec)));
+}
+
+template<>
+inline Vec4<int> Vec4<int>::operator & (const Vec4 &other) const {
+	return Vec4<int>(_mm_and_si128(SAFE_M128I(ivec), SAFE_M128I(other.ivec)));
+}
+
+// NOTE: modern GCC, clang, and MSVC are all ok with
+// non-compile-time-const amount for _mm_slli_epi32/_mm_srli_epi32.
+template<>
+inline Vec4<int> Vec4<int>::operator << (const int amount) const {
+	return Vec4<int>(_mm_slli_epi32(SAFE_M128I(ivec), amount));
+}
+
+template<>
+inline Vec4<int> Vec4<int>::operator >> (const int amount) const {
+	return Vec4<int>(_mm_srli_epi32(SAFE_M128I(ivec), amount));
+}
+
 // Vec4<int> operation
 template<>
 inline void Vec4<int>::operator += (const Vec4 &other) {
diff --git a/GPU/Software/Lighting.cpp b/GPU/Software/Lighting.cpp
index 72065c80cc..3ba37a04dd 100644
--- a/GPU/Software/Lighting.cpp
+++ b/GPU/Software/Lighting.cpp
@@ -283,7 +283,7 @@ static void ProcessSIMD(VertexData &vertex, const WorldCoords &worldpos, const W
 
 	Vec4<int> mec = Vec4<int>::FromRGBA(gstate.getMaterialEmissive());
 	Vec4<int> mac = state.colorForAmbient ? colorFactor : state.material.ambientColorFactor;
-	Vec4<int> ambient = (mac * state.baseAmbientColorFactor) / 1024;
+	Vec4<int> ambient = (mac * state.baseAmbientColorFactor) >> 10;
 
 	Vec4<int> final_color = mec + ambient;
 	Vec4<int> specular_color = Vec4<int>::AssignToAll(0);
diff --git a/GPU/Software/Sampler.cpp b/GPU/Software/Sampler.cpp
index cb6a60b41e..8f0f317210 100644
--- a/GPU/Software/Sampler.cpp
+++ b/GPU/Software/Sampler.cpp
@@ -276,13 +276,13 @@ static inline int GetPixelDataOffset(uint32_t row_pitch_pixels, uint32_t u, uint
 	if (!swizzled)
 		return (v * (row_pitch_pixels * texel_size_bits >> 3)) + (u * texel_size_bits >> 3);
 
-	const int tile_size_bits = 32;
-	const int tiles_in_block_horizontal = 4;
-	const int tiles_in_block_vertical = 8;
+	const uint32_t tile_size_bits = 32;
+	const uint32_t tiles_in_block_horizontal = 4;
+	const uint32_t tiles_in_block_vertical = 8;
 
-	constexpr int texels_per_tile = tile_size_bits / texel_size_bits;
-	int tile_u = u / texels_per_tile;
-	int tile_idx = (v % tiles_in_block_vertical) * (tiles_in_block_horizontal) +
+	constexpr uint32_t texels_per_tile = tile_size_bits / texel_size_bits;
+	uint32_t tile_u = u / texels_per_tile;
+	uint32_t tile_idx = (v % tiles_in_block_vertical) * (tiles_in_block_horizontal) +
 	               // TODO: not sure if the *texel_size_bits/8 factor is correct
 	               (v / tiles_in_block_vertical) * ((row_pitch_pixels*texel_size_bits/(tile_size_bits))*tiles_in_block_vertical) +
 	               (tile_u % tiles_in_block_horizontal) +
@@ -408,22 +408,22 @@ inline static Nearest4 SOFTRAST_CALL SampleNearest(const int u[N], const int v[N
 
 	case GE_TFMT_DXT1:
 		for (int i = 0; i < N; ++i) {
-			const DXT1Block *block = (const DXT1Block *)srcptr + (v[i] / 4) * (texbufw / 4) + (u[i] / 4);
-			res.v[i] = GetDXT1Texel(block, u[i] % 4, v[i] % 4);
+			const DXT1Block *block = (const DXT1Block *)srcptr + (v[i] >> 2) * (texbufw >> 2) + (u[i] >> 2);
+			res.v[i] = GetDXT1Texel(block, u[i] & 3, v[i] & 3);
 		}
 		return res;
 
 	case GE_TFMT_DXT3:
 		for (int i = 0; i < N; ++i) {
-			const DXT3Block *block = (const DXT3Block *)srcptr + (v[i] / 4) * (texbufw / 4) + (u[i] / 4);
-			res.v[i] = GetDXT3Texel(block, u[i] % 4, v[i] % 4);
+			const DXT3Block *block = (const DXT3Block *)srcptr + (v[i] >> 2) * (texbufw >> 2) + (u[i] >> 2);
+			res.v[i] = GetDXT3Texel(block, u[i] & 3, v[i] & 3);
 		}
 		return res;
 
 	case GE_TFMT_DXT5:
 		for (int i = 0; i < N; ++i) {
-			const DXT5Block *block = (const DXT5Block *)srcptr + (v[i] / 4) * (texbufw / 4) + (u[i] / 4);
-			res.v[i] = GetDXT5Texel(block, u[i] % 4, v[i] % 4);
+			const DXT5Block *block = (const DXT5Block *)srcptr + (v[i] >> 2) * (texbufw >> 2) + (u[i] >> 2);
+			res.v[i] = GetDXT5Texel(block, u[i] & 3, v[i] & 3);
 		}
 		return res;
 
@@ -613,7 +613,7 @@ static Vec4IntResult SOFTRAST_CALL SampleNearest(float s, float t, Vec4IntArg pr
 	if (levelFrac) {
 		GetTexelCoordinates(level + 1, s, t, u, v, samplerID);
 		Vec4<int> c1 = Vec4<int>::FromRGBA(SampleNearest<1>(&u, &v, tptr[1], bufw[1], level + 1, samplerID).v[0]);
-		c0 = (c1 * levelFrac + c0 * (16 - levelFrac)) / 16;
+		c0 = (c1 * levelFrac + c0 * (16 - levelFrac)) >> 4;
 	}
 
 	return GetTextureFunctionOutput(prim_color, ToVec4IntArg(c0), samplerID);
@@ -748,7 +748,7 @@ static Vec4IntResult SOFTRAST_CALL SampleLinearLevel(float s, float t, const u8
 	Vec4<int> texcolor_br = Vec4<int>::FromRGBA(c.v[3]);
 	Vec4<int> top = texcolor_tl * (0x10 - frac_u) + texcolor_tr * frac_u;
 	Vec4<int> bot = texcolor_bl * (0x10 - frac_u) + texcolor_br * frac_u;
-	return ToVec4IntResult((top * (0x10 - frac_v) + bot * frac_v) / (16 * 16));
+	return ToVec4IntResult((top * (0x10 - frac_v) + bot * frac_v) >> 8);
 #endif
 }
 
@@ -756,7 +756,7 @@ static Vec4IntResult SOFTRAST_CALL SampleLinear(float s, float t, Vec4IntArg pri
 	Vec4<int> c0 = SampleLinearLevel(s, t, tptr, bufw, texlevel, samplerID);
 	if (levelFrac) {
 		const Vec4<int> c1 = SampleLinearLevel(s, t, tptr + 1, bufw + 1, texlevel + 1, samplerID);
-		c0 = (c1 * levelFrac + c0 * (16 - levelFrac)) / 16;
+		c0 = (c1 * levelFrac + c0 * (16 - levelFrac)) >> 4;
 	}
 	return GetTextureFunctionOutput(prim_color, ToVec4IntArg(c0), samplerID);
 }
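
Why the "/ 16"-style divisions above can become shifts: the blended color sums are non-negative (colors in 0..255 scaled by 4-bit or 10-bit fractions), and for non-negative integers a signed division by a power of two and an arithmetic right shift produce the same value. They differ only for negative dividends, which is exactly the case that forces compilers to emit extra sign-fixup code around a signed "/ 16". A minimal standalone C++ sketch of that equivalence (not part of the patch, purely for illustration):

    #include <cassert>
    #include <cstdio>

    int main() {
        // For non-negative values the two forms agree.
        for (int x = 0; x <= 4096; ++x)
            assert(x / 16 == (x >> 4));

        // For negative values they differ: division truncates toward zero,
        // an arithmetic shift rounds toward negative infinity
        // (on the usual two's-complement targets).
        printf("%d %d\n", -7 / 16, -7 >> 4);  // prints "0 -1"
        return 0;
    }

Because the inputs here are known non-negative, swapping "/ 16", "/ (16 * 16)" and "/ 1024" for ">> 4", ">> 8" and ">> 10" does not change any result, and the new SSE specialization can do it with a single _mm_srli_epi32.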
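The SSE2 fallback in the new Vec4<int> multiply is worth a closer look: _mm_mullo_epi32 only exists from SSE4.1 on, so below that the patch builds a lane-wise 32x32-to-low-32 multiply from two _mm_mul_epu32 calls (which only multiply lanes 0 and 2 into 64-bit products) plus shuffles that gather the even- and odd-lane results back together. A self-contained check of that construction (assuming an x86 host with SSE2; the helper name mullo_epi32_sse2 and the test values are illustrative, not from the patch):

    #include <emmintrin.h>
    #include <cassert>
    #include <cstdint>

    // Lane-wise low 32 bits of a 32x32 multiply using only SSE2.
    static __m128i mullo_epi32_sse2(__m128i a, __m128i b) {
        __m128i m02 = _mm_mul_epu32(a, b);                       // products of lanes 0 and 2
        __m128i m13 = _mm_mul_epu32(_mm_shuffle_epi32(a, 0xF5),  // 0xF5 -> [1, 1, 3, 3]
                                    _mm_shuffle_epi32(b, 0xF5));
        return _mm_unpacklo_epi32(_mm_shuffle_epi32(m02, 0xE8),  // 0xE8 -> [0, 2, 2, 3]
                                  _mm_shuffle_epi32(m13, 0xE8));
    }

    int main() {
        int32_t av[4] = { 3, -40, 123456, 0x10 };
        int32_t bv[4] = { 7, 25, -789, 11 };
        int32_t r[4];
        __m128i a = _mm_loadu_si128((const __m128i *)av);
        __m128i b = _mm_loadu_si128((const __m128i *)bv);
        _mm_storeu_si128((__m128i *)r, mullo_epi32_sse2(a, b));
        for (int i = 0; i < 4; ++i)
            assert(r[i] == av[i] * bv[i]);  // low 32 bits match the scalar product
        return 0;
    }

The unsigned multiply is fine here because the low 32 bits of a product do not depend on signedness, which is also why the same fallback works for the signed Vec4<int> lanes.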