Remove SSE4 path from Vec4<int>::operator*

2025-02-17 04:39:34 +00:00 · 2023-06-30 22:07:26 -04:00 · 2023-06-30 22:07:26 -04:00 · cd9f01c4df
commit cd9f01c4df
parent f133739cd0
2 changed files with 11 additions and 13 deletions
--- a/GPU/Math3D.h
+++ b/GPU/Math3D.h
@@ -1387,20 +1387,18 @@ template<>
 inline Vec4<int> Vec4<int>::operator * (const Vec4 &other) const {
 	__m128i a = SAFE_M128I(ivec);
 	__m128i b = SAFE_M128I(other.ivec);
-#if _M_SSE >= 0x401
-	return Vec4<int>(_mm_mullo_epi32(a, b));
-#else
-	// This is what clang does. Seems about as good
-	// as it gets.
+	// Intel in its immense wisdom decided that
+	// SSE2 does not get _mm_mullo_epi32(),
+	// so we do it this way. This is what clang does,
+	// which seems about as good as it gets.
 	__m128i m02 = _mm_mul_epu32(a, b);
-	__m128i m13 = _mm_mul_epu32( // 0xF5 -> [1, 1, 3, 3]
-		_mm_shuffle_epi32(a, 0xF5),
-		_mm_shuffle_epi32(b, 0xF5));
-	__m128i ret = _mm_unpacklo_epi32( // 0xE8 -> [0, 2, 2, 3]
-		_mm_shuffle_epi32(m02, 0xE8),
-		_mm_shuffle_epi32(m13, 0xE8));
+	__m128i m13 = _mm_mul_epu32(
+		_mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 1, 1)),
+		_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)));
+	__m128i ret = _mm_unpacklo_epi32(
+		_mm_shuffle_epi32(m02, _MM_SHUFFLE(3, 2, 2, 0)),
+		_mm_shuffle_epi32(m13, _MM_SHUFFLE(3, 2, 2, 0)));
 	return Vec4<int>(ret);
-#endif
 }

 template<> template<>
--- a/GPU/Software/Sampler.cpp
+++ b/GPU/Software/Sampler.cpp
@@ -748,7 +748,7 @@ static Vec4IntResult SOFTRAST_CALL SampleLinearLevel(float s, float t, const u8
 	Vec4<int> texcolor_br = Vec4<int>::FromRGBA(c.v[3]);
 	Vec4<int> top = texcolor_tl * (0x10 - frac_u) + texcolor_tr * frac_u;
 	Vec4<int> bot = texcolor_bl * (0x10 - frac_u) + texcolor_br * frac_u;
-	return ToVec4IntResult((top * (0x10 - frac_v) + bot * frac_v) >> 8);
+	return ToVec4IntResult((top * (0x10 - frac_v) + bot * frac_v) >> (4 + 4));
 #endif
 }