From 4d6a2f39199e08973fa8cbd3cb6a9d1bd935ab2b Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Sun, 12 Dec 2021 15:15:10 -0800
Subject: [PATCH] samplerjit: Blend linear using integers.

---
 GPU/Software/SamplerX86.cpp | 134 ++++++++++++++----------------------
 1 file changed, 53 insertions(+), 81 deletions(-)

diff --git a/GPU/Software/SamplerX86.cpp b/GPU/Software/SamplerX86.cpp
index 7822f31bee..915e86f8ba 100644
--- a/GPU/Software/SamplerX86.cpp
+++ b/GPU/Software/SamplerX86.cpp
@@ -161,6 +161,15 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
 		regCache_.ForceRelease(RegCache::GEN_ARG_LEVEL);
 	regCache_.Reset(true);
 
+	// Let's drop some helpful constants here.
+	const u8 *const100_11_4s = AlignCode16();
+	Write16(0x100 << 4); Write16(0x100 << 4); Write16(0x100 << 4); Write16(0x100 << 4);
+	Write16(0x100 << 4); Write16(0x100 << 4); Write16(0x100 << 4); Write16(0x100 << 4);
+
+	const u8 *const100Low_11_4s = AlignCode16();
+	Write16(0x100 << 4); Write16(0x100 << 4); Write16(0x100 << 4); Write16(0x100 << 4);
+	Write16(0); Write16(0); Write16(0); Write16(0);
+
 	// Now the actual linear func, which is exposed externally.
 	const u8 *start = AlignCode16();
 
@@ -244,94 +253,57 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
 	doNearestCall(8);
 	doNearestCall(12);
 
-	// Convert TL, TR, BL, BR to floats for easier blending.
-	if (!cpu_info.bSSE4_1) {
-		PXOR(XMM0, R(XMM0));
-	}
+	// fpScratchReg1 will have top RRRRRRRR LLLLLLLL, fpScratchReg2 for bottom.
+	// Start with XXXX XXXX RRRR LLLL, and swizzle the 8 bits to both slots in the 16.
+	PSHUFD(fpScratchReg1, R(XMM5), _MM_SHUFFLE(0, 0, 1, 0));
+	PSHUFD(fpScratchReg2, R(XMM5), _MM_SHUFFLE(0, 0, 3, 2));
+	PUNPCKLBW(fpScratchReg1, R(fpScratchReg1));
+	PUNPCKLBW(fpScratchReg2, R(fpScratchReg2));
 
-	PSHUFD(fpScratchReg1, R(XMM5), _MM_SHUFFLE(0, 0, 0, 0));
-	PSHUFD(fpScratchReg2, R(XMM5), _MM_SHUFFLE(1, 1, 1, 1));
-	PSHUFD(fpScratchReg3, R(XMM5), _MM_SHUFFLE(2, 2, 2, 2));
-	PSHUFD(fpScratchReg4, R(XMM5), _MM_SHUFFLE(3, 3, 3, 3));
-
-	if (cpu_info.bSSE4_1) {
-		PMOVZXBD(fpScratchReg1, R(fpScratchReg1));
-		PMOVZXBD(fpScratchReg2, R(fpScratchReg2));
-		PMOVZXBD(fpScratchReg3, R(fpScratchReg3));
-		PMOVZXBD(fpScratchReg4, R(fpScratchReg4));
-	} else {
-		PUNPCKLBW(fpScratchReg1, R(XMM0));
-		PUNPCKLBW(fpScratchReg2, R(XMM0));
-		PUNPCKLBW(fpScratchReg3, R(XMM0));
-		PUNPCKLBW(fpScratchReg4, R(XMM0));
-		PUNPCKLWD(fpScratchReg1, R(XMM0));
-		PUNPCKLWD(fpScratchReg2, R(XMM0));
-		PUNPCKLWD(fpScratchReg3, R(XMM0));
-		PUNPCKLWD(fpScratchReg4, R(XMM0));
-	}
-	CVTDQ2PS(fpScratchReg1, R(fpScratchReg1));
-	CVTDQ2PS(fpScratchReg2, R(fpScratchReg2));
-	CVTDQ2PS(fpScratchReg3, R(fpScratchReg3));
-	CVTDQ2PS(fpScratchReg4, R(fpScratchReg4));
-
-	// Okay, now multiply the R sides by frac_u, and L by (256 - frac_u)...
+	// Grab frac_u and spread to lower (L) lanes.
 	MOVD_xmm(fpScratchReg5, MDisp(RSP, 0));
-	CVTDQ2PS(fpScratchReg5, R(fpScratchReg5));
-	SHUFPS(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0));
-	if (RipAccessible(by256)) {
-		MULPS(fpScratchReg5, M(by256));
-	} else {
-		X64Reg tempReg = RAX;
-		MOV(PTRBITS, R(tempReg), ImmPtr(by256));
-		MULPS(fpScratchReg5, MatR(tempReg));
-	}
+	PSHUFLW(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0));
+	// Convert to s.11.4.
+	PSLLW(fpScratchReg5, 4);
+	// Now subtract 0x100 - frac_u in the L lanes only: 00000000 LLLLLLLL.
+	MOVDQA(fpScratchReg3, M(const100Low_11_4s));
+	PSUBW(fpScratchReg3, R(fpScratchReg5));
+	// Now we just shift and OR in the original frac_u.
+	PSLLDQ(fpScratchReg5, 8);
+	POR(fpScratchReg3, R(fpScratchReg5));
 
-	if (RipAccessible(ones)) {
-		MOVAPS(XMM0, M(ones));
-	} else {
-		X64Reg tempReg = RAX;
-		MOV(PTRBITS, R(tempReg), ImmPtr(ones));
-		MOVAPS(XMM0, MatR(tempReg));
-	}
-	SUBPS(XMM0, R(fpScratchReg5));
+	// Okay, we have 8-bits repeated in the top and bottom rows for the color.
+	// Shift frac by 4, and multiply to get the top 16 bits - that will give us 12 bits of result.
+	PMULHUW(fpScratchReg1, R(fpScratchReg3));
+	PMULHUW(fpScratchReg2, R(fpScratchReg3));
 
-	MULPS(fpScratchReg1, R(XMM0));
-	MULPS(fpScratchReg2, R(fpScratchReg5));
-	MULPS(fpScratchReg3, R(XMM0));
-	MULPS(fpScratchReg4, R(fpScratchReg5));
-
-	// Now set top=fpScratchReg1, bottom=fpScratchReg3.
-	ADDPS(fpScratchReg1, R(fpScratchReg2));
-	ADDPS(fpScratchReg3, R(fpScratchReg4));
-
-	// Next, time for frac_v.
+	// Time for frac_v.  This time, we want it in all 8 lanes.
 	MOVD_xmm(fpScratchReg5, MDisp(RSP, 8));
-	CVTDQ2PS(fpScratchReg5, R(fpScratchReg5));
-	SHUFPS(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0));
-	if (RipAccessible(ones)) {
-		MULPS(fpScratchReg5, M(by256));
+	PSHUFLW(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0));
+	PSHUFD(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0));
+	// We need to shift before multiplying to get the 8 bits we want.
+	PSLLW(fpScratchReg5, 4);
+
+	// Now, inverse fpScratchReg5 into fpScratchReg3 for the top row.
+	MOVDQA(fpScratchReg3, M(const100_11_4s));
+	PSUBW(fpScratchReg3, R(fpScratchReg5));
+
+	// We had 12, plus 8 frac and 4 shift, that gives us 24.  Take the top 8 bits.
+	PMULHUW(fpScratchReg2, R(fpScratchReg5));
+	PMULHUW(fpScratchReg1, R(fpScratchReg3));
+
+	// Finally, time to sum them all up.
+	PADDW(fpScratchReg2, R(fpScratchReg1));
+	PSHUFD(XMM0, R(fpScratchReg2), _MM_SHUFFLE(3, 2, 3, 2));
+	PADDW(XMM0, R(fpScratchReg2));
+
+	// Finally, convert to 32-bit channels.
+	if (cpu_info.bSSE4_1) {
+		PMOVZXWD(XMM0, R(XMM0));
 	} else {
-		X64Reg tempReg = RAX;
-		MOV(PTRBITS, R(tempReg), ImmPtr(by256));
-		MULPS(fpScratchReg5, MatR(tempReg));
+		PXOR(fpScratchReg1, R(fpScratchReg1));
+		PUNPCKLWD(XMM0, R(fpScratchReg1));
 	}
-	if (RipAccessible(ones)) {
-		MOVAPS(XMM0, M(ones));
-	} else {
-		X64Reg tempReg = RAX;
-		MOV(PTRBITS, R(tempReg), ImmPtr(ones));
-		MOVAPS(XMM0, MatR(tempReg));
-	}
-	SUBPS(XMM0, R(fpScratchReg5));
-
-	MULPS(fpScratchReg1, R(XMM0));
-	MULPS(fpScratchReg3, R(fpScratchReg5));
-
-	// Still at the 255 scale, now we're interpolated.
-	ADDPS(fpScratchReg1, R(fpScratchReg3));
-
-	// Time to convert back to a single 32 bit value.
-	CVTPS2DQ(XMM0, R(fpScratchReg1));
 
 	if (id.hasInvalidPtr) {
 		SetJumpTarget(zeroSrc);