samplerjit: Blend linear using integers.

2024-11-23 13:30:02 +00:00 · 2021-12-12 15:15:10 -08:00 · 2021-12-12 15:15:10 -08:00 · 4d6a2f3919
commit 4d6a2f3919
parent 6f4e735757
1 changed file with 53 additions and 81 deletions
--- a/GPU/Software/SamplerX86.cpp
+++ b/GPU/Software/SamplerX86.cpp
@@ -161,6 +161,15 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
 		regCache_.ForceRelease(RegCache::GEN_ARG_LEVEL);
 	regCache_.Reset(true);
 	// Let's drop some helpful constants here.
 	const u8 *const100_11_4s = AlignCode16();
 	Write16(0x100 << 4); Write16(0x100 << 4); Write16(0x100 << 4); Write16(0x100 << 4);
 	Write16(0x100 << 4); Write16(0x100 << 4); Write16(0x100 << 4); Write16(0x100 << 4);
 	const u8 *const100Low_11_4s = AlignCode16();
 	Write16(0x100 << 4); Write16(0x100 << 4); Write16(0x100 << 4); Write16(0x100 << 4);
 	Write16(0); Write16(0); Write16(0); Write16(0);
 	// Now the actual linear func, which is exposed externally.
 	const u8 *start = AlignCode16();
@@ -244,94 +253,57 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
 	doNearestCall(8);
 	doNearestCall(12);
-	// Convert TL, TR, BL, BR to floats for easier blending.
+	// fpScratchReg1 will have top RRRRRRRR LLLLLLLL, fpScratchReg2 for bottom.
-	if (!cpu_info.bSSE4_1) {
+	// Start with XXXX XXXX RRRR LLLL, and swizzle the 8 bits to both slots in the 16.
-		PXOR(XMM0, R(XMM0));
+	PSHUFD(fpScratchReg1, R(XMM5), _MM_SHUFFLE(0, 0, 1, 0));
-	}
+	PSHUFD(fpScratchReg2, R(XMM5), _MM_SHUFFLE(0, 0, 3, 2));
 	PUNPCKLBW(fpScratchReg1, R(fpScratchReg1));
 	PUNPCKLBW(fpScratchReg2, R(fpScratchReg2));
-	PSHUFD(fpScratchReg1, R(XMM5), _MM_SHUFFLE(0, 0, 0, 0));
+	// Grab frac_u and spread to lower (L) lanes.
 	PSHUFD(fpScratchReg2, R(XMM5), _MM_SHUFFLE(1, 1, 1, 1));
 	PSHUFD(fpScratchReg3, R(XMM5), _MM_SHUFFLE(2, 2, 2, 2));
 	PSHUFD(fpScratchReg4, R(XMM5), _MM_SHUFFLE(3, 3, 3, 3));
 	if (cpu_info.bSSE4_1) {
 		PMOVZXBD(fpScratchReg1, R(fpScratchReg1));
 		PMOVZXBD(fpScratchReg2, R(fpScratchReg2));
 		PMOVZXBD(fpScratchReg3, R(fpScratchReg3));
 		PMOVZXBD(fpScratchReg4, R(fpScratchReg4));
 	} else {
 		PUNPCKLBW(fpScratchReg1, R(XMM0));
 		PUNPCKLBW(fpScratchReg2, R(XMM0));
 		PUNPCKLBW(fpScratchReg3, R(XMM0));
 		PUNPCKLBW(fpScratchReg4, R(XMM0));
 		PUNPCKLWD(fpScratchReg1, R(XMM0));
 		PUNPCKLWD(fpScratchReg2, R(XMM0));
 		PUNPCKLWD(fpScratchReg3, R(XMM0));
 		PUNPCKLWD(fpScratchReg4, R(XMM0));
 	}
 	CVTDQ2PS(fpScratchReg1, R(fpScratchReg1));
 	CVTDQ2PS(fpScratchReg2, R(fpScratchReg2));
 	CVTDQ2PS(fpScratchReg3, R(fpScratchReg3));
 	CVTDQ2PS(fpScratchReg4, R(fpScratchReg4));
 	// Okay, now multiply the R sides by frac_u, and L by (256 - frac_u)...
 	MOVD_xmm(fpScratchReg5, MDisp(RSP, 0));
-	CVTDQ2PS(fpScratchReg5, R(fpScratchReg5));
+	PSHUFLW(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0));
-	SHUFPS(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0));
+	// Convert to s.11.4.
-	if (RipAccessible(by256)) {
+	PSLLW(fpScratchReg5, 4);
-		MULPS(fpScratchReg5, M(by256));
+	// Now subtract 0x100 - frac_u in the L lanes only: 00000000 LLLLLLLL.
-	} else {
+	MOVDQA(fpScratchReg3, M(const100Low_11_4s));
-		X64Reg tempReg = RAX;
+	PSUBW(fpScratchReg3, R(fpScratchReg5));
-		MOV(PTRBITS, R(tempReg), ImmPtr(by256));
+	// Now we just shift and OR in the original frac_u.
-		MULPS(fpScratchReg5, MatR(tempReg));
+	PSLLDQ(fpScratchReg5, 8);
-	}
+	POR(fpScratchReg3, R(fpScratchReg5));
-	if (RipAccessible(ones)) {
+	// Okay, we have 8-bits repeated in the top and bottom rows for the color.
-		MOVAPS(XMM0, M(ones));
+	// Shift frac by 4, and multiply to get the top 16 bits - that will give us 12 bits of result.
-	} else {
+	PMULHUW(fpScratchReg1, R(fpScratchReg3));
-		X64Reg tempReg = RAX;
+	PMULHUW(fpScratchReg2, R(fpScratchReg3));
 		MOV(PTRBITS, R(tempReg), ImmPtr(ones));
 		MOVAPS(XMM0, MatR(tempReg));
 	}
 	SUBPS(XMM0, R(fpScratchReg5));
-	MULPS(fpScratchReg1, R(XMM0));
+	// Time for frac_v.  This time, we want it in all 8 lanes.
 	MULPS(fpScratchReg2, R(fpScratchReg5));
 	MULPS(fpScratchReg3, R(XMM0));
 	MULPS(fpScratchReg4, R(fpScratchReg5));
 	// Now set top=fpScratchReg1, bottom=fpScratchReg3.
 	ADDPS(fpScratchReg1, R(fpScratchReg2));
 	ADDPS(fpScratchReg3, R(fpScratchReg4));
 	// Next, time for frac_v.
 	MOVD_xmm(fpScratchReg5, MDisp(RSP, 8));
-	CVTDQ2PS(fpScratchReg5, R(fpScratchReg5));
+	PSHUFLW(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0));
-	SHUFPS(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0));
+	PSHUFD(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0));
-	if (RipAccessible(ones)) {
+	// We need to shift before multiplying to get the 8 bits we want.
-		MULPS(fpScratchReg5, M(by256));
+	PSLLW(fpScratchReg5, 4);
 	// Now, inverse fpScratchReg5 into fpScratchReg3 for the top row.
 	MOVDQA(fpScratchReg3, M(const100_11_4s));
 	PSUBW(fpScratchReg3, R(fpScratchReg5));
 	// We had 12, plus 8 frac and 4 shift, that gives us 24.  Take the top 8 bits.
 	PMULHUW(fpScratchReg2, R(fpScratchReg5));
 	PMULHUW(fpScratchReg1, R(fpScratchReg3));
 	// Finally, time to sum them all up.
 	PADDW(fpScratchReg2, R(fpScratchReg1));
 	PSHUFD(XMM0, R(fpScratchReg2), _MM_SHUFFLE(3, 2, 3, 2));
 	PADDW(XMM0, R(fpScratchReg2));
 	// Finally, convert to 32-bit channels.
 	if (cpu_info.bSSE4_1) {
 		PMOVZXWD(XMM0, R(XMM0));
 	} else {
-		X64Reg tempReg = RAX;
+		PXOR(fpScratchReg1, R(fpScratchReg1));
-		MOV(PTRBITS, R(tempReg), ImmPtr(by256));
+		PUNPCKLWD(XMM0, R(fpScratchReg1));
 		MULPS(fpScratchReg5, MatR(tempReg));
 	}
 	if (RipAccessible(ones)) {
 		MOVAPS(XMM0, M(ones));
 	} else {
 		X64Reg tempReg = RAX;
 		MOV(PTRBITS, R(tempReg), ImmPtr(ones));
 		MOVAPS(XMM0, MatR(tempReg));
 	}
 	SUBPS(XMM0, R(fpScratchReg5));
 	MULPS(fpScratchReg1, R(XMM0));
 	MULPS(fpScratchReg3, R(fpScratchReg5));
 	// Still at the 255 scale, now we're interpolated.
 	ADDPS(fpScratchReg1, R(fpScratchReg3));
 	// Time to convert back to a single 32 bit value.
 	CVTPS2DQ(XMM0, R(fpScratchReg1));
 	if (id.hasInvalidPtr) {
 		SetJumpTarget(zeroSrc);