GS/SW: Use non-saturating ARM instructions for color gradient setup.

This is more efficient on ARM, though the equivalent instructions are not currently used in the x64 JIT and C++ versions of GSVector.

Co-authored-by: TellowKrinkle
GS/SW: Mask color gradients to prevent incorrect clamping.
2026-01-31 01:15:24 +01:00 · 2025-11-26 20:25:10 +01:00 · 2025-11-26 20:25:10 +01:00 · 2025-11-25 19:03:17 -05:00
4 changed files with 39 additions and 39 deletions
--- a/pcsx2-qt/Translations/pcsx2-qt_en.ts
+++ b/pcsx2-qt/Translations/pcsx2-qt_en.ts
@@ -11804,7 +11804,7 @@ This action cannot be undone.</source>
        <translation type="unfinished"></translation>
    </message>
    <message>
-        <location filename="../../pcsx2/GS/Renderers/Vulkan/GSDeviceVK.cpp" line="5031"/>
+        <location filename="../../pcsx2/GS/Renderers/Vulkan/GSDeviceVK.cpp" line="5016"/>
        <source>Spin GPU During Readbacks is enabled, but calibrated timestamps are unavailable.  This might be really slow.</source>
        <translation type="unfinished"></translation>
    </message>
--- a/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp
+++ b/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp
@@ -323,10 +323,11 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u16* index, cons
 	{
 		if (sel.iip)
 		{
+			constexpr VectorI mask16 = VectorI::cxpr(0xFFFF);
 #if _M_SSE >= 0x501
-			GSVector4i::storel(&local.d8.c, GSVector4i(dscan.c * step_shift).xzyw().ps32());
+			GSVector4i::storel(&local.d8.c, (GSVector4i(dscan.c * step_shift) & GSVector4i::cast(mask16)).xzyw().pu32());
 #else
-			local.d4.c = GSVector4i(dscan.c * step_shift).xzyw().ps32();
+			local.d4.c = (GSVector4i(dscan.c * step_shift) & mask16).xzyw().pu32();
 #endif
 			VectorF dc(dscan.c);

@@ -335,8 +336,8 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u16* index, cons

 			for (int i = 0; i < vlen; i++)
 			{
-				VectorI r = VectorI(dr * shift[1 + i]).ps32();
-				VectorI b = VectorI(db * shift[1 + i]).ps32();
+				VectorI r = (VectorI(dr * shift[1 + i]) & mask16).pu32();
+				VectorI b = (VectorI(db * shift[1 + i]) & mask16).pu32();

 				local.d[i].rb = r.upl16(b);
 			}
@@ -346,8 +347,8 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u16* index, cons

 			for (int i = 0; i < vlen; i++)
 			{
-				VectorI g = VectorI(dg * shift[1 + i]).ps32();
-				VectorI a = VectorI(da * shift[1 + i]).ps32();
+				VectorI g = (VectorI(dg * shift[1 + i]) & mask16).pu32();
+				VectorI a = (VectorI(da * shift[1 + i]) & mask16).pu32();

 				local.d[i].ga = g.upl16(a);
 			}
--- a/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp
+++ b/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp
@@ -89,7 +89,7 @@ void GSSetupPrimCodeGenerator::Generate()
 	many_regs = isYmm && !m_sel.notest && needs_shift;

 #ifdef _WIN64
-	int needs_saving = many_regs ? 6 : m_sel.notest ? 0 : 2;
+	int needs_saving = many_regs ? 7 : m_sel.notest ? 1 : 3;
 	if (needs_saving)
 	{
 		sub(rsp, 8 + 16 * needs_saving);
@@ -398,12 +398,17 @@ void GSSetupPrimCodeGenerator::Color()

 		broadcastf128(xym0, ptr[_dscan + offsetof(GSVertexSW, c)]);

-		// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
+		// constexpr VectorI mask16 = VectorI::cxpr(0xFFFF);
+		XYm mask16 = XYm(many_regs ? 12 : m_sel.notest ? 6 : 8);
+		pcmpeqd(mask16, mask16);
+		psrld(mask16, 16);

+		// local.d4.c = (GSVector4i(dscan.c * step_shift) & mask16).xzyw().pu32();
 		THREEARG(mulps, xmm1, xmm0, xmm3);
 		cvttps2dq(xmm1, xmm1);
 		pshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0));
-		packssdw(xmm1, xmm1);
+		pand(xym1, mask16);
+		packusdw(xmm1, xmm1);
 		if (isXmm)
 			movdqa(_rip_local_d(c), xmm1);
 		else
@@ -419,23 +424,25 @@ void GSSetupPrimCodeGenerator::Color()

 		for (int i = 0; i < (m_sel.notest ? 1 : dsize); i++)
 		{
-			// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
+			// VectorI r = (VectorI(dr * shift[1 + i]) & mask16).pu32();

 			if (i < 4 || many_regs)
 				THREEARG(mulps, xym0, XYm(4 + i), xym2);
 			else
 				vmulps(ymm0, ymm2, ptr[g_const.m_shift_256b[i + 1]]);
 			cvttps2dq(xym0, xym0);
-			packssdw(xym0, xym0);
+			pand(xym0, mask16);
+			packusdw(xym0, xym0);

-			// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
+			// VectorI b = (VectorI(db * shift[1 + i]) & mask16).pu32();

 			if (i < 4 || many_regs)
 				THREEARG(mulps, xym1, XYm(4 + i), xym3);
 			else
 				vmulps(ymm1, ymm3, ptr[g_const.m_shift_256b[i + 1]]);
 			cvttps2dq(xym1, xym1);
-			packssdw(xym1, xym1);
+			pand(xym1, mask16);
+			packusdw(xym1, xym1);

 			// m_local.d[i].rb = r.upl16(b);

@@ -455,23 +462,25 @@ void GSSetupPrimCodeGenerator::Color()

 		for (int i = 0; i < (m_sel.notest ? 1 : dsize); i++)
 		{
-			// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
+			// VectorI g = (VectorI(dg * shift[1 + i]) & mask16).pu32();

 			if (i < 4 || many_regs)
 				THREEARG(mulps, xym0, XYm(4 + i), xym2);
 			else
 				vmulps(ymm0, ymm2, ptr[g_const.m_shift_256b[i + 1]]);
 			cvttps2dq(xym0, xym0);
-			packssdw(xym0, xym0);
+			pand(xym0, mask16);
+			packusdw(xym0, xym0);

-			// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
+			// VectorI a = (VectorI(da * shift[1 + i]) & mask16).pu32();

 			if (i < 4 || many_regs)
 				THREEARG(mulps, xym1, XYm(4 + i), xym3);
 			else
 				vmulps(ymm1, ymm3, ptr[g_const.m_shift_256b[i + 1]]);
 			cvttps2dq(xym1, xym1);
-			packssdw(xym1, xym1);
+			pand(xym1, mask16);
+			packusdw(xym1, xym1);

 			// m_local.d[i].ga = g.upl16(a);

--- a/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.arm64.cpp
+++ b/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.arm64.cpp
@@ -225,14 +225,13 @@ void GSSetupPrimCodeGenerator::Color()
 		// GSVector4 c = dscan.c;
 		armAsm->Ldr(v16, MemOperand(_dscan, offsetof(GSVertexSW, c)));

-		// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
-
+		// GSVector4i tmp = GSVector4i(dscan.c * step_shift).xzyw();
+		// local.d4.c = tmp.uzp1_16(tmp); // Not currently in GSVector since that's mainly targeting x86 for now
 		armAsm->Fmul(v2.V4S(), v16.V4S(), v3.V4S());
 		armAsm->Fcvtzs(v2.V4S(), v2.V4S());
 		armAsm->Rev64(_vscratch.V4S(), v2.V4S());
 		armAsm->Uzp1(v2.V4S(), v2.V4S(), _vscratch.V4S());
-		armAsm->Sqxtn(v2.V4H(), v2.V4S());
-		armAsm->Dup(v2.V2D(), v2.V2D(), 0);
+		armAsm->Uzp1(v2.V8H(), v2.V8H(), v2.V8H());
 		armAsm->Str(v2, MemOperand(_locals, offsetof(GSScanlineLocalData, d4.c)));

 		// GSVector4 dr = c.xxxx();
@@ -243,23 +242,18 @@ void GSSetupPrimCodeGenerator::Color()

 		for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
 		{
-			// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
+			// VectorI r = VectorI(dr * shift[1 + i]);

 			armAsm->Fmul(v2.V4S(), v0.V4S(), VRegister(4 + i, kFormat4S));
 			armAsm->Fcvtzs(v2.V4S(), v2.V4S());
-			armAsm->Sqxtn(v2.V4H(), v2.V4S());
-			armAsm->Dup(v2.V2D(), v2.V2D(), 0);

-			// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
+			// VectorI b = VectorI(db * shift[1 + i]);

 			armAsm->Fmul(v3.V4S(), v1.V4S(), VRegister(4 + i, kFormat4S));
 			armAsm->Fcvtzs(v3.V4S(), v3.V4S());
-			armAsm->Sqxtn(v3.V4H(), v3.V4S());
-			armAsm->Dup(v3.V2D(), v3.V2D(), 0);

-			// m_local.d[i].rb = r.upl16(b);
-
-			armAsm->Zip1(v2.V8H(), v2.V8H(), v3.V8H());
+			// m_local.d[i].rb = r.trn1_16(b); // Not currently in GSVector since that's mainly targeting x86 for now
+			armAsm->Trn1(v2.V8H(), v2.V8H(), v3.V8H());
 			armAsm->Str(v2, _local(d[i].rb));
 		}

@@ -273,23 +267,19 @@ void GSSetupPrimCodeGenerator::Color()

 		for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
 		{
-			// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
+			// VectorI g = VectorI(dg * shift[1 + i]);

 			armAsm->Fmul(v2.V4S(), v0.V4S(), VRegister(4 + i, kFormat4S));
 			armAsm->Fcvtzs(v2.V4S(), v2.V4S());
-			armAsm->Sqxtn(v2.V4H(), v2.V4S());
-			armAsm->Dup(v2.V2D(), v2.V2D(), 0);

-			// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
+			// VectorI a = VectorI(da * shift[1 + i]);

 			armAsm->Fmul(v3.V4S(), v1.V4S(), VRegister(4 + i, kFormat4S));
 			armAsm->Fcvtzs(v3.V4S(), v3.V4S());
-			armAsm->Sqxtn(v3.V4H(), v3.V4S());
-			armAsm->Dup(v3.V2D(), v3.V2D(), 0);

-			// m_local.d[i].ga = g.upl16(a);
+			// m_local.d[i].ga = g.trn1_16(a); // Not currently in GSVector since that's mainly targeting x86 for now

-			armAsm->Zip1(v2.V8H(), v2.V8H(), v3.V8H());
+			armAsm->Trn1(v2.V8H(), v2.V8H(), v3.V8H());
 			armAsm->Str(v2, _local(d[i].ga));
 		}
 	}
Author	SHA1	Message	Date
TJnotJT	f322dfb1d4	GS/SW: Use non-saturating ARM instructions for color gradient setup. This is more efficient on ARM, though the equivalent instructions are not currently used in the x64 JIT and C++ versions of GSVector. Co-authored-by: TellowKrinkle	2025-11-26 20:25:10 +01:00
TJnotJT	a7f5ddfe0d	GS/SW: Mask color gradients to prevent incorrect clamping. Co-authored-by: TellowKrinkle	2025-11-26 20:25:10 +01:00
PCSX2 Bot	0cdfb75fd0	[ci skip] Qt: Update Base Translation.	2025-11-25 19:03:17 -05:00