Compare commits

...

3 Commits

Author SHA1 Message Date
TJnotJT
f322dfb1d4 GS/SW: Use non-saturating ARM instructions for color gradient setup.
This is more efficient on ARM, though the equivalent instructions are not currently used in the x64 JIT and C++ versions of GSVector.

Co-authored-by: TellowKrinkle
2025-11-26 20:25:10 +01:00
TJnotJT
a7f5ddfe0d GS/SW: Mask color gradients to prevent incorrect clamping.
Co-authored-by: TellowKrinkle
2025-11-26 20:25:10 +01:00
PCSX2 Bot
0cdfb75fd0 [ci skip] Qt: Update Base Translation. 2025-11-25 19:03:17 -05:00
4 changed files with 39 additions and 39 deletions

View File

@@ -11804,7 +11804,7 @@ This action cannot be undone.</source>
<translation type="unfinished"></translation>
</message>
<message>
<location filename="../../pcsx2/GS/Renderers/Vulkan/GSDeviceVK.cpp" line="5031"/>
<location filename="../../pcsx2/GS/Renderers/Vulkan/GSDeviceVK.cpp" line="5016"/>
<source>Spin GPU During Readbacks is enabled, but calibrated timestamps are unavailable. This might be really slow.</source>
<translation type="unfinished"></translation>
</message>

View File

@@ -323,10 +323,11 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u16* index, cons
{
if (sel.iip)
{
constexpr VectorI mask16 = VectorI::cxpr(0xFFFF);
#if _M_SSE >= 0x501
GSVector4i::storel(&local.d8.c, GSVector4i(dscan.c * step_shift).xzyw().ps32());
GSVector4i::storel(&local.d8.c, (GSVector4i(dscan.c * step_shift) & GSVector4i::cast(mask16)).xzyw().pu32());
#else
local.d4.c = GSVector4i(dscan.c * step_shift).xzyw().ps32();
local.d4.c = (GSVector4i(dscan.c * step_shift) & mask16).xzyw().pu32();
#endif
VectorF dc(dscan.c);
@@ -335,8 +336,8 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u16* index, cons
for (int i = 0; i < vlen; i++)
{
VectorI r = VectorI(dr * shift[1 + i]).ps32();
VectorI b = VectorI(db * shift[1 + i]).ps32();
VectorI r = (VectorI(dr * shift[1 + i]) & mask16).pu32();
VectorI b = (VectorI(db * shift[1 + i]) & mask16).pu32();
local.d[i].rb = r.upl16(b);
}
@@ -346,8 +347,8 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u16* index, cons
for (int i = 0; i < vlen; i++)
{
VectorI g = VectorI(dg * shift[1 + i]).ps32();
VectorI a = VectorI(da * shift[1 + i]).ps32();
VectorI g = (VectorI(dg * shift[1 + i]) & mask16).pu32();
VectorI a = (VectorI(da * shift[1 + i]) & mask16).pu32();
local.d[i].ga = g.upl16(a);
}

View File

@@ -89,7 +89,7 @@ void GSSetupPrimCodeGenerator::Generate()
many_regs = isYmm && !m_sel.notest && needs_shift;
#ifdef _WIN64
int needs_saving = many_regs ? 6 : m_sel.notest ? 0 : 2;
int needs_saving = many_regs ? 7 : m_sel.notest ? 1 : 3;
if (needs_saving)
{
sub(rsp, 8 + 16 * needs_saving);
@@ -398,12 +398,17 @@ void GSSetupPrimCodeGenerator::Color()
broadcastf128(xym0, ptr[_dscan + offsetof(GSVertexSW, c)]);
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
// constexpr VectorI mask16 = VectorI::cxpr(0xFFFF);
XYm mask16 = XYm(many_regs ? 12 : m_sel.notest ? 6 : 8);
pcmpeqd(mask16, mask16);
psrld(mask16, 16);
// local.d4.c = (GSVector4i(dscan.c * step_shift) & mask16).xzyw().pu32();
THREEARG(mulps, xmm1, xmm0, xmm3);
cvttps2dq(xmm1, xmm1);
pshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0));
packssdw(xmm1, xmm1);
pand(xym1, mask16);
packusdw(xmm1, xmm1);
if (isXmm)
movdqa(_rip_local_d(c), xmm1);
else
@@ -419,23 +424,25 @@ void GSSetupPrimCodeGenerator::Color()
for (int i = 0; i < (m_sel.notest ? 1 : dsize); i++)
{
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
// VectorI r = (VectorI(dr * shift[1 + i]) & mask16).pu32();
if (i < 4 || many_regs)
THREEARG(mulps, xym0, XYm(4 + i), xym2);
else
vmulps(ymm0, ymm2, ptr[g_const.m_shift_256b[i + 1]]);
cvttps2dq(xym0, xym0);
packssdw(xym0, xym0);
pand(xym0, mask16);
packusdw(xym0, xym0);
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
// VectorI b = (VectorI(db * shift[1 + i]) & mask16).pu32();
if (i < 4 || many_regs)
THREEARG(mulps, xym1, XYm(4 + i), xym3);
else
vmulps(ymm1, ymm3, ptr[g_const.m_shift_256b[i + 1]]);
cvttps2dq(xym1, xym1);
packssdw(xym1, xym1);
pand(xym1, mask16);
packusdw(xym1, xym1);
// m_local.d[i].rb = r.upl16(b);
@@ -455,23 +462,25 @@ void GSSetupPrimCodeGenerator::Color()
for (int i = 0; i < (m_sel.notest ? 1 : dsize); i++)
{
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
// VectorI g = (VectorI(dg * shift[1 + i]) & mask16).pu32();
if (i < 4 || many_regs)
THREEARG(mulps, xym0, XYm(4 + i), xym2);
else
vmulps(ymm0, ymm2, ptr[g_const.m_shift_256b[i + 1]]);
cvttps2dq(xym0, xym0);
packssdw(xym0, xym0);
pand(xym0, mask16);
packusdw(xym0, xym1);
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
// VectorI a = (VectorI(da * shift[1 + i]) & mask16).pu32();
if (i < 4 || many_regs)
THREEARG(mulps, xym1, XYm(4 + i), xym3);
else
vmulps(ymm1, ymm3, ptr[g_const.m_shift_256b[i + 1]]);
cvttps2dq(xym1, xym1);
packssdw(xym1, xym1);
pand(xym1, mask16);
packusdw(xym1, xym1);
// m_local.d[i].ga = g.upl16(a);

View File

@@ -225,14 +225,13 @@ void GSSetupPrimCodeGenerator::Color()
// GSVector4 c = dscan.c;
armAsm->Ldr(v16, MemOperand(_dscan, offsetof(GSVertexSW, c)));
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
// GSVector4i tmp = GSVector4i(dscan.c * step_shift).xzyw();
// local.d4.c = tmp.uzp1_16(tmp); // Not currently in GSVector since that's mainly targeting x86 for now
armAsm->Fmul(v2.V4S(), v16.V4S(), v3.V4S());
armAsm->Fcvtzs(v2.V4S(), v2.V4S());
armAsm->Rev64(_vscratch.V4S(), v2.V4S());
armAsm->Uzp1(v2.V4S(), v2.V4S(), _vscratch.V4S());
armAsm->Sqxtn(v2.V4H(), v2.V4S());
armAsm->Dup(v2.V2D(), v2.V2D(), 0);
armAsm->Uzp1(v2.V8H(), v2.V8H(), v2.V8H());
armAsm->Str(v2, MemOperand(_locals, offsetof(GSScanlineLocalData, d4.c)));
// GSVector4 dr = c.xxxx();
@@ -243,23 +242,18 @@ void GSSetupPrimCodeGenerator::Color()
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
// VectorI r = VectorI(dr * shift[1 + i]);
armAsm->Fmul(v2.V4S(), v0.V4S(), VRegister(4 + i, kFormat4S));
armAsm->Fcvtzs(v2.V4S(), v2.V4S());
armAsm->Sqxtn(v2.V4H(), v2.V4S());
armAsm->Dup(v2.V2D(), v2.V2D(), 0);
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
// VectorI b = VectorI(db * shift[1 + i]);
armAsm->Fmul(v3.V4S(), v1.V4S(), VRegister(4 + i, kFormat4S));
armAsm->Fcvtzs(v3.V4S(), v3.V4S());
armAsm->Sqxtn(v3.V4H(), v3.V4S());
armAsm->Dup(v3.V2D(), v3.V2D(), 0);
// m_local.d[i].rb = r.upl16(b);
armAsm->Zip1(v2.V8H(), v2.V8H(), v3.V8H());
// m_local.d[i].rb = r.trn1_16(b); // Not currently in GSVector since that's mainly targeting x86 for now
armAsm->Trn1(v2.V8H(), v2.V8H(), v3.V8H());
armAsm->Str(v2, _local(d[i].rb));
}
@@ -273,23 +267,19 @@ void GSSetupPrimCodeGenerator::Color()
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
// VectorI g = VectorI(dg * shift[1 + i]);
armAsm->Fmul(v2.V4S(), v0.V4S(), VRegister(4 + i, kFormat4S));
armAsm->Fcvtzs(v2.V4S(), v2.V4S());
armAsm->Sqxtn(v2.V4H(), v2.V4S());
armAsm->Dup(v2.V2D(), v2.V2D(), 0);
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
// VectorI a = VectorI(da * shift[1 + i]);
armAsm->Fmul(v3.V4S(), v1.V4S(), VRegister(4 + i, kFormat4S));
armAsm->Fcvtzs(v3.V4S(), v3.V4S());
armAsm->Sqxtn(v3.V4H(), v3.V4S());
armAsm->Dup(v3.V2D(), v3.V2D(), 0);
// m_local.d[i].ga = g.upl16(a);
// m_local.d[i].ga = g.trn1_16(a); // Not currently in GSVector since that's mainly targeting x86 for now
armAsm->Zip1(v2.V8H(), v2.V8H(), v3.V8H());
armAsm->Trn1(v2.V8H(), v2.V8H(), v3.V8H());
armAsm->Str(v2, _local(d[i].ga));
}
}