From 22f770c82816c1148ca3147f3f8f6523b2b2dae5 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets"
Date: Sun, 2 Jan 2022 10:45:03 -0800
Subject: [PATCH] samplerjit: Use VPGATHERDD for simple CLUT4 loads.

Planning to expand this to more paths.
---
 Common/x64Emitter.cpp             |  16 +--
 Common/x64Emitter.h               |   8 +-
 GPU/Software/RasterizerRegCache.h |   1 +
 GPU/Software/Sampler.h            |   2 +
 GPU/Software/SamplerX86.cpp       | 189 ++++++++++++++++++++++++++++--
 5 files changed, 195 insertions(+), 21 deletions(-)

diff --git a/Common/x64Emitter.cpp b/Common/x64Emitter.cpp
index affd61545e..c05e6ac407 100644
--- a/Common/x64Emitter.cpp
+++ b/Common/x64Emitter.cpp
@@ -2240,20 +2240,20 @@ void XEmitter::VGATHERQPD(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {
 	_assert_msg_(regOp1 != regOp2 && !arg.IsIndexedReg(regOp1) && !arg.IsIndexedReg(regOp2), "VGATHER cannot have overlapped registers");
 	WriteAVX2Op(bits, 0x66, 0x3893, regOp1, regOp2, arg);
 }
-void XEmitter::VGATHERDD(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {
-	_assert_msg_(regOp1 != regOp2 && !arg.IsIndexedReg(regOp1) && !arg.IsIndexedReg(regOp2), "VGATHER cannot have overlapped registers");
+void XEmitter::VPGATHERDD(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {
+	_assert_msg_(regOp1 != regOp2 && !arg.IsIndexedReg(regOp1) && !arg.IsIndexedReg(regOp2), "VPGATHER cannot have overlapped registers");
 	WriteAVX2Op(bits, 0x66, 0x3890, regOp1, regOp2, arg);
 }
-void XEmitter::VGATHERQD(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {
-	_assert_msg_(regOp1 != regOp2 && !arg.IsIndexedReg(regOp1) && !arg.IsIndexedReg(regOp2), "VGATHER cannot have overlapped registers");
+void XEmitter::VPGATHERQD(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {
+	_assert_msg_(regOp1 != regOp2 && !arg.IsIndexedReg(regOp1) && !arg.IsIndexedReg(regOp2), "VPGATHER cannot have overlapped registers");
 	WriteAVX2Op(bits, 0x66, 0x3891, regOp1, regOp2, arg);
 }
-void XEmitter::VGATHERDQ(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {
-	_assert_msg_(regOp1 != regOp2 && !arg.IsIndexedReg(regOp1) && !arg.IsIndexedReg(regOp2), "VGATHER cannot have overlapped registers");
+void XEmitter::VPGATHERDQ(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {
+	_assert_msg_(regOp1 != regOp2 && !arg.IsIndexedReg(regOp1) && !arg.IsIndexedReg(regOp2), "VPGATHER cannot have overlapped registers");
 	WriteAVX2Op(bits, 0x66, 0x3890, regOp1, regOp2, arg, 0, 1);
 }
-void XEmitter::VGATHERQQ(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {
-	_assert_msg_(regOp1 != regOp2 && !arg.IsIndexedReg(regOp1) && !arg.IsIndexedReg(regOp2), "VGATHER cannot have overlapped registers");
+void XEmitter::VPGATHERQQ(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {
+	_assert_msg_(regOp1 != regOp2 && !arg.IsIndexedReg(regOp1) && !arg.IsIndexedReg(regOp2), "VPGATHER cannot have overlapped registers");
 	WriteAVX2Op(bits, 0x66, 0x3891, regOp1, regOp2, arg, 0, 1);
 }
 
diff --git a/Common/x64Emitter.h b/Common/x64Emitter.h
index d534b9c482..dbdf921871 100644
--- a/Common/x64Emitter.h
+++ b/Common/x64Emitter.h
@@ -1239,10 +1239,10 @@ public:
 	void VGATHERDPD(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
 	void VGATHERQPS(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
 	void VGATHERQPD(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
-	void VGATHERDD(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
-	void VGATHERQD(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
-	void VGATHERDQ(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
-	void VGATHERQQ(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
+	void VPGATHERDD(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
+	void VPGATHERQD(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
+	void VPGATHERDQ(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
+	void VPGATHERQQ(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
 
 	void VPSLLVD(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg);
 	void VPSLLVQ(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg);
diff --git a/GPU/Software/RasterizerRegCache.h b/GPU/Software/RasterizerRegCache.h
index 064f3365b3..15071663ee 100644
--- a/GPU/Software/RasterizerRegCache.h
+++ b/GPU/Software/RasterizerRegCache.h
@@ -103,6 +103,7 @@ struct RegCache {
 		VEC_RESULT1 = 0x0002,
 		VEC_U1 = 0x0003,
 		VEC_V1 = 0x0004,
+		VEC_INDEX = 0x0005,
 
 		GEN_SRC_ALPHA = 0x0100,
 		GEN_GSTATE = 0x0101,
diff --git a/GPU/Software/Sampler.h b/GPU/Software/Sampler.h
index 7d5d5f9da9..b1530ee883 100644
--- a/GPU/Software/Sampler.h
+++ b/GPU/Software/Sampler.h
@@ -101,6 +101,8 @@ private:
 	bool Jit_PrepareDataOffsets(const SamplerID &id, Rasterizer::RegCache::Reg uReg, Rasterizer::RegCache::Reg vReg, bool level1);
 	bool Jit_PrepareDataDirectOffsets(const SamplerID &id, Rasterizer::RegCache::Reg uReg, Rasterizer::RegCache::Reg vReg, bool level1, int bitsPerTexel);
 	bool Jit_PrepareDataSwizzledOffsets(const SamplerID &id, Rasterizer::RegCache::Reg uReg, Rasterizer::RegCache::Reg vReg, bool level1, int bitsPerTexel);
+	bool Jit_ReadQuad(const SamplerID &id, bool level1, bool *doFallback);
+	bool Jit_ReadClutQuad(const SamplerID &id, bool level1);
 	bool Jit_BlendQuad(const SamplerID &id, bool level1);
 	bool Jit_DecodeQuad(const SamplerID &id, bool level1);
 	bool Jit_Decode5650Quad(const SamplerID &id, Rasterizer::RegCache::Reg quadReg);
diff --git a/GPU/Software/SamplerX86.cpp b/GPU/Software/SamplerX86.cpp
index e7952061e8..da734c2fa3 100644
--- a/GPU/Software/SamplerX86.cpp
+++ b/GPU/Software/SamplerX86.cpp
@@ -686,11 +686,16 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
 		regCache_.Unlock(vecResultReg, level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
 	};
 
-	Describe("Calls");
-	doNearestCall(0, false);
-	doNearestCall(4, false);
-	doNearestCall(8, false);
-	doNearestCall(12, false);
+	// Initialized because Jit_ReadQuad() is skipped entirely once success is false.
+	bool readFallback = false;
+	success = success && Jit_ReadQuad(id, false, &readFallback);
+	if (readFallback) {
+		Describe("Calls");
+		doNearestCall(0, false);
+		doNearestCall(4, false);
+		doNearestCall(8, false);
+		doNearestCall(12, false);
+	}
 
 	if (id.hasAnyMips) {
 		Describe("MipsCalls");
@@ -713,10 +718,14 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
 			ADD(32, MDisp(RSP, stackArgPos_ + 16), Imm8(1));
 		}
 
-		doNearestCall(0, true);
-		doNearestCall(4, true);
-		doNearestCall(8, true);
-		doNearestCall(12, true);
+		success = success && Jit_ReadQuad(id, true, &readFallback);
+		if (readFallback) {
+			Describe("Calls");
+			doNearestCall(0, true);
+			doNearestCall(4, true);
+			doNearestCall(8, true);
+			doNearestCall(12, true);
+		}
 
 		SetJumpTarget(skip);
 	}
@@ -931,6 +940,168 @@ RegCache::Reg SamplerJitCache::GetGState() {
 	return regCache_.Find(RegCache::GEN_GSTATE);
 }
 
+// Emits code that reads all 4 texels of a quad at once, instead of 4 nearest calls.
+// Sets *doFallback to true when this texture/CLUT setup isn't handled here yet; in
+// that case nothing is emitted and the caller must emit its per-texel calls instead.
+// Returns false only if emitting the CLUT lookup failed.
+bool SamplerJitCache::Jit_ReadQuad(const SamplerID &id, bool level1, bool *doFallback) {
+	*doFallback = false;
+
+	bool success = true;
+	// TODO: Limit less.
+	if (cpu_info.bAVX2 && id.TexFmt() == GE_TFMT_CLUT4 && id.ClutFmt() == GE_CMODE_32BIT_ABGR8888 && !id.hasClutMask && !id.hasClutOffset && !id.hasClutShift) {
+		Describe("ReadQuad");
+
+		X64Reg baseReg = regCache_.Alloc(RegCache::GEN_ARG_TEXPTR);
+		X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR_PTR);
+		MOV(64, R(baseReg), MDisp(srcReg, level1 ? 8 : 0));
+		regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR_PTR);
+
+		X64Reg vecIndexReg = regCache_.Alloc(RegCache::VEC_INDEX);
+		X64Reg maskReg = regCache_.Alloc(RegCache::VEC_TEMP0);
+		// We have to set a mask for which values to load. Load all 4.
+		// Note this is overwritten with zeroes by the instruction.
+		PCMPEQD(maskReg, R(maskReg));
+		X64Reg indexReg = regCache_.Find(level1 ? RegCache::VEC_V1 : RegCache::VEC_ARG_V);
+		VPGATHERDD(128, vecIndexReg, MComplex(baseReg, indexReg, SCALE_1, 0), maskReg);
+		regCache_.Unlock(indexReg, level1 ? RegCache::VEC_V1 : RegCache::VEC_ARG_V);
+		regCache_.Release(baseReg, RegCache::GEN_ARG_TEXPTR);
+
+		// Take only lowest bit, multiply by 4 with shifting.
+		X64Reg uReg = regCache_.Find(level1 ? RegCache::VEC_U1 : RegCache::VEC_ARG_U);
+		PSLLD(uReg, 31);
+		PSRLD(uReg, 29);
+		// Next, shift away based on the odd U bits.
+		VPSRLVD(128, vecIndexReg, vecIndexReg, R(uReg));
+		regCache_.Unlock(uReg, level1 ? RegCache::VEC_U1 : RegCache::VEC_ARG_U);
+
+		// Okay, now we need to mask out just the low four bits.
+		PCMPEQD(maskReg, R(maskReg));
+		PSRLD(maskReg, 28);
+		PAND(vecIndexReg, R(maskReg));
+		regCache_.Release(maskReg, RegCache::VEC_TEMP0);
+		regCache_.Unlock(vecIndexReg, RegCache::VEC_INDEX);
+
+		// Great, now we can use our CLUT indices to gather again.
+		success = success && Jit_ReadClutQuad(id, level1);
+	} else {
+		// TODO
+		*doFallback = true;
+	}
+	return success;
+}
+
+// Emits code that looks up the 4 CLUT indices held in VEC_INDEX and writes the colors
+// to VEC_RESULT (VEC_RESULT1 for level1).  VEC_INDEX is clobbered and released.
+// 16-bit CLUT entries are left zero-extended in each 32-bit lane.
+bool SamplerJitCache::Jit_ReadClutQuad(const SamplerID &id, bool level1) {
+	Describe("ReadCLUTQuad");
+	X64Reg indexReg = regCache_.Find(RegCache::VEC_INDEX);
+
+	if (!id.useSharedClut) {
+		X64Reg vecLevelReg = regCache_.Alloc(RegCache::VEC_TEMP0);
+
+		if (regCache_.Has(RegCache::GEN_ARG_LEVEL)) {
+			X64Reg levelReg = regCache_.Find(RegCache::GEN_ARG_LEVEL);
+			MOVD_xmm(vecLevelReg, R(levelReg));
+			// MOVD only fills lane 0, so broadcast or only the first texel gets the offset.
+			PSHUFD(vecLevelReg, R(vecLevelReg), _MM_SHUFFLE(0, 0, 0, 0));
+			regCache_.Unlock(levelReg, RegCache::GEN_ARG_LEVEL);
+		} else {
+#if PPSSPP_PLATFORM(WINDOWS)
+			if (cpu_info.bAVX2) {
+				VPBROADCASTD(128, vecLevelReg, MDisp(RSP, stackArgPos_ + 16));
+			} else {
+				MOVD_xmm(vecLevelReg, MDisp(RSP, stackArgPos_ + 16));
+				PSHUFD(vecLevelReg, R(vecLevelReg), _MM_SHUFFLE(0, 0, 0, 0));
+			}
+#else
+			_assert_(false);
+#endif
+		}
+
+		// Now we multiply by 16, and add.
+		PSLLD(vecLevelReg, 4);
+		PADDD(indexReg, R(vecLevelReg));
+		regCache_.Release(vecLevelReg, RegCache::VEC_TEMP0);
+	}
+
+	X64Reg clutBaseReg = regCache_.Alloc(RegCache::GEN_TEMP1);
+	MOV(PTRBITS, R(clutBaseReg), ImmPtr(clut));
+
+	X64Reg resultReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
+	X64Reg maskReg = regCache_.Alloc(RegCache::VEC_TEMP0);
+	if (cpu_info.bAVX2)
+		PCMPEQD(maskReg, R(maskReg));
+
+	switch (id.ClutFmt()) {
+	case GE_CMODE_16BIT_BGR5650:
+	case GE_CMODE_16BIT_ABGR5551:
+	case GE_CMODE_16BIT_ABGR4444:
+		if (cpu_info.bAVX2) {
+			VPGATHERDD(128, resultReg, MComplex(clutBaseReg, indexReg, SCALE_2, 0), maskReg);
+			// Clear out the top 16 bits.
+			PCMPEQD(maskReg, R(maskReg));
+			PSRLD(maskReg, 16);
+			PAND(resultReg, R(maskReg));
+		} else {
+			PXOR(resultReg, R(resultReg));
+
+			X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
+			if (cpu_info.bSSE4_1) {
+				for (int i = 0; i < 4; ++i) {
+					PEXTRD(R(temp2Reg), indexReg, i);
+					PINSRW(resultReg, MComplex(clutBaseReg, temp2Reg, SCALE_2, 0), i * 2);
+				}
+			} else {
+				for (int i = 0; i < 4; ++i) {
+					MOVD_xmm(R(temp2Reg), indexReg);
+					if (i != 3)
+						PSRLDQ(indexReg, 4);
+					PINSRW(resultReg, MComplex(clutBaseReg, temp2Reg, SCALE_2, 0), i * 2);
+				}
+			}
+			regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
+		}
+		break;
+
+	case GE_CMODE_32BIT_ABGR8888:
+		if (cpu_info.bAVX2) {
+			VPGATHERDD(128, resultReg, MComplex(clutBaseReg, indexReg, SCALE_4, 0), maskReg);
+		} else {
+			X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
+			if (cpu_info.bSSE4_1) {
+				for (int i = 0; i < 4; ++i) {
+					PEXTRD(R(temp2Reg), indexReg, i);
+					PINSRD(resultReg, MComplex(clutBaseReg, temp2Reg, SCALE_4, 0), i);
+				}
+			} else {
+				for (int i = 0; i < 4; ++i) {
+					MOVD_xmm(R(temp2Reg), indexReg);
+					if (i != 3)
+						PSRLDQ(indexReg, 4);
+
+					if (i == 0) {
+						MOVD_xmm(resultReg, MComplex(clutBaseReg, temp2Reg, SCALE_4, 0));
+					} else {
+						MOVD_xmm(maskReg, MComplex(clutBaseReg, temp2Reg, SCALE_4, 0));
+						PSLLDQ(maskReg, 4 * i);
+						POR(resultReg, R(maskReg));
+					}
+				}
+			}
+			regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
+		}
+		break;
+	}
+	regCache_.Release(maskReg, RegCache::VEC_TEMP0);
+	regCache_.Unlock(resultReg, level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
+
+	regCache_.Release(clutBaseReg, RegCache::GEN_TEMP1);
+	regCache_.Release(indexReg, RegCache::VEC_INDEX);
+	return true;
+}
+
 bool SamplerJitCache::Jit_BlendQuad(const SamplerID &id, bool level1) {
 	Describe(level1 ? "BlendQuadMips" : "BlendQuad");
 