samplerjit: Apply gather lookup to all CLUT4.

This commit is contained in:
Unknown W. Brackets 2022-01-02 13:52:48 -08:00
parent 22f770c828
commit ce6ea8da11
4 changed files with 122 additions and 11 deletions

View File

@ -1701,16 +1701,15 @@ void XEmitter::PSRLQ(X64Reg reg, int shift)
Write8(shift);
}
// Packed logical right shift of quadwords; count taken from the low 64 bits of arg.
void XEmitter::PSRLQ(X64Reg reg, OpArg arg) {
	WriteSSEOp(0x66, 0xd3, reg, arg);
}
// Byte-wise logical right shift of the entire XMM register by an immediate.
void XEmitter::PSRLDQ(X64Reg reg, int shift)
{
	// 66 0F 73: the reg field carries /3, the PSRLDQ opcode extension.
	WriteSSEOp(0x66, 0x73, (X64Reg)3, R(reg));
	Write8(shift);
}
// Packed logical right shifts; each lane is shifted by the low 64 bits of arg.
void XEmitter::PSRLW(X64Reg reg, OpArg arg) {
	WriteSSEOp(0x66, 0xD1, reg, arg);
}
void XEmitter::PSRLD(X64Reg reg, OpArg arg) {
	WriteSSEOp(0x66, 0xD2, reg, arg);
}
void XEmitter::PSRLQ(X64Reg reg, OpArg arg) {
	WriteSSEOp(0x66, 0xD3, reg, arg);
}
void XEmitter::PSLLW(X64Reg reg, int shift)
{
WriteSSEOp(0x66, 0x71, (X64Reg)6, R(reg));
@ -1734,6 +1733,10 @@ void XEmitter::PSLLDQ(X64Reg reg, int shift) {
Write8(shift);
}
// Packed logical left shifts; each lane is shifted by the low 64 bits of arg.
void XEmitter::PSLLW(X64Reg reg, OpArg arg) {
	WriteSSEOp(0x66, 0xF1, reg, arg);
}
void XEmitter::PSLLD(X64Reg reg, OpArg arg) {
	WriteSSEOp(0x66, 0xF2, reg, arg);
}
void XEmitter::PSLLQ(X64Reg reg, OpArg arg) {
	WriteSSEOp(0x66, 0xF3, reg, arg);
}
void XEmitter::PSRAW(X64Reg reg, int shift)
{
WriteSSEOp(0x66, 0x71, (X64Reg)4, R(reg));
@ -1746,6 +1749,9 @@ void XEmitter::PSRAD(X64Reg reg, int shift)
Write8(shift);
}
// Packed arithmetic right shifts; each lane is shifted by the low 64 bits of arg.
void XEmitter::PSRAW(X64Reg reg, OpArg arg) {
	WriteSSEOp(0x66, 0xE1, reg, arg);
}
void XEmitter::PSRAD(X64Reg reg, OpArg arg) {
	WriteSSEOp(0x66, 0xE2, reg, arg);
}
// Packed 16-bit multiplies: low result, signed high result, unsigned high result.
void XEmitter::PMULLW(X64Reg dest, const OpArg &arg) {
	WriteSSEOp(0x66, 0xD5, dest, arg);
}
void XEmitter::PMULHW(X64Reg dest, const OpArg &arg) {
	WriteSSEOp(0x66, 0xE5, dest, arg);
}
void XEmitter::PMULHUW(X64Reg dest, const OpArg &arg) {
	WriteSSEOp(0x66, 0xE4, dest, arg);
}

View File

@ -847,16 +847,31 @@ public:
// Packed logical right shifts by an immediate count.
void PSRLW(X64Reg reg, int shift);
void PSRLD(X64Reg reg, int shift);
void PSRLQ(X64Reg reg, int shift);
void PSRLQ(X64Reg reg, OpArg arg);
// Byte-wise shift of the whole register by an immediate.
void PSRLDQ(X64Reg reg, int shift);
// Note: all values shifted by lowest 64-bit in XMM arg.
void PSRLW(X64Reg reg, OpArg arg);
// Note: all values shifted by lowest 64-bit in XMM arg.
void PSRLD(X64Reg reg, OpArg arg);
// Note: both values shifted by lowest 64-bit in XMM arg.
void PSRLQ(X64Reg reg, OpArg arg);
// Packed logical left shifts by an immediate count.
void PSLLW(X64Reg reg, int shift);
void PSLLD(X64Reg reg, int shift);
void PSLLQ(X64Reg reg, int shift);
// Byte-wise shift of the whole register by an immediate.
void PSLLDQ(X64Reg reg, int shift);
// Note: all values shifted by lowest 64-bit in XMM arg.
void PSLLW(X64Reg reg, OpArg arg);
// Note: all values shifted by lowest 64-bit in XMM arg.
void PSLLD(X64Reg reg, OpArg arg);
// Note: both values shifted by lowest 64-bit in XMM arg.
void PSLLQ(X64Reg reg, OpArg arg);
// Packed arithmetic (sign-preserving) right shifts by an immediate count.
void PSRAW(X64Reg reg, int shift);
void PSRAD(X64Reg reg, int shift);
// Note: all values shifted by lowest 64-bit in XMM arg.
void PSRAW(X64Reg reg, OpArg arg);
// Note: all values shifted by lowest 64-bit in XMM arg.
void PSRAD(X64Reg reg, OpArg arg);
// Packed 16-bit multiplies (low and signed-high results.)
void PMULLW(X64Reg dest, const OpArg &arg);
void PMULHW(X64Reg dest, const OpArg &arg);

View File

@ -102,6 +102,7 @@ private:
// Computes per-pixel byte offsets for linearly-stored texture data — TODO confirm against definition.
bool Jit_PrepareDataDirectOffsets(const SamplerID &id, Rasterizer::RegCache::Reg uReg, Rasterizer::RegCache::Reg vReg, bool level1, int bitsPerTexel);
// Same, for swizzled texture storage — TODO confirm against definition.
bool Jit_PrepareDataSwizzledOffsets(const SamplerID &id, Rasterizer::RegCache::Reg uReg, Rasterizer::RegCache::Reg vReg, bool level1, int bitsPerTexel);
// Reads four texels at once; sets *doFallback when the fast path can't be used.
bool Jit_ReadQuad(const SamplerID &id, bool level1, bool *doFallback);
// Applies CLUT shift/mask/offset to the four packed indices in VEC_INDEX.
bool Jit_TransformClutIndexQuad(const SamplerID &id, int bitsPerIndex);
// Gathers four CLUT entries using the transformed indices.
bool Jit_ReadClutQuad(const SamplerID &id, bool level1);
bool Jit_BlendQuad(const SamplerID &id, bool level1);
bool Jit_DecodeQuad(const SamplerID &id, bool level1);

View File

@ -944,7 +944,7 @@ bool SamplerJitCache::Jit_ReadQuad(const SamplerID &id, bool level1, bool *doFal
bool success = true;
// TODO: Limit less.
if (cpu_info.bAVX2 && id.TexFmt() == GE_TFMT_CLUT4 && id.ClutFmt() == GE_CMODE_32BIT_ABGR8888 && !id.hasClutMask && !id.hasClutOffset && !id.hasClutShift) {
if (cpu_info.bAVX2 && id.TexFmt() == GE_TFMT_CLUT4) {
Describe("ReadQuad");
X64Reg baseReg = regCache_.Alloc(RegCache::GEN_ARG_TEXPTR);
@ -970,13 +970,12 @@ bool SamplerJitCache::Jit_ReadQuad(const SamplerID &id, bool level1, bool *doFal
VPSRLVD(128, vecIndexReg, vecIndexReg, R(uReg));
regCache_.Unlock(uReg, level1 ? RegCache::VEC_U1 : RegCache::VEC_ARG_U);
// Okay, now we need to mask out just the low four bits.
PCMPEQD(maskReg, R(maskReg));
PSRLD(maskReg, 28);
PAND(vecIndexReg, R(maskReg));
regCache_.Release(maskReg, RegCache::VEC_TEMP0);
regCache_.Unlock(vecIndexReg, RegCache::VEC_INDEX);
// Apply mask and any other CLUT transformations.
success = success && Jit_TransformClutIndexQuad(id, 4);
// Great, now we can use our CLUT indices to gather again.
success = success && Jit_ReadClutQuad(id, level1);
} else {
@ -986,6 +985,96 @@ bool SamplerJitCache::Jit_ReadQuad(const SamplerID &id, bool level1, bool *doFal
return success;
}
// Transforms the four packed texel indices in the VEC_INDEX register into final
// CLUT indices, applying the clutformat shift, mask, and offset as needed.
// bitsPerIndex is the width of the raw texel index (4 for CLUT4, 8+ otherwise.)
// Always returns true in the current implementation.
bool SamplerJitCache::Jit_TransformClutIndexQuad(const SamplerID &id, int bitsPerIndex) {
Describe("TrCLUTQuad");
GEPaletteFormat fmt = id.ClutFmt();
if (!id.hasClutShift && !id.hasClutMask && !id.hasClutOffset) {
// This is simple - just mask.
X64Reg indexReg = regCache_.Find(RegCache::VEC_INDEX);
// Mask to 8 bits for CLUT8/16/32, 4 bits for CLUT4.
// Shift up against the top, then logical shift back down to zero the high bits.
PSLLD(indexReg, bitsPerIndex >= 8 ? 24 : 28);
PSRLD(indexReg, bitsPerIndex >= 8 ? 24 : 28);
regCache_.Unlock(indexReg, RegCache::VEC_INDEX);
return true;
}
X64Reg indexReg = regCache_.Find(RegCache::VEC_INDEX);
bool maskedIndex = false;
// Okay, first load the actual gstate clutformat bits we'll use.
X64Reg formatReg = regCache_.Alloc(RegCache::VEC_TEMP0);
X64Reg gstateReg = GetGState();
// With AVX2 and no shift needed, broadcast to all lanes now; otherwise the
// PSHUFD below splats the format after the shift step.
if (cpu_info.bAVX2 && !id.hasClutShift)
VPBROADCASTD(128, formatReg, MDisp(gstateReg, offsetof(GPUgstate, clutformat)));
else
MOVD_xmm(formatReg, MDisp(gstateReg, offsetof(GPUgstate, clutformat)));
regCache_.Unlock(gstateReg, RegCache::GEN_GSTATE);
// Shift = (clutformat >> 2) & 0x1F
if (id.hasClutShift) {
// Before shifting, let's mask if needed (we always read 32 bits.)
// We have to do this here, because the bits should be zero even if F is used as a mask.
if (bitsPerIndex < 32) {
PSLLD(indexReg, 32 - bitsPerIndex);
PSRLD(indexReg, 32 - bitsPerIndex);
maskedIndex = true;
}
X64Reg shiftReg = regCache_.Alloc(RegCache::VEC_TEMP1);
// Shift against walls to get 5 bits after the rightmost 2.
if (cpu_info.bAVX) {
VPSLLD(128, shiftReg, formatReg, 32 - 7);
} else {
MOVDQA(shiftReg, R(formatReg));
PSLLD(shiftReg, 32 - 7);
}
PSRLD(shiftReg, 32 - 5);
// The other lanes are zero, so we can use PSRLD.
// (PSRLD with an XMM count shifts every lane by the low 64 bits of the arg.)
PSRLD(indexReg, R(shiftReg));
regCache_.Release(shiftReg, RegCache::VEC_TEMP1);
}
// With shifting done, we need the format in each lane.
if (!cpu_info.bAVX2 || id.hasClutShift)
PSHUFD(formatReg, R(formatReg), _MM_SHUFFLE(0, 0, 0, 0));
// Mask = (clutformat >> 8) & 0xFF
if (id.hasClutMask) {
X64Reg maskReg = regCache_.Alloc(RegCache::VEC_TEMP1);
// If it was CLUT4, grab only 4 bits of the mask.
if (cpu_info.bAVX) {
VPSLLD(128, maskReg, formatReg, bitsPerIndex == 4 ? 20 : 16);
} else {
MOVDQA(maskReg, R(formatReg));
PSLLD(maskReg, bitsPerIndex == 4 ? 20 : 16);
}
PSRLD(maskReg, bitsPerIndex == 4 ? 28 : 24);
PAND(indexReg, R(maskReg));
regCache_.Release(maskReg, RegCache::VEC_TEMP1);
} else if (!maskedIndex || bitsPerIndex > 8) {
// Apply the fixed 8 bit mask (or the CLUT4 mask if we didn't shift.)
PSLLD(indexReg, maskedIndex || bitsPerIndex >= 8 ? 24 : 28);
PSRLD(indexReg, maskedIndex || bitsPerIndex >= 8 ? 24 : 28);
}
// Offset = (clutformat >> 12) & 0x01F0
if (id.hasClutOffset) {
// Use walls to extract the 5 bits at 16, and then put them shifted left by 4.
// The 32-bit CLUT format keeps only 4 of the 5 offset bits (see offsetBits.)
int offsetBits = fmt == GE_CMODE_32BIT_ABGR8888 ? 4 : 5;
PSRLD(formatReg, 16);
PSLLD(formatReg, 32 - offsetBits);
PSRLD(formatReg, 32 - offsetBits - 4);
POR(indexReg, R(formatReg));
}
regCache_.Release(formatReg, RegCache::VEC_TEMP0);
regCache_.Unlock(indexReg, RegCache::VEC_INDEX);
return true;
}
bool SamplerJitCache::Jit_ReadClutQuad(const SamplerID &id, bool level1) {
Describe("ReadCLUTQuad");
X64Reg indexReg = regCache_.Find(RegCache::VEC_INDEX);