mirror of
https://github.com/hrydgard/ppsspp.git
synced 2025-02-02 11:43:31 +00:00
samplerjit: Apply gather lookup to all CLUT4.
This commit is contained in:
parent
22f770c828
commit
ce6ea8da11
@ -1701,16 +1701,15 @@ void XEmitter::PSRLQ(X64Reg reg, int shift)
|
||||
Write8(shift);
|
||||
}
|
||||
|
||||
// Emits PSRLQ reg, arg (encoded as 66 0F D3 /r): the variable-count form, where
// the shift amount comes from arg rather than from an immediate byte.
void XEmitter::PSRLQ(X64Reg reg, OpArg arg)
|
||||
{
|
||||
WriteSSEOp(0x66, 0xd3, reg, arg);
|
||||
}
|
||||
|
||||
// Emits PSRLDQ reg, imm8 (encoded as 66 0F 73 /3 ib).
// The (X64Reg)3 passed in the register slot is the /3 opcode extension, not a
// real register; the register being shifted goes in as the r/m operand.
void XEmitter::PSRLDQ(X64Reg reg, int shift) {
|
||||
WriteSSEOp(0x66, 0x73, (X64Reg)3, R(reg));
|
||||
// Immediate shift count follows the ModRM byte; only one byte is written.
Write8(shift);
|
||||
}
|
||||
|
||||
// PSRLW reg, arg (66 0F D1 /r): logical right shift of the 16-bit lanes of reg,
// with the shift count supplied by arg instead of an immediate.
void XEmitter::PSRLW(X64Reg reg, OpArg arg)
{
	WriteSSEOp(0x66, 0xD1, reg, arg);
}
|
||||
// PSRLD reg, arg (66 0F D2 /r): logical right shift of the 32-bit lanes of reg,
// with the shift count supplied by arg instead of an immediate.
void XEmitter::PSRLD(X64Reg reg, OpArg arg)
{
	WriteSSEOp(0x66, 0xD2, reg, arg);
}
|
||||
// PSRLQ reg, arg (66 0F D3 /r): logical right shift of the 64-bit lanes of reg,
// with the shift count supplied by arg instead of an immediate.
void XEmitter::PSRLQ(X64Reg reg, OpArg arg)
{
	WriteSSEOp(0x66, 0xD3, reg, arg);
}
|
||||
|
||||
void XEmitter::PSLLW(X64Reg reg, int shift)
|
||||
{
|
||||
WriteSSEOp(0x66, 0x71, (X64Reg)6, R(reg));
|
||||
@ -1734,6 +1733,10 @@ void XEmitter::PSLLDQ(X64Reg reg, int shift) {
|
||||
Write8(shift);
|
||||
}
|
||||
|
||||
// PSLLW reg, arg (66 0F F1 /r): left shift of the 16-bit lanes of reg, with the
// shift count supplied by arg instead of an immediate.
void XEmitter::PSLLW(X64Reg reg, OpArg arg)
{
	WriteSSEOp(0x66, 0xF1, reg, arg);
}
|
||||
// PSLLD reg, arg (66 0F F2 /r): left shift of the 32-bit lanes of reg, with the
// shift count supplied by arg instead of an immediate.
void XEmitter::PSLLD(X64Reg reg, OpArg arg)
{
	WriteSSEOp(0x66, 0xF2, reg, arg);
}
|
||||
// PSLLQ reg, arg (66 0F F3 /r): left shift of the 64-bit lanes of reg, with the
// shift count supplied by arg instead of an immediate.
void XEmitter::PSLLQ(X64Reg reg, OpArg arg)
{
	WriteSSEOp(0x66, 0xF3, reg, arg);
}
|
||||
|
||||
void XEmitter::PSRAW(X64Reg reg, int shift)
|
||||
{
|
||||
WriteSSEOp(0x66, 0x71, (X64Reg)4, R(reg));
|
||||
@ -1746,6 +1749,9 @@ void XEmitter::PSRAD(X64Reg reg, int shift)
|
||||
Write8(shift);
|
||||
}
|
||||
|
||||
// PSRAW reg, arg (66 0F E1 /r): arithmetic right shift of the 16-bit lanes of
// reg, with the shift count supplied by arg instead of an immediate.
void XEmitter::PSRAW(X64Reg reg, OpArg arg)
{
	WriteSSEOp(0x66, 0xE1, reg, arg);
}
|
||||
// PSRAD reg, arg (66 0F E2 /r): arithmetic right shift of the 32-bit lanes of
// reg, with the shift count supplied by arg instead of an immediate.
void XEmitter::PSRAD(X64Reg reg, OpArg arg)
{
	WriteSSEOp(0x66, 0xE2, reg, arg);
}
|
||||
|
||||
// PMULLW dest, arg (66 0F D5 /r): packed 16-bit multiply keeping the low half
// of each product.
void XEmitter::PMULLW(X64Reg dest, const OpArg &arg)
{
	WriteSSEOp(0x66, 0xD5, dest, arg);
}
|
||||
// PMULHW dest, arg (66 0F E5 /r): packed signed 16-bit multiply keeping the
// high half of each product.
void XEmitter::PMULHW(X64Reg dest, const OpArg &arg)
{
	WriteSSEOp(0x66, 0xE5, dest, arg);
}
|
||||
// PMULHUW dest, arg (66 0F E4 /r): packed unsigned 16-bit multiply keeping the
// high half of each product.
void XEmitter::PMULHUW(X64Reg dest, const OpArg &arg)
{
	WriteSSEOp(0x66, 0xE4, dest, arg);
}
|
||||
|
@ -847,16 +847,31 @@ public:
|
||||
void PSRLW(X64Reg reg, int shift);
|
||||
void PSRLD(X64Reg reg, int shift);
|
||||
void PSRLQ(X64Reg reg, int shift);
|
||||
void PSRLQ(X64Reg reg, OpArg arg);
|
||||
void PSRLDQ(X64Reg reg, int shift);
|
||||
// Note: every 16-bit lane is shifted by the same count, read from the low 64 bits of arg.
|
||||
void PSRLW(X64Reg reg, OpArg arg);
|
||||
// Note: every 32-bit lane is shifted by the same count, read from the low 64 bits of arg.
|
||||
void PSRLD(X64Reg reg, OpArg arg);
|
||||
// Note: both 64-bit lanes are shifted by the same count, read from the low 64 bits of arg.
|
||||
void PSRLQ(X64Reg reg, OpArg arg);
|
||||
|
||||
void PSLLW(X64Reg reg, int shift);
|
||||
void PSLLD(X64Reg reg, int shift);
|
||||
void PSLLQ(X64Reg reg, int shift);
|
||||
void PSLLDQ(X64Reg reg, int shift);
|
||||
// Note: every 16-bit lane is shifted by the same count, read from the low 64 bits of arg.
|
||||
void PSLLW(X64Reg reg, OpArg arg);
|
||||
// Note: every 32-bit lane is shifted by the same count, read from the low 64 bits of arg.
|
||||
void PSLLD(X64Reg reg, OpArg arg);
|
||||
// Note: both 64-bit lanes are shifted by the same count, read from the low 64 bits of arg.
|
||||
void PSLLQ(X64Reg reg, OpArg arg);
|
||||
|
||||
void PSRAW(X64Reg reg, int shift);
|
||||
void PSRAD(X64Reg reg, int shift);
|
||||
// Note: every 16-bit lane is shifted by the same count, read from the low 64 bits of arg.
|
||||
void PSRAW(X64Reg reg, OpArg arg);
|
||||
// Note: every 32-bit lane is shifted by the same count, read from the low 64 bits of arg.
|
||||
void PSRAD(X64Reg reg, OpArg arg);
|
||||
|
||||
void PMULLW(X64Reg dest, const OpArg &arg);
|
||||
void PMULHW(X64Reg dest, const OpArg &arg);
|
||||
|
@ -102,6 +102,7 @@ private:
|
||||
bool Jit_PrepareDataDirectOffsets(const SamplerID &id, Rasterizer::RegCache::Reg uReg, Rasterizer::RegCache::Reg vReg, bool level1, int bitsPerTexel);
|
||||
bool Jit_PrepareDataSwizzledOffsets(const SamplerID &id, Rasterizer::RegCache::Reg uReg, Rasterizer::RegCache::Reg vReg, bool level1, int bitsPerTexel);
|
||||
bool Jit_ReadQuad(const SamplerID &id, bool level1, bool *doFallback);
|
||||
bool Jit_TransformClutIndexQuad(const SamplerID &id, int bitsPerIndex);
|
||||
bool Jit_ReadClutQuad(const SamplerID &id, bool level1);
|
||||
bool Jit_BlendQuad(const SamplerID &id, bool level1);
|
||||
bool Jit_DecodeQuad(const SamplerID &id, bool level1);
|
||||
|
@ -944,7 +944,7 @@ bool SamplerJitCache::Jit_ReadQuad(const SamplerID &id, bool level1, bool *doFal
|
||||
|
||||
bool success = true;
|
||||
// TODO: Limit less.
|
||||
if (cpu_info.bAVX2 && id.TexFmt() == GE_TFMT_CLUT4 && id.ClutFmt() == GE_CMODE_32BIT_ABGR8888 && !id.hasClutMask && !id.hasClutOffset && !id.hasClutShift) {
|
||||
if (cpu_info.bAVX2 && id.TexFmt() == GE_TFMT_CLUT4) {
|
||||
Describe("ReadQuad");
|
||||
|
||||
X64Reg baseReg = regCache_.Alloc(RegCache::GEN_ARG_TEXPTR);
|
||||
@ -970,13 +970,12 @@ bool SamplerJitCache::Jit_ReadQuad(const SamplerID &id, bool level1, bool *doFal
|
||||
VPSRLVD(128, vecIndexReg, vecIndexReg, R(uReg));
|
||||
regCache_.Unlock(uReg, level1 ? RegCache::VEC_U1 : RegCache::VEC_ARG_U);
|
||||
|
||||
// Okay, now we need to mask out just the low four bits.
|
||||
PCMPEQD(maskReg, R(maskReg));
|
||||
PSRLD(maskReg, 28);
|
||||
PAND(vecIndexReg, R(maskReg));
|
||||
regCache_.Release(maskReg, RegCache::VEC_TEMP0);
|
||||
regCache_.Unlock(vecIndexReg, RegCache::VEC_INDEX);
|
||||
|
||||
// Apply mask and any other CLUT transformations.
|
||||
success = success && Jit_TransformClutIndexQuad(id, 4);
|
||||
|
||||
// Great, now we can use our CLUT indices to gather again.
|
||||
success = success && Jit_ReadClutQuad(id, level1);
|
||||
} else {
|
||||
@ -986,6 +985,96 @@ bool SamplerJitCache::Jit_ReadQuad(const SamplerID &id, bool level1, bool *doFal
|
||||
return success;
|
||||
}
|
||||
|
||||
// Emits code that transforms, in place, the CLUT indices held in the 32-bit
// lanes of the VEC_INDEX register: an optional right shift, then a mask, then
// an optional OR of an offset. Which steps are emitted is decided at JIT time
// by id.hasClutShift / id.hasClutMask / id.hasClutOffset; the actual shift,
// mask and offset values are read at runtime from gstate's clutformat word.
//
// id            sampler configuration; selects which steps are emitted.
// bitsPerIndex  width of each raw index as read (the code distinguishes 4,
//               8 and below, above 8, and 32).
//
// Uses VEC_TEMP0 and VEC_TEMP1 as scratch and releases both before returning.
// Always returns true.
bool SamplerJitCache::Jit_TransformClutIndexQuad(const SamplerID &id, int bitsPerIndex) {
|
||||
Describe("TrCLUTQuad");
|
||||
GEPaletteFormat fmt = id.ClutFmt();
|
||||
// Fast path: nothing from clutformat is needed, so no gstate load is emitted.
if (!id.hasClutShift && !id.hasClutMask && !id.hasClutOffset) {
|
||||
// This is simple - just mask.
|
||||
X64Reg indexReg = regCache_.Find(RegCache::VEC_INDEX);
|
||||
// Mask to 8 bits for CLUT8/16/32, 4 bits for CLUT4.
|
||||
// Shifting left then right by the same amount clears the upper bits of each lane.
PSLLD(indexReg, bitsPerIndex >= 8 ? 24 : 28);
|
||||
PSRLD(indexReg, bitsPerIndex >= 8 ? 24 : 28);
|
||||
regCache_.Unlock(indexReg, RegCache::VEC_INDEX);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
X64Reg indexReg = regCache_.Find(RegCache::VEC_INDEX);
|
||||
// Set once the index has already been clamped to bitsPerIndex bits below.
bool maskedIndex = false;
|
||||
|
||||
// Okay, first load the actual gstate clutformat bits we'll use.
|
||||
X64Reg formatReg = regCache_.Alloc(RegCache::VEC_TEMP0);
|
||||
X64Reg gstateReg = GetGState();
|
||||
// Broadcast to every lane right away only when no shift is needed; the shift
// path below wants the value in the low lane only and broadcasts afterwards.
if (cpu_info.bAVX2 && !id.hasClutShift)
|
||||
VPBROADCASTD(128, formatReg, MDisp(gstateReg, offsetof(GPUgstate, clutformat)));
|
||||
else
|
||||
MOVD_xmm(formatReg, MDisp(gstateReg, offsetof(GPUgstate, clutformat)));
|
||||
regCache_.Unlock(gstateReg, RegCache::GEN_GSTATE);
|
||||
|
||||
// Shift = (clutformat >> 2) & 0x1F
|
||||
if (id.hasClutShift) {
|
||||
// Before shifting, let's mask if needed (we always read 32 bits.)
|
||||
// We have to do this here, because the bits should be zero even if F is used as a mask.
|
||||
if (bitsPerIndex < 32) {
|
||||
PSLLD(indexReg, 32 - bitsPerIndex);
|
||||
PSRLD(indexReg, 32 - bitsPerIndex);
|
||||
maskedIndex = true;
|
||||
}
|
||||

||||
X64Reg shiftReg = regCache_.Alloc(RegCache::VEC_TEMP1);
|
||||
// Shift against walls to get 5 bits after the rightmost 2.
|
||||
// Left by 25 drops everything above bit 6; right by 27 then drops bits 0-1.
if (cpu_info.bAVX) {
|
||||
VPSLLD(128, shiftReg, formatReg, 32 - 7);
|
||||
} else {
|
||||
MOVDQA(shiftReg, R(formatReg));
|
||||
PSLLD(shiftReg, 32 - 7);
|
||||
}
|
||||
PSRLD(shiftReg, 32 - 5);
|
||||
// The other lanes are zero, so we can use PSRLD.
|
||||
PSRLD(indexReg, R(shiftReg));
|
||||
regCache_.Release(shiftReg, RegCache::VEC_TEMP1);
|
||||
}
|
||||

||||
// With shifting done, we need the format in each lane.
|
||||
// Skipped only when the AVX2 broadcast load above already filled every lane.
if (!cpu_info.bAVX2 || id.hasClutShift)
|
||||
PSHUFD(formatReg, R(formatReg), _MM_SHUFFLE(0, 0, 0, 0));
|
||||

||||
// Mask = (clutformat >> 8) & 0xFF
|
||||
if (id.hasClutMask) {
|
||||
X64Reg maskReg = regCache_.Alloc(RegCache::VEC_TEMP1);
|
||||
// If it was CLUT4, grab only 4 bits of the mask.
|
||||
// Left shift discards the bits above the mask field; the right shift below
// then brings the field (bits 8-11 or 8-15) down to bit 0.
if (cpu_info.bAVX) {
|
||||
VPSLLD(128, maskReg, formatReg, bitsPerIndex == 4 ? 20 : 16);
|
||||
} else {
|
||||
MOVDQA(maskReg, R(formatReg));
|
||||
PSLLD(maskReg, bitsPerIndex == 4 ? 20 : 16);
|
||||
}
|
||||
PSRLD(maskReg, bitsPerIndex == 4 ? 28 : 24);
|
||||

||||
PAND(indexReg, R(maskReg));
|
||||
regCache_.Release(maskReg, RegCache::VEC_TEMP1);
|
||||
// No runtime mask: still clamp, unless the pre-shift masking above already
// limited the index to 8 bits or fewer.
} else if (!maskedIndex || bitsPerIndex > 8) {
|
||||
// Apply the fixed 8 bit mask (or the CLUT4 mask if we didn't shift.)
|
||||
PSLLD(indexReg, maskedIndex || bitsPerIndex >= 8 ? 24 : 28);
|
||||
PSRLD(indexReg, maskedIndex || bitsPerIndex >= 8 ? 24 : 28);
|
||||
}
|
||||

||||
// Offset = (clutformat >> 12) & 0x01F0
|
||||
if (id.hasClutOffset) {
|
||||
// Use walls to extract the 5 bits at 16, and then put them shifted left by 4.
|
||||
// Only 4 offset bits are kept for 32-bit ABGR8888 palettes, 5 otherwise.
int offsetBits = fmt == GE_CMODE_32BIT_ABGR8888 ? 4 : 5;
|
||||
// After these three shifts the kept bits start at bit 4 of each lane.
PSRLD(formatReg, 16);
|
||||
PSLLD(formatReg, 32 - offsetBits);
|
||||
PSRLD(formatReg, 32 - offsetBits - 4);
|
||||

||||
POR(indexReg, R(formatReg));
|
||||
}
|
||||

||||
regCache_.Release(formatReg, RegCache::VEC_TEMP0);
|
||||
regCache_.Unlock(indexReg, RegCache::VEC_INDEX);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool SamplerJitCache::Jit_ReadClutQuad(const SamplerID &id, bool level1) {
|
||||
Describe("ReadCLUTQuad");
|
||||
X64Reg indexReg = regCache_.Find(RegCache::VEC_INDEX);
|
||||
|
Loading…
x
Reference in New Issue
Block a user