samplerjit: Use VPGATHERDD for simple CLUT4 loads.

Planning to expand this to more paths.
This commit is contained in:
Unknown W. Brackets 2022-01-02 10:45:03 -08:00
parent 65c84d5dd5
commit 22f770c828
5 changed files with 185 additions and 21 deletions

View File

@ -2240,20 +2240,20 @@ void XEmitter::VGATHERQPD(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {
_assert_msg_(regOp1 != regOp2 && !arg.IsIndexedReg(regOp1) && !arg.IsIndexedReg(regOp2), "VGATHER cannot have overlapped registers");
WriteAVX2Op(bits, 0x66, 0x3893, regOp1, regOp2, arg);
}
// Emits a gather through WriteAVX2Op using the 0x66 prefix and opcode 0x3890.
// The assert rejects regOp1 == regOp2, and rejects either register also being
// the index register of `arg`: the three registers involved must be distinct.
// NOTE(review): this diff view shows the pre-rename VGATHERDD signature/assert
// immediately followed by the renamed VPGATHERDD pair; presumably only the
// VPGATHERDD lines remain after the commit - confirm against the real file.
void XEmitter::VGATHERDD(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {
_assert_msg_(regOp1 != regOp2 && !arg.IsIndexedReg(regOp1) && !arg.IsIndexedReg(regOp2), "VGATHER cannot have overlapped registers");
void XEmitter::VPGATHERDD(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {
_assert_msg_(regOp1 != regOp2 && !arg.IsIndexedReg(regOp1) && !arg.IsIndexedReg(regOp2), "VPGATHER cannot have overlapped registers");
WriteAVX2Op(bits, 0x66, 0x3890, regOp1, regOp2, arg);
}
// Emits a gather through WriteAVX2Op using the 0x66 prefix and opcode 0x3891.
// The assert rejects regOp1 == regOp2, and rejects either register also being
// the index register of `arg`: the three registers involved must be distinct.
// NOTE(review): this diff view shows the pre-rename VGATHERQD signature/assert
// immediately followed by the renamed VPGATHERQD pair; presumably only the
// VPGATHERQD lines remain after the commit - confirm against the real file.
void XEmitter::VGATHERQD(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {
_assert_msg_(regOp1 != regOp2 && !arg.IsIndexedReg(regOp1) && !arg.IsIndexedReg(regOp2), "VGATHER cannot have overlapped registers");
void XEmitter::VPGATHERQD(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {
_assert_msg_(regOp1 != regOp2 && !arg.IsIndexedReg(regOp1) && !arg.IsIndexedReg(regOp2), "VPGATHER cannot have overlapped registers");
WriteAVX2Op(bits, 0x66, 0x3891, regOp1, regOp2, arg);
}
// Emits a gather through WriteAVX2Op using the 0x66 prefix and opcode 0x3890,
// passing two extra trailing arguments (0, 1) that the 32-bit element variant
// omits. NOTE(review): the final 1 presumably selects the 64-bit element form
// (VEX.W) - confirm against WriteAVX2Op's parameter list.
// The assert rejects regOp1 == regOp2, and rejects either register also being
// the index register of `arg`: the three registers involved must be distinct.
// NOTE(review): this diff view shows the pre-rename VGATHERDQ signature/assert
// immediately followed by the renamed VPGATHERDQ pair; presumably only the
// VPGATHERDQ lines remain after the commit - confirm against the real file.
void XEmitter::VGATHERDQ(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {
_assert_msg_(regOp1 != regOp2 && !arg.IsIndexedReg(regOp1) && !arg.IsIndexedReg(regOp2), "VGATHER cannot have overlapped registers");
void XEmitter::VPGATHERDQ(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {
_assert_msg_(regOp1 != regOp2 && !arg.IsIndexedReg(regOp1) && !arg.IsIndexedReg(regOp2), "VPGATHER cannot have overlapped registers");
WriteAVX2Op(bits, 0x66, 0x3890, regOp1, regOp2, arg, 0, 1);
}
// Emits a gather through WriteAVX2Op using the 0x66 prefix and opcode 0x3891,
// passing two extra trailing arguments (0, 1) that the 32-bit element variant
// omits. NOTE(review): the final 1 presumably selects the 64-bit element form
// (VEX.W) - confirm against WriteAVX2Op's parameter list.
// The assert rejects regOp1 == regOp2, and rejects either register also being
// the index register of `arg`: the three registers involved must be distinct.
// NOTE(review): this diff view shows the pre-rename VGATHERQQ signature/assert
// immediately followed by the renamed VPGATHERQQ pair; presumably only the
// VPGATHERQQ lines remain after the commit - confirm against the real file.
void XEmitter::VGATHERQQ(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {
_assert_msg_(regOp1 != regOp2 && !arg.IsIndexedReg(regOp1) && !arg.IsIndexedReg(regOp2), "VGATHER cannot have overlapped registers");
void XEmitter::VPGATHERQQ(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {
_assert_msg_(regOp1 != regOp2 && !arg.IsIndexedReg(regOp1) && !arg.IsIndexedReg(regOp2), "VPGATHER cannot have overlapped registers");
WriteAVX2Op(bits, 0x66, 0x3891, regOp1, regOp2, arg, 0, 1);
}

View File

@ -1239,10 +1239,10 @@ public:
void VGATHERDPD(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
void VGATHERQPS(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
void VGATHERQPD(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
void VGATHERDD(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
void VGATHERQD(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
void VGATHERDQ(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
void VGATHERQQ(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
void VPGATHERDD(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
void VPGATHERQD(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
void VPGATHERDQ(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
void VPGATHERQQ(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
void VPSLLVD(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg);
void VPSLLVQ(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg);

View File

@ -103,6 +103,7 @@ struct RegCache {
VEC_RESULT1 = 0x0002,
VEC_U1 = 0x0003,
VEC_V1 = 0x0004,
VEC_INDEX = 0x0005,
GEN_SRC_ALPHA = 0x0100,
GEN_GSTATE = 0x0101,

View File

@ -101,6 +101,8 @@ private:
bool Jit_PrepareDataOffsets(const SamplerID &id, Rasterizer::RegCache::Reg uReg, Rasterizer::RegCache::Reg vReg, bool level1);
bool Jit_PrepareDataDirectOffsets(const SamplerID &id, Rasterizer::RegCache::Reg uReg, Rasterizer::RegCache::Reg vReg, bool level1, int bitsPerTexel);
bool Jit_PrepareDataSwizzledOffsets(const SamplerID &id, Rasterizer::RegCache::Reg uReg, Rasterizer::RegCache::Reg vReg, bool level1, int bitsPerTexel);
bool Jit_ReadQuad(const SamplerID &id, bool level1, bool *doFallback);
bool Jit_ReadClutQuad(const SamplerID &id, bool level1);
bool Jit_BlendQuad(const SamplerID &id, bool level1);
bool Jit_DecodeQuad(const SamplerID &id, bool level1);
bool Jit_Decode5650Quad(const SamplerID &id, Rasterizer::RegCache::Reg quadReg);

View File

@ -686,11 +686,15 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
regCache_.Unlock(vecResultReg, level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
};
Describe("Calls");
doNearestCall(0, false);
doNearestCall(4, false);
doNearestCall(8, false);
doNearestCall(12, false);
bool readFallback;
success = success && Jit_ReadQuad(id, false, &readFallback);
if (readFallback) {
Describe("Calls");
doNearestCall(0, false);
doNearestCall(4, false);
doNearestCall(8, false);
doNearestCall(12, false);
}
if (id.hasAnyMips) {
Describe("MipsCalls");
@ -713,10 +717,14 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
ADD(32, MDisp(RSP, stackArgPos_ + 16), Imm8(1));
}
doNearestCall(0, true);
doNearestCall(4, true);
doNearestCall(8, true);
doNearestCall(12, true);
success = success && Jit_ReadQuad(id, true, &readFallback);
if (readFallback) {
Describe("Calls");
doNearestCall(0, true);
doNearestCall(4, true);
doNearestCall(8, true);
doNearestCall(12, true);
}
SetJumpTarget(skip);
}
@ -931,6 +939,159 @@ RegCache::Reg SamplerJitCache::GetGState() {
return regCache_.Find(RegCache::GEN_GSTATE);
}
bool SamplerJitCache::Jit_ReadQuad(const SamplerID &id, bool level1, bool *doFallback) {
*doFallback = false;
bool success = true;
// TODO: Limit less.
if (cpu_info.bAVX2 && id.TexFmt() == GE_TFMT_CLUT4 && id.ClutFmt() == GE_CMODE_32BIT_ABGR8888 && !id.hasClutMask && !id.hasClutOffset && !id.hasClutShift) {
Describe("ReadQuad");
X64Reg baseReg = regCache_.Alloc(RegCache::GEN_ARG_TEXPTR);
X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR_PTR);
MOV(64, R(baseReg), MDisp(srcReg, level1 ? 8 : 0));
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR_PTR);
X64Reg vecIndexReg = regCache_.Alloc(RegCache::VEC_INDEX);
X64Reg maskReg = regCache_.Alloc(RegCache::VEC_TEMP0);
// We have to set a mask for which values to load. Load all 4.
// Note this is overwritten with zeroes by the instruction.
PCMPEQD(maskReg, R(maskReg));
X64Reg indexReg = regCache_.Find(level1 ? RegCache::VEC_V1 : RegCache::VEC_ARG_V);
VPGATHERDD(128, vecIndexReg, MComplex(baseReg, indexReg, SCALE_1, 0), maskReg);
regCache_.Unlock(indexReg, level1 ? RegCache::VEC_V1 : RegCache::VEC_ARG_V);
regCache_.Release(baseReg, RegCache::GEN_ARG_TEXPTR);
// Take only lowest bit, multiply by 4 with shifting.
X64Reg uReg = regCache_.Find(level1 ? RegCache::VEC_U1 : RegCache::VEC_ARG_U);
PSLLD(uReg, 31);
PSRLD(uReg, 29);
// Next, shift away based on the odd U bits.
VPSRLVD(128, vecIndexReg, vecIndexReg, R(uReg));
regCache_.Unlock(uReg, level1 ? RegCache::VEC_U1 : RegCache::VEC_ARG_U);
// Okay, now we need to mask out just the low four bits.
PCMPEQD(maskReg, R(maskReg));
PSRLD(maskReg, 28);
PAND(vecIndexReg, R(maskReg));
regCache_.Release(maskReg, RegCache::VEC_TEMP0);
regCache_.Unlock(vecIndexReg, RegCache::VEC_INDEX);
// Great, now we can use our CLUT indices to gather again.
success = success && Jit_ReadClutQuad(id, level1);
} else {
// TODO
*doFallback = true;
}
return success;
}
bool SamplerJitCache::Jit_ReadClutQuad(const SamplerID &id, bool level1) {
Describe("ReadCLUTQuad");
X64Reg indexReg = regCache_.Find(RegCache::VEC_INDEX);
if (!id.useSharedClut) {
X64Reg vecLevelReg = regCache_.Alloc(RegCache::VEC_TEMP0);
if (regCache_.Has(RegCache::GEN_ARG_LEVEL)) {
X64Reg levelReg = regCache_.Find(RegCache::GEN_ARG_LEVEL);
MOVD_xmm(vecLevelReg, R(levelReg));
regCache_.Unlock(levelReg, RegCache::GEN_ARG_LEVEL);
} else {
#if PPSSPP_PLATFORM(WINDOWS)
if (cpu_info.bAVX2) {
VPBROADCASTD(128, vecLevelReg, MDisp(RSP, stackArgPos_ + 16));
} else {
MOVD_xmm(vecLevelReg, MDisp(RSP, stackArgPos_ + 16));
PSHUFD(vecLevelReg, R(vecLevelReg), _MM_SHUFFLE(0, 0, 0, 0));
}
#else
_assert_(false);
#endif
}
// Now we multiply by 16, and add.
PSLLD(vecLevelReg, 4);
PADDD(indexReg, R(vecLevelReg));
regCache_.Release(vecLevelReg, RegCache::VEC_TEMP0);
}
X64Reg clutBaseReg = regCache_.Alloc(RegCache::GEN_TEMP1);
MOV(PTRBITS, R(clutBaseReg), ImmPtr(clut));
X64Reg resultReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
X64Reg maskReg = regCache_.Alloc(RegCache::VEC_TEMP0);
if (cpu_info.bAVX2)
PCMPEQD(maskReg, R(maskReg));
switch (id.ClutFmt()) {
case GE_CMODE_16BIT_BGR5650:
case GE_CMODE_16BIT_ABGR5551:
case GE_CMODE_16BIT_ABGR4444:
if (cpu_info.bAVX2) {
VPGATHERDD(128, resultReg, MComplex(clutBaseReg, indexReg, SCALE_2, 0), maskReg);
// Clear out the top 16 bits.
PCMPEQD(maskReg, R(maskReg));
PSRLD(maskReg, 16);
PAND(resultReg, R(maskReg));
} else {
PXOR(resultReg, R(resultReg));
X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
if (cpu_info.bSSE4_1) {
for (int i = 0; i < 4; ++i) {
PEXTRD(R(temp2Reg), indexReg, i);
PINSRW(resultReg, MComplex(clutBaseReg, temp2Reg, SCALE_2, 0), i * 2);
}
} else {
for (int i = 0; i < 4; ++i) {
MOVD_xmm(R(temp2Reg), indexReg);
if (i != 3)
PSRLDQ(indexReg, 4);
PINSRW(resultReg, MComplex(clutBaseReg, temp2Reg, SCALE_2, 0), i * 2);
}
}
regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
}
break;
case GE_CMODE_32BIT_ABGR8888:
if (cpu_info.bAVX2) {
VPGATHERDD(128, resultReg, MComplex(clutBaseReg, indexReg, SCALE_4, 0), maskReg);
} else {
X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
if (cpu_info.bSSE4_1) {
for (int i = 0; i < 4; ++i) {
PEXTRD(R(temp2Reg), indexReg, i);
PINSRD(resultReg, MComplex(clutBaseReg, temp2Reg, SCALE_4, 0), i);
}
} else {
for (int i = 0; i < 4; ++i) {
MOVD_xmm(R(temp2Reg), indexReg);
if (i != 3)
PSRLDQ(indexReg, 4);
if (i == 0) {
MOVD_xmm(resultReg , MComplex(clutBaseReg, temp2Reg, SCALE_4, 0));
} else {
MOVD_xmm(maskReg, MComplex(clutBaseReg, temp2Reg, SCALE_4, 0));
PSLLDQ(maskReg, 4 * i);
POR(resultReg, R(maskReg));
}
}
}
regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
}
break;
}
regCache_.Release(maskReg, RegCache::VEC_TEMP0);
regCache_.Unlock(resultReg, level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
regCache_.Release(clutBaseReg, RegCache::GEN_TEMP1);
regCache_.Release(indexReg, RegCache::VEC_INDEX);
return true;
}
bool SamplerJitCache::Jit_BlendQuad(const SamplerID &id, bool level1) {
Describe(level1 ? "BlendQuadMips" : "BlendQuad");