samplerjit: Use SSSE3/SSE4 in linear filtering.

This commit is contained in:
Unknown W. Brackets 2021-12-30 15:23:48 -08:00
parent 7aa9664d20
commit 1addf84e90
4 changed files with 188 additions and 102 deletions

View File

@ -1851,6 +1851,7 @@ void XEmitter::PINSRB(X64Reg dest, OpArg arg, u8 subreg) {WriteSSE41Op(0x66,
void XEmitter::PINSRD(X64Reg dest, OpArg arg, u8 subreg) {WriteSSE41Op(0x66, 0x3A22, dest, arg, 1); Write8(subreg);}
void XEmitter::PMADDWD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xF5, dest, arg); }
void XEmitter::PMADDUBSW(X64Reg dest, OpArg arg) {WriteSSSE3Op(0x66, 0x3804, dest, arg);}
void XEmitter::PSADBW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xF6, dest, arg);}
void XEmitter::PMAXSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEE, dest, arg); }

View File

@ -820,6 +820,7 @@ public:
void PINSRD(X64Reg dest, OpArg arg, u8 subreg);
void PMADDWD(X64Reg dest, OpArg arg);
void PMADDUBSW(X64Reg dest, OpArg arg);
void PSADBW(X64Reg dest, OpArg arg);
void PMAXSW(X64Reg dest, OpArg arg);

View File

@ -113,8 +113,9 @@ private:
const u8 *constVNext_ = nullptr;
const u8 *constOnes32_ = nullptr;
const u8 *constOnes16_ = nullptr;
const u8 *const10All16_ = nullptr;
const u8 *const10Low_ = nullptr;
const u8 *const10All_ = nullptr;
const u8 *const10All8_ = nullptr;
std::unordered_map<SamplerID, NearestFunc> cache_;
std::unordered_map<SamplerID, const u8 *> addresses_;

View File

@ -178,7 +178,7 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
regCache_.Reset(true);
// Let's drop some helpful constants here.
const10All_ = AlignCode16();
const10All16_ = AlignCode16();
Write16(0x10); Write16(0x10); Write16(0x10); Write16(0x10);
Write16(0x10); Write16(0x10); Write16(0x10); Write16(0x10);
@ -186,6 +186,10 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
Write16(0x10); Write16(0x10); Write16(0x10); Write16(0x10);
Write16(0); Write16(0); Write16(0); Write16(0);
const10All8_ = AlignCode16();
for (int i = 0; i < 16; ++i)
Write8(0x10);
if (!id.hasAnyMips) {
constWidth256f_ = AlignCode16();
float w256f = (1 << id.width0Shift) * 256;
@ -502,7 +506,7 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
// Okay, next we need an inverse for color 0.
X64Reg invFracReg = regCache_.Alloc(RegCache::VEC_TEMP1);
MOVDQA(invFracReg, M(const10All_));
MOVDQA(invFracReg, M(const10All16_));
PSUBW(invFracReg, R(fracReg));
// And multiply.
@ -587,110 +591,189 @@ RegCache::Reg SamplerJitCache::GetGState() {
bool SamplerJitCache::Jit_BlendQuad(const SamplerID &id, bool level1) {
Describe(level1 ? "BlendQuadMips" : "BlendQuad");
// First put the top RRRRRRRR LLLLLLLL into topReg, bottom into bottomReg.
// Start with XXXX XXXX RRRR LLLL, and then expand 8 bits to 16 bits.
// TODO: Maybe we can use PMADDUBSW?
X64Reg topReg = regCache_.Alloc(RegCache::VEC_TEMP0);
X64Reg bottomReg = regCache_.Alloc(RegCache::VEC_TEMP1);
X64Reg quadReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
if (!cpu_info.bSSE4_1) {
if (cpu_info.bSSE4_1 && cpu_info.bSSSE3) {
// Let's start by rearranging from TL TR BL BR like this:
// ABCD EFGH IJKL MNOP -> AI BJ CK DL EM FN GO HP -> AIEM BJFN CKGO DLHP
// This way, all the RGBAs are next to each other, and in order TL BL TR BR.
X64Reg quadReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
X64Reg tempArrangeReg = regCache_.Alloc(RegCache::VEC_TEMP0);
PSHUFD(tempArrangeReg, R(quadReg), _MM_SHUFFLE(3, 2, 3, 2));
PUNPCKLBW(quadReg, R(tempArrangeReg));
// Okay, that's top and bottom interleaved, now for left and right.
PSHUFD(tempArrangeReg, R(quadReg), _MM_SHUFFLE(3, 2, 3, 2));
PUNPCKLWD(quadReg, R(tempArrangeReg));
regCache_.Release(tempArrangeReg, RegCache::VEC_TEMP0);
// Next up, we want to multiply and add using a repeated TB frac pair.
// That's (0x10 - frac_v) in byte 1, frac_v in byte 2, repeating.
X64Reg fracReg = regCache_.Alloc(RegCache::VEC_TEMP0);
X64Reg zeroReg = GetZeroVec();
PSHUFD(topReg, R(quadReg), _MM_SHUFFLE(0, 0, 1, 0));
PSHUFD(bottomReg, R(quadReg), _MM_SHUFFLE(0, 0, 3, 2));
PUNPCKLBW(topReg, R(zeroReg));
PUNPCKLBW(bottomReg, R(zeroReg));
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
} else {
PSHUFD(bottomReg, R(quadReg), _MM_SHUFFLE(0, 0, 3, 2));
PMOVZXBW(topReg, R(quadReg));
PMOVZXBW(bottomReg, R(bottomReg));
}
if (!level1) {
regCache_.Unlock(quadReg, RegCache::VEC_RESULT);
regCache_.ForceRelease(RegCache::VEC_RESULT);
}
// Grab frac_u and spread to lower (L) lanes.
X64Reg fracReg = regCache_.Alloc(RegCache::VEC_TEMP2);
X64Reg fracMulReg = regCache_.Alloc(RegCache::VEC_TEMP3);
if (level1) {
MOVD_xmm(fracReg, MDisp(RSP, stackArgPos_ + stackFracUV1Offset_));
} else {
X64Reg fracUReg = regCache_.Find(RegCache::GEN_ARG_FRAC_U);
MOVD_xmm(fracReg, R(fracUReg));
regCache_.Unlock(fracUReg, RegCache::GEN_ARG_FRAC_U);
regCache_.ForceRelease(RegCache::GEN_ARG_FRAC_U);
}
PSHUFLW(fracReg, R(fracReg), _MM_SHUFFLE(0, 0, 0, 0));
// Now subtract 0x10 - frac_u in the L lanes only: 00000000 LLLLLLLL.
MOVDQA(fracMulReg, M(const10Low_));
PSUBW(fracMulReg, R(fracReg));
// Then we just shift and OR in the original frac_u.
PSLLDQ(fracReg, 8);
POR(fracMulReg, R(fracReg));
regCache_.Release(fracReg, RegCache::VEC_TEMP2);
// Okay, we have 8-bits in the top and bottom rows for the color.
// Multiply by frac to get 12, which we keep for the next stage.
PMULLW(topReg, R(fracMulReg));
PMULLW(bottomReg, R(fracMulReg));
regCache_.Release(fracMulReg, RegCache::VEC_TEMP3);
// Time for frac_v. This time, we want it in all 8 lanes.
fracReg = regCache_.Alloc(RegCache::VEC_TEMP2);
X64Reg fracTopReg = regCache_.Alloc(RegCache::VEC_TEMP3);
if (level1) {
MOVD_xmm(fracReg, MDisp(RSP, stackArgPos_ + stackFracUV1Offset_ + 4));
} else {
X64Reg fracVReg = regCache_.Find(RegCache::GEN_ARG_FRAC_V);
MOVD_xmm(fracReg, R(fracVReg));
regCache_.Unlock(fracVReg, RegCache::GEN_ARG_FRAC_V);
regCache_.ForceRelease(RegCache::GEN_ARG_FRAC_V);
}
PSHUFLW(fracReg, R(fracReg), _MM_SHUFFLE(0, 0, 0, 0));
PSHUFD(fracReg, R(fracReg), _MM_SHUFFLE(0, 0, 0, 0));
// Now, inverse fracReg into fracTopReg for the top row.
MOVDQA(fracTopReg, M(const10All_));
PSUBW(fracTopReg, R(fracReg));
// We had 12, plus 4 frac, that gives us 16.
PMULLW(bottomReg, R(fracReg));
PMULLW(topReg, R(fracTopReg));
regCache_.Release(fracReg, RegCache::VEC_TEMP2);
regCache_.Release(fracTopReg, RegCache::VEC_TEMP3);
// Finally, time to sum them all up and divide by 256 to get back to 8 bits.
PADDUSW(bottomReg, R(topReg));
regCache_.Release(topReg, RegCache::VEC_TEMP0);
bool changeSuccess = true;
if (level1) {
PSHUFD(quadReg, R(bottomReg), _MM_SHUFFLE(3, 2, 3, 2));
PADDUSW(quadReg, R(bottomReg));
PSRLW(quadReg, 8);
regCache_.Release(bottomReg, RegCache::VEC_TEMP1);
regCache_.Unlock(quadReg, RegCache::VEC_RESULT1);
} else {
changeSuccess = regCache_.ChangeReg(XMM0, RegCache::VEC_RESULT);
if (!changeSuccess) {
_assert_msg_(XMM0 == bottomReg, "Unexpected other reg locked as destReg");
X64Reg otherReg = regCache_.Alloc(RegCache::VEC_TEMP0);
PSHUFD(otherReg, R(bottomReg), _MM_SHUFFLE(3, 2, 3, 2));
PADDUSW(bottomReg, R(otherReg));
regCache_.Release(otherReg, RegCache::VEC_TEMP0);
regCache_.Release(bottomReg, RegCache::VEC_TEMP1);
// Okay, now it can be changed.
regCache_.ChangeReg(XMM0, RegCache::VEC_RESULT);
if (level1) {
MOVD_xmm(fracReg, MDisp(RSP, stackArgPos_ + stackFracUV1Offset_ + 4));
} else {
PSHUFD(XMM0, R(bottomReg), _MM_SHUFFLE(3, 2, 3, 2));
PADDUSW(XMM0, R(bottomReg));
regCache_.Release(bottomReg, RegCache::VEC_TEMP1);
X64Reg fracVReg = regCache_.Find(RegCache::GEN_ARG_FRAC_V);
MOVD_xmm(fracReg, R(fracVReg));
regCache_.Unlock(fracVReg, RegCache::GEN_ARG_FRAC_V);
regCache_.ForceRelease(RegCache::GEN_ARG_FRAC_V);
}
PSHUFB(fracReg, R(zeroReg));
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
// Now, inverse fracReg, then interleave into the actual multiplier.
// This gives us the repeated TB pairs we wanted.
X64Reg multTBReg = regCache_.Alloc(RegCache::VEC_TEMP1);
MOVDQA(multTBReg, M(const10All8_));
PSUBB(multTBReg, R(fracReg));
PUNPCKLBW(multTBReg, R(fracReg));
regCache_.Release(fracReg, RegCache::VEC_TEMP0);
// Now we can multiply and add paired lanes in one go.
// Note that since T+B=0x10, this gives us exactly 12 bits.
PMADDUBSW(quadReg, R(multTBReg));
regCache_.Release(multTBReg, RegCache::VEC_TEMP1);
// With that done, we need to multiply by LR, or rather 0L0R, and sum again.
// Since RRRR was all next to each other, this gives us a clean total R.
fracReg = regCache_.Alloc(RegCache::VEC_TEMP0);
if (level1) {
MOVD_xmm(fracReg, MDisp(RSP, stackArgPos_ + stackFracUV1Offset_));
} else {
X64Reg fracUReg = regCache_.Find(RegCache::GEN_ARG_FRAC_U);
MOVD_xmm(fracReg, R(fracUReg));
regCache_.Unlock(fracUReg, RegCache::GEN_ARG_FRAC_U);
regCache_.ForceRelease(RegCache::GEN_ARG_FRAC_U);
}
// We can ignore the high bits, since we'll interleave those away anyway.
PSHUFLW(fracReg, R(fracReg), _MM_SHUFFLE(0, 0, 0, 0));
// Again, we're inversing into an interleaved multiplier. L is the inversed one.
// 0L0R is (0x10 - frac_u), frac_u - 2x16 repeated four times.
X64Reg multLRReg = regCache_.Alloc(RegCache::VEC_TEMP1);
MOVDQA(multLRReg, M(const10All16_));
PSUBW(multLRReg, R(fracReg));
PUNPCKLWD(multLRReg, R(fracReg));
regCache_.Release(fracReg, RegCache::VEC_TEMP0);
// This gives us RGBA as dwords, but they're all shifted left by 8 from the multiplies.
PMADDWD(quadReg, R(multLRReg));
PSRLD(quadReg, 8);
regCache_.Release(multLRReg, RegCache::VEC_TEMP1);
// Shrink to 16-bit, it's more convenient for later.
PACKSSDW(quadReg, R(quadReg));
if (level1) {
regCache_.Unlock(quadReg, RegCache::VEC_RESULT1);
} else {
MOVDQA(XMM0, R(quadReg));
regCache_.Unlock(quadReg, RegCache::VEC_RESULT);
regCache_.ForceRelease(RegCache::VEC_RESULT);
bool changeSuccess = regCache_.ChangeReg(XMM0, RegCache::VEC_RESULT);
_assert_msg_(changeSuccess, "Unexpected reg locked as destReg");
}
} else {
X64Reg topReg = regCache_.Alloc(RegCache::VEC_TEMP0);
X64Reg bottomReg = regCache_.Alloc(RegCache::VEC_TEMP1);
X64Reg quadReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
if (!cpu_info.bSSE4_1) {
X64Reg zeroReg = GetZeroVec();
PSHUFD(topReg, R(quadReg), _MM_SHUFFLE(0, 0, 1, 0));
PSHUFD(bottomReg, R(quadReg), _MM_SHUFFLE(0, 0, 3, 2));
PUNPCKLBW(topReg, R(zeroReg));
PUNPCKLBW(bottomReg, R(zeroReg));
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
} else {
PSHUFD(bottomReg, R(quadReg), _MM_SHUFFLE(0, 0, 3, 2));
PMOVZXBW(topReg, R(quadReg));
PMOVZXBW(bottomReg, R(bottomReg));
}
if (!level1) {
regCache_.Unlock(quadReg, RegCache::VEC_RESULT);
regCache_.ForceRelease(RegCache::VEC_RESULT);
}
PSRLW(XMM0, 8);
// Grab frac_u and spread to lower (L) lanes.
X64Reg fracReg = regCache_.Alloc(RegCache::VEC_TEMP2);
X64Reg fracMulReg = regCache_.Alloc(RegCache::VEC_TEMP3);
if (level1) {
MOVD_xmm(fracReg, MDisp(RSP, stackArgPos_ + stackFracUV1Offset_));
} else {
X64Reg fracUReg = regCache_.Find(RegCache::GEN_ARG_FRAC_U);
MOVD_xmm(fracReg, R(fracUReg));
regCache_.Unlock(fracUReg, RegCache::GEN_ARG_FRAC_U);
regCache_.ForceRelease(RegCache::GEN_ARG_FRAC_U);
}
PSHUFLW(fracReg, R(fracReg), _MM_SHUFFLE(0, 0, 0, 0));
// Now subtract 0x10 - frac_u in the L lanes only: 00000000 LLLLLLLL.
MOVDQA(fracMulReg, M(const10Low_));
PSUBW(fracMulReg, R(fracReg));
// Then we just shift and OR in the original frac_u.
PSLLDQ(fracReg, 8);
POR(fracMulReg, R(fracReg));
regCache_.Release(fracReg, RegCache::VEC_TEMP2);
// Okay, we have 8-bits in the top and bottom rows for the color.
// Multiply by frac to get 12, which we keep for the next stage.
PMULLW(topReg, R(fracMulReg));
PMULLW(bottomReg, R(fracMulReg));
regCache_.Release(fracMulReg, RegCache::VEC_TEMP3);
// Time for frac_v. This time, we want it in all 8 lanes.
fracReg = regCache_.Alloc(RegCache::VEC_TEMP2);
X64Reg fracTopReg = regCache_.Alloc(RegCache::VEC_TEMP3);
if (level1) {
MOVD_xmm(fracReg, MDisp(RSP, stackArgPos_ + stackFracUV1Offset_ + 4));
} else {
X64Reg fracVReg = regCache_.Find(RegCache::GEN_ARG_FRAC_V);
MOVD_xmm(fracReg, R(fracVReg));
regCache_.Unlock(fracVReg, RegCache::GEN_ARG_FRAC_V);
regCache_.ForceRelease(RegCache::GEN_ARG_FRAC_V);
}
PSHUFLW(fracReg, R(fracReg), _MM_SHUFFLE(0, 0, 0, 0));
PSHUFD(fracReg, R(fracReg), _MM_SHUFFLE(0, 0, 0, 0));
// Now, inverse fracReg into fracTopReg for the top row.
MOVDQA(fracTopReg, M(const10All16_));
PSUBW(fracTopReg, R(fracReg));
// We had 12, plus 4 frac, that gives us 16.
PMULLW(bottomReg, R(fracReg));
PMULLW(topReg, R(fracTopReg));
regCache_.Release(fracReg, RegCache::VEC_TEMP2);
regCache_.Release(fracTopReg, RegCache::VEC_TEMP3);
// Finally, time to sum them all up and divide by 256 to get back to 8 bits.
PADDUSW(bottomReg, R(topReg));
regCache_.Release(topReg, RegCache::VEC_TEMP0);
if (level1) {
PSHUFD(quadReg, R(bottomReg), _MM_SHUFFLE(3, 2, 3, 2));
PADDUSW(quadReg, R(bottomReg));
PSRLW(quadReg, 8);
regCache_.Release(bottomReg, RegCache::VEC_TEMP1);
regCache_.Unlock(quadReg, RegCache::VEC_RESULT1);
} else {
bool changeSuccess = regCache_.ChangeReg(XMM0, RegCache::VEC_RESULT);
if (!changeSuccess) {
_assert_msg_(XMM0 == bottomReg, "Unexpected other reg locked as destReg");
X64Reg otherReg = regCache_.Alloc(RegCache::VEC_TEMP0);
PSHUFD(otherReg, R(bottomReg), _MM_SHUFFLE(3, 2, 3, 2));
PADDUSW(bottomReg, R(otherReg));
regCache_.Release(otherReg, RegCache::VEC_TEMP0);
regCache_.Release(bottomReg, RegCache::VEC_TEMP1);
// Okay, now it can be changed.
regCache_.ChangeReg(XMM0, RegCache::VEC_RESULT);
} else {
PSHUFD(XMM0, R(bottomReg), _MM_SHUFFLE(3, 2, 3, 2));
PADDUSW(XMM0, R(bottomReg));
regCache_.Release(bottomReg, RegCache::VEC_TEMP1);
}
PSRLW(XMM0, 8);
}
}
return true;