samplerjit: Calculate mip level U/V/offsets.

Not actually doing the sampling for the second mip level in the single jit
pass yet, but close.
Unknown W. Brackets 2021-12-28 14:12:58 -08:00
parent a4558a5736
commit cdf14c8579
3 changed files with 330 additions and 164 deletions

View File

@@ -100,6 +100,9 @@ struct RegCache {
VEC_ZERO = 0x0000,
VEC_RESULT = 0x0001,
VEC_RESULT1 = 0x0002,
VEC_U1 = 0x0003,
VEC_V1 = 0x0004,
GEN_SRC_ALPHA = 0x0100,
GEN_GSTATE = 0x0101,

View File

@@ -86,13 +86,15 @@ private:
bool Jit_ApplyDXTAlpha(const SamplerID &id);
bool Jit_GetTexelCoordsQuad(const SamplerID &id);
bool Jit_PrepareDataOffsets(const SamplerID &id);
bool Jit_PrepareDataSwizzledOffsets(const SamplerID &id, int bitsPerTexel);
bool Jit_PrepareDataOffsets(const SamplerID &id, Rasterizer::RegCache::Reg uReg, Rasterizer::RegCache::Reg vReg);
bool Jit_PrepareDataDirectOffsets(const SamplerID &id, Rasterizer::RegCache::Reg uReg, Rasterizer::RegCache::Reg vReg, int bitsPerTexel);
bool Jit_PrepareDataSwizzledOffsets(const SamplerID &id, Rasterizer::RegCache::Reg uReg, Rasterizer::RegCache::Reg vReg, int bitsPerTexel);
#if PPSSPP_ARCH(ARM64)
Arm64Gen::ARM64FloatEmitter fp;
#elif PPSSPP_ARCH(AMD64) || PPSSPP_ARCH(X86)
int stackArgPos_ = 0;
int stackFracUV1Offset_ = 0;
#endif
const u8 *constWidth256f_ = nullptr;

View File

@@ -116,12 +116,21 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
RegCache::GEN_ARG_LEVELFRAC,
});
regCache_.ChangeReg(RAX, RegCache::GEN_RESULT);
regCache_.ChangeReg(XMM0, RegCache::VEC_ARG_U);
regCache_.ForceRetain(RegCache::VEC_ARG_U);
regCache_.ChangeReg(XMM1, RegCache::VEC_ARG_V);
regCache_.ForceRetain(RegCache::VEC_ARG_V);
regCache_.ChangeReg(XMM5, RegCache::VEC_RESULT);
regCache_.ForceRetain(RegCache::VEC_RESULT);
auto lockReg = [&](X64Reg r, RegCache::Purpose p) {
regCache_.ChangeReg(r, p);
regCache_.ForceRetain(p);
};
lockReg(XMM0, RegCache::VEC_ARG_U);
lockReg(XMM1, RegCache::VEC_ARG_V);
lockReg(XMM5, RegCache::VEC_RESULT);
#if !PPSSPP_PLATFORM(WINDOWS)
if (id.hasAnyMips) {
lockReg(XMM6, RegCache::VEC_U1);
lockReg(XMM7, RegCache::VEC_V1);
lockReg(XMM8, RegCache::VEC_RESULT1);
}
lockReg(XMM9, RegCache::VEC_ARG_COLOR);
#endif
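// Aside: on non-Windows x86-64 (the SysV ABI), every XMM register is caller-saved,
// so XMM6-XMM9 can simply be claimed for U1/V1/RESULT1/COLOR here. The Windows x64
// ABI treats XMM6 and up as callee-saved, which is why the Windows-only block further
// down spills XMM6-XMM9 to the stack before reusing them.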
// We'll first write the nearest sampler, which we will CALL.
// This may differ slightly based on the "linear" flag.
@@ -139,10 +148,17 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
regCache_.ForceRelease(RegCache::VEC_ARG_U);
regCache_.ForceRelease(RegCache::VEC_ARG_V);
regCache_.ForceRelease(RegCache::VEC_RESULT);
if (regCache_.Has(RegCache::GEN_ARG_LEVEL))
regCache_.ForceRelease(RegCache::GEN_ARG_LEVEL);
if (regCache_.Has(RegCache::GEN_ARG_LEVELFRAC))
regCache_.ForceRelease(RegCache::GEN_ARG_LEVELFRAC);
auto unlockOptReg = [&](RegCache::Purpose p) {
if (regCache_.Has(p))
regCache_.ForceRelease(p);
};
unlockOptReg(RegCache::GEN_ARG_LEVEL);
unlockOptReg(RegCache::GEN_ARG_LEVELFRAC);
unlockOptReg(RegCache::VEC_U1);
unlockOptReg(RegCache::VEC_V1);
unlockOptReg(RegCache::VEC_RESULT1);
unlockOptReg(RegCache::VEC_ARG_COLOR);
regCache_.Reset(true);
// Let's drop some helpful constants here.
@@ -226,6 +242,39 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
regCache_.Add(R12, RegCache::GEN_ARG_FRAC_U);
stackArgPos_ += 32;
#if PPSSPP_PLATFORM(WINDOWS)
// Free up some more vector regs on Windows, where we're a bit tight.
SUB(64, R(RSP), Imm8(16 * 4 + 8));
stackArgPos_ += 16 * 4 + 8;
MOVDQA(MDisp(RSP, 0), XMM6);
MOVDQA(MDisp(RSP, 16), XMM7);
MOVDQA(MDisp(RSP, 32), XMM8);
MOVDQA(MDisp(RSP, 48), XMM9);
regCache_.Add(XMM6, RegCache::VEC_INVALID);
regCache_.Add(XMM7, RegCache::VEC_INVALID);
regCache_.Add(XMM8, RegCache::VEC_INVALID);
regCache_.Add(XMM9, RegCache::VEC_INVALID);
// Store frac UV in the gap.
stackFracUV1Offset_ = -stackArgPos_ + 16 * 4;
#endif
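// Rough layout of the Windows-only stack block reserved above (offsets relative to
// RSP right after the SUB; inferred from this block alone):
//   [RSP +  0.. 63]  saved XMM6, XMM7, XMM8, XMM9 (16 bytes each)
//   [RSP + 64.. 71]  spare gap, used later to stash the level-1 frac U/V
// stackFracUV1Offset_ is expressed relative to stackArgPos_, so
// MDisp(RSP, stackArgPos_ + stackFracUV1Offset_) keeps addressing that gap even if
// stackArgPos_ is bumped by later stack adjustments.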
// Reserve a couple regs that the nearest CALL won't use.
if (id.hasAnyMips) {
regCache_.ChangeReg(XMM6, RegCache::VEC_U1);
regCache_.ChangeReg(XMM7, RegCache::VEC_V1);
regCache_.ForceRetain(RegCache::VEC_U1);
regCache_.ForceRetain(RegCache::VEC_V1);
}
// Save prim color for later in a different XMM too.
X64Reg primColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
MOVDQA(XMM9, R(primColorReg));
regCache_.Unlock(primColorReg, RegCache::VEC_ARG_COLOR);
regCache_.ForceRelease(RegCache::VEC_ARG_COLOR);
regCache_.ChangeReg(XMM9, RegCache::VEC_ARG_COLOR);
regCache_.ForceRetain(RegCache::VEC_ARG_COLOR);
// We also want to save src and bufw for later. Might be in a reg already.
if (regCache_.Has(RegCache::GEN_ARG_TEXPTR)) {
_assert_(regCache_.Has(RegCache::GEN_ARG_BUFW));
@@ -248,13 +297,10 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
regCache_.ForceRetain(RegCache::GEN_ARG_TEXPTR);
regCache_.ForceRetain(RegCache::GEN_ARG_BUFW);
bool success = true;
// Our first goal is to convert S/T and X/Y into U/V and frac_u/frac_v.
if (!Jit_GetTexelCoordsQuad(id)) {
regCache_.Reset(false);
EndWrite();
ResetCodePtr(GetOffset(nearest));
return nullptr;
}
success = success && Jit_GetTexelCoordsQuad(id);
// Early exit on !srcPtr (either one.)
FixupBranch zeroSrc;
@@ -279,19 +325,24 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
}
// TODO: Save color or put it somewhere... or reserve the reg?
// For now, throwing away to avoid confusion.
regCache_.ForceRelease(RegCache::VEC_ARG_COLOR);
auto prepareDataOffsets = [&](RegCache::Purpose uPurpose, RegCache::Purpose vPurpose) {
X64Reg uReg = regCache_.Find(uPurpose);
X64Reg vReg = regCache_.Find(vPurpose);
success = success && Jit_PrepareDataOffsets(id, uReg, vReg);
regCache_.Unlock(uReg, uPurpose);
regCache_.Unlock(vReg, vPurpose);
};
if (!Jit_PrepareDataOffsets(id)) {
regCache_.Reset(false);
EndWrite();
ResetCodePtr(GetOffset(nearest));
return nullptr;
}
prepareDataOffsets(RegCache::VEC_ARG_U, RegCache::VEC_ARG_V);
if (id.hasAnyMips)
prepareDataOffsets(RegCache::VEC_U1, RegCache::VEC_V1);
regCache_.ChangeReg(XMM5, RegCache::VEC_RESULT);
regCache_.ForceRetain(RegCache::VEC_RESULT);
if (id.hasAnyMips) {
regCache_.ChangeReg(XMM8, RegCache::VEC_RESULT1);
regCache_.ForceRetain(RegCache::VEC_RESULT1);
}
// This stores the result in an XMM for later processing.
// We map lookups to nearest CALLs, with arg order: u, v, src, bufw, level
@@ -353,13 +404,27 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
// We're done with these now.
regCache_.ForceRelease(RegCache::VEC_ARG_U);
regCache_.ForceRelease(RegCache::VEC_ARG_V);
if (regCache_.Has(RegCache::VEC_U1))
regCache_.ForceRelease(RegCache::VEC_U1);
if (regCache_.Has(RegCache::VEC_V1))
regCache_.ForceRelease(RegCache::VEC_V1);
regCache_.ForceRelease(RegCache::VEC_ARG_COLOR);
regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR);
regCache_.ForceRelease(RegCache::GEN_ARG_BUFW);
if (regCache_.Has(RegCache::GEN_ARG_LEVEL))
regCache_.ForceRelease(RegCache::GEN_ARG_LEVEL);
if (!success) {
regCache_.Reset(false);
EndWrite();
ResetCodePtr(GetOffset(nearest));
return nullptr;
}
// TODO: Convert to reg cache.
regCache_.ForceRelease(RegCache::VEC_RESULT);
if (regCache_.Has(RegCache::VEC_RESULT1))
regCache_.ForceRelease(RegCache::VEC_RESULT1);
static const X64Reg fpScratchReg1 = XMM1;
static const X64Reg fpScratchReg2 = XMM2;
static const X64Reg fpScratchReg3 = XMM3;
@@ -436,6 +501,13 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
SetJumpTarget(zeroSrc);
}
#if PPSSPP_PLATFORM(WINDOWS)
MOVDQA(XMM6, MDisp(RSP, 0));
MOVDQA(XMM7, MDisp(RSP, 16));
MOVDQA(XMM8, MDisp(RSP, 32));
MOVDQA(XMM9, MDisp(RSP, 48));
ADD(64, R(RSP), Imm8(16 * 4 + 8));
#endif
POP(R12);
POP(R13);
POP(R14);
@@ -1265,9 +1337,12 @@ bool SamplerJitCache::Jit_GetTexelCoordsQuad(const SamplerID &id) {
X64Reg sReg = regCache_.Find(RegCache::VEC_ARG_S);
X64Reg tReg = regCache_.Find(RegCache::VEC_ARG_T);
// Start by multiplying with the width/height.
X64Reg widthVecReg = INVALID_REG;
X64Reg heightVecReg = INVALID_REG;
// Start by multiplying with the width/height... which might be complex with mips.
X64Reg width0VecReg = INVALID_REG;
X64Reg height0VecReg = INVALID_REG;
X64Reg width1VecReg = INVALID_REG;
X64Reg height1VecReg = INVALID_REG;
if (constWidth256f_ == nullptr) {
// We have to figure out levels and the proper width, ugh.
X64Reg shiftReg = regCache_.Find(RegCache::GEN_SHIFTVAL);
@@ -1286,23 +1361,44 @@ bool SamplerJitCache::Jit_GetTexelCoordsQuad(const SamplerID &id) {
}
MOV(PTRBITS, R(gstateReg), ImmPtr(&gstate.nop));
// Load width for the given level.
MOVZX(32, 8, shiftReg, MComplex(gstateReg, levelReg, SCALE_4, offsetof(GPUgstate, texsize)));
AND(32, R(shiftReg), Imm8(0x0F));
MOV(32, R(tempReg), Imm32(1));
SHL(32, R(tempReg), R(shiftReg));
// Okay, now into a vector reg with it.
widthVecReg = regCache_.Alloc(RegCache::VEC_TEMP4);
MOVD_xmm(widthVecReg, R(tempReg));
X64Reg tempVecReg = regCache_.Alloc(RegCache::VEC_TEMP0);
auto loadSizeAndMul = [&](X64Reg dest, X64Reg size, bool isY, bool isLevel1) {
int offset = offsetof(GPUgstate, texsize) + (isY ? 1 : 0) + (isLevel1 ? 4 : 0);
// Grab the size, and shift.
MOVZX(32, 8, shiftReg, MComplex(gstateReg, levelReg, SCALE_4, offset));
AND(32, R(shiftReg), Imm8(0x0F));
MOV(32, R(tempReg), Imm32(1));
SHL(32, R(tempReg), R(shiftReg));
// Now for height, same deal.
MOVZX(32, 8, shiftReg, MComplex(gstateReg, levelReg, SCALE_4, 1 + offsetof(GPUgstate, texsize)));
AND(32, R(shiftReg), Imm8(0x0F));
MOV(32, R(tempReg), Imm32(1));
SHL(32, R(tempReg), R(shiftReg));
heightVecReg = regCache_.Alloc(RegCache::VEC_TEMP5);
MOVD_xmm(heightVecReg, R(tempReg));
// Okay, now move into a vec (two, in fact, one for the multiply.)
MOVD_xmm(tempVecReg, R(tempReg));
PSHUFD(size, R(tempVecReg), _MM_SHUFFLE(0, 0, 0, 0));
// Multiply by 256 and convert to a float.
PSLLD(tempVecReg, 8);
CVTDQ2PS(tempVecReg, R(tempVecReg));
// And then multiply.
MULPS(dest, R(tempVecReg));
};
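// For clarity, a scalar model of what loadSizeAndMul emits per lane (inferred from
// the loads above; "texsizeWord" stands for the u32 at gstate.texsize[level], or the
// next u32 when isLevel1):
//   exponent = (texsizeWord >> (isY ? 8 : 0)) & 0x0F;
//   size     = 1 << exponent;          // broadcast into `size`, kept as ints for clamp/wrap
//   dest    *= (float)(size << 8);     // scale S/T up so the texel coords carry 8 fraction bits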
// Copy out S and T so we can multiply.
X64Reg u1Reg = regCache_.Find(RegCache::VEC_U1);
X64Reg v1Reg = regCache_.Find(RegCache::VEC_V1);
MOVDQA(u1Reg, R(sReg));
MOVDQA(v1Reg, R(tReg));
// Load width and height for the given level, and multiply sReg/tReg meanwhile.
width0VecReg = regCache_.Alloc(RegCache::VEC_TEMP2);
loadSizeAndMul(sReg, width0VecReg, false, false);
height0VecReg = regCache_.Alloc(RegCache::VEC_TEMP3);
loadSizeAndMul(tReg, height0VecReg, true, false);
// And same for the next level, but with u1Reg/v1Reg.
width1VecReg = regCache_.Alloc(RegCache::VEC_TEMP4);
loadSizeAndMul(u1Reg, width1VecReg, false, true);
height1VecReg = regCache_.Alloc(RegCache::VEC_TEMP5);
loadSizeAndMul(v1Reg, height1VecReg, true, true);
regCache_.Unlock(shiftReg, RegCache::GEN_SHIFTVAL);
regCache_.Unlock(gstateReg, RegCache::GEN_GSTATE);
@@ -1312,27 +1408,16 @@ bool SamplerJitCache::Jit_GetTexelCoordsQuad(const SamplerID &id) {
else
regCache_.Unlock(levelReg, RegCache::GEN_ARG_LEVEL);
// Okay, now we need to convert to float and multiply. Do the * 256 first.
PSLLD(widthVecReg, 8);
PSLLD(heightVecReg, 8);
// Use a temp for the multiply, since we need the ints for later.
X64Reg tempVecReg = regCache_.Alloc(RegCache::VEC_TEMP0);
CVTDQ2PS(tempVecReg, R(widthVecReg));
MULSS(sReg, R(tempVecReg));
CVTDQ2PS(tempVecReg, R(heightVecReg));
MULSS(tReg, R(tempVecReg));
regCache_.Release(tempVecReg, RegCache::VEC_TEMP0);
// Okay, undo the 256 multiply and broadcast.
PSRLD(widthVecReg, 8);
PSHUFD(widthVecReg, R(widthVecReg), _MM_SHUFFLE(0, 0, 0, 0));
PSRLD(heightVecReg, 8);
PSHUFD(heightVecReg, R(heightVecReg), _MM_SHUFFLE(0, 0, 0, 0));
regCache_.Unlock(u1Reg, RegCache::VEC_U1);
regCache_.Unlock(v1Reg, RegCache::VEC_V1);
// Now just subtract one. We use this later for clamp/wrap.
PSUBD(widthVecReg, M(constOnes_));
PSUBD(heightVecReg, M(constOnes_));
MOVDQA(tempVecReg, M(constOnes_));
PSUBD(width0VecReg, R(tempVecReg));
PSUBD(height0VecReg, R(tempVecReg));
PSUBD(width1VecReg, R(tempVecReg));
PSUBD(height1VecReg, R(tempVecReg));
regCache_.Release(tempVecReg, RegCache::VEC_TEMP0);
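// (Texture sizes are powers of two here, loaded as 1 << exponent above, so the
// size-minus-one vectors presumably double as the inclusive clamp bound and the
// wrap mask in the clamp/wrap handling further down.)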
} else {
// Easy mode.
MULSS(sReg, M(constWidth256f_));
@@ -1342,6 +1427,14 @@ bool SamplerJitCache::Jit_GetTexelCoordsQuad(const SamplerID &id) {
// And now, convert to integers for all later processing.
CVTPS2DQ(sReg, R(sReg));
CVTPS2DQ(tReg, R(tReg));
if (regCache_.Has(RegCache::VEC_U1)) {
X64Reg u1Reg = regCache_.Find(RegCache::VEC_U1);
X64Reg v1Reg = regCache_.Find(RegCache::VEC_V1);
CVTPS2DQ(u1Reg, R(u1Reg));
CVTPS2DQ(v1Reg, R(v1Reg));
regCache_.Unlock(u1Reg, RegCache::VEC_U1);
regCache_.Unlock(v1Reg, RegCache::VEC_V1);
}
// Now adjust X and Y...
X64Reg xReg = regCache_.Find(RegCache::GEN_ARG_X);
@@ -1355,8 +1448,18 @@ bool SamplerJitCache::Jit_GetTexelCoordsQuad(const SamplerID &id) {
X64Reg tempXYReg = regCache_.Alloc(RegCache::VEC_TEMP0);
MOVD_xmm(tempXYReg, R(xReg));
PADDD(sReg, R(tempXYReg));
if (regCache_.Has(RegCache::VEC_U1)) {
X64Reg u1Reg = regCache_.Find(RegCache::VEC_U1);
PADDD(u1Reg, R(tempXYReg));
regCache_.Unlock(u1Reg, RegCache::VEC_U1);
}
MOVD_xmm(tempXYReg, R(yReg));
PADDD(tReg, R(tempXYReg));
if (regCache_.Has(RegCache::VEC_V1)) {
X64Reg v1Reg = regCache_.Find(RegCache::VEC_V1);
PADDD(v1Reg, R(tempXYReg));
regCache_.Unlock(v1Reg, RegCache::VEC_V1);
}
regCache_.Release(tempXYReg, RegCache::VEC_TEMP0);
regCache_.Unlock(xReg, RegCache::GEN_ARG_X);
@@ -1367,6 +1470,23 @@ bool SamplerJitCache::Jit_GetTexelCoordsQuad(const SamplerID &id) {
// We do want the fraction, though, so extract that.
X64Reg fracUReg = regCache_.Find(RegCache::GEN_ARG_FRAC_U);
X64Reg fracVReg = regCache_.Find(RegCache::GEN_ARG_FRAC_V);
if (regCache_.Has(RegCache::VEC_U1)) {
// Start with the next level so we end with current in the regs.
X64Reg u1Reg = regCache_.Find(RegCache::VEC_U1);
X64Reg v1Reg = regCache_.Find(RegCache::VEC_V1);
MOVD_xmm(R(fracUReg), u1Reg);
MOVD_xmm(R(fracVReg), v1Reg);
SHR(32, R(fracUReg), Imm8(4));
AND(32, R(fracUReg), Imm8(0x0F));
SHR(32, R(fracVReg), Imm8(4));
AND(32, R(fracVReg), Imm8(0x0F));
regCache_.Unlock(u1Reg, RegCache::VEC_U1);
regCache_.Unlock(v1Reg, RegCache::VEC_V1);
// Store them on the stack for now.
MOV(32, MDisp(RSP, stackArgPos_ + stackFracUV1Offset_), R(fracUReg));
MOV(32, MDisp(RSP, stackArgPos_ + stackFracUV1Offset_ + 4), R(fracVReg));
}
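// At this point the coords are texel positions with 8 fraction bits (from the
// size * 256 multiply earlier); the SHR 4 / AND 0x0F pattern keeps the top 4
// fraction bits, i.e. a 0-15 weight later used to blend the bilinear samples.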
MOVD_xmm(R(fracUReg), sReg);
MOVD_xmm(R(fracVReg), tReg);
SHR(32, R(fracUReg), Imm8(4));
@@ -1381,11 +1501,23 @@ bool SamplerJitCache::Jit_GetTexelCoordsQuad(const SamplerID &id) {
PSRAD(tReg, 8);
PSHUFD(sReg, R(sReg), _MM_SHUFFLE(0, 0, 0, 0));
PSHUFD(tReg, R(tReg), _MM_SHUFFLE(0, 0, 0, 0));
// Add U/V values for the next coords.
PADDD(sReg, M(constUNext_));
PADDD(tReg, M(constVNext_));
if (regCache_.Has(RegCache::VEC_U1)) {
X64Reg u1Reg = regCache_.Find(RegCache::VEC_U1);
X64Reg v1Reg = regCache_.Find(RegCache::VEC_V1);
PSRAD(u1Reg, 8);
PSRAD(v1Reg, 8);
PSHUFD(u1Reg, R(u1Reg), _MM_SHUFFLE(0, 0, 0, 0));
PSHUFD(v1Reg, R(v1Reg), _MM_SHUFFLE(0, 0, 0, 0));
PADDD(u1Reg, M(constUNext_));
PADDD(v1Reg, M(constVNext_));
regCache_.Unlock(u1Reg, RegCache::VEC_U1);
regCache_.Unlock(v1Reg, RegCache::VEC_V1);
}
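// constUNext_/constVNext_ presumably hold {0, 1, 0, 1} and {0, 0, 1, 1}: after the
// broadcast, the four lanes address the 2x2 bilinear footprint
// (u, v), (u + 1, v), (u, v + 1), (u + 1, v + 1) for each mip level.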
X64Reg temp0ClampReg = regCache_.Alloc(RegCache::VEC_TEMP0);
bool temp0ClampZero = false;
@@ -1422,13 +1554,25 @@ bool SamplerJitCache::Jit_GetTexelCoordsQuad(const SamplerID &id) {
}
};
doClamp(id.clampS, sReg, widthVecReg == INVALID_REG ? M(constWidthMinus1i_) : R(widthVecReg));
doClamp(id.clampT, tReg, heightVecReg == INVALID_REG ? M(constHeightMinus1i_) : R(heightVecReg));
doClamp(id.clampS, sReg, width0VecReg == INVALID_REG ? M(constWidthMinus1i_) : R(width0VecReg));
doClamp(id.clampT, tReg, height0VecReg == INVALID_REG ? M(constHeightMinus1i_) : R(height0VecReg));
if (width1VecReg != INVALID_REG) {
X64Reg u1Reg = regCache_.Find(RegCache::VEC_U1);
X64Reg v1Reg = regCache_.Find(RegCache::VEC_V1);
doClamp(id.clampS, u1Reg, R(width1VecReg));
doClamp(id.clampT, v1Reg, R(height1VecReg));
regCache_.Unlock(u1Reg, RegCache::VEC_U1);
regCache_.Unlock(v1Reg, RegCache::VEC_V1);
}
if (widthVecReg != INVALID_REG)
regCache_.Release(widthVecReg, RegCache::VEC_TEMP4);
if (heightVecReg != INVALID_REG)
regCache_.Release(heightVecReg, RegCache::VEC_TEMP5);
if (width0VecReg != INVALID_REG)
regCache_.Release(width0VecReg, RegCache::VEC_TEMP2);
if (height0VecReg != INVALID_REG)
regCache_.Release(height0VecReg, RegCache::VEC_TEMP3);
if (width1VecReg != INVALID_REG)
regCache_.Release(width1VecReg, RegCache::VEC_TEMP4);
if (height1VecReg != INVALID_REG)
regCache_.Release(height1VecReg, RegCache::VEC_TEMP5);
regCache_.Release(temp0ClampReg, RegCache::VEC_TEMP0);
@@ -1439,11 +1583,9 @@ bool SamplerJitCache::Jit_GetTexelCoordsQuad(const SamplerID &id) {
return true;
}
bool SamplerJitCache::Jit_PrepareDataOffsets(const SamplerID &id) {
bool SamplerJitCache::Jit_PrepareDataOffsets(const SamplerID &id, RegCache::Reg uReg, RegCache::Reg vReg) {
_assert_(id.linear);
// TODO: Use reg cache to avoid overwriting color...
bool success = true;
int bits = -1;
switch (id.TexFmt()) {
@@ -1478,87 +1620,99 @@ bool SamplerJitCache::Jit_PrepareDataOffsets(const SamplerID &id) {
if (success && bits != -1) {
if (id.swizzle) {
success = Jit_PrepareDataSwizzledOffsets(id, bits);
success = Jit_PrepareDataSwizzledOffsets(id, uReg, vReg, bits);
} else {
if (!id.useStandardBufw || id.hasAnyMips) {
// Spread bufw into each lane.
X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW);
MOVD_xmm(XMM2, MatR(bufwReg));
regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW);
PSHUFD(XMM2, R(XMM2), _MM_SHUFFLE(0, 0, 0, 0));
if (bits == 4)
PSRLD(XMM2, 1);
else if (bits == 16)
PSLLD(XMM2, 1);
else if (bits == 32)
PSLLD(XMM2, 2);
}
if (id.useStandardBufw && !id.hasAnyMips) {
int amt = id.width0Shift;
if (bits == 4)
amt -= 1;
else if (bits == 16)
amt += 1;
else if (bits == 32)
amt += 2;
// It's aligned to 16 bytes, so must at least be 16.
PSLLD(XMM1, std::max(4, amt));
} else if (cpu_info.bSSE4_1) {
// And now multiply. This is slow, but not worse than the SSE2 version...
PMULLD(XMM1, R(XMM2));
} else {
// Copy that into another temp for multiply.
MOVDQA(XMM3, R(XMM1));
// Okay, first, multiply to get XXXX CCCC XXXX AAAA.
PMULUDQ(XMM1, R(XMM2));
PSRLDQ(XMM3, 4);
PSRLDQ(XMM2, 4);
// And now get XXXX DDDD XXXX BBBB.
PMULUDQ(XMM3, R(XMM2));
// We know everything is positive, so XXXX must be zero. Let's combine.
PSLLDQ(XMM3, 4);
POR(XMM1, R(XMM3));
}
if (bits == 4) {
// Need to keep uvec for the odd bit.
MOVDQA(XMM2, R(XMM0));
PSRLD(XMM2, 1);
PADDD(XMM1, R(XMM2));
} else {
// Destroy uvec, we won't use it again.
if (bits == 16)
PSLLD(XMM0, 1);
else if (bits == 32)
PSLLD(XMM0, 2);
PADDD(XMM1, R(XMM0));
}
success = Jit_PrepareDataDirectOffsets(id, uReg, vReg, bits);
}
}
return success;
}
bool SamplerJitCache::Jit_PrepareDataSwizzledOffsets(const SamplerID &id, int bitsPerTexel) {
// See Jit_GetTexDataSwizzled() for usage of this offset.
// TODO: Use reg cache to avoid overwriting color...
bool SamplerJitCache::Jit_PrepareDataDirectOffsets(const SamplerID &id, RegCache::Reg uReg, RegCache::Reg vReg, int bitsPerTexel) {
X64Reg bufwVecReg = regCache_.Alloc(RegCache::VEC_TEMP0);
if (!id.useStandardBufw || id.hasAnyMips) {
// Spread bufw into each lane.
X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW);
MOVD_xmm(XMM2, MatR(bufwReg));
MOVD_xmm(bufwVecReg, MatR(bufwReg));
regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW);
PSHUFD(XMM2, R(XMM2), _MM_SHUFFLE(0, 0, 0, 0));
PSHUFD(bufwVecReg, R(bufwVecReg), _MM_SHUFFLE(0, 0, 0, 0));
if (bitsPerTexel == 4)
PSRLD(bufwVecReg, 1);
else if (bitsPerTexel == 16)
PSLLD(bufwVecReg, 1);
else if (bitsPerTexel == 32)
PSLLD(bufwVecReg, 2);
}
if (id.useStandardBufw && !id.hasAnyMips) {
int amt = id.width0Shift;
if (bitsPerTexel == 4)
amt -= 1;
else if (bitsPerTexel == 16)
amt += 1;
else if (bitsPerTexel == 32)
amt += 2;
// It's aligned to 16 bytes, so must at least be 16.
PSLLD(vReg, std::max(4, amt));
} else if (cpu_info.bSSE4_1) {
// And now multiply. This is slow, but not worse than the SSE2 version...
PMULLD(vReg, R(bufwVecReg));
} else {
// Copy that into another temp for multiply.
X64Reg vOddLaneReg = regCache_.Alloc(RegCache::VEC_TEMP1);
MOVDQA(vOddLaneReg, R(vReg));
// Okay, first, multiply to get XXXX CCCC XXXX AAAA.
PMULUDQ(vReg, R(bufwVecReg));
PSRLDQ(vOddLaneReg, 4);
PSRLDQ(bufwVecReg, 4);
// And now get XXXX DDDD XXXX BBBB.
PMULUDQ(vOddLaneReg, R(bufwVecReg));
// We know everything is positive, so XXXX must be zero. Let's combine.
PSLLDQ(vOddLaneReg, 4);
POR(vReg, R(vOddLaneReg));
regCache_.Release(vOddLaneReg, RegCache::VEC_TEMP1);
}
regCache_.Release(bufwVecReg, RegCache::VEC_TEMP0);
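// A minimal intrinsics sketch of the SSE2 fallback above (helper name and shape are
// illustrative only; it assumes each per-lane product fits in 32 bits, which is why
// the high halves, the "XXXX" parts, can be treated as zero):
#include <emmintrin.h>

static __m128i MulLo32_SSE2(__m128i v, __m128i bufw) {
	// Products of lanes 0 and 2 as 64-bit results: XXXX CCCC XXXX AAAA.
	__m128i evens = _mm_mul_epu32(v, bufw);
	// Shift both inputs down one lane so lanes 1 and 3 line up, then multiply: XXXX DDDD XXXX BBBB.
	__m128i odds = _mm_mul_epu32(_mm_srli_si128(v, 4), _mm_srli_si128(bufw, 4));
	// The high halves are zero, so move the odd products up a lane and OR everything together.
	return _mm_or_si128(evens, _mm_slli_si128(odds, 4));
}
// The same PMULUDQ/PSRLDQ/PSLLDQ/POR dance shows up again in Jit_PrepareDataSwizzledOffsets below.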
if (bitsPerTexel == 4) {
// Need to keep uvec for the odd bit.
X64Reg uCopyReg = regCache_.Alloc(RegCache::VEC_TEMP0);
MOVDQA(uCopyReg, R(uReg));
PSRLD(uCopyReg, 1);
PADDD(vReg, R(uCopyReg));
regCache_.Release(uCopyReg, RegCache::VEC_TEMP0);
} else {
// Destroy uvec, we won't use it again.
if (bitsPerTexel == 16)
PSLLD(uReg, 1);
else if (bitsPerTexel == 32)
PSLLD(uReg, 2);
PADDD(vReg, R(uReg));
}
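// Taken together, the offsets built above reduce to this scalar per-lane model
// (helper name is illustrative; bufw is the buffer stride in texels):
#include <cstdint>

static uint32_t DirectOffsetModel(uint32_t u, uint32_t v, uint32_t bufw, uint32_t bitsPerTexel) {
	// v rows down at bufw * bitsPerTexel / 8 bytes per row, plus u texels across in bytes.
	// For 4-bit textures both terms truncate to the byte; the low bit of u picks the nibble
	// when the texel is actually loaded.
	return v * (bufw * bitsPerTexel / 8) + u * bitsPerTexel / 8;
}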
return true;
}
bool SamplerJitCache::Jit_PrepareDataSwizzledOffsets(const SamplerID &id, RegCache::Reg uReg, RegCache::Reg vReg, int bitsPerTexel) {
// See Jit_GetTexDataSwizzled() for usage of this offset.
X64Reg bufwVecReg = regCache_.Alloc(RegCache::VEC_TEMP0);
if (!id.useStandardBufw || id.hasAnyMips) {
// Spread bufw into each lane.
X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW);
MOVD_xmm(bufwVecReg, MatR(bufwReg));
regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW);
PSHUFD(bufwVecReg, R(bufwVecReg), _MM_SHUFFLE(0, 0, 0, 0));
}
// Divide vvec by 8 in a temp.
MOVDQA(XMM3, R(XMM1));
PSRLD(XMM3, 3);
X64Reg vMultReg = regCache_.Alloc(RegCache::VEC_TEMP1);
MOVDQA(vMultReg, R(vReg));
PSRLD(vMultReg, 3);
// And now multiply by bufw. May be able to use a shift in a common case.
int shiftAmount = 32 - clz32_nonzero(bitsPerTexel - 1);
@@ -1569,51 +1723,58 @@ bool SamplerJitCache::Jit_PrepareDataSwizzledOffsets(const SamplerID &id, int bi
shiftAmount += amt;
} else if (cpu_info.bSSE4_1) {
// And now multiply. This is slow, but not worse than the SSE2 version...
PMULLD(XMM3, R(XMM2));
PMULLD(vMultReg, R(bufwVecReg));
} else {
// Copy that into another temp for multiply.
MOVDQA(XMM4, R(XMM3));
X64Reg vOddLaneReg = regCache_.Alloc(RegCache::VEC_TEMP2);
MOVDQA(vOddLaneReg, R(vMultReg));
// Okay, first, multiply to get XXXX CCCC XXXX AAAA.
PMULUDQ(XMM3, R(XMM2));
PSRLDQ(XMM4, 4);
PSRLDQ(XMM2, 4);
PMULUDQ(vMultReg, R(bufwVecReg));
PSRLDQ(vOddLaneReg, 4);
PSRLDQ(bufwVecReg, 4);
// And now get XXXX DDDD XXXX BBBB.
PMULUDQ(XMM4, R(XMM2));
PMULUDQ(vOddLaneReg, R(bufwVecReg));
// We know everything is positive, so XXXX must be zero. Let's combine.
PSLLDQ(XMM4, 4);
POR(XMM3, R(XMM4));
PSLLDQ(vOddLaneReg, 4);
POR(vMultReg, R(vOddLaneReg));
regCache_.Release(vOddLaneReg, RegCache::VEC_TEMP2);
}
regCache_.Release(bufwVecReg, RegCache::VEC_TEMP0);
// Multiply the result by bitsPerTexel using a shift.
PSLLD(XMM3, shiftAmount);
PSLLD(vMultReg, shiftAmount);
// Now we're adding (v & 7) * 16. Use a 16-bit wall.
PSLLW(XMM1, 13);
PSRLD(XMM1, 9);
PADDD(XMM1, R(XMM3));
PSLLW(vReg, 13);
PSRLD(vReg, 9);
PADDD(vReg, R(vMultReg));
regCache_.Release(vMultReg, RegCache::VEC_TEMP1);
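// The "16-bit wall" above: v fits in the low 16 bits of its lane, so the word-wise
// PSLLW by 13 discards everything above bit 2 at the word boundary, leaving
// (v & 7) << 13; the dword-wise PSRLD by 9 then yields (v & 7) * 16 without needing
// a mask constant.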
// Now get ((uvec / texels_per_tile) / 4) * 32 * 4 aka (uvec / (128 / bitsPerTexel)) << 7.
MOVDQA(XMM2, R(XMM0));
PSRLD(XMM2, 7 + clz32_nonzero(bitsPerTexel - 1) - 32);
PSLLD(XMM2, 7);
X64Reg uCopyReg = regCache_.Alloc(RegCache::VEC_TEMP0);
MOVDQA(uCopyReg, R(uReg));
PSRLD(uCopyReg, 7 + clz32_nonzero(bitsPerTexel - 1) - 32);
PSLLD(uCopyReg, 7);
// Add it in to our running total.
PADDD(XMM1, R(XMM2));
PADDD(vReg, R(uCopyReg));
if (bitsPerTexel == 4) {
// Finally, we want (uvec & 31) / 2. Use a 16-bit wall.
MOVDQA(XMM2, R(XMM0));
PSLLW(XMM2, 11);
PSRLD(XMM2, 12);
MOVDQA(uCopyReg, R(uReg));
PSLLW(uCopyReg, 11);
PSRLD(uCopyReg, 12);
// With that, this is our byte offset. uvec & 1 has which half.
PADDD(XMM1, R(XMM2));
PADDD(vReg, R(uCopyReg));
} else {
// We can destroy uvec in this path. Clear all but 2 bits for 32, 3 for 16, or 4 for 8.
PSLLW(XMM0, 32 - clz32_nonzero(bitsPerTexel - 1) + 9);
PSLLW(uReg, 32 - clz32_nonzero(bitsPerTexel - 1) + 9);
// Now that it's at the top of the 16 bits, we always shift that to the top of 4 bits.
PSRLD(XMM0, 12);
PADDD(XMM1, R(XMM0));
PSRLD(uReg, 12);
PADDD(vReg, R(uReg));
}
regCache_.Release(uCopyReg, RegCache::VEC_TEMP0);
return true;
}
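// For reference, the swizzled byte offset assembled above reduces to the scalar form
// below (helper name is illustrative; PSP swizzling arranges texture data in
// 16-byte-wide, 8-row tiles, and bufw is the stride in texels):
#include <cstdint>

static uint32_t SwizzledOffsetModel(uint32_t u, uint32_t v, uint32_t bufw, uint32_t bitsPerTexel) {
	uint32_t texelsPerTileRow = 128 / bitsPerTexel;       // one tile row is 16 bytes wide
	uint32_t offset = (v >> 3) * bufw * bitsPerTexel;     // whole 8-row bands above: 8 * (bufw * bpp / 8) bytes each
	offset += (v & 7) * 16;                               // row within the tile, 16 bytes per row
	offset += (u / texelsPerTileRow) * 128;               // whole 128-byte tiles to the left
	offset += (u % texelsPerTileRow) * bitsPerTexel / 8;  // bytes into the tile row (4bpp: u & 1 picks the nibble)
	return offset;
}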