Merge pull request #4468 from hrydgard/vtxdec-prescale

Add support for prescaled UV in vertex decoder JIT
Henrik Rydgård 2013-11-07 02:51:14 -08:00
commit c213c44050
2 changed files with 208 additions and 31 deletions


@@ -787,6 +787,9 @@ struct JitLookup {
JitStepFunction jitFunc;
};
static const float by128 = 1.0f / 128.0f;
static const float by32768 = 1.0f / 32768.0f;
#ifdef ARM
using namespace ArmGen;
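
These reciprocal constants fold the fixed-point texcoord normalization into the UV scale once per compiled decoder, instead of dividing per vertex. In scalar terms (an illustrative sketch; the variable names here are not from the codebase):

// For an 8-bit texcoord t the decoder needs u = (t / 128.0f) * uScale + uOff.
// Rewriting that as u = t * (uScale * by128) + uOff lets Compile() bake the
// divide into the scale, leaving one MUL and one ADD per component per vertex:
float effUScale = uScale * by128;   // done once, when the decoder is compiled
float u = t * effUScale + uOff;     // done per vertex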
@@ -798,6 +801,12 @@ static const ARMReg scratchReg = R6;
static const ARMReg srcReg = R0;
static const ARMReg dstReg = R1;
static const ARMReg counterReg = R2;
static const ARMReg fpScratchReg = S4;
static const ARMReg fpScratchReg2 = S5;
static const ARMReg fpUscaleReg = S0;
static const ARMReg fpVscaleReg = S1;
static const ARMReg fpUoffsetReg = S2;
static const ARMReg fpVoffsetReg = S3;
static const JitLookup jitLookup[] = {
{&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8},
@@ -808,6 +817,10 @@ static const JitLookup jitLookup[] = {
{&VertexDecoder::Step_TcU16, &VertexDecoderJitCache::Jit_TcU16},
{&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat},
{&VertexDecoder::Step_TcU8Prescale, &VertexDecoderJitCache::Jit_TcU8Prescale},
{&VertexDecoder::Step_TcU16Prescale, &VertexDecoderJitCache::Jit_TcU16Prescale},
{&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale},
{&VertexDecoder::Step_TcU16Through, &VertexDecoderJitCache::Jit_TcU16Through},
{&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough},
@@ -830,19 +843,40 @@ static const JitLookup jitLookup[] = {
};
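
The table pairs each interpreter step with the member function that emits its JIT equivalent. A minimal sketch of how CompileStep presumably consumes it (the real body is not part of this diff, and the func member name is assumed from the visible jitFunc field):

bool VertexDecoderJitCache::CompileStep(const VertexDecoder &dec, int i) {
	for (size_t j = 0; j < ARRAY_SIZE(jitLookup); j++) {
		if (dec.steps_[i] == jitLookup[j].func) {
			(this->*jitLookup[j].jitFunc)();  // emit machine code for this step
			return true;
		}
	}
	return false;  // unsupported step: caller falls back to the interpreter
}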
JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
// return 0;
dec_ = &dec;
const u8 *start = this->GetCodePtr();
// TODO: Test and make work
bool prescaleStep = false;
// Look for prescaled texcoord steps
for (int i = 0; i < dec.numSteps_; i++) {
if (dec.steps_[i] == &VertexDecoder::Step_TcU8Prescale ||
dec.steps_[i] == &VertexDecoder::Step_TcU16Prescale ||
dec.steps_[i] == &VertexDecoder::Step_TcFloatPrescale) {
prescaleStep = true;
}
}
SetCC(CC_AL);
PUSH(6, R4, R5, R6, R7, R8, _LR);
// Preserving our FP scratch register appears to improve stability.
VMOV(R7, S0);
// Keep the scale/offset in a few fp registers if we need it.
if (prescaleStep) {
MOVI2R(R3, (u32)(&gstate_c.uv), scratchReg);
VLDR(fpUscaleReg, R3, 0);
VLDR(fpVscaleReg, R3, 4);
VLDR(fpUoffsetReg, R3, 8);
VLDR(fpVoffsetReg, R3, 12);
if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) {
MOVI2F(fpScratchReg, by128, scratchReg);
VMUL(fpUscaleReg, fpUscaleReg, fpScratchReg);
VMUL(fpVscaleReg, fpVscaleReg, fpScratchReg);
} else if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_16BIT) {
MOVI2F(fpScratchReg, by32768, scratchReg);
VMUL(fpUscaleReg, fpUscaleReg, fpScratchReg);
VMUL(fpVscaleReg, fpVscaleReg, fpScratchReg);
}
}
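
The fixed VLDR offsets 0/4/8/12 assume gstate_c.uv is four consecutive floats. A sketch of the assumed layout (the real definition lives in the GPU state headers):

struct UVScale {
	float uScale, vScale;  // read at offsets 0 and 4
	float uOff, vOff;      // read at offsets 8 and 12
};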
JumpTarget loopStart = GetCodePtr();
for (int i = 0; i < dec.numSteps_; i++) {
@@ -861,17 +895,14 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
SUBS(counterReg, counterReg, 1);
B_CC(CC_NEQ, loopStart);
VMOV(S0, R7); // restore our fp scratch
EOR(R0, R0, R0);
POP(6, R4, R5, R6, R7, R8, _PC);
FlushIcache();
// DisassembleArm(start, GetCodePtr() - start);
// char temp[1024] = {0};
// dec.ToString(temp);
// INFO_LOG(HLE, "%s", temp);
DisassembleArm(start, GetCodePtr() - start);
char temp[1024] = {0};
dec.ToString(temp);
INFO_LOG(HLE, "%s", temp);
return (JittedVertexDecoder)start;
}
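
Written out as C++, the generated function behaves roughly like the loop below (a sketch of the emitted control flow, with illustrative names; the actual work is the inlined Jit_* step bodies):

void DecodedVertexLoopSketch(const u8 *src, u8 *dst, int count) {
	do {
		// ...one inlined Jit_* body per enabled vertex attribute...
		src += vertexSize;     // dec.VertexSize(): stride of the PSP-format data
		dst += decodedStride;  // dec.decFmt.stride: stride of the decoded output
	} while (--count != 0);    // SUBS + B_CC(CC_NEQ, loopStart)
}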
@@ -962,6 +993,51 @@ void VertexDecoderJitCache::Jit_TcFloatThrough() {
STR(tempReg2, dstReg, dec_->decFmt.uvoff + 4);
}
void VertexDecoderJitCache::Jit_TcU8Prescale() {
// TODO: SIMD
LDRB(tempReg1, srcReg, dec_->tcoff);
LDRB(tempReg2, srcReg, dec_->tcoff + 1);
VMOV(fpScratchReg, tempReg1);
VMOV(fpScratchReg2, tempReg2);
VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT);
// Could replace VMUL + VADD with VMLA but would require 2 more regs as we don't want to destroy fp*offsetReg. Later.
VMUL(fpScratchReg, fpScratchReg, fpUscaleReg);
VMUL(fpScratchReg2, fpScratchReg2, fpVscaleReg);
VADD(fpScratchReg, fpScratchReg, fpUoffsetReg);
VADD(fpScratchReg2, fpScratchReg2, fpVoffsetReg);
VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff);
VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4);
}
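
In plain C++ the emitted sequence amounts to this (a sketch; uscale/uoffset stand for the values preloaded into fpUscaleReg and friends, with 1/128 already folded into the scales):

const u8 *tc = src + dec_->tcoff;
float *out = (float *)(dst + dec_->decFmt.uvoff);
out[0] = (float)tc[0] * uscale + uoffset;  // VMOV + VCVT + VMUL + VADD + VSTR
out[1] = (float)tc[1] * vscale + voffset;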
void VertexDecoderJitCache::Jit_TcU16Prescale() {
// TODO: SIMD
LDRH(tempReg1, srcReg, dec_->tcoff);
LDRH(tempReg2, srcReg, dec_->tcoff + 2);
VMOV(fpScratchReg, tempReg1);
VMOV(fpScratchReg2, tempReg2);
VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT);
VMUL(fpScratchReg, fpScratchReg, fpUscaleReg);
VMUL(fpScratchReg2, fpScratchReg2, fpVscaleReg);
VADD(fpScratchReg, fpScratchReg, fpUoffsetReg);
VADD(fpScratchReg2, fpScratchReg2, fpVoffsetReg);
VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff);
VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4);
}
void VertexDecoderJitCache::Jit_TcFloatPrescale() {
// TODO: SIMD
VLDR(fpScratchReg, srcReg, dec_->tcoff);
VLDR(fpScratchReg2, srcReg, dec_->tcoff + 4);
VMUL(fpScratchReg, fpScratchReg, fpUscaleReg);
VMUL(fpScratchReg2, fpScratchReg2, fpVscaleReg);
VADD(fpScratchReg, fpScratchReg, fpUoffsetReg);
VADD(fpScratchReg2, fpScratchReg2, fpVoffsetReg);
VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff);
VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4);
}
void VertexDecoderJitCache::Jit_Color8888() {
LDR(tempReg1, srcReg, dec_->coloff);
STR(tempReg1, dstReg, dec_->decFmt.c0off);
@@ -1080,23 +1156,23 @@ void VertexDecoderJitCache::Jit_PosS8Through() {
LDRSB(tempReg3, srcReg, dec_->posoff + 2);
static const ARMReg tr[3] = { tempReg1, tempReg2, tempReg3 };
for (int i = 0; i < 3; i++) {
VMOV(S0, tr[i]);
VCVT(S0, S0, TO_FLOAT | IS_SIGNED);
VSTR(S0, dstReg, dec_->decFmt.posoff + i * 4);
VMOV(fpScratchReg, tr[i]);
VCVT(fpScratchReg, fpScratchReg, TO_FLOAT | IS_SIGNED);
VSTR(fpScratchReg, dstReg, dec_->decFmt.posoff + i * 4);
}
}
// Through expands into floats, always. Might want to look at changing this.
void VertexDecoderJitCache::Jit_PosS16Through() {
// TODO: SIMD
LDRSH(tempReg1, srcReg, dec_->posoff);
LDRSH(tempReg2, srcReg, dec_->posoff + 2);
LDRSH(tempReg3, srcReg, dec_->posoff + 4);
static const ARMReg tr[3] = { tempReg1, tempReg2, tempReg3 };
// TODO: SIMD
for (int i = 0; i < 3; i++) {
VMOV(S0, tr[i]);
VCVT(S0, S0, TO_FLOAT | IS_SIGNED);
VSTR(S0, dstReg, dec_->decFmt.posoff + i * 4);
VMOV(fpScratchReg, tr[i]);
VCVT(fpScratchReg, fpScratchReg, TO_FLOAT | IS_SIGNED);
VSTR(fpScratchReg, dstReg, dec_->decFmt.posoff + i * 4);
}
}
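
Equivalent scalar C++ for the 16-bit through case (a sketch): the position is sign-extended, converted to float, and stored with no scaling at all:

const s16 *pos = (const s16 *)(src + dec_->posoff);
float *out = (float *)(dst + dec_->decFmt.posoff);
for (int i = 0; i < 3; i++)
	out[i] = (float)pos[i];  // LDRSH + VCVT(TO_FLOAT | IS_SIGNED) + VSTR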
@@ -1160,6 +1236,14 @@ static const X64Reg dstReg = EDI;
static const X64Reg counterReg = ECX;
#endif
// XMM0-XMM5 are volatile on Windows X64
// XMM0-XMM7 are arguments (and thus volatile) on System V ABI (other x64 platforms)
static const X64Reg fpUscaleReg = XMM0;
static const X64Reg fpVscaleReg = XMM1;
static const X64Reg fpUoffsetReg = XMM2;
static const X64Reg fpVoffsetReg = XMM3;
static const X64Reg fpScratchReg = XMM4;
static const X64Reg fpScratchReg2 = XMM5;
// To debug, just comment them out one at a time until it works. We fall back
// on the interpreter if the compiler fails.
@@ -1173,6 +1257,10 @@ static const JitLookup jitLookup[] = {
{&VertexDecoder::Step_TcU16, &VertexDecoderJitCache::Jit_TcU16},
{&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat},
{&VertexDecoder::Step_TcU8Prescale, &VertexDecoderJitCache::Jit_TcU8Prescale},
{&VertexDecoder::Step_TcU16Prescale, &VertexDecoderJitCache::Jit_TcU16Prescale},
{&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale},
{&VertexDecoder::Step_TcU16Through, &VertexDecoderJitCache::Jit_TcU16Through},
{&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough},
@@ -1194,6 +1282,13 @@ static const JitLookup jitLookup[] = {
{&VertexDecoder::Step_PosFloat, &VertexDecoderJitCache::Jit_PosFloat},
};
// TODO: This should probably be global...
#ifdef _M_X64
#define PTRBITS 64
#else
#define PTRBITS 32
#endif
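
A usage note: PTRBITS lets pointer-width arithmetic be emitted without per-arch #ifdefs, as the de-duplicated loop epilogue further down shows:

ADD(PTRBITS, R(srcReg), Imm32(dec.VertexSize()));  // 64-bit ADD on x64, 32-bit on x86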
JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
dec_ = &dec;
const u8 *start = this->GetCodePtr();
@@ -1211,6 +1306,44 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
MOV(32, R(dstReg), MDisp(ESP, 16 + offset + 4));
MOV(32, R(counterReg), MDisp(ESP, 16 + offset + 8));
#endif
// Save XMM4/XMM5 which apparently can be problematic?
// Actually, if they are, it must be a compiler bug because they SHOULD be ok.
// So I won't bother.
// SUB(PTRBITS, R(ESP), Imm8(32));
// MOVUPS(MDisp(ESP, 0), XMM4);
// MOVUPS(MDisp(ESP, 16), XMM5);
bool prescaleStep = false;
// Look for prescaled texcoord steps
for (int i = 0; i < dec.numSteps_; i++) {
if (dec.steps_[i] == &VertexDecoder::Step_TcU8Prescale ||
dec.steps_[i] == &VertexDecoder::Step_TcU16Prescale ||
dec.steps_[i] == &VertexDecoder::Step_TcFloatPrescale) {
prescaleStep = true;
}
}
// Keep the scale/offset in a few fp registers if we need it.
if (prescaleStep) {
#ifdef _M_X64
MOV(64, R(tempReg1), Imm64((u64)(&gstate_c.uv)));
#else
MOV(32, R(tempReg1), Imm32((u32)(&gstate_c.uv)));
#endif
MOVSS(fpUscaleReg, MDisp(tempReg1, 0));
MOVSS(fpVscaleReg, MDisp(tempReg1, 4));
MOVSS(fpUoffsetReg, MDisp(tempReg1, 8));
MOVSS(fpVoffsetReg, MDisp(tempReg1, 12));
if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) {
MULSS(fpUscaleReg, M((void *)&by128));
MULSS(fpVscaleReg, M((void *)&by128));
} else if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_16BIT) {
MULSS(fpUscaleReg, M((void *)&by32768));
MULSS(fpVscaleReg, M((void *)&by32768));
}
}
// Let's not bother with a proper stack frame. We just grab the arguments and go.
JumpTarget loopStart = GetCodePtr();
for (int i = 0; i < dec.numSteps_; i++) {
@@ -1221,16 +1354,15 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
}
}
#ifdef _M_X64
ADD(64, R(srcReg), Imm32(dec.VertexSize()));
ADD(64, R(dstReg), Imm32(dec.decFmt.stride));
#else
ADD(32, R(srcReg), Imm32(dec.VertexSize()));
ADD(32, R(dstReg), Imm32(dec.decFmt.stride));
#endif
ADD(PTRBITS, R(srcReg), Imm32(dec.VertexSize()));
ADD(PTRBITS, R(dstReg), Imm32(dec.decFmt.stride));
SUB(32, R(counterReg), Imm8(1));
J_CC(CC_NZ, loopStart, true);
// MOVUPS(XMM4, MDisp(ESP, 0));
// MOVUPS(XMM5, MDisp(ESP, 16));
// ADD(PTRBITS, R(ESP), Imm8(32));
#ifdef _M_IX86
// Restore register values
POP(EBP);
@@ -1305,6 +1437,46 @@ void VertexDecoderJitCache::Jit_TcFloat() {
#endif
}
void VertexDecoderJitCache::Jit_TcU8Prescale() {
// TODO: SIMD
MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->tcoff));
MOVZX(32, 8, tempReg2, MDisp(srcReg, dec_->tcoff + 1));
CVTSI2SS(fpScratchReg, R(tempReg1));
CVTSI2SS(fpScratchReg2, R(tempReg2));
MULSS(fpScratchReg, R(fpUscaleReg));
MULSS(fpScratchReg2, R(fpVscaleReg));
ADDSS(fpScratchReg, R(fpUoffsetReg));
ADDSS(fpScratchReg2, R(fpVoffsetReg));
MOVSS(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
MOVSS(MDisp(dstReg, dec_->decFmt.uvoff + 4), fpScratchReg2);
}
void VertexDecoderJitCache::Jit_TcU16Prescale() {
// TODO: SIMD
MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->tcoff));
MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->tcoff + 2));
CVTSI2SS(fpScratchReg, R(tempReg1));
CVTSI2SS(fpScratchReg2, R(tempReg2));
MULSS(fpScratchReg, R(fpUscaleReg));
MULSS(fpScratchReg2, R(fpVscaleReg));
ADDSS(fpScratchReg, R(fpUoffsetReg));
ADDSS(fpScratchReg2, R(fpVoffsetReg));
MOVSS(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
MOVSS(MDisp(dstReg, dec_->decFmt.uvoff + 4), fpScratchReg2);
}
void VertexDecoderJitCache::Jit_TcFloatPrescale() {
// TODO: SIMD
MOVSS(fpScratchReg, MDisp(srcReg, dec_->tcoff));
MOVSS(fpScratchReg2, MDisp(srcReg, dec_->tcoff + 4));
MULSS(fpScratchReg, R(fpUscaleReg));
MULSS(fpScratchReg2, R(fpVscaleReg));
ADDSS(fpScratchReg, R(fpUoffsetReg));
ADDSS(fpScratchReg2, R(fpVoffsetReg));
MOVSS(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
MOVSS(MDisp(dstReg, dec_->decFmt.uvoff + 4), fpScratchReg2);
}
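
One plausible direction for the // TODO: SIMD comments, shown for the float case (a hypothetical, untested sketch; it assumes {uScale, vScale} and {uOff, vOff} were packed into two XMM registers, here called fpScaleReg and fpOffsetReg, at Compile() time):

MOVQ_xmm(fpScratchReg, MDisp(srcReg, dec_->tcoff));         // load u and v as a pair
MULPS(fpScratchReg, R(fpScaleReg));                         // scale both lanes at once
ADDPS(fpScratchReg, R(fpOffsetReg));                        // offset both lanes
MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);  // store the pair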
void VertexDecoderJitCache::Jit_TcU16Through() {
MOV(32, R(tempReg1), MDisp(srcReg, dec_->tcoff));
MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
@@ -1464,8 +1636,8 @@ void VertexDecoderJitCache::Jit_PosS8Through() {
// TODO: SIMD
for (int i = 0; i < 3; i++) {
MOVSX(32, 8, tempReg1, MDisp(srcReg, dec_->posoff + i));
CVTSI2SS(XMM0, R(tempReg1));
MOVSS(MDisp(dstReg, dec_->decFmt.posoff + i * 4), XMM0);
CVTSI2SS(fpScratchReg, R(tempReg1));
MOVSS(MDisp(dstReg, dec_->decFmt.posoff + i * 4), fpScratchReg);
}
}
@@ -1474,8 +1646,8 @@ void VertexDecoderJitCache::Jit_PosS16Through() {
// TODO: SIMD
for (int i = 0; i < 3; i++) {
MOVSX(32, 16, tempReg1, MDisp(srcReg, dec_->posoff + i * 2));
CVTSI2SS(XMM0, R(tempReg1));
MOVSS(MDisp(dstReg, dec_->decFmt.posoff + i * 4), XMM0);
CVTSI2SS(fpScratchReg, R(tempReg1));
MOVSS(MDisp(dstReg, dec_->decFmt.posoff + i * 4), fpScratchReg);
}
}


@@ -57,6 +57,7 @@ public:
void DecodeVerts(u8 *decoded, const void *verts, int indexLowerBound, int indexUpperBound) const;
bool hasColor() const { return col != 0; }
bool hasTexcoord() const { return tc != 0; }
int VertexSize() const { return size; } // PSP format size
void Step_WeightsU8() const;
@@ -190,6 +191,10 @@ public:
void Jit_TcU16();
void Jit_TcFloat();
void Jit_TcU8Prescale();
void Jit_TcU16Prescale();
void Jit_TcFloatPrescale();
void Jit_TcU16Through();
void Jit_TcFloatThrough();
@@ -211,4 +216,4 @@ public:
private:
bool CompileStep(const VertexDecoder &dec, int i);
const VertexDecoder *dec_;
};
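
For reference, the compiled decoder is consumed as a bare function pointer; roughly (a sketch of a call site, assuming a JittedVertexDecoder signature of void(const u8 *src, u8 *dst, int count)):

JittedVertexDecoder jitted = cache.Compile(dec);
if (jitted)
	jitted(srcVerts, decodedOut, vertexCount);  // runs the emitted loop
else
	dec.DecodeVerts(decodedOut, srcVerts, indexLowerBound, indexUpperBound);  // interpreter fallback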