From 6b3944d3296eee8e749fa2bc7b7f62f59b6bb9ab Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets"
Date: Sat, 6 May 2017 18:45:04 -0700
Subject: [PATCH 1/3] UnitTest: Correct vertex and jit tests.

We now convert texcoords to floats, so the tests must expect float values.
---
 Core/MIPS/MIPSAsm.cpp               |  6 ++++--
 ext/native/thin3d/thin3d_vulkan.cpp |  4 ++--
 headless/StubHost.h                 |  4 ++--
 unittest/JitHarness.cpp             |  2 +-
 unittest/TestVertexJit.cpp          | 15 +++++++++------
 5 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/Core/MIPS/MIPSAsm.cpp b/Core/MIPS/MIPSAsm.cpp
index 19d6c938e1..1defcb9477 100644
--- a/Core/MIPS/MIPSAsm.cpp
+++ b/Core/MIPS/MIPSAsm.cpp
@@ -89,8 +89,10 @@ bool MipsAssembleOpcode(const char* line, DebugInterface* cpu, u32 address)
 	args.silent = true;
 	args.memoryFile = &file;
 	args.errorsResult = &errors;
-
-	g_symbolMap->GetLabels(args.labels);
+
+	if (g_symbolMap) {
+		g_symbolMap->GetLabels(args.labels);
+	}
 
 	errorText = L"";
 	if (!runArmips(args))
diff --git a/ext/native/thin3d/thin3d_vulkan.cpp b/ext/native/thin3d/thin3d_vulkan.cpp
index 237184a8ff..1a596fb27b 100644
--- a/ext/native/thin3d/thin3d_vulkan.cpp
+++ b/ext/native/thin3d/thin3d_vulkan.cpp
@@ -376,7 +376,7 @@ public:
 		curIBufferOffset_ = offset;
 	}
 
-	void UpdateDynamicUniformBuffer(const void *ub, size_t size);
+	void UpdateDynamicUniformBuffer(const void *ub, size_t size) override;
 
 	// TODO: Add more sophisticated draws.
 	void Draw(int vertexCount, int offset) override;
@@ -1320,4 +1320,4 @@ void VKContext::HandleEvent(Event ev, int width, int height, void *param1, void
 	// Noop
 }
 
-}  // namespace Draw
\ No newline at end of file
+}  // namespace Draw
diff --git a/headless/StubHost.h b/headless/StubHost.h
index 78e9b1a15f..0a4be899c5 100644
--- a/headless/StubHost.h
+++ b/headless/StubHost.h
@@ -29,7 +29,7 @@ public:
 	void UpdateMemView() override {}
 	void UpdateDisassembly() override {}
 
-	void SetDebugMode(bool mode) { }
+	void SetDebugMode(bool mode) override { }
 	void SetGraphicsCore(GPUCore core) { gpuCore_ = core; }
 
 	bool InitGraphics(std::string *error_message, GraphicsContext **ctx) override {return false;}
@@ -73,4 +73,4 @@ public:
 protected:
 	std::string debugOutputBuffer_;
 	GPUCore gpuCore_;
-};
\ No newline at end of file
+};
diff --git a/unittest/JitHarness.cpp b/unittest/JitHarness.cpp
index 3f0c2493db..29ab315e44 100644
--- a/unittest/JitHarness.cpp
+++ b/unittest/JitHarness.cpp
@@ -91,8 +91,8 @@ static void DestroyJitHarness() {
 	// Clear our custom module out to be safe.
 	HLEShutdown();
 	CoreTiming::Shutdown();
-	Memory::Shutdown();
 	mipsr4k.Shutdown();
+	Memory::Shutdown();
 	coreState = CORE_POWERDOWN;
 	currentMIPS = nullptr;
 }
diff --git a/unittest/TestVertexJit.cpp b/unittest/TestVertexJit.cpp
index 79012974fd..40dd640c8b 100644
--- a/unittest/TestVertexJit.cpp
+++ b/unittest/TestVertexJit.cpp
@@ -36,6 +36,10 @@ public:
 		cache_ = new VertexDecoderJitCache();
 
 		g_Config.bVertexDecoderJit = true;
+		// Required for jit to be enabled.
+		g_Config.iCpuCore = (int)CPUCore::JIT;
+		gstate_c.uv.uScale = 1.0f;
+		gstate_c.uv.vScale = 1.0f;
 	}
 	~VertexDecoderTestHarness() {
 		delete src_;
@@ -297,8 +301,7 @@ static bool TestVertex8() {
 
 	for (int jit = 0; jit <= 1; ++jit) {
 		dec.Execute(vtype, 0, jit == 1);
-		dec.Assert8("TestVertex8-TC", 127, 128);
-		dec.Skip(2);
+		dec.AssertFloat("TestVertex8-TC", 127.0f / 128.0f, 1.0f);
 		dec.Assert8("TestVertex8-Nrm", 127, 0, 128);
 		dec.Skip(1);
 		dec.AssertFloat("TestVertex8-Pos", 127.0f / 128.0f, 0.0f, -1.0f);
@@ -317,7 +320,7 @@ static bool TestVertex16() {
 
 	for (int jit = 0; jit <= 1; ++jit) {
 		dec.Execute(vtype, 0, jit == 1);
-		dec.Assert16("TestVertex16-TC", 32767, 32768);
+		dec.AssertFloat("TestVertex16-TC", 32767.0f / 32768.0f, 1.0f);
 		dec.Assert16("TestVertex16-Nrm", 32767, 0, 32768);
 		dec.Skip(2);
 		dec.AssertFloat("TestVertex16-Pos", 32767.0f / 32768.0f, 0.0f, -1.0f);
@@ -354,8 +357,8 @@ static bool TestVertex8Through() {
 
 	for (int jit = 0; jit <= 1; ++jit) {
 		dec.Execute(vtype, 0, jit == 1);
-		dec.Assert8("TestVertex8Through-TC", 127, 128);
-		dec.Skip(2);
+		// Note: this is correct, even in through.
+		dec.AssertFloat("TestVertex8Through-TC", 127.0f / 128.0f, 1.0f);
 		dec.Assert8("TestVertex8Through-Nrm", 127, 0, 128);
 		// Ignoring Pos since s8 through isn't really an option.
 	}
@@ -373,7 +376,7 @@ static bool TestVertex16Through() {
 
 	for (int jit = 0; jit <= 1; ++jit) {
 		dec.Execute(vtype, 0, jit == 1);
-		dec.Assert16("TestVertex16Through-TC", 32767, 32768);
+		dec.AssertFloat("TestVertex16Through-TC", 32767.0f, 32768.0f);
 		dec.Assert16("TestVertex16Through-Nrm", 32767, 0, 32768);
 		dec.Skip(2);
 		dec.AssertFloat("TestVertex16Through-Pos", 32767.0f, 0.0f, 32768.0f);

From 257f8dbbc64deca2da26ab3f7ebd892001cbacc7 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets"
Date: Sat, 6 May 2017 18:55:16 -0700
Subject: [PATCH 2/3] GPU: Remove now-unused vertex decoder funcs.

We always convert to float now, so these functions are no longer used.
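
To illustrate the direction (a sketch, not the exact remaining code): the
surviving *ToFloat steps write normalized floats, as in the
Step_TcU16ToFloat context visible in the VertexDecoderCommon.cpp hunk
below, rather than storing raw u16 pairs:

	// Sketch of the conversion the remaining steps perform; u16
	// texcoords are normalized against 32768 (u8 uses 128 instead).
	float *uv = (float *)(decoded_ + decFmt.uvoff);
	const u16_le *uvdata = (const u16_le *)(ptr_ + tcoff);
	uv[0] = uvdata[0] * (1.0f / 32768.0f);
	uv[1] = uvdata[1] * (1.0f / 32768.0f);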
---
 GPU/Common/VertexDecoderArm.cpp    | 44 ---------------
 GPU/Common/VertexDecoderArm64.cpp  | 37 -------------
 GPU/Common/VertexDecoderCommon.cpp | 88 ------------------------------
 GPU/Common/VertexDecoderCommon.h   | 10 ----
 GPU/Common/VertexDecoderX86.cpp    | 42 --------------
 5 files changed, 221 deletions(-)

diff --git a/GPU/Common/VertexDecoderArm.cpp b/GPU/Common/VertexDecoderArm.cpp
index fcaf3e4f27..3f9267cab3 100644
--- a/GPU/Common/VertexDecoderArm.cpp
+++ b/GPU/Common/VertexDecoderArm.cpp
@@ -121,15 +121,12 @@ static const JitLookup jitLookup[] = {
 	{&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat},
 	{&VertexDecoder::Step_TcU8ToFloat, &VertexDecoderJitCache::Jit_TcU8ToFloat},
 	{&VertexDecoder::Step_TcU16ToFloat, &VertexDecoderJitCache::Jit_TcU16ToFloat},
-	{&VertexDecoder::Step_TcU16Double, &VertexDecoderJitCache::Jit_TcU16Double},
 
 	{&VertexDecoder::Step_TcU8Prescale, &VertexDecoderJitCache::Jit_TcU8Prescale},
 	{&VertexDecoder::Step_TcU16Prescale, &VertexDecoderJitCache::Jit_TcU16Prescale},
 	{&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale},
 
-	{&VertexDecoder::Step_TcU16Through, &VertexDecoderJitCache::Jit_TcU16Through},
 	{&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough},
-	{&VertexDecoder::Step_TcU16ThroughDouble, &VertexDecoderJitCache::Jit_TcU16ThroughDouble},
 	// {&VertexDecoder::Step_TcU16ThroughToFloat, &VertexDecoderJitCache::Jit_TcU16ThroughToFloat},
 
 	{&VertexDecoder::Step_NormalS8, &VertexDecoderJitCache::Jit_NormalS8},
@@ -571,31 +568,6 @@ void VertexDecoderJitCache::Jit_TcFloat() {
 	STR(tempReg2, dstReg, dec_->decFmt.uvoff + 4);
 }
 
-void VertexDecoderJitCache::Jit_TcU16Through() {
-	LDRH(tempReg1, srcReg, dec_->tcoff);
-	LDRH(tempReg2, srcReg, dec_->tcoff + 2);
-
-	// TODO: Cleanup.
-	MOVP2R(scratchReg, &gstate_c.vertBounds.minU);
-
-	auto updateSide = [&](ARMReg r, CCFlags cc, u32 off) {
-		LDRH(tempReg3, scratchReg, off);
-		CMP(r, tempReg3);
-		SetCC(cc);
-		STRH(r, scratchReg, off);
-		SetCC(CC_AL);
-	};
-
-	// TODO: Can this actually be fast? Hmm, floats aren't better.
-	updateSide(tempReg1, CC_LT, offsetof(KnownVertexBounds, minU));
-	updateSide(tempReg1, CC_GT, offsetof(KnownVertexBounds, maxU));
-	updateSide(tempReg2, CC_LT, offsetof(KnownVertexBounds, minV));
-	updateSide(tempReg2, CC_GT, offsetof(KnownVertexBounds, maxV));
-
-	ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 16));
-	STR(tempReg1, dstReg, dec_->decFmt.uvoff);
-}
-
 void VertexDecoderJitCache::Jit_TcFloatThrough() {
 	LDR(tempReg1, srcReg, dec_->tcoff);
 	LDR(tempReg2, srcReg, dec_->tcoff + 4);
@@ -602,23 +574,7 @@ void VertexDecoderJitCache::Jit_TcFloatThrough() {
 	STR(tempReg1, dstReg, dec_->decFmt.uvoff);
 	STR(tempReg2, dstReg, dec_->decFmt.uvoff + 4);
 }
-
-void VertexDecoderJitCache::Jit_TcU16Double() {
-	LDRH(tempReg1, srcReg, dec_->tcoff);
-	LDRH(tempReg2, srcReg, dec_->tcoff + 2);
-	LSL(tempReg1, tempReg1, 1);
-	ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 17));
-	STR(tempReg1, dstReg, dec_->decFmt.uvoff);
-}
-
-void VertexDecoderJitCache::Jit_TcU16ThroughDouble() {
-	LDRH(tempReg1, srcReg, dec_->tcoff);
-	LDRH(tempReg2, srcReg, dec_->tcoff + 2);
-	LSL(tempReg1, tempReg1, 1);
-	ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 17));
-	STR(tempReg1, dstReg, dec_->decFmt.uvoff);
-}
 
 void VertexDecoderJitCache::Jit_TcU8Prescale() {
 	if (cpu_info.bNEON) {
 		// TODO: Needs testing
diff --git a/GPU/Common/VertexDecoderArm64.cpp b/GPU/Common/VertexDecoderArm64.cpp
index c7e737428b..f68f100457 100644
--- a/GPU/Common/VertexDecoderArm64.cpp
+++ b/GPU/Common/VertexDecoderArm64.cpp
@@ -95,15 +95,12 @@ static const JitLookup jitLookup[] = {
 	{&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat},
 	{&VertexDecoder::Step_TcU8ToFloat, &VertexDecoderJitCache::Jit_TcU8ToFloat},
 	{&VertexDecoder::Step_TcU16ToFloat, &VertexDecoderJitCache::Jit_TcU16ToFloat},
-	{&VertexDecoder::Step_TcU16Double, &VertexDecoderJitCache::Jit_TcU16Double},
 
 	{&VertexDecoder::Step_TcU8Prescale, &VertexDecoderJitCache::Jit_TcU8Prescale},
 	{&VertexDecoder::Step_TcU16Prescale, &VertexDecoderJitCache::Jit_TcU16Prescale},
 	{&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale},
 
-	{&VertexDecoder::Step_TcU16Through, &VertexDecoderJitCache::Jit_TcU16Through},
 	{&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough},
-	{&VertexDecoder::Step_TcU16ThroughDouble, &VertexDecoderJitCache::Jit_TcU16ThroughDouble},
 	// {&VertexDecoder::Step_TcU16ThroughToFloat, &VertexDecoderJitCache::Jit_TcU16ThroughToFloat},
 
 	{&VertexDecoder::Step_NormalS8, &VertexDecoderJitCache::Jit_NormalS8},
@@ -582,45 +579,11 @@ void VertexDecoderJitCache::Jit_Color5551() {
 	CSEL(fullAlphaReg, fullAlphaReg, WZR, CC_EQ);
 }
 
-void VertexDecoderJitCache::Jit_TcU16Through() {
-	LDRH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->tcoff);
-	LDRH(INDEX_UNSIGNED, tempReg2, srcReg, dec_->tcoff + 2);
-
-	auto updateSide = [&](ARM64Reg src, CCFlags cc, ARM64Reg dst) {
-		CMP(src, dst);
-		CSEL(dst, src, dst, cc);
-	};
-
-	updateSide(tempReg1, CC_LT, boundsMinUReg);
-	updateSide(tempReg1, CC_GT, boundsMaxUReg);
-	updateSide(tempReg2, CC_LT, boundsMinVReg);
-	updateSide(tempReg2, CC_GT, boundsMaxVReg);
-
-	ORR(tempReg1, tempReg1, tempReg2, ArithOption(tempReg2, ST_LSL, 16));
-	STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.uvoff);
-}
-
 void VertexDecoderJitCache::Jit_TcFloatThrough() {
 	LDP(INDEX_SIGNED, tempReg1, tempReg2, srcReg, dec_->tcoff);
 	STP(INDEX_SIGNED, tempReg1, tempReg2, dstReg, dec_->decFmt.uvoff);
 }
 
-void VertexDecoderJitCache::Jit_TcU16Double() {
-	LDRH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->tcoff);
-	LDRH(INDEX_UNSIGNED, tempReg2, srcReg, dec_->tcoff + 2);
-	LSL(tempReg1, tempReg1, 1);
-	ORR(tempReg1, tempReg1, tempReg2, ArithOption(tempReg2, ST_LSL, 17));
-	STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.uvoff);
-}
-
-void VertexDecoderJitCache::Jit_TcU16ThroughDouble() {
-	LDRH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->tcoff);
-	LDRH(INDEX_UNSIGNED, tempReg2, srcReg, dec_->tcoff + 2);
-	LSL(tempReg1, tempReg1, 1);
-	ORR(tempReg1, tempReg1, tempReg2, ArithOption(tempReg2, ST_LSL, 17));
-	STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.uvoff);
-}
-
 void VertexDecoderJitCache::Jit_TcFloat() {
 	LDP(INDEX_SIGNED, tempReg1, tempReg2, srcReg, dec_->tcoff);
 	STP(INDEX_SIGNED, tempReg1, tempReg2, dstReg, dec_->decFmt.uvoff);
diff --git a/GPU/Common/VertexDecoderCommon.cpp b/GPU/Common/VertexDecoderCommon.cpp
index 6cb48e9d42..63f33f845c 100644
--- a/GPU/Common/VertexDecoderCommon.cpp
+++ b/GPU/Common/VertexDecoderCommon.cpp
@@ -281,35 +281,6 @@ void VertexDecoder::Step_TcU16ToFloat() const
 	uv[1] = uvdata[1] * (1.0f / 32768.0f);
 }
 
-void VertexDecoder::Step_TcU16Double() const
-{
-	u16 *uv = (u16*)(decoded_ + decFmt.uvoff);
-	const u16 *uvdata = (const u16_le*)(ptr_ + tcoff);
-	uv[0] = uvdata[0] * 2;
-	uv[1] = uvdata[1] * 2;
-}
-
-void VertexDecoder::Step_TcU16Through() const
-{
-	u16 *uv = (u16 *)(decoded_ + decFmt.uvoff);
-	const u16 *uvdata = (const u16_le*)(ptr_ + tcoff);
-	uv[0] = uvdata[0];
-	uv[1] = uvdata[1];
-
-	gstate_c.vertBounds.minU = std::min(gstate_c.vertBounds.minU, uvdata[0]);
-	gstate_c.vertBounds.maxU = std::max(gstate_c.vertBounds.maxU, uvdata[0]);
-	gstate_c.vertBounds.minV = std::min(gstate_c.vertBounds.minV, uvdata[1]);
-	gstate_c.vertBounds.maxV = std::max(gstate_c.vertBounds.maxV, uvdata[1]);
-}
-
-void VertexDecoder::Step_TcU16ThroughDouble() const
-{
-	u16 *uv = (u16 *)(decoded_ + decFmt.uvoff);
-	const u16 *uvdata = (const u16_le*)(ptr_ + tcoff);
-	uv[0] = uvdata[0] * 2;
-	uv[1] = uvdata[1] * 2;
-}
-
 void VertexDecoder::Step_TcU16DoubleToFloat() const
 {
 	float *uv = (float*)(decoded_ + decFmt.uvoff);
@@ -388,51 +359,6 @@ void VertexDecoder::Step_TcFloatPrescale() const {
 	uv[1] = uvdata[1] * gstate_c.uv.vScale + gstate_c.uv.vOff;
 }
 
-void VertexDecoder::Step_TcU8Morph() const {
-	float uv[2] = { 0, 0 };
-	for (int n = 0; n < morphcount; n++) {
-		float w = gstate_c.morphWeights[n];
-		const u8 *uvdata = (const u8 *)(ptr_ + onesize_*n + tcoff);
-
-		uv[0] += (float)uvdata[0] * w;
-		uv[1] += (float)uvdata[1] * w;
-	}
-
-	u8 *out = decoded_ + decFmt.uvoff;
-	out[0] = (int)uv[0];
-	out[1] = (int)uv[1];
-}
-
-void VertexDecoder::Step_TcU16Morph() const {
-	float uv[2] = { 0, 0 };
-	for (int n = 0; n < morphcount; n++) {
-		float w = gstate_c.morphWeights[n];
-		const u16_le *uvdata = (const u16_le *)(ptr_ + onesize_*n + tcoff);
-
-		uv[0] += (float)uvdata[0] * w;
-		uv[1] += (float)uvdata[1] * w;
-	}
-
-	u16_le *out = (u16_le *)(decoded_ + decFmt.uvoff);
-	out[0] = (int)uv[0];
-	out[1] = (int)uv[1];
-}
-
-void VertexDecoder::Step_TcU16DoubleMorph() const {
-	float uv[2] = { 0, 0 };
-	for (int n = 0; n < morphcount; n++) {
-		float w = gstate_c.morphWeights[n];
-		const u16_le *uvdata = (const u16_le *)(ptr_ + onesize_*n + tcoff);
-
-		uv[0] += (float)uvdata[0] * w;
-		uv[1] += (float)uvdata[1] * w;
-	}
-
-	u16_le *out = (u16_le *)(decoded_ + decFmt.uvoff);
-	out[0] = (int)(uv[0] * 2.0f);
-	out[1] = (int)(uv[1] * 2.0f);
-}
-
 void VertexDecoder::Step_TcU8MorphToFloat() const {
 	float uv[2] = { 0, 0 };
 	for (int n = 0; n < morphcount; n++) {
@@ -922,20 +848,6 @@ static const StepFunction tcstep_prescale_morph_remaster[4] = {
 	&VertexDecoder::Step_TcFloatPrescaleMorph,
 };
 
-static const StepFunction tcstep_morph[4] = {
-	0,
-	&VertexDecoder::Step_TcU8Morph,
-	&VertexDecoder::Step_TcU16Morph,
-	&VertexDecoder::Step_TcFloatMorph,
-};
-
-static const StepFunction tcstep_morph_remaster[4] = {
-	0,
-	&VertexDecoder::Step_TcU8Morph,
-	&VertexDecoder::Step_TcU16DoubleMorph,
-	&VertexDecoder::Step_TcFloatMorph,
-};
-
 static const StepFunction tcstep_morphToFloat[4] = {
 	0,
 	&VertexDecoder::Step_TcU8MorphToFloat,
diff --git a/GPU/Common/VertexDecoderCommon.h b/GPU/Common/VertexDecoderCommon.h
index 1531822e02..194bbc5ed5 100644
--- a/GPU/Common/VertexDecoderCommon.h
+++ b/GPU/Common/VertexDecoderCommon.h
@@ -514,17 +514,11 @@ public:
 	void Step_TcU16DoublePrescale() const;
 	void Step_TcFloatPrescale() const;
 
-	void Step_TcU16Double() const;
-	void Step_TcU16Through() const;
-	void Step_TcU16ThroughDouble() const;
 	void Step_TcU16DoubleToFloat() const;
 	void Step_TcU16ThroughToFloat() const;
 	void Step_TcU16ThroughDoubleToFloat() const;
 	void Step_TcFloatThrough() const;
 
-	void Step_TcU8Morph() const;
-	void Step_TcU16Morph() const;
-	void Step_TcU16DoubleMorph() const;
 	void Step_TcU8MorphToFloat() const;
 	void Step_TcU16MorphToFloat() const;
 	void Step_TcU16DoubleMorphToFloat() const;
@@ -675,10 +669,6 @@ public:
 	void Jit_TcU16PrescaleMorph();
 	void Jit_TcFloatPrescaleMorph();
 
-	void Jit_TcU16Double();
-	void Jit_TcU16ThroughDouble();
-
-	void Jit_TcU16Through();
 	void Jit_TcU16ThroughToFloat();
 	void Jit_TcFloatThrough();
 
diff --git a/GPU/Common/VertexDecoderX86.cpp b/GPU/Common/VertexDecoderX86.cpp
index 0db6364af2..91ae79e276 100644
--- a/GPU/Common/VertexDecoderX86.cpp
+++ b/GPU/Common/VertexDecoderX86.cpp
@@ -100,16 +100,13 @@ static const JitLookup jitLookup[] = {
 	{&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat},
 	{&VertexDecoder::Step_TcU8ToFloat, &VertexDecoderJitCache::Jit_TcU8ToFloat},
 	{&VertexDecoder::Step_TcU16ToFloat, &VertexDecoderJitCache::Jit_TcU16ToFloat},
-	{&VertexDecoder::Step_TcU16Double, &VertexDecoderJitCache::Jit_TcU16Double},
 
 	{&VertexDecoder::Step_TcU8Prescale, &VertexDecoderJitCache::Jit_TcU8Prescale},
 	{&VertexDecoder::Step_TcU16Prescale, &VertexDecoderJitCache::Jit_TcU16Prescale},
 	{&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale},
 
-	{&VertexDecoder::Step_TcU16Through, &VertexDecoderJitCache::Jit_TcU16Through},
 	{&VertexDecoder::Step_TcU16ThroughToFloat, &VertexDecoderJitCache::Jit_TcU16ThroughToFloat},
 	{&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough},
-	{&VertexDecoder::Step_TcU16ThroughDouble, &VertexDecoderJitCache::Jit_TcU16ThroughDouble},
 
 	{&VertexDecoder::Step_TcU8MorphToFloat, &VertexDecoderJitCache::Jit_TcU8MorphToFloat},
 	{&VertexDecoder::Step_TcU16MorphToFloat, &VertexDecoderJitCache::Jit_TcU16MorphToFloat},
@@ -696,15 +693,6 @@ void VertexDecoderJitCache::Jit_TcU16ToFloat() {
 	MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), XMM3);
 }
 
-void VertexDecoderJitCache::Jit_TcU16Double() {
-	MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->tcoff));
-	MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->tcoff + 2));
-	SHL(16, R(tempReg1), Imm8(1));  // 16 to get a wall to shift into
-	SHL(32, R(tempReg2), Imm8(17));
-	OR(32, R(tempReg1), R(tempReg2));
-	MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
-}
-
 void VertexDecoderJitCache::Jit_TcFloat() {
 #ifdef _M_X64
 	MOV(64, R(tempReg1), MDisp(srcReg, dec_->tcoff));
@@ -851,27 +839,6 @@ void VertexDecoderJitCache::Jit_TcFloatPrescaleMorph() {
 	MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
 }
 
-void VertexDecoderJitCache::Jit_TcU16Through() {
-	MOV(32, R(tempReg1), MDisp(srcReg, dec_->tcoff));
-	MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
-
-	MOV(32, R(tempReg2), R(tempReg1));
-	SHR(32, R(tempReg2), Imm8(16));
-
-	auto updateSide = [&](X64Reg r, CCFlags skipCC, u16 *value) {
-		CMP(16, R(r), M(value));
-		FixupBranch skip = J_CC(skipCC);
-		MOV(16, M(value), R(r));
-		SetJumpTarget(skip);
-	};
-
-	// TODO: Can this actually be fast? Hmm, floats aren't better.
-	updateSide(tempReg1, CC_GE, &gstate_c.vertBounds.minU);
-	updateSide(tempReg1, CC_LE, &gstate_c.vertBounds.maxU);
-	updateSide(tempReg2, CC_GE, &gstate_c.vertBounds.minV);
-	updateSide(tempReg2, CC_LE, &gstate_c.vertBounds.maxV);
-}
-
 void VertexDecoderJitCache::Jit_TcU16ThroughToFloat() {
 	PXOR(fpScratchReg2, R(fpScratchReg2));
 	MOV(32, R(tempReg1), MDisp(srcReg, dec_->tcoff));
@@ -897,15 +864,6 @@ void VertexDecoderJitCache::Jit_TcU16ThroughToFloat() {
 	updateSide(tempReg2, CC_LE, &gstate_c.vertBounds.maxV);
 }
 
-void VertexDecoderJitCache::Jit_TcU16ThroughDouble() {
-	MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->tcoff));
-	MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->tcoff + 2));
-	SHL(16, R(tempReg1), Imm8(1));  // 16 to get a wall to shift into
-	SHL(32, R(tempReg2), Imm8(17));
-	OR(32, R(tempReg1), R(tempReg2));
-	MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
-}
-
 void VertexDecoderJitCache::Jit_TcFloatThrough() {
 #ifdef _M_X64
 	MOV(64, R(tempReg1), MDisp(srcReg, dec_->tcoff));

From 7699fa55de16f4ed14512dd702fbded2ff0daa70 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets"
Date: Sat, 6 May 2017 18:58:15 -0700
Subject: [PATCH 3/3] arm: Jit throughmode 16-bit texcoords.

This format is popular, and jitting it makes decoding such vertices much
faster.
---
 GPU/Common/VertexDecoderArm.cpp   | 39 ++++++++++++++++++++++++++++++-
 GPU/Common/VertexDecoderArm64.cpp | 22 ++++++++++++++++-
 2 files changed, 59 insertions(+), 2 deletions(-)

diff --git a/GPU/Common/VertexDecoderArm.cpp b/GPU/Common/VertexDecoderArm.cpp
index 3f9267cab3..ce87e02117 100644
--- a/GPU/Common/VertexDecoderArm.cpp
+++ b/GPU/Common/VertexDecoderArm.cpp
@@ -127,7 +127,7 @@ static const JitLookup jitLookup[] = {
 	{&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale},
 
 	{&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough},
-	// {&VertexDecoder::Step_TcU16ThroughToFloat, &VertexDecoderJitCache::Jit_TcU16ThroughToFloat},
+	{&VertexDecoder::Step_TcU16ThroughToFloat, &VertexDecoderJitCache::Jit_TcU16ThroughToFloat},
 
 	{&VertexDecoder::Step_NormalS8, &VertexDecoderJitCache::Jit_NormalS8},
 	{&VertexDecoder::Step_NormalS16, &VertexDecoderJitCache::Jit_NormalS16},
@@ -568,6 +568,43 @@ void VertexDecoderJitCache::Jit_TcFloat() {
 	STR(tempReg2, dstReg, dec_->decFmt.uvoff + 4);
 }
 
+void VertexDecoderJitCache::Jit_TcU16ThroughToFloat() {
+	LDRH(tempReg1, srcReg, dec_->tcoff);
+	LDRH(tempReg2, srcReg, dec_->tcoff + 2);
+
+	MOVP2R(scratchReg, &gstate_c.vertBounds.minU);
+
+	auto updateSide = [&](ARMReg r, CCFlags cc, u32 off) {
+		LDRH(tempReg3, scratchReg, off);
+		CMP(r, tempReg3);
+		SetCC(cc);
+		STRH(r, scratchReg, off);
+		SetCC(CC_AL);
+	};
+
+	// TODO: Can this actually be fast? Hmm, floats aren't better.
+	updateSide(tempReg1, CC_LT, offsetof(KnownVertexBounds, minU));
+	updateSide(tempReg1, CC_GT, offsetof(KnownVertexBounds, maxU));
+	updateSide(tempReg2, CC_LT, offsetof(KnownVertexBounds, minV));
+	updateSide(tempReg2, CC_GT, offsetof(KnownVertexBounds, maxV));
+
+	if (cpu_info.bNEON) {
+		ADD(scratchReg, srcReg, dec_->tcoff);
+		VLD1_lane(I_32, neonScratchReg, scratchReg, 0, false);
+		VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);  // Widen to 32-bit
+		VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
+		ADD(scratchReg2, dstReg, dec_->decFmt.uvoff);
+		VST1(F_32, neonScratchReg, scratchReg2, 1, ALIGN_NONE);
+	} else {
+		VMOV(fpScratchReg, tempReg1);
+		VMOV(fpScratchReg2, tempReg2);
+		VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
+		VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT);
+		VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff);
+		VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4);
+	}
+}
+
 void VertexDecoderJitCache::Jit_TcFloatThrough() {
 	LDR(tempReg1, srcReg, dec_->tcoff);
 	LDR(tempReg2, srcReg, dec_->tcoff + 4);
diff --git a/GPU/Common/VertexDecoderArm64.cpp b/GPU/Common/VertexDecoderArm64.cpp
index f68f100457..12f58e829b 100644
--- a/GPU/Common/VertexDecoderArm64.cpp
+++ b/GPU/Common/VertexDecoderArm64.cpp
@@ -101,7 +101,7 @@ static const JitLookup jitLookup[] = {
 	{&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale},
 
 	{&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough},
-	// {&VertexDecoder::Step_TcU16ThroughToFloat, &VertexDecoderJitCache::Jit_TcU16ThroughToFloat},
+	{&VertexDecoder::Step_TcU16ThroughToFloat, &VertexDecoderJitCache::Jit_TcU16ThroughToFloat},
 
 	{&VertexDecoder::Step_NormalS8, &VertexDecoderJitCache::Jit_NormalS8},
 	{&VertexDecoder::Step_NormalS16, &VertexDecoderJitCache::Jit_NormalS16},
@@ -579,6 +579,26 @@ void VertexDecoderJitCache::Jit_Color5551() {
 	CSEL(fullAlphaReg, fullAlphaReg, WZR, CC_EQ);
 }
 
+void VertexDecoderJitCache::Jit_TcU16ThroughToFloat() {
+	LDRH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->tcoff);
+	LDRH(INDEX_UNSIGNED, tempReg2, srcReg, dec_->tcoff + 2);
+
+	auto updateSide = [&](ARM64Reg src, CCFlags cc, ARM64Reg dst) {
+		CMP(src, dst);
+		CSEL(dst, src, dst, cc);
+	};
+
+	updateSide(tempReg1, CC_LT, boundsMinUReg);
+	updateSide(tempReg1, CC_GT, boundsMaxUReg);
+	updateSide(tempReg2, CC_LT, boundsMinVReg);
+	updateSide(tempReg2, CC_GT, boundsMaxVReg);
+
+	fp.LDUR(32, neonScratchRegD, srcReg, dec_->tcoff);
+	fp.UXTL(16, neonScratchRegQ, neonScratchRegD);  // Widen to 32-bit
+	fp.UCVTF(32, neonScratchRegD, neonScratchRegD);
+	fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff);
+}
+
 void VertexDecoderJitCache::Jit_TcFloatThrough() {
 	LDP(INDEX_SIGNED, tempReg1, tempReg2, srcReg, dec_->tcoff);
 	STP(INDEX_SIGNED, tempReg1, tempReg2, dstReg, dec_->decFmt.uvoff);
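
-- 
For reference, the scalar fallback these jit versions mirror: copy the raw
u16 texcoords as floats (through mode does not rescale) and fold each value
into gstate_c.vertBounds. The updateSide helpers above do the same min/max
tracking with conditional stores (SetCC/STRH on arm, CSEL on arm64). A
sketch combining the removed Step_TcU16Through from patch 2 with the new
to-float store; not taken verbatim from the tree:

	void VertexDecoder::Step_TcU16ThroughToFloat() const {
		// Through-mode texcoords are raw pixel coordinates, so no scaling.
		float *uv = (float *)(decoded_ + decFmt.uvoff);
		const u16_le *uvdata = (const u16_le *)(ptr_ + tcoff);
		uv[0] = uvdata[0];
		uv[1] = uvdata[1];

		// Track the UV bounding box for later texture upload decisions.
		gstate_c.vertBounds.minU = std::min(gstate_c.vertBounds.minU, uvdata[0]);
		gstate_c.vertBounds.maxU = std::max(gstate_c.vertBounds.maxU, uvdata[0]);
		gstate_c.vertBounds.minV = std::min(gstate_c.vertBounds.minV, uvdata[1]);
		gstate_c.vertBounds.maxV = std::max(gstate_c.vertBounds.maxV, uvdata[1]);
	}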