From acb39941722615cb437543db29d1abe1783a482e Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sun, 24 Nov 2013 15:58:15 +0100 Subject: [PATCH] Split VertexDecoder into files for ARM and x86, got too large. --- CMakeLists.txt | 4 + GPU/GLES/VertexDecoder.cpp | 1549 +-------------------------------- GPU/GLES/VertexDecoder.h | 5 + GPU/GLES/VertexDecoderArm.cpp | 748 ++++++++++++++++ GPU/GLES/VertexDecoderX86.cpp | 842 ++++++++++++++++++ GPU/GPU.vcxproj | 9 +- GPU/GPU.vcxproj.filters | 4 +- android/jni/Android.mk | 5 +- 8 files changed, 1615 insertions(+), 1551 deletions(-) create mode 100644 GPU/GLES/VertexDecoderArm.cpp create mode 100644 GPU/GLES/VertexDecoderX86.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index fb16a24d06..2df9b63e33 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -689,6 +689,8 @@ if(ARM) Core/MIPS/ARM/ArmRegCache.h Core/MIPS/ARM/ArmRegCacheFPU.cpp Core/MIPS/ARM/ArmRegCacheFPU.h + Core/GPU/GLES/VertexDecoderArm.cpp + Core/GPU/GLES/VertexDecoderArm.h ext/disarm.cpp) elseif(X86) set(CoreExtra ${CoreExtra} @@ -705,6 +707,8 @@ elseif(X86) Core/MIPS/x86/RegCache.h Core/MIPS/x86/RegCacheFPU.cpp Core/MIPS/x86/RegCacheFPU.h + Core/GPU/GLES/VertexDecoderX86.cpp + Core/GPU/GLES/VertexDecoderX86.h ext/disarm.cpp) endif() diff --git a/GPU/GLES/VertexDecoder.cpp b/GPU/GLES/VertexDecoder.cpp index d4aa9eb0b8..7a0047814e 100644 --- a/GPU/GLES/VertexDecoder.cpp +++ b/GPU/GLES/VertexDecoder.cpp @@ -27,10 +27,6 @@ #include "VertexDecoder.h" #include "VertexShaderGenerator.h" -#if defined(_M_IX86) || defined(_M_X64) -#include -#endif - extern void DisassembleArm(const u8 *data, int size); static const u8 tcsize[4] = {0,2,4,8}, tcalign[4] = {0,1,2,4}; @@ -43,23 +39,6 @@ static const u8 wtsize[4] = {0,1,2,4}, wtalign[4] = {0,1,2,4}; // is kept in registers. static float MEMORY_ALIGNED16(skinMatrix[12]); -// We start out by converting the active matrices into 4x4 which are easier to multiply with -// using SSE / NEON and store them here. -static float MEMORY_ALIGNED16(bones[16 * 8]); - -// The rest will be dumped to bones as on x86. - -// NEON register allocation: -// Q0: Texture scaling parameters -// Q1: Temp storage -// Q2: Vector-by-matrix accumulator -// Q3: Unused -// -// We'll use Q4-Q7 as the "matrix accumulator". -// First two matrices will be preloaded into Q8-Q11 and Q12-Q15 to reduce -// memory bandwidth requirements. - - inline int align(int n, int align) { return (n + (align - 1)) & ~(align - 1); } @@ -911,1534 +890,8 @@ VertexDecoderJitCache::VertexDecoderJitCache() { #endif } -typedef void (VertexDecoderJitCache::*JitStepFunction)(); - -struct JitLookup { - StepFunction func; - JitStepFunction jitFunc; -}; - -#ifdef ARM - -static const float by128 = 1.0f / 128.0f; -static const float by256 = 1.0f / 256.0f; -static const float by32768 = 1.0f / 32768.0f; - -using namespace ArmGen; - -static const ARMReg tempReg1 = R3; -static const ARMReg tempReg2 = R4; -static const ARMReg tempReg3 = R5; -static const ARMReg scratchReg = R6; -static const ARMReg scratchReg2 = R7; -static const ARMReg scratchReg3 = R12; -static const ARMReg srcReg = R0; -static const ARMReg dstReg = R1; -static const ARMReg counterReg = R2; -static const ARMReg fpScratchReg = S4; -static const ARMReg fpScratchReg2 = S5; -static const ARMReg fpScratchReg3 = S6; -static const ARMReg fpScratchReg4 = S7; -static const ARMReg fpUscaleReg = S0; -static const ARMReg fpVscaleReg = S1; -static const ARMReg fpUoffsetReg = S2; -static const ARMReg fpVoffsetReg = S3; - -// Simpler aliases for NEON. 
Overlaps with corresponding VFP regs. -static const ARMReg neonUVScaleReg = D0; -static const ARMReg neonUVOffsetReg = D1; -static const ARMReg neonScratchReg = D2; -static const ARMReg neonScratchRegQ = Q1; // Overlaps with all the scratch regs - -// Everything above S6 is fair game for skinning - -// S8-S15 are used during matrix generation - -// These only live through the matrix multiplication -static const ARMReg src[3] = {S8, S9, S10}; // skin source -static const ARMReg acc[3] = {S11, S12, S13}; // skin accumulator - -static const JitLookup jitLookup[] = { - {&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8}, - {&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16}, - {&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat}, - - {&VertexDecoder::Step_WeightsU8Skin, &VertexDecoderJitCache::Jit_WeightsU8Skin}, - {&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin}, - {&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin}, - - {&VertexDecoder::Step_TcU8, &VertexDecoderJitCache::Jit_TcU8}, - {&VertexDecoder::Step_TcU16, &VertexDecoderJitCache::Jit_TcU16}, - {&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat}, - {&VertexDecoder::Step_TcU16Double, &VertexDecoderJitCache::Jit_TcU16Double}, - - {&VertexDecoder::Step_TcU8Prescale, &VertexDecoderJitCache::Jit_TcU8Prescale}, - {&VertexDecoder::Step_TcU16Prescale, &VertexDecoderJitCache::Jit_TcU16Prescale}, - {&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale}, - - {&VertexDecoder::Step_TcU16Through, &VertexDecoderJitCache::Jit_TcU16Through}, - {&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough}, - {&VertexDecoder::Step_TcU16ThroughDouble, &VertexDecoderJitCache::Jit_TcU16ThroughDouble}, - - {&VertexDecoder::Step_NormalS8, &VertexDecoderJitCache::Jit_NormalS8}, - {&VertexDecoder::Step_NormalS16, &VertexDecoderJitCache::Jit_NormalS16}, - {&VertexDecoder::Step_NormalFloat, &VertexDecoderJitCache::Jit_NormalFloat}, - - {&VertexDecoder::Step_NormalS8Skin, &VertexDecoderJitCache::Jit_NormalS8Skin}, - {&VertexDecoder::Step_NormalS16Skin, &VertexDecoderJitCache::Jit_NormalS16Skin}, - {&VertexDecoder::Step_NormalFloatSkin, &VertexDecoderJitCache::Jit_NormalFloatSkin}, - - {&VertexDecoder::Step_Color8888, &VertexDecoderJitCache::Jit_Color8888}, - {&VertexDecoder::Step_Color4444, &VertexDecoderJitCache::Jit_Color4444}, - {&VertexDecoder::Step_Color565, &VertexDecoderJitCache::Jit_Color565}, - {&VertexDecoder::Step_Color5551, &VertexDecoderJitCache::Jit_Color5551}, - - {&VertexDecoder::Step_PosS8Through, &VertexDecoderJitCache::Jit_PosS8Through}, - {&VertexDecoder::Step_PosS16Through, &VertexDecoderJitCache::Jit_PosS16Through}, - {&VertexDecoder::Step_PosFloatThrough, &VertexDecoderJitCache::Jit_PosFloat}, - - {&VertexDecoder::Step_PosS8, &VertexDecoderJitCache::Jit_PosS8}, - {&VertexDecoder::Step_PosS16, &VertexDecoderJitCache::Jit_PosS16}, - {&VertexDecoder::Step_PosFloat, &VertexDecoderJitCache::Jit_PosFloat}, - - {&VertexDecoder::Step_PosS8Skin, &VertexDecoderJitCache::Jit_PosS8Skin}, - {&VertexDecoder::Step_PosS16Skin, &VertexDecoderJitCache::Jit_PosS16Skin}, - {&VertexDecoder::Step_PosFloatSkin, &VertexDecoderJitCache::Jit_PosFloatSkin}, -}; - -JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) { - dec_ = &dec; - const u8 *start = AlignCode16(); - - bool prescaleStep = false; - bool skinning = false; - - // Look for prescaled 
texcoord steps - for (int i = 0; i < dec.numSteps_; i++) { - if (dec.steps_[i] == &VertexDecoder::Step_TcU8Prescale || - dec.steps_[i] == &VertexDecoder::Step_TcU16Prescale || - dec.steps_[i] == &VertexDecoder::Step_TcFloatPrescale) { - prescaleStep = true; - } - if (dec.steps_[i] == &VertexDecoder::Step_WeightsU8Skin || - dec.steps_[i] == &VertexDecoder::Step_WeightsU16Skin || - dec.steps_[i] == &VertexDecoder::Step_WeightsFloatSkin) { - skinning = true; - } - } - - SetCC(CC_AL); - - PUSH(6, R4, R5, R6, R7, R8, _LR); - - // Keep the scale/offset in a few fp registers if we need it. - if (prescaleStep) { - MOVI2R(R3, (u32)(&gstate_c.uv), scratchReg); - VLDR(fpUscaleReg, R3, 0); - VLDR(fpVscaleReg, R3, 4); - VLDR(fpUoffsetReg, R3, 8); - VLDR(fpVoffsetReg, R3, 12); - if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) { - MOVI2F(fpScratchReg, by128, scratchReg); - VMUL(fpUscaleReg, fpUscaleReg, fpScratchReg); - VMUL(fpVscaleReg, fpVscaleReg, fpScratchReg); - } else if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_16BIT) { - MOVI2F(fpScratchReg, by32768, scratchReg); - VMUL(fpUscaleReg, fpUscaleReg, fpScratchReg); - VMUL(fpVscaleReg, fpVscaleReg, fpScratchReg); - } - } - - // TODO: NEON skinning register mapping - // The matrix will be built in Q12-Q15. - // The temporary matrix to be added to the built matrix will be in Q8-Q11. - - if (skinning) { - // TODO: Preload scale factors - } - - JumpTarget loopStart = GetCodePtr(); - // Preload data cache ahead of reading. TODO: Experiment with the offset. - PLD(srcReg, 64); - for (int i = 0; i < dec.numSteps_; i++) { - if (!CompileStep(dec, i)) { - // Reset the code ptr and return zero to indicate that we failed. - SetCodePtr(const_cast(start)); - char temp[1024] = {0}; - dec.ToString(temp); - INFO_LOG(HLE, "Could not compile vertex decoder: %s", temp); - return 0; - } - } - - ADDI2R(srcReg, srcReg, dec.VertexSize(), scratchReg); - ADDI2R(dstReg, dstReg, dec.decFmt.stride, scratchReg); - SUBS(counterReg, counterReg, 1); - B_CC(CC_NEQ, loopStart); - - POP(6, R4, R5, R6, R7, R8, _PC); - - FlushLitPool(); - FlushIcache(); - // DisassembleArm(start, GetCodePtr() - start); - // char temp[1024] = {0}; - // dec.ToString(temp); - // INFO_LOG(HLE, "%s", temp); - - return (JittedVertexDecoder)start; -} - -void VertexDecoderJitCache::Jit_WeightsU8() { - // Basic implementation - a byte at a time. TODO: Optimize - int j; - for (j = 0; j < dec_->nweights; j++) { - LDRB(tempReg1, srcReg, dec_->weightoff + j); - STRB(tempReg1, dstReg, dec_->decFmt.w0off + j); - } - if (j & 3) { - // Create a zero register. Might want to make a fixed one. - EOR(scratchReg, scratchReg, scratchReg); - } - while (j & 3) { - STRB(scratchReg, dstReg, dec_->decFmt.w0off + j); - j++; - } -} - -void VertexDecoderJitCache::Jit_WeightsU16() { - // Basic implementation - a short at a time. TODO: Optimize - int j; - for (j = 0; j < dec_->nweights; j++) { - LDRH(tempReg1, srcReg, dec_->weightoff + j * 2); - STRH(tempReg1, dstReg, dec_->decFmt.w0off + j * 2); - } - if (j & 3) { - // Create a zero register. Might want to make a fixed one. - EOR(scratchReg, scratchReg, scratchReg); - } - while (j & 3) { - STRH(scratchReg, dstReg, dec_->decFmt.w0off + j * 2); - j++; - } -} - -void VertexDecoderJitCache::Jit_WeightsFloat() { - int j; - for (j = 0; j < dec_->nweights; j++) { - LDR(tempReg1, srcReg, dec_->weightoff + j * 4); - STR(tempReg1, dstReg, dec_->decFmt.w0off + j * 4); - } - if (j & 3) { - // Create a zero register. Might want to make a fixed one. 
- EOR(scratchReg, scratchReg, scratchReg); - } -} - -static const ARMReg weightRegs[8] = { S8, S9, S10, S11, S12, S13, S14, S15 }; - -void VertexDecoderJitCache::Jit_ApplyWeights() { - MOVI2R(tempReg2, (u32)skinMatrix, scratchReg); -#if 1 - // This approach saves a few stores but accesses the matrices in a more - // sparse order. - const float *bone = &gstate.boneMatrix[0]; - MOVI2R(tempReg1, (u32)bone, scratchReg); - for (int i = 0; i < 12; i++) { - VLDR(fpScratchReg3, tempReg1, i * 4); - VMUL(fpScratchReg3, fpScratchReg3, weightRegs[0]); - for (int j = 1; j < dec_->nweights; j++) { - VLDR(fpScratchReg2, tempReg1, i * 4 + j * 4 * 12); - VMLA(fpScratchReg3, fpScratchReg2, weightRegs[j]); - } - VSTR(fpScratchReg3, tempReg2, i * 4); - } -#else - // This one does accesses in linear order but wastes time storing, loading, storing. - for (int j = 0; j < dec_->nweights; j++) { - const float *bone = &gstate.boneMatrix[j * 12]; - MOVI2R(tempReg1, (u32)bone, scratchReg); - // Okay, we have the weight. - if (j == 0) { - for (int i = 0; i < 12; i++) { - VLDR(fpScratchReg2, tempReg1, i * 4); - VMUL(fpScratchReg2, fpScratchReg2, weightRegs[j]); - VSTR(fpScratchReg2, tempReg2, i * 4); - } - } else { - for (int i = 0; i < 12; i++) { - VLDR(fpScratchReg2, tempReg1, i * 4); - VLDR(fpScratchReg3, tempReg2, i * 4); - VMLA(fpScratchReg3, fpScratchReg2, weightRegs[j]); - VSTR(fpScratchReg3, tempReg2, i * 4); - } - } - } -#endif -} - -void VertexDecoderJitCache::Jit_WeightsU8Skin() { - // No need to zero skinMatrix, we'll just STR to it in the first lap, - // then VLDR/VADD/VSTR in subsequent laps. - for (int j = 0; j < dec_->nweights; j++) { - LDRB(tempReg1, srcReg, dec_->weightoff + j); - VMOV(fpScratchReg, tempReg1); - VCVT(fpScratchReg, fpScratchReg, TO_FLOAT); - MOVI2F(fpScratchReg2, by128, scratchReg); - VMUL(weightRegs[j], fpScratchReg, fpScratchReg2); - } - - Jit_ApplyWeights(); -} - -void VertexDecoderJitCache::Jit_WeightsU16Skin() { - // No need to zero skinMatrix, we'll just STR to it in the first lap, - // then VLDR/VADD/VSTR in subsequent laps. - for (int j = 0; j < dec_->nweights; j++) { - LDRH(tempReg1, srcReg, dec_->weightoff + j * 2); - VMOV(fpScratchReg, tempReg1); - VCVT(fpScratchReg, fpScratchReg, TO_FLOAT); - MOVI2F(fpScratchReg2, 1.0f / 32768.0f, scratchReg); - VMUL(weightRegs[j], fpScratchReg, fpScratchReg2); - } - - Jit_ApplyWeights(); -} - -void VertexDecoderJitCache::Jit_WeightsFloatSkin() { - // No need to zero skinMatrix, we'll just STR to it in the first lap, - // then VLDR/VADD/VSTR in subsequent laps. - for (int j = 0; j < dec_->nweights; j++) { - VLDR(weightRegs[j], srcReg, dec_->weightoff + j * 4); - } - - Jit_ApplyWeights(); -} - -// Fill last two bytes with zeroes to align to 4 bytes. LDRH does it for us, handy. 
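// Illustrative scalar sketch of what Jit_WeightsU8Skin + Jit_ApplyWeights above compute
// (the helper name BlendSkinMatrixU8 is hypothetical, not something this patch adds): each of
// the twelve skinMatrix entries is the weight-blended sum of the corresponding entries of the
// active 3x4 bone matrices, with u8 weights scaled by 1/128 just like the by128 constant.
static void BlendSkinMatrixU8(const float *boneMatrices, const u8 *weights, int nweights, float skinMatrix[12]) {
	for (int i = 0; i < 12; i++) {
		float sum = 0.0f;
		for (int j = 0; j < nweights; j++) {
			const float w = weights[j] * (1.0f / 128.0f);  // same scale the JIT applies per weight
			sum += w * boneMatrices[j * 12 + i];           // consecutive matrices are 12 floats apart
		}
		skinMatrix[i] = sum;
	}
}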
-void VertexDecoderJitCache::Jit_TcU8() { - LDRB(tempReg1, srcReg, dec_->tcoff); - LDRB(tempReg2, srcReg, dec_->tcoff + 1); - ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 8)); - STR(tempReg1, dstReg, dec_->decFmt.uvoff); -} - -void VertexDecoderJitCache::Jit_TcU16() { - LDRH(tempReg1, srcReg, dec_->tcoff); - LDRH(tempReg2, srcReg, dec_->tcoff + 2); - ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 16)); - STR(tempReg1, dstReg, dec_->decFmt.uvoff); -} - -void VertexDecoderJitCache::Jit_TcFloat() { - LDR(tempReg1, srcReg, dec_->tcoff); - LDR(tempReg2, srcReg, dec_->tcoff + 4); - STR(tempReg1, dstReg, dec_->decFmt.uvoff); - STR(tempReg2, dstReg, dec_->decFmt.uvoff + 4); -} - -void VertexDecoderJitCache::Jit_TcU16Through() { - LDRH(tempReg1, srcReg, dec_->tcoff); - LDRH(tempReg2, srcReg, dec_->tcoff + 2); - ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 16)); - STR(tempReg1, dstReg, dec_->decFmt.uvoff); -} - -void VertexDecoderJitCache::Jit_TcFloatThrough() { - LDR(tempReg1, srcReg, dec_->tcoff); - LDR(tempReg2, srcReg, dec_->tcoff + 4); - STR(tempReg1, dstReg, dec_->decFmt.uvoff); - STR(tempReg2, dstReg, dec_->decFmt.uvoff + 4); -} - -void VertexDecoderJitCache::Jit_TcU16Double() { - LDRH(tempReg1, srcReg, dec_->tcoff); - LDRH(tempReg2, srcReg, dec_->tcoff + 2); - LSL(tempReg1, tempReg1, 1); - ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 17)); - STR(tempReg1, dstReg, dec_->decFmt.uvoff); -} - -void VertexDecoderJitCache::Jit_TcU16ThroughDouble() { - LDRH(tempReg1, srcReg, dec_->tcoff); - LDRH(tempReg2, srcReg, dec_->tcoff + 2); - LSL(tempReg1, tempReg1, 1); - ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 17)); - STR(tempReg1, dstReg, dec_->decFmt.uvoff); -} - -void VertexDecoderJitCache::Jit_TcU8Prescale() { - if (cpu_info.bNEON) { - // TODO: Needs testing - ADD(scratchReg, srcReg, dec_->tcoff); - VLD1_lane(I_16, neonScratchReg, scratchReg, 0, false); - VMOVL(I_8 | I_UNSIGNED, neonScratchRegQ, neonScratchReg); // Widen to 16-bit - VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg); // Widen to 32-bit - VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ); - ADD(scratchReg2, dstReg, dec_->decFmt.uvoff); - VMUL(F_32, neonScratchReg, neonScratchReg, neonUVScaleReg); - VADD(F_32, neonScratchReg, neonScratchReg, neonUVOffsetReg); - VST1(F_32, neonScratchReg, scratchReg2, 1, ALIGN_NONE); - } else { - // TODO: SIMD - LDRB(tempReg1, srcReg, dec_->tcoff); - LDRB(tempReg2, srcReg, dec_->tcoff + 1); - VMOV(fpScratchReg, tempReg1); - VMOV(fpScratchReg2, tempReg2); - VCVT(fpScratchReg, fpScratchReg, TO_FLOAT); - VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT); - // Could replace VMUL + VADD with VMLA but would require 2 more regs as we don't want to destroy fp*offsetReg. Later. 
- VMUL(fpScratchReg, fpScratchReg, fpUscaleReg); - VMUL(fpScratchReg2, fpScratchReg2, fpVscaleReg); - VADD(fpScratchReg, fpScratchReg, fpUoffsetReg); - VADD(fpScratchReg2, fpScratchReg2, fpVoffsetReg); - VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff); - VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4); - } -} - -void VertexDecoderJitCache::Jit_TcU16Prescale() { - if (cpu_info.bNEON) { - // TODO: Needs testing - ADD(scratchReg, srcReg, dec_->tcoff); - VLD1_lane(I_32, neonScratchReg, scratchReg, 0, false); - VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg); // Widen to 32-bit - VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ); - ADD(scratchReg2, dstReg, dec_->decFmt.uvoff); - VMUL(F_32, neonScratchReg, neonScratchReg, neonUVScaleReg); - VADD(F_32, neonScratchReg, neonScratchReg, neonUVOffsetReg); - VST1(F_32, neonScratchReg, scratchReg2, 1, ALIGN_NONE); - } else { - LDRH(tempReg1, srcReg, dec_->tcoff); - LDRH(tempReg2, srcReg, dec_->tcoff + 2); - VMOV(fpScratchReg, tempReg1); - VMOV(fpScratchReg2, tempReg2); - VCVT(fpScratchReg, fpScratchReg, TO_FLOAT); - VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT); - VMUL(fpScratchReg, fpScratchReg, fpUscaleReg); - VMUL(fpScratchReg2, fpScratchReg2, fpVscaleReg); - VADD(fpScratchReg, fpScratchReg, fpUoffsetReg); - VADD(fpScratchReg2, fpScratchReg2, fpVoffsetReg); - VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff); - VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4); - } -} - -void VertexDecoderJitCache::Jit_TcFloatPrescale() { - if (cpu_info.bNEON) { - ADD(scratchReg, srcReg, dec_->tcoff); - VLD1(F_32, neonScratchReg, scratchReg, 1, ALIGN_NONE); - ADD(scratchReg2, dstReg, dec_->decFmt.uvoff); - VMUL(F_32, neonScratchReg, neonScratchReg, neonUVScaleReg); - VADD(F_32, neonScratchReg, neonScratchReg, neonUVOffsetReg); - VST1(F_32, neonScratchReg, scratchReg2, 1, ALIGN_NONE); - } else { - // TODO: SIMD - VLDR(fpScratchReg, srcReg, dec_->tcoff); - VLDR(fpScratchReg2, srcReg, dec_->tcoff + 4); - VMUL(fpScratchReg, fpScratchReg, fpUscaleReg); - VMUL(fpScratchReg2, fpScratchReg2, fpVscaleReg); - VADD(fpScratchReg, fpScratchReg, fpUoffsetReg); - VADD(fpScratchReg2, fpScratchReg2, fpVoffsetReg); - VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff); - VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4); - } -} - -void VertexDecoderJitCache::Jit_Color8888() { - LDR(tempReg1, srcReg, dec_->coloff); - STR(tempReg1, dstReg, dec_->decFmt.c0off); -} - -void VertexDecoderJitCache::Jit_Color4444() { - LDRH(tempReg1, srcReg, dec_->coloff); - - // Spread out the components. - ANDI2R(tempReg2, tempReg1, 0x000F, scratchReg); - ANDI2R(tempReg3, tempReg1, 0x00F0, scratchReg); - ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 4)); - ANDI2R(tempReg3, tempReg1, 0x0F00, scratchReg); - ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 8)); - ANDI2R(tempReg3, tempReg1, 0xF000, scratchReg); - ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 12)); - - // And saturate. - ORR(tempReg1, tempReg2, Operand2(tempReg2, ST_LSL, 4)); - - STR(tempReg1, dstReg, dec_->decFmt.c0off); -} - -void VertexDecoderJitCache::Jit_Color565() { - LDRH(tempReg1, srcReg, dec_->coloff); - - // Spread out R and B first. This puts them in 0x001F001F. - ANDI2R(tempReg2, tempReg1, 0x001F, scratchReg); - ANDI2R(tempReg3, tempReg1, 0xF800, scratchReg); - ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 5)); - - // Expand 5 -> 8. - LSL(tempReg3, tempReg2, 3); - ORR(tempReg2, tempReg3, Operand2(tempReg2, ST_LSR, 2)); - ANDI2R(tempReg2, tempReg2, 0xFFFF00FF, scratchReg); - - // Now finally G. 
We start by shoving it into a wall. - LSR(tempReg1, tempReg1, 5); - ANDI2R(tempReg1, tempReg1, 0x003F, scratchReg); - LSL(tempReg3, tempReg1, 2); - // Don't worry, shifts into a wall. - ORR(tempReg3, tempReg3, Operand2(tempReg1, ST_LSR, 4)); - ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 8)); - - // Add in full alpha. - ORI2R(tempReg1, tempReg2, 0xFF000000, scratchReg); - - STR(tempReg1, dstReg, dec_->decFmt.c0off); -} - -void VertexDecoderJitCache::Jit_Color5551() { - LDRH(tempReg1, srcReg, dec_->coloff); - - ANDI2R(tempReg2, tempReg1, 0x001F, scratchReg); - ANDI2R(tempReg3, tempReg1, 0x07E0, scratchReg); - ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 3)); - ANDI2R(tempReg3, tempReg1, 0xF800, scratchReg); - ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 6)); - - // Expand 5 -> 8. - LSR(tempReg3, tempReg2, 2); - // Clean up the bits that were shifted right. - BIC(tempReg3, tempReg1, AssumeMakeOperand2(0x000000F8)); - BIC(tempReg3, tempReg3, AssumeMakeOperand2(0x0000F800)); - ORR(tempReg2, tempReg3, Operand2(tempReg2, ST_LSL, 3)); - - // Now we just need alpha. - TSTI2R(tempReg1, 0x8000, scratchReg); - SetCC(CC_NEQ); - ORI2R(tempReg2, tempReg2, 0xFF000000, scratchReg); - SetCC(CC_AL); - - STR(tempReg2, dstReg, dec_->decFmt.c0off); -} - -void VertexDecoderJitCache::Jit_NormalS8() { - LDRB(tempReg1, srcReg, dec_->nrmoff); - LDRB(tempReg2, srcReg, dec_->nrmoff + 1); - LDRB(tempReg3, srcReg, dec_->nrmoff + 2); - ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 8)); - ORR(tempReg1, tempReg1, Operand2(tempReg3, ST_LSL, 16)); - STR(tempReg1, dstReg, dec_->decFmt.nrmoff); - - // Copy 3 bytes and then a zero. Might as well copy four. - // LDR(tempReg1, srcReg, dec_->nrmoff); - // ANDI2R(tempReg1, tempReg1, 0x00FFFFFF, scratchReg); - // STR(tempReg1, dstReg, dec_->decFmt.nrmoff); -} - -// Copy 6 bytes and then 2 zeroes. -void VertexDecoderJitCache::Jit_NormalS16() { - LDRH(tempReg1, srcReg, dec_->nrmoff); - LDRH(tempReg2, srcReg, dec_->nrmoff + 2); - LDRH(tempReg3, srcReg, dec_->nrmoff + 4); - ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 16)); - STR(tempReg1, dstReg, dec_->decFmt.nrmoff); - STR(tempReg3, dstReg, dec_->decFmt.nrmoff + 4); -} - -void VertexDecoderJitCache::Jit_NormalFloat() { - // Might not be aligned to 4, so we can't use LDMIA. - // Actually - not true: This will always be aligned. TODO - LDR(tempReg1, srcReg, dec_->nrmoff); - LDR(tempReg2, srcReg, dec_->nrmoff + 4); - LDR(tempReg3, srcReg, dec_->nrmoff + 8); - // But this is always aligned to 4 so we're safe. - ADD(scratchReg, dstReg, dec_->decFmt.nrmoff); - STMIA(scratchReg, false, 3, tempReg1, tempReg2, tempReg3); -} - -// Through expands into floats, always. Might want to look at changing this. -void VertexDecoderJitCache::Jit_PosS8Through() { - // TODO: SIMD - LDRSB(tempReg1, srcReg, dec_->posoff); - LDRSB(tempReg2, srcReg, dec_->posoff + 1); - LDRSB(tempReg3, srcReg, dec_->posoff + 2); - static const ARMReg tr[3] = { tempReg1, tempReg2, tempReg3 }; - for (int i = 0; i < 3; i++) { - VMOV(fpScratchReg, tr[i]); - VCVT(fpScratchReg, fpScratchReg, TO_FLOAT | IS_SIGNED); - VSTR(fpScratchReg, dstReg, dec_->decFmt.posoff + i * 4); - } -} - -// Through expands into floats, always. Might want to look at changing this. 
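// Illustrative scalar reference for the bit replication Jit_Color565 above performs with
// shifts and masks (the helper name is hypothetical): a 5-bit channel expands to 8 bits as
// (c << 3) | (c >> 2) and the 6-bit green as (g << 2) | (g >> 4), with alpha forced to 0xFF.
// R ends up in the lowest byte, matching the RGBA layout written at c0off. Jit_Color5551
// differs only in using a 5-bit green and turning bit 15 into an all-or-nothing alpha.
static inline u32 Convert565ToRGBA8888(u16 c) {
	u32 r = c & 0x1F;
	u32 g = (c >> 5) & 0x3F;
	u32 b = (c >> 11) & 0x1F;
	r = (r << 3) | (r >> 2);
	g = (g << 2) | (g >> 4);
	b = (b << 3) | (b >> 2);
	return 0xFF000000 | (b << 16) | (g << 8) | r;
}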
-void VertexDecoderJitCache::Jit_PosS16Through() { - // TODO: SIMD - LDRSH(tempReg1, srcReg, dec_->posoff); - LDRSH(tempReg2, srcReg, dec_->posoff + 2); - LDRSH(tempReg3, srcReg, dec_->posoff + 4); - static const ARMReg tr[3] = { tempReg1, tempReg2, tempReg3 }; - for (int i = 0; i < 3; i++) { - VMOV(fpScratchReg, tr[i]); - VCVT(fpScratchReg, fpScratchReg, TO_FLOAT | IS_SIGNED); - VSTR(fpScratchReg, dstReg, dec_->decFmt.posoff + i * 4); - } -} - -// Copy 3 bytes and then a zero. Might as well copy four. -void VertexDecoderJitCache::Jit_PosS8() { - LDRB(tempReg1, srcReg, dec_->posoff); - LDRB(tempReg2, srcReg, dec_->posoff + 1); - LDRB(tempReg3, srcReg, dec_->posoff + 2); - ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 8)); - ORR(tempReg1, tempReg1, Operand2(tempReg3, ST_LSL, 16)); - STR(tempReg1, dstReg, dec_->decFmt.posoff); -} - -// Copy 6 bytes and then 2 zeroes. -void VertexDecoderJitCache::Jit_PosS16() { - LDRH(tempReg1, srcReg, dec_->posoff); - LDRH(tempReg2, srcReg, dec_->posoff + 2); - LDRH(tempReg3, srcReg, dec_->posoff + 4); - ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 16)); - STR(tempReg1, dstReg, dec_->decFmt.posoff); - STR(tempReg3, dstReg, dec_->decFmt.posoff + 4); -} - -// Just copy 12 bytes. -void VertexDecoderJitCache::Jit_PosFloat() { - LDR(tempReg1, srcReg, dec_->posoff); - LDR(tempReg2, srcReg, dec_->posoff + 4); - LDR(tempReg3, srcReg, dec_->posoff + 8); - // But this is always aligned to 4 so we're safe. - ADD(scratchReg, dstReg, dec_->decFmt.posoff); - STMIA(scratchReg, false, 3, tempReg1, tempReg2, tempReg3); -} - -void VertexDecoderJitCache::Jit_NormalS8Skin() { - LDRSB(tempReg1, srcReg, dec_->nrmoff); - LDRSB(tempReg2, srcReg, dec_->nrmoff + 1); - LDRSB(tempReg3, srcReg, dec_->nrmoff + 2); - VMOV(src[0], tempReg1); - VMOV(src[1], tempReg2); - VMOV(src[2], tempReg3); - MOVI2F(S15, 1.0f/128.0f, scratchReg); - VCVT(src[0], src[0], TO_FLOAT | IS_SIGNED); - VCVT(src[1], src[1], TO_FLOAT | IS_SIGNED); - VCVT(src[2], src[2], TO_FLOAT | IS_SIGNED); - VMUL(src[0], src[0], S15); - VMUL(src[1], src[1], S15); - VMUL(src[2], src[2], S15); - Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false); -} - -void VertexDecoderJitCache::Jit_NormalS16Skin() { - LDRSH(tempReg1, srcReg, dec_->nrmoff); - LDRSH(tempReg2, srcReg, dec_->nrmoff + 2); - LDRSH(tempReg3, srcReg, dec_->nrmoff + 4); - VMOV(fpScratchReg, tempReg1); - VMOV(fpScratchReg2, tempReg2); - VMOV(fpScratchReg3, tempReg3); - MOVI2F(S15, 1.0f/32768.0f, scratchReg); - VCVT(fpScratchReg, fpScratchReg, TO_FLOAT | IS_SIGNED); - VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT | IS_SIGNED); - VCVT(fpScratchReg3, fpScratchReg3, TO_FLOAT | IS_SIGNED); - VMUL(src[0], fpScratchReg, S15); - VMUL(src[1], fpScratchReg2, S15); - VMUL(src[2], fpScratchReg3, S15); - Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false); -} - -void VertexDecoderJitCache::Jit_NormalFloatSkin() { - VLDR(src[0], srcReg, dec_->nrmoff); - VLDR(src[1], srcReg, dec_->nrmoff + 4); - VLDR(src[2], srcReg, dec_->nrmoff + 8); - Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false); -} - -void VertexDecoderJitCache::Jit_WriteMatrixMul(int outOff, bool pos) { - MOVI2R(tempReg1, (u32)skinMatrix, scratchReg); - for (int i = 0; i < 3; i++) { - VLDR(fpScratchReg, tempReg1, 4 * i); - VMUL(acc[i], fpScratchReg, src[0]); - } - for (int i = 0; i < 3; i++) { - VLDR(fpScratchReg, tempReg1, 12 + 4 * i); - VMLA(acc[i], fpScratchReg, src[1]); - } - for (int i = 0; i < 3; i++) { - VLDR(fpScratchReg, tempReg1, 24 + 4 * i); - VMLA(acc[i], fpScratchReg, src[2]); - } - if (pos) { - for (int i = 0; i < 3; 
i++) { - VLDR(fpScratchReg, tempReg1, 36 + 4 * i); - VADD(acc[i], acc[i], fpScratchReg); - } - } - for (int i = 0; i < 3; i++) { - VSTR(acc[i], dstReg, outOff + i * 4); - } -} - -void VertexDecoderJitCache::Jit_PosS8Skin() { - LDRSB(tempReg1, srcReg, dec_->posoff); - LDRSB(tempReg2, srcReg, dec_->posoff + 1); - LDRSB(tempReg3, srcReg, dec_->posoff + 2); - VMOV(src[0], tempReg1); - VMOV(src[1], tempReg2); - VMOV(src[2], tempReg3); - MOVI2F(S15, 1.0f/128.0f, scratchReg); - VCVT(src[0], src[0], TO_FLOAT | IS_SIGNED); - VCVT(src[1], src[1], TO_FLOAT | IS_SIGNED); - VCVT(src[2], src[2], TO_FLOAT | IS_SIGNED); - VMUL(src[0], src[0], S15); - VMUL(src[1], src[1], S15); - VMUL(src[2], src[2], S15); - Jit_WriteMatrixMul(dec_->decFmt.posoff, true); -} - -void VertexDecoderJitCache::Jit_PosS16Skin() { - LDRSH(tempReg1, srcReg, dec_->posoff); - LDRSH(tempReg2, srcReg, dec_->posoff + 2); - LDRSH(tempReg3, srcReg, dec_->posoff + 4); - VMOV(src[0], tempReg1); - VMOV(src[1], tempReg2); - VMOV(src[2], tempReg3); - MOVI2F(S15, 1.0f/32768.0f, scratchReg); - VCVT(src[0], src[0], TO_FLOAT | IS_SIGNED); - VCVT(src[1], src[1], TO_FLOAT | IS_SIGNED); - VCVT(src[2], src[2], TO_FLOAT | IS_SIGNED); - VMUL(src[0], src[0], S15); - VMUL(src[1], src[1], S15); - VMUL(src[2], src[2], S15); - Jit_WriteMatrixMul(dec_->decFmt.posoff, true); -} - -void VertexDecoderJitCache::Jit_PosFloatSkin() { - VLDR(src[0], srcReg, dec_->posoff); - VLDR(src[1], srcReg, dec_->posoff + 4); - VLDR(src[2], srcReg, dec_->posoff + 8); - Jit_WriteMatrixMul(dec_->decFmt.posoff, true); -} - -#elif defined(_M_X64) || defined(_M_IX86) - -using namespace Gen; - -static const float MEMORY_ALIGNED16( by128[4] ) = { - 1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f -}; -static const float MEMORY_ALIGNED16( by256[4] ) = { - 1.0f / 256, 1.0f / 256, 1.0f / 256, 1.0f / 256 -}; -static const float MEMORY_ALIGNED16( by32768[4] ) = { - 1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f / 32768.0f, -}; - -static const u32 MEMORY_ALIGNED16( threeMasks[4] ) = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0}; -static const u32 MEMORY_ALIGNED16( aOne[4] ) = {0, 0, 0, 0x3F800000}; - -#ifdef _M_X64 -#ifdef _WIN32 -static const X64Reg tempReg1 = RAX; -static const X64Reg tempReg2 = R9; -static const X64Reg tempReg3 = R10; -static const X64Reg srcReg = RCX; -static const X64Reg dstReg = RDX; -static const X64Reg counterReg = R8; -#else -static const X64Reg tempReg1 = RAX; -static const X64Reg tempReg2 = R9; -static const X64Reg tempReg3 = R10; -static const X64Reg srcReg = RDI; -static const X64Reg dstReg = RSI; -static const X64Reg counterReg = RDX; -#endif -#else -static const X64Reg tempReg1 = EAX; -static const X64Reg tempReg2 = EBX; -static const X64Reg tempReg3 = EDX; -static const X64Reg srcReg = ESI; -static const X64Reg dstReg = EDI; -static const X64Reg counterReg = ECX; -#endif - -// XMM0-XMM5 are volatile on Windows X64 -// XMM0-XMM7 are arguments (and thus volatile) on System V ABI (other x64 platforms) -static const X64Reg fpScaleOffsetReg = XMM0; - -static const X64Reg fpScratchReg = XMM1; -static const X64Reg fpScratchReg2 = XMM2; -static const X64Reg fpScratchReg3 = XMM3; - -// We're gonna keep the current skinning matrix in 4 XMM regs. Fortunately we easily -// have space for that now. - -// To debug, just comment them out one at a time until it works. We fall back -// on the interpreter if the compiler fails. 
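// Illustrative sketch of the fallback the comment above describes (function and variable
// names here are hypothetical, not part of this patch): Compile() walks dec.steps_ and bails
// out with 0 as soon as CompileStep() finds no match in the jitLookup table below, so callers
// always keep a working interpreter path.
static void DecodeVertices(VertexDecoderJitCache &cache, const VertexDecoder &dec,
                           const u8 *src, u8 *dst, int count) {
	JittedVertexDecoder jitted = cache.Compile(dec);
	if (jitted) {
		jitted(src, dst, count);  // one call runs the generated loop over the whole batch
	} else {
		// Fall back to the interpreter: the same Step_* functions this table maps from,
		// driven one vertex at a time by VertexDecoder::DecodeVerts().
	}
}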
- -static const JitLookup jitLookup[] = { - {&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8}, - {&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16}, - {&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat}, - - {&VertexDecoder::Step_WeightsU8Skin, &VertexDecoderJitCache::Jit_WeightsU8Skin}, - {&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin}, - {&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin}, - - {&VertexDecoder::Step_TcU8, &VertexDecoderJitCache::Jit_TcU8}, - {&VertexDecoder::Step_TcU16, &VertexDecoderJitCache::Jit_TcU16}, - {&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat}, - {&VertexDecoder::Step_TcU16Double, &VertexDecoderJitCache::Jit_TcU16Double}, - - {&VertexDecoder::Step_TcU8Prescale, &VertexDecoderJitCache::Jit_TcU8Prescale}, - {&VertexDecoder::Step_TcU16Prescale, &VertexDecoderJitCache::Jit_TcU16Prescale}, - {&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale}, - - {&VertexDecoder::Step_TcU16Through, &VertexDecoderJitCache::Jit_TcU16Through}, - {&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough}, - {&VertexDecoder::Step_TcU16ThroughDouble, &VertexDecoderJitCache::Jit_TcU16ThroughDouble}, - - {&VertexDecoder::Step_NormalS8, &VertexDecoderJitCache::Jit_NormalS8}, - {&VertexDecoder::Step_NormalS16, &VertexDecoderJitCache::Jit_NormalS16}, - {&VertexDecoder::Step_NormalFloat, &VertexDecoderJitCache::Jit_NormalFloat}, - - {&VertexDecoder::Step_NormalS8Skin, &VertexDecoderJitCache::Jit_NormalS8Skin}, - {&VertexDecoder::Step_NormalS16Skin, &VertexDecoderJitCache::Jit_NormalS16Skin}, - {&VertexDecoder::Step_NormalFloatSkin, &VertexDecoderJitCache::Jit_NormalFloatSkin}, - - {&VertexDecoder::Step_Color8888, &VertexDecoderJitCache::Jit_Color8888}, - {&VertexDecoder::Step_Color4444, &VertexDecoderJitCache::Jit_Color4444}, - {&VertexDecoder::Step_Color565, &VertexDecoderJitCache::Jit_Color565}, - {&VertexDecoder::Step_Color5551, &VertexDecoderJitCache::Jit_Color5551}, - - {&VertexDecoder::Step_PosS8Through, &VertexDecoderJitCache::Jit_PosS8Through}, - {&VertexDecoder::Step_PosS16Through, &VertexDecoderJitCache::Jit_PosS16Through}, - {&VertexDecoder::Step_PosFloatThrough, &VertexDecoderJitCache::Jit_PosFloat}, - - {&VertexDecoder::Step_PosS8, &VertexDecoderJitCache::Jit_PosS8}, - {&VertexDecoder::Step_PosS16, &VertexDecoderJitCache::Jit_PosS16}, - {&VertexDecoder::Step_PosFloat, &VertexDecoderJitCache::Jit_PosFloat}, - - {&VertexDecoder::Step_PosS8Skin, &VertexDecoderJitCache::Jit_PosS8Skin}, - {&VertexDecoder::Step_PosS16Skin, &VertexDecoderJitCache::Jit_PosS16Skin}, - {&VertexDecoder::Step_PosFloatSkin, &VertexDecoderJitCache::Jit_PosFloatSkin}, -}; - -// TODO: This should probably be global... -#ifdef _M_X64 -#define PTRBITS 64 -#else -#define PTRBITS 32 -#endif - -JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) { - dec_ = &dec; - const u8 *start = this->GetCodePtr(); - -#ifdef _M_IX86 - // Store register values - PUSH(ESI); - PUSH(EDI); - PUSH(EBX); - PUSH(EBP); - - // Read parameters - int offset = 4; - MOV(32, R(srcReg), MDisp(ESP, 16 + offset + 0)); - MOV(32, R(dstReg), MDisp(ESP, 16 + offset + 4)); - MOV(32, R(counterReg), MDisp(ESP, 16 + offset + 8)); -#endif - - // Save XMM4/XMM5 which apparently can be problematic? - // Actually, if they are, it must be a compiler bug because they SHOULD be ok. - // So I won't bother. 
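	// Illustrative note (not from the original change): the generated code is entered through
	// the JittedVertexDecoder signature in VertexDecoder.h, void(const u8 *src, u8 *dst, int count).
	// On Win64 those arguments arrive in RCX/RDX/R8 and on SysV x86-64 in RDI/RSI/RDX - exactly
	// the srcReg/dstReg/counterReg choices above - while the x86-32 prologue loads them from the
	// stack. XMM6 and XMM7 are callee-saved on Win64, which is why the block below spills
	// XMM4-XMM7 before they are repurposed as the skinning matrix accumulator.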
- SUB(PTRBITS, R(ESP), Imm8(64)); - MOVUPS(MDisp(ESP, 0), XMM4); - MOVUPS(MDisp(ESP, 16), XMM5); - MOVUPS(MDisp(ESP, 32), XMM6); - MOVUPS(MDisp(ESP, 48), XMM7); - - bool prescaleStep = false; - // Look for prescaled texcoord steps - for (int i = 0; i < dec.numSteps_; i++) { - if (dec.steps_[i] == &VertexDecoder::Step_TcU8Prescale || - dec.steps_[i] == &VertexDecoder::Step_TcU16Prescale || - dec.steps_[i] == &VertexDecoder::Step_TcFloatPrescale) { - prescaleStep = true; - } - } - - // Add code to convert matrices to 4x4. - // Later we might want to do this when the matrices are loaded instead. - // This is mostly proof of concept. - int boneCount = 0; - if (dec.weighttype && g_Config.bSoftwareSkinning) { - for (int i = 0; i < 8; i++) { - MOVUPS(XMM0, M((void *)(gstate.boneMatrix + 12 * i))); - MOVUPS(XMM1, M((void *)(gstate.boneMatrix + 12 * i + 3))); - MOVUPS(XMM2, M((void *)(gstate.boneMatrix + 12 * i + 3 * 2))); - MOVUPS(XMM3, M((void *)(gstate.boneMatrix + 12 * i + 3 * 3))); - ANDPS(XMM0, M((void *)&threeMasks)); - ANDPS(XMM1, M((void *)&threeMasks)); - ANDPS(XMM2, M((void *)&threeMasks)); - ANDPS(XMM3, M((void *)&threeMasks)); - ORPS(XMM3, M((void *)&aOne)); - MOVAPS(M((void *)(bones + 16 * i)), XMM0); - MOVAPS(M((void *)(bones + 16 * i + 4)), XMM1); - MOVAPS(M((void *)(bones + 16 * i + 8)), XMM2); - MOVAPS(M((void *)(bones + 16 * i + 12)), XMM3); - } - } - - // Keep the scale/offset in a few fp registers if we need it. - if (prescaleStep) { -#ifdef _M_X64 - MOV(64, R(tempReg1), Imm64((u64)(&gstate_c.uv))); -#else - MOV(32, R(tempReg1), Imm32((u32)(&gstate_c.uv))); -#endif - MOVSS(fpScaleOffsetReg, MDisp(tempReg1, 0)); - MOVSS(fpScratchReg, MDisp(tempReg1, 4)); - UNPCKLPS(fpScaleOffsetReg, R(fpScratchReg)); - if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) { - MULPS(fpScaleOffsetReg, M((void *)&by128)); - } else if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_16BIT) { - MULPS(fpScaleOffsetReg, M((void *)&by32768)); - } - MOVSS(fpScratchReg, MDisp(tempReg1, 8)); - MOVSS(fpScratchReg2, MDisp(tempReg1, 12)); - UNPCKLPS(fpScratchReg, R(fpScratchReg2)); - UNPCKLPD(fpScaleOffsetReg, R(fpScratchReg)); - } - - // Let's not bother with a proper stack frame. We just grab the arguments and go. - JumpTarget loopStart = GetCodePtr(); - for (int i = 0; i < dec.numSteps_; i++) { - if (!CompileStep(dec, i)) { - // Reset the code ptr and return zero to indicate that we failed. 
- SetCodePtr(const_cast(start)); - return 0; - } - } - - ADD(PTRBITS, R(srcReg), Imm32(dec.VertexSize())); - ADD(PTRBITS, R(dstReg), Imm32(dec.decFmt.stride)); - SUB(32, R(counterReg), Imm8(1)); - J_CC(CC_NZ, loopStart, true); - - MOVUPS(XMM4, MDisp(ESP, 0)); - MOVUPS(XMM5, MDisp(ESP, 16)); - MOVUPS(XMM6, MDisp(ESP, 32)); - MOVUPS(XMM7, MDisp(ESP, 48)); - ADD(PTRBITS, R(ESP), Imm8(64)); - -#ifdef _M_IX86 - // Restore register values - POP(EBP); - POP(EBX); - POP(EDI); - POP(ESI); -#endif - - RET(); - - return (JittedVertexDecoder)start; -} - -void VertexDecoderJitCache::Jit_WeightsU8() { - switch (dec_->nweights) { - case 1: - MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->weightoff)); - MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1)); - return; - case 2: - MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->weightoff)); - MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1)); - return; - case 3: - MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff)); - AND(32, R(tempReg1), Imm32(0x00FFFFFF)); - MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1)); - return; - case 4: - MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff)); - MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1)); - return; - case 8: - MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff)); - MOV(32, R(tempReg2), MDisp(srcReg, dec_->weightoff + 4)); - MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1)); - MOV(32, MDisp(dstReg, dec_->decFmt.w1off), R(tempReg2)); - return; - } - - // Basic implementation - a byte at a time. TODO: Optimize - int j; - for (j = 0; j < dec_->nweights; j++) { - MOV(8, R(tempReg1), MDisp(srcReg, dec_->weightoff + j)); - MOV(8, MDisp(dstReg, dec_->decFmt.w0off + j), R(tempReg1)); - } - while (j & 3) { - MOV(8, MDisp(dstReg, dec_->decFmt.w0off + j), Imm8(0)); - j++; - } -} - -void VertexDecoderJitCache::Jit_WeightsU16() { - switch (dec_->nweights) { - case 1: - MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->weightoff)); - MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1)); - MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), Imm32(0)); - return; - - case 2: - MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff)); - MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1)); - MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), Imm32(0)); - return; - - case 3: - MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff)); - MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->weightoff + 4)); - MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1)); - MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), R(tempReg2)); - return; - - case 4: - MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff)); - MOV(32, R(tempReg2), MDisp(srcReg, dec_->weightoff + 4)); - MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1)); - MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), R(tempReg2)); - return; - } - - // Basic implementation - a short at a time. TODO: Optimize - int j; - for (j = 0; j < dec_->nweights; j++) { - MOV(16, R(tempReg1), MDisp(srcReg, dec_->weightoff + j * 2)); - MOV(16, MDisp(dstReg, dec_->decFmt.w0off + j * 2), R(tempReg1)); - } - while (j & 3) { - MOV(16, MDisp(dstReg, dec_->decFmt.w0off + j * 2), Imm16(0)); - j++; - } -} - -void VertexDecoderJitCache::Jit_WeightsFloat() { - int j; - for (j = 0; j < dec_->nweights; j++) { - MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff + j * 4)); - MOV(32, MDisp(dstReg, dec_->decFmt.w0off + j * 4), R(tempReg1)); - } - while (j & 3) { // Zero additional weights rounding up to 4. 
- MOV(32, MDisp(dstReg, dec_->decFmt.w0off + j * 4), Imm32(0)); - j++; - } -} - -void VertexDecoderJitCache::Jit_WeightsU8Skin() { -#ifdef _M_X64 - MOV(PTRBITS, R(tempReg2), Imm64((uintptr_t)&bones)); -#else - MOV(PTRBITS, R(tempReg2), Imm32((uintptr_t)&bones)); -#endif - for (int j = 0; j < dec_->nweights; j++) { - MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->weightoff + j)); - CVTSI2SS(XMM1, R(tempReg1)); - MULSS(XMM1, M((void *)&by128)); - SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0)); - if (j == 0) { - MOVAPS(XMM4, MDisp(tempReg2, 0)); - MOVAPS(XMM5, MDisp(tempReg2, 16)); - MULPS(XMM4, R(XMM1)); - MULPS(XMM5, R(XMM1)); - MOVAPS(XMM6, MDisp(tempReg2, 32)); - MOVAPS(XMM7, MDisp(tempReg2, 48)); - MULPS(XMM6, R(XMM1)); - MULPS(XMM7, R(XMM1)); - } else { - MOVAPS(XMM2, MDisp(tempReg2, 0)); - MOVAPS(XMM3, MDisp(tempReg2, 16)); - MULPS(XMM2, R(XMM1)); - MULPS(XMM3, R(XMM1)); - ADDPS(XMM4, R(XMM2)); - ADDPS(XMM5, R(XMM3)); - MOVAPS(XMM2, MDisp(tempReg2, 32)); - MOVAPS(XMM3, MDisp(tempReg2, 48)); - MULPS(XMM2, R(XMM1)); - MULPS(XMM3, R(XMM1)); - ADDPS(XMM6, R(XMM2)); - ADDPS(XMM7, R(XMM3)); - } - ADD(PTRBITS, R(tempReg2), Imm8(4 * 16)); - } -} - -void VertexDecoderJitCache::Jit_WeightsU16Skin() { -#ifdef _M_X64 - MOV(PTRBITS, R(tempReg2), Imm64((uintptr_t)&bones)); -#else - MOV(PTRBITS, R(tempReg2), Imm32((uintptr_t)&bones)); -#endif - for (int j = 0; j < dec_->nweights; j++) { - MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->weightoff + j * 2)); - CVTSI2SS(XMM1, R(tempReg1)); - MULSS(XMM1, M((void *)&by32768)); - SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0)); - if (j == 0) { - MOVAPS(XMM4, MDisp(tempReg2, 0)); - MOVAPS(XMM5, MDisp(tempReg2, 16)); - MULPS(XMM4, R(XMM1)); - MULPS(XMM5, R(XMM1)); - MOVAPS(XMM6, MDisp(tempReg2, 32)); - MOVAPS(XMM7, MDisp(tempReg2, 48)); - MULPS(XMM6, R(XMM1)); - MULPS(XMM7, R(XMM1)); - } else { - MOVAPS(XMM2, MDisp(tempReg2, 0)); - MOVAPS(XMM3, MDisp(tempReg2, 16)); - MULPS(XMM2, R(XMM1)); - MULPS(XMM3, R(XMM1)); - ADDPS(XMM4, R(XMM2)); - ADDPS(XMM5, R(XMM3)); - MOVAPS(XMM2, MDisp(tempReg2, 32)); - MOVAPS(XMM3, MDisp(tempReg2, 48)); - MULPS(XMM2, R(XMM1)); - MULPS(XMM3, R(XMM1)); - ADDPS(XMM6, R(XMM2)); - ADDPS(XMM7, R(XMM3)); - } - ADD(PTRBITS, R(tempReg2), Imm8(4 * 16)); - } -} - -void VertexDecoderJitCache::Jit_WeightsFloatSkin() { -#ifdef _M_X64 - MOV(PTRBITS, R(tempReg2), Imm64((uintptr_t)&bones)); -#else - MOV(PTRBITS, R(tempReg2), Imm32((uintptr_t)&bones)); -#endif - for (int j = 0; j < dec_->nweights; j++) { - MOVSS(XMM1, MDisp(srcReg, dec_->weightoff + j * 4)); - SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0)); - if (j == 0) { - MOVAPS(XMM4, MDisp(tempReg2, 0)); - MOVAPS(XMM5, MDisp(tempReg2, 16)); - MULPS(XMM4, R(XMM1)); - MULPS(XMM5, R(XMM1)); - MOVAPS(XMM6, MDisp(tempReg2, 32)); - MOVAPS(XMM7, MDisp(tempReg2, 48)); - MULPS(XMM6, R(XMM1)); - MULPS(XMM7, R(XMM1)); - } else { - MOVAPS(XMM2, MDisp(tempReg2, 0)); - MOVAPS(XMM3, MDisp(tempReg2, 16)); - MULPS(XMM2, R(XMM1)); - MULPS(XMM3, R(XMM1)); - ADDPS(XMM4, R(XMM2)); - ADDPS(XMM5, R(XMM3)); - MOVAPS(XMM2, MDisp(tempReg2, 32)); - MOVAPS(XMM3, MDisp(tempReg2, 48)); - MULPS(XMM2, R(XMM1)); - MULPS(XMM3, R(XMM1)); - ADDPS(XMM6, R(XMM2)); - ADDPS(XMM7, R(XMM3)); - } - ADD(PTRBITS, R(tempReg2), Imm8(4 * 16)); - } -} - -// Fill last two bytes with zeroes to align to 4 bytes. MOVZX does it for us, handy. 
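// Illustrative scalar sketch of the 3x4 -> 4x4 padding that Compile() performs with
// threeMasks/aOne before the loop, and that the Jit_Weights*Skin routines above rely on when
// they read 16-float matrices from 'bones' (the helper name is hypothetical): each PSP bone
// matrix is 12 floats in four groups of three; every group gets a zero appended except the
// last one (the translation), which gets a 1 so it acts as a proper 4x4 transform row.
static void PadBoneMatrixTo4x4(const float in12[12], float out16[16]) {
	for (int g = 0; g < 4; g++) {
		out16[g * 4 + 0] = in12[g * 3 + 0];
		out16[g * 4 + 1] = in12[g * 3 + 1];
		out16[g * 4 + 2] = in12[g * 3 + 2];
		out16[g * 4 + 3] = (g == 3) ? 1.0f : 0.0f;  // ANDPS with threeMasks, ORPS with aOne
	}
}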
-void VertexDecoderJitCache::Jit_TcU8() { - MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->tcoff)); - MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1)); -} - -void VertexDecoderJitCache::Jit_TcU16() { - MOV(32, R(tempReg1), MDisp(srcReg, dec_->tcoff)); - MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1)); -} - -void VertexDecoderJitCache::Jit_TcU16Double() { - MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->tcoff)); - MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->tcoff + 2)); - SHL(16, R(tempReg1), Imm8(1)); // 16 to get a wall to shift into - SHL(32, R(tempReg2), Imm8(17)); - OR(32, R(tempReg1), R(tempReg2)); - MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1)); -} - -void VertexDecoderJitCache::Jit_TcFloat() { -#ifdef _M_X64 - MOV(64, R(tempReg1), MDisp(srcReg, dec_->tcoff)); - MOV(64, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1)); -#else - MOV(32, R(tempReg1), MDisp(srcReg, dec_->tcoff)); - MOV(32, R(tempReg2), MDisp(srcReg, dec_->tcoff + 4)); - MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1)); - MOV(32, MDisp(dstReg, dec_->decFmt.uvoff + 4), R(tempReg2)); -#endif -} - -void VertexDecoderJitCache::Jit_TcU8Prescale() { - // TODO: The first five instructions could be done in 1 or 2 in SSE4 - MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->tcoff)); - MOVZX(32, 8, tempReg2, MDisp(srcReg, dec_->tcoff + 1)); - CVTSI2SS(fpScratchReg, R(tempReg1)); - CVTSI2SS(fpScratchReg2, R(tempReg2)); - UNPCKLPS(fpScratchReg, R(fpScratchReg2)); - MULPS(fpScratchReg, R(fpScaleOffsetReg)); - SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2)); - ADDPS(fpScratchReg, R(fpScaleOffsetReg)); - SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2)); - MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); -} - -void VertexDecoderJitCache::Jit_TcU16Prescale() { - PXOR(fpScratchReg2, R(fpScratchReg2)); - MOVD_xmm(fpScratchReg, MDisp(srcReg, dec_->tcoff)); - PUNPCKLWD(fpScratchReg, R(fpScratchReg2)); - CVTDQ2PS(fpScratchReg, R(fpScratchReg)); - MULPS(fpScratchReg, R(fpScaleOffsetReg)); - SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2)); - ADDPS(fpScratchReg, R(fpScaleOffsetReg)); - SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2)); - MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); -} - -void VertexDecoderJitCache::Jit_TcFloatPrescale() { - MOVQ_xmm(fpScratchReg, MDisp(srcReg, dec_->tcoff)); - MULPS(fpScratchReg, R(fpScaleOffsetReg)); - SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2)); - ADDPS(fpScratchReg, R(fpScaleOffsetReg)); - SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2)); - MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); -} - -void VertexDecoderJitCache::Jit_TcU16Through() { - MOV(32, R(tempReg1), MDisp(srcReg, dec_->tcoff)); - MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1)); -} - -void VertexDecoderJitCache::Jit_TcU16ThroughDouble() { - MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->tcoff)); - MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->tcoff + 2)); - SHL(16, R(tempReg1), Imm8(1)); // 16 to get a wall to shift into - SHL(32, R(tempReg2), Imm8(17)); - OR(32, R(tempReg1), R(tempReg2)); - MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1)); -} - -void VertexDecoderJitCache::Jit_TcFloatThrough() { -#ifdef _M_X64 - MOV(64, R(tempReg1), MDisp(srcReg, dec_->tcoff)); - MOV(64, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1)); -#else - MOV(32, R(tempReg1), MDisp(srcReg, dec_->tcoff)); - MOV(32, R(tempReg2), MDisp(srcReg, dec_->tcoff + 4)); - MOV(32, 
MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1)); - MOV(32, MDisp(dstReg, dec_->decFmt.uvoff + 4), R(tempReg2)); -#endif -} - -void VertexDecoderJitCache::Jit_Color8888() { - MOV(32, R(tempReg1), MDisp(srcReg, dec_->coloff)); - MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg1)); -} - -static const u32 MEMORY_ALIGNED16(nibbles[4]) = { 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, }; - - -void VertexDecoderJitCache::Jit_Color4444() { - // Needs benchmarking. A bit wasteful by only using 1 SSE lane. -#if 0 - // Alternate approach - MOVD_xmm(XMM3, MDisp(srcReg, dec_->coloff)); - MOVAPS(XMM2, R(XMM3)); - MOVAPS(XMM1, M((void *)nibbles)); - PSLLD(XMM2, 4); - PAND(XMM3, R(XMM1)); - PAND(XMM2, R(XMM1)); - PSRLD(XMM2, 4); - PXOR(XMM1, R(XMM1)); - PUNPCKLBW(XMM2, R(XMM1)); - PUNPCKLBW(XMM3, R(XMM1)); - PSLLD(XMM2, 4); - POR(XMM3, R(XMM2)); - MOVAPS(XMM2, R(XMM3)); - PSLLD(XMM2, 4); - POR(XMM3, R(XMM2)); - MOVD_xmm(MDisp(dstReg, dec_->decFmt.c0off), XMM3); - return; -#endif - - MOV(32, R(tempReg1), MDisp(srcReg, dec_->coloff)); - - // 0000ABGR, copy R and double forwards. - MOV(32, R(tempReg3), R(tempReg1)); - AND(32, R(tempReg3), Imm32(0x0000000F)); - MOV(32, R(tempReg2), R(tempReg3)); - SHL(32, R(tempReg3), Imm8(4)); - OR(32, R(tempReg2), R(tempReg3)); - - // tempReg1 -> 00ABGR00, then double G backwards. - SHL(32, R(tempReg1), Imm8(8)); - MOV(32, R(tempReg3), R(tempReg1)); - AND(32, R(tempReg3), Imm32(0x0000F000)); - OR(32, R(tempReg2), R(tempReg3)); - SHR(32, R(tempReg3), Imm8(4)); - OR(32, R(tempReg2), R(tempReg3)); - - // Now do B forwards again (still 00ABGR00.) - MOV(32, R(tempReg3), R(tempReg1)); - AND(32, R(tempReg3), Imm32(0x000F0000)); - OR(32, R(tempReg2), R(tempReg3)); - SHL(32, R(tempReg3), Imm8(4)); - OR(32, R(tempReg2), R(tempReg3)); - - // tempReg1 -> ABGR0000, then double A backwards. - SHL(32, R(tempReg1), Imm8(8)); - MOV(32, R(tempReg3), R(tempReg1)); - AND(32, R(tempReg3), Imm32(0xF0000000)); - OR(32, R(tempReg2), R(tempReg3)); - SHR(32, R(tempReg3), Imm8(4)); - OR(32, R(tempReg2), R(tempReg3)); - - MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg2)); -} - -void VertexDecoderJitCache::Jit_Color565() { - MOV(32, R(tempReg1), MDisp(srcReg, dec_->coloff)); - - MOV(32, R(tempReg2), R(tempReg1)); - AND(32, R(tempReg2), Imm32(0x0000001F)); - - // B (we do R and B at the same time, they're both 5.) - MOV(32, R(tempReg3), R(tempReg1)); - AND(32, R(tempReg3), Imm32(0x0000F800)); - SHL(32, R(tempReg3), Imm8(5)); - OR(32, R(tempReg2), R(tempReg3)); - - // Expand 5 -> 8. At this point we have 00BB00RR. - MOV(32, R(tempReg3), R(tempReg2)); - SHL(32, R(tempReg2), Imm8(3)); - SHR(32, R(tempReg3), Imm8(2)); - OR(32, R(tempReg2), R(tempReg3)); - AND(32, R(tempReg2), Imm32(0x00FF00FF)); - - // Now's as good a time to put in A as any. - OR(32, R(tempReg2), Imm32(0xFF000000)); - - // Last, we need to align, extract, and expand G. - // 3 to align to G, and then 2 to expand to 8. - SHL(32, R(tempReg1), Imm8(3 + 2)); - AND(32, R(tempReg1), Imm32(0x0000FC00)); - MOV(32, R(tempReg3), R(tempReg1)); - // 2 to account for tempReg1 being preshifted, 4 for expansion. 
- SHR(32, R(tempReg3), Imm8(2 + 4)); - OR(32, R(tempReg1), R(tempReg3)); - AND(32, R(tempReg1), Imm32(0x0000FF00)); - OR(32, R(tempReg2), R(tempReg1)); - - MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg2)); -} - -void VertexDecoderJitCache::Jit_Color5551() { - MOV(32, R(tempReg1), MDisp(srcReg, dec_->coloff)); - - MOV(32, R(tempReg2), R(tempReg1)); - AND(32, R(tempReg2), Imm32(0x0000001F)); - - MOV(32, R(tempReg3), R(tempReg1)); - AND(32, R(tempReg3), Imm32(0x000003E0)); - SHL(32, R(tempReg3), Imm8(3)); - OR(32, R(tempReg2), R(tempReg3)); - - MOV(32, R(tempReg3), R(tempReg1)); - AND(32, R(tempReg3), Imm32(0x00007C00)); - SHL(32, R(tempReg3), Imm8(6)); - OR(32, R(tempReg2), R(tempReg3)); - - // Expand 5 -> 8. After this is just A. - MOV(32, R(tempReg3), R(tempReg2)); - SHL(32, R(tempReg2), Imm8(3)); - SHR(32, R(tempReg3), Imm8(2)); - // Chop off the bits that were shifted out. - AND(32, R(tempReg3), Imm32(0x00070707)); - OR(32, R(tempReg2), R(tempReg3)); - - // For A, we shift it to a single bit, and then subtract and XOR. - // That's probably the simplest way to expand it... - SHR(32, R(tempReg1), Imm8(15)); - // If it was 0, it's now -1, otherwise it's 0. Easy. - SUB(32, R(tempReg1), Imm8(1)); - XOR(32, R(tempReg1), Imm32(0xFF000000)); - AND(32, R(tempReg1), Imm32(0xFF000000)); - OR(32, R(tempReg2), R(tempReg1)); - - MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg2)); -} - -// Copy 3 bytes and then a zero. Might as well copy four. -void VertexDecoderJitCache::Jit_NormalS8() { - MOV(32, R(tempReg1), MDisp(srcReg, dec_->nrmoff)); - AND(32, R(tempReg1), Imm32(0x00FFFFFF)); - MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff), R(tempReg1)); -} - -// Copy 6 bytes and then 2 zeroes. -void VertexDecoderJitCache::Jit_NormalS16() { - MOV(32, R(tempReg1), MDisp(srcReg, dec_->nrmoff)); - MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->nrmoff + 4)); - MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff), R(tempReg1)); - MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff + 4), R(tempReg2)); -} - -void VertexDecoderJitCache::Jit_NormalFloat() { - MOV(32, R(tempReg1), MDisp(srcReg, dec_->nrmoff)); - MOV(32, R(tempReg2), MDisp(srcReg, dec_->nrmoff + 4)); - MOV(32, R(tempReg3), MDisp(srcReg, dec_->nrmoff + 8)); - MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff), R(tempReg1)); - MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff + 4), R(tempReg2)); - MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff + 8), R(tempReg3)); -} - -// This could be a bit shorter with AVX 3-operand instructions and FMA. -void VertexDecoderJitCache::Jit_WriteMatrixMul(int outOff, bool pos) { - MOVAPS(XMM1, R(XMM3)); - SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0)); - MULPS(XMM1, R(XMM4)); - MOVAPS(XMM2, R(XMM3)); - SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(1, 1, 1, 1)); - MULPS(XMM2, R(XMM5)); - ADDPS(XMM1, R(XMM2)); - MOVAPS(XMM2, R(XMM3)); - SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(2, 2, 2, 2)); - MULPS(XMM2, R(XMM6)); - ADDPS(XMM1, R(XMM2)); - if (pos) { - ADDPS(XMM1, R(XMM7)); - } - MOVUPS(MDisp(dstReg, outOff), XMM1); -} - -void VertexDecoderJitCache::Jit_NormalS8Skin() { - XORPS(XMM3, R(XMM3)); - MOVD_xmm(XMM1, MDisp(srcReg, dec_->nrmoff)); - PUNPCKLBW(XMM1, R(XMM3)); - PUNPCKLWD(XMM1, R(XMM3)); - PSLLD(XMM1, 24); - PSRAD(XMM1, 24); // Ugly sign extension, can be done faster in SSE4 - CVTDQ2PS(XMM3, R(XMM1)); - MULPS(XMM3, M((void *)&by128)); - Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false); -} - -// Copy 6 bytes and then 2 zeroes. 
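// Illustrative scalar equivalent of the transform Jit_WriteMatrixMul emits above (the helper
// name is hypothetical): each input component is broadcast and multiplied against one row of
// the blended 4x4 skin matrix kept in XMM4-XMM7; the translation row is added only when 'pos'
// is set, so normals are rotated and scaled but never translated.
static void SkinTransform(const float matrix16[16], const float in[3], bool pos, float out[3]) {
	for (int i = 0; i < 3; i++) {
		out[i] = in[0] * matrix16[0 + i] + in[1] * matrix16[4 + i] + in[2] * matrix16[8 + i];
		if (pos)
			out[i] += matrix16[12 + i];
	}
}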
-void VertexDecoderJitCache::Jit_NormalS16Skin() { - XORPS(XMM3, R(XMM3)); - MOVQ_xmm(XMM1, MDisp(srcReg, dec_->nrmoff)); - PUNPCKLWD(XMM1, R(XMM3)); - PSLLD(XMM1, 16); - PSRAD(XMM1, 16); // Ugly sign extension, can be done faster in SSE4 - CVTDQ2PS(XMM3, R(XMM1)); - MULPS(XMM3, M((void *)&by32768)); - Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false); -} - -void VertexDecoderJitCache::Jit_NormalFloatSkin() { - MOVUPS(XMM3, MDisp(srcReg, dec_->nrmoff)); - Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false); -} - -// Through expands into floats, always. Might want to look at changing this. -void VertexDecoderJitCache::Jit_PosS8Through() { - // TODO: SIMD - for (int i = 0; i < 3; i++) { - MOVSX(32, 8, tempReg1, MDisp(srcReg, dec_->posoff + i)); - CVTSI2SS(fpScratchReg, R(tempReg1)); - MOVSS(MDisp(dstReg, dec_->decFmt.posoff + i * 4), fpScratchReg); - } -} - -// Through expands into floats, always. Might want to look at changing this. -void VertexDecoderJitCache::Jit_PosS16Through() { - XORPS(XMM3, R(XMM3)); - MOVQ_xmm(XMM1, MDisp(srcReg, dec_->posoff)); - PUNPCKLWD(XMM1, R(XMM3)); - PSLLD(XMM1, 16); - PSRAD(XMM1, 16); // Ugly sign extension, can be done faster in SSE4 - CVTDQ2PS(XMM3, R(XMM1)); - MOVUPS(MDisp(dstReg, dec_->decFmt.posoff), XMM3); -} - -// Copy 3 bytes and then a zero. Might as well copy four. -void VertexDecoderJitCache::Jit_PosS8() { - MOV(32, R(tempReg1), MDisp(srcReg, dec_->posoff)); - AND(32, R(tempReg1), Imm32(0x00FFFFFF)); - MOV(32, MDisp(dstReg, dec_->decFmt.posoff), R(tempReg1)); -} - -// Copy 6 bytes and then 2 zeroes. -void VertexDecoderJitCache::Jit_PosS16() { - MOV(32, R(tempReg1), MDisp(srcReg, dec_->posoff)); - MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->posoff + 4)); - MOV(32, MDisp(dstReg, dec_->decFmt.posoff), R(tempReg1)); - MOV(32, MDisp(dstReg, dec_->decFmt.posoff + 4), R(tempReg2)); -} - -// Just copy 12 bytes. -void VertexDecoderJitCache::Jit_PosFloat() { - MOV(32, R(tempReg1), MDisp(srcReg, dec_->posoff)); - MOV(32, R(tempReg2), MDisp(srcReg, dec_->posoff + 4)); - MOV(32, R(tempReg3), MDisp(srcReg, dec_->posoff + 8)); - MOV(32, MDisp(dstReg, dec_->decFmt.posoff), R(tempReg1)); - MOV(32, MDisp(dstReg, dec_->decFmt.posoff + 4), R(tempReg2)); - MOV(32, MDisp(dstReg, dec_->decFmt.posoff + 8), R(tempReg3)); -} - -void VertexDecoderJitCache::Jit_PosS8Skin() { - XORPS(XMM3, R(XMM3)); - MOVD_xmm(XMM1, MDisp(srcReg, dec_->posoff)); - PUNPCKLBW(XMM1, R(XMM3)); - PUNPCKLWD(XMM1, R(XMM3)); - PSLLD(XMM1, 24); - PSRAD(XMM1, 24); // Ugly sign extension, can be done faster in SSE4 - CVTDQ2PS(XMM3, R(XMM1)); - MULPS(XMM3, M((void *)&by128)); - Jit_WriteMatrixMul(dec_->decFmt.posoff, true); -} - -void VertexDecoderJitCache::Jit_PosS16Skin() { - XORPS(XMM3, R(XMM3)); - MOVQ_xmm(XMM1, MDisp(srcReg, dec_->posoff)); - PUNPCKLWD(XMM1, R(XMM3)); - PSLLD(XMM1, 16); - PSRAD(XMM1, 16); // Ugly sign extension, can be done faster in SSE4 - CVTDQ2PS(XMM3, R(XMM1)); - MULPS(XMM3, M((void *)&by32768)); - Jit_WriteMatrixMul(dec_->decFmt.posoff, true); -} - -// Just copy 12 bytes. -void VertexDecoderJitCache::Jit_PosFloatSkin() { - MOVUPS(XMM3, MDisp(srcReg, dec_->posoff)); - Jit_WriteMatrixMul(dec_->decFmt.posoff, true); -} - -#elif defined(PPC) +#if defined(PPC) #error This should not be built for PowerPC, at least not yet. 
#endif - -bool VertexDecoderJitCache::CompileStep(const VertexDecoder &dec, int step) { - // See if we find a matching JIT function - for (size_t i = 0; i < ARRAY_SIZE(jitLookup); i++) { - if (dec.steps_[step] == jitLookup[i].func) { - ((*this).*jitLookup[i].jitFunc)(); - return true; - } - } - return false; -} diff --git a/GPU/GLES/VertexDecoder.h b/GPU/GLES/VertexDecoder.h index eee81bc091..ab974aef00 100644 --- a/GPU/GLES/VertexDecoder.h +++ b/GPU/GLES/VertexDecoder.h @@ -34,7 +34,12 @@ class VertexDecoder; class VertexDecoderJitCache; typedef void (VertexDecoder::*StepFunction)() const; +typedef void (VertexDecoderJitCache::*JitStepFunction)(); +struct JitLookup { + StepFunction func; + JitStepFunction jitFunc; +}; typedef void (*JittedVertexDecoder)(const u8 *src, u8 *dst, int count); diff --git a/GPU/GLES/VertexDecoderArm.cpp b/GPU/GLES/VertexDecoderArm.cpp new file mode 100644 index 0000000000..f43e70fc96 --- /dev/null +++ b/GPU/GLES/VertexDecoderArm.cpp @@ -0,0 +1,748 @@ +// Copyright (c) 2013- PPSSPP Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official git repository and contact information can be found at +// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. + +#include "Common/CPUDetect.h" +#include "GPU/GLES/VertexDecoder.h" + +// Used only in non-NEON mode. +static float MEMORY_ALIGNED16(skinMatrix[12]); + +// Will be used only in NEON mode. +static float MEMORY_ALIGNED16(bones[16 * 6]); // First two are kept in registers. + +// NEON register allocation: +// Q0: Texture scaling parameters +// Q1: Temp storage +// Q2: Vector-by-matrix accumulator +// Q3: Unused +// +// We'll use Q4-Q7 as the "matrix accumulator". +// First two matrices will be preloaded into Q8-Q11 and Q12-Q15 to reduce +// memory bandwidth requirements. +// The rest will be dumped to bones as on x86. + + +static const float by128 = 1.0f / 128.0f; +static const float by256 = 1.0f / 256.0f; +static const float by32768 = 1.0f / 32768.0f; + +using namespace ArmGen; + +static const ARMReg tempReg1 = R3; +static const ARMReg tempReg2 = R4; +static const ARMReg tempReg3 = R5; +static const ARMReg scratchReg = R6; +static const ARMReg scratchReg2 = R7; +static const ARMReg scratchReg3 = R12; +static const ARMReg srcReg = R0; +static const ARMReg dstReg = R1; +static const ARMReg counterReg = R2; +static const ARMReg fpScratchReg = S4; +static const ARMReg fpScratchReg2 = S5; +static const ARMReg fpScratchReg3 = S6; +static const ARMReg fpScratchReg4 = S7; +static const ARMReg fpUscaleReg = S0; +static const ARMReg fpVscaleReg = S1; +static const ARMReg fpUoffsetReg = S2; +static const ARMReg fpVoffsetReg = S3; + +// Simpler aliases for NEON. Overlaps with corresponding VFP regs. 
+static const ARMReg neonUVScaleReg = D0; +static const ARMReg neonUVOffsetReg = D1; +static const ARMReg neonScratchReg = D2; +static const ARMReg neonScratchRegQ = Q1; // Overlaps with all the scratch regs + +// Everything above S6 is fair game for skinning + +// S8-S15 are used during matrix generation + +// These only live through the matrix multiplication +static const ARMReg src[3] = {S8, S9, S10}; // skin source +static const ARMReg acc[3] = {S11, S12, S13}; // skin accumulator + +static const JitLookup jitLookup[] = { + {&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8}, + {&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16}, + {&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat}, + + {&VertexDecoder::Step_WeightsU8Skin, &VertexDecoderJitCache::Jit_WeightsU8Skin}, + {&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin}, + {&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin}, + + {&VertexDecoder::Step_TcU8, &VertexDecoderJitCache::Jit_TcU8}, + {&VertexDecoder::Step_TcU16, &VertexDecoderJitCache::Jit_TcU16}, + {&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat}, + {&VertexDecoder::Step_TcU16Double, &VertexDecoderJitCache::Jit_TcU16Double}, + + {&VertexDecoder::Step_TcU8Prescale, &VertexDecoderJitCache::Jit_TcU8Prescale}, + {&VertexDecoder::Step_TcU16Prescale, &VertexDecoderJitCache::Jit_TcU16Prescale}, + {&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale}, + + {&VertexDecoder::Step_TcU16Through, &VertexDecoderJitCache::Jit_TcU16Through}, + {&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough}, + {&VertexDecoder::Step_TcU16ThroughDouble, &VertexDecoderJitCache::Jit_TcU16ThroughDouble}, + + {&VertexDecoder::Step_NormalS8, &VertexDecoderJitCache::Jit_NormalS8}, + {&VertexDecoder::Step_NormalS16, &VertexDecoderJitCache::Jit_NormalS16}, + {&VertexDecoder::Step_NormalFloat, &VertexDecoderJitCache::Jit_NormalFloat}, + + {&VertexDecoder::Step_NormalS8Skin, &VertexDecoderJitCache::Jit_NormalS8Skin}, + {&VertexDecoder::Step_NormalS16Skin, &VertexDecoderJitCache::Jit_NormalS16Skin}, + {&VertexDecoder::Step_NormalFloatSkin, &VertexDecoderJitCache::Jit_NormalFloatSkin}, + + {&VertexDecoder::Step_Color8888, &VertexDecoderJitCache::Jit_Color8888}, + {&VertexDecoder::Step_Color4444, &VertexDecoderJitCache::Jit_Color4444}, + {&VertexDecoder::Step_Color565, &VertexDecoderJitCache::Jit_Color565}, + {&VertexDecoder::Step_Color5551, &VertexDecoderJitCache::Jit_Color5551}, + + {&VertexDecoder::Step_PosS8Through, &VertexDecoderJitCache::Jit_PosS8Through}, + {&VertexDecoder::Step_PosS16Through, &VertexDecoderJitCache::Jit_PosS16Through}, + {&VertexDecoder::Step_PosFloatThrough, &VertexDecoderJitCache::Jit_PosFloat}, + + {&VertexDecoder::Step_PosS8, &VertexDecoderJitCache::Jit_PosS8}, + {&VertexDecoder::Step_PosS16, &VertexDecoderJitCache::Jit_PosS16}, + {&VertexDecoder::Step_PosFloat, &VertexDecoderJitCache::Jit_PosFloat}, + + {&VertexDecoder::Step_PosS8Skin, &VertexDecoderJitCache::Jit_PosS8Skin}, + {&VertexDecoder::Step_PosS16Skin, &VertexDecoderJitCache::Jit_PosS16Skin}, + {&VertexDecoder::Step_PosFloatSkin, &VertexDecoderJitCache::Jit_PosFloatSkin}, +}; + +JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) { + dec_ = &dec; + const u8 *start = AlignCode16(); + + bool prescaleStep = false; + bool skinning = false; + + // Look for prescaled texcoord steps + for (int i = 0; i < 
dec.numSteps_; i++) {
+		if (dec.steps_[i] == &VertexDecoder::Step_TcU8Prescale ||
+			dec.steps_[i] == &VertexDecoder::Step_TcU16Prescale ||
+			dec.steps_[i] == &VertexDecoder::Step_TcFloatPrescale) {
+			prescaleStep = true;
+		}
+		if (dec.steps_[i] == &VertexDecoder::Step_WeightsU8Skin ||
+			dec.steps_[i] == &VertexDecoder::Step_WeightsU16Skin ||
+			dec.steps_[i] == &VertexDecoder::Step_WeightsFloatSkin) {
+			skinning = true;
+		}
+	}
+
+	SetCC(CC_AL);
+
+	PUSH(6, R4, R5, R6, R7, R8, _LR);
+
+	// Keep the scale/offset in a few fp registers if we need it.
+	// This step can be NEON-ized but the savings would be minuscule.
+	if (prescaleStep) {
+		MOVI2R(R3, (u32)(&gstate_c.uv), scratchReg);
+		VLDR(fpUscaleReg, R3, 0);
+		VLDR(fpVscaleReg, R3, 4);
+		VLDR(fpUoffsetReg, R3, 8);
+		VLDR(fpVoffsetReg, R3, 12);
+		if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) {
+			MOVI2F(fpScratchReg, by128, scratchReg);
+			VMUL(fpUscaleReg, fpUscaleReg, fpScratchReg);
+			VMUL(fpVscaleReg, fpVscaleReg, fpScratchReg);
+		} else if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_16BIT) {
+			MOVI2F(fpScratchReg, by32768, scratchReg);
+			VMUL(fpUscaleReg, fpUscaleReg, fpScratchReg);
+			VMUL(fpVscaleReg, fpVscaleReg, fpScratchReg);
+		}
+	}
+
+	// TODO: NEON skinning register mapping
+	// The matrix will be built in Q12-Q15.
+	// The temporary matrix to be added to the built matrix will be in Q8-Q11.
+
+	if (skinning) {
+		// TODO: Preload scale factors
+	}
+
+	JumpTarget loopStart = GetCodePtr();
+	// Preload data cache ahead of reading. TODO: Experiment with the offset.
+	PLD(srcReg, 64);
+	for (int i = 0; i < dec.numSteps_; i++) {
+		if (!CompileStep(dec, i)) {
+			// Reset the code ptr and return zero to indicate that we failed.
+			SetCodePtr(const_cast<u8 *>(start));
+			char temp[1024] = {0};
+			dec.ToString(temp);
+			INFO_LOG(HLE, "Could not compile vertex decoder: %s", temp);
+			return 0;
+		}
+	}
+
+	ADDI2R(srcReg, srcReg, dec.VertexSize(), scratchReg);
+	ADDI2R(dstReg, dstReg, dec.decFmt.stride, scratchReg);
+	SUBS(counterReg, counterReg, 1);
+	B_CC(CC_NEQ, loopStart);
+
+	POP(6, R4, R5, R6, R7, R8, _PC);
+
+	FlushLitPool();
+	FlushIcache();
+	// DisassembleArm(start, GetCodePtr() - start);
+	// char temp[1024] = {0};
+	// dec.ToString(temp);
+	// INFO_LOG(HLE, "%s", temp);
+
+	return (JittedVertexDecoder)start;
+}
+
+void VertexDecoderJitCache::Jit_WeightsU8() {
+	// Basic implementation - a byte at a time. TODO: Optimize
+	int j;
+	for (j = 0; j < dec_->nweights; j++) {
+		LDRB(tempReg1, srcReg, dec_->weightoff + j);
+		STRB(tempReg1, dstReg, dec_->decFmt.w0off + j);
+	}
+	if (j & 3) {
+		// Create a zero register. Might want to make a fixed one.
+		EOR(scratchReg, scratchReg, scratchReg);
+	}
+	while (j & 3) {
+		STRB(scratchReg, dstReg, dec_->decFmt.w0off + j);
+		j++;
+	}
+}
+
+void VertexDecoderJitCache::Jit_WeightsU16() {
+	// Basic implementation - a short at a time. TODO: Optimize
+	int j;
+	for (j = 0; j < dec_->nweights; j++) {
+		LDRH(tempReg1, srcReg, dec_->weightoff + j * 2);
+		STRH(tempReg1, dstReg, dec_->decFmt.w0off + j * 2);
+	}
+	if (j & 3) {
+		// Create a zero register. Might want to make a fixed one.
+		EOR(scratchReg, scratchReg, scratchReg);
+	}
+	while (j & 3) {
+		STRH(scratchReg, dstReg, dec_->decFmt.w0off + j * 2);
+		j++;
+	}
+}
+
+void VertexDecoderJitCache::Jit_WeightsFloat() {
+	int j;
+	for (j = 0; j < dec_->nweights; j++) {
+		LDR(tempReg1, srcReg, dec_->weightoff + j * 4);
+		STR(tempReg1, dstReg, dec_->decFmt.w0off + j * 4);
+	}
+	if (j & 3) {
+		// Create a zero register. 
Might want to make a fixed one. + EOR(scratchReg, scratchReg, scratchReg); + } +} + +static const ARMReg weightRegs[8] = { S8, S9, S10, S11, S12, S13, S14, S15 }; + +void VertexDecoderJitCache::Jit_ApplyWeights() { + MOVI2R(tempReg2, (u32)skinMatrix, scratchReg); +#if 1 + // This approach saves a few stores but accesses the matrices in a more + // sparse order. + const float *bone = &gstate.boneMatrix[0]; + MOVI2R(tempReg1, (u32)bone, scratchReg); + for (int i = 0; i < 12; i++) { + VLDR(fpScratchReg3, tempReg1, i * 4); + VMUL(fpScratchReg3, fpScratchReg3, weightRegs[0]); + for (int j = 1; j < dec_->nweights; j++) { + VLDR(fpScratchReg2, tempReg1, i * 4 + j * 4 * 12); + VMLA(fpScratchReg3, fpScratchReg2, weightRegs[j]); + } + VSTR(fpScratchReg3, tempReg2, i * 4); + } +#else + // This one does accesses in linear order but wastes time storing, loading, storing. + for (int j = 0; j < dec_->nweights; j++) { + const float *bone = &gstate.boneMatrix[j * 12]; + MOVI2R(tempReg1, (u32)bone, scratchReg); + // Okay, we have the weight. + if (j == 0) { + for (int i = 0; i < 12; i++) { + VLDR(fpScratchReg2, tempReg1, i * 4); + VMUL(fpScratchReg2, fpScratchReg2, weightRegs[j]); + VSTR(fpScratchReg2, tempReg2, i * 4); + } + } else { + for (int i = 0; i < 12; i++) { + VLDR(fpScratchReg2, tempReg1, i * 4); + VLDR(fpScratchReg3, tempReg2, i * 4); + VMLA(fpScratchReg3, fpScratchReg2, weightRegs[j]); + VSTR(fpScratchReg3, tempReg2, i * 4); + } + } + } +#endif +} + +void VertexDecoderJitCache::Jit_WeightsU8Skin() { + // No need to zero skinMatrix, we'll just STR to it in the first lap, + // then VLDR/VADD/VSTR in subsequent laps. + for (int j = 0; j < dec_->nweights; j++) { + LDRB(tempReg1, srcReg, dec_->weightoff + j); + VMOV(fpScratchReg, tempReg1); + VCVT(fpScratchReg, fpScratchReg, TO_FLOAT); + MOVI2F(fpScratchReg2, by128, scratchReg); + VMUL(weightRegs[j], fpScratchReg, fpScratchReg2); + } + + Jit_ApplyWeights(); +} + +void VertexDecoderJitCache::Jit_WeightsU16Skin() { + // No need to zero skinMatrix, we'll just STR to it in the first lap, + // then VLDR/VADD/VSTR in subsequent laps. + for (int j = 0; j < dec_->nweights; j++) { + LDRH(tempReg1, srcReg, dec_->weightoff + j * 2); + VMOV(fpScratchReg, tempReg1); + VCVT(fpScratchReg, fpScratchReg, TO_FLOAT); + MOVI2F(fpScratchReg2, 1.0f / 32768.0f, scratchReg); + VMUL(weightRegs[j], fpScratchReg, fpScratchReg2); + } + + Jit_ApplyWeights(); +} + +void VertexDecoderJitCache::Jit_WeightsFloatSkin() { + // No need to zero skinMatrix, we'll just STR to it in the first lap, + // then VLDR/VADD/VSTR in subsequent laps. + for (int j = 0; j < dec_->nweights; j++) { + VLDR(weightRegs[j], srcReg, dec_->weightoff + j * 4); + } + + Jit_ApplyWeights(); +} + +// Fill last two bytes with zeroes to align to 4 bytes. LDRH does it for us, handy. 
+void VertexDecoderJitCache::Jit_TcU8() { + LDRB(tempReg1, srcReg, dec_->tcoff); + LDRB(tempReg2, srcReg, dec_->tcoff + 1); + ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 8)); + STR(tempReg1, dstReg, dec_->decFmt.uvoff); +} + +void VertexDecoderJitCache::Jit_TcU16() { + LDRH(tempReg1, srcReg, dec_->tcoff); + LDRH(tempReg2, srcReg, dec_->tcoff + 2); + ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 16)); + STR(tempReg1, dstReg, dec_->decFmt.uvoff); +} + +void VertexDecoderJitCache::Jit_TcFloat() { + LDR(tempReg1, srcReg, dec_->tcoff); + LDR(tempReg2, srcReg, dec_->tcoff + 4); + STR(tempReg1, dstReg, dec_->decFmt.uvoff); + STR(tempReg2, dstReg, dec_->decFmt.uvoff + 4); +} + +void VertexDecoderJitCache::Jit_TcU16Through() { + LDRH(tempReg1, srcReg, dec_->tcoff); + LDRH(tempReg2, srcReg, dec_->tcoff + 2); + ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 16)); + STR(tempReg1, dstReg, dec_->decFmt.uvoff); +} + +void VertexDecoderJitCache::Jit_TcFloatThrough() { + LDR(tempReg1, srcReg, dec_->tcoff); + LDR(tempReg2, srcReg, dec_->tcoff + 4); + STR(tempReg1, dstReg, dec_->decFmt.uvoff); + STR(tempReg2, dstReg, dec_->decFmt.uvoff + 4); +} + +void VertexDecoderJitCache::Jit_TcU16Double() { + LDRH(tempReg1, srcReg, dec_->tcoff); + LDRH(tempReg2, srcReg, dec_->tcoff + 2); + LSL(tempReg1, tempReg1, 1); + ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 17)); + STR(tempReg1, dstReg, dec_->decFmt.uvoff); +} + +void VertexDecoderJitCache::Jit_TcU16ThroughDouble() { + LDRH(tempReg1, srcReg, dec_->tcoff); + LDRH(tempReg2, srcReg, dec_->tcoff + 2); + LSL(tempReg1, tempReg1, 1); + ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 17)); + STR(tempReg1, dstReg, dec_->decFmt.uvoff); +} + +void VertexDecoderJitCache::Jit_TcU8Prescale() { + if (cpu_info.bNEON) { + // TODO: Needs testing + ADD(scratchReg, srcReg, dec_->tcoff); + VLD1_lane(I_16, neonScratchReg, scratchReg, 0, false); + VMOVL(I_8 | I_UNSIGNED, neonScratchRegQ, neonScratchReg); // Widen to 16-bit + VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg); // Widen to 32-bit + VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ); + ADD(scratchReg2, dstReg, dec_->decFmt.uvoff); + VMUL(F_32, neonScratchReg, neonScratchReg, neonUVScaleReg); + VADD(F_32, neonScratchReg, neonScratchReg, neonUVOffsetReg); + VST1(F_32, neonScratchReg, scratchReg2, 1, ALIGN_NONE); + } else { + // TODO: SIMD + LDRB(tempReg1, srcReg, dec_->tcoff); + LDRB(tempReg2, srcReg, dec_->tcoff + 1); + VMOV(fpScratchReg, tempReg1); + VMOV(fpScratchReg2, tempReg2); + VCVT(fpScratchReg, fpScratchReg, TO_FLOAT); + VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT); + // Could replace VMUL + VADD with VMLA but would require 2 more regs as we don't want to destroy fp*offsetReg. Later. 
+ VMUL(fpScratchReg, fpScratchReg, fpUscaleReg); + VMUL(fpScratchReg2, fpScratchReg2, fpVscaleReg); + VADD(fpScratchReg, fpScratchReg, fpUoffsetReg); + VADD(fpScratchReg2, fpScratchReg2, fpVoffsetReg); + VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff); + VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4); + } +} + +void VertexDecoderJitCache::Jit_TcU16Prescale() { + if (cpu_info.bNEON) { + // TODO: Needs testing + ADD(scratchReg, srcReg, dec_->tcoff); + VLD1_lane(I_32, neonScratchReg, scratchReg, 0, false); + VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg); // Widen to 32-bit + VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ); + ADD(scratchReg2, dstReg, dec_->decFmt.uvoff); + VMUL(F_32, neonScratchReg, neonScratchReg, neonUVScaleReg); + VADD(F_32, neonScratchReg, neonScratchReg, neonUVOffsetReg); + VST1(F_32, neonScratchReg, scratchReg2, 1, ALIGN_NONE); + } else { + LDRH(tempReg1, srcReg, dec_->tcoff); + LDRH(tempReg2, srcReg, dec_->tcoff + 2); + VMOV(fpScratchReg, tempReg1); + VMOV(fpScratchReg2, tempReg2); + VCVT(fpScratchReg, fpScratchReg, TO_FLOAT); + VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT); + VMUL(fpScratchReg, fpScratchReg, fpUscaleReg); + VMUL(fpScratchReg2, fpScratchReg2, fpVscaleReg); + VADD(fpScratchReg, fpScratchReg, fpUoffsetReg); + VADD(fpScratchReg2, fpScratchReg2, fpVoffsetReg); + VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff); + VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4); + } +} + +void VertexDecoderJitCache::Jit_TcFloatPrescale() { + if (cpu_info.bNEON) { + ADD(scratchReg, srcReg, dec_->tcoff); + VLD1(F_32, neonScratchReg, scratchReg, 1, ALIGN_NONE); + ADD(scratchReg2, dstReg, dec_->decFmt.uvoff); + VMUL(F_32, neonScratchReg, neonScratchReg, neonUVScaleReg); + VADD(F_32, neonScratchReg, neonScratchReg, neonUVOffsetReg); + VST1(F_32, neonScratchReg, scratchReg2, 1, ALIGN_NONE); + } else { + // TODO: SIMD + VLDR(fpScratchReg, srcReg, dec_->tcoff); + VLDR(fpScratchReg2, srcReg, dec_->tcoff + 4); + VMUL(fpScratchReg, fpScratchReg, fpUscaleReg); + VMUL(fpScratchReg2, fpScratchReg2, fpVscaleReg); + VADD(fpScratchReg, fpScratchReg, fpUoffsetReg); + VADD(fpScratchReg2, fpScratchReg2, fpVoffsetReg); + VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff); + VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4); + } +} + +void VertexDecoderJitCache::Jit_Color8888() { + LDR(tempReg1, srcReg, dec_->coloff); + STR(tempReg1, dstReg, dec_->decFmt.c0off); +} + +void VertexDecoderJitCache::Jit_Color4444() { + LDRH(tempReg1, srcReg, dec_->coloff); + + // Spread out the components. + ANDI2R(tempReg2, tempReg1, 0x000F, scratchReg); + ANDI2R(tempReg3, tempReg1, 0x00F0, scratchReg); + ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 4)); + ANDI2R(tempReg3, tempReg1, 0x0F00, scratchReg); + ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 8)); + ANDI2R(tempReg3, tempReg1, 0xF000, scratchReg); + ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 12)); + + // And saturate. + ORR(tempReg1, tempReg2, Operand2(tempReg2, ST_LSL, 4)); + + STR(tempReg1, dstReg, dec_->decFmt.c0off); +} + +void VertexDecoderJitCache::Jit_Color565() { + LDRH(tempReg1, srcReg, dec_->coloff); + + // Spread out R and B first. This puts them in 0x001F001F. + ANDI2R(tempReg2, tempReg1, 0x001F, scratchReg); + ANDI2R(tempReg3, tempReg1, 0xF800, scratchReg); + ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 5)); + + // Expand 5 -> 8. + LSL(tempReg3, tempReg2, 3); + ORR(tempReg2, tempReg3, Operand2(tempReg2, ST_LSR, 2)); + ANDI2R(tempReg2, tempReg2, 0xFFFF00FF, scratchReg); + + // Now finally G. 
We start by shoving it into a wall. + LSR(tempReg1, tempReg1, 5); + ANDI2R(tempReg1, tempReg1, 0x003F, scratchReg); + LSL(tempReg3, tempReg1, 2); + // Don't worry, shifts into a wall. + ORR(tempReg3, tempReg3, Operand2(tempReg1, ST_LSR, 4)); + ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 8)); + + // Add in full alpha. + ORI2R(tempReg1, tempReg2, 0xFF000000, scratchReg); + + STR(tempReg1, dstReg, dec_->decFmt.c0off); +} + +void VertexDecoderJitCache::Jit_Color5551() { + LDRH(tempReg1, srcReg, dec_->coloff); + + ANDI2R(tempReg2, tempReg1, 0x001F, scratchReg); + ANDI2R(tempReg3, tempReg1, 0x07E0, scratchReg); + ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 3)); + ANDI2R(tempReg3, tempReg1, 0xF800, scratchReg); + ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 6)); + + // Expand 5 -> 8. + LSR(tempReg3, tempReg2, 2); + // Clean up the bits that were shifted right. + BIC(tempReg3, tempReg1, AssumeMakeOperand2(0x000000F8)); + BIC(tempReg3, tempReg3, AssumeMakeOperand2(0x0000F800)); + ORR(tempReg2, tempReg3, Operand2(tempReg2, ST_LSL, 3)); + + // Now we just need alpha. + TSTI2R(tempReg1, 0x8000, scratchReg); + SetCC(CC_NEQ); + ORI2R(tempReg2, tempReg2, 0xFF000000, scratchReg); + SetCC(CC_AL); + + STR(tempReg2, dstReg, dec_->decFmt.c0off); +} + +void VertexDecoderJitCache::Jit_NormalS8() { + LDRB(tempReg1, srcReg, dec_->nrmoff); + LDRB(tempReg2, srcReg, dec_->nrmoff + 1); + LDRB(tempReg3, srcReg, dec_->nrmoff + 2); + ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 8)); + ORR(tempReg1, tempReg1, Operand2(tempReg3, ST_LSL, 16)); + STR(tempReg1, dstReg, dec_->decFmt.nrmoff); + + // Copy 3 bytes and then a zero. Might as well copy four. + // LDR(tempReg1, srcReg, dec_->nrmoff); + // ANDI2R(tempReg1, tempReg1, 0x00FFFFFF, scratchReg); + // STR(tempReg1, dstReg, dec_->decFmt.nrmoff); +} + +// Copy 6 bytes and then 2 zeroes. +void VertexDecoderJitCache::Jit_NormalS16() { + LDRH(tempReg1, srcReg, dec_->nrmoff); + LDRH(tempReg2, srcReg, dec_->nrmoff + 2); + LDRH(tempReg3, srcReg, dec_->nrmoff + 4); + ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 16)); + STR(tempReg1, dstReg, dec_->decFmt.nrmoff); + STR(tempReg3, dstReg, dec_->decFmt.nrmoff + 4); +} + +void VertexDecoderJitCache::Jit_NormalFloat() { + // Might not be aligned to 4, so we can't use LDMIA. + // Actually - not true: This will always be aligned. TODO + LDR(tempReg1, srcReg, dec_->nrmoff); + LDR(tempReg2, srcReg, dec_->nrmoff + 4); + LDR(tempReg3, srcReg, dec_->nrmoff + 8); + // But this is always aligned to 4 so we're safe. + ADD(scratchReg, dstReg, dec_->decFmt.nrmoff); + STMIA(scratchReg, false, 3, tempReg1, tempReg2, tempReg3); +} + +// Through expands into floats, always. Might want to look at changing this. +void VertexDecoderJitCache::Jit_PosS8Through() { + // TODO: SIMD + LDRSB(tempReg1, srcReg, dec_->posoff); + LDRSB(tempReg2, srcReg, dec_->posoff + 1); + LDRSB(tempReg3, srcReg, dec_->posoff + 2); + static const ARMReg tr[3] = { tempReg1, tempReg2, tempReg3 }; + for (int i = 0; i < 3; i++) { + VMOV(fpScratchReg, tr[i]); + VCVT(fpScratchReg, fpScratchReg, TO_FLOAT | IS_SIGNED); + VSTR(fpScratchReg, dstReg, dec_->decFmt.posoff + i * 4); + } +} + +// Through expands into floats, always. Might want to look at changing this. 
+void VertexDecoderJitCache::Jit_PosS16Through() { + // TODO: SIMD + LDRSH(tempReg1, srcReg, dec_->posoff); + LDRSH(tempReg2, srcReg, dec_->posoff + 2); + LDRSH(tempReg3, srcReg, dec_->posoff + 4); + static const ARMReg tr[3] = { tempReg1, tempReg2, tempReg3 }; + for (int i = 0; i < 3; i++) { + VMOV(fpScratchReg, tr[i]); + VCVT(fpScratchReg, fpScratchReg, TO_FLOAT | IS_SIGNED); + VSTR(fpScratchReg, dstReg, dec_->decFmt.posoff + i * 4); + } +} + +// Copy 3 bytes and then a zero. Might as well copy four. +void VertexDecoderJitCache::Jit_PosS8() { + LDRB(tempReg1, srcReg, dec_->posoff); + LDRB(tempReg2, srcReg, dec_->posoff + 1); + LDRB(tempReg3, srcReg, dec_->posoff + 2); + ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 8)); + ORR(tempReg1, tempReg1, Operand2(tempReg3, ST_LSL, 16)); + STR(tempReg1, dstReg, dec_->decFmt.posoff); +} + +// Copy 6 bytes and then 2 zeroes. +void VertexDecoderJitCache::Jit_PosS16() { + LDRH(tempReg1, srcReg, dec_->posoff); + LDRH(tempReg2, srcReg, dec_->posoff + 2); + LDRH(tempReg3, srcReg, dec_->posoff + 4); + ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 16)); + STR(tempReg1, dstReg, dec_->decFmt.posoff); + STR(tempReg3, dstReg, dec_->decFmt.posoff + 4); +} + +// Just copy 12 bytes. +void VertexDecoderJitCache::Jit_PosFloat() { + LDR(tempReg1, srcReg, dec_->posoff); + LDR(tempReg2, srcReg, dec_->posoff + 4); + LDR(tempReg3, srcReg, dec_->posoff + 8); + // But this is always aligned to 4 so we're safe. + ADD(scratchReg, dstReg, dec_->decFmt.posoff); + STMIA(scratchReg, false, 3, tempReg1, tempReg2, tempReg3); +} + +void VertexDecoderJitCache::Jit_NormalS8Skin() { + LDRSB(tempReg1, srcReg, dec_->nrmoff); + LDRSB(tempReg2, srcReg, dec_->nrmoff + 1); + LDRSB(tempReg3, srcReg, dec_->nrmoff + 2); + VMOV(src[0], tempReg1); + VMOV(src[1], tempReg2); + VMOV(src[2], tempReg3); + MOVI2F(S15, 1.0f/128.0f, scratchReg); + VCVT(src[0], src[0], TO_FLOAT | IS_SIGNED); + VCVT(src[1], src[1], TO_FLOAT | IS_SIGNED); + VCVT(src[2], src[2], TO_FLOAT | IS_SIGNED); + VMUL(src[0], src[0], S15); + VMUL(src[1], src[1], S15); + VMUL(src[2], src[2], S15); + Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false); +} + +void VertexDecoderJitCache::Jit_NormalS16Skin() { + LDRSH(tempReg1, srcReg, dec_->nrmoff); + LDRSH(tempReg2, srcReg, dec_->nrmoff + 2); + LDRSH(tempReg3, srcReg, dec_->nrmoff + 4); + VMOV(fpScratchReg, tempReg1); + VMOV(fpScratchReg2, tempReg2); + VMOV(fpScratchReg3, tempReg3); + MOVI2F(S15, 1.0f/32768.0f, scratchReg); + VCVT(fpScratchReg, fpScratchReg, TO_FLOAT | IS_SIGNED); + VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT | IS_SIGNED); + VCVT(fpScratchReg3, fpScratchReg3, TO_FLOAT | IS_SIGNED); + VMUL(src[0], fpScratchReg, S15); + VMUL(src[1], fpScratchReg2, S15); + VMUL(src[2], fpScratchReg3, S15); + Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false); +} + +void VertexDecoderJitCache::Jit_NormalFloatSkin() { + VLDR(src[0], srcReg, dec_->nrmoff); + VLDR(src[1], srcReg, dec_->nrmoff + 4); + VLDR(src[2], srcReg, dec_->nrmoff + 8); + Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false); +} + +void VertexDecoderJitCache::Jit_WriteMatrixMul(int outOff, bool pos) { + MOVI2R(tempReg1, (u32)skinMatrix, scratchReg); + for (int i = 0; i < 3; i++) { + VLDR(fpScratchReg, tempReg1, 4 * i); + VMUL(acc[i], fpScratchReg, src[0]); + } + for (int i = 0; i < 3; i++) { + VLDR(fpScratchReg, tempReg1, 12 + 4 * i); + VMLA(acc[i], fpScratchReg, src[1]); + } + for (int i = 0; i < 3; i++) { + VLDR(fpScratchReg, tempReg1, 24 + 4 * i); + VMLA(acc[i], fpScratchReg, src[2]); + } + if (pos) { + for (int i = 0; i < 3; 
i++) {
+			VLDR(fpScratchReg, tempReg1, 36 + 4 * i);
+			VADD(acc[i], acc[i], fpScratchReg);
+		}
+	}
+	for (int i = 0; i < 3; i++) {
+		VSTR(acc[i], dstReg, outOff + i * 4);
+	}
+}
+
+void VertexDecoderJitCache::Jit_PosS8Skin() {
+	LDRSB(tempReg1, srcReg, dec_->posoff);
+	LDRSB(tempReg2, srcReg, dec_->posoff + 1);
+	LDRSB(tempReg3, srcReg, dec_->posoff + 2);
+	VMOV(src[0], tempReg1);
+	VMOV(src[1], tempReg2);
+	VMOV(src[2], tempReg3);
+	MOVI2F(S15, 1.0f/128.0f, scratchReg);
+	VCVT(src[0], src[0], TO_FLOAT | IS_SIGNED);
+	VCVT(src[1], src[1], TO_FLOAT | IS_SIGNED);
+	VCVT(src[2], src[2], TO_FLOAT | IS_SIGNED);
+	VMUL(src[0], src[0], S15);
+	VMUL(src[1], src[1], S15);
+	VMUL(src[2], src[2], S15);
+	Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
+}
+
+void VertexDecoderJitCache::Jit_PosS16Skin() {
+	LDRSH(tempReg1, srcReg, dec_->posoff);
+	LDRSH(tempReg2, srcReg, dec_->posoff + 2);
+	LDRSH(tempReg3, srcReg, dec_->posoff + 4);
+	VMOV(src[0], tempReg1);
+	VMOV(src[1], tempReg2);
+	VMOV(src[2], tempReg3);
+	MOVI2F(S15, 1.0f/32768.0f, scratchReg);
+	VCVT(src[0], src[0], TO_FLOAT | IS_SIGNED);
+	VCVT(src[1], src[1], TO_FLOAT | IS_SIGNED);
+	VCVT(src[2], src[2], TO_FLOAT | IS_SIGNED);
+	VMUL(src[0], src[0], S15);
+	VMUL(src[1], src[1], S15);
+	VMUL(src[2], src[2], S15);
+	Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
+}
+
+void VertexDecoderJitCache::Jit_PosFloatSkin() {
+	VLDR(src[0], srcReg, dec_->posoff);
+	VLDR(src[1], srcReg, dec_->posoff + 4);
+	VLDR(src[2], srcReg, dec_->posoff + 8);
+	Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
+}
+
+bool VertexDecoderJitCache::CompileStep(const VertexDecoder &dec, int step) {
+	// See if we find a matching JIT function
+	for (size_t i = 0; i < ARRAY_SIZE(jitLookup); i++) {
+		if (dec.steps_[step] == jitLookup[i].func) {
+			((*this).*jitLookup[i].jitFunc)();
+			return true;
+		}
+	}
+	return false;
+}
diff --git a/GPU/GLES/VertexDecoderX86.cpp b/GPU/GLES/VertexDecoderX86.cpp
new file mode 100644
index 0000000000..0853a84666
--- /dev/null
+++ b/GPU/GLES/VertexDecoderX86.cpp
@@ -0,0 +1,842 @@
+// Copyright (c) 2013- PPSSPP Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0 or later versions.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official git repository and contact information can be found at
+// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
+
+#include <emmintrin.h>
+
+#include "Common/CPUDetect.h"
+#include "Core/Config.h"
+#include "GPU/GLES/VertexDecoder.h"
+
+// We start out by converting the active matrices into 4x4 which are easier to multiply with
+// using SSE / NEON and store them here. 
+static float MEMORY_ALIGNED16(bones[16 * 8]); + +using namespace Gen; + +static const float MEMORY_ALIGNED16( by128[4] ) = { + 1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f +}; +static const float MEMORY_ALIGNED16( by256[4] ) = { + 1.0f / 256, 1.0f / 256, 1.0f / 256, 1.0f / 256 +}; +static const float MEMORY_ALIGNED16( by32768[4] ) = { + 1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f / 32768.0f, +}; + +static const u32 MEMORY_ALIGNED16( threeMasks[4] ) = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0}; +static const u32 MEMORY_ALIGNED16( aOne[4] ) = {0, 0, 0, 0x3F800000}; + +#ifdef _M_X64 +#ifdef _WIN32 +static const X64Reg tempReg1 = RAX; +static const X64Reg tempReg2 = R9; +static const X64Reg tempReg3 = R10; +static const X64Reg srcReg = RCX; +static const X64Reg dstReg = RDX; +static const X64Reg counterReg = R8; +#else +static const X64Reg tempReg1 = RAX; +static const X64Reg tempReg2 = R9; +static const X64Reg tempReg3 = R10; +static const X64Reg srcReg = RDI; +static const X64Reg dstReg = RSI; +static const X64Reg counterReg = RDX; +#endif +#else +static const X64Reg tempReg1 = EAX; +static const X64Reg tempReg2 = EBX; +static const X64Reg tempReg3 = EDX; +static const X64Reg srcReg = ESI; +static const X64Reg dstReg = EDI; +static const X64Reg counterReg = ECX; +#endif + +// XMM0-XMM5 are volatile on Windows X64 +// XMM0-XMM7 are arguments (and thus volatile) on System V ABI (other x64 platforms) +static const X64Reg fpScaleOffsetReg = XMM0; + +static const X64Reg fpScratchReg = XMM1; +static const X64Reg fpScratchReg2 = XMM2; +static const X64Reg fpScratchReg3 = XMM3; + +// We're gonna keep the current skinning matrix in 4 XMM regs. Fortunately we easily +// have space for that now. + +// To debug, just comment them out one at a time until it works. We fall back +// on the interpreter if the compiler fails. 
+ +static const JitLookup jitLookup[] = { + {&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8}, + {&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16}, + {&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat}, + + {&VertexDecoder::Step_WeightsU8Skin, &VertexDecoderJitCache::Jit_WeightsU8Skin}, + {&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin}, + {&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin}, + + {&VertexDecoder::Step_TcU8, &VertexDecoderJitCache::Jit_TcU8}, + {&VertexDecoder::Step_TcU16, &VertexDecoderJitCache::Jit_TcU16}, + {&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat}, + {&VertexDecoder::Step_TcU16Double, &VertexDecoderJitCache::Jit_TcU16Double}, + + {&VertexDecoder::Step_TcU8Prescale, &VertexDecoderJitCache::Jit_TcU8Prescale}, + {&VertexDecoder::Step_TcU16Prescale, &VertexDecoderJitCache::Jit_TcU16Prescale}, + {&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale}, + + {&VertexDecoder::Step_TcU16Through, &VertexDecoderJitCache::Jit_TcU16Through}, + {&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough}, + {&VertexDecoder::Step_TcU16ThroughDouble, &VertexDecoderJitCache::Jit_TcU16ThroughDouble}, + + {&VertexDecoder::Step_NormalS8, &VertexDecoderJitCache::Jit_NormalS8}, + {&VertexDecoder::Step_NormalS16, &VertexDecoderJitCache::Jit_NormalS16}, + {&VertexDecoder::Step_NormalFloat, &VertexDecoderJitCache::Jit_NormalFloat}, + + {&VertexDecoder::Step_NormalS8Skin, &VertexDecoderJitCache::Jit_NormalS8Skin}, + {&VertexDecoder::Step_NormalS16Skin, &VertexDecoderJitCache::Jit_NormalS16Skin}, + {&VertexDecoder::Step_NormalFloatSkin, &VertexDecoderJitCache::Jit_NormalFloatSkin}, + + {&VertexDecoder::Step_Color8888, &VertexDecoderJitCache::Jit_Color8888}, + {&VertexDecoder::Step_Color4444, &VertexDecoderJitCache::Jit_Color4444}, + {&VertexDecoder::Step_Color565, &VertexDecoderJitCache::Jit_Color565}, + {&VertexDecoder::Step_Color5551, &VertexDecoderJitCache::Jit_Color5551}, + + {&VertexDecoder::Step_PosS8Through, &VertexDecoderJitCache::Jit_PosS8Through}, + {&VertexDecoder::Step_PosS16Through, &VertexDecoderJitCache::Jit_PosS16Through}, + {&VertexDecoder::Step_PosFloatThrough, &VertexDecoderJitCache::Jit_PosFloat}, + + {&VertexDecoder::Step_PosS8, &VertexDecoderJitCache::Jit_PosS8}, + {&VertexDecoder::Step_PosS16, &VertexDecoderJitCache::Jit_PosS16}, + {&VertexDecoder::Step_PosFloat, &VertexDecoderJitCache::Jit_PosFloat}, + + {&VertexDecoder::Step_PosS8Skin, &VertexDecoderJitCache::Jit_PosS8Skin}, + {&VertexDecoder::Step_PosS16Skin, &VertexDecoderJitCache::Jit_PosS16Skin}, + {&VertexDecoder::Step_PosFloatSkin, &VertexDecoderJitCache::Jit_PosFloatSkin}, +}; + +// TODO: This should probably be global... +#ifdef _M_X64 +#define PTRBITS 64 +#else +#define PTRBITS 32 +#endif + +JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) { + dec_ = &dec; + const u8 *start = this->GetCodePtr(); + +#ifdef _M_IX86 + // Store register values + PUSH(ESI); + PUSH(EDI); + PUSH(EBX); + PUSH(EBP); + + // Read parameters + int offset = 4; + MOV(32, R(srcReg), MDisp(ESP, 16 + offset + 0)); + MOV(32, R(dstReg), MDisp(ESP, 16 + offset + 4)); + MOV(32, R(counterReg), MDisp(ESP, 16 + offset + 8)); +#endif + + // Save XMM4/XMM5 which apparently can be problematic? + // Actually, if they are, it must be a compiler bug because they SHOULD be ok. + // So I won't bother. 
+ SUB(PTRBITS, R(ESP), Imm8(64)); + MOVUPS(MDisp(ESP, 0), XMM4); + MOVUPS(MDisp(ESP, 16), XMM5); + MOVUPS(MDisp(ESP, 32), XMM6); + MOVUPS(MDisp(ESP, 48), XMM7); + + bool prescaleStep = false; + // Look for prescaled texcoord steps + for (int i = 0; i < dec.numSteps_; i++) { + if (dec.steps_[i] == &VertexDecoder::Step_TcU8Prescale || + dec.steps_[i] == &VertexDecoder::Step_TcU16Prescale || + dec.steps_[i] == &VertexDecoder::Step_TcFloatPrescale) { + prescaleStep = true; + } + } + + // Add code to convert matrices to 4x4. + // Later we might want to do this when the matrices are loaded instead. + // This is mostly proof of concept. + int boneCount = 0; + if (dec.weighttype && g_Config.bSoftwareSkinning) { + for (int i = 0; i < 8; i++) { + MOVUPS(XMM0, M((void *)(gstate.boneMatrix + 12 * i))); + MOVUPS(XMM1, M((void *)(gstate.boneMatrix + 12 * i + 3))); + MOVUPS(XMM2, M((void *)(gstate.boneMatrix + 12 * i + 3 * 2))); + MOVUPS(XMM3, M((void *)(gstate.boneMatrix + 12 * i + 3 * 3))); + ANDPS(XMM0, M((void *)&threeMasks)); + ANDPS(XMM1, M((void *)&threeMasks)); + ANDPS(XMM2, M((void *)&threeMasks)); + ANDPS(XMM3, M((void *)&threeMasks)); + ORPS(XMM3, M((void *)&aOne)); + MOVAPS(M((void *)(bones + 16 * i)), XMM0); + MOVAPS(M((void *)(bones + 16 * i + 4)), XMM1); + MOVAPS(M((void *)(bones + 16 * i + 8)), XMM2); + MOVAPS(M((void *)(bones + 16 * i + 12)), XMM3); + } + } + + // Keep the scale/offset in a few fp registers if we need it. + if (prescaleStep) { +#ifdef _M_X64 + MOV(64, R(tempReg1), Imm64((u64)(&gstate_c.uv))); +#else + MOV(32, R(tempReg1), Imm32((u32)(&gstate_c.uv))); +#endif + MOVSS(fpScaleOffsetReg, MDisp(tempReg1, 0)); + MOVSS(fpScratchReg, MDisp(tempReg1, 4)); + UNPCKLPS(fpScaleOffsetReg, R(fpScratchReg)); + if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) { + MULPS(fpScaleOffsetReg, M((void *)&by128)); + } else if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_16BIT) { + MULPS(fpScaleOffsetReg, M((void *)&by32768)); + } + MOVSS(fpScratchReg, MDisp(tempReg1, 8)); + MOVSS(fpScratchReg2, MDisp(tempReg1, 12)); + UNPCKLPS(fpScratchReg, R(fpScratchReg2)); + UNPCKLPD(fpScaleOffsetReg, R(fpScratchReg)); + } + + // Let's not bother with a proper stack frame. We just grab the arguments and go. + JumpTarget loopStart = GetCodePtr(); + for (int i = 0; i < dec.numSteps_; i++) { + if (!CompileStep(dec, i)) { + // Reset the code ptr and return zero to indicate that we failed. 
+			SetCodePtr(const_cast<u8 *>(start));
+			return 0;
+		}
+	}
+
+	ADD(PTRBITS, R(srcReg), Imm32(dec.VertexSize()));
+	ADD(PTRBITS, R(dstReg), Imm32(dec.decFmt.stride));
+	SUB(32, R(counterReg), Imm8(1));
+	J_CC(CC_NZ, loopStart, true);
+
+	MOVUPS(XMM4, MDisp(ESP, 0));
+	MOVUPS(XMM5, MDisp(ESP, 16));
+	MOVUPS(XMM6, MDisp(ESP, 32));
+	MOVUPS(XMM7, MDisp(ESP, 48));
+	ADD(PTRBITS, R(ESP), Imm8(64));
+
+#ifdef _M_IX86
+	// Restore register values
+	POP(EBP);
+	POP(EBX);
+	POP(EDI);
+	POP(ESI);
+#endif
+
+	RET();
+
+	return (JittedVertexDecoder)start;
+}
+
+void VertexDecoderJitCache::Jit_WeightsU8() {
+	switch (dec_->nweights) {
+	case 1:
+		MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->weightoff));
+		MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
+		return;
+	case 2:
+		MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->weightoff));
+		MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
+		return;
+	case 3:
+		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
+		AND(32, R(tempReg1), Imm32(0x00FFFFFF));
+		MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
+		return;
+	case 4:
+		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
+		MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
+		return;
+	case 8:
+		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
+		MOV(32, R(tempReg2), MDisp(srcReg, dec_->weightoff + 4));
+		MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
+		MOV(32, MDisp(dstReg, dec_->decFmt.w1off), R(tempReg2));
+		return;
+	}
+
+	// Basic implementation - a byte at a time. TODO: Optimize
+	int j;
+	for (j = 0; j < dec_->nweights; j++) {
+		MOV(8, R(tempReg1), MDisp(srcReg, dec_->weightoff + j));
+		MOV(8, MDisp(dstReg, dec_->decFmt.w0off + j), R(tempReg1));
+	}
+	while (j & 3) {
+		MOV(8, MDisp(dstReg, dec_->decFmt.w0off + j), Imm8(0));
+		j++;
+	}
+}
+
+void VertexDecoderJitCache::Jit_WeightsU16() {
+	switch (dec_->nweights) {
+	case 1:
+		MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->weightoff));
+		MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
+		MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), Imm32(0));
+		return;
+
+	case 2:
+		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
+		MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
+		MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), Imm32(0));
+		return;
+
+	case 3:
+		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
+		MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->weightoff + 4));
+		MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
+		MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), R(tempReg2));
+		return;
+
+	case 4:
+		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
+		MOV(32, R(tempReg2), MDisp(srcReg, dec_->weightoff + 4));
+		MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
+		MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), R(tempReg2));
+		return;
+	}
+
+	// Basic implementation - a short at a time. TODO: Optimize
+	int j;
+	for (j = 0; j < dec_->nweights; j++) {
+		MOV(16, R(tempReg1), MDisp(srcReg, dec_->weightoff + j * 2));
+		MOV(16, MDisp(dstReg, dec_->decFmt.w0off + j * 2), R(tempReg1));
+	}
+	while (j & 3) {
+		MOV(16, MDisp(dstReg, dec_->decFmt.w0off + j * 2), Imm16(0));
+		j++;
+	}
+}
+
+void VertexDecoderJitCache::Jit_WeightsFloat() {
+	int j;
+	for (j = 0; j < dec_->nweights; j++) {
+		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff + j * 4));
+		MOV(32, MDisp(dstReg, dec_->decFmt.w0off + j * 4), R(tempReg1));
+	}
+	while (j & 3) { // Zero additional weights rounding up to 4. 
+ MOV(32, MDisp(dstReg, dec_->decFmt.w0off + j * 4), Imm32(0)); + j++; + } +} + +void VertexDecoderJitCache::Jit_WeightsU8Skin() { +#ifdef _M_X64 + MOV(PTRBITS, R(tempReg2), Imm64((uintptr_t)&bones)); +#else + MOV(PTRBITS, R(tempReg2), Imm32((uintptr_t)&bones)); +#endif + for (int j = 0; j < dec_->nweights; j++) { + MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->weightoff + j)); + CVTSI2SS(XMM1, R(tempReg1)); + MULSS(XMM1, M((void *)&by128)); + SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0)); + if (j == 0) { + MOVAPS(XMM4, MDisp(tempReg2, 0)); + MOVAPS(XMM5, MDisp(tempReg2, 16)); + MULPS(XMM4, R(XMM1)); + MULPS(XMM5, R(XMM1)); + MOVAPS(XMM6, MDisp(tempReg2, 32)); + MOVAPS(XMM7, MDisp(tempReg2, 48)); + MULPS(XMM6, R(XMM1)); + MULPS(XMM7, R(XMM1)); + } else { + MOVAPS(XMM2, MDisp(tempReg2, 0)); + MOVAPS(XMM3, MDisp(tempReg2, 16)); + MULPS(XMM2, R(XMM1)); + MULPS(XMM3, R(XMM1)); + ADDPS(XMM4, R(XMM2)); + ADDPS(XMM5, R(XMM3)); + MOVAPS(XMM2, MDisp(tempReg2, 32)); + MOVAPS(XMM3, MDisp(tempReg2, 48)); + MULPS(XMM2, R(XMM1)); + MULPS(XMM3, R(XMM1)); + ADDPS(XMM6, R(XMM2)); + ADDPS(XMM7, R(XMM3)); + } + ADD(PTRBITS, R(tempReg2), Imm8(4 * 16)); + } +} + +void VertexDecoderJitCache::Jit_WeightsU16Skin() { +#ifdef _M_X64 + MOV(PTRBITS, R(tempReg2), Imm64((uintptr_t)&bones)); +#else + MOV(PTRBITS, R(tempReg2), Imm32((uintptr_t)&bones)); +#endif + for (int j = 0; j < dec_->nweights; j++) { + MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->weightoff + j * 2)); + CVTSI2SS(XMM1, R(tempReg1)); + MULSS(XMM1, M((void *)&by32768)); + SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0)); + if (j == 0) { + MOVAPS(XMM4, MDisp(tempReg2, 0)); + MOVAPS(XMM5, MDisp(tempReg2, 16)); + MULPS(XMM4, R(XMM1)); + MULPS(XMM5, R(XMM1)); + MOVAPS(XMM6, MDisp(tempReg2, 32)); + MOVAPS(XMM7, MDisp(tempReg2, 48)); + MULPS(XMM6, R(XMM1)); + MULPS(XMM7, R(XMM1)); + } else { + MOVAPS(XMM2, MDisp(tempReg2, 0)); + MOVAPS(XMM3, MDisp(tempReg2, 16)); + MULPS(XMM2, R(XMM1)); + MULPS(XMM3, R(XMM1)); + ADDPS(XMM4, R(XMM2)); + ADDPS(XMM5, R(XMM3)); + MOVAPS(XMM2, MDisp(tempReg2, 32)); + MOVAPS(XMM3, MDisp(tempReg2, 48)); + MULPS(XMM2, R(XMM1)); + MULPS(XMM3, R(XMM1)); + ADDPS(XMM6, R(XMM2)); + ADDPS(XMM7, R(XMM3)); + } + ADD(PTRBITS, R(tempReg2), Imm8(4 * 16)); + } +} + +void VertexDecoderJitCache::Jit_WeightsFloatSkin() { +#ifdef _M_X64 + MOV(PTRBITS, R(tempReg2), Imm64((uintptr_t)&bones)); +#else + MOV(PTRBITS, R(tempReg2), Imm32((uintptr_t)&bones)); +#endif + for (int j = 0; j < dec_->nweights; j++) { + MOVSS(XMM1, MDisp(srcReg, dec_->weightoff + j * 4)); + SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0)); + if (j == 0) { + MOVAPS(XMM4, MDisp(tempReg2, 0)); + MOVAPS(XMM5, MDisp(tempReg2, 16)); + MULPS(XMM4, R(XMM1)); + MULPS(XMM5, R(XMM1)); + MOVAPS(XMM6, MDisp(tempReg2, 32)); + MOVAPS(XMM7, MDisp(tempReg2, 48)); + MULPS(XMM6, R(XMM1)); + MULPS(XMM7, R(XMM1)); + } else { + MOVAPS(XMM2, MDisp(tempReg2, 0)); + MOVAPS(XMM3, MDisp(tempReg2, 16)); + MULPS(XMM2, R(XMM1)); + MULPS(XMM3, R(XMM1)); + ADDPS(XMM4, R(XMM2)); + ADDPS(XMM5, R(XMM3)); + MOVAPS(XMM2, MDisp(tempReg2, 32)); + MOVAPS(XMM3, MDisp(tempReg2, 48)); + MULPS(XMM2, R(XMM1)); + MULPS(XMM3, R(XMM1)); + ADDPS(XMM6, R(XMM2)); + ADDPS(XMM7, R(XMM3)); + } + ADD(PTRBITS, R(tempReg2), Imm8(4 * 16)); + } +} + +// Fill last two bytes with zeroes to align to 4 bytes. MOVZX does it for us, handy. 
+void VertexDecoderJitCache::Jit_TcU8() { + MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->tcoff)); + MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1)); +} + +void VertexDecoderJitCache::Jit_TcU16() { + MOV(32, R(tempReg1), MDisp(srcReg, dec_->tcoff)); + MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1)); +} + +void VertexDecoderJitCache::Jit_TcU16Double() { + MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->tcoff)); + MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->tcoff + 2)); + SHL(16, R(tempReg1), Imm8(1)); // 16 to get a wall to shift into + SHL(32, R(tempReg2), Imm8(17)); + OR(32, R(tempReg1), R(tempReg2)); + MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1)); +} + +void VertexDecoderJitCache::Jit_TcFloat() { +#ifdef _M_X64 + MOV(64, R(tempReg1), MDisp(srcReg, dec_->tcoff)); + MOV(64, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1)); +#else + MOV(32, R(tempReg1), MDisp(srcReg, dec_->tcoff)); + MOV(32, R(tempReg2), MDisp(srcReg, dec_->tcoff + 4)); + MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1)); + MOV(32, MDisp(dstReg, dec_->decFmt.uvoff + 4), R(tempReg2)); +#endif +} + +void VertexDecoderJitCache::Jit_TcU8Prescale() { + // TODO: The first five instructions could be done in 1 or 2 in SSE4 + MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->tcoff)); + MOVZX(32, 8, tempReg2, MDisp(srcReg, dec_->tcoff + 1)); + CVTSI2SS(fpScratchReg, R(tempReg1)); + CVTSI2SS(fpScratchReg2, R(tempReg2)); + UNPCKLPS(fpScratchReg, R(fpScratchReg2)); + MULPS(fpScratchReg, R(fpScaleOffsetReg)); + SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2)); + ADDPS(fpScratchReg, R(fpScaleOffsetReg)); + SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2)); + MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); +} + +void VertexDecoderJitCache::Jit_TcU16Prescale() { + PXOR(fpScratchReg2, R(fpScratchReg2)); + MOVD_xmm(fpScratchReg, MDisp(srcReg, dec_->tcoff)); + PUNPCKLWD(fpScratchReg, R(fpScratchReg2)); + CVTDQ2PS(fpScratchReg, R(fpScratchReg)); + MULPS(fpScratchReg, R(fpScaleOffsetReg)); + SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2)); + ADDPS(fpScratchReg, R(fpScaleOffsetReg)); + SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2)); + MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); +} + +void VertexDecoderJitCache::Jit_TcFloatPrescale() { + MOVQ_xmm(fpScratchReg, MDisp(srcReg, dec_->tcoff)); + MULPS(fpScratchReg, R(fpScaleOffsetReg)); + SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2)); + ADDPS(fpScratchReg, R(fpScaleOffsetReg)); + SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2)); + MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); +} + +void VertexDecoderJitCache::Jit_TcU16Through() { + MOV(32, R(tempReg1), MDisp(srcReg, dec_->tcoff)); + MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1)); +} + +void VertexDecoderJitCache::Jit_TcU16ThroughDouble() { + MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->tcoff)); + MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->tcoff + 2)); + SHL(16, R(tempReg1), Imm8(1)); // 16 to get a wall to shift into + SHL(32, R(tempReg2), Imm8(17)); + OR(32, R(tempReg1), R(tempReg2)); + MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1)); +} + +void VertexDecoderJitCache::Jit_TcFloatThrough() { +#ifdef _M_X64 + MOV(64, R(tempReg1), MDisp(srcReg, dec_->tcoff)); + MOV(64, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1)); +#else + MOV(32, R(tempReg1), MDisp(srcReg, dec_->tcoff)); + MOV(32, R(tempReg2), MDisp(srcReg, dec_->tcoff + 4)); + MOV(32, 
MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1)); + MOV(32, MDisp(dstReg, dec_->decFmt.uvoff + 4), R(tempReg2)); +#endif +} + +void VertexDecoderJitCache::Jit_Color8888() { + MOV(32, R(tempReg1), MDisp(srcReg, dec_->coloff)); + MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg1)); +} + +static const u32 MEMORY_ALIGNED16(nibbles[4]) = { 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, }; + + +void VertexDecoderJitCache::Jit_Color4444() { + // Needs benchmarking. A bit wasteful by only using 1 SSE lane. +#if 0 + // Alternate approach + MOVD_xmm(XMM3, MDisp(srcReg, dec_->coloff)); + MOVAPS(XMM2, R(XMM3)); + MOVAPS(XMM1, M((void *)nibbles)); + PSLLD(XMM2, 4); + PAND(XMM3, R(XMM1)); + PAND(XMM2, R(XMM1)); + PSRLD(XMM2, 4); + PXOR(XMM1, R(XMM1)); + PUNPCKLBW(XMM2, R(XMM1)); + PUNPCKLBW(XMM3, R(XMM1)); + PSLLD(XMM2, 4); + POR(XMM3, R(XMM2)); + MOVAPS(XMM2, R(XMM3)); + PSLLD(XMM2, 4); + POR(XMM3, R(XMM2)); + MOVD_xmm(MDisp(dstReg, dec_->decFmt.c0off), XMM3); + return; +#endif + + MOV(32, R(tempReg1), MDisp(srcReg, dec_->coloff)); + + // 0000ABGR, copy R and double forwards. + MOV(32, R(tempReg3), R(tempReg1)); + AND(32, R(tempReg3), Imm32(0x0000000F)); + MOV(32, R(tempReg2), R(tempReg3)); + SHL(32, R(tempReg3), Imm8(4)); + OR(32, R(tempReg2), R(tempReg3)); + + // tempReg1 -> 00ABGR00, then double G backwards. + SHL(32, R(tempReg1), Imm8(8)); + MOV(32, R(tempReg3), R(tempReg1)); + AND(32, R(tempReg3), Imm32(0x0000F000)); + OR(32, R(tempReg2), R(tempReg3)); + SHR(32, R(tempReg3), Imm8(4)); + OR(32, R(tempReg2), R(tempReg3)); + + // Now do B forwards again (still 00ABGR00.) + MOV(32, R(tempReg3), R(tempReg1)); + AND(32, R(tempReg3), Imm32(0x000F0000)); + OR(32, R(tempReg2), R(tempReg3)); + SHL(32, R(tempReg3), Imm8(4)); + OR(32, R(tempReg2), R(tempReg3)); + + // tempReg1 -> ABGR0000, then double A backwards. + SHL(32, R(tempReg1), Imm8(8)); + MOV(32, R(tempReg3), R(tempReg1)); + AND(32, R(tempReg3), Imm32(0xF0000000)); + OR(32, R(tempReg2), R(tempReg3)); + SHR(32, R(tempReg3), Imm8(4)); + OR(32, R(tempReg2), R(tempReg3)); + + MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg2)); +} + +void VertexDecoderJitCache::Jit_Color565() { + MOV(32, R(tempReg1), MDisp(srcReg, dec_->coloff)); + + MOV(32, R(tempReg2), R(tempReg1)); + AND(32, R(tempReg2), Imm32(0x0000001F)); + + // B (we do R and B at the same time, they're both 5.) + MOV(32, R(tempReg3), R(tempReg1)); + AND(32, R(tempReg3), Imm32(0x0000F800)); + SHL(32, R(tempReg3), Imm8(5)); + OR(32, R(tempReg2), R(tempReg3)); + + // Expand 5 -> 8. At this point we have 00BB00RR. + MOV(32, R(tempReg3), R(tempReg2)); + SHL(32, R(tempReg2), Imm8(3)); + SHR(32, R(tempReg3), Imm8(2)); + OR(32, R(tempReg2), R(tempReg3)); + AND(32, R(tempReg2), Imm32(0x00FF00FF)); + + // Now's as good a time to put in A as any. + OR(32, R(tempReg2), Imm32(0xFF000000)); + + // Last, we need to align, extract, and expand G. + // 3 to align to G, and then 2 to expand to 8. + SHL(32, R(tempReg1), Imm8(3 + 2)); + AND(32, R(tempReg1), Imm32(0x0000FC00)); + MOV(32, R(tempReg3), R(tempReg1)); + // 2 to account for tempReg1 being preshifted, 4 for expansion. 
+ SHR(32, R(tempReg3), Imm8(2 + 4)); + OR(32, R(tempReg1), R(tempReg3)); + AND(32, R(tempReg1), Imm32(0x0000FF00)); + OR(32, R(tempReg2), R(tempReg1)); + + MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg2)); +} + +void VertexDecoderJitCache::Jit_Color5551() { + MOV(32, R(tempReg1), MDisp(srcReg, dec_->coloff)); + + MOV(32, R(tempReg2), R(tempReg1)); + AND(32, R(tempReg2), Imm32(0x0000001F)); + + MOV(32, R(tempReg3), R(tempReg1)); + AND(32, R(tempReg3), Imm32(0x000003E0)); + SHL(32, R(tempReg3), Imm8(3)); + OR(32, R(tempReg2), R(tempReg3)); + + MOV(32, R(tempReg3), R(tempReg1)); + AND(32, R(tempReg3), Imm32(0x00007C00)); + SHL(32, R(tempReg3), Imm8(6)); + OR(32, R(tempReg2), R(tempReg3)); + + // Expand 5 -> 8. After this is just A. + MOV(32, R(tempReg3), R(tempReg2)); + SHL(32, R(tempReg2), Imm8(3)); + SHR(32, R(tempReg3), Imm8(2)); + // Chop off the bits that were shifted out. + AND(32, R(tempReg3), Imm32(0x00070707)); + OR(32, R(tempReg2), R(tempReg3)); + + // For A, we shift it to a single bit, and then subtract and XOR. + // That's probably the simplest way to expand it... + SHR(32, R(tempReg1), Imm8(15)); + // If it was 0, it's now -1, otherwise it's 0. Easy. + SUB(32, R(tempReg1), Imm8(1)); + XOR(32, R(tempReg1), Imm32(0xFF000000)); + AND(32, R(tempReg1), Imm32(0xFF000000)); + OR(32, R(tempReg2), R(tempReg1)); + + MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg2)); +} + +// Copy 3 bytes and then a zero. Might as well copy four. +void VertexDecoderJitCache::Jit_NormalS8() { + MOV(32, R(tempReg1), MDisp(srcReg, dec_->nrmoff)); + AND(32, R(tempReg1), Imm32(0x00FFFFFF)); + MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff), R(tempReg1)); +} + +// Copy 6 bytes and then 2 zeroes. +void VertexDecoderJitCache::Jit_NormalS16() { + MOV(32, R(tempReg1), MDisp(srcReg, dec_->nrmoff)); + MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->nrmoff + 4)); + MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff), R(tempReg1)); + MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff + 4), R(tempReg2)); +} + +void VertexDecoderJitCache::Jit_NormalFloat() { + MOV(32, R(tempReg1), MDisp(srcReg, dec_->nrmoff)); + MOV(32, R(tempReg2), MDisp(srcReg, dec_->nrmoff + 4)); + MOV(32, R(tempReg3), MDisp(srcReg, dec_->nrmoff + 8)); + MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff), R(tempReg1)); + MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff + 4), R(tempReg2)); + MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff + 8), R(tempReg3)); +} + +// This could be a bit shorter with AVX 3-operand instructions and FMA. +void VertexDecoderJitCache::Jit_WriteMatrixMul(int outOff, bool pos) { + MOVAPS(XMM1, R(XMM3)); + SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0)); + MULPS(XMM1, R(XMM4)); + MOVAPS(XMM2, R(XMM3)); + SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(1, 1, 1, 1)); + MULPS(XMM2, R(XMM5)); + ADDPS(XMM1, R(XMM2)); + MOVAPS(XMM2, R(XMM3)); + SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(2, 2, 2, 2)); + MULPS(XMM2, R(XMM6)); + ADDPS(XMM1, R(XMM2)); + if (pos) { + ADDPS(XMM1, R(XMM7)); + } + MOVUPS(MDisp(dstReg, outOff), XMM1); +} + +void VertexDecoderJitCache::Jit_NormalS8Skin() { + XORPS(XMM3, R(XMM3)); + MOVD_xmm(XMM1, MDisp(srcReg, dec_->nrmoff)); + PUNPCKLBW(XMM1, R(XMM3)); + PUNPCKLWD(XMM1, R(XMM3)); + PSLLD(XMM1, 24); + PSRAD(XMM1, 24); // Ugly sign extension, can be done faster in SSE4 + CVTDQ2PS(XMM3, R(XMM1)); + MULPS(XMM3, M((void *)&by128)); + Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false); +} + +// Copy 6 bytes and then 2 zeroes. 
+void VertexDecoderJitCache::Jit_NormalS16Skin() { + XORPS(XMM3, R(XMM3)); + MOVQ_xmm(XMM1, MDisp(srcReg, dec_->nrmoff)); + PUNPCKLWD(XMM1, R(XMM3)); + PSLLD(XMM1, 16); + PSRAD(XMM1, 16); // Ugly sign extension, can be done faster in SSE4 + CVTDQ2PS(XMM3, R(XMM1)); + MULPS(XMM3, M((void *)&by32768)); + Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false); +} + +void VertexDecoderJitCache::Jit_NormalFloatSkin() { + MOVUPS(XMM3, MDisp(srcReg, dec_->nrmoff)); + Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false); +} + +// Through expands into floats, always. Might want to look at changing this. +void VertexDecoderJitCache::Jit_PosS8Through() { + // TODO: SIMD + for (int i = 0; i < 3; i++) { + MOVSX(32, 8, tempReg1, MDisp(srcReg, dec_->posoff + i)); + CVTSI2SS(fpScratchReg, R(tempReg1)); + MOVSS(MDisp(dstReg, dec_->decFmt.posoff + i * 4), fpScratchReg); + } +} + +// Through expands into floats, always. Might want to look at changing this. +void VertexDecoderJitCache::Jit_PosS16Through() { + XORPS(XMM3, R(XMM3)); + MOVQ_xmm(XMM1, MDisp(srcReg, dec_->posoff)); + PUNPCKLWD(XMM1, R(XMM3)); + PSLLD(XMM1, 16); + PSRAD(XMM1, 16); // Ugly sign extension, can be done faster in SSE4 + CVTDQ2PS(XMM3, R(XMM1)); + MOVUPS(MDisp(dstReg, dec_->decFmt.posoff), XMM3); +} + +// Copy 3 bytes and then a zero. Might as well copy four. +void VertexDecoderJitCache::Jit_PosS8() { + MOV(32, R(tempReg1), MDisp(srcReg, dec_->posoff)); + AND(32, R(tempReg1), Imm32(0x00FFFFFF)); + MOV(32, MDisp(dstReg, dec_->decFmt.posoff), R(tempReg1)); +} + +// Copy 6 bytes and then 2 zeroes. +void VertexDecoderJitCache::Jit_PosS16() { + MOV(32, R(tempReg1), MDisp(srcReg, dec_->posoff)); + MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->posoff + 4)); + MOV(32, MDisp(dstReg, dec_->decFmt.posoff), R(tempReg1)); + MOV(32, MDisp(dstReg, dec_->decFmt.posoff + 4), R(tempReg2)); +} + +// Just copy 12 bytes. +void VertexDecoderJitCache::Jit_PosFloat() { + MOV(32, R(tempReg1), MDisp(srcReg, dec_->posoff)); + MOV(32, R(tempReg2), MDisp(srcReg, dec_->posoff + 4)); + MOV(32, R(tempReg3), MDisp(srcReg, dec_->posoff + 8)); + MOV(32, MDisp(dstReg, dec_->decFmt.posoff), R(tempReg1)); + MOV(32, MDisp(dstReg, dec_->decFmt.posoff + 4), R(tempReg2)); + MOV(32, MDisp(dstReg, dec_->decFmt.posoff + 8), R(tempReg3)); +} + +void VertexDecoderJitCache::Jit_PosS8Skin() { + XORPS(XMM3, R(XMM3)); + MOVD_xmm(XMM1, MDisp(srcReg, dec_->posoff)); + PUNPCKLBW(XMM1, R(XMM3)); + PUNPCKLWD(XMM1, R(XMM3)); + PSLLD(XMM1, 24); + PSRAD(XMM1, 24); // Ugly sign extension, can be done faster in SSE4 + CVTDQ2PS(XMM3, R(XMM1)); + MULPS(XMM3, M((void *)&by128)); + Jit_WriteMatrixMul(dec_->decFmt.posoff, true); +} + +void VertexDecoderJitCache::Jit_PosS16Skin() { + XORPS(XMM3, R(XMM3)); + MOVQ_xmm(XMM1, MDisp(srcReg, dec_->posoff)); + PUNPCKLWD(XMM1, R(XMM3)); + PSLLD(XMM1, 16); + PSRAD(XMM1, 16); // Ugly sign extension, can be done faster in SSE4 + CVTDQ2PS(XMM3, R(XMM1)); + MULPS(XMM3, M((void *)&by32768)); + Jit_WriteMatrixMul(dec_->decFmt.posoff, true); +} + +// Just copy 12 bytes. 
+void VertexDecoderJitCache::Jit_PosFloatSkin() { + MOVUPS(XMM3, MDisp(srcReg, dec_->posoff)); + Jit_WriteMatrixMul(dec_->decFmt.posoff, true); +} + +bool VertexDecoderJitCache::CompileStep(const VertexDecoder &dec, int step) { + // See if we find a matching JIT function + for (size_t i = 0; i < ARRAY_SIZE(jitLookup); i++) { + if (dec.steps_[step] == jitLookup[i].func) { + ((*this).*jitLookup[i].jitFunc)(); + return true; + } + } + return false; +} diff --git a/GPU/GPU.vcxproj b/GPU/GPU.vcxproj index f15328b66a..a961adbe4a 100644 --- a/GPU/GPU.vcxproj +++ b/GPU/GPU.vcxproj @@ -245,6 +245,13 @@ + + true + true + true + true + + @@ -268,4 +275,4 @@ - + \ No newline at end of file diff --git a/GPU/GPU.vcxproj.filters b/GPU/GPU.vcxproj.filters index 4853171322..a6657c2a69 100644 --- a/GPU/GPU.vcxproj.filters +++ b/GPU/GPU.vcxproj.filters @@ -296,8 +296,10 @@ Common + + - + \ No newline at end of file diff --git a/android/jni/Android.mk b/android/jni/Android.mk index 47eef04939..6edd81b6c7 100644 --- a/android/jni/Android.mk +++ b/android/jni/Android.mk @@ -42,7 +42,8 @@ ARCH_FILES := \ $(SRC)/Core/MIPS/x86/Asm.cpp \ $(SRC)/Core/MIPS/x86/Jit.cpp \ $(SRC)/Core/MIPS/x86/RegCache.cpp \ - $(SRC)/Core/MIPS/x86/RegCacheFPU.cpp + $(SRC)/Core/MIPS/x86/RegCacheFPU.cpp \ + $(SRC)/GPU/GLES/VertexDecoderX86.cpp endif ifeq ($(TARGET_ARCH_ABI),armeabi-v7a) @@ -61,6 +62,7 @@ ARCH_FILES := \ $(SRC)/Core/MIPS/ARM/ArmJit.cpp \ $(SRC)/Core/MIPS/ARM/ArmRegCache.cpp \ $(SRC)/Core/MIPS/ARM/ArmRegCacheFPU.cpp \ + $(SRC)/GPU/GLES/VertexDecoderArm.cpp \ ArmEmitterTest.cpp endif @@ -79,6 +81,7 @@ ARCH_FILES := \ $(SRC)/Core/MIPS/ARM/ArmJit.cpp \ $(SRC)/Core/MIPS/ARM/ArmRegCache.cpp \ $(SRC)/Core/MIPS/ARM/ArmRegCacheFPU.cpp \ + $(SRC)/GPU/GLES/VertexDecoderArm.cpp \ ArmEmitterTest.cpp endif
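
A few scalar reference sketches of the tricks used by the generated code above. They are illustrative only, not part of the patch, and the helper names and signatures are invented for clarity rather than taken from the emulator.

The weight-skinning steps (Jit_WeightsU8Skin / U16Skin / FloatSkin plus Jit_ApplyWeights on ARM) compute a weighted sum of the active 4x3 bone matrices; u8 weights are scaled by 1/128 and u16 weights by 1/32768 first.

#include <cstddef>

// skinMatrix = sum_j weight_j * boneMatrix_j; each PSP bone matrix is
// 12 floats laid out as four columns of three.
static void AccumulateSkinMatrix(const float *boneMatrices,  // nweights * 12 floats
                                 const float *weights, int nweights,
                                 float skinMatrix[12]) {
	for (int i = 0; i < 12; i++) {
		float sum = 0.0f;
		for (int j = 0; j < nweights; j++)
			sum += weights[j] * boneMatrices[j * 12 + i];
		skinMatrix[i] = sum;
	}
}

The x86 backend performs the same accumulation, but against matrices widened to 4x4 once per draw (the threeMasks/aOne step in Compile) so the running sum can stay entirely in XMM4-XMM7.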
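
Jit_WriteMatrixMul then applies the accumulated matrix on both backends. In scalar form, with the same column layout:

// out = M * in, with M stored as four 3-float columns; positions also add
// the translation column, normals do not (the 'pos' flag in the JIT).
static void SkinTransform(const float skinMatrix[12], const float in[3],
                          bool isPosition, float out[3]) {
	for (int i = 0; i < 3; i++) {
		out[i] = skinMatrix[0 + i] * in[0]
		       + skinMatrix[3 + i] * in[1]
		       + skinMatrix[6 + i] * in[2];
		if (isPosition)
			out[i] += skinMatrix[9 + i];
	}
}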
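
The Tc*Prescale steps fold the format's fixed-point factor into the scale once per draw (the by128/by32768 multiplies in Compile), so each vertex costs one multiply-add per component. A sketch, assuming gstate_c.uv holds {uScale, vScale, uOffset, vOffset} as the loads at offsets 0/4/8/12 suggest:

#include <cstdint>

struct UVScaleOffset { float uScale, vScale, uOffset, vOffset; };  // assumed layout

static void DecodeTcU8Prescale(const uint8_t src[2], const UVScaleOffset &uv, float out[2]) {
	out[0] = src[0] * (1.0f / 128.0f) * uv.uScale + uv.uOffset;
	out[1] = src[1] * (1.0f / 128.0f) * uv.vScale + uv.vOffset;
}

static void DecodeTcU16Prescale(const uint16_t src[2], const UVScaleOffset &uv, float out[2]) {
	out[0] = src[0] * (1.0f / 32768.0f) * uv.uScale + uv.uOffset;
	out[1] = src[1] * (1.0f / 32768.0f) * uv.vScale + uv.vOffset;
}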
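
Jit_Color4444 expands each 4-bit channel n to (n << 4) | n. Scalar equivalent, packing little-endian RGBA8888 as the rest of the decoder does:

#include <cstdint>

static uint32_t Convert4444ToRGBA8888(uint16_t c) {
	uint32_t r = (c >> 0) & 0xF, g = (c >> 4) & 0xF;
	uint32_t b = (c >> 8) & 0xF, a = (c >> 12) & 0xF;
	r |= r << 4; g |= g << 4; b |= b << 4; a |= a << 4;
	return (a << 24) | (b << 16) | (g << 8) | r;  // bytes in memory: R, G, B, A
}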
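
Jit_Color565 is the classic 5-to-8 and 6-to-8 expansion by replicating the top bits, with alpha forced to 255; the register shuffling in the JIT does exactly this, two channels at a time:

#include <cstdint>

static uint32_t Convert565ToRGBA8888(uint16_t c) {
	uint32_t r = c & 0x1F, g = (c >> 5) & 0x3F, b = (c >> 11) & 0x1F;
	r = (r << 3) | (r >> 2);  // 5 -> 8 bits
	g = (g << 2) | (g >> 4);  // 6 -> 8 bits
	b = (b << 3) | (b >> 2);
	return 0xFF000000 | (b << 16) | (g << 8) | r;
}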
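
Jit_Color5551 is the same idea with a 1-bit alpha; the SHR/SUB/XOR sequence on x86 and the TSTI2R plus conditional ORI2R on ARM both implement alpha = bit15 ? 0xFF : 0x00:

#include <cstdint>

static uint32_t Convert5551ToRGBA8888(uint16_t c) {
	uint32_t r = c & 0x1F, g = (c >> 5) & 0x1F, b = (c >> 10) & 0x1F;
	uint32_t a = (c >> 15) ? 0xFFu : 0x00u;
	r = (r << 3) | (r >> 2);
	g = (g << 3) | (g >> 2);
	b = (b << 3) | (b >> 2);
	return (a << 24) | (b << 16) | (g << 8) | r;
}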
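
Finally, the PUNPCKL*/PSLLD/PSRAD sequences flagged "Ugly sign extension, can be done faster in SSE4" emulate packed sign extension before CVTDQ2PS. Per 32-bit lane they are equivalent to the shift pairs below; SSE4.1's PMOVSXBD/PMOVSXWD would do the widening in a single instruction.

#include <cstdint>

// Sign-extend the low 8 or 16 bits of a lane to a signed 32-bit value.
static int32_t SignExtend8ToS32(uint32_t lane)  { return (int32_t)(lane << 24) >> 24; }
static int32_t SignExtend16ToS32(uint32_t lane) { return (int32_t)(lane << 16) >> 16; }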