diff --git a/GPU/Software/DrawPixel.cpp b/GPU/Software/DrawPixel.cpp index e81b8dc7c4..8b774eb03e 100644 --- a/GPU/Software/DrawPixel.cpp +++ b/GPU/Software/DrawPixel.cpp @@ -639,15 +639,9 @@ void PixelRegCache::Reset() { regs.clear(); } -void PixelRegCache::Release(PixelRegCache::Reg r, PixelRegCache::Type t, PixelRegCache::Purpose p) { +void PixelRegCache::Add(PixelRegCache::Reg r, PixelRegCache::Type t, PixelRegCache::Purpose p) { RegStatus *status = FindReg(r, t); - if (status) { - _assert_msg_(status->locked > 0, "softjit Release() reg that isn't locked"); - _assert_msg_(!status->forceLocked, "softjit Release() reg that is force locked"); - status->purpose = p; - status->locked--; - return; - } + _assert_msg_(status == nullptr, "softjit Add() reg duplicate"); RegStatus newStatus; newStatus.reg = r; @@ -656,6 +650,24 @@ void PixelRegCache::Release(PixelRegCache::Reg r, PixelRegCache::Type t, PixelRe regs.push_back(newStatus); } +void PixelRegCache::Change(PixelRegCache::Reg r, PixelRegCache::Type t, PixelRegCache::Purpose p) { + RegStatus *status = FindReg(r, t); + _assert_msg_(status != nullptr, "softjit Add() reg duplicate"); + + status->purpose = p; +} + +void PixelRegCache::Release(PixelRegCache::Reg r, PixelRegCache::Type t) { + RegStatus *status = FindReg(r, t); + _assert_msg_(status != nullptr, "softjit Release() reg that isn't there"); + _assert_msg_(status->locked > 0, "softjit Release() reg that isn't locked"); + _assert_msg_(!status->forceLocked, "softjit Release() reg that is force locked"); + + status->locked--; + if (status->locked == 0) + status->purpose = INVALID; +} + void PixelRegCache::Unlock(PixelRegCache::Reg r, PixelRegCache::Type t) { RegStatus *status = FindReg(r, t); if (status) { @@ -717,10 +729,10 @@ PixelRegCache::Reg PixelRegCache::Alloc(PixelRegCache::Purpose p, PixelRegCache: return Reg(); } -void PixelRegCache::ForceLock(PixelRegCache::Purpose p, PixelRegCache::Type t, bool state) { +void PixelRegCache::ForceLock(PixelRegCache::Purpose p, PixelRegCache::Type t) { for (auto ® : regs) { if (reg.purpose == p && reg.type == t) { - reg.forceLocked = state; + reg.forceLocked = true; return; } } @@ -728,6 +740,19 @@ void PixelRegCache::ForceLock(PixelRegCache::Purpose p, PixelRegCache::Type t, b _assert_msg_(false, "softjit ForceLock() reg that isn't there"); } +void PixelRegCache::ForceRelease(PixelRegCache::Purpose p, PixelRegCache::Type t) { + for (auto ® : regs) { + if (reg.purpose == p && reg.type == t) { + _assert_msg_(reg.locked == 0, "softjit ForceRelease() while locked"); + reg.forceLocked = false; + reg.purpose = INVALID; + return; + } + } + + _assert_msg_(false, "softjit ForceRelease() reg that isn't there"); +} + void PixelRegCache::GrabReg(PixelRegCache::Reg r, PixelRegCache::Purpose p, PixelRegCache::Type t, bool &needsSwap, PixelRegCache::Reg swapReg) { for (auto ® : regs) { if (reg.reg != r || reg.type != t) diff --git a/GPU/Software/DrawPixel.h b/GPU/Software/DrawPixel.h index 936ce5537f..95c455a85a 100644 --- a/GPU/Software/DrawPixel.h +++ b/GPU/Software/DrawPixel.h @@ -70,6 +70,12 @@ struct PixelRegCache { STENCIL, COLOR_OFF, DEPTH_OFF, + ARG_X, + ARG_Y, + ARG_Z, + ARG_FOG, + ARG_COLOR, + ARG_MASK, // Above this can only be temps. TEMP0, @@ -100,12 +106,15 @@ struct PixelRegCache { }; void Reset(); - void Release(Reg r, Type t, Purpose p = INVALID); + void Add(Reg r, Type t, Purpose p); + void Change(Reg r, Type t, Purpose p); + void Release(Reg r, Type t); void Unlock(Reg r, Type t); bool Has(Purpose p, Type t); Reg Find(Purpose p, Type t); Reg Alloc(Purpose p, Type t); - void ForceLock(Purpose p, Type t, bool state = true); + void ForceLock(Purpose p, Type t); + void ForceRelease(Purpose p, Type t); // For getting a specific reg. WARNING: May return a locked reg, so you have to check. void GrabReg(Reg r, Purpose p, Type t, bool &needsSwap, Reg swapReg); diff --git a/GPU/Software/DrawPixelX86.cpp b/GPU/Software/DrawPixelX86.cpp index ad1ad8b25c..75d8accc5d 100644 --- a/GPU/Software/DrawPixelX86.cpp +++ b/GPU/Software/DrawPixelX86.cpp @@ -32,22 +32,11 @@ using namespace Gen; namespace Rasterizer { #if PPSSPP_PLATFORM(WINDOWS) -static const X64Reg argXReg = RCX; -static const X64Reg argYReg = RDX; -static const X64Reg argZReg = R8; -static const X64Reg argFogReg = R9; -static const X64Reg argColorReg = XMM4; - // Windows reserves space to save args, 1 xmm + 4 ints before the id. static const OpArg mArgID = MDisp(RSP, 1 * 16 + 4 * PTRBITS / 8); // Must save: RBX, RSP, RBP, RDI, RSI, R12-R15, XMM6-15 #else -static const X64Reg argXReg = RDI; -static const X64Reg argYReg = RSI; -static const X64Reg argZReg = RDX; -static const X64Reg argFogReg = RCX; -static const X64Reg argColorReg = XMM0; // Here we just have the return and padding to align RPB. static const OpArg mArgID = MDisp(RSP, 16); @@ -90,22 +79,41 @@ static OpArg MConstDisp(X64Reg r, const T *t) { SingleFunc PixelJitCache::CompileSingle(const PixelFuncID &id) { // Setup the reg cache. regCache_.Reset(); - regCache_.Release(RAX, PixelRegCache::T_GEN); - regCache_.Release(R10, PixelRegCache::T_GEN); - regCache_.Release(R11, PixelRegCache::T_GEN); - regCache_.Release(XMM1, PixelRegCache::T_VEC); - regCache_.Release(XMM2, PixelRegCache::T_VEC); - regCache_.Release(XMM3, PixelRegCache::T_VEC); - regCache_.Release(XMM5, PixelRegCache::T_VEC); + regCache_.Add(RAX, PixelRegCache::T_GEN, PixelRegCache::INVALID); + regCache_.Add(R10, PixelRegCache::T_GEN, PixelRegCache::INVALID); + regCache_.Add(R11, PixelRegCache::T_GEN, PixelRegCache::INVALID); + regCache_.Add(XMM1, PixelRegCache::T_VEC, PixelRegCache::INVALID); + regCache_.Add(XMM2, PixelRegCache::T_VEC, PixelRegCache::INVALID); + regCache_.Add(XMM3, PixelRegCache::T_VEC, PixelRegCache::INVALID); + regCache_.Add(XMM5, PixelRegCache::T_VEC, PixelRegCache::INVALID); -#if !PPSSPP_PLATFORM(WINDOWS) - regCache_.Release(R8, PixelRegCache::T_GEN); - regCache_.Release(R9, PixelRegCache::T_GEN); - regCache_.Release(XMM4, PixelRegCache::T_VEC); +#if PPSSPP_PLATFORM(WINDOWS) + regCache_.Add(XMM0, PixelRegCache::T_VEC, PixelRegCache::INVALID); + + regCache_.Add(RCX, PixelRegCache::T_GEN, PixelRegCache::ARG_X); + regCache_.Add(RDX, PixelRegCache::T_GEN, PixelRegCache::ARG_Y); + regCache_.Add(R8, PixelRegCache::T_GEN, PixelRegCache::ARG_Z); + regCache_.Add(R9, PixelRegCache::T_GEN, PixelRegCache::ARG_FOG); + regCache_.Add(XMM4, PixelRegCache::T_VEC, PixelRegCache::ARG_COLOR); #else - regCache_.Release(XMM0, PixelRegCache::T_VEC); + regCache_.Add(R8, PixelRegCache::T_GEN, PixelRegCache::INVALID); + regCache_.Add(R9, PixelRegCache::T_GEN, PixelRegCache::INVALID); + regCache_.Add(XMM4, PixelRegCache::T_VEC, PixelRegCache::INVALID); + + regCache_.Add(RDI, PixelRegCache::T_GEN, PixelRegCache::ARG_X); + regCache_.Add(RSI, PixelRegCache::T_GEN, PixelRegCache::ARG_Y); + regCache_.Add(RDX, PixelRegCache::T_GEN, PixelRegCache::ARG_Z); + regCache_.Add(RCX, PixelRegCache::T_GEN, PixelRegCache::ARG_FOG); + regCache_.Add(XMM0, PixelRegCache::T_VEC, PixelRegCache::ARG_COLOR); #endif + // Initially, disallow spill for args (they get unlocked when unused.) + regCache_.ForceLock(PixelRegCache::ARG_X, PixelRegCache::T_GEN); + regCache_.ForceLock(PixelRegCache::ARG_Y, PixelRegCache::T_GEN); + regCache_.ForceLock(PixelRegCache::ARG_Z, PixelRegCache::T_GEN); + regCache_.ForceLock(PixelRegCache::ARG_FOG, PixelRegCache::T_GEN); + regCache_.ForceLock(PixelRegCache::ARG_COLOR, PixelRegCache::T_VEC); + BeginWrite(); const u8 *start = AlignCode16(); bool success = true; @@ -115,8 +123,10 @@ SingleFunc PixelJitCache::CompileSingle(const PixelFuncID &id) { // Next, let's clamp the color (might affect alpha test, and everything expects it clamped.) // We simply convert to 4x8-bit to clamp. Everything else expects color in this format. + X64Reg argColorReg = regCache_.Find(PixelRegCache::ARG_COLOR, PixelRegCache::T_VEC); PACKSSDW(argColorReg, R(argColorReg)); PACKUSWB(argColorReg, R(argColorReg)); + regCache_.Unlock(argColorReg, PixelRegCache::T_VEC); colorIs16Bit_ = false; success = success && Jit_AlphaTest(id); @@ -185,6 +195,8 @@ PixelRegCache::Reg PixelJitCache::GetColorOff(const PixelFuncID &id) { if (id.useStandardStride && !id.dithering) { bool loadDepthOff = id.depthWrite || id.DepthTestFunc() != GE_COMP_ALWAYS; X64Reg depthTemp = INVALID_REG; + X64Reg argYReg = regCache_.Find(PixelRegCache::ARG_Y, PixelRegCache::T_GEN); + X64Reg argXReg = regCache_.Find(PixelRegCache::ARG_X, PixelRegCache::T_GEN); // In this mode, we force argXReg to the off, and throw away argYReg. SHL(32, R(argYReg), Imm8(9)); @@ -202,9 +214,10 @@ PixelRegCache::Reg PixelJitCache::GetColorOff(const PixelFuncID &id) { } LEA(PTRBITS, argYReg, MComplex(argYReg, argXReg, id.FBFormat() == GE_FORMAT_8888 ? 4 : 2, 0)); // With that, argYOff is now COLOR_OFF. - regCache_.Release(argYReg, PixelRegCache::T_GEN, PixelRegCache::COLOR_OFF); + regCache_.Change(argYReg, PixelRegCache::T_GEN, PixelRegCache::COLOR_OFF); // Lock it, because we can't recalculate this. regCache_.ForceLock(PixelRegCache::COLOR_OFF, PixelRegCache::T_GEN); + regCache_.Unlock(argYReg, PixelRegCache::T_GEN); // Next, also calculate the depth offset, unless we won't need it at all. if (loadDepthOff) { @@ -213,15 +226,18 @@ PixelRegCache::Reg PixelJitCache::GetColorOff(const PixelFuncID &id) { regCache_.Release(depthTemp, PixelRegCache::T_GEN); // Okay, same deal - release as DEPTH_OFF and force lock. - regCache_.Release(argXReg, PixelRegCache::T_GEN, PixelRegCache::DEPTH_OFF); + regCache_.Change(argXReg, PixelRegCache::T_GEN, PixelRegCache::DEPTH_OFF); regCache_.ForceLock(PixelRegCache::DEPTH_OFF, PixelRegCache::T_GEN); + regCache_.Unlock(argXReg, PixelRegCache::T_GEN); } else { - regCache_.Release(argXReg, PixelRegCache::T_GEN); + regCache_.Unlock(argXReg, PixelRegCache::T_GEN); + regCache_.ForceRelease(PixelRegCache::ARG_X, PixelRegCache::T_GEN); } return regCache_.Find(PixelRegCache::COLOR_OFF, PixelRegCache::T_GEN); } + X64Reg argYReg = regCache_.Find(PixelRegCache::ARG_Y, PixelRegCache::T_GEN); X64Reg r; if (id.useStandardStride) { r = regCache_.Alloc(PixelRegCache::COLOR_OFF, PixelRegCache::T_GEN); @@ -236,8 +252,11 @@ PixelRegCache::Reg PixelJitCache::GetColorOff(const PixelFuncID &id) { AND(16, R(r), Imm16(0x07FC)); IMUL(32, r, R(argYReg)); } + regCache_.Unlock(argYReg, PixelRegCache::T_GEN); + X64Reg argXReg = regCache_.Find(PixelRegCache::ARG_X, PixelRegCache::T_GEN); ADD(32, R(r), R(argXReg)); + regCache_.Unlock(argXReg, PixelRegCache::T_GEN); X64Reg temp = regCache_.Alloc(PixelRegCache::TEMP_HELPER, PixelRegCache::T_GEN); MOV(PTRBITS, R(temp), ImmPtr(&fb.data)); @@ -259,6 +278,7 @@ PixelRegCache::Reg PixelJitCache::GetDepthOff(const PixelFuncID &id) { return regCache_.Find(PixelRegCache::DEPTH_OFF, PixelRegCache::T_GEN); } + X64Reg argYReg = regCache_.Find(PixelRegCache::ARG_Y, PixelRegCache::T_GEN); X64Reg r; if (id.useStandardStride) { r = regCache_.Alloc(PixelRegCache::DEPTH_OFF, PixelRegCache::T_GEN); @@ -273,8 +293,11 @@ PixelRegCache::Reg PixelJitCache::GetDepthOff(const PixelFuncID &id) { AND(16, R(r), Imm16(0x07FC)); IMUL(32, r, R(argYReg)); } + regCache_.Unlock(argYReg, PixelRegCache::T_GEN); + X64Reg argXReg = regCache_.Find(PixelRegCache::ARG_X, PixelRegCache::T_GEN); ADD(32, R(r), R(argXReg)); + regCache_.Unlock(argXReg, PixelRegCache::T_GEN); X64Reg temp = regCache_.Alloc(PixelRegCache::TEMP_HELPER, PixelRegCache::T_GEN); MOV(PTRBITS, R(temp), ImmPtr(&depthbuf.data)); @@ -332,10 +355,12 @@ bool PixelJitCache::Jit_ApplyDepthRange(const PixelFuncID &id) { MOVZX(32, 16, minReg, MDisp(gstateReg, offsetof(GPUgstate, minz))); MOVZX(32, 16, maxReg, MDisp(gstateReg, offsetof(GPUgstate, maxz))); + X64Reg argZReg = regCache_.Find(PixelRegCache::ARG_Z, PixelRegCache::T_GEN); CMP(32, R(argZReg), R(minReg)); Discard(CC_L); CMP(32, R(argZReg), R(maxReg)); Discard(CC_G); + regCache_.Unlock(argZReg, PixelRegCache::T_GEN); regCache_.Unlock(gstateReg, PixelRegCache::T_GEN); regCache_.Release(minReg, PixelRegCache::T_GEN); @@ -344,9 +369,9 @@ bool PixelJitCache::Jit_ApplyDepthRange(const PixelFuncID &id) { // Since this is early on, try to free up the z reg if we don't need it anymore. if (id.clearMode && !id.DepthClear()) - regCache_.Release(argZReg, PixelRegCache::T_GEN); + regCache_.ForceRelease(PixelRegCache::ARG_Z, PixelRegCache::T_GEN); else if (!id.clearMode && !id.depthWrite && id.DepthTestFunc() == GE_COMP_ALWAYS) - regCache_.Release(argZReg, PixelRegCache::T_GEN); + regCache_.ForceRelease(PixelRegCache::ARG_Z, PixelRegCache::T_GEN); return true; } @@ -372,7 +397,9 @@ bool PixelJitCache::Jit_AlphaTest(const PixelFuncID &id) { } else { alphaReg = regCache_.Alloc(PixelRegCache::SRC_ALPHA, PixelRegCache::T_GEN); _assert_(!colorIs16Bit_); + X64Reg argColorReg = regCache_.Find(PixelRegCache::ARG_COLOR, PixelRegCache::T_VEC); MOVD_xmm(R(alphaReg), argColorReg); + regCache_.Unlock(argColorReg, PixelRegCache::T_VEC); SHR(32, R(alphaReg), Imm8(24)); } @@ -448,6 +475,7 @@ bool PixelJitCache::Jit_ColorTest(const PixelFuncID &id) { MOV(32, R(refReg), MDisp(gstateReg, offsetof(GPUgstate, colorref))); AND(32, R(refReg), R(maskReg)); + X64Reg argColorReg = regCache_.Find(PixelRegCache::ARG_COLOR, PixelRegCache::T_VEC); if (colorIs16Bit_) { // If it's expanded, we need to clamp anyway if it was fogged. PACKUSWB(argColorReg, R(argColorReg)); @@ -457,6 +485,7 @@ bool PixelJitCache::Jit_ColorTest(const PixelFuncID &id) { // Temporarily abuse funcReg to grab the color into maskReg. MOVD_xmm(R(funcReg), argColorReg); AND(32, R(maskReg), R(funcReg)); + regCache_.Unlock(argColorReg, PixelRegCache::T_VEC); // Now that we're setup, get the func and follow it. MOVZX(32, 8, funcReg, MDisp(gstateReg, offsetof(GPUgstate, colortest))); @@ -493,7 +522,7 @@ bool PixelJitCache::Jit_ColorTest(const PixelFuncID &id) { bool PixelJitCache::Jit_ApplyFog(const PixelFuncID &id) { if (!id.applyFog) { // Okay, anyone can use the fog register then. - regCache_.Release(argFogReg, PixelRegCache::T_GEN); + regCache_.ForceRelease(PixelRegCache::ARG_FOG, PixelRegCache::T_GEN); return true; } @@ -519,6 +548,7 @@ bool PixelJitCache::Jit_ApplyFog(const PixelFuncID &id) { regCache_.Unlock(constReg, PixelRegCache::T_GEN); // Expand (we clamped) color to 16 bit as well, so we can multiply with fog. + X64Reg argColorReg = regCache_.Find(PixelRegCache::ARG_COLOR, PixelRegCache::T_VEC); if (!colorIs16Bit_) { if (cpu_info.bSSE4_1) { PMOVZXBW(argColorReg, R(argColorReg)); @@ -541,10 +571,12 @@ bool PixelJitCache::Jit_ApplyFog(const PixelFuncID &id) { // Okay, let's broadcast fog to an XMM. X64Reg fogMultReg = regCache_.Alloc(PixelRegCache::TEMP3, PixelRegCache::T_VEC); + X64Reg argFogReg = regCache_.Find(PixelRegCache::ARG_FOG, PixelRegCache::T_GEN); MOVD_xmm(fogMultReg, R(argFogReg)); PSHUFLW(fogMultReg, R(fogMultReg), _MM_SHUFFLE(0, 0, 0, 0)); + regCache_.Unlock(argFogReg, PixelRegCache::T_GEN); // We can free up the actual fog reg now. - regCache_.Release(argFogReg, PixelRegCache::T_GEN); + regCache_.ForceRelease(PixelRegCache::ARG_FOG, PixelRegCache::T_GEN); // Now we multiply the existing color by fog... PMULLW(argColorReg, R(fogMultReg)); @@ -566,6 +598,7 @@ bool PixelJitCache::Jit_ApplyFog(const PixelFuncID &id) { // Okay, put A back in, we'll shrink it to 8888 when needed. PINSRW(argColorReg, R(alphaReg), 3); + regCache_.Unlock(argColorReg, PixelRegCache::T_VEC); // We won't use alphaReg again, so toss it. regCache_.Release(alphaReg, PixelRegCache::T_GEN); @@ -688,8 +721,10 @@ bool PixelJitCache::Jit_DepthTestForStencil(const PixelFuncID &id, PixelRegCache return true; X64Reg depthOffReg = GetDepthOff(id); + X64Reg argZReg = regCache_.Find(PixelRegCache::ARG_Z, PixelRegCache::T_GEN); CMP(16, R(argZReg), MatR(depthOffReg)); regCache_.Unlock(depthOffReg, PixelRegCache::T_GEN); + regCache_.Unlock(argZReg, PixelRegCache::T_GEN); // We discard the opposite of the passing test. FixupBranch skip; @@ -741,7 +776,7 @@ bool PixelJitCache::Jit_DepthTestForStencil(const PixelFuncID &id, PixelRegCache // Like in Jit_DepthTest(), at this point we may not need this reg anymore. if (!id.depthWrite) - regCache_.Release(argZReg, PixelRegCache::T_GEN); + regCache_.ForceRelease(PixelRegCache::ARG_Z, PixelRegCache::T_GEN); return success; } @@ -919,8 +954,10 @@ bool PixelJitCache::Jit_DepthTest(const PixelFuncID &id) { } X64Reg depthOffReg = GetDepthOff(id); + X64Reg argZReg = regCache_.Find(PixelRegCache::ARG_Z, PixelRegCache::T_GEN); CMP(16, R(argZReg), MatR(depthOffReg)); regCache_.Unlock(depthOffReg, PixelRegCache::T_GEN); + regCache_.Unlock(argZReg, PixelRegCache::T_GEN); // We discard the opposite of the passing test. switch (id.DepthTestFunc()) { @@ -955,7 +992,7 @@ bool PixelJitCache::Jit_DepthTest(const PixelFuncID &id) { // If we're not writing, we don't need Z anymore. We'll free DEPTH_OFF in Jit_WriteDepth(). if (!id.depthWrite) - regCache_.Release(argZReg, PixelRegCache::T_GEN); + regCache_.ForceRelease(PixelRegCache::ARG_Z, PixelRegCache::T_GEN); return true; } @@ -964,14 +1001,16 @@ bool PixelJitCache::Jit_WriteDepth(const PixelFuncID &id) { // Clear mode shares depthWrite for DepthClear(). if (id.depthWrite) { X64Reg depthOffReg = GetDepthOff(id); + X64Reg argZReg = regCache_.Find(PixelRegCache::ARG_Z, PixelRegCache::T_GEN); MOV(16, MatR(depthOffReg), R(argZReg)); regCache_.Unlock(depthOffReg, PixelRegCache::T_GEN); - regCache_.Release(argZReg, PixelRegCache::T_GEN); + regCache_.Unlock(argZReg, PixelRegCache::T_GEN); + regCache_.ForceRelease(PixelRegCache::ARG_Z, PixelRegCache::T_GEN); } // We can free up this reg if we force locked it. if (regCache_.Has(PixelRegCache::DEPTH_OFF, PixelRegCache::T_GEN)) { - regCache_.ForceLock(PixelRegCache::DEPTH_OFF, PixelRegCache::T_GEN, false); + regCache_.ForceRelease(PixelRegCache::DEPTH_OFF, PixelRegCache::T_GEN); } return true; @@ -1027,6 +1066,7 @@ bool PixelJitCache::Jit_AlphaBlend(const PixelFuncID &id) { } // Step 2: Load and apply factors. + X64Reg argColorReg = regCache_.Find(PixelRegCache::ARG_COLOR, PixelRegCache::T_VEC); if (blendState.usesFactors) { X64Reg srcFactorReg = regCache_.Alloc(PixelRegCache::TEMP1, PixelRegCache::T_VEC); X64Reg dstFactorReg = regCache_.Alloc(PixelRegCache::TEMP2, PixelRegCache::T_VEC); @@ -1119,6 +1159,7 @@ bool PixelJitCache::Jit_AlphaBlend(const PixelFuncID &id) { regCache_.Release(tempReg, PixelRegCache::T_VEC); regCache_.Release(dstReg, PixelRegCache::T_VEC); + regCache_.Unlock(argColorReg, PixelRegCache::T_VEC); return success; } @@ -1128,6 +1169,7 @@ bool PixelJitCache::Jit_BlendFactor(const PixelFuncID &id, PixelRegCache::Reg fa X64Reg constReg = INVALID_REG; X64Reg gstateReg = INVALID_REG; X64Reg tempReg = INVALID_REG; + X64Reg argColorReg = regCache_.Find(PixelRegCache::ARG_COLOR, PixelRegCache::T_VEC); // Everything below expects an expanded 16-bit color _assert_(colorIs16Bit_); @@ -1224,6 +1266,7 @@ bool PixelJitCache::Jit_BlendFactor(const PixelFuncID &id, PixelRegCache::Reg fa regCache_.Unlock(gstateReg, PixelRegCache::T_GEN); if (tempReg != INVALID_REG) regCache_.Release(tempReg, PixelRegCache::T_VEC); + regCache_.Unlock(argColorReg, PixelRegCache::T_VEC); return true; } @@ -1232,6 +1275,7 @@ bool PixelJitCache::Jit_DstBlendFactor(const PixelFuncID &id, PixelRegCache::Reg bool success = true; X64Reg constReg = INVALID_REG; X64Reg gstateReg = INVALID_REG; + X64Reg argColorReg = regCache_.Find(PixelRegCache::ARG_COLOR, PixelRegCache::T_VEC); // Everything below expects an expanded 16-bit color _assert_(colorIs16Bit_); @@ -1291,6 +1335,7 @@ bool PixelJitCache::Jit_DstBlendFactor(const PixelFuncID &id, PixelRegCache::Reg regCache_.Unlock(constReg, PixelRegCache::T_GEN); if (gstateReg != INVALID_REG) regCache_.Unlock(gstateReg, PixelRegCache::T_GEN); + regCache_.Unlock(argColorReg, PixelRegCache::T_VEC); return success; } @@ -1305,6 +1350,7 @@ bool PixelJitCache::Jit_Dither(const PixelFuncID &id) { X64Reg valueReg = regCache_.Alloc(PixelRegCache::TEMP0, PixelRegCache::T_GEN); // Load the row dither matrix entry (will still need to get the X.) + X64Reg argYReg = regCache_.Find(PixelRegCache::ARG_Y, PixelRegCache::T_GEN); MOV(32, R(valueReg), R(argYReg)); AND(32, R(valueReg), Imm8(3)); #ifndef SOFTPIXEL_USE_CACHE @@ -1316,8 +1362,11 @@ bool PixelJitCache::Jit_Dither(const PixelFuncID &id) { // Then we can modify x and throw it away too, which is our actual goal. regCache_.Unlock(GetColorOff(id), PixelRegCache::T_GEN); regCache_.ForceLock(PixelRegCache::COLOR_OFF, PixelRegCache::T_GEN); - regCache_.Release(argYReg, PixelRegCache::T_GEN); + // And get rid of y, we can use for other regs. + regCache_.Unlock(argYReg, PixelRegCache::T_GEN); + regCache_.ForceRelease(PixelRegCache::ARG_Y, PixelRegCache::T_GEN); + X64Reg argXReg = regCache_.Find(PixelRegCache::ARG_X, PixelRegCache::T_GEN); AND(32, R(argXReg), Imm32(3)); #ifndef SOFTPIXEL_USE_CACHE @@ -1333,8 +1382,8 @@ bool PixelJitCache::Jit_Dither(const PixelFuncID &id) { if (valueReg == RCX) valueReg = argXReg; } else { + // We'll unlock and force release argXReg later, but copy for now. MOV(32, R(RCX), R(argXReg)); - regCache_.Release(argXReg, PixelRegCache::T_GEN); } } @@ -1342,8 +1391,9 @@ bool PixelJitCache::Jit_Dither(const PixelFuncID &id) { SHR(32, R(valueReg), R(CL)); AND(16, R(valueReg), Imm16(0x000F)); - // This will either be argXReg on Windows, or RCX we explicitly grabbed. - regCache_.Release(RCX, PixelRegCache::T_GEN); + // Release RCX if we explicitly grabbed. + if (argXReg != RCX) + regCache_.Release(RCX, PixelRegCache::T_GEN); // Now we need to make 0-7 positive, 8-F negative.. so sign extend. SHL(32, R(valueReg), Imm8(4)); @@ -1357,8 +1407,9 @@ bool PixelJitCache::Jit_Dither(const PixelFuncID &id) { // Okay, now abuse argXReg to read the PixelFuncID pointer on the stack. MOV(PTRBITS, R(argXReg), mArgID); MOVSX(32, 16, valueReg, MRegSum(argXReg, valueReg)); - regCache_.Release(argXReg, PixelRegCache::T_GEN); #endif + regCache_.Unlock(argXReg, PixelRegCache::T_GEN); + regCache_.ForceRelease(PixelRegCache::ARG_X, PixelRegCache::T_GEN); // Copy that value into a vec to add to the color. X64Reg vecValueReg = regCache_.Alloc(PixelRegCache::TEMP0, PixelRegCache::T_VEC); @@ -1369,7 +1420,9 @@ bool PixelJitCache::Jit_Dither(const PixelFuncID &id) { // Luckily, we know that second lane (in 16-bit) is zero from valueReg's high 16 bits. // We use 16-bit because we need a signed add, but we also want to saturate. PSHUFLW(vecValueReg, R(vecValueReg), _MM_SHUFFLE(1, 0, 0, 0)); + // With that, now let's convert the color to 16 bit... + X64Reg argColorReg = regCache_.Find(PixelRegCache::ARG_COLOR, PixelRegCache::T_VEC); if (!colorIs16Bit_) { if (cpu_info.bSSE4_1) { PMOVZXBW(argColorReg, R(argColorReg)); @@ -1383,20 +1436,25 @@ bool PixelJitCache::Jit_Dither(const PixelFuncID &id) { // And simply add the dither values. PADDSW(argColorReg, R(vecValueReg)); regCache_.Release(vecValueReg, PixelRegCache::T_VEC); + regCache_.Unlock(argColorReg, PixelRegCache::T_VEC); return true; } bool PixelJitCache::Jit_WriteColor(const PixelFuncID &id) { X64Reg colorOff = GetColorOff(id); - if (!id.useStandardStride && !id.dithering) { - // We won't need X or Y anymore, so toss them for reg space. - regCache_.Release(argXReg, PixelRegCache::T_GEN); - regCache_.Release(argYReg, PixelRegCache::T_GEN); + if (regCache_.Has(PixelRegCache::ARG_X, PixelRegCache::T_GEN)) { + // We normally toss x and y during dithering or useStandardStride with no dithering. + // Free up the regs now to get more reg space. + regCache_.ForceRelease(PixelRegCache::ARG_X, PixelRegCache::T_GEN); + regCache_.ForceRelease(PixelRegCache::ARG_Y, PixelRegCache::T_GEN); + + // But make sure we don't lose COLOR_OFF, we'll be lost without that now. regCache_.ForceLock(PixelRegCache::COLOR_OFF, PixelRegCache::T_GEN); } // Convert back to 8888 and clamp. + X64Reg argColorReg = regCache_.Find(PixelRegCache::ARG_COLOR, PixelRegCache::T_VEC); if (colorIs16Bit_) { PACKUSWB(argColorReg, R(argColorReg)); colorIs16Bit_ = false; @@ -1416,6 +1474,7 @@ bool PixelJitCache::Jit_WriteColor(const PixelFuncID &id) { bool success = Jit_WriteStencilOnly(id, alphaReg); regCache_.Release(alphaReg, PixelRegCache::T_GEN); + regCache_.Unlock(argColorReg, PixelRegCache::T_VEC); return success; } @@ -1424,6 +1483,7 @@ bool PixelJitCache::Jit_WriteColor(const PixelFuncID &id) { X64Reg colorReg = regCache_.Alloc(PixelRegCache::TEMP0, PixelRegCache::T_GEN); MOVD_xmm(R(colorReg), argColorReg); + regCache_.Unlock(argColorReg, PixelRegCache::T_VEC); X64Reg stencilReg = INVALID_REG; if (regCache_.Has(PixelRegCache::STENCIL, PixelRegCache::T_GEN)) @@ -1592,8 +1652,8 @@ bool PixelJitCache::Jit_WriteColor(const PixelFuncID &id) { if (maskReg != INVALID_REG) regCache_.Release(maskReg, PixelRegCache::T_GEN); if (stencilReg != INVALID_REG) { - regCache_.ForceLock(PixelRegCache::STENCIL, PixelRegCache::T_GEN, false); - regCache_.Release(stencilReg, PixelRegCache::T_GEN); + regCache_.Unlock(stencilReg, PixelRegCache::T_GEN); + regCache_.ForceRelease(PixelRegCache::STENCIL, PixelRegCache::T_GEN); } return success;