ppsspp/GPU/Software/DrawPixelX86.cpp

2333 lines
72 KiB
C++

// Copyright (c) 2017- PPSSPP Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
#include "ppsspp_config.h"
#if PPSSPP_ARCH(AMD64)
#include <emmintrin.h>
#include "Common/x64Emitter.h"
#include "Common/CPUDetect.h"
#include "Core/Reporting.h"
#include "GPU/GPUState.h"
#include "GPU/Software/DrawPixel.h"
#include "GPU/Software/SoftGpu.h"
#include "GPU/ge_constants.h"
using namespace Gen;
namespace Rasterizer {
// Returns true when the byte distance between t1 and t2 lies strictly inside
// (-0x7FFFFFE0, 0x7FFFFFE0), i.e. fits in a signed 32-bit displacement with a
// small margin to spare.
//
// The distance is computed on the integer values of the addresses rather than
// by subtracting the pointers: callers pass addresses of unrelated objects
// (for example &fb.data and &depthbuf.data), and subtracting pointers that do
// not point into the same array is undefined behavior.
template <typename T>
static bool Accessible(const T *t1, const T *t2) {
	// Unsigned subtraction wraps in a well-defined way; the cast back to signed
	// then yields the signed distance.
	const intptr_t diff = (intptr_t)((uintptr_t)t1 - (uintptr_t)t2);
	return diff > -0x7FFFFFE0 && diff < 0x7FFFFFE0;
}
// Builds a [r + disp] memory operand whose displacement is the byte distance
// from tbase to t.  In the visible caller, r has been loaded with the address
// tbase, so the resulting operand addresses t.
//
// Asserts that the distance passes Accessible().  The distance is computed on
// the integer values of the addresses, because tbase and t refer to unrelated
// objects and subtracting such pointers directly is undefined behavior.
template <typename T>
static OpArg MAccessibleDisp(X64Reg r, const T *tbase, const T *t) {
	_assert_(Accessible(tbase, t));
	const intptr_t diff = (intptr_t)((uintptr_t)t - (uintptr_t)tbase);
	return MDisp(r, (int)diff);
}
// Compiles the single-pixel draw routine specialized for `id`.
//
// The stages are emitted in a fixed order: depth range, color clamp, alpha test,
// fog, color test, stencil/depth test, depth write, alpha blend, dither, color write.
// Every jump recorded by Discard() along the way is pointed at the code position
// that follows the last stage.
//
// Returns the pointer produced by WriteFinalizedEpilog(), or nullptr when any stage
// reported failure (in which case the code pointer is rewound to resetPos).
SingleFunc PixelJitCache::CompileSingle(const PixelFuncID &id) {
// Setup the reg cache and disallow spill for arguments.
regCache_.SetupABI({
RegCache::GEN_ARG_X,
RegCache::GEN_ARG_Y,
RegCache::GEN_ARG_Z,
RegCache::GEN_ARG_FOG,
RegCache::VEC_ARG_COLOR,
RegCache::GEN_ARG_ID,
});
BeginWrite();
Describe("Init");
WriteConstantPool(id);
// Remember where the function's code begins (after the constant pool), so a failed
// compile can rewind to here.
const u8 *resetPos = AlignCode16();
bool success = true;
#if PPSSPP_PLATFORM(WINDOWS)
// RET + Windows reserves space to save args, half of 1 xmm + 4 ints before the id.
_assert_(!regCache_.Has(RegCache::GEN_ARG_ID));
int stackSpace = 0;
// NOTE(review): only the stencil-test-mask variant saves the extra XMM/GPR registers,
// presumably because only that path needs them - confirm.
if (id.hasStencilTestMask)
stackSpace = WriteProlog(0, { XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15 }, { R12, R13, R14, R15 });
else
stackSpace = WriteProlog(0, {}, {});
// Offset from RSP of the id argument: prolog space + 8 + 8 + four pointer-sized slots.
stackIDOffset_ = stackSpace + 8 + 8 + 4 * PTRBITS / 8;
#else
// Here the id arrives in a register, so there is no stack slot to reload it from.
_assert_(regCache_.Has(RegCache::GEN_ARG_ID));
WriteProlog(0, {}, {});
stackIDOffset_ = -1;
#endif
// Start with the depth range.
success = success && Jit_ApplyDepthRange(id);
// Next, let's clamp the color (might affect alpha test, and everything expects it clamped.)
// We simply convert to 4x8-bit to clamp. Everything else expects color in this format.
Describe("ClampColor");
X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
// Two saturating packs: 32-bit lanes -> signed 16-bit -> unsigned 8-bit.
PACKSSDW(argColorReg, R(argColorReg));
PACKUSWB(argColorReg, R(argColorReg));
regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);
colorIs16Bit_ = false;
success = success && Jit_AlphaTest(id);
// Fog is applied prior to color test. Maybe before alpha test too, but it doesn't affect it...
success = success && Jit_ApplyFog(id);
success = success && Jit_ColorTest(id);
// Clear mode skips both the stencil test and the depth test.
if (id.stencilTest && !id.clearMode)
success = success && Jit_StencilAndDepthTest(id);
else if (!id.clearMode)
success = success && Jit_DepthTest(id);
success = success && Jit_WriteDepth(id);
success = success && Jit_AlphaBlend(id);
success = success && Jit_Dither(id);
success = success && Jit_WriteColor(id);
// All discards land here, after the last stage and before the epilog.
for (auto &fixup : discards_) {
SetJumpTarget(fixup);
}
discards_.clear();
if (regCache_.Has(RegCache::GEN_ARG_ID))
regCache_.ForceRelease(RegCache::GEN_ARG_ID);
if (!success) {
ERROR_LOG_REPORT(G3D, "Could not compile pixel func: %s", DescribePixelFuncID(id).c_str());
regCache_.Reset(false);
EndWrite();
// Throw away whatever was emitted for this function.
ResetCodePtr(GetOffset(resetPos));
return nullptr;
}
const u8 *start = WriteFinalizedEpilog();
regCache_.Reset(true);
return (SingleFunc)start;
}
// Returns a register holding the PixelFuncID pointer (callers use it as the base
// for offsetof(PixelFuncID, ...) loads).  Release it with UnlockPixelID().
RegCache::Reg PixelJitCache::GetPixelID() {
	// The argument register wins whenever the cache still tracks it.
	if (regCache_.Has(RegCache::GEN_ARG_ID))
		return regCache_.Find(RegCache::GEN_ARG_ID);

	// A copy reloaded earlier can simply be reused.
	if (regCache_.Has(RegCache::GEN_ID))
		return regCache_.Find(RegCache::GEN_ID);

	// Otherwise fetch it from its stack slot into a fresh GEN_ID register.
	X64Reg idReg = regCache_.Alloc(RegCache::GEN_ID);
	_assert_(stackIDOffset_ != -1);
	MOV(PTRBITS, R(idReg), MDisp(RSP, stackIDOffset_));
	return idReg;
}
// Unlocks a register obtained from GetPixelID(), under whichever purpose
// (argument register or reloaded copy) the cache currently tracks.
void PixelJitCache::UnlockPixelID(RegCache::Reg &r) {
	if (!regCache_.Has(RegCache::GEN_ARG_ID)) {
		// No argument register tracked: r must be the reloaded GEN_ID copy.
		regCache_.Unlock(r, RegCache::GEN_ID);
		return;
	}
	regCache_.Unlock(r, RegCache::GEN_ARG_ID);
}
// Returns a register holding the address of the destination pixel inside the color
// buffer: fb.data + (y * stride + x) * bytesPerPixel (4 for 8888, otherwise 2).
// The calculation is emitted the first time this is called; afterwards the cached
// GEN_COLOR_OFF register is returned.  Callers unlock it with GEN_COLOR_OFF.
RegCache::Reg PixelJitCache::GetColorOff(const PixelFuncID &id) {
if (!regCache_.Has(RegCache::GEN_COLOR_OFF)) {
Describe("GetColorOff");
// Fast path: the x/y argument registers themselves are turned into the offsets.
// NOTE(review): presumably excluded when dithering because x/y are still needed
// later in that case - confirm.
if (id.useStandardStride && !id.dithering) {
// The depth offset is only produced when something will actually read or write depth.
bool loadDepthOff = id.depthWrite || (id.DepthTestFunc() != GE_COMP_ALWAYS && !id.earlyZChecks);
X64Reg depthTemp = INVALID_REG;
X64Reg argYReg = regCache_.Find(RegCache::GEN_ARG_Y);
X64Reg argXReg = regCache_.Find(RegCache::GEN_ARG_X);
// In this mode, we force argXReg to the off, and throw away argYReg.
// x += y << 9, i.e. y * 512 + x.
SHL(32, R(argYReg), Imm8(9));
ADD(32, R(argXReg), R(argYReg));
// Now add the pointer for the color buffer.
if (loadDepthOff) {
_assert_(Accessible(&fb.data, &depthbuf.data));
depthTemp = regCache_.Alloc(RegCache::GEN_DEPTH_OFF);
if (RipAccessible(&fb.data) && RipAccessible(&depthbuf.data)) {
MOV(PTRBITS, R(argYReg), M(&fb.data));
} else {
// Keep &fb.data in depthTemp; the depth pointer is loaded relative to it below.
MOV(PTRBITS, R(depthTemp), ImmPtr(&fb.data));
MOV(PTRBITS, R(argYReg), MatR(depthTemp));
}
} else {
if (RipAccessible(&fb.data)) {
MOV(PTRBITS, R(argYReg), M(&fb.data));
} else {
MOV(PTRBITS, R(argYReg), ImmPtr(&fb.data));
MOV(PTRBITS, R(argYReg), MatR(argYReg));
}
}
// argYReg = color base + offset * bytes per pixel.
LEA(PTRBITS, argYReg, MComplex(argYReg, argXReg, id.FBFormat() == GE_FORMAT_8888 ? 4 : 2, 0));
// With that, argYReg is now GEN_COLOR_OFF.
regCache_.Unlock(argYReg, RegCache::GEN_ARG_Y);
regCache_.Change(RegCache::GEN_ARG_Y, RegCache::GEN_COLOR_OFF);
// Retain it, because we can't recalculate this.
regCache_.ForceRetain(RegCache::GEN_COLOR_OFF);
// Next, also calculate the depth offset, unless we won't need it at all.
if (loadDepthOff) {
if (RipAccessible(&fb.data) && RipAccessible(&depthbuf.data)) {
MOV(PTRBITS, R(depthTemp), M(&depthbuf.data));
} else {
// depthTemp still holds &fb.data from above, so address depthbuf.data relative to it.
MOV(PTRBITS, R(depthTemp), MAccessibleDisp(depthTemp, &fb.data, &depthbuf.data));
}
// argXReg = depth base + offset * 2 (depth values are accessed as 16 bits).
LEA(PTRBITS, argXReg, MComplex(depthTemp, argXReg, 2, 0));
regCache_.Release(depthTemp, RegCache::GEN_DEPTH_OFF);
// Okay, same deal - release as GEN_DEPTH_OFF and force retain it.
regCache_.Unlock(argXReg, RegCache::GEN_ARG_X);
regCache_.Change(RegCache::GEN_ARG_X, RegCache::GEN_DEPTH_OFF);
regCache_.ForceRetain(RegCache::GEN_DEPTH_OFF);
} else {
// x was clobbered above and nothing needs the depth offset, so just drop it.
regCache_.Unlock(argXReg, RegCache::GEN_ARG_X);
regCache_.ForceRelease(RegCache::GEN_ARG_X);
}
return regCache_.Find(RegCache::GEN_COLOR_OFF);
}
// General path: compute into a new register and leave the x/y arguments intact.
X64Reg argYReg = regCache_.Find(RegCache::GEN_ARG_Y);
X64Reg r = regCache_.Alloc(RegCache::GEN_COLOR_OFF);
if (id.useStandardStride) {
MOV(32, R(r), R(argYReg));
SHL(32, R(r), Imm8(9));
} else {
// The stride is read from the cached ID at run time (16-bit field).
if (regCache_.Has(RegCache::GEN_ARG_ID) || regCache_.Has(RegCache::GEN_ID)) {
X64Reg idReg = GetPixelID();
MOVZX(32, 16, r, MDisp(idReg, offsetof(PixelFuncID, cached.framebufStride)));
UnlockPixelID(idReg);
} else {
// No ID register is live: go through r itself rather than allocating another register.
_assert_(stackIDOffset_ != -1);
MOV(PTRBITS, R(r), MDisp(RSP, stackIDOffset_));
MOVZX(32, 16, r, MDisp(r, offsetof(PixelFuncID, cached.framebufStride)));
}
IMUL(32, r, R(argYReg));
}
regCache_.Unlock(argYReg, RegCache::GEN_ARG_Y);
X64Reg argXReg = regCache_.Find(RegCache::GEN_ARG_X);
ADD(32, R(r), R(argXReg));
regCache_.Unlock(argXReg, RegCache::GEN_ARG_X);
// Load the color buffer pointer and combine it with the scaled pixel offset.
X64Reg temp = regCache_.Alloc(RegCache::GEN_TEMP_HELPER);
if (RipAccessible(&fb.data)) {
MOV(PTRBITS, R(temp), M(&fb.data));
} else {
MOV(PTRBITS, R(temp), ImmPtr(&fb.data));
MOV(PTRBITS, R(temp), MatR(temp));
}
LEA(PTRBITS, r, MComplex(temp, r, id.FBFormat() == GE_FORMAT_8888 ? 4 : 2, 0));
regCache_.Release(temp, RegCache::GEN_TEMP_HELPER);
return r;
}
return regCache_.Find(RegCache::GEN_COLOR_OFF);
}
// Returns a register holding the address of the pixel's depth value:
// depthbuf.data + (y * stride + x) * 2.  The calculation is emitted on first use;
// afterwards the cached GEN_DEPTH_OFF register is returned.  Callers unlock it
// with GEN_DEPTH_OFF.
RegCache::Reg PixelJitCache::GetDepthOff(const PixelFuncID &id) {
if (!regCache_.Has(RegCache::GEN_DEPTH_OFF)) {
// If both color and depth use 512, the offsets are the same.
if (id.useStandardStride && !id.dithering) {
// Calculate once inside GetColorOff().
// In this mode GetColorOff() sets up GEN_DEPTH_OFF as a side effect when depth is
// written or tested, so it can be looked up right after.
X64Reg colorOffReg = GetColorOff(id);
regCache_.Unlock(colorOffReg, RegCache::GEN_COLOR_OFF);
return regCache_.Find(RegCache::GEN_DEPTH_OFF);
}
Describe("GetDepthOff");
X64Reg argYReg = regCache_.Find(RegCache::GEN_ARG_Y);
X64Reg r = regCache_.Alloc(RegCache::GEN_DEPTH_OFF);
if (id.useStandardStride) {
// y * 512.
MOV(32, R(r), R(argYReg));
SHL(32, R(r), Imm8(9));
} else {
// The stride is read from the cached ID at run time (16-bit field).
if (regCache_.Has(RegCache::GEN_ARG_ID) || regCache_.Has(RegCache::GEN_ID)) {
X64Reg idReg = GetPixelID();
MOVZX(32, 16, r, MDisp(idReg, offsetof(PixelFuncID, cached.depthbufStride)));
UnlockPixelID(idReg);
} else {
// No ID register is live: go through r itself rather than allocating another register.
_assert_(stackIDOffset_ != -1);
MOV(PTRBITS, R(r), MDisp(RSP, stackIDOffset_));
MOVZX(32, 16, r, MDisp(r, offsetof(PixelFuncID, cached.depthbufStride)));
}
IMUL(32, r, R(argYReg));
}
regCache_.Unlock(argYReg, RegCache::GEN_ARG_Y);
X64Reg argXReg = regCache_.Find(RegCache::GEN_ARG_X);
ADD(32, R(r), R(argXReg));
regCache_.Unlock(argXReg, RegCache::GEN_ARG_X);
// Load the depth buffer pointer and combine it with the offset scaled by 2.
X64Reg temp = regCache_.Alloc(RegCache::GEN_TEMP_HELPER);
if (RipAccessible(&depthbuf.data)) {
MOV(PTRBITS, R(temp), M(&depthbuf.data));
} else {
MOV(PTRBITS, R(temp), ImmPtr(&depthbuf.data));
MOV(PTRBITS, R(temp), MatR(temp));
}
LEA(PTRBITS, r, MComplex(temp, r, 2, 0));
regCache_.Release(temp, RegCache::GEN_TEMP_HELPER);
return r;
}
return regCache_.Find(RegCache::GEN_DEPTH_OFF);
}
// Loads the destination pixel's stencil, widened to a full byte, into a newly
// allocated GEN_STENCIL register and returns it.  Returns INVALID_REG for 565,
// which callers treat as a stencil that is always zero.
RegCache::Reg PixelJitCache::GetDestStencil(const PixelFuncID &id) {
	const auto format = id.FBFormat();
	// 565 carries no stencil bits, so there is nothing to load.
	if (format == GE_FORMAT_565)
		return INVALID_REG;

	X64Reg dstPtrReg = GetColorOff(id);
	Describe("GetDestStencil");
	X64Reg resultReg = regCache_.Alloc(RegCache::GEN_STENCIL);

	switch (format) {
	case GE_FORMAT_8888:
		// The stencil is the top byte of the 32-bit pixel.
		MOVZX(32, 8, resultReg, MDisp(dstPtrReg, 3));
		break;

	case GE_FORMAT_5551:
		// The stencil is the top bit; an 8-bit arithmetic shift smears it over the byte.
		MOVZX(32, 8, resultReg, MDisp(dstPtrReg, 1));
		SAR(8, R(resultReg), Imm8(7));
		break;

	case GE_FORMAT_4444: {
		// The stencil is the top nibble; duplicate it into both nibbles of the byte.
		MOVZX(32, 8, resultReg, MDisp(dstPtrReg, 1));
		SHR(32, R(resultReg), Imm8(4));
		X64Reg highNibbleReg = regCache_.Alloc(RegCache::GEN_TEMP_HELPER);
		MOV(32, R(highNibbleReg), R(resultReg));
		SHL(32, R(highNibbleReg), Imm8(4));
		OR(32, R(resultReg), R(highNibbleReg));
		regCache_.Release(highNibbleReg, RegCache::GEN_TEMP_HELPER);
		break;
	}

	default:
		break;
	}

	regCache_.Unlock(dstPtrReg, RegCache::GEN_COLOR_OFF);
	return resultReg;
}
void PixelJitCache::Discard() {
discards_.push_back(J(true));
}
// Emits a jump, taken when condition `cc` holds, that abandons the pixel.  The jump
// is recorded in discards_ and receives its target once all stages have been emitted.
// NOTE(review): the `true` argument presumably forces the long jump encoding - confirm
// against the emitter.
void PixelJitCache::Discard(Gen::CCFlags cc) {
	FixupBranch exitJump = J_CC(cc, true);
	discards_.push_back(exitJump);
}
// Writes the vector constants the blending code loads from memory.
void PixelJitCache::WriteConstantPool(const PixelFuncID &id) {
	// Fixed point 0.5 (as s.11.4), added so blend factor multiplies round accurately.
	constexpr int kBlendHalf = 1 << 3;
	// 255 shifted into the same s.11.4 layout, used to invert shifted blend factors.
	constexpr int kBlendInvert = 0xFF << 4;

	WriteSimpleConst8x16(constBlendHalf_11_4s_, kBlendHalf);
	WriteSimpleConst8x16(constBlendInvert_11_4s_, kBlendInvert);
}
bool PixelJitCache::Jit_ApplyDepthRange(const PixelFuncID &id) {
if (id.applyDepthRange && !id.earlyZChecks) {
Describe("ApplyDepthR");
X64Reg argZReg = regCache_.Find(RegCache::GEN_ARG_Z);
X64Reg idReg = GetPixelID();
// We expanded this to 32 bits, so it's convenient to compare.
CMP(32, R(argZReg), MDisp(idReg, offsetof(PixelFuncID, cached.minz)));
Discard(CC_L);
// We load the low 16 bits, but compare all 32 of z. Above handles < 0.
CMP(32, R(argZReg), MDisp(idReg, offsetof(PixelFuncID, cached.maxz)));
Discard(CC_G);
UnlockPixelID(idReg);
regCache_.Unlock(argZReg, RegCache::GEN_ARG_Z);
}
// Since this is early on, try to free up the z reg if we don't need it anymore.
if (id.clearMode && !id.DepthClear())
regCache_.ForceRelease(RegCache::GEN_ARG_Z);
else if (!id.clearMode && !id.depthWrite && (id.DepthTestFunc() == GE_COMP_ALWAYS || id.earlyZChecks))
regCache_.ForceRelease(RegCache::GEN_ARG_Z);
return true;
}
bool PixelJitCache::Jit_AlphaTest(const PixelFuncID &id) {
// Take care of ALWAYS/NEVER first. ALWAYS is common, means disabled.
Describe("AlphaTest");
switch (id.AlphaTestFunc()) {
case GE_COMP_NEVER:
Discard();
return true;
case GE_COMP_ALWAYS:
return true;
default:
break;
}
// Load alpha into its own general reg.
X64Reg alphaReg;
if (regCache_.Has(RegCache::GEN_SRC_ALPHA)) {
alphaReg = regCache_.Find(RegCache::GEN_SRC_ALPHA);
} else {
alphaReg = regCache_.Alloc(RegCache::GEN_SRC_ALPHA);
_assert_(!colorIs16Bit_);
X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
MOVD_xmm(R(alphaReg), argColorReg);
regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);
SHR(32, R(alphaReg), Imm8(24));
}
if (id.hasAlphaTestMask) {
// Unfortunate, we'll need pixelID to load the mask.
// Note: we leave the ALPHA purpose untouched and free it, because later code may reuse.
X64Reg idReg = GetPixelID();
X64Reg maskedReg = regCache_.Alloc(RegCache::GEN_TEMP0);
MOVZX(32, 8, maskedReg, MDisp(idReg, offsetof(PixelFuncID, cached.alphaTestMask)));
UnlockPixelID(idReg);
AND(32, R(maskedReg), R(alphaReg));
regCache_.Unlock(alphaReg, RegCache::GEN_SRC_ALPHA);
// Okay now do the rest using the masked reg, which we modified.
alphaReg = maskedReg;
}
// We hardcode the ref into this jit func.
CMP(8, R(alphaReg), Imm8(id.alphaTestRef));
if (id.hasAlphaTestMask)
regCache_.Release(alphaReg, RegCache::GEN_TEMP0);
else
regCache_.Unlock(alphaReg, RegCache::GEN_SRC_ALPHA);
switch (id.AlphaTestFunc()) {
case GE_COMP_NEVER:
case GE_COMP_ALWAYS:
break;
case GE_COMP_EQUAL:
Discard(CC_NE);
break;
case GE_COMP_NOTEQUAL:
Discard(CC_E);
break;
case GE_COMP_LESS:
Discard(CC_AE);
break;
case GE_COMP_LEQUAL:
Discard(CC_A);
break;
case GE_COMP_GREATER:
Discard(CC_BE);
break;
case GE_COMP_GEQUAL:
Discard(CC_B);
break;
}
return true;
}
// Emits the color test.  The mask, reference and comparison function are all read
// from the cached ID at run time: the pixel is discarded when the function is NEVER,
// kept when it is ALWAYS, and otherwise (color & mask) is compared with the reference
// for EQUAL, with every remaining function value handled as NOTEQUAL.
// Emits nothing when the color test is off or in clear mode.  Always succeeds.
bool PixelJitCache::Jit_ColorTest(const PixelFuncID &id) {
if (!id.colorTest || id.clearMode)
return true;
// We'll have 4 with fog released, so we're using them all...
Describe("ColorTest");
X64Reg idReg = GetPixelID();
X64Reg funcReg = regCache_.Alloc(RegCache::GEN_TEMP0);
X64Reg maskReg = regCache_.Alloc(RegCache::GEN_TEMP1);
X64Reg refReg = regCache_.Alloc(RegCache::GEN_TEMP2);
// First, load the registers: mask and ref.
MOV(32, R(maskReg), MDisp(idReg, offsetof(PixelFuncID, cached.colorTestMask)));
MOV(32, R(refReg), MDisp(idReg, offsetof(PixelFuncID, cached.colorTestRef)));
X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
if (colorIs16Bit_) {
// If it's expanded, we need to clamp anyway if it was fogged.
PACKUSWB(argColorReg, R(argColorReg));
colorIs16Bit_ = false;
}
// Temporarily abuse funcReg to grab the color into maskReg.
// After the AND, maskReg holds (color & mask).
MOVD_xmm(R(funcReg), argColorReg);
AND(32, R(maskReg), R(funcReg));
regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);
// Now that we're setup, get the func and follow it.
MOVZX(32, 8, funcReg, MDisp(idReg, offsetof(PixelFuncID, cached.colorTestFunc)));
UnlockPixelID(idReg);
CMP(8, R(funcReg), Imm8(GE_COMP_ALWAYS));
// Discard for GE_COMP_NEVER...
// (taken when func is below GE_COMP_ALWAYS.)
Discard(CC_B);
// ALWAYS: skip the comparison entirely.
FixupBranch skip = J_CC(CC_E);
CMP(8, R(funcReg), Imm8(GE_COMP_EQUAL));
FixupBranch doEqual = J_CC(CC_E);
regCache_.Release(funcReg, RegCache::GEN_TEMP0);
// The not equal path here... if they are equal, we discard.
CMP(32, R(refReg), R(maskReg));
Discard(CC_E);
FixupBranch skip2 = J();
// The equal path: discard when they differ.
SetJumpTarget(doEqual);
CMP(32, R(refReg), R(maskReg));
Discard(CC_NE);
regCache_.Release(maskReg, RegCache::GEN_TEMP1);
regCache_.Release(refReg, RegCache::GEN_TEMP2);
// Both pass paths rejoin here.
SetJumpTarget(skip);
SetJumpTarget(skip2);
return true;
}
// Emits fog blending of the source color with the cached fog color, leaving alpha
// untouched.  On return the color register holds 16-bit lanes (colorIs16Bit_ is true).
// When fog is disabled, only the fog argument register is released.  Always succeeds.
bool PixelJitCache::Jit_ApplyFog(const PixelFuncID &id) {
if (!id.applyFog) {
// Okay, anyone can use the fog register then.
regCache_.ForceRelease(RegCache::GEN_ARG_FOG);
return true;
}
// Load fog and expand to 16 bit. Ignore the high 8 bits, which'll match up with A.
Describe("ApplyFog");
X64Reg fogColorReg = regCache_.Alloc(RegCache::VEC_TEMP1);
X64Reg idReg = GetPixelID();
if (cpu_info.bSSE4_1) {
PMOVZXBW(fogColorReg, MDisp(idReg, offsetof(PixelFuncID, cached.fogColor)));
} else {
// Without SSE4.1, widen bytes to words by interleaving with zero.
X64Reg zeroReg = GetZeroVec();
MOVD_xmm(fogColorReg, MDisp(idReg, offsetof(PixelFuncID, cached.fogColor)));
PUNPCKLBW(fogColorReg, R(zeroReg));
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
}
UnlockPixelID(idReg);
// Load a set of 255s at 16 bit into a reg for later...
// (all ones, then each word shifted right by 8.)
X64Reg invertReg = regCache_.Alloc(RegCache::VEC_TEMP2);
PCMPEQW(invertReg, R(invertReg));
PSRLW(invertReg, 8);
// Expand (we clamped) color to 16 bit as well, so we can multiply with fog.
X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
if (!colorIs16Bit_) {
if (cpu_info.bSSE4_1) {
PMOVZXBW(argColorReg, R(argColorReg));
} else {
X64Reg zeroReg = GetZeroVec();
PUNPCKLBW(argColorReg, R(zeroReg));
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
}
colorIs16Bit_ = true;
}
// Save A so we can put it back, we don't "fog" A.
X64Reg alphaReg;
if (regCache_.Has(RegCache::GEN_SRC_ALPHA)) {
alphaReg = regCache_.Find(RegCache::GEN_SRC_ALPHA);
} else {
// Alpha is word 3 of the expanded color.
alphaReg = regCache_.Alloc(RegCache::GEN_SRC_ALPHA);
PEXTRW(alphaReg, argColorReg, 3);
}
// Okay, let's broadcast fog to an XMM.
// (the low word is replicated across the four low words.)
X64Reg fogMultReg = regCache_.Alloc(RegCache::VEC_TEMP3);
X64Reg argFogReg = regCache_.Find(RegCache::GEN_ARG_FOG);
MOVD_xmm(fogMultReg, R(argFogReg));
PSHUFLW(fogMultReg, R(fogMultReg), _MM_SHUFFLE(0, 0, 0, 0));
regCache_.Unlock(argFogReg, RegCache::GEN_ARG_FOG);
// We can free up the actual fog reg now.
regCache_.ForceRelease(RegCache::GEN_ARG_FOG);
// Our goal here is to calculate this formula:
// (argColor * fog + fogColor * (255 - fog) + 255) / 256
// Now we multiply the existing color by fog...
PMULLW(argColorReg, R(fogMultReg));
// Before inversing, let's add that 255 we loaded in as well, since we have it.
PADDW(argColorReg, R(invertReg));
// And then inverse the fog value using those 255s, and multiply by fog color.
PSUBW(invertReg, R(fogMultReg));
PMULLW(fogColorReg, R(invertReg));
// At this point, argColorReg and fogColorReg are multiplied at 16-bit, so we need to sum.
PADDW(argColorReg, R(fogColorReg));
regCache_.Release(fogColorReg, RegCache::VEC_TEMP1);
regCache_.Release(invertReg, RegCache::VEC_TEMP2);
regCache_.Release(fogMultReg, RegCache::VEC_TEMP3);
// Now we simply divide by 256, or in other words shift by 8.
PSRLW(argColorReg, 8);
// Okay, put A back in, we'll shrink it to 8888 when needed.
PINSRW(argColorReg, R(alphaReg), 3);
regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);
// We most likely won't use alphaReg again.
regCache_.Unlock(alphaReg, RegCache::GEN_SRC_ALPHA);
return true;
}
bool PixelJitCache::Jit_StencilAndDepthTest(const PixelFuncID &id) {
_assert_(!id.clearMode && id.stencilTest);
X64Reg stencilReg = GetDestStencil(id);
Describe("StencilAndDepth");
X64Reg maskedReg = stencilReg;
if (id.hasStencilTestMask && stencilReg != INVALID_REG) {
X64Reg idReg = GetPixelID();
maskedReg = regCache_.Alloc(RegCache::GEN_TEMP0);
MOV(32, R(maskedReg), R(stencilReg));
AND(8, R(maskedReg), MDisp(idReg, offsetof(PixelFuncID, cached.stencilTestMask)));
UnlockPixelID(idReg);
}
bool success = true;
success = success && Jit_StencilTest(id, stencilReg, maskedReg);
if (maskedReg != stencilReg)
regCache_.Release(maskedReg, RegCache::GEN_TEMP0);
// Next up, the depth test.
if (stencilReg == INVALID_REG) {
// Just use the standard one, since we don't need to write stencil.
// We also don't need to worry about cleanup either.
return success && Jit_DepthTest(id);
}
success = success && Jit_DepthTestForStencil(id, stencilReg);
success = success && Jit_ApplyStencilOp(id, id.ZPass(), stencilReg);
// At this point, stencilReg can't be spilled. It contains the updated value.
regCache_.Unlock(stencilReg, RegCache::GEN_STENCIL);
regCache_.ForceRetain(RegCache::GEN_STENCIL);
return success;
}
// Emits the stencil comparison and its fail path.
//   stencilReg: the destination stencil, or INVALID_REG when stencil is a fixed 0.
//   maskedReg:  the value to compare against the reference (may equal stencilReg).
// On failure the emitted code applies the SFail op, writes only the stencil and
// discards; on pass it continues after the fail path.  When the outcome is known at
// compile time, the comparison and branch are skipped.
// Returns false if a sub-stage fails to compile.
bool PixelJitCache::Jit_StencilTest(const PixelFuncID &id, RegCache::Reg stencilReg, RegCache::Reg maskedReg) {
Describe("StencilTest");
bool hasFixedResult = false;
bool fixedResult = false;
FixupBranch toPass;
if (stencilReg == INVALID_REG) {
// This means stencil is a fixed value 0.
// The test is "ref FUNC stencil", so evaluate it against 0 right here.
hasFixedResult = true;
switch (id.StencilTestFunc()) {
case GE_COMP_NEVER: fixedResult = false; break;
case GE_COMP_ALWAYS: fixedResult = true; break;
case GE_COMP_EQUAL: fixedResult = id.stencilTestRef == 0; break;
case GE_COMP_NOTEQUAL: fixedResult = id.stencilTestRef != 0; break;
case GE_COMP_LESS: fixedResult = false; break;
case GE_COMP_LEQUAL: fixedResult = id.stencilTestRef == 0; break;
case GE_COMP_GREATER: fixedResult = id.stencilTestRef != 0; break;
case GE_COMP_GEQUAL: fixedResult = true; break;
}
} else if (id.StencilTestFunc() == GE_COMP_ALWAYS) {
// Fairly common, skip the CMP.
hasFixedResult = true;
fixedResult = true;
} else {
// Reversed here because of the imm, so tests below are reversed.
// (the flags reflect "stencil vs ref", while the test is "ref FUNC stencil".)
CMP(8, R(maskedReg), Imm8(id.stencilTestRef));
switch (id.StencilTestFunc()) {
case GE_COMP_NEVER:
hasFixedResult = true;
fixedResult = false;
break;
case GE_COMP_ALWAYS:
// Handled by the branch above.
_assert_(false);
break;
case GE_COMP_EQUAL:
toPass = J_CC(CC_E);
break;
case GE_COMP_NOTEQUAL:
toPass = J_CC(CC_NE);
break;
case GE_COMP_LESS:
toPass = J_CC(CC_A);
break;
case GE_COMP_LEQUAL:
toPass = J_CC(CC_AE);
break;
case GE_COMP_GREATER:
toPass = J_CC(CC_B);
break;
case GE_COMP_GEQUAL:
toPass = J_CC(CC_BE);
break;
}
}
// Always fails and there is no stencil to update: just discard.
if (hasFixedResult && !fixedResult && stencilReg == INVALID_REG) {
Discard();
return true;
}
// Snapshot which registers exist before emitting the (conditionally executed) fail path.
bool hadColorOffReg = regCache_.Has(RegCache::GEN_COLOR_OFF);
bool hadIdReg = regCache_.Has(RegCache::GEN_ID);
bool success = true;
// Emit the fail path unless the test is known to always pass.
if (stencilReg != INVALID_REG && (!hasFixedResult || !fixedResult)) {
// This is the fail path.
success = success && Jit_ApplyStencilOp(id, id.SFail(), stencilReg);
success = success && Jit_WriteStencilOnly(id, stencilReg);
Discard();
}
// If we allocated either id or colorOff in the conditional, forget.
// The pass path jumps over the code above, so values first loaded there cannot be
// relied upon afterwards.
if (!hadColorOffReg && regCache_.Has(RegCache::GEN_COLOR_OFF))
regCache_.Change(RegCache::GEN_COLOR_OFF, RegCache::GEN_INVALID);
if (!hadIdReg && regCache_.Has(RegCache::GEN_ID))
regCache_.Change(RegCache::GEN_ID, RegCache::GEN_INVALID);
// toPass was only emitted when the result isn't fixed.
if (!hasFixedResult)
SetJumpTarget(toPass);
return success;
}
// Depth test variant used while a stencil register is live.  When the depth test
// fails, the emitted code applies the ZFail stencil op, writes only the stencil and
// discards; when it passes, execution continues after that fail path.
// Emits nothing when the depth func is ALWAYS or early Z checks are in use.
// Returns false if a sub-stage fails to compile.
bool PixelJitCache::Jit_DepthTestForStencil(const PixelFuncID &id, RegCache::Reg stencilReg) {
if (id.DepthTestFunc() == GE_COMP_ALWAYS || id.earlyZChecks)
return true;
X64Reg depthOffReg = GetDepthOff(id);
Describe("DepthTestStencil");
X64Reg argZReg = regCache_.Find(RegCache::GEN_ARG_Z);
// Compare the incoming z with the stored 16-bit depth.
CMP(16, R(argZReg), MatR(depthOffReg));
regCache_.Unlock(depthOffReg, RegCache::GEN_DEPTH_OFF);
regCache_.Unlock(argZReg, RegCache::GEN_ARG_Z);
// Unlike Jit_DepthTest(), branch on the passing condition so the fail path
// below is skipped on a pass.
FixupBranch skip;
switch (id.DepthTestFunc()) {
case GE_COMP_NEVER:
// Shouldn't happen, just do an extra CMP.
CMP(32, R(RAX), R(RAX));
// This is just to have a skip that is valid.
// (never taken, so the fail path always runs.)
skip = J_CC(CC_NE);
break;
case GE_COMP_ALWAYS:
// Shouldn't happen, just do an extra CMP.
CMP(32, R(RAX), R(RAX));
skip = J_CC(CC_E);
break;
case GE_COMP_EQUAL:
skip = J_CC(CC_E);
break;
case GE_COMP_NOTEQUAL:
skip = J_CC(CC_NE);
break;
case GE_COMP_LESS:
skip = J_CC(CC_B);
break;
case GE_COMP_LEQUAL:
skip = J_CC(CC_BE);
break;
case GE_COMP_GREATER:
skip = J_CC(CC_A);
break;
case GE_COMP_GEQUAL:
skip = J_CC(CC_AE);
break;
}
// Snapshot which registers exist before emitting the (conditionally executed) fail path.
bool hadColorOffReg = regCache_.Has(RegCache::GEN_COLOR_OFF);
bool hadIdReg = regCache_.Has(RegCache::GEN_ID);
bool success = true;
success = success && Jit_ApplyStencilOp(id, id.ZFail(), stencilReg);
success = success && Jit_WriteStencilOnly(id, stencilReg);
Discard();
// If we allocated either id or colorOff in the conditional, forget.
// The pass path jumps over the code above, so values first loaded there cannot be
// relied upon afterwards.
if (!hadColorOffReg && regCache_.Has(RegCache::GEN_COLOR_OFF))
regCache_.Change(RegCache::GEN_COLOR_OFF, RegCache::GEN_INVALID);
if (!hadIdReg && regCache_.Has(RegCache::GEN_ID))
regCache_.Change(RegCache::GEN_ID, RegCache::GEN_INVALID);
SetJumpTarget(skip);
// Like in Jit_DepthTest(), at this point we may not need this reg anymore.
if (!id.depthWrite)
regCache_.ForceRelease(RegCache::GEN_ARG_Z);
return success;
}
bool PixelJitCache::Jit_ApplyStencilOp(const PixelFuncID &id, GEStencilOp op, RegCache::Reg stencilReg) {
_assert_(stencilReg != INVALID_REG);
Describe("ApplyStencil");
FixupBranch skip;
switch (op) {
case GE_STENCILOP_KEEP:
// Nothing to do.
break;
case GE_STENCILOP_ZERO:
XOR(32, R(stencilReg), R(stencilReg));
break;
case GE_STENCILOP_REPLACE:
if (id.hasStencilTestMask) {
// Load the unmasked value.
X64Reg idReg = GetPixelID();
MOVZX(32, 8, stencilReg, MDisp(idReg, offsetof(PixelFuncID, cached.stencilRef)));
UnlockPixelID(idReg);
} else {
MOV(8, R(stencilReg), Imm8(id.stencilTestRef));
}
break;
case GE_STENCILOP_INVERT:
NOT(8, R(stencilReg));
break;
case GE_STENCILOP_INCR:
switch (id.fbFormat) {
case GE_FORMAT_565:
break;
case GE_FORMAT_5551:
MOV(8, R(stencilReg), Imm8(0xFF));
break;
case GE_FORMAT_4444:
CMP(8, R(stencilReg), Imm8(0xF0));
skip = J_CC(CC_AE);
ADD(8, R(stencilReg), Imm8(0x11));
SetJumpTarget(skip);
break;
case GE_FORMAT_8888:
CMP(8, R(stencilReg), Imm8(0xFF));
skip = J_CC(CC_E);
ADD(8, R(stencilReg), Imm8(0x01));
SetJumpTarget(skip);
break;
}
break;
case GE_STENCILOP_DECR:
switch (id.fbFormat) {
case GE_FORMAT_565:
break;
case GE_FORMAT_5551:
XOR(32, R(stencilReg), R(stencilReg));
break;
case GE_FORMAT_4444:
CMP(8, R(stencilReg), Imm8(0x11));
skip = J_CC(CC_B);
SUB(8, R(stencilReg), Imm8(0x11));
SetJumpTarget(skip);
break;
case GE_FORMAT_8888:
CMP(8, R(stencilReg), Imm8(0x00));
skip = J_CC(CC_E);
SUB(8, R(stencilReg), Imm8(0x01));
SetJumpTarget(skip);
break;
}
break;
}
return true;
}
// Emits code storing only the stencil bits of the destination pixel from stencilReg,
// leaving its color bits as they are.  With a color write mask, destination bits that
// are set in the mask are kept.  stencilReg's contents may be destroyed.
// Always succeeds.
bool PixelJitCache::Jit_WriteStencilOnly(const PixelFuncID &id, RegCache::Reg stencilReg) {
_assert_(stencilReg != INVALID_REG);
// It's okay to destroy stencilReg here, we know we're the last writing it.
X64Reg colorOffReg = GetColorOff(id);
Describe("WriteStencil");
if (id.applyColorWriteMask) {
// In each case below: dst = (dst & mask) | (stencil & ~mask), on the byte that
// contains the stencil bits.
X64Reg idReg = GetPixelID();
X64Reg maskReg = regCache_.Alloc(RegCache::GEN_TEMP5);
switch (id.fbFormat) {
case GE_FORMAT_565:
break;
case GE_FORMAT_5551:
// Read the high 8 bits of the 16-bit color mask.
MOVZX(32, 8, maskReg, MDisp(idReg, offsetof(PixelFuncID, cached.colorWriteMask) + 1));
// Always keep the low 7 bits of this byte (they aren't stencil).
OR(8, R(maskReg), Imm8(0x7F));
// Poor man's BIC...
// (stencil = stencil & ~mask, done as NOT / OR / NOT.)
NOT(32, R(stencilReg));
OR(32, R(stencilReg), R(maskReg));
NOT(32, R(stencilReg));
AND(8, MDisp(colorOffReg, 1), R(maskReg));
OR(8, MDisp(colorOffReg, 1), R(stencilReg));
break;
case GE_FORMAT_4444:
// Read the high 8 bits of the 16-bit color mask.
MOVZX(32, 8, maskReg, MDisp(idReg, offsetof(PixelFuncID, cached.colorWriteMask) + 1));
// Always keep the low nibble of this byte (it isn't stencil).
OR(8, R(maskReg), Imm8(0x0F));
// Poor man's BIC...
NOT(32, R(stencilReg));
OR(32, R(stencilReg), R(maskReg));
NOT(32, R(stencilReg));
AND(8, MDisp(colorOffReg, 1), R(maskReg));
OR(8, MDisp(colorOffReg, 1), R(stencilReg));
break;
case GE_FORMAT_8888:
// Read the high 8 bits of the 32-bit color mask.
MOVZX(32, 8, maskReg, MDisp(idReg, offsetof(PixelFuncID, cached.colorWriteMask) + 3));
// Poor man's BIC...
NOT(32, R(stencilReg));
OR(32, R(stencilReg), R(maskReg));
NOT(32, R(stencilReg));
AND(8, MDisp(colorOffReg, 3), R(maskReg));
OR(8, MDisp(colorOffReg, 3), R(stencilReg));
break;
}
regCache_.Release(maskReg, RegCache::GEN_TEMP5);
UnlockPixelID(idReg);
} else {
switch (id.fbFormat) {
case GE_FORMAT_565:
break;
case GE_FORMAT_5551:
// Replace only the top bit of the high byte.
AND(8, R(stencilReg), Imm8(0x80));
AND(8, MDisp(colorOffReg, 1), Imm8(0x7F));
OR(8, MDisp(colorOffReg, 1), R(stencilReg));
break;
case GE_FORMAT_4444:
// Replace only the top nibble of the high byte.
AND(8, MDisp(colorOffReg, 1), Imm8(0x0F));
AND(8, R(stencilReg), Imm8(0xF0));
OR(8, MDisp(colorOffReg, 1), R(stencilReg));
break;
case GE_FORMAT_8888:
// The whole top byte is stencil.
MOV(8, MDisp(colorOffReg, 3), R(stencilReg));
break;
}
}
regCache_.Unlock(colorOffReg, RegCache::GEN_COLOR_OFF);
return true;
}
bool PixelJitCache::Jit_DepthTest(const PixelFuncID &id) {
if (id.DepthTestFunc() == GE_COMP_ALWAYS || id.earlyZChecks)
return true;
if (id.DepthTestFunc() == GE_COMP_NEVER) {
Discard();
// This should be uncommon, just keep going to have shared cleanup...
}
X64Reg depthOffReg = GetDepthOff(id);
Describe("DepthTest");
X64Reg argZReg = regCache_.Find(RegCache::GEN_ARG_Z);
CMP(16, R(argZReg), MatR(depthOffReg));
regCache_.Unlock(depthOffReg, RegCache::GEN_DEPTH_OFF);
regCache_.Unlock(argZReg, RegCache::GEN_ARG_Z);
// We discard the opposite of the passing test.
switch (id.DepthTestFunc()) {
case GE_COMP_NEVER:
case GE_COMP_ALWAYS:
break;
case GE_COMP_EQUAL:
Discard(CC_NE);
break;
case GE_COMP_NOTEQUAL:
Discard(CC_E);
break;
case GE_COMP_LESS:
Discard(CC_AE);
break;
case GE_COMP_LEQUAL:
Discard(CC_A);
break;
case GE_COMP_GREATER:
Discard(CC_BE);
break;
case GE_COMP_GEQUAL:
Discard(CC_B);
break;
}
// If we're not writing, we don't need Z anymore. We'll free GEN_DEPTH_OFF in Jit_WriteDepth().
if (!id.depthWrite)
regCache_.ForceRelease(RegCache::GEN_ARG_Z);
return true;
}
bool PixelJitCache::Jit_WriteDepth(const PixelFuncID &id) {
// Clear mode shares depthWrite for DepthClear().
if (id.depthWrite) {
X64Reg depthOffReg = GetDepthOff(id);
Describe("WriteDepth");
X64Reg argZReg = regCache_.Find(RegCache::GEN_ARG_Z);
MOV(16, MatR(depthOffReg), R(argZReg));
regCache_.Unlock(depthOffReg, RegCache::GEN_DEPTH_OFF);
regCache_.Unlock(argZReg, RegCache::GEN_ARG_Z);
regCache_.ForceRelease(RegCache::GEN_ARG_Z);
}
// We can free up this reg if we force locked it.
if (regCache_.Has(RegCache::GEN_DEPTH_OFF)) {
regCache_.ForceRelease(RegCache::GEN_DEPTH_OFF);
}
return true;
}
bool PixelJitCache::Jit_AlphaBlend(const PixelFuncID &id) {
if (!id.alphaBlend)
return true;
// Check if we need to load and prep factors.
PixelBlendState blendState;
ComputePixelBlendState(blendState, id);
bool success = true;
// Step 1: Load and expand dest color.
X64Reg dstReg = regCache_.Alloc(RegCache::VEC_TEMP0);
if (!blendState.readsDstPixel) {
// Let's load colorOff just for registers to be consistent.
X64Reg colorOff = GetColorOff(id);
regCache_.Unlock(colorOff, RegCache::GEN_COLOR_OFF);
PXOR(dstReg, R(dstReg));
} else if (id.FBFormat() == GE_FORMAT_8888) {
X64Reg colorOff = GetColorOff(id);
Describe("AlphaBlend");
MOVD_xmm(dstReg, MatR(colorOff));
regCache_.Unlock(colorOff, RegCache::GEN_COLOR_OFF);
} else {
X64Reg colorOff = GetColorOff(id);
Describe("AlphaBlend");
X64Reg dstGenReg = regCache_.Alloc(RegCache::GEN_TEMP0);
MOVZX(32, 16, dstGenReg, MatR(colorOff));
regCache_.Unlock(colorOff, RegCache::GEN_COLOR_OFF);
X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
switch (id.fbFormat) {
case GE_FORMAT_565:
success = success && Jit_ConvertFrom565(id, dstGenReg, temp1Reg, temp2Reg);
break;
case GE_FORMAT_5551:
success = success && Jit_ConvertFrom5551(id, dstGenReg, temp1Reg, temp2Reg, blendState.usesDstAlpha);
break;
case GE_FORMAT_4444:
success = success && Jit_ConvertFrom4444(id, dstGenReg, temp1Reg, temp2Reg, blendState.usesDstAlpha);
break;
case GE_FORMAT_8888:
break;
}
Describe("AlphaBlend");
MOVD_xmm(dstReg, R(dstGenReg));
regCache_.Release(dstGenReg, RegCache::GEN_TEMP0);
regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
}
// Step 2: Load and apply factors.
X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
if (blendState.usesFactors) {
X64Reg srcFactorReg = regCache_.Alloc(RegCache::VEC_TEMP1);
X64Reg dstFactorReg = regCache_.Alloc(RegCache::VEC_TEMP2);
// We apply these at 16-bit, because they can be doubled and have a half offset.
if (cpu_info.bSSE4_1) {
if (!colorIs16Bit_)
PMOVZXBW(argColorReg, R(argColorReg));
PMOVZXBW(dstReg, R(dstReg));
} else {
X64Reg zeroReg = GetZeroVec();
if (!colorIs16Bit_)
PUNPCKLBW(argColorReg, R(zeroReg));
PUNPCKLBW(dstReg, R(zeroReg));
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
}
colorIs16Bit_ = true;
// Skip multiplying by factors if we can.
bool multiplySrc = id.AlphaBlendSrc() != PixelBlendFactor::ZERO && id.AlphaBlendSrc() != PixelBlendFactor::ONE;
bool multiplyDst = id.AlphaBlendDst() != PixelBlendFactor::ZERO && id.AlphaBlendDst() != PixelBlendFactor::ONE;
// We also shift left by 4, so mulhi gives us a free shift
// We also need to add a half bit later, so this gives us space.
if (multiplySrc || blendState.srcColorAsFactor)
PSLLW(argColorReg, 4);
if (multiplyDst || blendState.dstColorAsFactor || blendState.usesDstAlpha)
PSLLW(dstReg, 4);
// Okay, now grab our factors. Don't bother if they're known values.
if (id.AlphaBlendSrc() < PixelBlendFactor::ZERO)
success = success && Jit_BlendFactor(id, srcFactorReg, dstReg, id.AlphaBlendSrc());
if (id.AlphaBlendDst() < PixelBlendFactor::ZERO)
success = success && Jit_DstBlendFactor(id, srcFactorReg, dstFactorReg, dstReg);
X64Reg halfReg = INVALID_REG;
if (multiplySrc || multiplyDst) {
halfReg = regCache_.Alloc(RegCache::VEC_TEMP3);
// We'll use this several times, so load into a reg.
MOVDQA(halfReg, M(constBlendHalf_11_4s_));
}
// Add in the half bit to the factors and color values, then multiply.
// We take the high 16 bits to get a free right shift by 16.
if (multiplySrc) {
POR(srcFactorReg, R(halfReg));
POR(argColorReg, R(halfReg));
PMULHUW(argColorReg, R(srcFactorReg));
} else if (id.AlphaBlendSrc() == PixelBlendFactor::ZERO) {
PXOR(argColorReg, R(argColorReg));
} else if (id.AlphaBlendSrc() == PixelBlendFactor::ONE) {
if (blendState.srcColorAsFactor)
PSRLW(argColorReg, 4);
}
if (multiplyDst) {
POR(dstFactorReg, R(halfReg));
POR(dstReg, R(halfReg));
PMULHUW(dstReg, R(dstFactorReg));
} else if (id.AlphaBlendDst() == PixelBlendFactor::ZERO) {
// No need to add or subtract zero, unless we're negating.
// This is common for bloom preparation.
if (id.AlphaBlendEq() == GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE)
PXOR(dstReg, R(dstReg));
} else if (id.AlphaBlendDst() == PixelBlendFactor::ONE) {
if (blendState.dstColorAsFactor || blendState.usesDstAlpha)
PSRLW(dstReg, 4);
}
regCache_.Release(srcFactorReg, RegCache::VEC_TEMP1);
regCache_.Release(dstFactorReg, RegCache::VEC_TEMP2);
if (halfReg != INVALID_REG)
regCache_.Release(halfReg, RegCache::VEC_TEMP3);
} else if (colorIs16Bit_) {
// If it's expanded, shrink and clamp for our min/max/absdiff handling.
PACKUSWB(argColorReg, R(argColorReg));
colorIs16Bit_ = false;
}
// Step 3: Apply equation.
// Note: below, we completely ignore what happens to the alpha bits.
// It won't matter, since we'll replace those with stencil anyway.
X64Reg tempReg = regCache_.Alloc(RegCache::VEC_TEMP1);
switch (id.AlphaBlendEq()) {
case GE_BLENDMODE_MUL_AND_ADD:
if (id.AlphaBlendDst() != PixelBlendFactor::ZERO)
PADDUSW(argColorReg, R(dstReg));
break;
case GE_BLENDMODE_MUL_AND_SUBTRACT:
if (id.AlphaBlendDst() != PixelBlendFactor::ZERO)
PSUBUSW(argColorReg, R(dstReg));
break;
case GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE:
if (cpu_info.bAVX) {
VPSUBUSW(128, argColorReg, dstReg, R(argColorReg));
} else {
MOVDQA(tempReg, R(argColorReg));
MOVDQA(argColorReg, R(dstReg));
PSUBUSW(argColorReg, R(tempReg));
}
break;
case GE_BLENDMODE_MIN:
PMINUB(argColorReg, R(dstReg));
break;
case GE_BLENDMODE_MAX:
PMAXUB(argColorReg, R(dstReg));
break;
case GE_BLENDMODE_ABSDIFF:
// Calculate A=(dst-src < 0 ? 0 : dst-src) and B=(src-dst < 0 ? 0 : src-dst)...
MOVDQA(tempReg, R(dstReg));
PSUBUSB(tempReg, R(argColorReg));
PSUBUSB(argColorReg, R(dstReg));
// Now, one of those must be zero, and the other one is the result (could also be zero.)
POR(argColorReg, R(tempReg));
break;
}
regCache_.Release(dstReg, RegCache::VEC_TEMP0);
regCache_.Release(tempReg, RegCache::VEC_TEMP1);
regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);
return success;
}
bool PixelJitCache::Jit_BlendFactor(const PixelFuncID &id, RegCache::Reg factorReg, RegCache::Reg dstReg, PixelBlendFactor factor) {
X64Reg idReg = INVALID_REG;
X64Reg tempReg = INVALID_REG;
X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
// Everything below expects an expanded 16-bit color
_assert_(colorIs16Bit_);
// Between source and dest factors, only DSTCOLOR, INVDSTCOLOR, and FIXA differ.
// In those cases, it uses SRCCOLOR, INVSRCCOLOR, and FIXB respectively.
// Load the invert constant first off, if needed.
switch (factor) {
case PixelBlendFactor::INVOTHERCOLOR:
case PixelBlendFactor::INVSRCALPHA:
case PixelBlendFactor::INVDSTALPHA:
case PixelBlendFactor::DOUBLEINVSRCALPHA:
case PixelBlendFactor::DOUBLEINVDSTALPHA:
MOVDQA(factorReg, M(constBlendInvert_11_4s_));
break;
default:
break;
}
switch (factor) {
case PixelBlendFactor::OTHERCOLOR:
MOVDQA(factorReg, R(dstReg));
break;
case PixelBlendFactor::INVOTHERCOLOR:
PSUBUSW(factorReg, R(dstReg));
break;
case PixelBlendFactor::SRCALPHA:
PSHUFLW(factorReg, R(argColorReg), _MM_SHUFFLE(3, 3, 3, 3));
break;
case PixelBlendFactor::INVSRCALPHA:
tempReg = regCache_.Alloc(RegCache::VEC_TEMP3);
PSHUFLW(tempReg, R(argColorReg), _MM_SHUFFLE(3, 3, 3, 3));
PSUBUSW(factorReg, R(tempReg));
break;
case PixelBlendFactor::DSTALPHA:
PSHUFLW(factorReg, R(dstReg), _MM_SHUFFLE(3, 3, 3, 3));
break;
case PixelBlendFactor::INVDSTALPHA:
tempReg = regCache_.Alloc(RegCache::VEC_TEMP3);
PSHUFLW(tempReg, R(dstReg), _MM_SHUFFLE(3, 3, 3, 3));
PSUBUSW(factorReg, R(tempReg));
break;
case PixelBlendFactor::DOUBLESRCALPHA:
PSHUFLW(factorReg, R(argColorReg), _MM_SHUFFLE(3, 3, 3, 3));
PSLLW(factorReg, 1);
break;
case PixelBlendFactor::DOUBLEINVSRCALPHA:
tempReg = regCache_.Alloc(RegCache::VEC_TEMP3);
PSHUFLW(tempReg, R(argColorReg), _MM_SHUFFLE(3, 3, 3, 3));
PSLLW(tempReg, 1);
PSUBUSW(factorReg, R(tempReg));
break;
case PixelBlendFactor::DOUBLEDSTALPHA:
PSHUFLW(factorReg, R(dstReg), _MM_SHUFFLE(3, 3, 3, 3));
PSLLW(factorReg, 1);
break;
case PixelBlendFactor::DOUBLEINVDSTALPHA:
tempReg = regCache_.Alloc(RegCache::VEC_TEMP3);
PSHUFLW(tempReg, R(dstReg), _MM_SHUFFLE(3, 3, 3, 3));
PSLLW(tempReg, 1);
PSUBUSW(factorReg, R(tempReg));
break;
case PixelBlendFactor::ZERO:
// Special value meaning zero.
PXOR(factorReg, R(factorReg));
break;
case PixelBlendFactor::ONE:
// Special value meaning all 255s.
PCMPEQD(factorReg, R(factorReg));
PSLLW(factorReg, 8);
PSRLW(factorReg, 4);
break;
case PixelBlendFactor::FIX:
default:
idReg = GetPixelID();
if (cpu_info.bSSE4_1) {
PMOVZXBW(factorReg, MDisp(idReg, offsetof(PixelFuncID, cached.alphaBlendSrc)));
} else {
X64Reg zeroReg = GetZeroVec();
MOVD_xmm(factorReg, MDisp(idReg, offsetof(PixelFuncID, cached.alphaBlendSrc)));
PUNPCKLBW(factorReg, R(zeroReg));
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
}
// Round it out by shifting into place.
PSLLW(factorReg, 4);
break;
}
if (idReg != INVALID_REG)
UnlockPixelID(idReg);
if (tempReg != INVALID_REG)
regCache_.Release(tempReg, RegCache::VEC_TEMP3);
regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);
return true;
}
bool PixelJitCache::Jit_DstBlendFactor(const PixelFuncID &id, RegCache::Reg srcFactorReg, RegCache::Reg dstFactorReg, RegCache::Reg dstReg) {
bool success = true;
X64Reg idReg = INVALID_REG;
X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
// Everything below expects an expanded 16-bit color
_assert_(colorIs16Bit_);
PixelBlendState blendState;
ComputePixelBlendState(blendState, id);
// We might be able to reuse srcFactorReg for dst, in some cases.
switch (id.AlphaBlendDst()) {
case PixelBlendFactor::OTHERCOLOR:
MOVDQA(dstFactorReg, R(argColorReg));
break;
case PixelBlendFactor::INVOTHERCOLOR:
MOVDQA(dstFactorReg, M(constBlendInvert_11_4s_));
PSUBUSW(dstFactorReg, R(argColorReg));
break;
case PixelBlendFactor::SRCALPHA:
case PixelBlendFactor::INVSRCALPHA:
case PixelBlendFactor::DSTALPHA:
case PixelBlendFactor::INVDSTALPHA:
case PixelBlendFactor::DOUBLESRCALPHA:
case PixelBlendFactor::DOUBLEINVSRCALPHA:
case PixelBlendFactor::DOUBLEDSTALPHA:
case PixelBlendFactor::DOUBLEINVDSTALPHA:
case PixelBlendFactor::ZERO:
case PixelBlendFactor::ONE:
// These are all equivalent for src factor, so reuse that logic.
if (id.AlphaBlendSrc() == id.AlphaBlendDst()) {
MOVDQA(dstFactorReg, R(srcFactorReg));
} else if (blendState.dstFactorIsInverse) {
MOVDQA(dstFactorReg, M(constBlendInvert_11_4s_));
PSUBUSW(dstFactorReg, R(srcFactorReg));
} else {
success = success && Jit_BlendFactor(id, dstFactorReg, dstReg, id.AlphaBlendDst());
}
break;
case PixelBlendFactor::FIX:
default:
idReg = GetPixelID();
if (cpu_info.bSSE4_1) {
PMOVZXBW(dstFactorReg, MDisp(idReg, offsetof(PixelFuncID, cached.alphaBlendDst)));
} else {
X64Reg zeroReg = GetZeroVec();
MOVD_xmm(dstFactorReg, MDisp(idReg, offsetof(PixelFuncID, cached.alphaBlendDst)));
PUNPCKLBW(dstFactorReg, R(zeroReg));
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
}
// Round it out by shifting into place.
PSLLW(dstFactorReg, 4);
break;
}
if (idReg != INVALID_REG)
UnlockPixelID(idReg);
regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);
return success;
}
bool PixelJitCache::Jit_Dither(const PixelFuncID &id) {
if (!id.dithering)
return true;
Describe("Dither");
X64Reg valueReg = regCache_.Alloc(RegCache::GEN_TEMP0);
// Load the row dither matrix entry (will still need to get the X.)
X64Reg argYReg = regCache_.Find(RegCache::GEN_ARG_Y);
MOV(32, R(valueReg), R(argYReg));
AND(32, R(valueReg), Imm8(3));
// At this point, we're done with depth and y, so let's grab GEN_COLOR_OFF and retain it.
// Then we can modify x and throw it away too, which is our actual goal.
X64Reg colorOffReg = GetColorOff(id);
Describe("Dither");
regCache_.Unlock(colorOffReg, RegCache::GEN_COLOR_OFF);
regCache_.ForceRetain(RegCache::GEN_COLOR_OFF);
// And get rid of y, we can use for other regs.
regCache_.Unlock(argYReg, RegCache::GEN_ARG_Y);
regCache_.ForceRelease(RegCache::GEN_ARG_Y);
X64Reg argXReg = regCache_.Find(RegCache::GEN_ARG_X);
AND(32, R(argXReg), Imm32(3));
// Sum up (x + y * 4) + ditherMatrix offset to valueReg.
LEA(32, valueReg, MComplex(argXReg, valueReg, 4, offsetof(PixelFuncID, cached.ditherMatrix)));
// Okay, now abuse argXReg to read the PixelFuncID pointer on the stack.
if (regCache_.Has(RegCache::GEN_ARG_ID) || regCache_.Has(RegCache::GEN_ID)) {
X64Reg idReg = GetPixelID();
MOVSX(32, 8, valueReg, MRegSum(idReg, valueReg));
UnlockPixelID(idReg);
} else {
_assert_(stackIDOffset_ != -1);
MOV(PTRBITS, R(argXReg), MDisp(RSP, stackIDOffset_));
MOVSX(32, 8, valueReg, MRegSum(argXReg, valueReg));
}
regCache_.Unlock(argXReg, RegCache::GEN_ARG_X);
regCache_.ForceRelease(RegCache::GEN_ARG_X);
// Copy that value into a vec to add to the color.
X64Reg vecValueReg = regCache_.Alloc(RegCache::VEC_TEMP0);
MOVD_xmm(vecValueReg, R(valueReg));
regCache_.Release(valueReg, RegCache::GEN_TEMP0);
// Now we want to broadcast RGB in 16-bit, but keep A as 0.
// Luckily, we know that third lane (in 16-bit) is zero from MOVD clearing it.
// We use 16-bit because we need a signed add, but we also want to saturate.
PSHUFLW(vecValueReg, R(vecValueReg), _MM_SHUFFLE(2, 0, 0, 0));
// With that, now let's convert the color to 16 bit...
X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
if (!colorIs16Bit_) {
if (cpu_info.bSSE4_1) {
PMOVZXBW(argColorReg, R(argColorReg));
} else {
X64Reg zeroReg = GetZeroVec();
PUNPCKLBW(argColorReg, R(zeroReg));
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
}
colorIs16Bit_ = true;
}
// And simply add the dither values.
PADDSW(argColorReg, R(vecValueReg));
regCache_.Release(vecValueReg, RegCache::VEC_TEMP0);
regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);
return true;
}
// Emits the final color store for a pixel.
//
// Packs the argument color back to 8 bits per channel if it was expanded, converts it to
// id.fbFormat, and writes it to the address held by GEN_COLOR_OFF. Along the way it combines
// the stencil value (when GEN_STENCIL is in the register cache), the color write mask (when
// id.applyColorWriteMask is set), and the logic op (when id.applyLogicOp is set.)
//
// In clear mode, it may instead write nothing, or defer to Jit_WriteStencilOnly.
//
// VEC_ARG_COLOR and GEN_COLOR_OFF are force-released on every path; GEN_STENCIL is
// force-released on the normal (non-early-return) path if it was present.
// Returns false if any helper it called reported failure.
bool PixelJitCache::Jit_WriteColor(const PixelFuncID &id) {
X64Reg colorOff = GetColorOff(id);
Describe("WriteColor");
if (regCache_.Has(RegCache::GEN_ARG_X)) {
// We normally toss x and y during dithering or useStandardStride with no dithering.
// Free up the regs now to get more reg space.
regCache_.ForceRelease(RegCache::GEN_ARG_X);
regCache_.ForceRelease(RegCache::GEN_ARG_Y);
// But make sure we don't lose GEN_COLOR_OFF, we'll be lost without that now.
regCache_.ForceRetain(RegCache::GEN_COLOR_OFF);
}
// Convert back to 8888 and clamp.
X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
if (colorIs16Bit_) {
PACKUSWB(argColorReg, R(argColorReg));
colorIs16Bit_ = false;
}
if (id.clearMode) {
// Clear mode without color clear: either nothing gets written at all (no stencil clear,
// or a 565 buffer), or only the stencil gets written.
bool drawingDone = false;
if (!id.ColorClear() && !id.StencilClear())
drawingDone = true;
if (!id.ColorClear() && id.FBFormat() == GE_FORMAT_565)
drawingDone = true;
bool success = true;
if (!id.ColorClear() && !drawingDone) {
// Let's reuse Jit_WriteStencilOnly for this path.
X64Reg alphaReg;
if (regCache_.Has(RegCache::GEN_SRC_ALPHA)) {
alphaReg = regCache_.Find(RegCache::GEN_SRC_ALPHA);
} else {
// Not cached, so take the top byte of the packed 8888 color.
alphaReg = regCache_.Alloc(RegCache::GEN_SRC_ALPHA);
MOVD_xmm(R(alphaReg), argColorReg);
SHR(32, R(alphaReg), Imm8(24));
}
success = Jit_WriteStencilOnly(id, alphaReg);
regCache_.Release(alphaReg, RegCache::GEN_SRC_ALPHA);
drawingDone = true;
}
if (drawingDone) {
// Nothing more to write, so just clean up the regs we hold.
regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);
regCache_.ForceRelease(RegCache::VEC_ARG_COLOR);
regCache_.Unlock(colorOff, RegCache::GEN_COLOR_OFF);
regCache_.ForceRelease(RegCache::GEN_COLOR_OFF);
return success;
}
// In this case, we're clearing only color or only color and stencil. Proceed.
}
// From here on we work with the color in a general purpose reg, the vector reg is done.
X64Reg colorReg = regCache_.Alloc(RegCache::GEN_TEMP0);
MOVD_xmm(R(colorReg), argColorReg);
regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);
regCache_.ForceRelease(RegCache::VEC_ARG_COLOR);
X64Reg stencilReg = INVALID_REG;
if (regCache_.Has(RegCache::GEN_STENCIL))
stencilReg = regCache_.Find(RegCache::GEN_STENCIL);
X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
// convertAlpha: the color's own alpha is converted and written as the stencil (clear mode.)
// writeAlpha: the alpha/stencil bits in memory get replaced, from either source.
// fixedKeepMask: bits in memory to preserve when neither a stencil nor a write mask applies.
bool convertAlpha = id.clearMode && id.StencilClear();
bool writeAlpha = convertAlpha || stencilReg != INVALID_REG;
uint32_t fixedKeepMask = 0x00000000;
bool success = true;
// Step 1: Load the color into colorReg.
switch (id.fbFormat) {
case GE_FORMAT_565:
// In this case, stencil doesn't matter.
success = success && Jit_ConvertTo565(id, colorReg, temp1Reg, temp2Reg);
break;
case GE_FORMAT_5551:
success = success && Jit_ConvertTo5551(id, colorReg, temp1Reg, temp2Reg, convertAlpha);
if (stencilReg != INVALID_REG) {
// Drop the low 7 bits of the stencil and move what's left up to bit 15.
// Presumably stencilReg holds an 8-bit value, so that leaves just its top bit.
SHR(32, R(stencilReg), Imm8(7));
SHL(32, R(stencilReg), Imm8(15));
} else if (!writeAlpha) {
fixedKeepMask = 0x8000;
}
break;
case GE_FORMAT_4444:
success = success && Jit_ConvertTo4444(id, colorReg, temp1Reg, temp2Reg, convertAlpha);
if (stencilReg != INVALID_REG) {
// Drop the low 4 bits of the stencil and move what's left up to bit 12.
// Presumably stencilReg holds an 8-bit value, so that leaves its top 4 bits.
SHR(32, R(stencilReg), Imm8(4));
SHL(32, R(stencilReg), Imm8(12));
} else if (!writeAlpha) {
fixedKeepMask = 0xF000;
}
break;
case GE_FORMAT_8888:
if (stencilReg != INVALID_REG) {
SHL(32, R(stencilReg), Imm8(24));
// Clear out the alpha bits so we can fit the stencil.
AND(32, R(colorReg), Imm32(0x00FFFFFF));
} else if (!writeAlpha) {
fixedKeepMask = 0xFF000000;
}
break;
}
// Step 2: Load write mask if needed.
// Note that we apply the write mask at the destination bit depth.
Describe("WriteColor");
X64Reg maskReg = INVALID_REG;
if (id.applyColorWriteMask) {
maskReg = regCache_.Alloc(RegCache::GEN_TEMP3);
// Load the pre-converted and combined write mask.
if (regCache_.Has(RegCache::GEN_ARG_ID) || regCache_.Has(RegCache::GEN_ID)) {
X64Reg idReg = GetPixelID();
MOV(32, R(maskReg), MDisp(idReg, offsetof(PixelFuncID, cached.colorWriteMask)));
UnlockPixelID(idReg);
} else {
// The ID pointer is only on the stack; use maskReg itself to hold the pointer briefly.
_assert_(stackIDOffset_ != -1);
MOV(PTRBITS, R(maskReg), MDisp(RSP, stackIDOffset_));
MOV(32, R(maskReg), MDisp(maskReg, offsetof(PixelFuncID, cached.colorWriteMask)));
}
}
// We've run out of regs, let's live without temp2 from here on.
regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
// Step 3: Apply logic op, combine stencil.
// Jit_ApplyLogicOp adds a branch to skipStandardWrites_ for each path that already wrote
// to memory itself; those get pointed past the standard write below.
skipStandardWrites_.clear();
if (id.applyLogicOp) {
// Note: we combine stencil during logic op, because it's a bit complex to retain.
success = success && Jit_ApplyLogicOp(id, colorReg, maskReg);
} else if (stencilReg != INVALID_REG) {
OR(32, R(colorReg), R(stencilReg));
}
// Step 4: Write and apply write mask.
Describe("WriteColor");
switch (id.fbFormat) {
case GE_FORMAT_565:
case GE_FORMAT_5551:
case GE_FORMAT_4444:
if (maskReg != INVALID_REG) {
// Zero all other bits, then flip maskReg to clear the bits we're keeping in colorReg.
// In other words, set mask bits are the ones preserved in memory.
AND(16, MatR(colorOff), R(maskReg));
if (cpu_info.bBMI1) {
// ANDN computes ~maskReg & colorReg without modifying maskReg.
ANDN(32, colorReg, maskReg, R(colorReg));
} else {
NOT(32, R(maskReg));
AND(32, R(colorReg), R(maskReg));
}
OR(16, MatR(colorOff), R(colorReg));
} else if (fixedKeepMask == 0) {
// Nothing in memory to preserve, a plain store does it.
MOV(16, MatR(colorOff), R(colorReg));
} else {
// Clear the non-stencil bits and or in the color.
AND(16, MatR(colorOff), Imm16((uint16_t)fixedKeepMask));
OR(16, MatR(colorOff), R(colorReg));
}
break;
case GE_FORMAT_8888:
if (maskReg != INVALID_REG) {
// Zero all other bits, then flip maskReg to clear the bits we're keeping in colorReg.
AND(32, MatR(colorOff), R(maskReg));
if (cpu_info.bBMI1) {
// ANDN computes ~maskReg & colorReg without modifying maskReg.
ANDN(32, colorReg, maskReg, R(colorReg));
} else {
NOT(32, R(maskReg));
AND(32, R(colorReg), R(maskReg));
}
OR(32, MatR(colorOff), R(colorReg));
} else if (fixedKeepMask == 0) {
// Nothing in memory to preserve, a plain store does it.
MOV(32, MatR(colorOff), R(colorReg));
} else if (fixedKeepMask == 0xFF000000) {
// We want to set 24 bits only, since we're not changing stencil.
// For now, let's do two writes rather than reading in the old stencil.
// That's the low 16 bits first, then the third byte at offset 2.
MOV(16, MatR(colorOff), R(colorReg));
SHR(32, R(colorReg), Imm8(16));
MOV(8, MDisp(colorOff, 2), R(colorReg));
} else {
AND(32, MatR(colorOff), Imm32(fixedKeepMask));
OR(32, MatR(colorOff), R(colorReg));
}
break;
}
// Logic op paths that already wrote to memory land here, after the standard write.
for (FixupBranch &fixup : skipStandardWrites_)
SetJumpTarget(fixup);
skipStandardWrites_.clear();
regCache_.Unlock(colorOff, RegCache::GEN_COLOR_OFF);
regCache_.ForceRelease(RegCache::GEN_COLOR_OFF);
regCache_.Release(colorReg, RegCache::GEN_TEMP0);
regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
if (maskReg != INVALID_REG)
regCache_.Release(maskReg, RegCache::GEN_TEMP3);
if (stencilReg != INVALID_REG) {
regCache_.Unlock(stencilReg, RegCache::GEN_STENCIL);
regCache_.ForceRelease(RegCache::GEN_STENCIL);
}
return success;
}
// Emits the framebuffer logic op, which is selected when the generated code runs.
//
// The op is read from PixelFuncID::cached.logicOp into a register, and used to index a table
// of 16 code pointers written directly into the code stream.  One code path is emitted per
// GE_LOGIC_* value.  Each path ends one of two ways:
//  - A jump collected in `finishes`: colorReg holds the final value (stencil included), and
//    execution continues after this function's code so the caller's standard write stores it.
//  - A jump collected in skipStandardWrites_: the path already updated memory at GEN_COLOR_OFF
//    itself, and the caller is expected to point these past its standard write.
//
// colorReg: the source color, already converted to id.fbFormat.  Modified.
// maskReg:  the color write mask (set bits are kept from memory), or INVALID_REG if there's none.
//           May be inverted or otherwise modified by paths which write memory themselves.
//
// GEN_COLOR_OFF must already be allocated.  If GEN_STENCIL is in the cache, its bits are
// combined in on every path.  Always returns true.
//
// Note: on entry to every path, temp1Reg holds the table address (from the dispatch LEA), not
// anything useful - paths that need the destination must load it from memory themselves.
bool PixelJitCache::Jit_ApplyLogicOp(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg maskReg) {
	Describe("LogicOp");
	X64Reg logicOpReg = regCache_.Alloc(RegCache::GEN_TEMP4);
	if (regCache_.Has(RegCache::GEN_ARG_ID) || regCache_.Has(RegCache::GEN_ID)) {
		X64Reg idReg = GetPixelID();
		MOVZX(32, 8, logicOpReg, MDisp(idReg, offsetof(PixelFuncID, cached.logicOp)));
		UnlockPixelID(idReg);
	} else {
		// The ID pointer is only on the stack, so borrow logicOpReg to hold the pointer.
		_assert_(stackIDOffset_ != -1);
		MOV(PTRBITS, R(logicOpReg), MDisp(RSP, stackIDOffset_));
		MOVZX(32, 8, logicOpReg, MDisp(logicOpReg, offsetof(PixelFuncID, cached.logicOp)));
	}

	X64Reg stencilReg = INVALID_REG;
	if (regCache_.Has(RegCache::GEN_STENCIL))
		stencilReg = regCache_.Find(RegCache::GEN_STENCIL);

	// Should already be allocated.
	X64Reg colorOff = regCache_.Find(RegCache::GEN_COLOR_OFF);
	X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP5);

	// We'll use these in several cases, so prepare.
	int bits = id.fbFormat == GE_FORMAT_8888 ? 32 : 16;
	OpArg stencilMask, notStencilMask;
	switch (id.fbFormat) {
	case GE_FORMAT_565:
		stencilMask = Imm16(0);
		notStencilMask = Imm16(0xFFFF);
		break;
	case GE_FORMAT_5551:
		stencilMask = Imm16(0x8000);
		notStencilMask = Imm16(0x7FFF);
		break;
	case GE_FORMAT_4444:
		stencilMask = Imm16(0xF000);
		notStencilMask = Imm16(0x0FFF);
		break;
	case GE_FORMAT_8888:
		stencilMask = Imm32(0xFF000000);
		notStencilMask = Imm32(0x00FFFFFF);
		break;
	}

	// Shared tail for ops which computed their result into colorReg from memory or by inverting.
	// Such results can have junk in the stencil bits: replace it with our stencil, or zero it.
	auto fixupStencilBits = [&]() {
		if (stencilReg != INVALID_REG) {
			// Purge out whatever the op left in the stencil bits and copy ours in.
			AND(bits, R(colorReg), notStencilMask);
			OR(32, R(colorReg), R(stencilReg));
		} else if (maskReg == INVALID_REG && stencilMask.GetImmValue() != 0) {
			// We need to clear the stencil bits since the standard write logic assumes they're zero.
			// (With a maskReg, the standard write clears them for us.)
			AND(bits, R(colorReg), notStencilMask);
		}
	};

	std::vector<FixupBranch> finishes;
	finishes.reserve(11);
	// Hop over all the per-op code (and the table) to the dispatch at the end.
	FixupBranch skipTable = J(true);
	const u8 *tableValues[16]{};

	tableValues[GE_LOGIC_CLEAR] = GetCodePointer();
	if (stencilReg != INVALID_REG) {
		// If clearing and setting the stencil, that's easy - stencilReg has it.
		MOV(32, R(colorReg), R(stencilReg));
		finishes.push_back(J(true));
	} else if (maskReg != INVALID_REG) {
		// Just and out the unmasked bits (stencil already included in maskReg.)
		AND(bits, MatR(colorOff), R(maskReg));
		skipStandardWrites_.push_back(J(true));
	} else {
		// Otherwise, no mask, just AND the stencil bits to zero the rest.
		AND(bits, MatR(colorOff), stencilMask);
		skipStandardWrites_.push_back(J(true));
	}

	tableValues[GE_LOGIC_AND] = GetCodePointer();
	if (stencilReg != INVALID_REG && maskReg != INVALID_REG) {
		// Since we're ANDing, set the mask bits (AND will keep them as-is.)
		OR(32, R(colorReg), R(maskReg));
		OR(32, R(colorReg), R(stencilReg));

		// To apply stencil, we'll OR the stencil unmasked bits in memory, so our AND keeps them.
		NOT(32, R(maskReg));
		AND(bits, R(maskReg), stencilMask);
		OR(bits, MatR(colorOff), R(maskReg));
	} else if (stencilReg != INVALID_REG) {
		OR(32, R(colorReg), R(stencilReg));
		// No mask, so just or in the stencil bits so our AND can set any we want.
		OR(bits, MatR(colorOff), stencilMask);
	} else if (maskReg != INVALID_REG) {
		// Force in the mask (which includes all stencil bits) so both are kept as-is.
		OR(32, R(colorReg), R(maskReg));
	} else {
		// Force on the stencil bits so they AND and keep the existing value.
		if (stencilMask.GetImmValue() != 0)
			OR(bits, R(colorReg), stencilMask);
	}
	// Now the AND, which applies stencil and the logic op.
	AND(bits, MatR(colorOff), R(colorReg));
	skipStandardWrites_.push_back(J(true));

	tableValues[GE_LOGIC_AND_REVERSE] = GetCodePointer();
	// Reverse memory in a temp reg so we can apply the write mask easily.
	MOV(bits, R(temp1Reg), MatR(colorOff));
	if (cpu_info.bBMI1) {
		// colorReg = ~temp1Reg & colorReg, in one instruction.
		ANDN(32, colorReg, temp1Reg, R(colorReg));
	} else {
		NOT(32, R(temp1Reg));
		AND(32, R(colorReg), R(temp1Reg));
	}
	// Now add in the stencil bits (must be zero before, since we used AND.)
	if (stencilReg != INVALID_REG) {
		OR(32, R(colorReg), R(stencilReg));
	}
	finishes.push_back(J(true));

	tableValues[GE_LOGIC_COPY] = GetCodePointer();
	// This is just a standard write, nothing complex.
	if (stencilReg != INVALID_REG) {
		OR(32, R(colorReg), R(stencilReg));
	}
	finishes.push_back(J(true));

	tableValues[GE_LOGIC_AND_INVERTED] = GetCodePointer();
	if (stencilReg != INVALID_REG) {
		// Set the stencil bits, so they're zero when we invert.
		OR(bits, R(colorReg), stencilMask);
		NOT(32, R(colorReg));
		OR(32, R(colorReg), R(stencilReg));

		if (maskReg != INVALID_REG) {
			// This way our AND will keep all those bits.
			OR(32, R(colorReg), R(maskReg));

			// To apply stencil, we'll OR the stencil unmasked bits in memory, so our AND keeps them.
			NOT(32, R(maskReg));
			AND(bits, R(maskReg), stencilMask);
			OR(bits, MatR(colorOff), R(maskReg));
		} else {
			// Force memory to take our stencil bits by ORing for the AND.
			OR(bits, MatR(colorOff), stencilMask);
		}
	} else if (maskReg != INVALID_REG) {
		NOT(32, R(colorReg));
		// This way our AND will keep all those bits.
		OR(32, R(colorReg), R(maskReg));
	} else {
		// Invert our color, but then add in stencil bits so the AND keeps them.
		NOT(32, R(colorReg));
		// We only do this for 8888 since the rest will have had 0 stencil bits (which turned to 1s.)
		if (id.FBFormat() == GE_FORMAT_8888)
			OR(bits, R(colorReg), stencilMask);
	}
	AND(bits, MatR(colorOff), R(colorReg));
	skipStandardWrites_.push_back(J(true));

	tableValues[GE_LOGIC_NOOP] = GetCodePointer();
	if (stencilReg != INVALID_REG && maskReg != INVALID_REG) {
		// Start by clearing masked bits from stencilReg.
		if (cpu_info.bBMI1) {
			ANDN(32, stencilReg, maskReg, R(stencilReg));
		} else {
			// Flip maskReg back afterward, we still need the original just below.
			NOT(32, R(maskReg));
			AND(32, R(stencilReg), R(maskReg));
			NOT(32, R(maskReg));
		}

		// Now mask out the stencil bits we're writing from memory.
		OR(bits, R(maskReg), notStencilMask);
		AND(bits, MatR(colorOff), R(maskReg));

		// Now set those remaining stencil bits.
		OR(bits, MatR(colorOff), R(stencilReg));
		skipStandardWrites_.push_back(J(true));
	} else if (stencilReg != INVALID_REG) {
		// Clear and set just the stencil bits.
		AND(bits, MatR(colorOff), notStencilMask);
		OR(bits, MatR(colorOff), R(stencilReg));
		skipStandardWrites_.push_back(J(true));
	} else {
		// Neither color nor stencil changes, so there's nothing to write for this pixel.
		Discard();
	}

	tableValues[GE_LOGIC_XOR] = GetCodePointer();
	XOR(bits, R(colorReg), MatR(colorOff));
	// XOR might've set some stencil bits, so those need to be replaced or cleared.
	fixupStencilBits();
	finishes.push_back(J(true));

	tableValues[GE_LOGIC_OR] = GetCodePointer();
	if (stencilReg != INVALID_REG && maskReg != INVALID_REG) {
		OR(32, R(colorReg), R(stencilReg));

		// Clear the bits we should be masking out.
		if (cpu_info.bBMI1) {
			ANDN(32, colorReg, maskReg, R(colorReg));
		} else {
			// Flip maskReg back afterward, we still need the original just below.
			NOT(32, R(maskReg));
			AND(32, R(colorReg), R(maskReg));
			NOT(32, R(maskReg));
		}

		// Clear all the unmasked stencil bits, so we can set our own.
		OR(bits, R(maskReg), notStencilMask);
		AND(bits, MatR(colorOff), R(maskReg));
	} else if (stencilReg != INVALID_REG) {
		OR(32, R(colorReg), R(stencilReg));
		// AND out the stencil bits so we set our own.
		AND(bits, MatR(colorOff), notStencilMask);
	} else if (maskReg != INVALID_REG) {
		// Clear the bits we should be masking out.
		if (cpu_info.bBMI1) {
			ANDN(32, colorReg, maskReg, R(colorReg));
		} else {
			NOT(32, R(maskReg));
			AND(32, R(colorReg), R(maskReg));
		}
	} else if (id.FBFormat() == GE_FORMAT_8888) {
		// We only need to do this for 8888, the others already have 0 stencil.
		AND(bits, R(colorReg), notStencilMask);
	}
	// Now the OR, which applies stencil and the logic op itself.
	OR(bits, MatR(colorOff), R(colorReg));
	skipStandardWrites_.push_back(J(true));

	tableValues[GE_LOGIC_NOR] = GetCodePointer();
	OR(bits, R(colorReg), MatR(colorOff));
	NOT(32, R(colorReg));
	fixupStencilBits();
	finishes.push_back(J(true));

	tableValues[GE_LOGIC_EQUIV] = GetCodePointer();
	XOR(bits, R(colorReg), MatR(colorOff));
	NOT(32, R(colorReg));
	fixupStencilBits();
	finishes.push_back(J(true));

	tableValues[GE_LOGIC_INVERTED] = GetCodePointer();
	// We just toss our color entirely.
	MOV(bits, R(colorReg), MatR(colorOff));
	NOT(32, R(colorReg));
	fixupStencilBits();
	finishes.push_back(J(true));

	tableValues[GE_LOGIC_OR_REVERSE] = GetCodePointer();
	// Reverse in a temp reg so we can mask properly.
	MOV(bits, R(temp1Reg), MatR(colorOff));
	NOT(32, R(temp1Reg));
	OR(32, R(colorReg), R(temp1Reg));
	fixupStencilBits();
	finishes.push_back(J(true));

	tableValues[GE_LOGIC_COPY_INVERTED] = GetCodePointer();
	NOT(32, R(colorReg));
	fixupStencilBits();
	finishes.push_back(J(true));

	tableValues[GE_LOGIC_OR_INVERTED] = GetCodePointer();
	NOT(32, R(colorReg));
	if (stencilReg != INVALID_REG && maskReg != INVALID_REG) {
		AND(bits, R(colorReg), notStencilMask);
		OR(32, R(colorReg), R(stencilReg));

		// Clear the bits we should be masking out.
		if (cpu_info.bBMI1) {
			ANDN(32, colorReg, maskReg, R(colorReg));
		} else {
			// Flip maskReg back afterward, we still need the original just below.
			NOT(32, R(maskReg));
			AND(32, R(colorReg), R(maskReg));
			NOT(32, R(maskReg));
		}

		// Clear all the unmasked stencil bits, so we can set our own.
		OR(bits, R(maskReg), notStencilMask);
		AND(bits, MatR(colorOff), R(maskReg));
	} else if (stencilReg != INVALID_REG) {
		AND(bits, R(colorReg), notStencilMask);
		OR(32, R(colorReg), R(stencilReg));
		// AND out the stencil bits so we set our own.
		AND(bits, MatR(colorOff), notStencilMask);
	} else if (maskReg != INVALID_REG) {
		// Clear the bits we should be masking out.
		NOT(32, R(maskReg));
		AND(32, R(colorReg), R(maskReg));
	} else if (id.FBFormat() == GE_FORMAT_8888) {
		// We only need to do this for 8888, the others already have 0 stencil.
		AND(bits, R(colorReg), notStencilMask);
	}
	OR(bits, MatR(colorOff), R(colorReg));
	skipStandardWrites_.push_back(J(true));

	tableValues[GE_LOGIC_NAND] = GetCodePointer();
	// NAND is ~(src & dst), so the destination must be ANDed into colorReg itself.
	// (temp1Reg only holds the table address here, and nothing reads it afterward.)
	AND(bits, R(colorReg), MatR(colorOff));
	NOT(32, R(colorReg));
	fixupStencilBits();
	finishes.push_back(J(true));

	tableValues[GE_LOGIC_SET] = GetCodePointer();
	if (stencilReg != INVALID_REG && maskReg != INVALID_REG) {
		// All color bits on plus our stencil, the standard write applies the mask.
		OR(32, R(colorReg), R(stencilReg));
		OR(bits, R(colorReg), notStencilMask);
		finishes.push_back(J(true));
	} else if (stencilReg != INVALID_REG) {
		// Set bits directly in stencilReg, and then put in memory.
		OR(bits, R(stencilReg), notStencilMask);
		MOV(bits, MatR(colorOff), R(stencilReg));
		skipStandardWrites_.push_back(J(true));
	} else if (maskReg != INVALID_REG) {
		// OR in the bits we're allowed to write (won't be any stencil.)
		NOT(32, R(maskReg));
		OR(bits, MatR(colorOff), R(maskReg));
		skipStandardWrites_.push_back(J(true));
	} else {
		OR(bits, MatR(colorOff), notStencilMask);
		skipStandardWrites_.push_back(J(true));
	}

	// The table itself lives in the code stream, indexed by the GE_LOGIC_* value.
	const u8 *tablePtr = GetCodePointer();
	for (int i = 0; i < 16; ++i) {
		Write64((uintptr_t)tableValues[i]);
	}

	// This is where execution actually starts: jump to the path for the current logic op.
	SetJumpTarget(skipTable);
	LEA(64, temp1Reg, M(tablePtr));
	JMPptr(MComplex(temp1Reg, logicOpReg, 8, 0));

	// Paths which want the standard write resume here, with the result in colorReg.
	for (FixupBranch &fixup : finishes)
		SetJumpTarget(fixup);

	regCache_.Unlock(colorOff, RegCache::GEN_COLOR_OFF);
	regCache_.Release(logicOpReg, RegCache::GEN_TEMP4);
	regCache_.Release(temp1Reg, RegCache::GEN_TEMP5);
	if (stencilReg != INVALID_REG)
		regCache_.Unlock(stencilReg, RegCache::GEN_STENCIL);

	return true;
}
// Emits code that converts the 8888 color in colorReg to 565, in place.
//
// The result is in the low 16 bits of colorReg.  In the non-BMI2 path the masks are applied
// as 16-bit operations, so the bits above that aren't cleaned up.
// temp1Reg is always clobbered, temp2Reg only without fast BMI2.  Always returns true.
bool PixelJitCache::Jit_ConvertTo565(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg) {
	Describe("ConvertTo565");

	if (!cpu_info.bBMI2_fast) {
		// Where each channel lives within the 16-bit result.
		const uint16_t redField = 0x001F;
		const uint16_t greenField = 0x07E0;
		const uint16_t blueField = 0xF800;

		// R: its top 5 bits start at bit 3, and belong at bit 0.
		MOV(32, R(temp1Reg), R(colorReg));
		SHR(32, R(temp1Reg), Imm8(3));
		AND(16, R(temp1Reg), Imm16(redField));

		// G: its top 6 bits start at bit 10, and belong at bit 5.  Merge with R.
		MOV(32, R(temp2Reg), R(colorReg));
		SHR(32, R(temp2Reg), Imm8(5));
		AND(16, R(temp2Reg), Imm16(greenField));
		OR(32, R(temp1Reg), R(temp2Reg));

		// B: its top 5 bits start at bit 19, and belong at bit 11.  Done in colorReg itself.
		SHR(32, R(colorReg), Imm8(8));
		AND(16, R(colorReg), Imm16(blueField));
		OR(32, R(colorReg), R(temp1Reg));
	} else {
		// PEXT packs the selected bits together: top 5 of R, top 6 of G, top 5 of B.
		const uint32_t extractBits = 0x00F8FCF8;
		MOV(32, R(temp1Reg), Imm32(extractBits));
		PEXT(32, colorReg, colorReg, R(temp1Reg));
	}

	return true;
}
// Emits code that converts the 8888 color in colorReg to 5551, in place.
//
// keepAlpha: when true, the top bit of alpha is carried over to bit 15.  Otherwise, the
//            alpha bit is not taken from the color.
//
// The result is in the low 16 bits of colorReg.  In the non-BMI2 path the masks are applied
// as 16-bit operations, so the bits above that aren't cleaned up.
// temp1Reg is always clobbered, temp2Reg only without fast BMI2.  Always returns true.
bool PixelJitCache::Jit_ConvertTo5551(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg, bool keepAlpha) {
	Describe("ConvertTo5551");

	if (!cpu_info.bBMI2_fast) {
		// Where each color channel lives within the 16-bit result.
		const uint16_t redField = 0x001F;
		const uint16_t greenField = 0x03E0;
		const uint16_t blueField = 0x7C00;

		// R: its top 5 bits start at bit 3, and belong at bit 0.
		MOV(32, R(temp1Reg), R(colorReg));
		SHR(32, R(temp1Reg), Imm8(3));
		AND(16, R(temp1Reg), Imm16(redField));

		// G: its top 5 bits start at bit 11, and belong at bit 5.  Merge with R.
		MOV(32, R(temp2Reg), R(colorReg));
		SHR(32, R(temp2Reg), Imm8(6));
		AND(16, R(temp2Reg), Imm16(greenField));
		OR(32, R(temp1Reg), R(temp2Reg));

		if (keepAlpha) {
			// temp2Reg is free again: save A's top bit at bit 15 before B destroys colorReg.
			MOV(32, R(temp2Reg), R(colorReg));
			SHR(32, R(temp2Reg), Imm8(31));
			SHL(32, R(temp2Reg), Imm8(15));
		}

		// B: its top 5 bits start at bit 19, and belong at bit 10.  Done in colorReg itself.
		SHR(32, R(colorReg), Imm8(9));
		AND(16, R(colorReg), Imm16(blueField));
		OR(32, R(colorReg), R(temp1Reg));

		if (keepAlpha)
			OR(32, R(colorReg), R(temp2Reg));
	} else {
		// PEXT packs the selected bits together: top 5 of each color, plus A's top bit if kept.
		MOV(32, R(temp1Reg), Imm32(keepAlpha ? 0x80F8F8F8 : 0x00F8F8F8));
		PEXT(32, colorReg, colorReg, R(temp1Reg));
	}

	return true;
}
// Emits code that converts the 8888 color in colorReg to 4444, in place.
//
// keepAlpha: when true, the top 4 bits of alpha are carried over to bits 12-15.  Otherwise,
//            the alpha bits are not taken from the color.
//
// The result is in the low 16 bits of colorReg.  In the non-BMI2 path the masks are applied
// as 16-bit operations, so the bits above that aren't cleaned up.
// temp1Reg is always clobbered, temp2Reg only without fast BMI2.  Always returns true.
bool PixelJitCache::Jit_ConvertTo4444(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg, bool keepAlpha) {
	Describe("ConvertTo4444");

	if (!cpu_info.bBMI2_fast) {
		// Where each color channel lives within the 16-bit result.
		const uint16_t redField = 0x000F;
		const uint16_t greenField = 0x00F0;
		const uint16_t blueField = 0x0F00;

		// R: its top 4 bits start at bit 4, and belong at bit 0.
		MOV(32, R(temp1Reg), R(colorReg));
		SHR(32, R(temp1Reg), Imm8(4));
		AND(16, R(temp1Reg), Imm16(redField));

		// G: its top 4 bits start at bit 12, and belong at bit 4.  Merge with R.
		MOV(32, R(temp2Reg), R(colorReg));
		SHR(32, R(temp2Reg), Imm8(8));
		AND(16, R(temp2Reg), Imm16(greenField));
		OR(32, R(temp1Reg), R(temp2Reg));

		if (keepAlpha) {
			// temp2Reg is free again: save A's top 4 bits at bit 12 before B destroys colorReg.
			MOV(32, R(temp2Reg), R(colorReg));
			SHR(32, R(temp2Reg), Imm8(28));
			SHL(32, R(temp2Reg), Imm8(12));
		}

		// B: its top 4 bits start at bit 20, and belong at bit 8.  Done in colorReg itself.
		SHR(32, R(colorReg), Imm8(12));
		AND(16, R(colorReg), Imm16(blueField));
		OR(32, R(colorReg), R(temp1Reg));

		if (keepAlpha)
			OR(32, R(colorReg), R(temp2Reg));
	} else {
		// PEXT packs the selected bits together: top 4 of each color, plus top 4 of A if kept.
		MOV(32, R(temp1Reg), Imm32(keepAlpha ? 0xF0F0F0F0 : 0x00F0F0F0));
		PEXT(32, colorReg, colorReg, R(temp1Reg));
	}

	return true;
}
// Emits code that expands the 565 color held in colorReg to 8888, in place.
//
// Each channel's bits are placed at the top of its byte, and the channel's own
// high bits are replicated into the low bits left empty (the "swizzle".)
// Alpha (the top byte) is not filled in.  Both temp1Reg and temp2Reg are
// clobbered.  Always returns true.
//
// NOTE(review): the fallback path masks with 16-bit ANDs, so it presumably
// expects the upper 16 bits of colorReg to be zero on entry - confirm callers
// zero-extend when loading.
bool PixelJitCache::Jit_ConvertFrom565(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg) {
	Describe("ConvertFrom565");

	// Where the replicated low bits go: 3 for R, 2 for G, 3 for B.
	const uint32_t swizzleBits = 0x00070307;

	if (cpu_info.bBMI2_fast) {
		// Where the high bits of each 8888 channel live.
		const uint32_t highBits = 0x00F8FCF8;
		// The top 3/2/3 bits of each 565 channel, which get replicated.
		const uint32_t replicateSrc = 0x0000E61C;

		// temp1 = every channel deposited at the top of its byte.
		MOV(32, R(temp1Reg), Imm32(highBits));
		PDEP(32, temp1Reg, colorReg, R(temp1Reg));

		// Pull out the bits to replicate; PEXT leaves them packed together.
		MOV(32, R(temp2Reg), Imm32(replicateSrc));
		PEXT(32, colorReg, colorReg, R(temp2Reg));
		// Then deposit them into the low bits of each byte.
		MOV(32, R(temp2Reg), Imm32(swizzleBits));
		PDEP(32, colorReg, colorReg, R(temp2Reg));

		// Merge the two halves for the final color.
		OR(32, R(colorReg), R(temp1Reg));
	} else {
		// Source positions of each channel within the 565 value.
		const uint16_t redBits = 0x1F << 0;
		const uint16_t greenBits = 0x3F << 5;
		const uint16_t blueBits = 0x1F << 11;

		// temp1 = red only, moved up to the top of the low byte.
		MOV(32, R(temp1Reg), R(colorReg));
		AND(16, R(temp1Reg), Imm16(redBits));
		SHL(32, R(temp1Reg), Imm8(3));

		// Blue is 5 bits wide just like red, so handle it next.
		MOV(32, R(temp2Reg), R(colorReg));
		AND(16, R(temp2Reg), Imm16(blueBits));
		// From bit 11 up to bit 19, then fold into temp1 (now R+B.)
		SHL(32, R(temp2Reg), Imm8(8));
		OR(32, R(temp1Reg), R(temp2Reg));

		// temp2 picks up R+B too, moved left by 1 so it can share a
		// single swizzle shift with the 6-bit green.
		OR(32, R(temp2Reg), R(temp1Reg));
		SHL(32, R(temp2Reg), Imm8(1));

		// Green is the odd one out at 6 bits; position it from bit 5 to bit 10.
		AND(16, R(colorReg), Imm16(greenBits));
		SHL(32, R(colorReg), Imm8(5));
		// temp2 gets green for swizzling, colorReg gets the unswizzled R+B.
		OR(32, R(temp2Reg), R(colorReg));
		OR(32, R(colorReg), R(temp1Reg));

		// Move temp2 down into the low bits, keeping only the swizzle bits.
		SHR(32, R(temp2Reg), Imm8(6));
		AND(32, R(temp2Reg), Imm32(swizzleBits));
		// Merging those in completes the color.
		OR(32, R(colorReg), R(temp2Reg));
	}

	return true;
}
// Emits code that expands the 5551 color held in colorReg to 8888, in place.
//
// Each 5-bit channel is placed at the top of its byte with its own top 3 bits
// replicated into the low 3 bits.  With keepAlpha set, the single alpha bit is
// sign extended to fill the whole top byte; otherwise alpha is left out.
// temp1Reg is clobbered on both paths and temp2Reg as well.  Always returns true.
//
// NOTE(review): the fallback path masks with 16-bit ANDs, so it presumably
// expects the upper 16 bits of colorReg to be zero on entry - confirm callers
// zero-extend when loading.
bool PixelJitCache::Jit_ConvertFrom5551(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg, bool keepAlpha) {
	Describe("ConvertFrom5551");

	// The low 3 bits of each color byte, which receive the replicated bits.
	const uint32_t swizzleBits = 0x00070707;

	if (cpu_info.bBMI2_fast) {
		// Deposit the channels at the top of each byte, and alpha at bit 24.
		const uint32_t depositMask = keepAlpha ? 0x01F8F8F8 : 0x00F8F8F8;
		MOV(32, R(temp1Reg), Imm32(depositMask));
		PDEP(32, colorReg, colorReg, R(temp1Reg));

		// temp2 = the replicated low bits for each channel.
		MOV(32, R(temp2Reg), R(colorReg));
		SHR(32, R(temp2Reg), Imm8(5));
		AND(32, R(temp2Reg), Imm32(swizzleBits));

		if (keepAlpha) {
			// Push the alpha bit up to bit 31, then arithmetic shift back
			// so it gets copied across the entire top byte.
			SHL(32, R(colorReg), Imm8(7));
			SAR(32, R(colorReg), Imm8(7));
		}

		OR(32, R(colorReg), R(temp2Reg));
	} else {
		// Source positions of each channel within the 5551 value.
		const uint16_t redBits = 0x1F << 0;
		const uint16_t greenBits = 0x1F << 5;
		const uint16_t blueBits = 0x1F << 10;
		const uint16_t blueAlphaBits = 0x8000 | (0x1F << 10);

		// temp1 = red only, moved up to the top of the low byte.
		MOV(32, R(temp1Reg), R(colorReg));
		AND(16, R(temp1Reg), Imm16(redBits));
		SHL(32, R(temp1Reg), Imm8(3));

		// Green goes from bit 5 to bit 11, then joins red in temp1.
		MOV(32, R(temp2Reg), R(colorReg));
		AND(16, R(temp2Reg), Imm16(greenBits));
		SHL(32, R(temp2Reg), Imm8(6));
		OR(32, R(temp1Reg), R(temp2Reg));

		if (keepAlpha) {
			// Blue and alpha are adjacent, so they can travel together.
			AND(16, R(colorReg), Imm16(blueAlphaBits));
			// Alpha goes to bit 31 first, so the arithmetic shift back
			// down spreads it over the top byte while placing blue.
			SHL(32, R(colorReg), Imm8(16));
			SAR(32, R(colorReg), Imm8(7));
		} else {
			// Just blue: from bit 10 up to bit 19.
			AND(16, R(colorReg), Imm16(blueBits));
			SHL(32, R(colorReg), Imm8(9));
		}

		// Both registers now get the full unswizzled color.
		OR(32, R(colorReg), R(temp1Reg));
		OR(32, R(temp1Reg), R(colorReg));
		// Build the replicated bits in temp1; the mask keeps bits from
		// spilling into neighboring channels (or alpha.)
		SHR(32, R(temp1Reg), Imm8(5));
		AND(32, R(temp1Reg), Imm32(swizzleBits));

		// Merging those in completes the color.
		OR(32, R(colorReg), R(temp1Reg));
	}

	return true;
}
// Emits code that expands the 4444 color held in colorReg to 8888, in place.
//
// Each nibble is placed in the high half of its byte and then duplicated into
// the low half.  With keepAlpha set, alpha is expanded the same way; otherwise
// it is left out.  temp1Reg is clobbered on both paths; temp2Reg is only
// touched on the non-BMI2 path.  Always returns true.
//
// NOTE(review): the fallback path masks with 16-bit ANDs, so it presumably
// expects the upper 16 bits of colorReg to be zero on entry - confirm callers
// zero-extend when loading.
bool PixelJitCache::Jit_ConvertFrom4444(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg, bool keepAlpha) {
	Describe("ConvertFrom4444");

	if (cpu_info.bBMI2_fast) {
		// Deposit each nibble into the high half of its own byte.
		const uint32_t depositMask = keepAlpha ? 0xF0F0F0F0 : 0x00F0F0F0;
		MOV(32, R(temp1Reg), Imm32(depositMask));
		PDEP(32, colorReg, colorReg, R(temp1Reg));

		// Duplicate each high nibble into the empty low nibble below it.
		MOV(32, R(temp1Reg), R(colorReg));
		SHR(32, R(temp1Reg), Imm8(4));
		OR(32, R(colorReg), R(temp1Reg));
	} else {
		// Source positions of each channel within the 4444 value.
		const uint16_t redBits = 0xF << 0;
		const uint16_t greenBits = 0xF << 4;
		const uint16_t blueBits = 0xF << 8;
		const uint16_t alphaBits = 0xF << 12;

		// temp1 = red, moved to the high nibble of byte 0.
		MOV(32, R(temp1Reg), R(colorReg));
		AND(16, R(temp1Reg), Imm16(redBits));
		SHL(32, R(temp1Reg), Imm8(4));

		// Green: from bit 4 up to bit 12, merged into temp1.
		MOV(32, R(temp2Reg), R(colorReg));
		AND(16, R(temp2Reg), Imm16(greenBits));
		SHL(32, R(temp2Reg), Imm8(8));
		OR(32, R(temp1Reg), R(temp2Reg));

		// Blue: from bit 8 up to bit 20, merged into temp1 as well.
		MOV(32, R(temp2Reg), R(colorReg));
		AND(16, R(temp2Reg), Imm16(blueBits));
		SHL(32, R(temp2Reg), Imm8(12));
		OR(32, R(temp1Reg), R(temp2Reg));

		if (keepAlpha) {
			// Alpha can be done directly in colorReg: from bit 12 up to bit 28.
			AND(16, R(colorReg), Imm16(alphaBits));
			SHL(32, R(colorReg), Imm8(16));
			OR(32, R(colorReg), R(temp1Reg));

			// Bring temp1 up to date so it's a full copy for the swizzle.
			OR(32, R(temp1Reg), R(colorReg));
		} else {
			// No alpha, so colorReg simply takes RGB (temp1 stays as the copy.)
			MOV(32, R(colorReg), R(temp1Reg));
		}

		// Each gap is exactly as wide as the nibble shifted into it, so no mask is needed.
		SHR(32, R(temp1Reg), Imm8(4));
		OR(32, R(colorReg), R(temp1Reg));
	}

	return true;
}
};
#endif