ppsspp/GPU/Software/DrawPixelX86.cpp

// Copyright (c) 2017- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#include "ppsspp_config.h"
#if PPSSPP_ARCH(AMD64)

#include <emmintrin.h>
#include "Common/x64Emitter.h"
#include "Common/CPUDetect.h"
#include "Common/LogReporting.h"
#include "GPU/GPUState.h"
#include "GPU/Software/DrawPixel.h"
#include "GPU/Software/SoftGpu.h"
#include "GPU/ge_constants.h"

using namespace Gen;

namespace Rasterizer {

// Compiles a single-pixel draw function specialized for the state described by id.
// Stages are emitted in a fixed order: constant pool, prolog, depth range, color clamp,
// alpha test, fog, color test, stencil/depth test, depth write, blend, dither, color write.
// Returns the pointer from WriteFinalizedEpilog(), or nullptr if any stage reported failure
// (in which case the code pointer is reset to the position recorded after the constant pool.)
SingleFunc PixelJitCache::CompileSingle(const PixelFuncID &id) {
	// Setup the reg cache and disallow spill for arguments.
	regCache_.SetupABI({
		RegCache::GEN_ARG_X,
		RegCache::GEN_ARG_Y,
		RegCache::GEN_ARG_Z,
		RegCache::GEN_ARG_FOG,
		RegCache::VEC_ARG_COLOR,
		RegCache::GEN_ARG_ID,
	});

	BeginWrite(64);
	Describe("Init");
	WriteConstantPool(id);

	// Remember where the function proper starts, so a failed compile can be rolled back to here.
	const u8 *resetPos = AlignCode16();
	EndWrite();
	bool success = true;

#if PPSSPP_PLATFORM(WINDOWS)
	// RET + Windows reserves space to save args, half of 1 xmm + 4 ints before the id.
	// The id is not in a register here (asserted below), so GetPixelID() reads it from RSP + stackIDOffset_.
	_assert_(!regCache_.Has(RegCache::GEN_ARG_ID));
	int stackSpace = 0;
	// Only the stencil test mask variant asks the prolog to save the extra XMM and general registers.
	if (id.hasStencilTestMask)
		stackSpace = WriteProlog(0, { XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15 }, { R12, R13, R14, R15 });
	else
		stackSpace = WriteProlog(0, {}, {});
	stackIDOffset_ = stackSpace + 8 + 8 + 4 * PTRBITS / 8;
#else
	// Here the id arrives in a register, so there's no stack offset (-1 is what GetPixelID() asserts against.)
	_assert_(regCache_.Has(RegCache::GEN_ARG_ID));
	WriteProlog(0, {}, {});
	stackIDOffset_ = -1;
#endif

	// Start with the depth range.
	success = success && Jit_ApplyDepthRange(id);

	// Next, let's clamp the color (might affect alpha test, and everything expects it clamped.)
	// We simply convert to 4x8-bit to clamp.  Everything else expects color in this format.
	// PACKSSDW narrows 32-bit lanes to 16-bit with signed saturation, then PACKUSWB to 8-bit unsigned.
	Describe("ClampColor");
	X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
	PACKSSDW(argColorReg, R(argColorReg));
	PACKUSWB(argColorReg, R(argColorReg));
	regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);
	colorIs16Bit_ = false;

	success = success && Jit_AlphaTest(id);
	// Fog is applied prior to color test.  Maybe before alpha test too, but it doesn't affect it...
	success = success && Jit_ApplyFog(id);
	success = success && Jit_ColorTest(id);

	// Clear mode skips both tests; otherwise stencil (when enabled) is handled together with depth.
	if (id.stencilTest && !id.clearMode)
		success = success && Jit_StencilAndDepthTest(id);
	else if (!id.clearMode)
		success = success && Jit_DepthTest(id);
	success = success && Jit_WriteDepth(id);

	success = success && Jit_AlphaBlend(id);
	success = success && Jit_Dither(id);
	success = success && Jit_WriteColor(id);

	// Every branch recorded by Discard() lands here, past all of the writes emitted above.
	for (auto &fixup : discards_) {
		SetJumpTarget(fixup);
	}
	discards_.clear();

	// Done with the id argument, if it came in a register.
	if (regCache_.Has(RegCache::GEN_ARG_ID))
		regCache_.ForceRelease(RegCache::GEN_ARG_ID);

	if (!success) {
		ERROR_LOG_REPORT(G3D, "Could not compile pixel func: %s", DescribePixelFuncID(id).c_str());

		// Throw away whatever was emitted for this function and report failure to the caller.
		regCache_.Reset(false);
		EndWrite();
		ResetCodePtr(GetOffset(resetPos));
		return nullptr;
	}

	const u8 *start = WriteFinalizedEpilog();
	regCache_.Reset(true);
	return (SingleFunc)start;
}

// Returns a register holding the PixelFuncID pointer, to be handed back via UnlockPixelID().
// Uses the argument register when the id was passed in one; otherwise uses GEN_ID, which is
// loaded from the stack at stackIDOffset_ the first time it is needed.
RegCache::Reg PixelJitCache::GetPixelID() {
	// Passed in a register?  Then that's all we need.
	if (regCache_.Has(RegCache::GEN_ARG_ID))
		return regCache_.Find(RegCache::GEN_ARG_ID);

	// We may have already pulled it off the stack earlier.
	if (regCache_.Has(RegCache::GEN_ID))
		return regCache_.Find(RegCache::GEN_ID);

	// First request: grab a register and read the pointer relative to RSP.
	const X64Reg idReg = regCache_.Alloc(RegCache::GEN_ID);
	_assert_(stackIDOffset_ != -1);
	MOV(PTRBITS, R(idReg), MDisp(RSP, stackIDOffset_));
	return idReg;
}

// Unlocks a register obtained from GetPixelID(), using whichever purpose it was found under.
void PixelJitCache::UnlockPixelID(RegCache::Reg &r) {
	// Without the argument register, the id can only have come from GEN_ID.
	if (!regCache_.Has(RegCache::GEN_ARG_ID)) {
		regCache_.Unlock(r, RegCache::GEN_ID);
		return;
	}

	regCache_.Unlock(r, RegCache::GEN_ARG_ID);
}

// Returns a register (found/locked as GEN_COLOR_OFF) holding the address of this pixel in the
// color buffer: fb.data + (y * stride + x) * scale, where scale is 4 for GE_FORMAT_8888 and 2 otherwise.
// The value is computed the first time this is called and reused from the reg cache afterward.
//
// Fast path (standard stride and no dithering): the X and Y argument registers are consumed.
// Y's register becomes GEN_COLOR_OFF and X's register becomes GEN_DEPTH_OFF (or is released
// when depth is neither written nor tested.)
// Slow path: X and Y are left intact and a new register is allocated for the result.
RegCache::Reg PixelJitCache::GetColorOff(const PixelFuncID &id) {
	if (!regCache_.Has(RegCache::GEN_COLOR_OFF)) {
		Describe("GetColorOff");
		if (id.useStandardStride && !id.dithering) {
			// Depth offset is needed if we write depth, or if the depth test runs inside this function.
			bool loadDepthOff = id.depthWrite || (id.DepthTestFunc() != GE_COMP_ALWAYS && !id.earlyZChecks);
			X64Reg depthTemp = INVALID_REG;
			X64Reg argYReg = regCache_.Find(RegCache::GEN_ARG_Y);
			X64Reg argXReg = regCache_.Find(RegCache::GEN_ARG_X);

			// In this mode, we force argXReg to the off, and throw away argYReg.
			// Shifting left by 9 multiplies y by the standard stride of 512.
			SHL(32, R(argYReg), Imm8(9));
			ADD(32, R(argXReg), R(argYReg));

			// Now add the pointer for the color buffer.
			if (loadDepthOff) {
				// NOTE(review): presumably Accessible() verifies depthbuf.data can be reached as a
				// displacement from fb.data, which MAccessibleDisp() below relies on - confirm.
				_assert_(Accessible(&fb.data, &depthbuf.data));
				depthTemp = regCache_.Alloc(RegCache::GEN_DEPTH_OFF);
				if (RipAccessible(&fb.data) && RipAccessible(&depthbuf.data)) {
					MOV(PTRBITS, R(argYReg), M(&fb.data));
				} else {
					// Keep &fb.data in depthTemp, so depthbuf.data can be loaded relative to it later.
					MOV(PTRBITS, R(depthTemp), ImmPtr(&fb.data));
					MOV(PTRBITS, R(argYReg), MatR(depthTemp));
				}
			} else {
				if (RipAccessible(&fb.data)) {
					MOV(PTRBITS, R(argYReg), M(&fb.data));
				} else {
					// No spare register here, so load the address and then dereference in place.
					MOV(PTRBITS, R(argYReg), ImmPtr(&fb.data));
					MOV(PTRBITS, R(argYReg), MatR(argYReg));
				}
			}
			// argYReg = fb.data + offset * scale.  argXReg still holds the unscaled pixel offset.
			LEA(PTRBITS, argYReg, MComplex(argYReg, argXReg, id.FBFormat() == GE_FORMAT_8888 ? 4 : 2, 0));
			// With that, argYReg is now GEN_COLOR_OFF.
			regCache_.Unlock(argYReg, RegCache::GEN_ARG_Y);
			regCache_.Change(RegCache::GEN_ARG_Y, RegCache::GEN_COLOR_OFF);
			// Retain it, because we can't recalculate this.
			regCache_.ForceRetain(RegCache::GEN_COLOR_OFF);

			// Next, also calculate the depth offset, unless we won't need it at all.
			if (loadDepthOff) {
				if (RipAccessible(&fb.data) && RipAccessible(&depthbuf.data)) {
					MOV(PTRBITS, R(depthTemp), M(&depthbuf.data));
				} else {
					// depthTemp still holds &fb.data from above.
					MOV(PTRBITS, R(depthTemp), MAccessibleDisp(depthTemp, &fb.data, &depthbuf.data));
				}
				// Depth entries are addressed with a scale of 2.
				LEA(PTRBITS, argXReg, MComplex(depthTemp, argXReg, 2, 0));
				// The temporary gives up the GEN_DEPTH_OFF purpose, so argXReg can take it over below.
				regCache_.Release(depthTemp, RegCache::GEN_DEPTH_OFF);

				// Okay, same deal - release as GEN_DEPTH_OFF and force retain it.
				regCache_.Unlock(argXReg, RegCache::GEN_ARG_X);
				regCache_.Change(RegCache::GEN_ARG_X, RegCache::GEN_DEPTH_OFF);
				regCache_.ForceRetain(RegCache::GEN_DEPTH_OFF);
			} else {
				// Nothing else needs X (which now holds the offset), so free it up.
				regCache_.Unlock(argXReg, RegCache::GEN_ARG_X);
				regCache_.ForceRelease(RegCache::GEN_ARG_X);
			}

			return regCache_.Find(RegCache::GEN_COLOR_OFF);
		}

		// Slow path: compute into a fresh register, leaving the X/Y arguments usable.
		X64Reg argYReg = regCache_.Find(RegCache::GEN_ARG_Y);
		X64Reg r = regCache_.Alloc(RegCache::GEN_COLOR_OFF);
		if (id.useStandardStride) {
			// r = y * 512.
			MOV(32, R(r), R(argYReg));
			SHL(32, R(r), Imm8(9));
		} else {
			// r = y * framebufStride, with the stride read as 16 bits from the cached id.
			if (regCache_.Has(RegCache::GEN_ARG_ID) || regCache_.Has(RegCache::GEN_ID)) {
				X64Reg idReg = GetPixelID();
				MOVZX(32, 16, r, MDisp(idReg, offsetof(PixelFuncID, cached.framebufStride)));
				UnlockPixelID(idReg);
			} else {
				// Load the id pointer from the stack into r itself, rather than allocating GEN_ID.
				_assert_(stackIDOffset_ != -1);
				MOV(PTRBITS, R(r), MDisp(RSP, stackIDOffset_));
				MOVZX(32, 16, r, MDisp(r, offsetof(PixelFuncID, cached.framebufStride)));
			}

			IMUL(32, r, R(argYReg));
		}
		regCache_.Unlock(argYReg, RegCache::GEN_ARG_Y);

		X64Reg argXReg = regCache_.Find(RegCache::GEN_ARG_X);
		ADD(32, R(r), R(argXReg));
		regCache_.Unlock(argXReg, RegCache::GEN_ARG_X);

		// Finally, scale the offset and add the color buffer base pointer.
		X64Reg temp = regCache_.Alloc(RegCache::GEN_TEMP_HELPER);
		if (RipAccessible(&fb.data)) {
			MOV(PTRBITS, R(temp), M(&fb.data));
		} else {
			MOV(PTRBITS, R(temp), ImmPtr(&fb.data));
			MOV(PTRBITS, R(temp), MatR(temp));
		}
		LEA(PTRBITS, r, MComplex(temp, r, id.FBFormat() == GE_FORMAT_8888 ? 4 : 2, 0));
		regCache_.Release(temp, RegCache::GEN_TEMP_HELPER);

		return r;
	}
	return regCache_.Find(RegCache::GEN_COLOR_OFF);
}

// Returns a register (found/locked as GEN_DEPTH_OFF) holding the address of this pixel in the
// depth buffer: depthbuf.data + (y * stride + x) * 2.  Computed on first use and cached after.
// With standard stride and no dithering, the calculation is shared with GetColorOff().
RegCache::Reg PixelJitCache::GetDepthOff(const PixelFuncID &id) {
	if (!regCache_.Has(RegCache::GEN_DEPTH_OFF)) {
		// If both color and depth use 512, the offsets are the same.
		if (id.useStandardStride && !id.dithering) {
			// Calculate once inside GetColorOff().
			// That only produces GEN_DEPTH_OFF when depth is written or tested, which matches
			// the conditions under which the visible callers of this function reach here.
			X64Reg colorOffReg = GetColorOff(id);
			regCache_.Unlock(colorOffReg, RegCache::GEN_COLOR_OFF);
			return regCache_.Find(RegCache::GEN_DEPTH_OFF);
		}

		Describe("GetDepthOff");
		X64Reg argYReg = regCache_.Find(RegCache::GEN_ARG_Y);
		X64Reg r = regCache_.Alloc(RegCache::GEN_DEPTH_OFF);
		if (id.useStandardStride) {
			// r = y * 512.
			MOV(32, R(r), R(argYReg));
			SHL(32, R(r), Imm8(9));
		} else {
			// r = y * depthbufStride, with the stride read as 16 bits from the cached id.
			if (regCache_.Has(RegCache::GEN_ARG_ID) || regCache_.Has(RegCache::GEN_ID)) {
				X64Reg idReg = GetPixelID();
				MOVZX(32, 16, r, MDisp(idReg, offsetof(PixelFuncID, cached.depthbufStride)));
				UnlockPixelID(idReg);
			} else {
				// Load the id pointer from the stack into r itself, rather than allocating GEN_ID.
				_assert_(stackIDOffset_ != -1);
				MOV(PTRBITS, R(r), MDisp(RSP, stackIDOffset_));
				MOVZX(32, 16, r, MDisp(r, offsetof(PixelFuncID, cached.depthbufStride)));
			}

			IMUL(32, r, R(argYReg));
		}
		regCache_.Unlock(argYReg, RegCache::GEN_ARG_Y);

		X64Reg argXReg = regCache_.Find(RegCache::GEN_ARG_X);
		ADD(32, R(r), R(argXReg));
		regCache_.Unlock(argXReg, RegCache::GEN_ARG_X);

		// Scale by 2 and add the depth buffer base pointer.
		X64Reg temp = regCache_.Alloc(RegCache::GEN_TEMP_HELPER);
		if (RipAccessible(&depthbuf.data)) {
			MOV(PTRBITS, R(temp), M(&depthbuf.data));
		} else {
			MOV(PTRBITS, R(temp), ImmPtr(&depthbuf.data));
			MOV(PTRBITS, R(temp), MatR(temp));
		}
		LEA(PTRBITS, r, MComplex(temp, r, 2, 0));
		regCache_.Release(temp, RegCache::GEN_TEMP_HELPER);

		return r;
	}
	return regCache_.Find(RegCache::GEN_DEPTH_OFF);
}


// Loads the destination pixel's stencil into a new GEN_STENCIL register, expanded to 8 bits.
// Returns INVALID_REG for 565, where no load is needed since stencil is always zero.
RegCache::Reg PixelJitCache::GetDestStencil(const PixelFuncID &id) {
	const auto format = id.FBFormat();

	// Nothing to read for 565, the stencil is a fixed zero.
	if (format == GE_FORMAT_565)
		return INVALID_REG;

	X64Reg colorPtrReg = GetColorOff(id);
	Describe("GetDestStencil");
	X64Reg resultReg = regCache_.Alloc(RegCache::GEN_STENCIL);

	switch (format) {
	case GE_FORMAT_8888:
		// The top byte is the stencil as-is.
		MOVZX(32, 8, resultReg, MDisp(colorPtrReg, 3));
		break;

	case GE_FORMAT_5551:
		// An arithmetic shift smears the single top bit over the entire byte.
		MOVZX(32, 8, resultReg, MDisp(colorPtrReg, 1));
		SAR(8, R(resultReg), Imm8(7));
		break;

	case GE_FORMAT_4444: {
		// Take the top nibble, and duplicate it into both halves of the byte.
		MOVZX(32, 8, resultReg, MDisp(colorPtrReg, 1));
		SHR(32, R(resultReg), Imm8(4));
		X64Reg highNibbleReg = regCache_.Alloc(RegCache::GEN_TEMP_HELPER);
		MOV(32, R(highNibbleReg), R(resultReg));
		SHL(32, R(highNibbleReg), Imm8(4));
		OR(32, R(resultReg), R(highNibbleReg));
		regCache_.Release(highNibbleReg, RegCache::GEN_TEMP_HELPER);
		break;
	}

	default:
		break;
	}

	regCache_.Unlock(colorPtrReg, RegCache::GEN_COLOR_OFF);
	return resultReg;
}

// Emits an unconditional jump that discards the pixel.
// The target is filled in later, when the recorded discards_ are resolved.
void PixelJitCache::Discard() {
	const auto pendingJump = J(true);
	discards_.push_back(pendingJump);
}

// Emits a jump that discards the pixel when the condition cc holds for the current flags.
// The target is filled in later, when the recorded discards_ are resolved.
void PixelJitCache::Discard(Gen::CCFlags cc) {
	const auto pendingJump = J_CC(cc, true);
	discards_.push_back(pendingJump);
}

// Writes the constants used by blending ahead of the function's code.  Doesn't depend on id.
void PixelJitCache::WriteConstantPool(const PixelFuncID &id) {
	// A fixed point 0.5 (as s.11.4), added so blend factors multiply accurately.
	constexpr int blendHalf = 1 << 3;
	// 255 in the same shifted format, used to inverse shifted blend factors.
	constexpr int blendInvert = 0xFF << 4;

	WriteSimpleConst8x16(constBlendHalf_11_4s_, blendHalf);
	WriteSimpleConst8x16(constBlendInvert_11_4s_, blendInvert);
}

bool PixelJitCache::Jit_ApplyDepthRange(const PixelFuncID &id) {
	if (id.applyDepthRange && !id.earlyZChecks) {
		Describe("ApplyDepthR");
		X64Reg argZReg = regCache_.Find(RegCache::GEN_ARG_Z);
		X64Reg idReg = GetPixelID();

		// We expanded this to 32 bits, so it's convenient to compare.
		CMP(32, R(argZReg), MDisp(idReg, offsetof(PixelFuncID, cached.minz)));
		Discard(CC_L);

		// We load the low 16 bits, but compare all 32 of z.  Above handles < 0.
		CMP(32, R(argZReg), MDisp(idReg, offsetof(PixelFuncID, cached.maxz)));
		Discard(CC_G);

		UnlockPixelID(idReg);
		regCache_.Unlock(argZReg, RegCache::GEN_ARG_Z);
	}

	// Since this is early on, try to free up the z reg if we don't need it anymore.
	if (id.clearMode && !id.DepthClear())
		regCache_.ForceRelease(RegCache::GEN_ARG_Z);
	else if (!id.clearMode && !id.depthWrite && (id.DepthTestFunc() == GE_COMP_ALWAYS || id.earlyZChecks))
		regCache_.ForceRelease(RegCache::GEN_ARG_Z);

	return true;
}

bool PixelJitCache::Jit_AlphaTest(const PixelFuncID &id) {
	// Take care of ALWAYS/NEVER first.  ALWAYS is common, means disabled.
	Describe("AlphaTest");
	switch (id.AlphaTestFunc()) {
	case GE_COMP_NEVER:
		Discard();
		return true;

	case GE_COMP_ALWAYS:
		return true;

	default:
		break;
	}

	// Load alpha into its own general reg.
	X64Reg alphaReg;
	if (regCache_.Has(RegCache::GEN_SRC_ALPHA)) {
		alphaReg = regCache_.Find(RegCache::GEN_SRC_ALPHA);
	} else {
		alphaReg = regCache_.Alloc(RegCache::GEN_SRC_ALPHA);
		_assert_(!colorIs16Bit_);
		X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
		MOVD_xmm(R(alphaReg), argColorReg);
		regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);
		SHR(32, R(alphaReg), Imm8(24));
	}

	if (id.hasAlphaTestMask) {
		// Unfortunate, we'll need pixelID to load the mask.
		// Note: we leave the ALPHA purpose untouched and free it, because later code may reuse.
		X64Reg idReg = GetPixelID();
		X64Reg maskedReg = regCache_.Alloc(RegCache::GEN_TEMP0);

		MOVZX(32, 8, maskedReg, MDisp(idReg, offsetof(PixelFuncID, cached.alphaTestMask)));
		UnlockPixelID(idReg);
		AND(32, R(maskedReg), R(alphaReg));
		regCache_.Unlock(alphaReg, RegCache::GEN_SRC_ALPHA);

		// Okay now do the rest using the masked reg, which we modified.
		alphaReg = maskedReg;
	}

	// We hardcode the ref into this jit func.
	CMP(8, R(alphaReg), Imm8(id.alphaTestRef));
	if (id.hasAlphaTestMask)
		regCache_.Release(alphaReg, RegCache::GEN_TEMP0);
	else
		regCache_.Unlock(alphaReg, RegCache::GEN_SRC_ALPHA);

	switch (id.AlphaTestFunc()) {
	case GE_COMP_NEVER:
	case GE_COMP_ALWAYS:
		break;

	case GE_COMP_EQUAL:
		Discard(CC_NE);
		break;

	case GE_COMP_NOTEQUAL:
		Discard(CC_E);
		break;

	case GE_COMP_LESS:
		Discard(CC_AE);
		break;

	case GE_COMP_LEQUAL:
		Discard(CC_A);
		break;

	case GE_COMP_GREATER:
		Discard(CC_BE);
		break;

	case GE_COMP_GEQUAL:
		Discard(CC_B);
		break;
	}

	return true;
}

// Emits the color test, which is skipped entirely when disabled or in clear mode.
// Unlike the alpha test, the function, mask and ref are all read from the cached id at
// runtime rather than baked into the generated code.  Always succeeds.
bool PixelJitCache::Jit_ColorTest(const PixelFuncID &id) {
	if (!id.colorTest || id.clearMode)
		return true;

	// We'll have 4 with fog released, so we're using them all...
	Describe("ColorTest");
	X64Reg idReg = GetPixelID();
	X64Reg funcReg = regCache_.Alloc(RegCache::GEN_TEMP0);
	X64Reg maskReg = regCache_.Alloc(RegCache::GEN_TEMP1);
	X64Reg refReg = regCache_.Alloc(RegCache::GEN_TEMP2);

	// First, load the registers: mask and ref.
	MOV(32, R(maskReg), MDisp(idReg, offsetof(PixelFuncID, cached.colorTestMask)));
	MOV(32, R(refReg), MDisp(idReg, offsetof(PixelFuncID, cached.colorTestRef)));

	X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
	if (colorIs16Bit_) {
		// If it's expanded, we need to clamp anyway if it was fogged.
		PACKUSWB(argColorReg, R(argColorReg));
		colorIs16Bit_ = false;
	}

	// Temporarily abuse funcReg to grab the color into maskReg.
	// After this, maskReg holds (color & mask), which is what gets compared against refReg.
	// NOTE(review): ref is compared without masking here - presumably it is stored pre-masked; confirm.
	MOVD_xmm(R(funcReg), argColorReg);
	AND(32, R(maskReg), R(funcReg));
	regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);

	// Now that we're setup, get the func and follow it.
	MOVZX(32, 8, funcReg, MDisp(idReg, offsetof(PixelFuncID, cached.colorTestFunc)));
	UnlockPixelID(idReg);

	CMP(8, R(funcReg), Imm8(GE_COMP_ALWAYS));
	// Discard for GE_COMP_NEVER...
	// (this relies on GE_COMP_NEVER being the only value below GE_COMP_ALWAYS.)
	Discard(CC_B);
	// And for GE_COMP_ALWAYS, skip the comparison altogether.
	FixupBranch skip = J_CC(CC_E);

	// Anything that isn't EQUAL at this point is handled as NOTEQUAL.
	CMP(8, R(funcReg), Imm8(GE_COMP_EQUAL));
	FixupBranch doEqual = J_CC(CC_E);
	regCache_.Release(funcReg, RegCache::GEN_TEMP0);

	// The not equal path here... if they are equal, we discard.
	CMP(32, R(refReg), R(maskReg));
	Discard(CC_E);
	FixupBranch skip2 = J();

	// The equal path: discard when they differ.
	SetJumpTarget(doEqual);
	CMP(32, R(refReg), R(maskReg));
	Discard(CC_NE);

	regCache_.Release(maskReg, RegCache::GEN_TEMP1);
	regCache_.Release(refReg, RegCache::GEN_TEMP2);

	// Both passing paths rejoin here.
	SetJumpTarget(skip);
	SetJumpTarget(skip2);

	return true;
}

// Blends the fog color into the source color using the fog factor argument:
//   color = (color * fog + fogColor * (255 - fog) + 255) / 256
// Alpha is saved beforehand and restored after, so it is not fogged.
// The color is left expanded to 16 bits per channel (colorIs16Bit_ is set.)
// When fog is disabled, this only releases the fog argument register.  Always succeeds.
bool PixelJitCache::Jit_ApplyFog(const PixelFuncID &id) {
	if (!id.applyFog) {
		// Okay, anyone can use the fog register then.
		regCache_.ForceRelease(RegCache::GEN_ARG_FOG);
		return true;
	}

	// Load fog and expand to 16 bit.  Ignore the high 8 bits, which'll match up with A.
	Describe("ApplyFog");
	X64Reg fogColorReg = regCache_.Alloc(RegCache::VEC_TEMP1);
	X64Reg idReg = GetPixelID();
	if (cpu_info.bSSE4_1) {
		PMOVZXBW(fogColorReg, MDisp(idReg, offsetof(PixelFuncID, cached.fogColor)));
	} else {
		// Without SSE4.1, zero extend by interleaving with a zero vector.
		X64Reg zeroReg = GetZeroVec();
		MOVD_xmm(fogColorReg, MDisp(idReg, offsetof(PixelFuncID, cached.fogColor)));
		PUNPCKLBW(fogColorReg, R(zeroReg));
		regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
	}
	UnlockPixelID(idReg);

	// Load a set of 255s at 16 bit into a reg for later...
	// Comparing a register with itself sets all bits, and the shift leaves 0x00FF in each lane.
	X64Reg invertReg = regCache_.Alloc(RegCache::VEC_TEMP2);
	PCMPEQW(invertReg, R(invertReg));
	PSRLW(invertReg, 8);

	// Expand (we clamped) color to 16 bit as well, so we can multiply with fog.
	X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
	if (!colorIs16Bit_) {
		if (cpu_info.bSSE4_1) {
			PMOVZXBW(argColorReg, R(argColorReg));
		} else {
			X64Reg zeroReg = GetZeroVec();
			PUNPCKLBW(argColorReg, R(zeroReg));
			regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
		}
		colorIs16Bit_ = true;
	}

	// Save A so we can put it back, we don't "fog" A.
	X64Reg alphaReg;
	if (regCache_.Has(RegCache::GEN_SRC_ALPHA)) {
		alphaReg = regCache_.Find(RegCache::GEN_SRC_ALPHA);
	} else {
		alphaReg = regCache_.Alloc(RegCache::GEN_SRC_ALPHA);
		// With the color expanded, alpha is the 16-bit lane at index 3.
		PEXTRW(alphaReg, argColorReg, 3);
	}

	// Okay, let's broadcast fog to an XMM.
	// The shuffle copies the low word into each of the low four 16-bit lanes.
	X64Reg fogMultReg = regCache_.Alloc(RegCache::VEC_TEMP3);
	X64Reg argFogReg = regCache_.Find(RegCache::GEN_ARG_FOG);
	MOVD_xmm(fogMultReg, R(argFogReg));
	PSHUFLW(fogMultReg, R(fogMultReg), _MM_SHUFFLE(0, 0, 0, 0));
	regCache_.Unlock(argFogReg, RegCache::GEN_ARG_FOG);
	// We can free up the actual fog reg now.
	regCache_.ForceRelease(RegCache::GEN_ARG_FOG);

	// Our goal here is to calculate this formula:
	// (argColor * fog + fogColor * (255 - fog) + 255) / 256

	// Now we multiply the existing color by fog...
	PMULLW(argColorReg, R(fogMultReg));
	// Before inversing, let's add that 255 we loaded in as well, since we have it.
	PADDW(argColorReg, R(invertReg));
	// And then inverse the fog value using those 255s, and multiply by fog color.
	PSUBW(invertReg, R(fogMultReg));
	PMULLW(fogColorReg, R(invertReg));
	// At this point, argColorReg and fogColorReg are multiplied at 16-bit, so we need to sum.
	PADDW(argColorReg, R(fogColorReg));
	regCache_.Release(fogColorReg, RegCache::VEC_TEMP1);
	regCache_.Release(invertReg, RegCache::VEC_TEMP2);
	regCache_.Release(fogMultReg, RegCache::VEC_TEMP3);

	// Now we simply divide by 256, or in other words shift by 8.
	PSRLW(argColorReg, 8);

	// Okay, put A back in, we'll shrink it to 8888 when needed.
	PINSRW(argColorReg, R(alphaReg), 3);
	regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);

	// We most likely won't use alphaReg again.
	// It's only unlocked (not released), so GEN_SRC_ALPHA remains available if something does.
	regCache_.Unlock(alphaReg, RegCache::GEN_SRC_ALPHA);

	return true;
}

bool PixelJitCache::Jit_StencilAndDepthTest(const PixelFuncID &id) {
	_assert_(!id.clearMode && id.stencilTest);

	X64Reg stencilReg = GetDestStencil(id);
	Describe("StencilAndDepth");
	X64Reg maskedReg = stencilReg;
	if (id.hasStencilTestMask && stencilReg != INVALID_REG) {
		X64Reg idReg = GetPixelID();
		maskedReg = regCache_.Alloc(RegCache::GEN_TEMP0);
		MOV(32, R(maskedReg), R(stencilReg));
		AND(8, R(maskedReg), MDisp(idReg, offsetof(PixelFuncID, cached.stencilTestMask)));
		UnlockPixelID(idReg);
	}

	bool success = true;
	success = success && Jit_StencilTest(id, stencilReg, maskedReg);
	if (maskedReg != stencilReg)
		regCache_.Release(maskedReg, RegCache::GEN_TEMP0);

	// Next up, the depth test.
	if (stencilReg == INVALID_REG) {
		// Just use the standard one, since we don't need to write stencil.
		// We also don't need to worry about cleanup either.
		return success && Jit_DepthTest(id);
	}

	success = success && Jit_DepthTestForStencil(id, stencilReg);
	success = success && Jit_ApplyStencilOp(id, id.ZPass(), stencilReg);

	// At this point, stencilReg can't be spilled.  It contains the updated value.
	regCache_.Unlock(stencilReg, RegCache::GEN_STENCIL);
	regCache_.ForceRetain(RegCache::GEN_STENCIL);

	return success;
}

// Emits the stencil comparison and its failure path.
//   stencilReg: the destination stencil value, or INVALID_REG when it's a fixed zero.
//   maskedReg: the value to actually compare against the ref (may be the same as stencilReg.)
// When the outcome is known at compile time, no compare or branch is emitted for it.
// On failure, the generated code applies the SFail op, writes the stencil back, and discards.
// Code emitted after this function returns is the passing path.
// Returns false if applying or writing the stencil failed.
bool PixelJitCache::Jit_StencilTest(const PixelFuncID &id, RegCache::Reg stencilReg, RegCache::Reg maskedReg) {
	Describe("StencilTest");

	bool hasFixedResult = false;
	bool fixedResult = false;
	// Only set (and only used) when the result isn't fixed.
	FixupBranch toPass;
	if (stencilReg == INVALID_REG) {
		// This means stencil is a fixed value 0.
		// Each result below is the comparison "ref FUNC 0", evaluated at compile time.
		hasFixedResult = true;
		switch (id.StencilTestFunc()) {
		case GE_COMP_NEVER: fixedResult = false; break;
		case GE_COMP_ALWAYS: fixedResult = true; break;
		case GE_COMP_EQUAL: fixedResult = id.stencilTestRef == 0; break;
		case GE_COMP_NOTEQUAL: fixedResult = id.stencilTestRef != 0; break;
		case GE_COMP_LESS: fixedResult = false; break;
		case GE_COMP_LEQUAL: fixedResult = id.stencilTestRef == 0; break;
		case GE_COMP_GREATER: fixedResult = id.stencilTestRef != 0; break;
		case GE_COMP_GEQUAL: fixedResult = true; break;
		}
	} else if (id.StencilTestFunc() == GE_COMP_ALWAYS) {
		// Fairly common, skip the CMP.
		hasFixedResult = true;
		fixedResult = true;
	} else {
		// Reversed here because of the imm, so tests below are reversed.
		// In other words, the test is "ref FUNC stencil", but the CMP has stencil on the left.
		CMP(8, R(maskedReg), Imm8(id.stencilTestRef));
		switch (id.StencilTestFunc()) {
		case GE_COMP_NEVER:
			hasFixedResult = true;
			fixedResult = false;
			break;

		case GE_COMP_ALWAYS:
			// Handled by the else if above.
			_assert_(false);
			break;

		case GE_COMP_EQUAL:
			toPass = J_CC(CC_E);
			break;

		case GE_COMP_NOTEQUAL:
			toPass = J_CC(CC_NE);
			break;

		case GE_COMP_LESS:
			toPass = J_CC(CC_A);
			break;

		case GE_COMP_LEQUAL:
			toPass = J_CC(CC_AE);
			break;

		case GE_COMP_GREATER:
			toPass = J_CC(CC_B);
			break;

		case GE_COMP_GEQUAL:
			toPass = J_CC(CC_BE);
			break;
		}
	}

	// Always fails and there's no stencil to update, so all that's left is the discard.
	if (hasFixedResult && !fixedResult && stencilReg == INVALID_REG) {
		Discard();
		return true;
	}

	// Note what the reg cache had before emitting the fail path, to detect what it adds.
	bool hadColorOffReg = regCache_.Has(RegCache::GEN_COLOR_OFF);
	bool hadIdReg = regCache_.Has(RegCache::GEN_ID);

	bool success = true;
	if (stencilReg != INVALID_REG && (!hasFixedResult || !fixedResult)) {
		// This is the fail path.
		success = success && Jit_ApplyStencilOp(id, id.SFail(), stencilReg);
		success = success && Jit_WriteStencilOnly(id, stencilReg);

		Discard();
	}

	// If we allocated either id or colorOff in the conditional, forget.
	// The passing path jumps over the code that loaded them, so it can't rely on their values.
	if (!hadColorOffReg && regCache_.Has(RegCache::GEN_COLOR_OFF))
		regCache_.Change(RegCache::GEN_COLOR_OFF, RegCache::GEN_INVALID);
	if (!hadIdReg && regCache_.Has(RegCache::GEN_ID))
		regCache_.Change(RegCache::GEN_ID, RegCache::GEN_INVALID);

	// The passing branch from the comparison above lands here.
	if (!hasFixedResult)
		SetJumpTarget(toPass);
	return success;
}

// Emits the depth test for use with stencil: rather than discarding right away on failure,
// the generated code applies the ZFail op to stencilReg, writes the stencil back, and then discards.
// Nothing is emitted when the depth func is ALWAYS or early Z checks are in use.
// Code emitted after this function returns is the passing path.
// Returns false if applying or writing the stencil failed.
bool PixelJitCache::Jit_DepthTestForStencil(const PixelFuncID &id, RegCache::Reg stencilReg) {
	if (id.DepthTestFunc() == GE_COMP_ALWAYS || id.earlyZChecks)
		return true;

	X64Reg depthOffReg = GetDepthOff(id);
	Describe("DepthTestStencil");
	X64Reg argZReg = regCache_.Find(RegCache::GEN_ARG_Z);
	// Compare the low 16 bits of the new z against the 16-bit value in the depth buffer.
	CMP(16, R(argZReg), MatR(depthOffReg));
	regCache_.Unlock(depthOffReg, RegCache::GEN_DEPTH_OFF);
	regCache_.Unlock(argZReg, RegCache::GEN_ARG_Z);

	// We discard the opposite of the passing test.
	// Here, that means jumping over the fail path below when the test passes (unsigned compare.)
	FixupBranch skip;
	switch (id.DepthTestFunc()) {
	case GE_COMP_NEVER:
		// Shouldn't happen, just do an extra CMP.
		CMP(32, R(RAX), R(RAX));
		// This is just to have a skip that is valid.
		// Comparing a register with itself is always equal, so this is never taken.
		skip = J_CC(CC_NE);
		break;

	case GE_COMP_ALWAYS:
		// Shouldn't happen, just do an extra CMP.
		// (we returned early for ALWAYS above.)  This one would always be taken.
		CMP(32, R(RAX), R(RAX));
		skip = J_CC(CC_E);
		break;

	case GE_COMP_EQUAL:
		skip = J_CC(CC_E);
		break;

	case GE_COMP_NOTEQUAL:
		skip = J_CC(CC_NE);
		break;

	case GE_COMP_LESS:
		skip = J_CC(CC_B);
		break;

	case GE_COMP_LEQUAL:
		skip = J_CC(CC_BE);
		break;

	case GE_COMP_GREATER:
		skip = J_CC(CC_A);
		break;

	case GE_COMP_GEQUAL:
		skip = J_CC(CC_AE);
		break;
	}

	// Note what the reg cache had before emitting the fail path, to detect what it adds.
	bool hadColorOffReg = regCache_.Has(RegCache::GEN_COLOR_OFF);
	bool hadIdReg = regCache_.Has(RegCache::GEN_ID);

	// This is the depth fail path: update stencil using ZFail, store it, and discard the pixel.
	bool success = true;
	success = success && Jit_ApplyStencilOp(id, id.ZFail(), stencilReg);
	success = success && Jit_WriteStencilOnly(id, stencilReg);
	Discard();

	// If we allocated either id or colorOff in the conditional, forget.
	// The passing path jumps over the code that loaded them, so it can't rely on their values.
	if (!hadColorOffReg && regCache_.Has(RegCache::GEN_COLOR_OFF))
		regCache_.Change(RegCache::GEN_COLOR_OFF, RegCache::GEN_INVALID);
	if (!hadIdReg && regCache_.Has(RegCache::GEN_ID))
		regCache_.Change(RegCache::GEN_ID, RegCache::GEN_INVALID);

	SetJumpTarget(skip);

	// Like in Jit_DepthTest(), at this point we may not need this reg anymore.
	if (!id.depthWrite)
		regCache_.ForceRelease(RegCache::GEN_ARG_Z);

	return success;
}

// Emits code to apply a stencil operation in place to the 8-bit value in stencilReg.
// stencilReg must be valid (565 is expected to be handled by callers, and is a no-op here.)
// The value is in the expanded form loaded by GetDestStencil(): for 5551 the one bit fills
// the whole byte, and for 4444 the nibble is duplicated (so one step is 0x11.)
// INCR and DECR saturate instead of wrapping.  Always succeeds.
bool PixelJitCache::Jit_ApplyStencilOp(const PixelFuncID &id, GEStencilOp op, RegCache::Reg stencilReg) {
	_assert_(stencilReg != INVALID_REG);

	Describe("ApplyStencil");
	FixupBranch skip;
	switch (op) {
	case GE_STENCILOP_KEEP:
		// Nothing to do.
		break;

	case GE_STENCILOP_ZERO:
		XOR(32, R(stencilReg), R(stencilReg));
		break;

	case GE_STENCILOP_REPLACE:
		if (id.hasStencilTestMask) {
			// Load the unmasked value.
			X64Reg idReg = GetPixelID();
			MOVZX(32, 8, stencilReg, MDisp(idReg, offsetof(PixelFuncID, cached.stencilRef)));
			UnlockPixelID(idReg);
		} else {
			// Without a mask, the ref from the id is baked directly into the code.
			MOV(8, R(stencilReg), Imm8(id.stencilTestRef));
		}
		break;

	case GE_STENCILOP_INVERT:
		NOT(8, R(stencilReg));
		break;

	case GE_STENCILOP_INCR:
		switch (id.fbFormat) {
		case GE_FORMAT_565:
			break;

		case GE_FORMAT_5551:
			// With a single bit, incrementing always results in the max value.
			MOV(8, R(stencilReg), Imm8(0xFF));
			break;

		case GE_FORMAT_4444:
			// Skip the add if the top nibble is already at its max.
			CMP(8, R(stencilReg), Imm8(0xF0));
			skip = J_CC(CC_AE);
			ADD(8, R(stencilReg), Imm8(0x11));
			SetJumpTarget(skip);
			break;

		case GE_FORMAT_8888:
			// Skip the add if already at 0xFF.
			CMP(8, R(stencilReg), Imm8(0xFF));
			skip = J_CC(CC_E);
			ADD(8, R(stencilReg), Imm8(0x01));
			SetJumpTarget(skip);
			break;
		}
		break;

	case GE_STENCILOP_DECR:
		switch (id.fbFormat) {
		case GE_FORMAT_565:
			break;

		case GE_FORMAT_5551:
			// With a single bit, decrementing always results in zero.
			XOR(32, R(stencilReg), R(stencilReg));
			break;

		case GE_FORMAT_4444:
			// Skip the subtract if the value is below one step.
			CMP(8, R(stencilReg), Imm8(0x11));
			skip = J_CC(CC_B);
			SUB(8, R(stencilReg), Imm8(0x11));
			SetJumpTarget(skip);
			break;

		case GE_FORMAT_8888:
			// Skip the subtract if already at zero.
			CMP(8, R(stencilReg), Imm8(0x00));
			skip = J_CC(CC_E);
			SUB(8, R(stencilReg), Imm8(0x01));
			SetJumpTarget(skip);
			break;
		}
		break;
	}

	return true;
}

// Emits code to store just the stencil bits of stencilReg into the destination pixel, leaving
// its color bits as they were.  Used on the paths that discard the pixel after a failed test.
// stencilReg's value is destroyed in most cases.  When the color write mask applies, bits set
// in the mask are kept from the destination rather than written.  Always succeeds.
bool PixelJitCache::Jit_WriteStencilOnly(const PixelFuncID &id, RegCache::Reg stencilReg) {
	_assert_(stencilReg != INVALID_REG);

	// It's okay to destroy stencilReg here, we know we're the last writing it.
	X64Reg colorOffReg = GetColorOff(id);
	Describe("WriteStencil");
	if (id.applyColorWriteMask) {
		X64Reg idReg = GetPixelID();
		X64Reg maskReg = regCache_.Alloc(RegCache::GEN_TEMP5);

		// In each case below: dest byte = (dest & mask) | (stencil & ~mask).
		switch (id.fbFormat) {
		case GE_FORMAT_565:
			break;

		case GE_FORMAT_5551:
			// Read the high 8 bits of the 16-bit color mask.
			MOVZX(32, 8, maskReg, MDisp(idReg, offsetof(PixelFuncID, cached.colorWriteMask) + 1));
			// Force the low 7 bits (which aren't stencil) to be kept from the destination.
			OR(8, R(maskReg), Imm8(0x7F));

			// Poor man's BIC...
			// ~(~stencil | mask) is the same as stencil & ~mask.
			NOT(32, R(stencilReg));
			OR(32, R(stencilReg), R(maskReg));
			NOT(32, R(stencilReg));

			AND(8, MDisp(colorOffReg, 1), R(maskReg));
			OR(8, MDisp(colorOffReg, 1), R(stencilReg));
			break;

		case GE_FORMAT_4444:
			// Read the high 8 bits of the 16-bit color mask.
			MOVZX(32, 8, maskReg, MDisp(idReg, offsetof(PixelFuncID, cached.colorWriteMask) + 1));
			// Force the low nibble (which isn't stencil) to be kept from the destination.
			OR(8, R(maskReg), Imm8(0x0F));

			// Poor man's BIC...
			// ~(~stencil | mask) is the same as stencil & ~mask.
			NOT(32, R(stencilReg));
			OR(32, R(stencilReg), R(maskReg));
			NOT(32, R(stencilReg));

			AND(8, MDisp(colorOffReg, 1), R(maskReg));
			OR(8, MDisp(colorOffReg, 1), R(stencilReg));
			break;

		case GE_FORMAT_8888:
			// Read the high 8 bits of the 32-bit color mask.
			// The whole byte is stencil here, so the mask is used as is.
			MOVZX(32, 8, maskReg, MDisp(idReg, offsetof(PixelFuncID, cached.colorWriteMask) + 3));

			// Poor man's BIC...
			// ~(~stencil | mask) is the same as stencil & ~mask.
			NOT(32, R(stencilReg));
			OR(32, R(stencilReg), R(maskReg));
			NOT(32, R(stencilReg));

			AND(8, MDisp(colorOffReg, 3), R(maskReg));
			OR(8, MDisp(colorOffReg, 3), R(stencilReg));
			break;
		}

		regCache_.Release(maskReg, RegCache::GEN_TEMP5);
		UnlockPixelID(idReg);
	} else {
		// No write mask, so simply replace the stencil bits within the destination's high byte.
		switch (id.fbFormat) {
		case GE_FORMAT_565:
			break;

		case GE_FORMAT_5551:
			// Stencil is the top bit only.
			AND(8, R(stencilReg), Imm8(0x80));
			AND(8, MDisp(colorOffReg, 1), Imm8(0x7F));
			OR(8, MDisp(colorOffReg, 1), R(stencilReg));
			break;

		case GE_FORMAT_4444:
			// Stencil is the top nibble.
			AND(8, MDisp(colorOffReg, 1), Imm8(0x0F));
			AND(8, R(stencilReg), Imm8(0xF0));
			OR(8, MDisp(colorOffReg, 1), R(stencilReg));
			break;

		case GE_FORMAT_8888:
			// Stencil is the entire top byte, so a plain store works.
			MOV(8, MDisp(colorOffReg, 3), R(stencilReg));
			break;
		}
	}

	regCache_.Unlock(colorOffReg, RegCache::GEN_COLOR_OFF);
	return true;
}

bool PixelJitCache::Jit_DepthTest(const PixelFuncID &id) {
	if (id.DepthTestFunc() == GE_COMP_ALWAYS || id.earlyZChecks)
		return true;

	if (id.DepthTestFunc() == GE_COMP_NEVER) {
		Discard();
		// This should be uncommon, just keep going to have shared cleanup...
	}

	X64Reg depthOffReg = GetDepthOff(id);
	Describe("DepthTest");
	X64Reg argZReg = regCache_.Find(RegCache::GEN_ARG_Z);
	CMP(16, R(argZReg), MatR(depthOffReg));
	regCache_.Unlock(depthOffReg, RegCache::GEN_DEPTH_OFF);
	regCache_.Unlock(argZReg, RegCache::GEN_ARG_Z);

	// We discard the opposite of the passing test.
	switch (id.DepthTestFunc()) {
	case GE_COMP_NEVER:
	case GE_COMP_ALWAYS:
		break;

	case GE_COMP_EQUAL:
		Discard(CC_NE);
		break;

	case GE_COMP_NOTEQUAL:
		Discard(CC_E);
		break;

	case GE_COMP_LESS:
		Discard(CC_AE);
		break;

	case GE_COMP_LEQUAL:
		Discard(CC_A);
		break;

	case GE_COMP_GREATER:
		Discard(CC_BE);
		break;

	case GE_COMP_GEQUAL:
		Discard(CC_B);
		break;
	}

	// If we're not writing, we don't need Z anymore.  We'll free GEN_DEPTH_OFF in Jit_WriteDepth().
	if (!id.depthWrite)
		regCache_.ForceRelease(RegCache::GEN_ARG_Z);

	return true;
}

bool PixelJitCache::Jit_WriteDepth(const PixelFuncID &id) {
	// Clear mode shares depthWrite for DepthClear().
	if (id.depthWrite) {
		X64Reg depthOffReg = GetDepthOff(id);
		Describe("WriteDepth");
		X64Reg argZReg = regCache_.Find(RegCache::GEN_ARG_Z);
		MOV(16, MatR(depthOffReg), R(argZReg));
		regCache_.Unlock(depthOffReg, RegCache::GEN_DEPTH_OFF);
		regCache_.Unlock(argZReg, RegCache::GEN_ARG_Z);
		regCache_.ForceRelease(RegCache::GEN_ARG_Z);
	}

	// We can free up this reg if we force locked it.
	if (regCache_.Has(RegCache::GEN_DEPTH_OFF)) {
		regCache_.ForceRelease(RegCache::GEN_DEPTH_OFF);
	}

	return true;
}

// Emits code that alpha blends VEC_ARG_COLOR with the framebuffer pixel.
//
// Emits nothing and returns true when id.alphaBlend is off.  Otherwise the
// emitted code runs in three steps: load the destination pixel (converting
// 16-bit formats via the Jit_ConvertFrom* helpers), scale source and destination
// by their blend factors, and then apply the blend equation.  The result is left
// in VEC_ARG_COLOR, and colorIs16Bit_ is updated to say whether that register
// holds 16-bit or 8-bit channels.
//
// Returns false if any of the helpers it calls reported failure.
bool PixelJitCache::Jit_AlphaBlend(const PixelFuncID &id) {
	if (!id.alphaBlend)
		return true;

	// Check if we need to load and prep factors.
	PixelBlendState blendState;
	ComputePixelBlendState(blendState, id);

	bool success = true;

	// Step 1: Load and expand dest color.
	X64Reg dstReg = regCache_.Alloc(RegCache::VEC_TEMP0);
	if (!blendState.readsDstPixel) {
		// Let's load colorOff just for registers to be consistent.
		X64Reg colorOff = GetColorOff(id);
		regCache_.Unlock(colorOff, RegCache::GEN_COLOR_OFF);

		// The dest pixel isn't read, so just use zero for it.
		PXOR(dstReg, R(dstReg));
	} else if (id.FBFormat() == GE_FORMAT_8888) {
		// Already 8888, so it can be loaded directly into the vector reg.
		X64Reg colorOff = GetColorOff(id);
		Describe("AlphaBlend");
		MOVD_xmm(dstReg, MatR(colorOff));
		regCache_.Unlock(colorOff, RegCache::GEN_COLOR_OFF);
	} else {
		// 16-bit formats: load into a general reg, convert, then move to the vector reg.
		X64Reg colorOff = GetColorOff(id);
		Describe("AlphaBlend");
		X64Reg dstGenReg = regCache_.Alloc(RegCache::GEN_TEMP0);
		MOVZX(32, 16, dstGenReg, MatR(colorOff));
		regCache_.Unlock(colorOff, RegCache::GEN_COLOR_OFF);

		X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
		X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);

		switch (id.fbFormat) {
		case GE_FORMAT_565:
			success = success && Jit_ConvertFrom565(id, dstGenReg, temp1Reg, temp2Reg);
			break;

		case GE_FORMAT_5551:
			success = success && Jit_ConvertFrom5551(id, dstGenReg, temp1Reg, temp2Reg, blendState.usesDstAlpha);
			break;

		case GE_FORMAT_4444:
			success = success && Jit_ConvertFrom4444(id, dstGenReg, temp1Reg, temp2Reg, blendState.usesDstAlpha);
			break;

		case GE_FORMAT_8888:
			// Handled by the branch above, listed only to cover the enum.
			break;
		}

		Describe("AlphaBlend");
		MOVD_xmm(dstReg, R(dstGenReg));

		regCache_.Release(dstGenReg, RegCache::GEN_TEMP0);
		regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
		regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
	}

	// Step 2: Load and apply factors.
	X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
	if (blendState.usesFactors) {
		X64Reg srcFactorReg = regCache_.Alloc(RegCache::VEC_TEMP1);
		X64Reg dstFactorReg = regCache_.Alloc(RegCache::VEC_TEMP2);

		// We apply these at 16-bit, because they can be doubled and have a half offset.
		if (cpu_info.bSSE4_1) {
			if (!colorIs16Bit_)
				PMOVZXBW(argColorReg, R(argColorReg));
			PMOVZXBW(dstReg, R(dstReg));
		} else {
			// Without SSE4.1, zero extend by interleaving with a zero vector.
			X64Reg zeroReg = GetZeroVec();
			if (!colorIs16Bit_)
				PUNPCKLBW(argColorReg, R(zeroReg));
			PUNPCKLBW(dstReg, R(zeroReg));
			regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
		}
		colorIs16Bit_ = true;

		// Skip multiplying by factors if we can.
		bool multiplySrc = id.AlphaBlendSrc() != PixelBlendFactor::ZERO && id.AlphaBlendSrc() != PixelBlendFactor::ONE;
		bool multiplyDst = id.AlphaBlendDst() != PixelBlendFactor::ZERO && id.AlphaBlendDst() != PixelBlendFactor::ONE;
		// We also shift left by 4, so mulhi gives us a free shift
		// We also need to add a half bit later, so this gives us space.
		if (multiplySrc || blendState.srcColorAsFactor)
			PSLLW(argColorReg, 4);
		if (multiplyDst || blendState.dstColorAsFactor || blendState.usesDstAlpha)
			PSLLW(dstReg, 4);

		// Okay, now grab our factors.  Don't bother if they're known values.
		if (id.AlphaBlendSrc() < PixelBlendFactor::ZERO)
			success = success && Jit_BlendFactor(id, srcFactorReg, dstReg, id.AlphaBlendSrc());
		if (id.AlphaBlendDst() < PixelBlendFactor::ZERO)
			success = success && Jit_DstBlendFactor(id, srcFactorReg, dstFactorReg, dstReg);

		X64Reg halfReg = INVALID_REG;
		if (multiplySrc || multiplyDst) {
			halfReg = regCache_.Alloc(RegCache::VEC_TEMP3);
			// We'll use this several times, so load into a reg.
			MOVDQA(halfReg, M(constBlendHalf_11_4s_));
		}

		// Add in the half bit to the factors and color values, then multiply.
		// We take the high 16 bits to get a free right shift by 16.
		if (multiplySrc) {
			POR(srcFactorReg, R(halfReg));
			POR(argColorReg, R(halfReg));
			PMULHUW(argColorReg, R(srcFactorReg));
		} else if (id.AlphaBlendSrc() == PixelBlendFactor::ZERO) {
			PXOR(argColorReg, R(argColorReg));
		} else if (id.AlphaBlendSrc() == PixelBlendFactor::ONE) {
			// No multiply, so undo the shift left by 4 from above if it was applied.
			if (blendState.srcColorAsFactor)
				PSRLW(argColorReg, 4);
		}

		if (multiplyDst) {
			POR(dstFactorReg, R(halfReg));
			POR(dstReg, R(halfReg));
			PMULHUW(dstReg, R(dstFactorReg));
		} else if (id.AlphaBlendDst() == PixelBlendFactor::ZERO) {
			// No need to add or subtract zero, unless we're negating.
			// This is common for bloom preparation.
			if (id.AlphaBlendEq() == GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE)
				PXOR(dstReg, R(dstReg));
		} else if (id.AlphaBlendDst() == PixelBlendFactor::ONE) {
			// No multiply, so undo the shift left by 4 from above if it was applied.
			if (blendState.dstColorAsFactor || blendState.usesDstAlpha)
				PSRLW(dstReg, 4);
		}

		regCache_.Release(srcFactorReg, RegCache::VEC_TEMP1);
		regCache_.Release(dstFactorReg, RegCache::VEC_TEMP2);
		if (halfReg != INVALID_REG)
			regCache_.Release(halfReg, RegCache::VEC_TEMP3);
	} else if (colorIs16Bit_) {
		// If it's expanded, shrink and clamp for our min/max/absdiff handling.
		PACKUSWB(argColorReg, R(argColorReg));
		colorIs16Bit_ = false;
	}

	// Step 3: Apply equation.
	// Note: below, we completely ignore what happens to the alpha bits.
	// It won't matter, since we'll replace those with stencil anyway.
	X64Reg tempReg = regCache_.Alloc(RegCache::VEC_TEMP1);
	switch (id.AlphaBlendEq()) {
	case GE_BLENDMODE_MUL_AND_ADD:
		// With a ZERO dst factor, there's nothing to add.
		if (id.AlphaBlendDst() != PixelBlendFactor::ZERO)
			PADDUSW(argColorReg, R(dstReg));
		break;

	case GE_BLENDMODE_MUL_AND_SUBTRACT:
		// With a ZERO dst factor, there's nothing to subtract.
		if (id.AlphaBlendDst() != PixelBlendFactor::ZERO)
			PSUBUSW(argColorReg, R(dstReg));
		break;

	case GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE:
		// Computes dst - src (saturating), with the result in argColorReg.
		if (cpu_info.bAVX) {
			VPSUBUSW(128, argColorReg, dstReg, R(argColorReg));
		} else {
			MOVDQA(tempReg, R(argColorReg));
			MOVDQA(argColorReg, R(dstReg));
			PSUBUSW(argColorReg, R(tempReg));
		}
		break;

	case GE_BLENDMODE_MIN:
		PMINUB(argColorReg, R(dstReg));
		break;

	case GE_BLENDMODE_MAX:
		PMAXUB(argColorReg, R(dstReg));
		break;

	case GE_BLENDMODE_ABSDIFF:
		// Calculate A=(dst-src < 0 ? 0 : dst-src) and B=(src-dst < 0 ? 0 : src-dst)...
		MOVDQA(tempReg, R(dstReg));
		PSUBUSB(tempReg, R(argColorReg));
		PSUBUSB(argColorReg, R(dstReg));

		// Now, one of those must be zero, and the other one is the result (could also be zero.)
		POR(argColorReg, R(tempReg));
		break;
	}

	regCache_.Release(dstReg, RegCache::VEC_TEMP0);
	regCache_.Release(tempReg, RegCache::VEC_TEMP1);
	regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);

	return success;
}

// Emits code that loads the blend factor selected by `factor` into factorReg,
// as 16-bit lanes.
//
// id: pixel func ID (not read directly here, the FIX value is loaded at runtime via GetPixelID().)
// factorReg: vector reg that receives the factor.
// dstReg: vector reg holding the expanded destination color, read for the
//   OTHERCOLOR and DSTALPHA style factors.
// factor: which factor to generate.
//
// Requires colorIs16Bit_ (asserted.)  Note that the FIX/default path always reads
// cached.alphaBlendSrc.  Always returns true.
bool PixelJitCache::Jit_BlendFactor(const PixelFuncID &id, RegCache::Reg factorReg, RegCache::Reg dstReg, PixelBlendFactor factor) {
	X64Reg idReg = INVALID_REG;
	X64Reg tempReg = INVALID_REG;
	X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);

	// Everything below expects an expanded 16-bit color
	_assert_(colorIs16Bit_);

	// Between source and dest factors, only DSTCOLOR, INVDSTCOLOR, and FIXA differ.
	// In those cases, it uses SRCCOLOR, INVSRCCOLOR, and FIXB respectively.

	// Load the invert constant first off, if needed.
	// The second switch below subtracts from it to produce the inverted factor.
	switch (factor) {
	case PixelBlendFactor::INVOTHERCOLOR:
	case PixelBlendFactor::INVSRCALPHA:
	case PixelBlendFactor::INVDSTALPHA:
	case PixelBlendFactor::DOUBLEINVSRCALPHA:
	case PixelBlendFactor::DOUBLEINVDSTALPHA:
		MOVDQA(factorReg, M(constBlendInvert_11_4s_));
		break;

	default:
		break;
	}

	switch (factor) {
	case PixelBlendFactor::OTHERCOLOR:
		MOVDQA(factorReg, R(dstReg));
		break;

	case PixelBlendFactor::INVOTHERCOLOR:
		PSUBUSW(factorReg, R(dstReg));
		break;

	case PixelBlendFactor::SRCALPHA:
		// Broadcast lane 3 (alpha) of the low four 16-bit lanes.
		PSHUFLW(factorReg, R(argColorReg), _MM_SHUFFLE(3, 3, 3, 3));
		break;

	case PixelBlendFactor::INVSRCALPHA:
		tempReg = regCache_.Alloc(RegCache::VEC_TEMP3);

		PSHUFLW(tempReg, R(argColorReg), _MM_SHUFFLE(3, 3, 3, 3));
		PSUBUSW(factorReg, R(tempReg));
		break;

	case PixelBlendFactor::DSTALPHA:
		PSHUFLW(factorReg, R(dstReg), _MM_SHUFFLE(3, 3, 3, 3));
		break;

	case PixelBlendFactor::INVDSTALPHA:
		tempReg = regCache_.Alloc(RegCache::VEC_TEMP3);

		PSHUFLW(tempReg, R(dstReg), _MM_SHUFFLE(3, 3, 3, 3));
		PSUBUSW(factorReg, R(tempReg));
		break;

	case PixelBlendFactor::DOUBLESRCALPHA:
		// Same as SRCALPHA, but doubled by shifting left one.
		PSHUFLW(factorReg, R(argColorReg), _MM_SHUFFLE(3, 3, 3, 3));
		PSLLW(factorReg, 1);
		break;

	case PixelBlendFactor::DOUBLEINVSRCALPHA:
		tempReg = regCache_.Alloc(RegCache::VEC_TEMP3);

		// Double the alpha first, then subtract (saturating) from the invert constant.
		PSHUFLW(tempReg, R(argColorReg), _MM_SHUFFLE(3, 3, 3, 3));
		PSLLW(tempReg, 1);
		PSUBUSW(factorReg, R(tempReg));
		break;

	case PixelBlendFactor::DOUBLEDSTALPHA:
		PSHUFLW(factorReg, R(dstReg), _MM_SHUFFLE(3, 3, 3, 3));
		PSLLW(factorReg, 1);
		break;

	case PixelBlendFactor::DOUBLEINVDSTALPHA:
		tempReg = regCache_.Alloc(RegCache::VEC_TEMP3);

		PSHUFLW(tempReg, R(dstReg), _MM_SHUFFLE(3, 3, 3, 3));
		PSLLW(tempReg, 1);
		PSUBUSW(factorReg, R(tempReg));
		break;

	case PixelBlendFactor::ZERO:
		// Special value meaning zero.
		PXOR(factorReg, R(factorReg));
		break;

	case PixelBlendFactor::ONE:
		// Special value meaning all 255s.
		// All bits set, then shifted so each 16-bit lane ends up as 0x0FF0 (255 << 4.)
		PCMPEQD(factorReg, R(factorReg));
		PSLLW(factorReg, 8);
		PSRLW(factorReg, 4);
		break;

	case PixelBlendFactor::FIX:
	default:
		// Load the fixed factor bytes from the PixelFuncID at runtime, expanding to 16-bit.
		idReg = GetPixelID();
		if (cpu_info.bSSE4_1) {
			PMOVZXBW(factorReg, MDisp(idReg, offsetof(PixelFuncID, cached.alphaBlendSrc)));
		} else {
			X64Reg zeroReg = GetZeroVec();
			MOVD_xmm(factorReg, MDisp(idReg, offsetof(PixelFuncID, cached.alphaBlendSrc)));
			PUNPCKLBW(factorReg, R(zeroReg));
			regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
		}
		// Round it out by shifting into place.
		PSLLW(factorReg, 4);
		break;
	}

	// Clean up whichever optional registers were grabbed above.
	if (idReg != INVALID_REG)
		UnlockPixelID(idReg);
	if (tempReg != INVALID_REG)
		regCache_.Release(tempReg, RegCache::VEC_TEMP3);
	regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);

	return true;
}

// Emits code that loads the destination blend factor (id.AlphaBlendDst()) into
// dstFactorReg, as 16-bit lanes.
//
// srcFactorReg: vector reg read when the dst factor is copied or inverted from
//   the source factor.  NOTE(review): this presumably relies on the caller having
//   already generated the source factor into it - confirm against the caller.
// dstFactorReg: vector reg that receives the dst factor.
// dstReg: vector reg with the expanded destination color, passed through to
//   Jit_BlendFactor() when the factor can't be derived from srcFactorReg.
//
// Requires colorIs16Bit_ (asserted.)  Returns false only if Jit_BlendFactor() fails.
bool PixelJitCache::Jit_DstBlendFactor(const PixelFuncID &id, RegCache::Reg srcFactorReg, RegCache::Reg dstFactorReg, RegCache::Reg dstReg) {
	bool success = true;
	X64Reg idReg = INVALID_REG;
	X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);

	// Everything below expects an expanded 16-bit color
	_assert_(colorIs16Bit_);

	PixelBlendState blendState;
	ComputePixelBlendState(blendState, id);

	// We might be able to reuse srcFactorReg for dst, in some cases.
	switch (id.AlphaBlendDst()) {
	case PixelBlendFactor::OTHERCOLOR:
		// For the dst factor, the "other" color is the source color.
		MOVDQA(dstFactorReg, R(argColorReg));
		break;

	case PixelBlendFactor::INVOTHERCOLOR:
		MOVDQA(dstFactorReg, M(constBlendInvert_11_4s_));
		PSUBUSW(dstFactorReg, R(argColorReg));
		break;

	case PixelBlendFactor::SRCALPHA:
	case PixelBlendFactor::INVSRCALPHA:
	case PixelBlendFactor::DSTALPHA:
	case PixelBlendFactor::INVDSTALPHA:
	case PixelBlendFactor::DOUBLESRCALPHA:
	case PixelBlendFactor::DOUBLEINVSRCALPHA:
	case PixelBlendFactor::DOUBLEDSTALPHA:
	case PixelBlendFactor::DOUBLEINVDSTALPHA:
	case PixelBlendFactor::ZERO:
	case PixelBlendFactor::ONE:
		// These are all equivalent for src factor, so reuse that logic.
		if (id.AlphaBlendSrc() == id.AlphaBlendDst()) {
			// Same factor as source, so just copy it.
			MOVDQA(dstFactorReg, R(srcFactorReg));
		} else if (blendState.dstFactorIsInverse) {
			// Flagged as the inverse of the source factor, so subtract it from the invert constant.
			MOVDQA(dstFactorReg, M(constBlendInvert_11_4s_));
			PSUBUSW(dstFactorReg, R(srcFactorReg));
		} else {
			success = success && Jit_BlendFactor(id, dstFactorReg, dstReg, id.AlphaBlendDst());
		}
		break;

	case PixelBlendFactor::FIX:
	default:
		// Load the fixed dst factor bytes from the PixelFuncID at runtime, expanding to 16-bit.
		idReg = GetPixelID();
		if (cpu_info.bSSE4_1) {
			PMOVZXBW(dstFactorReg, MDisp(idReg, offsetof(PixelFuncID, cached.alphaBlendDst)));
		} else {
			X64Reg zeroReg = GetZeroVec();
			MOVD_xmm(dstFactorReg, MDisp(idReg, offsetof(PixelFuncID, cached.alphaBlendDst)));
			PUNPCKLBW(dstFactorReg, R(zeroReg));
			regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
		}
		// Round it out by shifting into place.
		PSLLW(dstFactorReg, 4);
		break;
	}

	if (idReg != INVALID_REG)
		UnlockPixelID(idReg);
	regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);

	return success;
}

// Emits code that adds the dither matrix entry for (x & 3, y & 3) to the RGB
// channels of VEC_ARG_COLOR.
//
// Emits nothing and returns true when id.dithering is off.  As side effects on
// the register cache, GEN_ARG_X and GEN_ARG_Y are force released (x is clobbered),
// GEN_COLOR_OFF is calculated and force retained, and the color is left expanded
// to 16-bit (colorIs16Bit_ = true.)  Always returns true.
bool PixelJitCache::Jit_Dither(const PixelFuncID &id) {
	if (!id.dithering)
		return true;

	Describe("Dither");
	X64Reg valueReg = regCache_.Alloc(RegCache::GEN_TEMP0);

	// Load the row dither matrix entry (will still need to get the X.)
	X64Reg argYReg = regCache_.Find(RegCache::GEN_ARG_Y);
	MOV(32, R(valueReg), R(argYReg));
	AND(32, R(valueReg), Imm8(3));

	// At this point, we're done with depth and y, so let's grab GEN_COLOR_OFF and retain it.
	// Then we can modify x and throw it away too, which is our actual goal.
	X64Reg colorOffReg = GetColorOff(id);
	Describe("Dither");
	regCache_.Unlock(colorOffReg, RegCache::GEN_COLOR_OFF);
	regCache_.ForceRetain(RegCache::GEN_COLOR_OFF);
	// And get rid of y, we can use for other regs.
	regCache_.Unlock(argYReg, RegCache::GEN_ARG_Y);
	regCache_.ForceRelease(RegCache::GEN_ARG_Y);

	// This modifies x in place, which is fine since it's released below.
	X64Reg argXReg = regCache_.Find(RegCache::GEN_ARG_X);
	AND(32, R(argXReg), Imm32(3));

	// Sum up (x + y * 4) + ditherMatrix offset to valueReg.
	LEA(32, valueReg, MComplex(argXReg, valueReg, 4, offsetof(PixelFuncID, cached.ditherMatrix)));

	// Okay, now abuse argXReg to read the PixelFuncID pointer on the stack.
	// (Only needed when the ID isn't available in a register, otherwise we use that.)
	if (regCache_.Has(RegCache::GEN_ARG_ID) || regCache_.Has(RegCache::GEN_ID)) {
		X64Reg idReg = GetPixelID();
		// The matrix entry is a signed byte, so sign extend it.
		MOVSX(32, 8, valueReg, MRegSum(idReg, valueReg));
		UnlockPixelID(idReg);
	} else {
		_assert_(stackIDOffset_ != -1);
		MOV(PTRBITS, R(argXReg), MDisp(RSP, stackIDOffset_));
		MOVSX(32, 8, valueReg, MRegSum(argXReg, valueReg));
	}
	regCache_.Unlock(argXReg, RegCache::GEN_ARG_X);
	regCache_.ForceRelease(RegCache::GEN_ARG_X);

	// Copy that value into a vec to add to the color.
	X64Reg vecValueReg = regCache_.Alloc(RegCache::VEC_TEMP0);
	MOVD_xmm(vecValueReg, R(valueReg));
	regCache_.Release(valueReg, RegCache::GEN_TEMP0);

	// Now we want to broadcast RGB in 16-bit, but keep A as 0.
	// Luckily, we know that third lane (in 16-bit) is zero from MOVD clearing it.
	// We use 16-bit because we need a signed add, but we also want to saturate.
	PSHUFLW(vecValueReg, R(vecValueReg), _MM_SHUFFLE(2, 0, 0, 0));

	// With that, now let's convert the color to 16 bit...
	X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
	if (!colorIs16Bit_) {
		if (cpu_info.bSSE4_1) {
			PMOVZXBW(argColorReg, R(argColorReg));
		} else {
			X64Reg zeroReg = GetZeroVec();
			PUNPCKLBW(argColorReg, R(zeroReg));
			regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
		}
		colorIs16Bit_ = true;
	}
	// And simply add the dither values.
	PADDSW(argColorReg, R(vecValueReg));
	regCache_.Release(vecValueReg, RegCache::VEC_TEMP0);
	regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);

	return true;
}

// Emits the final framebuffer write: packs VEC_ARG_COLOR back to 8-bit channels,
// converts to the framebuffer format, combines stencil (GEN_STENCIL, if present),
// applies the logic op and color write mask when enabled, and stores the pixel
// at GEN_COLOR_OFF.
//
// In clear mode, some combinations have nothing left to write (or only need the
// stencil written via Jit_WriteStencilOnly()) and return early.
//
// Force releases VEC_ARG_COLOR, GEN_COLOR_OFF, and (when present on the normal
// path) GEN_STENCIL, and also GEN_ARG_X / GEN_ARG_Y if x was still held.
// Returns false if any helper reported failure.
bool PixelJitCache::Jit_WriteColor(const PixelFuncID &id) {
	X64Reg colorOff = GetColorOff(id);
	Describe("WriteColor");
	if (regCache_.Has(RegCache::GEN_ARG_X)) {
		// We normally toss x and y during dithering or useStandardStride with no dithering.
		// Free up the regs now to get more reg space.
		regCache_.ForceRelease(RegCache::GEN_ARG_X);
		regCache_.ForceRelease(RegCache::GEN_ARG_Y);

		// But make sure we don't lose GEN_COLOR_OFF, we'll be lost without that now.
		regCache_.ForceRetain(RegCache::GEN_COLOR_OFF);
	}

	// Convert back to 8888 and clamp.
	X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
	if (colorIs16Bit_) {
		PACKUSWB(argColorReg, R(argColorReg));
		colorIs16Bit_ = false;
	}

	if (id.clearMode) {
		// Not clearing color or stencil means there's nothing to write here.
		// 565 has no stencil bits, so stencil only is also nothing to write.
		bool drawingDone = false;
		if (!id.ColorClear() && !id.StencilClear())
			drawingDone = true;
		if (!id.ColorClear() && id.FBFormat() == GE_FORMAT_565)
			drawingDone = true;

		bool success = true;
		if (!id.ColorClear() && !drawingDone) {
			// Let's reuse Jit_WriteStencilOnly for this path.
			X64Reg alphaReg;
			if (regCache_.Has(RegCache::GEN_SRC_ALPHA)) {
				alphaReg = regCache_.Find(RegCache::GEN_SRC_ALPHA);
			} else {
				// Grab the alpha from the top byte of the packed color.
				alphaReg = regCache_.Alloc(RegCache::GEN_SRC_ALPHA);
				MOVD_xmm(R(alphaReg), argColorReg);
				SHR(32, R(alphaReg), Imm8(24));
			}
			success = Jit_WriteStencilOnly(id, alphaReg);
			regCache_.Release(alphaReg, RegCache::GEN_SRC_ALPHA);

			drawingDone = true;
		}

		if (drawingDone) {
			regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);
			regCache_.ForceRelease(RegCache::VEC_ARG_COLOR);
			regCache_.Unlock(colorOff, RegCache::GEN_COLOR_OFF);
			regCache_.ForceRelease(RegCache::GEN_COLOR_OFF);
			return success;
		}

		// In this case, we're clearing only color or only color and stencil.  Proceed.
	}

	// Move the packed color to a general reg, we're done with the vector color now.
	X64Reg colorReg = regCache_.Alloc(RegCache::GEN_TEMP0);
	MOVD_xmm(R(colorReg), argColorReg);
	regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);
	regCache_.ForceRelease(RegCache::VEC_ARG_COLOR);

	X64Reg stencilReg = INVALID_REG;
	if (regCache_.Has(RegCache::GEN_STENCIL))
		stencilReg = regCache_.Find(RegCache::GEN_STENCIL);

	X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
	X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
	// convertAlpha: keep the color's alpha through format conversion (stencil clear.)
	// writeAlpha: the alpha/stencil bits of the pixel get written, from either source.
	// fixedKeepMask: bits of the existing pixel to preserve when not writing alpha.
	bool convertAlpha = id.clearMode && id.StencilClear();
	bool writeAlpha = convertAlpha || stencilReg != INVALID_REG;
	uint32_t fixedKeepMask = 0x00000000;

	bool success = true;

	// Step 1: Load the color into colorReg.
	switch (id.fbFormat) {
	case GE_FORMAT_565:
		// In this case, stencil doesn't matter.
		success = success && Jit_ConvertTo565(id, colorReg, temp1Reg, temp2Reg);
		break;

	case GE_FORMAT_5551:
		success = success && Jit_ConvertTo5551(id, colorReg, temp1Reg, temp2Reg, convertAlpha);

		if (stencilReg != INVALID_REG) {
			// Truncate off the top bit of the stencil.
			SHR(32, R(stencilReg), Imm8(7));
			SHL(32, R(stencilReg), Imm8(15));
		} else if (!writeAlpha) {
			fixedKeepMask = 0x8000;
		}
		break;

	case GE_FORMAT_4444:
		success = success && Jit_ConvertTo4444(id, colorReg, temp1Reg, temp2Reg, convertAlpha);

		if (stencilReg != INVALID_REG) {
			// Truncate off the top bit of the stencil.
			SHR(32, R(stencilReg), Imm8(4));
			SHL(32, R(stencilReg), Imm8(12));
		} else if (!writeAlpha) {
			fixedKeepMask = 0xF000;
		}
		break;

	case GE_FORMAT_8888:
		if (stencilReg != INVALID_REG) {
			SHL(32, R(stencilReg), Imm8(24));
			// Clear out the alpha bits so we can fit the stencil.
			AND(32, R(colorReg), Imm32(0x00FFFFFF));
		} else if (!writeAlpha) {
			fixedKeepMask = 0xFF000000;
		}
		break;
	}

	// Step 2: Load write mask if needed.
	// Note that we apply the write mask at the destination bit depth.
	Describe("WriteColor");
	X64Reg maskReg = INVALID_REG;
	if (id.applyColorWriteMask) {
		maskReg = regCache_.Alloc(RegCache::GEN_TEMP3);
		// Load the pre-converted and combined write mask.
		if (regCache_.Has(RegCache::GEN_ARG_ID) || regCache_.Has(RegCache::GEN_ID)) {
			X64Reg idReg = GetPixelID();
			MOV(32, R(maskReg), MDisp(idReg, offsetof(PixelFuncID, cached.colorWriteMask)));
			UnlockPixelID(idReg);
		} else {
			// No ID in a register, so read the pointer from the stack using maskReg as a temporary.
			_assert_(stackIDOffset_ != -1);
			MOV(PTRBITS, R(maskReg), MDisp(RSP, stackIDOffset_));
			MOV(32, R(maskReg), MDisp(maskReg, offsetof(PixelFuncID, cached.colorWriteMask)));
		}
	}

	// We've run out of regs, let's live without temp2 from here on.
	regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);

	// Step 3: Apply logic op, combine stencil.
	// Logic ops that write to memory directly add jumps to skipStandardWrites_, resolved after Step 4.
	skipStandardWrites_.clear();
	if (id.applyLogicOp) {
		// Note: we combine stencil during logic op, because it's a bit complex to retain.
		success = success && Jit_ApplyLogicOp(id, colorReg, maskReg);
	} else if (stencilReg != INVALID_REG) {
		OR(32, R(colorReg), R(stencilReg));
	}

	// Step 4: Write and apply write mask.
	Describe("WriteColor");
	switch (id.fbFormat) {
	case GE_FORMAT_565:
	case GE_FORMAT_5551:
	case GE_FORMAT_4444:
		if (maskReg != INVALID_REG) {
			// Zero all other bits, then flip maskReg to clear the bits we're keeping in colorReg.
			AND(16, MatR(colorOff), R(maskReg));
			if (cpu_info.bBMI1) {
				ANDN(32, colorReg, maskReg, R(colorReg));
			} else {
				NOT(32, R(maskReg));
				AND(32, R(colorReg), R(maskReg));
			}
			OR(16, MatR(colorOff), R(colorReg));
		} else if (fixedKeepMask == 0) {
			// Nothing to preserve, so a plain store will do.
			MOV(16, MatR(colorOff), R(colorReg));
		} else {
			// Clear the non-stencil bits and or in the color.
			AND(16, MatR(colorOff), Imm16((uint16_t)fixedKeepMask));
			OR(16, MatR(colorOff), R(colorReg));
		}
		break;

	case GE_FORMAT_8888:
		if (maskReg != INVALID_REG) {
			// Zero all other bits, then flip maskReg to clear the bits we're keeping in colorReg.
			AND(32, MatR(colorOff), R(maskReg));
			if (cpu_info.bBMI1) {
				ANDN(32, colorReg, maskReg, R(colorReg));
			} else {
				NOT(32, R(maskReg));
				AND(32, R(colorReg), R(maskReg));
			}
			OR(32, MatR(colorOff), R(colorReg));
		} else if (fixedKeepMask == 0) {
			MOV(32, MatR(colorOff), R(colorReg));
		} else if (fixedKeepMask == 0xFF000000) {
			// We want to set 24 bits only, since we're not changing stencil.
			// For now, let's do two writes rather than reading in the old stencil.
			MOV(16, MatR(colorOff), R(colorReg));
			SHR(32, R(colorReg), Imm8(16));
			MOV(8, MDisp(colorOff, 2), R(colorReg));
		} else {
			AND(32, MatR(colorOff), Imm32(fixedKeepMask));
			OR(32, MatR(colorOff), R(colorReg));
		}
		break;
	}

	// Logic ops that already wrote to memory jump here, past the standard write.
	for (FixupBranch &fixup : skipStandardWrites_)
		SetJumpTarget(fixup);
	skipStandardWrites_.clear();

	regCache_.Unlock(colorOff, RegCache::GEN_COLOR_OFF);
	regCache_.ForceRelease(RegCache::GEN_COLOR_OFF);
	regCache_.Release(colorReg, RegCache::GEN_TEMP0);
	regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
	if (maskReg != INVALID_REG)
		regCache_.Release(maskReg, RegCache::GEN_TEMP3);
	if (stencilReg != INVALID_REG) {
		regCache_.Unlock(stencilReg, RegCache::GEN_STENCIL);
		regCache_.ForceRelease(RegCache::GEN_STENCIL);
	}

	return success;
}

// Emits the logic op stage of the color write, dispatched at runtime.
//
// The logic op is read from PixelFuncID::cached.logicOp when the emitted code
// runs, and used to index a 16-entry table of code pointers (one handler per
// GE_LOGIC_* value) that is written inline into the code stream.  Each handler
// ends one of two ways:
//  - It leaves the value to write in colorReg and jumps to the end of this
//    function's code (tracked in `finishes`), so the caller's standard write
//    and write mask handling still runs.
//  - It updates the pixel at GEN_COLOR_OFF in memory itself, and jumps past the
//    caller's standard write.  Those jumps are appended to skipStandardWrites_,
//    which the caller is responsible for resolving.
//
// id: pixel func ID, used for the framebuffer format.
// colorReg: color to write, already converted to the framebuffer format.
// maskReg: write mask (set bits are kept from memory), or INVALID_REG when no
//   write mask applies.  Handlers may clobber it.
//
// When GEN_STENCIL is present, each handler also combines the stencil bits in,
// since the caller skips its own stencil OR when a logic op is applied.
// Expects GEN_COLOR_OFF to already be allocated.  Always returns true.
bool PixelJitCache::Jit_ApplyLogicOp(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg maskReg) {
	Describe("LogicOp");
	X64Reg logicOpReg = regCache_.Alloc(RegCache::GEN_TEMP4);
	if (regCache_.Has(RegCache::GEN_ARG_ID) || regCache_.Has(RegCache::GEN_ID)) {
		X64Reg idReg = GetPixelID();
		MOVZX(32, 8, logicOpReg, MDisp(idReg, offsetof(PixelFuncID, cached.logicOp)));
		UnlockPixelID(idReg);
	} else {
		// No ID in a register, so read the pointer from the stack using logicOpReg as a temporary.
		_assert_(stackIDOffset_ != -1);
		MOV(PTRBITS, R(logicOpReg), MDisp(RSP, stackIDOffset_));
		MOVZX(32, 8, logicOpReg, MDisp(logicOpReg, offsetof(PixelFuncID, cached.logicOp)));
	}

	X64Reg stencilReg = INVALID_REG;
	if (regCache_.Has(RegCache::GEN_STENCIL))
		stencilReg = regCache_.Find(RegCache::GEN_STENCIL);

	// Should already be allocated.
	X64Reg colorOff = regCache_.Find(RegCache::GEN_COLOR_OFF);
	X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP5);

	// We'll use these in several cases, so prepare.
	int bits = id.fbFormat == GE_FORMAT_8888 ? 32 : 16;
	OpArg stencilMask, notStencilMask;
	switch (id.fbFormat) {
	case GE_FORMAT_565:
		stencilMask = Imm16(0);
		notStencilMask = Imm16(0xFFFF);
		break;
	case GE_FORMAT_5551:
		stencilMask = Imm16(0x8000);
		notStencilMask = Imm16(0x7FFF);
		break;
	case GE_FORMAT_4444:
		stencilMask = Imm16(0xF000);
		notStencilMask = Imm16(0x0FFF);
		break;
	case GE_FORMAT_8888:
		stencilMask = Imm32(0xFF000000);
		notStencilMask = Imm32(0x00FFFFFF);
		break;
	}

	// Jumps from handlers that want the caller's standard write to run.
	std::vector<FixupBranch> finishes;
	finishes.reserve(11);
	// The handlers and table are emitted inline, so jump over them to the dispatch.
	FixupBranch skipTable = J(true);
	const u8 *tableValues[16]{};

	tableValues[GE_LOGIC_CLEAR] = GetCodePointer();
	if (stencilReg != INVALID_REG) {
		// If clearing and setting the stencil, that's easy - stencilReg has it.
		MOV(32, R(colorReg), R(stencilReg));
		finishes.push_back(J(true));
	} else if (maskReg != INVALID_REG) {
		// Just and out the unmasked bits (stencil already included in maskReg.)
		AND(bits, MatR(colorOff), R(maskReg));
		skipStandardWrites_.push_back(J(true));
	} else {
		// Otherwise, no mask, just AND the stencil bits to zero the rest.
		AND(bits, MatR(colorOff), stencilMask);
		skipStandardWrites_.push_back(J(true));
	}

	tableValues[GE_LOGIC_AND] = GetCodePointer();
	if (stencilReg != INVALID_REG && maskReg != INVALID_REG) {
		// Since we're ANDing, set the mask bits (AND will keep them as-is.)
		OR(32, R(colorReg), R(maskReg));
		OR(32, R(colorReg), R(stencilReg));

		// To apply stencil, we'll OR the stencil unmasked bits in memory, so our AND keeps them.
		NOT(32, R(maskReg));
		AND(bits, R(maskReg), stencilMask);
		OR(bits, MatR(colorOff), R(maskReg));
	} else if (stencilReg != INVALID_REG) {
		OR(32, R(colorReg), R(stencilReg));
		// No mask, so just or in the stencil bits so our AND can set any we want.
		OR(bits, MatR(colorOff), stencilMask);
	} else if (maskReg != INVALID_REG) {
		// Force in the mask (which includes all stencil bits) so both are kept as-is.
		OR(32, R(colorReg), R(maskReg));
	} else {
		// Force on the stencil bits so they AND and keep the existing value.
		if (stencilMask.GetImmValue() != 0)
			OR(bits, R(colorReg), stencilMask);
	}
	// Now the AND, which applies stencil and the logic op.
	AND(bits, MatR(colorOff), R(colorReg));
	skipStandardWrites_.push_back(J(true));

	tableValues[GE_LOGIC_AND_REVERSE] = GetCodePointer();
	// Reverse memory in a temp reg so we can apply the write mask easily.
	MOV(bits, R(temp1Reg), MatR(colorOff));
	if (cpu_info.bBMI1) {
		ANDN(32, colorReg, temp1Reg, R(colorReg));
	} else {
		NOT(32, R(temp1Reg));
		AND(32, R(colorReg), R(temp1Reg));
	}
	// Now add in the stencil bits (must be zero before, since we used AND.)
	if (stencilReg != INVALID_REG) {
		OR(32, R(colorReg), R(stencilReg));
	}
	finishes.push_back(J(true));

	tableValues[GE_LOGIC_COPY] = GetCodePointer();
	// This is just a standard write, nothing complex.
	if (stencilReg != INVALID_REG) {
		OR(32, R(colorReg), R(stencilReg));
	}
	finishes.push_back(J(true));

	tableValues[GE_LOGIC_AND_INVERTED] = GetCodePointer();
	if (stencilReg != INVALID_REG) {
		// Set the stencil bits, so they're zero when we invert.
		OR(bits, R(colorReg), stencilMask);
		NOT(32, R(colorReg));
		OR(32, R(colorReg), R(stencilReg));

		if (maskReg != INVALID_REG) {
			// This way our AND will keep all those bits.
			OR(32, R(colorReg), R(maskReg));

			// To apply stencil, we'll OR the stencil unmasked bits in memory, so our AND keeps them.
			NOT(32, R(maskReg));
			AND(bits, R(maskReg), stencilMask);
			OR(bits, MatR(colorOff), R(maskReg));
		} else {
			// Force memory to take our stencil bits by ORing for the AND.
			OR(bits, MatR(colorOff), stencilMask);
		}
	} else if (maskReg != INVALID_REG) {
		NOT(32, R(colorReg));
		// This way our AND will keep all those bits.
		OR(32, R(colorReg), R(maskReg));
	} else {
		// Invert our color, but then add in stencil bits so the AND keeps them.
		NOT(32, R(colorReg));
		// We only do this for 8888 since the rest will have had 0 stencil bits (which turned to 1s.)
		if (id.FBFormat() == GE_FORMAT_8888)
			OR(bits, R(colorReg), stencilMask);
	}
	AND(bits, MatR(colorOff), R(colorReg));
	skipStandardWrites_.push_back(J(true));

	tableValues[GE_LOGIC_NOOP] = GetCodePointer();
	if (stencilReg != INVALID_REG && maskReg != INVALID_REG) {
		// Start by clearing masked bits from stencilReg.
		if (cpu_info.bBMI1) {
			ANDN(32, stencilReg, maskReg, R(stencilReg));
		} else {
			NOT(32, R(maskReg));
			AND(32, R(stencilReg), R(maskReg));
			NOT(32, R(maskReg));
		}

		// Now mask out the stencil bits we're writing from memory.
		OR(bits, R(maskReg), notStencilMask);
		AND(bits, MatR(colorOff), R(maskReg));

		// Now set those remaining stencil bits.
		OR(bits, MatR(colorOff), R(stencilReg));
		skipStandardWrites_.push_back(J(true));
	} else if (stencilReg != INVALID_REG) {
		// Clear and set just the stencil bits.
		AND(bits, MatR(colorOff), notStencilMask);
		OR(bits, MatR(colorOff), R(stencilReg));
		skipStandardWrites_.push_back(J(true));
	} else {
		// Nothing at all to write in this case.
		Discard();
	}

	tableValues[GE_LOGIC_XOR] = GetCodePointer();
	XOR(bits, R(colorReg), MatR(colorOff));
	if (stencilReg != INVALID_REG) {
		// Purge out the stencil bits from the XOR and copy ours in.
		AND(bits, R(colorReg), notStencilMask);
		OR(32, R(colorReg), R(stencilReg));
	} else if (maskReg == INVALID_REG && stencilMask.GetImmValue() != 0) {
		// XOR might've set some bits, and without a maskReg we won't clear them.
		AND(bits, R(colorReg), notStencilMask);
	}
	finishes.push_back(J(true));

	tableValues[GE_LOGIC_OR] = GetCodePointer();
	if (stencilReg != INVALID_REG && maskReg != INVALID_REG) {
		OR(32, R(colorReg), R(stencilReg));

		// Clear the bits we should be masking out.
		if (cpu_info.bBMI1) {
			ANDN(32, colorReg, maskReg, R(colorReg));
		} else {
			NOT(32, R(maskReg));
			AND(32, R(colorReg), R(maskReg));
			NOT(32, R(maskReg));
		}

		// Clear all the unmasked stencil bits, so we can set our own.
		OR(bits, R(maskReg), notStencilMask);
		AND(bits, MatR(colorOff), R(maskReg));
	} else if (stencilReg != INVALID_REG) {
		OR(32, R(colorReg), R(stencilReg));
		// AND out the stencil bits so we set our own.
		AND(bits, MatR(colorOff), notStencilMask);
	} else if (maskReg != INVALID_REG) {
		// Clear the bits we should be masking out.
		if (cpu_info.bBMI1) {
			ANDN(32, colorReg, maskReg, R(colorReg));
		} else {
			NOT(32, R(maskReg));
			AND(32, R(colorReg), R(maskReg));
		}
	} else if (id.FBFormat() == GE_FORMAT_8888) {
		// We only need to do this for 8888, the others already have 0 stencil.
		AND(bits, R(colorReg), notStencilMask);
	}
	// Now the OR, which applies stencil and the logic op itself.
	OR(bits, MatR(colorOff), R(colorReg));
	skipStandardWrites_.push_back(J(true));

	tableValues[GE_LOGIC_NOR] = GetCodePointer();
	OR(bits, R(colorReg), MatR(colorOff));
	NOT(32, R(colorReg));
	if (stencilReg != INVALID_REG) {
		AND(bits, R(colorReg), notStencilMask);
		OR(32, R(colorReg), R(stencilReg));
	} else if (maskReg == INVALID_REG && stencilMask.GetImmValue() != 0) {
		// We need to clear the stencil bits since the standard write logic assumes they're zero.
		AND(bits, R(colorReg), notStencilMask);
	}
	finishes.push_back(J(true));

	tableValues[GE_LOGIC_EQUIV] = GetCodePointer();
	XOR(bits, R(colorReg), MatR(colorOff));
	NOT(32, R(colorReg));
	if (stencilReg != INVALID_REG) {
		AND(bits, R(colorReg), notStencilMask);
		OR(32, R(colorReg), R(stencilReg));
	} else if (maskReg == INVALID_REG && stencilMask.GetImmValue() != 0) {
		// We need to clear the stencil bits since the standard write logic assumes they're zero.
		AND(bits, R(colorReg), notStencilMask);
	}
	finishes.push_back(J(true));

	tableValues[GE_LOGIC_INVERTED] = GetCodePointer();
	// We just toss our color entirely.
	MOV(bits, R(colorReg), MatR(colorOff));
	NOT(32, R(colorReg));
	if (stencilReg != INVALID_REG) {
		AND(bits, R(colorReg), notStencilMask);
		OR(32, R(colorReg), R(stencilReg));
	} else if (maskReg == INVALID_REG && stencilMask.GetImmValue() != 0) {
		// We need to clear the stencil bits since the standard write logic assumes they're zero.
		AND(bits, R(colorReg), notStencilMask);
	}
	finishes.push_back(J(true));

	tableValues[GE_LOGIC_OR_REVERSE] = GetCodePointer();
	// Reverse in a temp reg so we can mask properly.
	MOV(bits, R(temp1Reg), MatR(colorOff));
	NOT(32, R(temp1Reg));
	OR(32, R(colorReg), R(temp1Reg));
	if (stencilReg != INVALID_REG) {
		AND(bits, R(colorReg), notStencilMask);
		OR(32, R(colorReg), R(stencilReg));
	} else if (maskReg == INVALID_REG && stencilMask.GetImmValue() != 0) {
		// We need to clear the stencil bits since the standard write logic assumes they're zero.
		AND(bits, R(colorReg), notStencilMask);
	}
	finishes.push_back(J(true));

	tableValues[GE_LOGIC_COPY_INVERTED] = GetCodePointer();
	NOT(32, R(colorReg));
	if (stencilReg != INVALID_REG) {
		AND(bits, R(colorReg), notStencilMask);
		OR(32, R(colorReg), R(stencilReg));
	} else if (maskReg == INVALID_REG && stencilMask.GetImmValue() != 0) {
		// We need to clear the stencil bits since the standard write logic assumes they're zero.
		AND(bits, R(colorReg), notStencilMask);
	}
	finishes.push_back(J(true));

	tableValues[GE_LOGIC_OR_INVERTED] = GetCodePointer();
	NOT(32, R(colorReg));
	if (stencilReg != INVALID_REG && maskReg != INVALID_REG) {
		AND(bits, R(colorReg), notStencilMask);
		OR(32, R(colorReg), R(stencilReg));

		// Clear the bits we should be masking out.
		if (cpu_info.bBMI1) {
			ANDN(32, colorReg, maskReg, R(colorReg));
		} else {
			NOT(32, R(maskReg));
			AND(32, R(colorReg), R(maskReg));
			NOT(32, R(maskReg));
		}

		// Clear all the unmasked stencil bits, so we can set our own.
		OR(bits, R(maskReg), notStencilMask);
		AND(bits, MatR(colorOff), R(maskReg));
	} else if (stencilReg != INVALID_REG) {
		AND(bits, R(colorReg), notStencilMask);
		OR(32, R(colorReg), R(stencilReg));
		// AND out the stencil bits so we set our own.
		AND(bits, MatR(colorOff), notStencilMask);
	} else if (maskReg != INVALID_REG) {
		// Clear the bits we should be masking out.
		NOT(32, R(maskReg));
		AND(32, R(colorReg), R(maskReg));
	} else if (id.FBFormat() == GE_FORMAT_8888) {
		// We only need to do this for 8888, the others already have 0 stencil.
		AND(bits, R(colorReg), notStencilMask);
	}
	OR(bits, MatR(colorOff), R(colorReg));
	skipStandardWrites_.push_back(J(true));

	tableValues[GE_LOGIC_NAND] = GetCodePointer();
	// NAND is ~(src & dst), so the AND must go into colorReg before inverting.
	// (temp1Reg holds nothing useful on this path, and must not be the target.)
	AND(bits, R(colorReg), MatR(colorOff));
	NOT(32, R(colorReg));
	if (stencilReg != INVALID_REG) {
		AND(bits, R(colorReg), notStencilMask);
		OR(32, R(colorReg), R(stencilReg));
	} else if (maskReg == INVALID_REG && stencilMask.GetImmValue() != 0) {
		// We need to clear the stencil bits since the standard write logic assumes they're zero.
		AND(bits, R(colorReg), notStencilMask);
	}
	finishes.push_back(J(true));

	tableValues[GE_LOGIC_SET] = GetCodePointer();
	if (stencilReg != INVALID_REG && maskReg != INVALID_REG) {
		OR(32, R(colorReg), R(stencilReg));
		OR(bits, R(colorReg), notStencilMask);
		finishes.push_back(J(true));
	} else if (stencilReg != INVALID_REG) {
		// Set bits directly in stencilReg, and then put in memory.
		OR(bits, R(stencilReg), notStencilMask);
		MOV(bits, MatR(colorOff), R(stencilReg));
		skipStandardWrites_.push_back(J(true));
	} else if (maskReg != INVALID_REG) {
		// OR in the bits we're allowed to write (won't be any stencil.)
		NOT(32, R(maskReg));
		OR(bits, MatR(colorOff), R(maskReg));
		skipStandardWrites_.push_back(J(true));
	} else {
		OR(bits, MatR(colorOff), notStencilMask);
		skipStandardWrites_.push_back(J(true));
	}

	// Write out the table of handler addresses, indexed by logic op.
	const u8 *tablePtr = GetCodePointer();
	for (int i = 0; i < 16; ++i) {
		Write64((uintptr_t)tableValues[i]);
	}

	// This is where execution resumes after skipping the handlers: dispatch via the table.
	SetJumpTarget(skipTable);
	LEA(64, temp1Reg, M(tablePtr));
	JMPptr(MComplex(temp1Reg, logicOpReg, 8, 0));

	// Handlers that left their result in colorReg continue from here.
	for (FixupBranch &fixup : finishes)
		SetJumpTarget(fixup);

	regCache_.Unlock(colorOff, RegCache::GEN_COLOR_OFF);
	regCache_.Release(logicOpReg, RegCache::GEN_TEMP4);
	regCache_.Release(temp1Reg, RegCache::GEN_TEMP5);
	if (stencilReg != INVALID_REG)
		regCache_.Unlock(stencilReg, RegCache::GEN_STENCIL);

	return true;
}

// Emits code that packs the 8888 color in colorReg down to 565, in place.
//
// With fast BMI2, a single PEXT gathers the top 5/6/5 bits of R/G/B and only temp1Reg
// is touched (it holds the mask.)  Otherwise each channel is shifted and masked by
// hand, which clobbers both temps.  Always returns true; id is not consulted.
//
// NOTE(review): the manual path masks with 16-bit ANDs, so presumably only the low
// 16 bits of colorReg are meaningful afterward - confirm callers only store 16 bits.
bool PixelJitCache::Jit_ConvertTo565(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg) {
	Describe("ConvertTo565");

	if (!cpu_info.bBMI2_fast) {
		// Where each channel sits within the 16-bit result.
		const uint16_t redBits = 0x1F << 0;
		const uint16_t greenBits = 0x3F << 5;
		const uint16_t blueBits = 0x1F << 11;

		// Red first: its top 5 bits begin at bit 3, and need to land at bit 0.
		MOV(32, R(temp1Reg), R(colorReg));
		SHR(32, R(temp1Reg), Imm8(3));
		AND(16, R(temp1Reg), Imm16(redBits));

		// Green's top 6 bits begin at bit 10 and belong at bit 5, so shift by 5.
		MOV(32, R(temp2Reg), R(colorReg));
		SHR(32, R(temp2Reg), Imm8(5));
		AND(16, R(temp2Reg), Imm16(greenBits));
		OR(32, R(temp1Reg), R(temp2Reg));

		// Blue goes last so colorReg can be shifted directly (top 5 at 19 -> 11.)
		SHR(32, R(colorReg), Imm8(8));
		AND(16, R(colorReg), Imm16(blueBits));
		// Then R and G get merged in to complete the pixel.
		OR(32, R(colorReg), R(temp1Reg));
		return true;
	}

	// One set bit per source bit we keep: top 5 of R, top 6 of G, top 5 of B.
	const uint32_t keepBits = 0x00F8FCF8;
	MOV(32, R(temp1Reg), Imm32(keepBits));
	PEXT(32, colorReg, colorReg, R(temp1Reg));
	return true;
}

// Emits code that packs the 8888 color in colorReg down to 5551, in place.
//
// keepAlpha selects whether the top alpha bit is carried into bit 15 of the result;
// when false, bit 15 ends up clear.  With fast BMI2 this is a single PEXT using only
// temp1Reg, otherwise both temps are clobbered.  Always returns true; id is not consulted.
//
// NOTE(review): the manual path masks with 16-bit ANDs, so presumably only the low
// 16 bits of colorReg are meaningful afterward - confirm callers only store 16 bits.
bool PixelJitCache::Jit_ConvertTo5551(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg, bool keepAlpha) {
	Describe("ConvertTo5551");

	if (cpu_info.bBMI2_fast) {
		// Top 5 bits of each color byte, plus the top bit of alpha when it's wanted.
		uint32_t keepBits = 0x00F8F8F8;
		if (keepAlpha)
			keepBits |= 0x80000000;

		MOV(32, R(temp1Reg), Imm32(keepBits));
		PEXT(32, colorReg, colorReg, R(temp1Reg));
		return true;
	}

	// Where each color channel sits within the 16-bit result.
	const uint16_t redBits = 0x1F << 0;
	const uint16_t greenBits = 0x1F << 5;
	const uint16_t blueBits = 0x1F << 10;

	// Red's top 5 bits begin at bit 3, so they drop by 3 to reach bit 0.
	MOV(32, R(temp1Reg), R(colorReg));
	SHR(32, R(temp1Reg), Imm8(3));
	AND(16, R(temp1Reg), Imm16(redBits));

	// Green's top 5 bits begin at bit 11 and belong at bit 5, so shift by 6.
	MOV(32, R(temp2Reg), R(colorReg));
	SHR(32, R(temp2Reg), Imm8(6));
	AND(16, R(temp2Reg), Imm16(greenBits));
	OR(32, R(temp1Reg), R(temp2Reg));

	if (keepAlpha) {
		// temp2Reg is free again, so stash alpha there before colorReg is shifted.
		// Isolate bit 31, then park it at bit 15.
		MOV(32, R(temp2Reg), R(colorReg));
		SHR(32, R(temp2Reg), Imm8(31));
		SHL(32, R(temp2Reg), Imm8(15));
	}

	// Blue's top 5 bits begin at bit 19 and belong at bit 10, so shift by 9.
	SHR(32, R(colorReg), Imm8(9));
	AND(16, R(colorReg), Imm16(blueBits));
	OR(32, R(colorReg), R(temp1Reg));

	// Alpha, if kept, goes in last.
	if (keepAlpha)
		OR(32, R(colorReg), R(temp2Reg));

	return true;
}

// Emits code that packs the 8888 color in colorReg down to 4444, in place.
//
// keepAlpha selects whether the top 4 alpha bits are carried into bits 12-15 of the
// result; when false, those bits end up clear.  With fast BMI2 this is a single PEXT
// using only temp1Reg, otherwise both temps are clobbered.  Always returns true; id
// is not consulted.
//
// NOTE(review): the manual path masks with 16-bit ANDs, so presumably only the low
// 16 bits of colorReg are meaningful afterward - confirm callers only store 16 bits.
bool PixelJitCache::Jit_ConvertTo4444(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg, bool keepAlpha) {
	Describe("ConvertTo4444");

	if (cpu_info.bBMI2_fast) {
		// The high nibble of every byte we're keeping.
		uint32_t keepBits = 0x00F0F0F0;
		if (keepAlpha)
			keepBits |= 0xF0000000;

		MOV(32, R(temp1Reg), Imm32(keepBits));
		PEXT(32, colorReg, colorReg, R(temp1Reg));
		return true;
	}

	// Where each color channel sits within the 16-bit result.
	const uint16_t redBits = 0xF << 0;
	const uint16_t greenBits = 0xF << 4;
	const uint16_t blueBits = 0xF << 8;

	// Red's high nibble begins at bit 4, and belongs at bit 0.
	MOV(32, R(temp1Reg), R(colorReg));
	SHR(32, R(temp1Reg), Imm8(4));
	AND(16, R(temp1Reg), Imm16(redBits));

	// Green's high nibble begins at bit 12 and belongs at bit 4, so shift by 8.
	MOV(32, R(temp2Reg), R(colorReg));
	SHR(32, R(temp2Reg), Imm8(8));
	AND(16, R(temp2Reg), Imm16(greenBits));
	OR(32, R(temp1Reg), R(temp2Reg));

	if (keepAlpha) {
		// temp2Reg is free again, so stash alpha there before colorReg is shifted.
		// Isolate the top nibble (bits 28-31), then park it at bit 12.
		MOV(32, R(temp2Reg), R(colorReg));
		SHR(32, R(temp2Reg), Imm8(28));
		SHL(32, R(temp2Reg), Imm8(12));
	}

	// Blue's high nibble begins at bit 20 and belongs at bit 8, so shift by 12.
	SHR(32, R(colorReg), Imm8(12));
	AND(16, R(colorReg), Imm16(blueBits));
	OR(32, R(colorReg), R(temp1Reg));

	// Alpha, if kept, goes in last.
	if (keepAlpha)
		OR(32, R(colorReg), R(temp2Reg));

	return true;
}

// Emits code that expands the 565 color in colorReg to 8888 layout, in place.
//
// Each channel is moved to the top of its byte, and then its own high bits are
// replicated into the low bits left empty (3 for R and B, 2 for G.)  No alpha is
// produced: bits 24-31 of the result are not set by this conversion.  Both temps are
// clobbered on either path.  Always returns true; id is not consulted.
//
// NOTE(review): the manual path only masks the low 16 bits of colorReg, so it
// presumably expects the upper 16 bits to already be zero on entry - confirm.
bool PixelJitCache::Jit_ConvertFrom565(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg) {
	Describe("ConvertFrom565");

	// The low bits of each output byte that receive replicated high bits.
	const uint32_t lowFillBits = 0x00070307;

	if (cpu_info.bBMI2_fast) {
		// Top 5/6/5 bits of each output byte, and the 565 bits that get replicated
		// below them (top 3 of R, top 2 of G, top 3 of B.)
		const uint32_t highBits = 0x00F8FCF8;
		const uint32_t replicatedSrcBits = 0x0000E61C;

		// temp1Reg gets the channels deposited at the top of each byte.
		MOV(32, R(temp1Reg), Imm32(highBits));
		PDEP(32, temp1Reg, colorReg, R(temp1Reg));

		// Pull the replicated source bits out of the 565 value (packed together.)
		MOV(32, R(temp2Reg), Imm32(replicatedSrcBits));
		PEXT(32, colorReg, colorReg, R(temp2Reg));
		// Then deposit them at the bottom of each byte.
		MOV(32, R(temp2Reg), Imm32(lowFillBits));
		PDEP(32, colorReg, colorReg, R(temp2Reg));

		// Merging the two halves gives the final color.
		OR(32, R(colorReg), R(temp1Reg));
		return true;
	}

	// Where each channel sits within the 16-bit source.
	const uint16_t redBits = 0x1F << 0;
	const uint16_t greenBits = 0x3F << 5;
	const uint16_t blueBits = 0x1F << 11;

	// Isolate red in temp1Reg, and raise it to the top of the low byte.
	MOV(32, R(temp1Reg), R(colorReg));
	AND(16, R(temp1Reg), Imm16(redBits));
	SHL(32, R(temp1Reg), Imm8(3));

	// Blue is 5 bits wide like red, so it's handled next: from bit 11 up to bit 19.
	MOV(32, R(temp2Reg), R(colorReg));
	AND(16, R(temp2Reg), Imm16(blueBits));
	SHL(32, R(temp2Reg), Imm8(8));
	OR(32, R(temp1Reg), R(temp2Reg));

	// temp2Reg only held blue, so this OR makes it a copy of R+B.  It's moved up by 1
	// so that the single shift below lines it up with green's replicated bits too.
	OR(32, R(temp2Reg), R(temp1Reg));
	SHL(32, R(temp2Reg), Imm8(1));

	// Green is the odd one out at 6 bits.  Move it from bit 5 to bit 10.
	AND(16, R(colorReg), Imm16(greenBits));
	SHL(32, R(colorReg), Imm8(5));
	// temp2Reg collects everything for replication, colorReg collects the high bits.
	OR(32, R(temp2Reg), R(colorReg));
	OR(32, R(colorReg), R(temp1Reg));

	// Bring the high bits of each channel down to the bottom of its byte, and mask.
	SHR(32, R(temp2Reg), Imm8(6));
	AND(32, R(temp2Reg), Imm32(lowFillBits));
	// With those merged in, the color is complete.
	OR(32, R(colorReg), R(temp2Reg));

	return true;
}

// Emits code that expands the 5551 color in colorReg to 8888 layout, in place.
//
// Each color channel is moved to the top of its byte, and its top 3 bits are
// replicated into the 3 low bits left empty.  When keepAlpha is set, the single alpha
// bit is sign extended to fill bits 24-31; otherwise alpha is dropped.  Both temps
// are clobbered on either path.  Always returns true; id is not consulted.
//
// NOTE(review): the manual path only masks the low 16 bits of colorReg, so it
// presumably expects the upper 16 bits to already be zero on entry - confirm.
bool PixelJitCache::Jit_ConvertFrom5551(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg, bool keepAlpha) {
	Describe("ConvertFrom5551");

	// The low bits of each color byte that receive replicated high bits.
	const uint32_t lowFillBits = 0x00070707;

	if (cpu_info.bBMI2_fast) {
		// Top 5 bits of each color byte, with alpha (if kept) deposited at bit 24.
		uint32_t depositBits = 0x00F8F8F8;
		if (keepAlpha)
			depositBits |= 0x01000000;

		MOV(32, R(temp1Reg), Imm32(depositBits));
		PDEP(32, colorReg, colorReg, R(temp1Reg));

		// Build the replicated low bits in temp2Reg (the mask discards alpha.)
		MOV(32, R(temp2Reg), R(colorReg));
		SHR(32, R(temp2Reg), Imm8(5));
		AND(32, R(temp2Reg), Imm32(lowFillBits));

		if (keepAlpha) {
			// Push alpha up to the sign bit, then arithmetic shift back to fill 24-31.
			SHL(32, R(colorReg), Imm8(7));
			SAR(32, R(colorReg), Imm8(7));
		}

		OR(32, R(colorReg), R(temp2Reg));
		return true;
	}

	// Where each channel sits within the 16-bit source.
	const uint16_t redBits = 0x1F << 0;
	const uint16_t greenBits = 0x1F << 5;
	const uint16_t blueBits = 0x1F << 10;
	const uint16_t alphaBit = 0x8000;

	// Isolate red in temp1Reg, and raise it to the top of the low byte.
	MOV(32, R(temp1Reg), R(colorReg));
	AND(16, R(temp1Reg), Imm16(redBits));
	SHL(32, R(temp1Reg), Imm8(3));

	// Green moves from bit 5 up to bit 11, the top of the second byte.
	MOV(32, R(temp2Reg), R(colorReg));
	AND(16, R(temp2Reg), Imm16(greenBits));
	SHL(32, R(temp2Reg), Imm8(6));
	OR(32, R(temp1Reg), R(temp2Reg));

	if (keepAlpha) {
		// Blue and alpha are adjacent, so they travel together.
		AND(16, R(colorReg), Imm16(alphaBit | blueBits));
		// Alpha goes up to the sign bit, and then the arithmetic shift both fills
		// the alpha byte and drops blue into place at bit 19.
		SHL(32, R(colorReg), Imm8(16));
		SAR(32, R(colorReg), Imm8(7));
	} else {
		// Only blue is wanted: from bit 10 up to bit 19.
		AND(16, R(colorReg), Imm16(blueBits));
		SHL(32, R(colorReg), Imm8(9));
	}

	// Merge R+G into colorReg, then mirror the full value back into temp1Reg.
	OR(32, R(colorReg), R(temp1Reg));
	OR(32, R(temp1Reg), R(colorReg));
	// Bring each channel's top bits down, masking so nothing spills between bytes.
	SHR(32, R(temp1Reg), Imm8(5));
	AND(32, R(temp1Reg), Imm32(lowFillBits));

	// The replicated bits complete the color.
	OR(32, R(colorReg), R(temp1Reg));
	return true;
}

// Emits code that expands the 4444 color in colorReg to 8888 layout, in place.
//
// Each nibble is moved to the high half of its byte and then duplicated into the low
// half.  keepAlpha selects whether the alpha nibble is expanded too; when false,
// bits 24-31 of the result are left clear.  temp1Reg is always clobbered, temp2Reg
// only without fast BMI2.  Always returns true; id is not consulted.
//
// NOTE(review): the manual path only masks the low 16 bits of colorReg, so it
// presumably expects the upper 16 bits to already be zero on entry - confirm.
bool PixelJitCache::Jit_ConvertFrom4444(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg, bool keepAlpha) {
	Describe("ConvertFrom4444");

	if (!cpu_info.bBMI2_fast) {
		// Where each channel sits within the 16-bit source.
		const uint16_t redBits = 0xF << 0;
		const uint16_t greenBits = 0xF << 4;
		const uint16_t blueBits = 0xF << 8;
		const uint16_t alphaBits = 0xF << 12;

		// Red goes to the high nibble of the low byte, accumulating in temp1Reg.
		MOV(32, R(temp1Reg), R(colorReg));
		AND(16, R(temp1Reg), Imm16(redBits));
		SHL(32, R(temp1Reg), Imm8(4));

		// Green moves from bit 4 up to bit 12.
		MOV(32, R(temp2Reg), R(colorReg));
		AND(16, R(temp2Reg), Imm16(greenBits));
		SHL(32, R(temp2Reg), Imm8(8));
		OR(32, R(temp1Reg), R(temp2Reg));

		// Blue moves from bit 8 up to bit 20.
		MOV(32, R(temp2Reg), R(colorReg));
		AND(16, R(temp2Reg), Imm16(blueBits));
		SHL(32, R(temp2Reg), Imm8(12));
		OR(32, R(temp1Reg), R(temp2Reg));

		if (keepAlpha) {
			// Alpha can be done right in colorReg: from bit 12 up to bit 28.
			AND(16, R(colorReg), Imm16(alphaBits));
			SHL(32, R(colorReg), Imm8(16));
			OR(32, R(colorReg), R(temp1Reg));

			// Mirror the full value back into temp1Reg, it's the source for the low nibbles.
			OR(32, R(temp1Reg), R(colorReg));
		} else {
			// No alpha, so temp1Reg is already everything.  It stays around as the copy.
			MOV(32, R(colorReg), R(temp1Reg));
		}

		// Every field is 4 wide with a 4 bit gap below, so a plain shift fills the gaps.
		SHR(32, R(temp1Reg), Imm8(4));
		OR(32, R(colorReg), R(temp1Reg));
		return true;
	}

	// The high nibble of every byte we're filling.
	uint32_t depositBits = 0x00F0F0F0;
	if (keepAlpha)
		depositBits |= 0xF0000000;

	// Spread the nibbles out, leaving the low half of each byte empty.
	MOV(32, R(temp1Reg), Imm32(depositBits));
	PDEP(32, colorReg, colorReg, R(temp1Reg));

	// Then duplicate each high nibble into the low nibble beneath it.
	MOV(32, R(temp1Reg), R(colorReg));
	SHR(32, R(temp1Reg), Imm8(4));
	OR(32, R(colorReg), R(temp1Reg));
	return true;
}

};

#endif