Merge pull request #15271 from unknownbrackets/samplerjit-color16

samplerjit: Decode colors in parallel
2024-12-14 08:59:47 +00:00 · 2022-01-02 17:55:46 +01:00 · 2022-01-02 17:55:46 +01:00 · c7062d7063
commit c7062d7063
parent 6fb5d82fe0 0eec4e7e4d
3 changed files with 238 additions and 6 deletions
--- a/GPU/Software/Sampler.cpp
+++ b/GPU/Software/Sampler.cpp
@ -133,6 +133,9 @@ void SamplerJitCache::Clear() {
 	constOnes16_ = nullptr;
 	constUNext_ = nullptr;
 	constVNext_ = nullptr;
+
+	const5551Swizzle_ = nullptr;
+	const5650Swizzle_ = nullptr;
 }

 void SamplerJitCache::Describe(const std::string &message) {
--- a/GPU/Software/Sampler.h
+++ b/GPU/Software/Sampler.h
@ -102,6 +102,10 @@ private:
 	bool Jit_PrepareDataDirectOffsets(const SamplerID &id, Rasterizer::RegCache::Reg uReg, Rasterizer::RegCache::Reg vReg, bool level1, int bitsPerTexel);
 	bool Jit_PrepareDataSwizzledOffsets(const SamplerID &id, Rasterizer::RegCache::Reg uReg, Rasterizer::RegCache::Reg vReg, bool level1, int bitsPerTexel);
 	bool Jit_BlendQuad(const SamplerID &id, bool level1);
+	bool Jit_DecodeQuad(const SamplerID &id, bool level1);
+	bool Jit_Decode5650Quad(const SamplerID &id, Rasterizer::RegCache::Reg quadReg);
+	bool Jit_Decode5551Quad(const SamplerID &id, Rasterizer::RegCache::Reg quadReg);
+	bool Jit_Decode4444Quad(const SamplerID &id, Rasterizer::RegCache::Reg quadReg);

 	bool Jit_ApplyTextureFunc(const SamplerID &id);

@ -123,6 +127,8 @@ private:
 	const u8 *const10All16_ = nullptr;
 	const u8 *const10Low_ = nullptr;
 	const u8 *const10All8_ = nullptr;
+	const u8 *const5551Swizzle_ = nullptr;
+	const u8 *const5650Swizzle_ = nullptr;

 	std::unordered_map<SamplerID, NearestFunc> cache_;
 	std::unordered_map<SamplerID, const u8 *> addresses_;
--- a/GPU/Software/SamplerX86.cpp
+++ b/GPU/Software/SamplerX86.cpp
@ -733,6 +733,7 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
 	if (regCache_.Has(RegCache::GEN_ARG_LEVEL))
 		regCache_.ForceRelease(RegCache::GEN_ARG_LEVEL);

+	success = success && Jit_DecodeQuad(id, false);
 	success = success && Jit_BlendQuad(id, false);
 	if (id.hasAnyMips) {
 		Describe("BlendMips");
@ -747,6 +748,7 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
 		CMP(8, R(levelFracReg), Imm8(0));
 		FixupBranch skip = J_CC(CC_Z, true);

+		success = success && Jit_DecodeQuad(id, true);
 		success = success && Jit_BlendQuad(id, true);

 		Describe("BlendMips");
@ -872,6 +874,18 @@ void SamplerJitCache::WriteConstantPool(const SamplerID &id) {
 		Write32(0); Write32(0); Write32(1); Write32(1);
 	}

+	if (const5551Swizzle_ == nullptr) {
+		const5551Swizzle_ = AlignCode16();
+		for (int i = 0; i < 4; ++i)
+			Write32(0x00070707);
+	}
+
+	if (const5650Swizzle_ == nullptr) {
+		const5650Swizzle_ = AlignCode16();
+		for (int i = 0; i < 4; ++i)
+			Write32(0x00070307);
+	}
+
 	// These are unique to the sampler ID.
 	if (!id.hasAnyMips) {
 		constWidth256f_ = AlignCode16();
@ -1310,19 +1324,19 @@ bool SamplerJitCache::Jit_ReadTextureFormat(const SamplerID &id) {
 	switch (fmt) {
 	case GE_TFMT_5650:
 		success = Jit_GetTexData(id, 16);
-		if (success)
+		if (success && !id.linear)
 			success = Jit_Decode5650();
 		break;

 	case GE_TFMT_5551:
 		success = Jit_GetTexData(id, 16);
-		if (success)
+		if (success && !id.linear)
 			success = Jit_Decode5551();
 		break;

 	case GE_TFMT_4444:
 		success = Jit_GetTexData(id, 16);
-		if (success)
+		if (success && !id.linear)
 			success = Jit_Decode4444();
 		break;

@ -2711,6 +2725,101 @@ bool SamplerJitCache::Jit_PrepareDataSwizzledOffsets(const SamplerID &id, RegCac
 	return true;
 }

+bool SamplerJitCache::Jit_DecodeQuad(const SamplerID &id, bool level1) {
+	GETextureFormat decodeFmt = id.TexFmt();
+	switch (id.TexFmt()) {
+	case GE_TFMT_CLUT32:
+	case GE_TFMT_CLUT16:
+	case GE_TFMT_CLUT8:
+	case GE_TFMT_CLUT4:
+		// The values match, so just use the clut fmt.
+		decodeFmt = (GETextureFormat)id.ClutFmt();
+		break;
+
+	default:
+		// We'll decode below.
+		break;
+	}
+
+	bool success = true;
+	X64Reg quadReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
+
+	switch (decodeFmt) {
+	case GE_TFMT_5650:
+		success = Jit_Decode5650Quad(id, quadReg);
+		break;
+
+	case GE_TFMT_5551:
+		success = Jit_Decode5551Quad(id, quadReg);
+		break;
+
+	case GE_TFMT_4444:
+		success = Jit_Decode4444Quad(id, quadReg);
+		break;
+
+	default:
+		// Doesn't need decoding.
+		break;
+	}
+
+	regCache_.Unlock(quadReg, level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
+	return success;
+}
+
+bool SamplerJitCache::Jit_Decode5650Quad(const SamplerID &id, Rasterizer::RegCache::Reg quadReg) {
+	Describe("5650Quad");
+	X64Reg temp1Reg = regCache_.Alloc(RegCache::VEC_TEMP1);
+	X64Reg temp2Reg = regCache_.Alloc(RegCache::VEC_TEMP2);
+
+	// Filter out red only into temp1.  We do this by shifting into a wall.
+	if (cpu_info.bAVX) {
+		VPSLLD(128, temp1Reg, quadReg, 32 - 5);
+	} else {
+		MOVDQA(temp1Reg, R(quadReg));
+		PSLLD(temp1Reg, 32 - 5);
+	}
+	// Move it right to the top of the 8 bits.
+	PSRLD(temp1Reg, 24);
+
+	// Now we bring in blue, since it's also 5 like red.
+	if (cpu_info.bAVX) {
+		VPSRLD(128, temp2Reg, quadReg, 11);
+	} else {
+		MOVDQA(temp2Reg, R(quadReg));
+		// Luckily, we know the top 16 bits are zero.  Shift right into a wall.
+		PSRLD(temp2Reg, 11);
+	}
+	// Shift blue into place at 19, and merge back to temp1.
+	PSLLD(temp2Reg, 19);
+	POR(temp1Reg, R(temp2Reg));
+
+	// Make a copy back in temp2, and shift left 1 so we can swizzle together with G.
+	if (cpu_info.bAVX) {
+		VPSLLD(128, temp2Reg, temp1Reg, 1);
+	} else {
+		MOVDQA(temp2Reg, R(temp1Reg));
+		PSLLD(temp2Reg, 1);
+	}
+
+	// We go to green last because it's the different one.  Shift off red and blue.
+	PSRLD(quadReg, 5);
+	// Use a word shift to put a wall just at the right place, top 6 bits of second byte.
+	PSLLW(quadReg, 10);
+	// Combine with temp2 (for swizzling), then merge in temp1 (R+B pre-swizzle.)
+	POR(temp2Reg, R(quadReg));
+	POR(quadReg, R(temp1Reg));
+
+	// Now shift and mask temp2 for swizzle.
+	PSRLD(temp2Reg, 6);
+	PAND(temp2Reg, M(const5650Swizzle_));
+	// And then OR that in too.  We're done.
+	POR(quadReg, R(temp2Reg));
+
+	regCache_.Release(temp1Reg, RegCache::VEC_TEMP1);
+	regCache_.Release(temp2Reg, RegCache::VEC_TEMP2);
+	return true;
+}
+
 bool SamplerJitCache::Jit_Decode5650() {
 	Describe("5650");
 	X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
@ -2753,6 +2862,56 @@ bool SamplerJitCache::Jit_Decode5650() {
 	return true;
 }

+bool SamplerJitCache::Jit_Decode5551Quad(const SamplerID &id, Rasterizer::RegCache::Reg quadReg) {
+	Describe("5551Quad");
+	X64Reg temp1Reg = regCache_.Alloc(RegCache::VEC_TEMP1);
+	X64Reg temp2Reg = regCache_.Alloc(RegCache::VEC_TEMP2);
+
+	// Filter out red only into temp1.  We do this by shifting into a wall.
+	if (cpu_info.bAVX) {
+		VPSLLD(128, temp1Reg, quadReg, 32 - 5);
+	} else {
+		MOVDQA(temp1Reg, R(quadReg));
+		PSLLD(temp1Reg, 32 - 5);
+	}
+	// Move it right to the top of the 8 bits.
+	PSRLD(temp1Reg, 24);
+
+	// Add in green and shift into place (top 5 bits of byte 2.)
+	if (cpu_info.bAVX) {
+		VPSRLD(128, temp2Reg, quadReg, 5);
+	} else {
+		MOVDQA(temp2Reg, R(quadReg));
+		PSRLD(temp2Reg, 5);
+	}
+	PSLLW(temp2Reg, 11);
+	POR(temp1Reg, R(temp2Reg));
+
+	// First, extend alpha using an arithmetic shift.
+	// We use 10 to meanwhile get rid of green too.  The extra alpha bits are fine.
+	PSRAW(quadReg, 10);
+	// This gets rid of those extra alpha bits.
+	PSLLD(quadReg, 19);
+
+	// Combine both together, we still need to swizzle.
+	POR(quadReg, R(temp1Reg));
+	if (cpu_info.bAVX) {
+		VPSRLD(128, temp1Reg, quadReg, 5);
+	} else {
+		MOVDQA(temp1Reg, R(quadReg));
+		PSRLD(temp1Reg, 5);
+	}
+
+	// Now for swizzle, we'll mask carefully to avoid overflow.
+	PAND(temp1Reg, M(const5551Swizzle_));
+	// Then finally merge in the swizzle bits.
+	POR(quadReg, R(temp1Reg));
+
+	regCache_.Release(temp1Reg, RegCache::VEC_TEMP1);
+	regCache_.Release(temp2Reg, RegCache::VEC_TEMP2);
+	return true;
+}
+
 bool SamplerJitCache::Jit_Decode5551() {
 	Describe("5551");
 	X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
@ -2794,6 +2953,70 @@ bool SamplerJitCache::Jit_Decode5551() {
 	return true;
 }

+bool SamplerJitCache::Jit_Decode4444Quad(const SamplerID &id, Rasterizer::RegCache::Reg quadReg) {
+	Describe("4444Quad");
+	X64Reg temp1Reg = regCache_.Alloc(RegCache::VEC_TEMP1);
+	X64Reg temp2Reg = regCache_.Alloc(RegCache::VEC_TEMP2);
+
+	// Mask and move red into position within temp1.
+	if (cpu_info.bAVX) {
+		VPSLLD(128, temp1Reg, quadReg, 28);
+	} else {
+		MOVDQA(temp1Reg, R(quadReg));
+		PSLLD(temp1Reg, 28);
+	}
+	PSRLD(temp1Reg, 24);
+
+	// Green is easy too, we use a word shift to get a free wall.
+	if (cpu_info.bAVX) {
+		VPSRLD(128, temp2Reg, quadReg, 4);
+	} else {
+		MOVDQA(temp2Reg, R(quadReg));
+		PSRLD(temp2Reg, 4);
+	}
+	PSLLW(temp2Reg, 12);
+	POR(temp1Reg, R(temp2Reg));
+
+	// Blue isn't last this time, but it's next.
+	if (cpu_info.bAVX) {
+		VPSRLD(128, temp2Reg, quadReg, 8);
+	} else {
+		MOVDQA(temp2Reg, R(quadReg));
+		PSRLD(temp2Reg, 8);
+	}
+	PSLLD(temp2Reg, 28);
+	PSRLD(temp2Reg, 8);
+	POR(temp1Reg, R(temp2Reg));
+
+	if (id.useTextureAlpha) {
+		// Last but not least, alpha.
+		PSRLW(quadReg, 12);
+		PSLLD(quadReg, 28);
+		POR(quadReg, R(temp1Reg));
+
+		// Masking isn't necessary here since everything is 4 wide.
+		if (cpu_info.bAVX) {
+			VPSRLD(128, temp1Reg, quadReg, 4);
+		} else {
+			MOVDQA(temp1Reg, R(quadReg));
+			PSRLD(temp1Reg, 4);
+		}
+		POR(quadReg, R(temp1Reg));
+	} else if (cpu_info.bAVX) {
+		VPSRLD(128, quadReg, temp1Reg, 4);
+		POR(quadReg, R(temp1Reg));
+	} else {
+		// Overwrite colorReg (we need temp1 as a copy anyway.)
+		MOVDQA(quadReg, R(temp1Reg));
+		PSRLD(temp1Reg, 4);
+		POR(quadReg, R(temp1Reg));
+	}
+
+	regCache_.Release(temp1Reg, RegCache::VEC_TEMP1);
+	regCache_.Release(temp2Reg, RegCache::VEC_TEMP2);
+	return true;
+}
+
 alignas(16) static const u32 color4444mask[4] = { 0xf00ff00f, 0xf00ff00f, 0xf00ff00f, 0xf00ff00f, };

 bool SamplerJitCache::Jit_Decode4444() {
@ -2945,13 +3168,13 @@ bool SamplerJitCache::Jit_ReadClutColor(const SamplerID &id) {

 	switch (id.ClutFmt()) {
 	case GE_CMODE_16BIT_BGR5650:
-		return Jit_Decode5650();
+		return id.linear || Jit_Decode5650();

 	case GE_CMODE_16BIT_ABGR5551:
-		return Jit_Decode5551();
+		return id.linear || Jit_Decode5551();

 	case GE_CMODE_16BIT_ABGR4444:
-		return Jit_Decode4444();
+		return id.linear || Jit_Decode4444();

 	case GE_CMODE_32BIT_ABGR8888:
 		return true;