Merge pull request #4442 from unknownbrackets/vertex-decoder-jit2

Add arm color conversions for the vertex decoder jit
2025-02-14 22:24:05 +00:00 · 2013-11-04 01:22:35 -08:00 · 2013-11-04 01:22:35 -08:00 · af3bc546fb
commit af3bc546fb
parent 5d35c58f0e 06194ac261
3 changed files with 145 additions and 39 deletions
--- a/Common/ArmEmitter.cpp
+++ b/Common/ArmEmitter.cpp
@ -192,6 +192,17 @@ void ARMXEmitter::CMPI2R(ARMReg rs, u32 val, ARMReg scratch)
 	}
 }

+void ARMXEmitter::TSTI2R(ARMReg rs, u32 val, ARMReg scratch)
+{
+	Operand2 op2;
+	if (TryMakeOperand2(val, op2)) {
+		TST(rs, op2);
+	} else {
+		MOVI2R(scratch, val);
+		TST(rs, scratch);
+	}
+}
+
 void ARMXEmitter::ORI2R(ARMReg rd, ARMReg rs, u32 val, ARMReg scratch)
 {
 	Operand2 op2;
@ -845,6 +856,19 @@ void ARMXEmitter::STR  (ARMReg result, ARMReg base, Operand2 op2, bool RegAdd) {
 void ARMXEmitter::STRH (ARMReg result, ARMReg base, Operand2 op2, bool RegAdd) { WriteStoreOp(4, result, base, op2, RegAdd);}
 void ARMXEmitter::STRB (ARMReg result, ARMReg base, Operand2 op2, bool RegAdd) { WriteStoreOp(2, result, base, op2, RegAdd);}

+#define VA_TO_REGLIST(RegList, Regnum) \
+{ \
+	u8 Reg; \
+	va_list vl; \
+	va_start(vl, Regnum); \
+	for (int i = 0; i < Regnum; i++) \
+	{ \
+		Reg = va_arg(vl, u32); \
+		RegList |= (1 << Reg); \
+	} \
+	va_end(vl); \
+}
+
 void ARMXEmitter::WriteRegStoreOp(u32 op, ARMReg dest, bool WriteBack, u16 RegList)
 {
 	Write32(condition | (op << 20) | (WriteBack << 21) | (dest << 16) | RegList);
@ -852,33 +876,41 @@ void ARMXEmitter::WriteRegStoreOp(u32 op, ARMReg dest, bool WriteBack, u16 RegLi
 void ARMXEmitter::STMFD(ARMReg dest, bool WriteBack, const int Regnum, ...)
 {
 	u16 RegList = 0;
-	u8 Reg;
-	int i;
-	va_list vl;
-	va_start(vl, Regnum);
-	for (i=0;i<Regnum;i++)
-	{
-		Reg = va_arg(vl, u32);
-		RegList |= (1 << Reg);
-	}
-	va_end(vl);
-	WriteRegStoreOp(0x90, dest, WriteBack, RegList);
+	VA_TO_REGLIST(RegList, Regnum);
+	WriteRegStoreOp(0x80 | 0x10 | 0, dest, WriteBack, RegList);
 }
 void ARMXEmitter::LDMFD(ARMReg dest, bool WriteBack, const int Regnum, ...)
 {
 	u16 RegList = 0;
-	u8 Reg;
-	int i;
-	va_list vl;
-	va_start(vl, Regnum);
-	for (i=0;i<Regnum;i++)
-	{
-		Reg = va_arg(vl, u32);
-		RegList |= (1 << Reg);
-	}
-	va_end(vl);
-	WriteRegStoreOp(0x89, dest, WriteBack, RegList);
+	VA_TO_REGLIST(RegList, Regnum);
+	WriteRegStoreOp(0x80 | 0x08 | 1, dest, WriteBack, RegList);
 }
+void ARMXEmitter::STMIA(ARMReg dest, bool WriteBack, const int Regnum, ...)
+{
+	u16 RegList = 0;
+	VA_TO_REGLIST(RegList, Regnum);
+	WriteRegStoreOp(0x80 | 0x08 | 0, dest, WriteBack, RegList);
+}
+void ARMXEmitter::LDMIA(ARMReg dest, bool WriteBack, const int Regnum, ...)
+{
+	u16 RegList = 0;
+	VA_TO_REGLIST(RegList, Regnum);
+	WriteRegStoreOp(0x80 | 0x08 | 1, dest, WriteBack, RegList);
+}
+void ARMXEmitter::STM(ARMReg dest, bool Add, bool Before, bool WriteBack, const int Regnum, ...)
+{
+	u16 RegList = 0;
+	VA_TO_REGLIST(RegList, Regnum);
+	WriteRegStoreOp(0x80 | (Before << 4) | (Add << 3) | 0, dest, WriteBack, RegList);
+}
+void ARMXEmitter::LDM(ARMReg dest, bool Add, bool Before, bool WriteBack, const int Regnum, ...)
+{
+	u16 RegList = 0;
+	VA_TO_REGLIST(RegList, Regnum);
+	WriteRegStoreOp(0x80 | (Before << 4) | (Add << 3) | 1, dest, WriteBack, RegList);
+}
+
+#undef VA_TO_REGLIST

 ARMReg ARMXEmitter::SubBase(ARMReg Reg)
 {
--- a/Common/ArmEmitter.h
+++ b/Common/ArmEmitter.h
@ -194,7 +194,7 @@ public:
 				shift = 0;
 			break;
 		case ST_ASR:
-			_assert_msg_(JIT, shift < 32, "Invalid Operand2: LSR %u", shift);
+			_assert_msg_(JIT, shift < 32, "Invalid Operand2: ASR %u", shift);
 			if (!shift)
 				type = ST_LSL;
 			if (shift == 32)
@ -524,6 +524,10 @@ public:

 	void STMFD(ARMReg dest, bool WriteBack, const int Regnum, ...);
 	void LDMFD(ARMReg dest, bool WriteBack, const int Regnum, ...);
+	void STMIA(ARMReg dest, bool WriteBack, const int Regnum, ...);
+	void LDMIA(ARMReg dest, bool WriteBack, const int Regnum, ...);
+	void STM(ARMReg dest, bool Add, bool Before, bool WriteBack, const int Regnum, ...);
+	void LDM(ARMReg dest, bool Add, bool Before, bool WriteBack, const int Regnum, ...);

 	// Exclusive Access operations
 	void LDREX(ARMReg dest, ARMReg base);
@ -589,6 +593,7 @@ public:
 	void ADDI2R(ARMReg rd, ARMReg rs, u32 val, ARMReg scratch);
 	void ANDI2R(ARMReg rd, ARMReg rs, u32 val, ARMReg scratch);
 	void CMPI2R(ARMReg rs, u32 val, ARMReg scratch);
+	void TSTI2R(ARMReg rs, u32 val, ARMReg scratch);
 	void ORI2R(ARMReg rd, ARMReg rs, u32 val, ARMReg scratch);


--- a/GPU/GLES/VertexDecoder.cpp
+++ b/GPU/GLES/VertexDecoder.cpp
@ -813,7 +813,9 @@ static const JitLookup jitLookup[] = {
 	{&VertexDecoder::Step_NormalFloat, &VertexDecoderJitCache::Jit_NormalFloat},

 	{&VertexDecoder::Step_Color8888, &VertexDecoderJitCache::Jit_Color8888},
-	// Todo: The compressed color formats
+	{&VertexDecoder::Step_Color4444, &VertexDecoderJitCache::Jit_Color4444},
+	{&VertexDecoder::Step_Color565, &VertexDecoderJitCache::Jit_Color565},
+	{&VertexDecoder::Step_Color5551, &VertexDecoderJitCache::Jit_Color5551},

 	{&VertexDecoder::Step_PosS8Through, &VertexDecoderJitCache::Jit_PosS8Through},
 	{&VertexDecoder::Step_PosS16Through, &VertexDecoderJitCache::Jit_PosS16Through},
@ -945,16 +947,77 @@ void VertexDecoderJitCache::Jit_Color8888() {
 }

 void VertexDecoderJitCache::Jit_Color4444() {
-	// TODO
+	// Ignoring the top 16 bits.
+	LDR(tempReg1, srcReg, dec_->coloff);
+
+	// Spread out the components.
+	ANDI2R(tempReg2, tempReg1, 0x000F, scratchReg);
+	ANDI2R(tempReg3, tempReg1, 0x00F0, scratchReg);
+	ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 4));
+	ANDI2R(tempReg3, tempReg1, 0x0F00, scratchReg);
+	ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 8));
+	ANDI2R(tempReg3, tempReg1, 0xF000, scratchReg);
+	ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 12));
+
+	// And saturate.
+	ORR(tempReg1, tempReg2, Operand2(tempReg2, ST_LSL, 4));
+
+	STR(tempReg1, dstReg, dec_->decFmt.c0off);
 }

 void VertexDecoderJitCache::Jit_Color565() {
-	// TODO
+	// Ignoring the top 16 bits.
+	LDR(tempReg1, srcReg, dec_->coloff);
+
+	// Spread out R and B first.  This puts them in 0x001F001F.
+	ANDI2R(tempReg2, tempReg1, 0x001F, scratchReg);
+	ANDI2R(tempReg3, tempReg1, 0xF800, scratchReg);
+	ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 5));
+
+	// Expand 5 -> 8.
+	LSL(tempReg3, tempReg2, 3);
+	ORR(tempReg2, tempReg3, Operand2(tempReg2, ST_LSR, 2));
+	ANDI2R(tempReg2, tempReg2, 0x00FF00FF, scratchReg);
+
+	// Now finally G.  We start by shoving it into a wall.
+	LSR(tempReg1, tempReg1, 5);
+	ANDI2R(tempReg1, tempReg1, 0x003F, scratchReg);
+	LSL(tempReg3, tempReg1, 2);
+	// Don't worry, shifts into a wall.
+	ORR(tempReg3, tempReg3, Operand2(tempReg1, ST_LSR, 4));
+	ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 8));
+
+	// Add in full alpha.
+	ORI2R(tempReg1, tempReg2, 0xFF000000, scratchReg);
+
+	STR(tempReg1, dstReg, dec_->decFmt.c0off);
 }

 void VertexDecoderJitCache::Jit_Color5551() {
-	// TODO
+	// Ignoring the top 16 bits.
+	LDR(tempReg1, srcReg, dec_->coloff);
+
+	ANDI2R(tempReg2, tempReg1, 0x001F, scratchReg);
+	ANDI2R(tempReg3, tempReg1, 0x07E0, scratchReg);
+	ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 3));
+	ANDI2R(tempReg3, tempReg1, 0xF800, scratchReg);
+	ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 6));
+
+	// Expand 5 -> 8.
+	LSR(tempReg3, tempReg2, 2);
+	// Clean up the bits that were shifted right.
+	ANDI2R(tempReg3, tempReg1, 0x07070707, scratchReg);
+	ORR(tempReg2, tempReg3, Operand2(tempReg2, ST_LSL, 3));
+
+	// Now we just need alpha.
+	TSTI2R(tempReg1, 0x8000, scratchReg);
+	SetCC(CC_NEQ);
+	ORI2R(tempReg2, tempReg2, 0xFF000000, scratchReg);
+	SetCC(CC_AL);
+
+	STR(tempReg2, dstReg, dec_->decFmt.c0off);
 }
+
 // Copy 3 bytes and then a zero. Might as well copy four.
 void VertexDecoderJitCache::Jit_NormalS8() {
 	LDR(tempReg1, srcReg, dec_->nrmoff);
@ -964,20 +1027,23 @@ void VertexDecoderJitCache::Jit_NormalS8() {

 // Copy 6 bytes and then 2 zeroes.
 void VertexDecoderJitCache::Jit_NormalS16() {
-	LDR(tempReg1, srcReg, dec_->nrmoff, false);
-	LDRH(tempReg2, srcReg, dec_->nrmoff + 4, false);
-	STR(tempReg1, dstReg, dec_->decFmt.nrmoff, false);
-	STR(tempReg2, dstReg, dec_->decFmt.nrmoff + 4, false);
+	LDR(tempReg1, srcReg, dec_->nrmoff);
+	LDRH(tempReg2, srcReg, dec_->nrmoff + 4);
+	STR(tempReg1, dstReg, dec_->decFmt.nrmoff);
+	STR(tempReg2, dstReg, dec_->decFmt.nrmoff + 4);
 }

 void VertexDecoderJitCache::Jit_NormalFloat() {
-	// ldmia?
-	LDR(tempReg1, srcReg, dec_->nrmoff, false);
-	LDR(tempReg2, srcReg, dec_->nrmoff + 4, false);
-	LDR(tempReg3, srcReg, dec_->nrmoff + 8, false);
-	STR(tempReg1, dstReg, dec_->decFmt.nrmoff, false);
-	STR(tempReg2, dstReg, dec_->decFmt.nrmoff + 4, false);
-	STR(tempReg3, dstReg, dec_->decFmt.nrmoff + 8, false);
+	//ADD(scratchReg, srcReg, dec_->nrmoff);
+	//LDMIA(scratchReg, false, 3, tempReg1, tempReg2, tempReg3);
+	//ADD(scratchReg, dstReg, dec_->decFmt.nrmoff);
+	//STMIA(scratchReg, false, 3, tempReg1, tempReg2, tempReg3);
+	LDR(tempReg1, srcReg, dec_->nrmoff);
+	LDR(tempReg2, srcReg, dec_->nrmoff + 4);
+	LDR(tempReg3, srcReg, dec_->nrmoff + 8);
+	STR(tempReg1, dstReg, dec_->decFmt.nrmoff);
+	STR(tempReg2, dstReg, dec_->decFmt.nrmoff + 4);
+	STR(tempReg3, dstReg, dec_->decFmt.nrmoff + 8);
 }

 // Through expands into floats, always. Might want to look at changing this.
@ -1019,7 +1085,10 @@ void VertexDecoderJitCache::Jit_PosS16() {

 // Just copy 12 bytes.
 void VertexDecoderJitCache::Jit_PosFloat() {
-	// ldmia?
+	//ADD(scratchReg, srcReg, dec_->posoff);
+	//LDMIA(scratchReg, false, 3, tempReg1, tempReg2, tempReg3);
+	//ADD(scratchReg, dstReg, dec_->decFmt.posoff);
+	//STMIA(scratchReg, false, 3, tempReg1, tempReg2, tempReg3);
 	LDR(tempReg1, srcReg, dec_->posoff);
 	LDR(tempReg2, srcReg, dec_->posoff + 4);
 	LDR(tempReg3, srcReg, dec_->posoff + 8);