ARM: Use PLD (cache preload) in vertex decoder loop.

2024-11-26 23:10:38 +00:00 · 2013-11-24 15:08:47 +01:00 · 2013-11-24 15:08:47 +01:00 · dfea160491
commit dfea160491
parent f650b23c90
4 changed files with 23 additions and 5 deletions
--- a/Common/ArmEmitter.cpp
+++ b/Common/ArmEmitter.cpp
@ -801,6 +801,17 @@ void ARMXEmitter::CLZ(ARMReg rd, ARMReg rm)
 	Write32(condition | (0x16F << 16) | (rd << 12) | (0xF1 << 4) | rm);
 }

+// Emits PLD (or PLDW when forWrite is set) for the address [rn, #+/-offset].
+void ARMXEmitter::PLD(ARMReg rn, int offset, bool forWrite) {
+	_dbg_assert_msg_(JIT, offset <= 0xfff && offset >= -0xfff, "PLD: Max 12 bits of offset allowed");
+	bool U = offset >= 0;  // U (bit 23): set when the offset is added to rn, clear when subtracted.
+	if (offset < 0) offset = -offset;  // Only the magnitude is encoded, in the low 12 bits.
+	bool R = !forWrite;  // R (bit 22): set for a read preload, clear for the write-intent form.
+	// Conditions not allowed, so no condition field is ORed into the instruction word.
+	Write32((0xF5 << 24) | (U << 23) | (R << 22) | (1 << 20) | ((int)rn << 16) | (0xF << 12) | offset);
+}
+
+
 void ARMXEmitter::BFI(ARMReg rd, ARMReg rn, u8 lsb, u8 width)
 {
 	u32 msb = (lsb + width - 1);
--- a/Common/ArmEmitter.h
+++ b/Common/ArmEmitter.h
@ -541,6 +541,7 @@ public:
 	void BFI(ARMReg rd, ARMReg rn, u8 lsb, u8 width);
 	void UBFX(ARMReg dest, ARMReg op2, u8 lsb, u8 width);
 	void CLZ(ARMReg rd, ARMReg rm);
+	void PLD(ARMReg rn, int offset, bool forWrite = false);  // Preload hint for [rn, #offset]; rn is the base register (nothing is written to it).

 	// Using just MSR here messes with our defines on the PPC side of stuff (when this code was in dolphin...)
 	// Just need to put an underscore here, bit annoying.
--- a/GPU/GLES/VertexDecoder.cpp
+++ b/GPU/GLES/VertexDecoder.cpp
@ -1058,6 +1058,8 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
 	}

 	JumpTarget loopStart = GetCodePtr();
+	// Preload data cache ahead of reading. TODO: Experiment with the offset.
+	PLD(srcReg, 64);
 	for (int i = 0; i < dec.numSteps_; i++) {
 		if (!CompileStep(dec, i)) {
 			// Reset the code ptr and return zero to indicate that we failed.
@ -1265,13 +1267,14 @@ void VertexDecoderJitCache::Jit_TcU16ThroughDouble() {
 }

 void VertexDecoderJitCache::Jit_TcU8Prescale() {
-	if (false && cpu_info.bNEON) {
+	if (cpu_info.bNEON) {
 		// TODO: Needs testing
 		ADD(scratchReg, srcReg, dec_->tcoff);
 		VLD1_lane(I_16, neonScratchReg, scratchReg, 0, false);
-		VMOVL(I_8 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);  // Widen to 32-bit
+		VMOVL(I_8 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);  // Widen to 16-bit
 		VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);  // Widen to 32-bit
 		VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
+		ADD(scratchReg2, dstReg, dec_->decFmt.uvoff);
 		VMUL(F_32, neonScratchReg, neonScratchReg, neonUVScaleReg);
 		VADD(F_32, neonScratchReg, neonScratchReg, neonUVOffsetReg);
 		VST1(F_32, neonScratchReg, scratchReg2, 1, ALIGN_NONE);
@ -1294,12 +1297,13 @@ void VertexDecoderJitCache::Jit_TcU8Prescale() {
 }

 void VertexDecoderJitCache::Jit_TcU16Prescale() {
-	if (false && cpu_info.bNEON) {
+	if (cpu_info.bNEON) {
 		// TODO: Needs testing
 		ADD(scratchReg, srcReg, dec_->tcoff);
 		VLD1_lane(I_32, neonScratchReg, scratchReg, 0, false);
 		VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);  // Widen to 32-bit
 		VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
+		ADD(scratchReg2, dstReg, dec_->decFmt.uvoff);
 		VMUL(F_32, neonScratchReg, neonScratchReg, neonUVScaleReg);
 		VADD(F_32, neonScratchReg, neonScratchReg, neonUVOffsetReg);
 		VST1(F_32, neonScratchReg, scratchReg2, 1, ALIGN_NONE);
--- a/android/jni/ArmEmitterTest.cpp
+++ b/android/jni/ArmEmitterTest.cpp
@ -52,8 +52,6 @@ void TestCode::Generate()
 	VLD1_all_lanes(F_32, Q2, R1, true);
 	ADD(R0, R0, 12);
 	VLD1_lane(F_32, D4, R0, 1, true);
-	u32 word = *(u32 *)(GetCodePtr() - 4);
-	ILOG("Instruction Word: %08x", word);
 	// VMUL(F_32, Q2, Q0, Q1);
 	VST1(F_32, D4, R2, 2);
 	*/
@ -70,6 +68,10 @@ void TestCode::Generate()
 	VST1(I_32, D2, R1, 2);
 	VST1(I_32, D4, R2, 2);
 	VST1(I_32, D6, R3, 2);
+	PLD(R1, 32);
+	u32 word = *(u32 *)(GetCodePtr() - 4);
+	ILOG("Instruction Word: %08x", word);
+

 	// This works!