ARM: Add NEON widening and narrowing moves, and float/int convert.

Experiment a little in the vertex decoder.
Henrik Rydgard 2013-11-24 13:29:56 +01:00
parent 52d4ede2f6
commit f650b23c90
8 changed files with 177 additions and 46 deletions

View File

@@ -2325,7 +2325,7 @@ void ARMXEmitter::VSWP(ARMReg Vd, ARMReg Vm)
}
void ARMXEmitter::VTRN(u32 Size, ARMReg Vd, ARMReg Vm)
{
_dbg_assert_msg_(JIT, Vd >= Q0, "Pass invalid register to " __FUNCTION__);
_dbg_assert_msg_(JIT, Vd >= D0, "Pass invalid register to " __FUNCTION__);
_dbg_assert_msg_(JIT, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
bool register_quad = Vd >= Q0;
@@ -2335,7 +2335,7 @@ void ARMXEmitter::VTRN(u32 Size, ARMReg Vd, ARMReg Vm)
}
void ARMXEmitter::VTST(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
{
_dbg_assert_msg_(JIT, Vd >= Q0, "Pass invalid register to " __FUNCTION__);
_dbg_assert_msg_(JIT, Vd >= D0, "Pass invalid register to " __FUNCTION__);
_dbg_assert_msg_(JIT, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
bool register_quad = Vd >= Q0;
@@ -2345,7 +2345,7 @@ void ARMXEmitter::VTST(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
}
void ARMXEmitter::VUZP(u32 Size, ARMReg Vd, ARMReg Vm)
{
_dbg_assert_msg_(JIT, Vd >= Q0, "Pass invalid register to " __FUNCTION__);
_dbg_assert_msg_(JIT, Vd >= D0, "Pass invalid register to " __FUNCTION__);
_dbg_assert_msg_(JIT, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
bool register_quad = Vd >= Q0;
@@ -2355,7 +2355,7 @@ void ARMXEmitter::VUZP(u32 Size, ARMReg Vd, ARMReg Vm)
}
void ARMXEmitter::VZIP(u32 Size, ARMReg Vd, ARMReg Vm)
{
_dbg_assert_msg_(JIT, Vd >= Q0, "Pass invalid register to " __FUNCTION__);
_dbg_assert_msg_(JIT, Vd >= D0, "Pass invalid register to " __FUNCTION__);
_dbg_assert_msg_(JIT, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
bool register_quad = Vd >= Q0;
@@ -2364,6 +2364,45 @@ void ARMXEmitter::VZIP(u32 Size, ARMReg Vd, ARMReg Vm)
(0x18 << 4) | (register_quad << 6) | EncodeVm(Vm));
}
void ARMXEmitter::VMOVL(u32 Size, ARMReg Vd, ARMReg Vm)
{
_dbg_assert_msg_(JIT, Vd >= Q0, "Pass invalid register to " __FUNCTION__);
_dbg_assert_msg_(JIT, Vm >= D0 && Vm <= D31, "Pass invalid register to " __FUNCTION__);
_dbg_assert_msg_(JIT, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
_dbg_assert_msg_(JIT, (Size & (I_UNSIGNED | I_SIGNED)) != 0, "Must specify I_SIGNED or I_UNSIGNED in VMOVL");
bool unsign = (Size & I_UNSIGNED) != 0;
int imm3 = 0;
if (Size & I_8) imm3 = 1;
if (Size & I_16) imm3 = 2;
if (Size & I_32) imm3 = 4;
Write32((0xF2 << 24) | (unsign << 24) | (1 << 23) | (imm3 << 19) | EncodeVd(Vd) | \
(0xA1 << 4) | EncodeVm(Vm));
}
void ARMXEmitter::VMOVN(u32 Size, ARMReg Vd, ARMReg Vm)
{
_dbg_assert_msg_(JIT, Vm >= Q0, "Pass invalid register to " __FUNCTION__);
_dbg_assert_msg_(JIT, Vd >= D0 && Vd <= D31, "Pass invalid register to " __FUNCTION__);
_dbg_assert_msg_(JIT, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
bool register_quad = Vd >= Q0;
Write32((0xF3B << 20) | (encodedSize(Size) << 18) | (1 << 17) | EncodeVd(Vd) | (1 << 9) | EncodeVm(Vm));
}
void ARMXEmitter::VCVT(u32 Size, ARMReg Vd, ARMReg Vm)
{
_dbg_assert_msg_(JIT, (Size & (I_UNSIGNED | I_SIGNED)) != 0, "Must specify I_SIGNED or I_UNSIGNED in VCVT NEON");
bool register_quad = Vd >= Q0;
bool toInteger = (Size & I_32) != 0;
bool isUnsigned = (Size & I_UNSIGNED) != 0;
int op = (toInteger << 1) | (int)isUnsigned;
Write32((0xF3 << 24) | (0xBB << 16) | EncodeVd(Vd) | (0x3 << 9) | (op << 7) | (register_quad << 6) | EncodeVm(Vm));
}
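As a quick orientation for the three new emitters, here is a minimal usage sketch (not part of this commit; the helper name, the register choices, and the assumption that R0/R1 hold source and destination pointers are all illustrative). It widens two unsigned bytes with VMOVL, converts them to float with the new vector VCVT, and stores the result, mirroring the vertex decoder experiment further down.

#include "Common/ArmEmitter.h"

// Sketch only: assumes an ArmGen::ARMXEmitter already set up to write to executable memory.
void EmitU8PairToFloat(ArmGen::ARMXEmitter &emit) {
	using namespace ArmGen;
	emit.VLD1_lane(I_16, D0, R0, 0, false);   // load two bytes as a single 16-bit lane from [R0]
	emit.VMOVL(I_8 | I_UNSIGNED, Q0, D0);     // u8  -> u16, low results land in D0 (low half of Q0)
	emit.VMOVL(I_16 | I_UNSIGNED, Q0, D0);    // u16 -> u32
	emit.VCVT(F_32 | I_UNSIGNED, Q0, Q0);     // u32 -> f32 (the new vector VCVT)
	emit.VST1(F_32, D0, R1, 1, ALIGN_NONE);   // store the two converted floats to [R1]
}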
static int RegCountToType(int nRegs, NEONAlignment align) {
switch (nRegs) {

View File

@@ -726,6 +726,15 @@ public:
void VREV32(u32 Size, ARMReg Vd, ARMReg Vm);
void VREV16(u32 Size, ARMReg Vd, ARMReg Vm);
// Widening and narrowing moves
void VMOVL(u32 Size, ARMReg Vd, ARMReg Vm);
void VMOVN(u32 Size, ARMReg Vd, ARMReg Vm);
// Vector VCVT
void VCVT(u32 DestSize, ARMReg Dest, ARMReg Src);
// Notes:
// Rm == _PC is interpreted as no offset, otherwise, effective address is sum of Rn and Rm
// Rm == R13 is interpreted as VLD1, .... [Rn]! Added a REG_UPDATE pseudo register.
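A hypothetical call-site sketch of the two addressing conventions described in the notes above (assuming the VLD1 overload used elsewhere in this commit, with alignment and Rm as trailing optional arguments):

#include "Common/ArmEmitter.h"

void AddressingExamples(ArmGen::ARMXEmitter &emit) {
	using namespace ArmGen;
	// Rm omitted (defaults to _PC): plain [R0] addressing, no offset, no writeback.
	emit.VLD1(F_32, D0, R0, 2);
	// Rm == REG_UPDATE (R13): emits the writeback form [R0]!, advancing R0 past the loaded data.
	emit.VLD1(F_32, D0, R0, 2, ALIGN_NONE, REG_UPDATE);
}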

View File

@@ -59,6 +59,10 @@ static const ARMReg *GetMIPSAllocationOrder(int &count) {
// With NEON, we have many more.
// In the future I plan to use S0-S7 (Q0-Q1) for FPU and S8 forwards (Q2-Q15, yes, 15) for VFPU.
// VFPU will use NEON to do SIMD and it will be awkward to mix with FPU.
// We should attempt to map scalars to low Q registers and wider things to high registers:
// the NEON instructions are all 2-vector or 4-vector and don't do scalar, and keeping scalars
// in the low registers lets us still use regular VFP instructions on them.
static const ARMReg allocationOrderNEON[] = {
// Reserve four temp registers. Useful when building quads until we really figure out
// how to do that best.

View File

@@ -48,7 +48,6 @@ struct FPURegMIPS {
// If loc == ML_MEM, it's back in its location in the CPU context struct.
};
class ArmRegCacheFPU
{
public:

View File

@@ -18,6 +18,7 @@
#include "base/basictypes.h"
#include "base/logging.h"
#include "Common/CPUDetect.h"
#include "Core/Config.h"
#include "Core/MemMap.h"
#include "GPU/ge_constants.h"
@@ -46,6 +47,19 @@ static float MEMORY_ALIGNED16(skinMatrix[12]);
// using SSE / NEON and store them here.
static float MEMORY_ALIGNED16(bones[16 * 8]);
// The rest will be dumped to bones as on x86.
// NEON register allocation:
// Q0: Texture scaling parameters
// Q1: Temp storage
// Q2: Vector-by-matrix accumulator
// Q3: Unused
//
// We'll use Q4-Q7 as the "matrix accumulator".
// First two matrices will be preloaded into Q8-Q11 and Q12-Q15 to reduce
// memory bandwidth requirements.
inline int align(int n, int align) {
return (n + (align - 1)) & ~(align - 1);
}
@@ -924,11 +938,18 @@ static const ARMReg counterReg = R2;
static const ARMReg fpScratchReg = S4;
static const ARMReg fpScratchReg2 = S5;
static const ARMReg fpScratchReg3 = S6;
static const ARMReg fpScratchReg4 = S7;
static const ARMReg fpUscaleReg = S0;
static const ARMReg fpVscaleReg = S1;
static const ARMReg fpUoffsetReg = S2;
static const ARMReg fpVoffsetReg = S3;
// Simpler aliases for NEON. Overlaps with corresponding VFP regs.
static const ARMReg neonUVScaleReg = D0;
static const ARMReg neonUVOffsetReg = D1;
static const ARMReg neonScratchReg = D2;
static const ARMReg neonScratchRegQ = Q1; // Overlaps with all the scratch regs
// Everything above S6 is fair game for skinning
// S8-S15 are used during matrix generation
@@ -1244,48 +1265,79 @@ void VertexDecoderJitCache::Jit_TcU16ThroughDouble() {
}
void VertexDecoderJitCache::Jit_TcU8Prescale() {
// TODO: SIMD
LDRB(tempReg1, srcReg, dec_->tcoff);
LDRB(tempReg2, srcReg, dec_->tcoff + 1);
VMOV(fpScratchReg, tempReg1);
VMOV(fpScratchReg2, tempReg2);
VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT);
// Could replace VMUL + VADD with VMLA but would require 2 more regs as we don't want to destroy fp*offsetReg. Later.
VMUL(fpScratchReg, fpScratchReg, fpUscaleReg);
VMUL(fpScratchReg2, fpScratchReg2, fpVscaleReg);
VADD(fpScratchReg, fpScratchReg, fpUoffsetReg);
VADD(fpScratchReg2, fpScratchReg2, fpVoffsetReg);
VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff);
VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4);
if (false && cpu_info.bNEON) {
// TODO: Needs testing
ADD(scratchReg, srcReg, dec_->tcoff);
VLD1_lane(I_16, neonScratchReg, scratchReg, 0, false);
VMOVL(I_8 | I_UNSIGNED, neonScratchRegQ, neonScratchReg); // Widen to 16-bit
VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg); // Widen to 32-bit
VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
VMUL(F_32, neonScratchReg, neonScratchReg, neonUVScaleReg);
VADD(F_32, neonScratchReg, neonScratchReg, neonUVOffsetReg);
ADD(scratchReg2, dstReg, dec_->decFmt.uvoff); // point scratchReg2 at the output UV pair
VST1(F_32, neonScratchReg, scratchReg2, 1, ALIGN_NONE);
} else {
// TODO: SIMD
LDRB(tempReg1, srcReg, dec_->tcoff);
LDRB(tempReg2, srcReg, dec_->tcoff + 1);
VMOV(fpScratchReg, tempReg1);
VMOV(fpScratchReg2, tempReg2);
VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT);
// Could replace VMUL + VADD with VMLA but would require 2 more regs as we don't want to destroy fp*offsetReg. Later.
VMUL(fpScratchReg, fpScratchReg, fpUscaleReg);
VMUL(fpScratchReg2, fpScratchReg2, fpVscaleReg);
VADD(fpScratchReg, fpScratchReg, fpUoffsetReg);
VADD(fpScratchReg2, fpScratchReg2, fpVoffsetReg);
VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff);
VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4);
}
}
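For comparison, a plain C++ reference of what both branches above compute per vertex (the names below are illustrative, not from the commit): load the two unsigned bytes, convert to float, multiply by the prescale factors and add the offsets. The U16 and float variants that follow differ only in how the UV pair is loaded.

// Reference only: uscale/vscale and uoffset/voffset stand in for the values kept in
// fpUscaleReg/fpVscaleReg and fpUoffsetReg/fpVoffsetReg (D0/D1 in the NEON path).
static void TcU8PrescaleRef(const unsigned char *uv, float *out,
                            float uscale, float vscale,
                            float uoffset, float voffset) {
	out[0] = uv[0] * uscale + uoffset;
	out[1] = uv[1] * vscale + voffset;
}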
void VertexDecoderJitCache::Jit_TcU16Prescale() {
// TODO: SIMD
LDRH(tempReg1, srcReg, dec_->tcoff);
LDRH(tempReg2, srcReg, dec_->tcoff + 2);
VMOV(fpScratchReg, tempReg1);
VMOV(fpScratchReg2, tempReg2);
VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT);
VMUL(fpScratchReg, fpScratchReg, fpUscaleReg);
VMUL(fpScratchReg2, fpScratchReg2, fpVscaleReg);
VADD(fpScratchReg, fpScratchReg, fpUoffsetReg);
VADD(fpScratchReg2, fpScratchReg2, fpVoffsetReg);
VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff);
VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4);
if (false && cpu_info.bNEON) {
// TODO: Needs testing
ADD(scratchReg, srcReg, dec_->tcoff);
VLD1_lane(I_32, neonScratchReg, scratchReg, 0, false);
VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg); // Widen to 32-bit
VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
VMUL(F_32, neonScratchReg, neonScratchReg, neonUVScaleReg);
VADD(F_32, neonScratchReg, neonScratchReg, neonUVOffsetReg);
ADD(scratchReg2, dstReg, dec_->decFmt.uvoff); // point scratchReg2 at the output UV pair
VST1(F_32, neonScratchReg, scratchReg2, 1, ALIGN_NONE);
} else {
LDRH(tempReg1, srcReg, dec_->tcoff);
LDRH(tempReg2, srcReg, dec_->tcoff + 2);
VMOV(fpScratchReg, tempReg1);
VMOV(fpScratchReg2, tempReg2);
VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT);
VMUL(fpScratchReg, fpScratchReg, fpUscaleReg);
VMUL(fpScratchReg2, fpScratchReg2, fpVscaleReg);
VADD(fpScratchReg, fpScratchReg, fpUoffsetReg);
VADD(fpScratchReg2, fpScratchReg2, fpVoffsetReg);
VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff);
VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4);
}
}
void VertexDecoderJitCache::Jit_TcFloatPrescale() {
// TODO: SIMD
VLDR(fpScratchReg, srcReg, dec_->tcoff);
VLDR(fpScratchReg2, srcReg, dec_->tcoff + 4);
VMUL(fpScratchReg, fpScratchReg, fpUscaleReg);
VMUL(fpScratchReg2, fpScratchReg2, fpVscaleReg);
VADD(fpScratchReg, fpScratchReg, fpUoffsetReg);
VADD(fpScratchReg2, fpScratchReg2, fpVoffsetReg);
VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff);
VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4);
if (cpu_info.bNEON) {
ADD(scratchReg, srcReg, dec_->tcoff);
VLD1(F_32, neonScratchReg, scratchReg, 1, ALIGN_NONE);
ADD(scratchReg2, dstReg, dec_->decFmt.uvoff);
VMUL(F_32, neonScratchReg, neonScratchReg, neonUVScaleReg);
VADD(F_32, neonScratchReg, neonScratchReg, neonUVOffsetReg);
VST1(F_32, neonScratchReg, scratchReg2, 1, ALIGN_NONE);
} else {
// TODO: SIMD
VLDR(fpScratchReg, srcReg, dec_->tcoff);
VLDR(fpScratchReg2, srcReg, dec_->tcoff + 4);
VMUL(fpScratchReg, fpScratchReg, fpUscaleReg);
VMUL(fpScratchReg2, fpScratchReg2, fpVscaleReg);
VADD(fpScratchReg, fpScratchReg, fpUoffsetReg);
VADD(fpScratchReg2, fpScratchReg2, fpVoffsetReg);
VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff);
VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4);
}
}
void VertexDecoderJitCache::Jit_Color8888() {

View File

@@ -73,8 +73,6 @@ private:
UI::EventReturn OnLoadState(UI::EventParams &e);
UI::EventReturn OnRewind(UI::EventParams &e);
UI::EventReturn OnLanguageChange(UI::EventParams &e);
UI::EventReturn OnStateSelected(UI::EventParams &e);
UI::EventReturn OnCwCheat(UI::EventParams &e);

View File

@@ -1,6 +1,6 @@
APP_STL := gnustl_static
#APP_ABI := armeabi-v7a x86
APP_ABI := armeabi-v7a armeabi x86
#APP_ABI := armeabi-v7a
#APP_ABI := armeabi-v7a armeabi x86
APP_ABI := armeabi-v7a
APP_GNUSTL_CPP_FEATURES :=
NDK_TOOLCHAIN_VERSION := 4.8

View File

@@ -2,6 +2,7 @@
#include "ArmEmitterTest.h"
#include "Common/ArmEmitter.h"
#include "Common/CPUDetect.h"
static bool functionWasCalled;
@@ -26,6 +27,9 @@ static float a[4] = {1.0f, 2.0f, 3.0f, 4.5f};
static float b[4] = {1.0f, 1.0f, 1.0f, 0.5f};
static float c[4] = {0.0f, 0.0f, 0.0f, 0.0f};
static u32 x[4] = {0x04030201, 0x08070605, 0x0, 0x0};
static u32 y[4] = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF};
static u32 z[4] = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF};
void TestCode::Generate()
{
@@ -34,6 +38,7 @@ void TestCode::Generate()
PUSH(2, R11, _LR);
// Load the three pointers
/*
MOVP2R(R0, a);
MOVP2R(R1, b);
MOVP2R(R2, c);
@@ -43,10 +48,28 @@
VLD1(F_32, D2, R1, 2); // Load another 2 doubles
// VADD(F_32, Q2, Q0, Q1); // Add them, seeing them as floating point quads
VMUL_scalar(F_32, Q2, Q0, DScalar(D3, 1)); // Multiply a quad by a scalar (ultra efficient for matrix mul! limitation: Scalar has to come out of D0-D15)
ADD(R1, R1, 12);
VLD1_all_lanes(F_32, Q2, R1, true);
ADD(R0, R0, 12);
VLD1_lane(F_32, D4, R0, 1, true);
u32 word = *(u32 *)(GetCodePtr() - 4);
ILOG("Instruction Word: %08x", word);
// VMUL(F_32, Q2, Q0, Q1);
VST1(F_32, D4, R2, 2);
*/
// Let's try some integer stuff
MOVP2R(R0, x);
MOVP2R(R1, y);
MOVP2R(R2, z);
MOVP2R(R3, c);
VLD1(I_32, D0, R0, 1); // Load 1 double
VMOVL(I_8 | I_UNSIGNED, Q1, D0);
VMOVL(I_16 | I_UNSIGNED, Q2, D2);
VCVT(F_32 | I_SIGNED, Q3, Q2);
VST1(I_32, D2, R1, 2);
VST1(I_32, D4, R2, 2);
VST1(I_32, D6, R3, 2);
// This works!
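For reference, a plain C++ cross-check of the integer sequence above, with the values it should leave behind (hand-derived here, not output captured from the commit; assumes ARM's little-endian byte layout):

#include <cstdint>
#include <cstring>

static void ReferenceWiden(const uint32_t *x, uint32_t *y, uint32_t *z, float *c) {
	const uint8_t *bytes = (const uint8_t *)x;   // D0 after the VLD1: bytes 01 02 03 04 05 06 07 08
	uint16_t w16[8];
	for (int i = 0; i < 8; i++)
		w16[i] = bytes[i];                       // VMOVL I_8 | I_UNSIGNED: u8 -> u16 into Q1 (D2:D3)
	memcpy(y, w16, 16);                          // VST1 of D2,D3 into y
	for (int i = 0; i < 4; i++)
		z[i] = w16[i];                           // VMOVL I_16 | I_UNSIGNED of D2: u16 -> u32 into Q2 (D4:D5)
	for (int i = 0; i < 4; i++)
		c[i] = (float)(int32_t)z[i];             // VCVT F_32 | I_SIGNED: s32 -> f32 into Q3 (D6:D7)
}
// With x = {0x04030201, 0x08070605, 0, 0} this yields
// y = {0x00020001, 0x00040003, 0x00060005, 0x00080007}, z = {1, 2, 3, 4}, c = {1.0f, 2.0f, 3.0f, 4.0f}.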
@@ -92,6 +115,10 @@ void ArmEmitterTest()
// Disabled for now.
return;
// If I commit with it enabled by accident, let's not blow up.
if (!cpu_info.bNEON)
return;
for (int i = 0; i < 6; i++) {
ILOG("--------------------------");
}
@@ -106,6 +133,9 @@
u32 retval = CallPtr(gen.testCodePtr);
// ILOG("ARM emitter test 1 passed if %f == 3.0! retval = %08x", abc[32 + 31], retval);
ILOG("x: %08x %08x %08x %08x", x[0], x[1], x[2], x[3]);
ILOG("y: %08x %08x %08x %08x", y[0], y[1], y[2], y[3]);
ILOG("z: %08x %08x %08x %08x", z[0], z[1], z[2], z[3]);
ILOG("c: %f %f %f %f", c[0], c[1], c[2], c[3]);
for (int i = 0; i < 6; i++) {
ILOG("--------------------------");