ARM64: Port over some missing VFPU instructions from ARM. Not much left now.

2024-11-26 23:10:38 +00:00 · 2015-03-22 00:04:50 +01:00 · 2015-03-22 00:04:50 +01:00 · ca58f322e5
commit ca58f322e5
parent f06e9a9d18
4 changed files with 273 additions and 8 deletions
--- a/Common/Arm64Emitter.cpp
+++ b/Common/Arm64Emitter.cpp
@ -3280,12 +3280,12 @@ void ARM64XEmitter::ANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch)
 	}
 }
-void ARM64XEmitter::ORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) {
+void ARM64XEmitter::ORRI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) {
 	unsigned int n, imm_s, imm_r;
 	if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r)) {
 		ORR(Rd, Rn, imm_r, imm_s, n);
 	} else {
-		_assert_msg_(JIT, scratch != INVALID_REG, "ORI2R - failed to construct immediate value from %08x, need scratch", (u32)imm);
+		_assert_msg_(JIT, scratch != INVALID_REG, "ORRI2R - failed to construct immediate value from %08x, need scratch", (u32)imm);
 		MOVI2R(scratch, imm);
 		ORR(Rd, Rn, scratch);
 	}
--- a/Common/Arm64Emitter.h
+++ b/Common/Arm64Emitter.h
@ -691,7 +691,7 @@ public:
 	void ANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
 	void ANDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
 	void TSTI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG) { ANDSI2R(Is64Bit(Rn) ? ZR : WZR, Rn, imm, scratch); }
-	void ORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+	void ORRI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
 	void EORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
 	void CMPI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
--- a/Common/ArmCommon.h
+++ b/Common/ArmCommon.h
@ -28,3 +28,8 @@ enum CCFlags
 };
 const u32 NO_COND = 0xE0000000;
 // Returns the condition code whose integer value differs from fl only in bit 0.
 // NOTE(review): presumably CCFlags follows the ARM condition encoding, where a condition
 // and its inverse are adjacent even/odd values - the enum is not visible here; confirm.
 inline CCFlags InvertCond(CCFlags fl) {
 	return (CCFlags)((int)fl ^ 1);
 }
--- a/Core/MIPS/ARM64/Arm64CompVFPU.cpp
+++ b/Core/MIPS/ARM64/Arm64CompVFPU.cpp
@ -1195,11 +1195,231 @@ namespace MIPSComp
 	}
 	// Emits ARM64 code for the VFPU vcmp instruction.
 	//
 	// The condition is taken from the low 4 bits of op. For each of the n vector elements a
 	// per-lane result bit (bit i) is accumulated in SCRATCH1. Afterwards bit 4 is set if any
 	// lane bit is set and bit 5 if all lane bits are set, and the result replaces the
 	// corresponding ("affected") bits of MIPS_REG_VFPUCC.
 	//
 	// Conditions that are not handled here bail out through DISABLE (per the comment below,
 	// that means falling back to the interpreter).
 	void Arm64Jit::Comp_Vcmp(MIPSOpcode op) {
-		DISABLE;
+		CONDITIONAL_DISABLE;
 		if (js.HasUnknownPrefix())
 			DISABLE;
 		VectorSize sz = GetVecSize(op);
 		int n = GetNumVectorElements(sz);
 		VCondition cond = (VCondition)(op & 0xF);
 		u8 sregs[4], tregs[4];
 		GetVectorRegsPrefixS(sregs, sz, _VS);
 		GetVectorRegsPrefixT(tregs, sz, _VT);
 		// Some, we just fall back to the interpreter.
 		// ES is just really equivalent to (value & 0x7F800000) == 0x7F800000.
 		// This pre-check runs before any code is emitted, so bailing out here is clean.
 		switch (cond) {
 		case VC_EI: // c = my_isinf(s[i]); break;
 		case VC_NI: // c = !my_isinf(s[i]); break;
 			DISABLE;
 		case VC_ES: // c = my_isnan(s[i]) || my_isinf(s[i]); break;   // Tekken Dark Resurrection
 		case VC_NS: // c = !my_isnan(s[i]) && !my_isinf(s[i]); break;
 		case VC_EN: // c = my_isnan(s[i]); break;
 		case VC_NN: // c = !my_isnan(s[i]); break;
 			// The unary tests below are only compiled when S and T name the same register.
 			if (_VS != _VT)
 				DISABLE;
 			break;
 		case VC_EZ:
 		case VC_NZ:
 			break;
 		default:
 			;
 		}
 		// First, let's get the trivial ones.
 		// Bits 4 ("any") and 5 ("all") are always rewritten; lane bits are added in the loop.
 		int affected_bits = (1 << 4) | (1 << 5);  // 4 and 5
 		MOVI2R(SCRATCH1, 0);
 		for (int i = 0; i < n; ++i) {
 			// Let's only handle the easy ones, and fall back on the interpreter for the rest.
 			// flag stays CC_AL when no runtime test is needed (constant conditions FL / TR).
 			CCFlags flag = CC_AL;
 			switch (cond) {
 			case VC_FL: // c = 0;
 				break;
 			case VC_TR: // c = 1
 				if (i == 0) {
 					if (n == 1) {
 						// Single element: lane bit 0 plus the "any" and "all" bits in one go.
 						MOVI2R(SCRATCH1, 0x31);
 					} else {
 						MOVI2R(SCRATCH1, 1 << i);
 					}
 				} else {
 					ORRI2R(SCRATCH1, SCRATCH1, 1 << i);
 				}
 				break;
 			case VC_ES: // c = my_isnan(s[i]) || my_isinf(s[i]); break;   // Tekken Dark Resurrection
 			case VC_NS: // c = !(my_isnan(s[i]) || my_isinf(s[i])); break;
 				// For these, we use the integer ALU as there is no support on ARM for testing for INF.
 				// Testing for nan or inf is the same as testing (bits & 0x7F800000) == 0x7F800000.
 				// We need an extra temporary register so we store away SCRATCH1.
 				STR(INDEX_UNSIGNED, SCRATCH1, CTXREG, offsetof(MIPSState, temp));
 				fpr.MapRegV(sregs[i], 0);
 				MOVI2R(SCRATCH1, 0x7F800000);
 				fp.FMOV(SCRATCH2, fpr.V(sregs[i]));
 				AND(SCRATCH2, SCRATCH2, SCRATCH1);
 				CMP(SCRATCH2, SCRATCH1);   // (SCRATCH2 & 0x7F800000) == 0x7F800000
 				flag = cond == VC_ES ? CC_EQ : CC_NEQ;
 				// Restore the accumulated lane bits saved above.
 				LDR(INDEX_UNSIGNED, SCRATCH1, CTXREG, offsetof(MIPSState, temp));
 				break;
 			case VC_EN: // c = my_isnan(s[i]); break;  // Tekken 6
 				// Should we involve T? Where I found this used, it compared a register with itself so should be fine.
 				fpr.MapInInV(sregs[i], tregs[i]);
 				fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
 				flag = CC_VS;  // overflow = unordered : http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/Chdhcfbc.html
 				break;
 			case VC_NN: // c = !my_isnan(s[i]); break;
 				// Should we involve T? Where I found this used, it compared a register with itself so should be fine.
 				fpr.MapInInV(sregs[i], tregs[i]);
 				fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
 				flag = CC_VC;  // !overflow = !unordered : http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/Chdhcfbc.html
 				break;
 			case VC_EQ: // c = s[i] == t[i]
 				fpr.MapInInV(sregs[i], tregs[i]);
 				fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
 				flag = CC_EQ;
 				break;
 			case VC_LT: // c = s[i] < t[i]
 				fpr.MapInInV(sregs[i], tregs[i]);
 				fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
 				flag = CC_LO;
 				break;
 			case VC_LE: // c = s[i] <= t[i];
 				fpr.MapInInV(sregs[i], tregs[i]);
 				fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
 				flag = CC_LS;
 				break;
 			case VC_NE: // c = s[i] != t[i]
 				fpr.MapInInV(sregs[i], tregs[i]);
 				fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
 				flag = CC_NEQ;
 				break;
 			case VC_GE: // c = s[i] >= t[i]
 				fpr.MapInInV(sregs[i], tregs[i]);
 				fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
 				flag = CC_GE;
 				break;
 			case VC_GT: // c = s[i] > t[i]
 				fpr.MapInInV(sregs[i], tregs[i]);
 				fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
 				flag = CC_GT;
 				break;
 			case VC_EZ: // c = s[i] == 0.0f || s[i] == -0.0f
 				fpr.MapRegV(sregs[i]);
 				fp.FCMP(fpr.V(sregs[i])); // vcmp(sregs[i], #0.0)
 				flag = CC_EQ;
 				break;
 			case VC_NZ: // c = s[i] != 0
 				fpr.MapRegV(sregs[i]);
 				fp.FCMP(fpr.V(sregs[i])); // vcmp(sregs[i], #0.0)
 				flag = CC_NEQ;
 				break;
 			default:
 				// NOTE(review): presumably unreachable, since EI/NI already bailed out in the
 				// pre-check above - confirm against the VCondition enum.
 				DISABLE;
 			}
 			if (flag != CC_AL) {
 				// Branch over the bit-setting code when the condition does NOT hold.
 				FixupBranch b = B(InvertCond(flag));
 				if (i == 0) {
 					if (n == 1) {
 						// Single element: lane bit 0 plus the "any" and "all" bits in one go.
 						MOVI2R(SCRATCH1, 0x31);
 					} else {
 						MOVI2R(SCRATCH1, 1);  // 1 << i, but i == 0
 					}
 				} else {
 					ORRI2R(SCRATCH1, SCRATCH1, 1 << i);
 				}
 				SetJumpTarget(b);
 			}
 			affected_bits |= 1 << i;
 		}
 		// Aggregate the bits. Urgh, expensive. Can optimize for the case of one comparison, which is the most common
 		// after all.
 		if (n > 1) {
 			// All lane bits set -> set bit 5.
 			CMP(SCRATCH1, affected_bits & 0xF);
 			FixupBranch skip1 = B(CC_NEQ);
 			ORRI2R(SCRATCH1, SCRATCH1, 1 << 5);
 			SetJumpTarget(skip1);
 			// Any bit set -> set bit 4.
 			CMP(SCRATCH1, 0);
 			FixupBranch skip2 = B(CC_EQ);
 			ORRI2R(SCRATCH1, SCRATCH1, 1 << 4);
 			SetJumpTarget(skip2);
 		}
 		gpr.MapReg(MIPS_REG_VFPUCC, MAP_DIRTY);
 		// ~affected_bits is not encodable as an ARM64 logical immediate for n < 4 (e.g. ~0x31 for
 		// a single element has two separate runs of ones), so ANDI2R needs a scratch register to
 		// materialize the mask. SCRATCH1 holds the new bits; SCRATCH2 is free at this point.
 		ANDI2R(gpr.R(MIPS_REG_VFPUCC), gpr.R(MIPS_REG_VFPUCC), ~affected_bits, SCRATCH2);
 		ORR(gpr.R(MIPS_REG_VFPUCC), gpr.R(MIPS_REG_VFPUCC), SCRATCH1);
 		fpr.ReleaseSpillLocksAndDiscardTemps();
 	}
 	// Emits ARM64 code for the VFPU conditional move (vcmov).
 	//
 	// Copies the S vector into the D vector depending on bits of MIPS_REG_VFPUCC:
 	//  - imm3 < 6: a single CC bit (bit imm3) decides whether all or none of the elements are copied.
 	//  - otherwise: CC bit i individually decides whether element i is copied.
 	// tf (bit 19 of op) inverts the sense of the test: with tf == 0 an element is copied when
 	// its CC bit is set, with tf != 0 when it is clear.
 	void Arm64Jit::Comp_Vcmov(MIPSOpcode op) {
-		DISABLE;
+		CONDITIONAL_DISABLE;
 		if (js.HasUnknownPrefix()) {
 			DISABLE;
 		}
 		VectorSize sz = GetVecSize(op);
 		int n = GetNumVectorElements(sz);
 		u8 sregs[4], dregs[4];
 		GetVectorRegsPrefixS(sregs, sz, _VS);
 		GetVectorRegsPrefixD(dregs, sz, _VD);
 		int tf = (op >> 19) & 1;
 		int imm3 = (op >> 16) & 7;
 		// Bail out before emitting anything if any destination overlaps the sources unsafely.
 		for (int i = 0; i < n; ++i) {
 			// Simplification: Disable if overlap unsafe
 			if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs)) {
 				DISABLE;
 			}
 		}
 		if (imm3 < 6) {
 			// Test one bit of CC. This bit decides whether none or all subregisters are copied.
 			fpr.MapRegsAndSpillLockV(dregs, sz, MAP_DIRTY);
 			fpr.MapRegsAndSpillLockV(sregs, sz, 0);
 			gpr.MapReg(MIPS_REG_VFPUCC);
 			TSTI2R(gpr.R(MIPS_REG_VFPUCC), 1 << imm3);
 			// TODO: Use fsel?
 			// The branch skips the copies: taken when the bit is set (tf != 0) or clear (tf == 0).
 			FixupBranch b = B(tf ? CC_NEQ : CC_EQ);
 			for (int i = 0; i < n; i++) {
 				fp.FMOV(fpr.V(dregs[i]), fpr.V(sregs[i]));
 			}
 			SetJumpTarget(b);
 		} else {
 			// Look at the bottom four bits of CC to individually decide if the subregisters should be copied.
 			fpr.MapRegsAndSpillLockV(dregs, sz, MAP_DIRTY);
 			fpr.MapRegsAndSpillLockV(sregs, sz, 0);
 			gpr.MapReg(MIPS_REG_VFPUCC);
 			for (int i = 0; i < n; i++) {
 				// Same test-and-skip pattern as above, but per element using CC bit i.
 				TSTI2R(gpr.R(MIPS_REG_VFPUCC), 1 << i);
 				FixupBranch b = B(tf ? CC_NEQ : CC_EQ);
 				fp.FMOV(fpr.V(dregs[i]), fpr.V(sregs[i]));
 				SetJumpTarget(b);
 			}
 		}
 		ApplyPrefixD(dregs, sz);
 		fpr.ReleaseSpillLocksAndDiscardTemps();
 	}
 	void Arm64Jit::Comp_Viim(MIPSOpcode op) {
@ -1303,7 +1523,7 @@ namespace MIPSComp
 	// Very heavily used by FF:CC. Should be replaced by a fast approximation instead of
 	// calling the math library.
 	void Arm64Jit::Comp_VRot(MIPSOpcode op) {
-		DISABLE;
+		DISABLE;  // Need to figure out how to deal with the return values from the function.
 		// VRot probably doesn't accept prefixes anyway.
 		CONDITIONAL_DISABLE;
@ -1343,7 +1563,6 @@ namespace MIPSComp
 		bool negSin1 = (imm & 0x10) ? true : false;
 		fpr.MapRegV(sreg);
 		// We should write a custom pure-asm function instead.
 		fp.FMOV(S0, fpr.V(sreg));
 		QuickCallFunction(SCRATCH2_64, negSin1 ? (void *)&SinCosNegSin : (void *)&SinCos);
 		CompVrotShuffle(dregs, imm, sz, false);
@ -1362,7 +1581,48 @@ namespace MIPSComp
 	}
 	// Emits ARM64 code for the VFPU vocp instruction: d[i] = 1.0f - s[i] for each of the n elements.
 	//
 	// Results are first written to temporaries wherever a destination register would unsafely
 	// overlap the sources, then moved into the real destinations afterwards.
 	void Arm64Jit::Comp_Vocp(MIPSOpcode op) {
-		DISABLE;
+		CONDITIONAL_DISABLE;
 		if (js.HasUnknownPrefix()) {
 			DISABLE;
 		}
 		VectorSize sz = GetVecSize(op);
 		int n = GetNumVectorElements(sz);
 		u8 sregs[4], dregs[4];
 		// Actually, not sure that this instruction accepts an S prefix. We don't apply it in the
 		// interpreter. But whatever.
 		GetVectorRegsPrefixS(sregs, sz, _VS);
 		GetVectorRegsPrefixD(dregs, sz, _VD);
 		// Pick the register each result is computed into: the destination itself when safe,
 		// otherwise a fresh temp.
 		MIPSReg tempregs[4];
 		for (int i = 0; i < n; ++i) {
 			if (!IsOverlapSafe(dregs[i], i, n, sregs)) {
 				tempregs[i] = fpr.GetTempV();
 			} else {
 				tempregs[i] = dregs[i];
 			}
 		}
 		// S0 holds the constant 1.0f for all subtractions below (SCRATCH1 is used to build it).
 		fp.MOVI2F(S0, 1.0f, SCRATCH1);
 		for (int i = 0; i < n; ++i) {
 			fpr.MapDirtyInV(tempregs[i], sregs[i]);
 			// Scalar FP subtract per element for now. NEON later.
 			// NOTE(review): the comment carried over from the ARM port talked about integer
 			// registers and NaN handling matching the interpreter; this code simply uses FSUB -
 			// confirm NaN behavior is acceptable.
 			fp.FSUB(fpr.V(tempregs[i]), S0, fpr.V(sregs[i]));
 		}
 		// Copy results out of the temps into the real destinations where temps were used.
 		for (int i = 0; i < n; ++i) {
 			if (dregs[i] != tempregs[i]) {
 				fpr.MapDirtyInV(dregs[i], tempregs[i]);
 				fp.FMOV(fpr.V(dregs[i]), fpr.V(tempregs[i]));
 			}
 		}
 		ApplyPrefixD(dregs, sz);
 		fpr.ReleaseSpillLocksAndDiscardTemps();
 	}
 	void Arm64Jit::Comp_ColorConv(MIPSOpcode op) {