From ca58f322e53e1c40c69e9c7d02edd8fa6c649a82 Mon Sep 17 00:00:00 2001
From: Henrik Rydgard
Date: Sun, 22 Mar 2015 00:04:50 +0100
Subject: [PATCH] ARM64: Port over some missing VFPU instructions from ARM. Not
 much left now.

---
 Common/Arm64Emitter.cpp           |   4 +-
 Common/Arm64Emitter.h             |   2 +-
 Common/ArmCommon.h                |   5 +
 Core/MIPS/ARM64/Arm64CompVFPU.cpp | 270 +++++++++++++++++++++++++++++-
 4 files changed, 273 insertions(+), 8 deletions(-)

diff --git a/Common/Arm64Emitter.cpp b/Common/Arm64Emitter.cpp
index d636ec41b1..24becd38b2 100644
--- a/Common/Arm64Emitter.cpp
+++ b/Common/Arm64Emitter.cpp
@@ -3280,12 +3280,12 @@ void ARM64XEmitter::ANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch)
 	}
 }
 
-void ARM64XEmitter::ORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) {
+void ARM64XEmitter::ORRI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) {
 	unsigned int n, imm_s, imm_r;
 	if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r)) {
 		ORR(Rd, Rn, imm_r, imm_s, n);
 	} else {
-		_assert_msg_(JIT, scratch != INVALID_REG, "ORI2R - failed to construct immediate value from %08x, need scratch", (u32)imm);
+		_assert_msg_(JIT, scratch != INVALID_REG, "ORRI2R - failed to construct immediate value from %08x, need scratch", (u32)imm);
 		MOVI2R(scratch, imm);
 		ORR(Rd, Rn, scratch);
 	}
 }
diff --git a/Common/Arm64Emitter.h b/Common/Arm64Emitter.h
index 28a3a277b9..c388b8ad7a 100644
--- a/Common/Arm64Emitter.h
+++ b/Common/Arm64Emitter.h
@@ -691,7 +691,7 @@ public:
 	void ANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
 	void ANDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
 	void TSTI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG) { ANDSI2R(Is64Bit(Rn) ? ZR : WZR, Rn, imm, scratch); }
-	void ORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+	void ORRI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
 	void EORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
 	void CMPI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
 
diff --git a/Common/ArmCommon.h b/Common/ArmCommon.h
index e55756016b..01968a5062 100644
--- a/Common/ArmCommon.h
+++ b/Common/ArmCommon.h
@@ -28,3 +28,8 @@ enum CCFlags
 };
 
 const u32 NO_COND = 0xE0000000;
+inline CCFlags InvertCond(CCFlags fl) {
+	int x = (int)fl;
+	x ^= 1;  // Conditions come in inverse pairs (EQ/NEQ, CS/CC, GE/LT, ...), so flipping bit 0 inverts.
+	return (CCFlags)x;
+}
\ No newline at end of file
diff --git a/Core/MIPS/ARM64/Arm64CompVFPU.cpp b/Core/MIPS/ARM64/Arm64CompVFPU.cpp
index 361d661edb..f1459db446 100644
--- a/Core/MIPS/ARM64/Arm64CompVFPU.cpp
+++ b/Core/MIPS/ARM64/Arm64CompVFPU.cpp
@@ -1195,11 +1195,231 @@ namespace MIPSComp
 	}
 
 	void Arm64Jit::Comp_Vcmp(MIPSOpcode op) {
-		DISABLE;
+		CONDITIONAL_DISABLE;
+		if (js.HasUnknownPrefix())
+			DISABLE;
+
+		VectorSize sz = GetVecSize(op);
+		int n = GetNumVectorElements(sz);
+
+		VCondition cond = (VCondition)(op & 0xF);
+
+		u8 sregs[4], tregs[4];
+		GetVectorRegsPrefixS(sregs, sz, _VS);
+		GetVectorRegsPrefixT(tregs, sz, _VT);
+
+		// For some of these, we just fall back to the interpreter.
+		// ES is really just equivalent to (value & 0x7F800000) == 0x7F800000.
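+		// In scalar C++ terms (illustrative only, not part of this patch):
+		//   u32 bits; memcpy(&bits, &f, sizeof(bits));
+		//   bool nan_or_inf = (bits & 0x7F800000) == 0x7F800000;  // exponent field all ones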
+
+		switch (cond) {
+		case VC_EI: // c = my_isinf(s[i]); break;
+		case VC_NI: // c = !my_isinf(s[i]); break;
+			DISABLE;
+		case VC_ES: // c = my_isnan(s[i]) || my_isinf(s[i]); break;  // Tekken Dark Resurrection
+		case VC_NS: // c = !my_isnan(s[i]) && !my_isinf(s[i]); break;
+		case VC_EN: // c = my_isnan(s[i]); break;
+		case VC_NN: // c = !my_isnan(s[i]); break;
+			if (_VS != _VT)
+				DISABLE;
+			break;
+
+		case VC_EZ:
+		case VC_NZ:
+			break;
+		default:
+			;
+		}
+
+		// First, let's handle the trivial ones.
+		int affected_bits = (1 << 4) | (1 << 5);  // The "any" and "all" aggregate bits.
+
+		MOVI2R(SCRATCH1, 0);
+		for (int i = 0; i < n; ++i) {
+			// Let's only handle the easy ones, and fall back on the interpreter for the rest.
+			CCFlags flag = CC_AL;
+			switch (cond) {
+			case VC_FL: // c = 0;
+				break;
+
+			case VC_TR: // c = 1;
+				if (i == 0) {
+					if (n == 1) {
+						MOVI2R(SCRATCH1, 0x31);
+					} else {
+						MOVI2R(SCRATCH1, 1 << i);
+					}
+				} else {
+					ORRI2R(SCRATCH1, SCRATCH1, 1 << i);
+				}
+				break;
+
+			case VC_ES: // c = my_isnan(s[i]) || my_isinf(s[i]); break;  // Tekken Dark Resurrection
+			case VC_NS: // c = !(my_isnan(s[i]) || my_isinf(s[i])); break;
+				// For these, we use the integer ALU as there is no FP instruction for testing for INF.
+				// Testing for nan or inf is the same as testing whether (value & 0x7F800000) == 0x7F800000.
+				// We need an extra temporary register, so we store away SCRATCH1.
+				STR(INDEX_UNSIGNED, SCRATCH1, CTXREG, offsetof(MIPSState, temp));
+				fpr.MapRegV(sregs[i], 0);
+				MOVI2R(SCRATCH1, 0x7F800000);
+				fp.FMOV(SCRATCH2, fpr.V(sregs[i]));
+				AND(SCRATCH2, SCRATCH2, SCRATCH1);
+				CMP(SCRATCH2, SCRATCH1);  // (SCRATCH2 & 0x7F800000) == 0x7F800000
+				flag = cond == VC_ES ? CC_EQ : CC_NEQ;
+				LDR(INDEX_UNSIGNED, SCRATCH1, CTXREG, offsetof(MIPSState, temp));
+				break;
+
+			case VC_EN: // c = my_isnan(s[i]); break;  // Tekken 6
+				// Should we involve T? Where I found this used, it compared a register with itself, so it should be fine.
+				fpr.MapInInV(sregs[i], tregs[i]);
+				fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
+				flag = CC_VS;  // overflow = unordered : http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/Chdhcfbc.html
+				break;
+
+			case VC_NN: // c = !my_isnan(s[i]); break;
+				// Should we involve T? Where I found this used, it compared a register with itself, so it should be fine.
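+				// Note: FCMP of a value against itself is unordered exactly when the value
+				// is a NaN, which is what makes the V flag test here work.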
+				fpr.MapInInV(sregs[i], tregs[i]);
+				fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
+				flag = CC_VC;  // !overflow = !unordered : http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/Chdhcfbc.html
+				break;
+
+			case VC_EQ: // c = s[i] == t[i];
+				fpr.MapInInV(sregs[i], tregs[i]);
+				fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
+				flag = CC_EQ;
+				break;
+
+			case VC_LT: // c = s[i] < t[i];
+				fpr.MapInInV(sregs[i], tregs[i]);
+				fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
+				flag = CC_LO;
+				break;
+
+			case VC_LE: // c = s[i] <= t[i];
+				fpr.MapInInV(sregs[i], tregs[i]);
+				fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
+				flag = CC_LS;
+				break;
+
+			case VC_NE: // c = s[i] != t[i];
+				fpr.MapInInV(sregs[i], tregs[i]);
+				fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
+				flag = CC_NEQ;
+				break;
+
+			case VC_GE: // c = s[i] >= t[i];
+				fpr.MapInInV(sregs[i], tregs[i]);
+				fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
+				flag = CC_GE;
+				break;
+
+			case VC_GT: // c = s[i] > t[i];
+				fpr.MapInInV(sregs[i], tregs[i]);
+				fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
+				flag = CC_GT;
+				break;
+
+			case VC_EZ: // c = s[i] == 0.0f || s[i] == -0.0f;
+				fpr.MapRegV(sregs[i]);
+				fp.FCMP(fpr.V(sregs[i]));  // fcmp sregs[i], #0.0
+				flag = CC_EQ;
+				break;
+
+			case VC_NZ: // c = s[i] != 0;
+				fpr.MapRegV(sregs[i]);
+				fp.FCMP(fpr.V(sregs[i]));  // fcmp sregs[i], #0.0
+				flag = CC_NEQ;
+				break;
+
+			default:
+				DISABLE;
+			}
+			if (flag != CC_AL) {
+				FixupBranch b = B(InvertCond(flag));
+				if (i == 0) {
+					if (n == 1) {
+						MOVI2R(SCRATCH1, 0x31);
+					} else {
+						MOVI2R(SCRATCH1, 1);  // 1 << i, but i == 0
+					}
+				} else {
+					ORRI2R(SCRATCH1, SCRATCH1, 1 << i);
+				}
+				SetJumpTarget(b);
+			}
+
+			affected_bits |= 1 << i;
+		}
+
+		// Aggregate the bits. Urgh, expensive. Can optimize for the case of one comparison,
+		// which is the most common case after all.
+		if (n > 1) {
+			CMP(SCRATCH1, affected_bits & 0xF);
+			FixupBranch skip1 = B(CC_NEQ);
+			ORRI2R(SCRATCH1, SCRATCH1, 1 << 5);
+			SetJumpTarget(skip1);
+
+			CMP(SCRATCH1, 0);
+			FixupBranch skip2 = B(CC_EQ);
+			ORRI2R(SCRATCH1, SCRATCH1, 1 << 4);
+			SetJumpTarget(skip2);
+		}
+
+		gpr.MapReg(MIPS_REG_VFPUCC, MAP_DIRTY);
+		ANDI2R(gpr.R(MIPS_REG_VFPUCC), gpr.R(MIPS_REG_VFPUCC), ~affected_bits);
+		ORR(gpr.R(MIPS_REG_VFPUCC), gpr.R(MIPS_REG_VFPUCC), SCRATCH1);
+
+		fpr.ReleaseSpillLocksAndDiscardTemps();
 	}
 
 	void Arm64Jit::Comp_Vcmov(MIPSOpcode op) {
-		DISABLE;
+		CONDITIONAL_DISABLE;
+		if (js.HasUnknownPrefix()) {
+			DISABLE;
+		}
+
+		VectorSize sz = GetVecSize(op);
+		int n = GetNumVectorElements(sz);
+
+		u8 sregs[4], dregs[4];
+		GetVectorRegsPrefixS(sregs, sz, _VS);
+		GetVectorRegsPrefixD(dregs, sz, _VD);
+		int tf = (op >> 19) & 1;
+		int imm3 = (op >> 16) & 7;
+
+		for (int i = 0; i < n; ++i) {
+			// Simplification: disable if overlap is unsafe.
+			if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs)) {
+				DISABLE;
+			}
+		}
+
+		if (imm3 < 6) {
+			// Test one bit of CC. This bit decides whether none or all of the subregisters are copied.
+			fpr.MapRegsAndSpillLockV(dregs, sz, MAP_DIRTY);
+			fpr.MapRegsAndSpillLockV(sregs, sz, 0);
+			gpr.MapReg(MIPS_REG_VFPUCC);
+			TSTI2R(gpr.R(MIPS_REG_VFPUCC), 1 << imm3);
+			// TODO: Use FCSEL?
+			FixupBranch b = B(tf ? CC_NEQ : CC_EQ);
+			for (int i = 0; i < n; i++) {
+				fp.FMOV(fpr.V(dregs[i]), fpr.V(sregs[i]));
+			}
+			SetJumpTarget(b);
+		} else {
+			// Look at the bottom four bits of CC to individually decide if the subregisters should be copied.
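+			// For example, with a vec4, CC = 0b0101 and tf == 0, lanes 0 and 2 are
+			// copied from S while lanes 1 and 3 keep their old values.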
+			fpr.MapRegsAndSpillLockV(dregs, sz, MAP_DIRTY);
+			fpr.MapRegsAndSpillLockV(sregs, sz, 0);
+			gpr.MapReg(MIPS_REG_VFPUCC);
+			for (int i = 0; i < n; i++) {
+				TSTI2R(gpr.R(MIPS_REG_VFPUCC), 1 << i);
+				FixupBranch b = B(tf ? CC_NEQ : CC_EQ);
+				fp.FMOV(fpr.V(dregs[i]), fpr.V(sregs[i]));
+				SetJumpTarget(b);
+			}
+		}
+
+		ApplyPrefixD(dregs, sz);
+		fpr.ReleaseSpillLocksAndDiscardTemps();
 	}
 
 	void Arm64Jit::Comp_Viim(MIPSOpcode op) {
@@ -1303,7 +1523,7 @@ namespace MIPSComp
 	// Very heavily used by FF:CC. Should be replaced by a fast approximation instead of
 	// calling the math library.
 	void Arm64Jit::Comp_VRot(MIPSOpcode op) {
-		DISABLE;
+		DISABLE;  // Need to figure out how to deal with the return values from the function.
 
 		// VRot probably doesn't accept prefixes anyway.
 		CONDITIONAL_DISABLE;
@@ -1343,7 +1563,6 @@ namespace MIPSComp
 		bool negSin1 = (imm & 0x10) ? true : false;
 
 		fpr.MapRegV(sreg);
-		// We should write a custom pure-asm function instead.
 		fp.FMOV(S0, fpr.V(sreg));
 		QuickCallFunction(SCRATCH2_64, negSin1 ? (void *)&SinCosNegSin : (void *)&SinCos);
 		CompVrotShuffle(dregs, imm, sz, false);
@@ -1362,7 +1581,48 @@ namespace MIPSComp
 	}
 
 	void Arm64Jit::Comp_Vocp(MIPSOpcode op) {
-		DISABLE;
+		CONDITIONAL_DISABLE;
+		if (js.HasUnknownPrefix()) {
+			DISABLE;
+		}
+
+		VectorSize sz = GetVecSize(op);
+		int n = GetNumVectorElements(sz);
+
+		u8 sregs[4], dregs[4];
+		// Actually, not sure that this instruction accepts an S prefix. We don't apply it in the
+		// interpreter. But whatever.
+		GetVectorRegsPrefixS(sregs, sz, _VS);
+		GetVectorRegsPrefixD(dregs, sz, _VD);
+
+		MIPSReg tempregs[4];
+		for (int i = 0; i < n; ++i) {
+			if (!IsOverlapSafe(dregs[i], i, n, sregs)) {
+				tempregs[i] = fpr.GetTempV();
+			} else {
+				tempregs[i] = dregs[i];
+			}
+		}
+
+		fp.MOVI2F(S0, 1.0f, SCRATCH1);
+		for (int i = 0; i < n; ++i) {
+			fpr.MapDirtyInV(tempregs[i], sregs[i]);
+			// Let's do it in scalar FP registers for now. NEON later.
+			// There's gotta be a shorter way, but I can't find one that treats NaNs
+			// like the interpreter does (it ignores them and just operates on the bits).
+			fp.FSUB(fpr.V(tempregs[i]), S0, fpr.V(sregs[i]));
+		}
+
+		for (int i = 0; i < n; ++i) {
+			if (dregs[i] != tempregs[i]) {
+				fpr.MapDirtyInV(dregs[i], tempregs[i]);
+				fp.FMOV(fpr.V(dregs[i]), fpr.V(tempregs[i]));
+			}
+		}
+
+		ApplyPrefixD(dregs, sz);
+
+		fpr.ReleaseSpillLocksAndDiscardTemps();
 	}
 
 	void Arm64Jit::Comp_ColorConv(MIPSOpcode op) {
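
A note on the CC layout used by Comp_Vcmp above: bits 0..3 of VFPU_CC hold the
per-lane comparison results, bit 4 means "any lane true" and bit 5 means "all
lanes true", which is why the single-lane case can load the constant 0x31
(bits 0, 4 and 5) in one go. A scalar sketch of the aggregation, using a
hypothetical lane_result() helper (illustrative only, not part of the patch):

	u32 cc = 0;
	for (int i = 0; i < n; ++i)
		cc |= (u32)lane_result(i) << i;     // lane_result() stands in for the per-lane compare
	if (cc == (1u << n) - 1) cc |= 1 << 5;  // all lanes true
	if (cc != 0)             cc |= 1 << 4;  // any lane true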