arm64jit: Use CBZ/CBNZ for zero compare branches.

These compares are pretty common, so this reduces code bloat decently.  Seems about
the same speed, though.
Unknown W. Brackets 2017-12-29 13:13:07 -08:00
parent 7f8a871e30
commit c71285c970
2 changed files with 50 additions and 28 deletions
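
For context: on AArch64, a compare against zero plus a conditional branch normally costs two instructions (CMP, then B.cc), while CBZ/CBNZ fuse the zero test and the branch into one. Below is a standalone sketch of the selection rule the new code applies; the names (Lower, the CC enum) are invented for illustration, and the real code additionally gates the fast path on the delay slot cooperating (likely || delaySlotIsNice).

#include <cstdio>

// Standalone illustration -- invented names, not PPSSPP's API.
enum CC { CC_EQ, CC_NEQ, CC_LT };

// Which AArch64 sequence a MIPS compare-branch lowers to, given whether
// either operand is a known-zero immediate.
const char *Lower(CC cc, bool rsIsZero, bool rtIsZero) {
	// EQ/NE are the only symmetric conditions (operands can be flipped),
	// and the only ones CBZ/CBNZ can express.
	const bool canFlip = cc == CC_EQ || cc == CC_NEQ;
	if ((rsIsZero || rtIsZero) && canFlip)
		return cc == CC_EQ ? "CBZ  reg, target" : "CBNZ reg, target";
	return "CMP  reg, operand; B.cc target";
}

int main() {
	printf("beq a0, zero -> %s\n", Lower(CC_EQ, false, true));
	printf("bne zero, a1 -> %s\n", Lower(CC_NEQ, true, false));
	printf("signed cmp   -> %s\n", Lower(CC_LT, false, true));
	return 0;
}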

Core/MIPS/ARM64/Arm64CompBranch.cpp

@@ -128,36 +128,56 @@ void Arm64Jit::BranchRSRTComp(MIPSOpcode op, CCFlags cc, bool likely)
 	// We might be able to flip the condition (EQ/NEQ are easy.)
 	const bool canFlip = cc == CC_EQ || cc == CC_NEQ;
 
-	u32 val;
-	bool shift;
-	if (gpr.IsImm(rt) && IsImmArithmetic(gpr.GetImm(rt), &val, &shift)) {
-		gpr.MapReg(rs);
-		CMP(gpr.R(rs), val, shift);
-	} else if (gpr.IsImm(rt) && IsImmArithmetic((u64)(s64)-(s32)gpr.GetImm(rt), &val, &shift)) {
-		gpr.MapReg(rs);
-		CMN(gpr.R(rs), val, shift);
-	} else if (gpr.IsImm(rs) && IsImmArithmetic(gpr.GetImm(rs), &val, &shift) && canFlip) {
-		gpr.MapReg(rt);
-		CMP(gpr.R(rt), val, shift);
-	} else if (gpr.IsImm(rs) && IsImmArithmetic((u64)(s64)-(s32)gpr.GetImm(rs), &val, &shift) && canFlip) {
-		gpr.MapReg(rt);
-		CMN(gpr.R(rt), val, shift);
-	} else {
-		gpr.MapInIn(rs, rt);
-		CMP(gpr.R(rs), gpr.R(rt));
-	}
+	const bool rsIsZero = gpr.IsImm(rs) && gpr.GetImm(rs) == 0;
+	const bool rtIsZero = gpr.IsImm(rt) && gpr.GetImm(rt) == 0;
 
 	Arm64Gen::FixupBranch ptr;
-	if (!likely) {
-		if (!delaySlotIsNice)
-			CompileDelaySlot(DELAYSLOT_SAFE_FLUSH);
-		else
-			FlushAll();
-		ptr = B(cc);
-	} else {
+	if ((likely || delaySlotIsNice) && (rsIsZero || rtIsZero) && canFlip) {
+		// Special case, we can just use CBZ/CBNZ directly.
+		MIPSGPReg r = rsIsZero ? rt : rs;
+		gpr.MapReg(r);
+		// Flush should keep r in the same armreg.
+		ARM64Reg ar = gpr.R(r);
 		FlushAll();
-		ptr = B(cc);
+		if (cc == CC_EQ) {
+			ptr = CBZ(ar);
+		} else {
+			ptr = CBNZ(ar);
+		}
+	} else {
+		u32 val;
+		bool shift;
+		if (gpr.IsImm(rt) && IsImmArithmetic(gpr.GetImm(rt), &val, &shift)) {
+			gpr.MapReg(rs);
+			CMP(gpr.R(rs), val, shift);
+		} else if (gpr.IsImm(rt) && IsImmArithmetic((u64)(s64)-(s32)gpr.GetImm(rt), &val, &shift)) {
+			gpr.MapReg(rs);
+			CMN(gpr.R(rs), val, shift);
+		} else if (gpr.IsImm(rs) && IsImmArithmetic(gpr.GetImm(rs), &val, &shift) && canFlip) {
+			gpr.MapReg(rt);
+			CMP(gpr.R(rt), val, shift);
+		} else if (gpr.IsImm(rs) && IsImmArithmetic((u64)(s64)-(s32)gpr.GetImm(rs), &val, &shift) && canFlip) {
+			gpr.MapReg(rt);
+			CMN(gpr.R(rt), val, shift);
+		} else {
+			gpr.MapInIn(rs, rt);
+			CMP(gpr.R(rs), gpr.R(rt));
+		}
+
+		if (!likely) {
+			if (!delaySlotIsNice)
+				CompileDelaySlot(DELAYSLOT_SAFE_FLUSH);
+			else
+				FlushAll();
+			ptr = B(cc);
+		} else {
+			FlushAll();
+			ptr = B(cc);
+		}
 	}
 
 	if (likely) {
 		// Only executed when taking the branch.
 		CompileDelaySlot(DELAYSLOT_FLUSH);
 	}
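
A note on the guard in the fast path above, as I read it: CBZ/CBNZ read the register at branch time and don't use the NZCV flags, so the compared value has to still be in the captured host register (ar) after the delay slot and FlushAll() run. That's what the (likely || delaySlotIsNice) condition and the "Flush should keep r in the same armreg" comment are about. A rough sketch of the two emission orders, with a hypothetical host register w19:

// Rough sketch, hypothetical registers -- not actual emitted code.
//
// New fast path ("bne v0, zero, target", delay slot is nice):
//   ...delay slot...          // nice: known not to write w19 (v0's host reg)
//   ...FlushAll() stores...   // writes dirty values back; w19 keeps its value
//   CBNZ w19, target          // fused compare-and-branch, no flags read
//
// Old path for the same branch:
//   CMP  w19, #0              // sets NZCV
//   ...delay slot / flush...  // must leave NZCV intact until the branch
//   B.NE target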
@@ -244,7 +264,6 @@ void Arm64Jit::BranchRSZeroComp(MIPSOpcode op, CCFlags cc, bool andLink, bool likely)
 	if (!likely && delaySlotIsNice)
 		CompileDelaySlot(DELAYSLOT_NICE);
 
-	// TODO: Maybe we could use BZ here?
 	gpr.MapReg(rs);
 	CMP(gpr.R(rs), 0);

Core/MIPS/ARM64/Arm64RegCache.cpp

@@ -690,6 +690,9 @@ void Arm64RegCache::FlushR(MIPSGPReg r) {
 }
 
 void Arm64RegCache::FlushAll() {
+	// Note: make sure not to change the registers when flushing:
+	// Branching code expects the armreg to retain its value.
+
 	// LO can't be included in a 32-bit pair, since it's 64 bit.
 	// Flush it first so we don't get it confused.
 	FlushR(MIPS_REG_LO);
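
The comment added here records the invariant the CBZ/CBNZ path above now leans on: a flush may write dirty values back and drop mappings, but it must not move a guest value into a different host register, because the branch captured its host register before calling FlushAll(). A toy sketch of a flush loop that keeps that promise (invented, simplified types, not the real Arm64RegCache):

#include <cstdint>
#include <cstddef>

// Toy register cache -- illustrative only, not PPSSPP's implementation.
struct ToyRegCache {
	static const size_t NUM_GUEST = 32;
	int hostReg[NUM_GUEST];        // host reg holding each guest reg (-1 = none)
	bool dirty[NUM_GUEST];         // true if the host value needs write-back
	uint32_t guestMem[NUM_GUEST];  // backing guest register context
	uint32_t hostRegs[29];         // pretend host register file

	ToyRegCache() {
		for (size_t r = 0; r < NUM_GUEST; ++r) {
			hostReg[r] = -1;
			dirty[r] = false;
			guestMem[r] = 0;
		}
		for (size_t i = 0; i < 29; ++i)
			hostRegs[i] = 0;
	}

	void FlushAll() {
		for (size_t r = 0; r < NUM_GUEST; ++r) {
			if (hostReg[r] >= 0) {
				if (dirty[r]) {
					// Write back in place: hostRegs[hostReg[r]] is never
					// modified or reassigned, so a branch that captured that
					// host register still sees the value after the flush.
					guestMem[r] = hostRegs[hostReg[r]];
					dirty[r] = false;
				}
				hostReg[r] = -1;  // drop the mapping; the value stays put
			}
		}
	}
};

int main() {
	ToyRegCache cache;
	cache.hostReg[2] = 19;     // pretend guest v0 lives in host w19
	cache.dirty[2] = true;
	cache.hostRegs[19] = 42;
	int ar = cache.hostReg[2]; // branch code captures the host reg first...
	cache.FlushAll();          // ...then flushes everything
	// The captured register still holds v0's value; safe to emit CBZ/CBNZ on it.
	return cache.hostRegs[ar] == 42 ? 0 : 1;
}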