JitArm64: Special-case fmadds with single-precision inputs

If all inputs to an fmadds instruction (including cousins like fmsubs,
fnmadd...) are single-precision, then the result is identical between a
double-precision calculation with an error-free transform (whether the
calculation is fused or not) and a single-precision FMA instruction
(must be fused). So as a performance optimization in JitArm64, if we
were going to use double precision with EFT but the inputs are singles,
instead we'll use a normal single-precision FMA instruction without
anything extra. This lets us skip both the EFT and double-to-single
conversions.

Also renaming `inaccurate_fma` to `nonfused` because it's confusing that
`inaccurate_fma` and `m_accurate_fmadds` have such similar names
despite controlling separate things.
This commit is contained in:
JosJuice
2025-10-03 20:32:27 +02:00
parent 58487f1633
commit 3b1a4739bc
2 changed files with 50 additions and 41 deletions

View File

@@ -82,7 +82,8 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
const bool negate_b = op5 == 28 || op5 == 30;
const bool output_is_single = inst.OPCD == 59;
const bool inaccurate_fma = fma && !Config::Get(Config::SESSION_USE_FMA);
const bool nonfused_requested = fma && !Config::Get(Config::SESSION_USE_FMA);
const bool error_free_transformation_requested = fma && m_accurate_fmadds;
const bool round_c = use_c && output_is_single && !js.op->fprIsSingle[c];
const auto inputs_are_singles_func = [&] {
@@ -90,14 +91,18 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
(!use_c || fpr.IsSingle(c, true));
};
const bool single = inputs_are_singles_func() && output_is_single && !inaccurate_fma;
const bool single = inputs_are_singles_func() && output_is_single &&
(error_free_transformation_requested || !nonfused_requested);
const RegType type = single ? RegType::LowerPairSingle : RegType::LowerPair;
const RegType type_out = output_is_single ?
(single ? RegType::DuplicatedSingle : RegType::Duplicated) :
RegType::LowerPair;
const auto reg_encoder = single ? EncodeRegToSingle : EncodeRegToDouble;
const bool error_free_transformation = fma && !single && output_is_single && m_accurate_fmadds;
const bool nonfused = nonfused_requested && !single;
const bool error_free_transformation =
error_free_transformation_requested && !single && output_is_single;
if (error_free_transformation)
{
gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W30);
@@ -120,13 +125,13 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
}
ARM64Reg result_reg = VD;
ARM64Reg inaccurate_fma_reg = VD;
ARM64Reg nonfused_reg = VD;
if (error_free_transformation)
{
result_reg = reg_encoder(ARM64Reg::Q0);
inaccurate_fma_reg = reg_encoder(ARM64Reg::Q0);
nonfused_reg = reg_encoder(ARM64Reg::Q0);
if (inaccurate_fma && V0Q == ARM64Reg::INVALID_REG)
if (nonfused && V0Q == ARM64Reg::INVALID_REG)
V0Q = fpr.GetScopedReg();
}
else
@@ -138,13 +143,13 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
if (V0Q == ARM64Reg::INVALID_REG)
V0Q = fpr.GetScopedReg();
result_reg = reg_encoder(V0Q);
inaccurate_fma_reg = reg_encoder(V0Q);
nonfused_reg = reg_encoder(V0Q);
}
else if (fma && inaccurate_fma && VD == VB)
else if (fma && nonfused && VD == VB)
{
if (V0Q == ARM64Reg::INVALID_REG)
V0Q = fpr.GetScopedReg();
inaccurate_fma_reg = reg_encoder(V0Q);
nonfused_reg = reg_encoder(V0Q);
}
}
@@ -174,10 +179,10 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
// So, we negate using a separate FNEG instruction instead of using AArch64's nmadd/msub.
case 28: // fmsub: "D = A*C - B" vs "Vd = (-Va) + Vn*Vm"
case 30: // fnmsub: "D = -(A*C - B)" vs "Vd = -((-Va) + Vn*Vm)"
if (inaccurate_fma)
if (nonfused)
{
m_float_emit.FMUL(inaccurate_fma_reg, VA, rounded_c_reg);
m_float_emit.FSUB(result_reg, inaccurate_fma_reg, VB);
m_float_emit.FMUL(nonfused_reg, VA, rounded_c_reg);
m_float_emit.FSUB(result_reg, nonfused_reg, VB);
}
else
{
@@ -186,10 +191,10 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
break;
case 29: // fmadd: "D = A*C + B" vs "Vd = Va + Vn*Vm"
case 31: // fnmadd: "D = -(A*C + B)" vs "Vd = -(Va + Vn*Vm)"
if (inaccurate_fma)
if (nonfused)
{
m_float_emit.FMUL(inaccurate_fma_reg, VA, rounded_c_reg);
m_float_emit.FADD(result_reg, inaccurate_fma_reg, VB);
m_float_emit.FMUL(nonfused_reg, VA, rounded_c_reg);
m_float_emit.FADD(result_reg, nonfused_reg, VB);
}
else
{
@@ -269,7 +274,7 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
m_float_emit.FSUB(ARM64Reg::D2, result_reg, ARM64Reg::D1);
// da := a - a'
if (inaccurate_fma)
if (nonfused)
{
m_float_emit.FMUL(EncodeRegToDouble(V0Q), VA, rounded_c_reg);
m_float_emit.FSUB(ARM64Reg::D1, EncodeRegToDouble(V0Q), ARM64Reg::D1);

View File

@@ -94,19 +94,23 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
const bool negate_result = (op5 & ~0x1) == 30;
const bool negate_b = op5 == 28 || op5 == 30;
const bool inaccurate_fma = fma && !Config::Get(Config::SESSION_USE_FMA);
const bool nonfused_requested = fma && !Config::Get(Config::SESSION_USE_FMA);
const bool error_free_transformation_requested = fma && m_accurate_fmadds;
const bool round_c = use_c && !js.op->fprIsSingle[c];
const auto inputs_are_singles_func = [&] {
return fpr.IsSingle(a) && (!use_b || fpr.IsSingle(b)) && (!use_c || fpr.IsSingle(c));
};
const bool single = inputs_are_singles_func() && !inaccurate_fma;
const bool single =
inputs_are_singles_func() && (error_free_transformation_requested || !nonfused_requested);
const RegType type = single ? RegType::Single : RegType::Register;
const u8 size = single ? 32 : 64;
const auto reg_encoder = single ? EncodeRegToDouble : EncodeRegToQuad;
const bool error_free_transformation = fma && !single && m_accurate_fmadds;
const bool nonfused = nonfused_requested && !single;
const bool error_free_transformation = error_free_transformation_requested && !single;
if (error_free_transformation)
{
gpr.Lock(ARM64Reg::W0, ARM64Reg::W30);
@@ -139,36 +143,36 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
}
ARM64Reg result_reg = VD;
ARM64Reg inaccurate_fma_reg = VD;
ARM64Reg nonfused_reg = VD;
if (error_free_transformation)
{
result_reg = reg_encoder(ARM64Reg::Q0);
inaccurate_fma_reg = reg_encoder(ARM64Reg::Q0);
nonfused_reg = reg_encoder(ARM64Reg::Q0);
}
else
{
const bool need_accurate_fma_reg =
fma && !inaccurate_fma && (negate_b || VD != VB) && (VD == VA || VD == rounded_c_reg);
const bool need_fused_fma_reg =
fma && !nonfused && (negate_b || VD != VB) && (VD == VA || VD == rounded_c_reg);
const bool preserve_d =
m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC));
if (need_accurate_fma_reg || preserve_d)
if (need_fused_fma_reg || preserve_d)
{
if (V0Q == ARM64Reg::INVALID_REG)
V0Q = fpr.GetScopedReg();
result_reg = reg_encoder(V0Q);
inaccurate_fma_reg = reg_encoder(V0Q);
nonfused_reg = reg_encoder(V0Q);
if (need_accurate_fma_reg && round_c)
if (need_fused_fma_reg && round_c)
{
V1Q = fpr.GetScopedReg();
rounded_c_reg = reg_encoder(V1Q);
}
}
else if (fma && inaccurate_fma && VD == VB)
else if (fma && nonfused && VD == VB)
{
if (V0Q == ARM64Reg::INVALID_REG)
V0Q = fpr.GetScopedReg();
inaccurate_fma_reg = reg_encoder(V0Q);
nonfused_reg = reg_encoder(V0Q);
}
}
@@ -206,10 +210,10 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
m_float_emit.FMUL(size, result_reg, VA, rounded_c_reg, 1);
break;
case 14: // ps_madds0: d = a * c.ps0 + b
if (inaccurate_fma)
if (nonfused)
{
m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg, 0);
m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB);
m_float_emit.FMUL(size, nonfused_reg, VA, rounded_c_reg, 0);
m_float_emit.FADD(size, result_reg, nonfused_reg, VB);
}
else
{
@@ -219,10 +223,10 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
}
break;
case 15: // ps_madds1: d = a * c.ps1 + b
if (inaccurate_fma)
if (nonfused)
{
m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg, 1);
m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB);
m_float_emit.FMUL(size, nonfused_reg, VA, rounded_c_reg, 1);
m_float_emit.FADD(size, result_reg, nonfused_reg, VB);
}
else
{
@@ -245,10 +249,10 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
break;
case 28: // ps_msub: d = a * c - b
case 30: // ps_nmsub: d = -(a * c - b)
if (inaccurate_fma)
if (nonfused)
{
m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg);
m_float_emit.FSUB(size, result_reg, inaccurate_fma_reg, VB);
m_float_emit.FMUL(size, nonfused_reg, VA, rounded_c_reg);
m_float_emit.FSUB(size, result_reg, nonfused_reg, VB);
}
else
{
@@ -263,10 +267,10 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
break;
case 29: // ps_madd: d = a * c + b
case 31: // ps_nmadd: d = -(a * c + b)
if (inaccurate_fma)
if (nonfused)
{
m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg);
m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB);
m_float_emit.FMUL(size, nonfused_reg, VA, rounded_c_reg);
m_float_emit.FADD(size, result_reg, nonfused_reg, VB);
}
else
{
@@ -307,7 +311,7 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
// da := a - a'
// (Transformed into da := a + -a')
if (inaccurate_fma)
if (nonfused)
{
switch (op5)
{