target-arm: A64: Add [UF]RSQRTE (reciprocal root estimate)

This adds support for [UF]RSQRTE instructions. It utilises the existing NEON helpers with some changes. The changes include an explicit passing of fpstatus (so the correct one is used between arm32 and aarch64), denormilzation, more correct error handling and also proper scaling of the fraction going into the estimate. Signed-off-by: Alex Bennée <alex.bennee@linaro.org> Reviewed-by: Richard Henderson <rth@twiddle.net> Message-id: 1394822294-14837-25-git-send-email-peter.maydell@linaro.org Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2024-11-27 21:40:49 +00:00 · 2014-03-17 16:31:53 +00:00 · 2014-03-17 16:31:53 +00:00 · c2fb418e35
commit c2fb418e35
parent 5553955eb6
4 changed files with 143 additions and 40 deletions
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@ -4721,12 +4721,12 @@ float64 HELPER(recpe_f64)(float64 input, void *fpstp)
 /* The algorithm that must be used to calculate the estimate
 * is specified by the ARM ARM.
 */
-static float64 recip_sqrt_estimate(float64 a, CPUARMState *env)
+static float64 recip_sqrt_estimate(float64 a, float_status *real_fp_status)
 {
    /* These calculations mustn't set any fp exception flags,
     * so we use a local copy of the fp_status.
     */
-    float_status dummy_status = env->vfp.standard_fp_status;
+    float_status dummy_status = *real_fp_status;
    float_status *s = &dummy_status;
    float64 q;
    int64_t q_int;
@ -4773,49 +4773,64 @@ static float64 recip_sqrt_estimate(float64 a, CPUARMState *env)
    return float64_div(int64_to_float64(q_int, s), float64_256, s);
 }

-float32 HELPER(rsqrte_f32)(float32 a, CPUARMState *env)
+float32 HELPER(rsqrte_f32)(float32 input, void *fpstp)
 {
-    float_status *s = &env->vfp.standard_fp_status;
+    float_status *s = fpstp;
+    float32 f32 = float32_squash_input_denormal(input, s);
+    uint32_t val = float32_val(f32);
+    uint32_t f32_sbit = 0x80000000 & val;
+    int32_t f32_exp = extract32(val, 23, 8);
+    uint32_t f32_frac = extract32(val, 0, 23);
+    uint64_t f64_frac;
+    uint64_t val64;
    int result_exp;
    float64 f64;
-    uint32_t val;
-    uint64_t val64;

-    val = float32_val(a);
-
-    if (float32_is_any_nan(a)) {
-        if (float32_is_signaling_nan(a)) {
+    if (float32_is_any_nan(f32)) {
+        float32 nan = f32;
+        if (float32_is_signaling_nan(f32)) {
            float_raise(float_flag_invalid, s);
+            nan = float32_maybe_silence_nan(f32);
        }
-        return float32_default_nan;
-    } else if (float32_is_zero_or_denormal(a)) {
-        if (!float32_is_zero(a)) {
-            float_raise(float_flag_input_denormal, s);
+        if (s->default_nan_mode) {
+            nan =  float32_default_nan;
        }
+        return nan;
+    } else if (float32_is_zero(f32)) {
        float_raise(float_flag_divbyzero, s);
-        return float32_set_sign(float32_infinity, float32_is_neg(a));
-    } else if (float32_is_neg(a)) {
+        return float32_set_sign(float32_infinity, float32_is_neg(f32));
+    } else if (float32_is_neg(f32)) {
        float_raise(float_flag_invalid, s);
        return float32_default_nan;
-    } else if (float32_is_infinity(a)) {
+    } else if (float32_is_infinity(f32)) {
        return float32_zero;
    }

-    /* Normalize to a double-precision value between 0.25 and 1.0,
+    /* Scale and normalize to a double-precision value between 0.25 and 1.0,
     * preserving the parity of the exponent.  */
-    if ((val & 0x800000) == 0) {
-        f64 = make_float64(((uint64_t)(val & 0x80000000) << 32)
-                           | (0x3feULL << 52)
-                           | ((uint64_t)(val & 0x7fffff) << 29));
-    } else {
-        f64 = make_float64(((uint64_t)(val & 0x80000000) << 32)
-                           | (0x3fdULL << 52)
-                           | ((uint64_t)(val & 0x7fffff) << 29));
+
+    f64_frac = ((uint64_t) f32_frac) << 29;
+    if (f32_exp == 0) {
+        while (extract64(f64_frac, 51, 1) == 0) {
+            f64_frac = f64_frac << 1;
+            f32_exp = f32_exp-1;
+        }
+        f64_frac = extract64(f64_frac, 0, 51) << 1;
    }

-    result_exp = (380 - ((val & 0x7f800000) >> 23)) / 2;
+    if (extract64(f32_exp, 0, 1) == 0) {
+        f64 = make_float64(((uint64_t) f32_sbit) << 32
+                           | (0x3feULL << 52)
+                           | f64_frac);
+    } else {
+        f64 = make_float64(((uint64_t) f32_sbit) << 32
+                           | (0x3fdULL << 52)
+                           | f64_frac);
+    }

-    f64 = recip_sqrt_estimate(f64, env);
+    result_exp = (380 - f32_exp) / 2;
+
+    f64 = recip_sqrt_estimate(f64, s);

    val64 = float64_val(f64);

@ -4824,6 +4839,69 @@ float32 HELPER(rsqrte_f32)(float32 a, CPUARMState *env)
    return make_float32(val);
 }

+float64 HELPER(rsqrte_f64)(float64 input, void *fpstp)
+{
+    float_status *s = fpstp;
+    float64 f64 = float64_squash_input_denormal(input, s);
+    uint64_t val = float64_val(f64);
+    uint64_t f64_sbit = 0x8000000000000000ULL & val;
+    int64_t f64_exp = extract64(val, 52, 11);
+    uint64_t f64_frac = extract64(val, 0, 52);
+    int64_t result_exp;
+    uint64_t result_frac;
+
+    if (float64_is_any_nan(f64)) {
+        float64 nan = f64;
+        if (float64_is_signaling_nan(f64)) {
+            float_raise(float_flag_invalid, s);
+            nan = float64_maybe_silence_nan(f64);
+        }
+        if (s->default_nan_mode) {
+            nan =  float64_default_nan;
+        }
+        return nan;
+    } else if (float64_is_zero(f64)) {
+        float_raise(float_flag_divbyzero, s);
+        return float64_set_sign(float64_infinity, float64_is_neg(f64));
+    } else if (float64_is_neg(f64)) {
+        float_raise(float_flag_invalid, s);
+        return float64_default_nan;
+    } else if (float64_is_infinity(f64)) {
+        return float64_zero;
+    }
+
+    /* Scale and normalize to a double-precision value between 0.25 and 1.0,
+     * preserving the parity of the exponent.  */
+
+    if (f64_exp == 0) {
+        while (extract64(f64_frac, 51, 1) == 0) {
+            f64_frac = f64_frac << 1;
+            f64_exp = f64_exp - 1;
+        }
+        f64_frac = extract64(f64_frac, 0, 51) << 1;
+    }
+
+    if (extract64(f64_exp, 0, 1) == 0) {
+        f64 = make_float64(f64_sbit
+                           | (0x3feULL << 52)
+                           | f64_frac);
+    } else {
+        f64 = make_float64(f64_sbit
+                           | (0x3fdULL << 52)
+                           | f64_frac);
+    }
+
+    result_exp = (3068 - f64_exp) / 2;
+
+    f64 = recip_sqrt_estimate(f64, s);
+
+    result_frac = extract64(float64_val(f64), 0, 52);
+
+    return make_float64(f64_sbit |
+                        ((result_exp & 0x7ff) << 52) |
+                        result_frac);
+}
+
 uint32_t HELPER(recpe_u32)(uint32_t a, void *fpstp)
 {
    float_status *s = fpstp;
@ -4841,8 +4919,9 @@ uint32_t HELPER(recpe_u32)(uint32_t a, void *fpstp)
    return 0x80000000 | ((float64_val(f64) >> 21) & 0x7fffffff);
 }

-uint32_t HELPER(rsqrte_u32)(uint32_t a, CPUARMState *env)
+uint32_t HELPER(rsqrte_u32)(uint32_t a, void *fpstp)
 {
+    float_status *fpst = fpstp;
    float64 f64;

    if ((a & 0xc0000000) == 0) {
@ -4857,7 +4936,7 @@ uint32_t HELPER(rsqrte_u32)(uint32_t a, CPUARMState *env)
                           | ((uint64_t)(a & 0x3fffffff) << 22));
    }

-    f64 = recip_sqrt_estimate(f64, env);
+    f64 = recip_sqrt_estimate(f64, fpst);

    return 0x80000000 | ((float64_val(f64) >> 21) & 0x7fffffff);
 }
--- a/target-arm/helper.h
+++ b/target-arm/helper.h
@ -169,9 +169,10 @@ DEF_HELPER_3(recps_f32, f32, f32, f32, env)
 DEF_HELPER_3(rsqrts_f32, f32, f32, f32, env)
 DEF_HELPER_FLAGS_2(recpe_f32, TCG_CALL_NO_RWG, f32, f32, ptr)
 DEF_HELPER_FLAGS_2(recpe_f64, TCG_CALL_NO_RWG, f64, f64, ptr)
-DEF_HELPER_2(rsqrte_f32, f32, f32, env)
+DEF_HELPER_FLAGS_2(rsqrte_f32, TCG_CALL_NO_RWG, f32, f32, ptr)
+DEF_HELPER_FLAGS_2(rsqrte_f64, TCG_CALL_NO_RWG, f64, f64, ptr)
 DEF_HELPER_2(recpe_u32, i32, i32, ptr)
-DEF_HELPER_2(rsqrte_u32, i32, i32, env)
+DEF_HELPER_FLAGS_2(rsqrte_u32, TCG_CALL_NO_RWG, i32, i32, ptr)
 DEF_HELPER_5(neon_tbl, i32, env, i32, i32, i32, i32)

 DEF_HELPER_3(shl_cc, i32, env, i32, i32)
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@ -7146,6 +7146,9 @@ static void handle_2misc_reciprocal(DisasContext *s, int opcode,
            case 0x3f: /* FRECPX */
                gen_helper_frecpx_f64(tcg_res, tcg_op, fpst);
                break;
+            case 0x7d: /* FRSQRTE */
+                gen_helper_rsqrte_f64(tcg_res, tcg_op, fpst);
+                break;
            default:
                g_assert_not_reached();
            }
@ -7181,6 +7184,9 @@ static void handle_2misc_reciprocal(DisasContext *s, int opcode,
            case 0x3f: /* FRECPX */
                gen_helper_frecpx_f32(tcg_res, tcg_op, fpst);
                break;
+            case 0x7d: /* FRSQRTE */
+                gen_helper_rsqrte_f32(tcg_res, tcg_op, fpst);
+                break;
            default:
                g_assert_not_reached();
            }
@ -7378,6 +7384,7 @@ static void disas_simd_scalar_two_reg_misc(DisasContext *s, uint32_t insn)
        }
        case 0x3d: /* FRECPE */
        case 0x3f: /* FRECPX */
+        case 0x7d: /* FRSQRTE */
            handle_2misc_reciprocal(s, opcode, true, u, true, size, rn, rd);
            return;
        case 0x1a: /* FCVTNS */
@ -7404,9 +7411,6 @@ static void disas_simd_scalar_two_reg_misc(DisasContext *s, uint32_t insn)
            }
            handle_2misc_narrow(s, true, opcode, u, false, size - 1, rn, rd);
            return;
-        case 0x7d: /* FRSQRTE */
-            unsupported_encoding(s, insn);
-            return;
        default:
            unallocated_encoding(s);
            return;
@ -9255,6 +9259,11 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
            }
            /* fall through */
        case 0x3d: /* FRECPE */
+        case 0x7d: /* FRSQRTE */
+            if (size == 3 && !is_q) {
+                unallocated_encoding(s);
+                return;
+            }
            handle_2misc_reciprocal(s, opcode, false, u, is_q, size, rn, rd);
            return;
        case 0x56: /* FCVTXN, FCVTXN2 */
@ -9297,9 +9306,12 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
            }
            break;
        case 0x7c: /* URSQRTE */
-        case 0x7d: /* FRSQRTE */
-            unsupported_encoding(s, insn);
-            return;
+            if (size == 3) {
+                unallocated_encoding(s);
+                return;
+            }
+            need_fpstatus = true;
+            break;
        default:
            unallocated_encoding(s);
            return;
@ -9432,6 +9444,9 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
                case 0x59: /* FRINTX */
                    gen_helper_rints_exact(tcg_res, tcg_op, tcg_fpstatus);
                    break;
+                case 0x7c: /* URSQRTE */
+                    gen_helper_rsqrte_u32(tcg_res, tcg_op, tcg_fpstatus);
+                    break;
                default:
                    g_assert_not_reached();
                }
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@ -6689,8 +6689,12 @@ static int disas_neon_data_insn(CPUARMState * env, DisasContext *s, uint32_t ins
                            break;
                        }
                        case NEON_2RM_VRSQRTE:
-                            gen_helper_rsqrte_u32(tmp, tmp, cpu_env);
+                        {
+                            TCGv_ptr fpstatus = get_fpstatus_ptr(1);
+                            gen_helper_rsqrte_u32(tmp, tmp, fpstatus);
+                            tcg_temp_free_ptr(fpstatus);
                            break;
+                        }
                        case NEON_2RM_VRECPE_F:
                        {
                            TCGv_ptr fpstatus = get_fpstatus_ptr(1);
@ -6699,8 +6703,12 @@ static int disas_neon_data_insn(CPUARMState * env, DisasContext *s, uint32_t ins
                            break;
                        }
                        case NEON_2RM_VRSQRTE_F:
-                            gen_helper_rsqrte_f32(cpu_F0s, cpu_F0s, cpu_env);
+                        {
+                            TCGv_ptr fpstatus = get_fpstatus_ptr(1);
+                            gen_helper_rsqrte_f32(cpu_F0s, cpu_F0s, fpstatus);
+                            tcg_temp_free_ptr(fpstatus);
                            break;
+                        }
                        case NEON_2RM_VCVT_FS: /* VCVT.F32.S32 */
                            gen_vfp_sito(0, 1);
                            break;