Make LIR_ov work for LIR_mul on ARM. (bug 521161, r=gal)

Jacob Bramley 2009-11-02 09:35:01 +00:00
parent ad303a4bd9
commit 146c5ecf6b
3 changed files with 96 additions and 19 deletions


@@ -8093,13 +8093,11 @@ TraceRecorder::alu(LOpcode v, jsdouble v0, jsdouble v1, LIns* s0, LIns* s1)
       case LIR_fsub:
         r = v0 - v1;
         break;
-#if !defined NANOJIT_ARM
       case LIR_fmul:
         r = v0 * v1;
         if (r == 0.0)
             goto out;
         break;
-#endif
 #if defined NANOJIT_IA32 || defined NANOJIT_X64
       case LIR_fdiv:
         if (v1 == 0)


@@ -1857,6 +1857,14 @@ Assembler::asm_branch(bool branchOnFalse, LInsp cond, NIns* targ)
     // Detect whether or not this is a floating-point comparison.
     bool fp_cond;
 
+    // Because MUL can't set the V flag, we use SMULL and CMP to set the Z flag
+    // to detect overflow on multiply. Thus, if cond points to a LIR_ov which
+    // in turn points to a LIR_mul, we must be conditional on !Z, not V.
+    if ((condop == LIR_ov) && (cond->oprnd1()->isop(LIR_mul))) {
+        condop = LIR_eq;
+        branchOnFalse = !branchOnFalse;
+    }
+
     // Select the appropriate ARM condition code to match the LIR instruction.
     switch (condop)
     {
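The !Z trick works because SMULL produces the full 64-bit product: a signed 32x32 multiply overflowed exactly when the product's high word is not the sign-extension of its low word, which is what CMP ip, rr, ASR #31 tests. A minimal C++ sketch of the predicate the emitted pair leaves in the Z flag (illustrative only, not nanojit code; assumes the usual arithmetic right shift on signed values):

    #include <cstdint>

    // Z is set exactly when the signed 32x32 multiply did NOT overflow.
    bool smull_cmp_sets_z(int32_t a, int32_t b)
    {
        int64_t p  = (int64_t)a * (int64_t)b;
        int32_t lo = (int32_t)p;          // SMULL's low word  (rr)
        int32_t hi = (int32_t)(p >> 32);  // SMULL's high word (ip)
        // ASR #31 smears bit 31 of lo across the whole word, so equality
        // holds only when hi is pure sign-extension of lo, i.e. the
        // product fits in 32 signed bits.
        return hi == (lo >> 31);
    }

Hence LIR_ov over a LIR_mul becomes a test for Z clear, and asm_branch rewrites the condition to LIR_eq with branchOnFalse inverted.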
@@ -1991,10 +1999,11 @@ void
 Assembler::asm_cond(LInsp ins)
 {
     Register r = prepResultReg(ins, AllowableFlagRegs);
-    switch(ins->opcode())
+    LOpcode op = ins->opcode();
+    switch(op)
     {
     case LIR_eq: SETEQ(r); break;
-    case LIR_ov: SETVS(r); break;
     case LIR_lt: SETLT(r); break;
     case LIR_le: SETLE(r); break;
     case LIR_gt: SETGT(r); break;
@@ -2003,6 +2012,17 @@ Assembler::asm_cond(LInsp ins)
     case LIR_ule: SETLS(r); break;
     case LIR_ugt: SETHI(r); break;
     case LIR_uge: SETHS(r); break;
+    case LIR_ov:
+        // Because MUL can't set the V flag, we use SMULL and CMP to set
+        // the Z flag to detect overflow on multiply. Thus, if ins points
+        // to a LIR_ov which in turn points to a LIR_mul, we must be
+        // conditional on !Z, not V.
+        if (!ins->oprnd1()->isop(LIR_mul)) {
+            SETVS(r);
+        } else {
+            SETNE(r);
+        }
+        break;
     default: NanoAssert(0); break;
     }
     asm_cmp(ins);
@@ -2106,12 +2126,19 @@ Assembler::asm_arith(LInsp ins)
             //
             // We try to use rb as the first operand by default because it is
             // common for (rr == ra) and is thus likely to be the most
-            // efficient case; if ra is no longer used after this LIR
-            // instruction, it is re-used for the result register (rr).
+            // efficient method.
             if ((ARM_ARCH > 5) || (rr != rb)) {
                 // Newer cores place no restrictions on the registers used in a
                 // MUL instruction (compared to other arithmetic instructions).
-                MUL(rr, rb, ra);
+                // IP is used to temporarily store the high word of the result from
+                // SMULL, so we make use of this to perform an overflow check, as
+                // ARM's MUL instruction can't set the overflow flag by itself.
+                // We can check for overflow using the following:
+                //   SMULL  rr, ip, ra, rb
+                //   CMP    ip, rr, ASR #31
+                // An explanation can be found in bug 521161. This sets Z if we did
+                // _not_ overflow, and clears it if we did.
+                ALUr_shi(AL, cmp, 1, IP, IP, rr, ASR_imm, 31);
+                SMULL(rr, IP, rb, ra);
             } else {
                 // ARM_ARCH is ARMv5 (or below) and rr == rb, so we must
                 // find a different way to encode the instruction.
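Note the call order here: the ARM back end emits instructions backwards (each macro writes through *(--_nIns), as the SMULL definition below shows), so although ALUr_shi appears first in the source, the generated code executes SMULL first and the CMP second. A quick sanity check of the Z-based test on concrete values (a sketch under the same assumptions as above, not nanojit code):

    #include <cassert>
    #include <cstdint>

    // Z set <=> no overflow: high word equals sign-extension of low word.
    static bool z_after_smull_cmp(int32_t a, int32_t b)
    {
        int64_t p = (int64_t)a * (int64_t)b;
        return (int32_t)(p >> 32) == ((int32_t)p >> 31);
    }

    int main()
    {
        assert( z_after_smull_cmp(46340, 46340));     // 2147395600 fits
        assert(!z_after_smull_cmp(46341, 46341));     // 2147488281 overflows
        assert( z_after_smull_cmp(-46341, 46340));    // negative products fit too
        assert(!z_after_smull_cmp(0x10000, 0x10000)); // 2^32 overflows
        return 0;
    }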
@@ -2120,19 +2147,40 @@ Assembler::asm_arith(LInsp ins)
                 if (rr != ra) {
                     // We know that rr == rb, so this will be something like
                     // rX = rY * rX.
-                    MUL(rr, ra, rb);
+                    // Other than swapping ra and rb, this works in the same
+                    // way as the ARMv6+ case, above.
+                    ALUr_shi(AL, cmp, 1, IP, IP, rr, ASR_imm, 31);
+                    SMULL(rr, IP, ra, rb);
                 } else {
-                    // We're trying to do rX = rX * rX, so we must use a
-                    // temporary register to achieve this correctly on ARMv5.
+                    // We're trying to do rX = rX * rX, but we also need to
+                    // check for overflow so we would need two extra registers
+                    // on ARMv5 and below. We achieve this by observing the
+                    // following:
+                    //  - abs(rX)*abs(rX) = rX*rX, so we force the input to be
+                    //    positive to simplify the detection logic.
+                    //  - Any argument greater than 0xffff will _always_
+                    //    overflow, and we can easily check that the top 16
+                    //    bits are zero.
+                    //  - Any argument lower than (or equal to) 0xffff that
+                    //    also overflows is guaranteed to set output bit 31.
+                    //
+                    // Thus, we know we have _not_ overflowed if:
+                    //   abs(rX)&0xffff0000 == 0  AND  result[31] == 0
+                    //
+                    // The following instruction sequence will be emitted:
+                    //   MOVS   IP, rX          // Put abs(rX) into IP.
+                    //   RSBMI  IP, IP, #0      // ...
+                    //   MUL    rX, IP, IP      // Do the actual multiplication.
+                    //   MOVS   IP, IP, LSR #16 // Check that abs(arg) <= 0xffff
+                    //   CMPEQ  IP, rX, ASR #31 // Check that result[31] == 0
+
                     // The register allocator will never allocate IP so it will
                     // be safe to use here.
                     NanoAssert(ra != IP);
+                    NanoAssert(rr != IP);
+
                     // In this case, rr == ra == rb.
-                    MUL(rr, IP, rb);
-                    MOV(IP, ra);
+                    ALUr_shi(AL, cmp, 1, IP, rr, rr, ASR_imm, 31);
+                    ALUr_shi(AL, mov, 1, IP, IP, IP, LSR_imm, 16);
+                    MUL(rr, IP, IP);
+                    ALUi(MI, rsb, 0, IP, IP, 0);
+                    ALUr(AL, mov, 1, IP, ra, ra);
                 }
             }
             break;
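The three observations above are easy to validate. A C++ sketch of the logic this five-instruction sequence implements, with the MUL modelled as the hardware does it, modulo 2^32 (illustrative names, not nanojit code):

    #include <cassert>
    #include <cstdint>

    // Z ends up set exactly when rX = rX * rX did NOT overflow.
    static bool square_leaves_z_set(int32_t x)
    {
        uint32_t ip = (x < 0) ? 0u - (uint32_t)x : (uint32_t)x; // MOVS / RSBMI
        uint32_t rr = ip * ip;                                  // MUL, mod 2^32
        bool arg_small   = (ip >> 16) == 0; // MOVS IP, IP, LSR #16 sets Z
        bool bit31_clear = (rr >> 31) == 0; // CMPEQ IP, rX, ASR #31 keeps Z
        return arg_small && bit31_clear;
    }

    int main()
    {
        assert( square_leaves_z_set(46340));   // 46340^2 = 2147395600 fits
        assert(!square_leaves_z_set(46341));   // 46341^2 sets bit 31
        assert(!square_leaves_z_set(0x10000)); // abs > 0xffff always overflows
        assert( square_leaves_z_set(-3));      // sign is stripped first
        return 0;
    }

Note that 0xffff * 0xffff = 0xfffe0001 has bit 31 set, so the boundary case is still caught by the second test.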
@@ -2229,7 +2277,6 @@ Assembler::asm_cmov(LInsp ins)
     switch (condval->opcode()) {
         // note that these are all opposites...
         case LIR_eq: MOVNE(rr, iffalsereg); break;
-        case LIR_ov: MOVVC(rr, iffalsereg); break;
         case LIR_lt: MOVGE(rr, iffalsereg); break;
         case LIR_le: MOVGT(rr, iffalsereg); break;
         case LIR_gt: MOVLE(rr, iffalsereg); break;
@@ -2238,6 +2285,17 @@ Assembler::asm_cmov(LInsp ins)
         case LIR_ule: MOVHI(rr, iffalsereg); break;
         case LIR_ugt: MOVLS(rr, iffalsereg); break;
         case LIR_uge: MOVLO(rr, iffalsereg); break;
+        case LIR_ov:
+            // Because MUL can't set the V flag, we use SMULL and CMP to set
+            // the Z flag to detect overflow on multiply. Thus, if ins points
+            // to a LIR_ov which in turn points to a LIR_mul, we must be
+            // conditional on !Z, not V.
+            if (!condval->oprnd1()->isop(LIR_mul)) {
+                MOVVC(rr, iffalsereg);
+            } else {
+                MOVEQ(rr, iffalsereg);
+            }
+            break;
         default: debug_only( NanoAssert(0) ); break;
     }
     /*const Register iftruereg =*/ findSpecificRegFor(iftrue, rr);
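There is a double negation to keep straight here: asm_cmov writes iffalsereg on the opposite of the condition, and for a LIR_ov over a LIR_mul the condition "overflowed" is Z clear, so its opposite is Z set, hence MOVEQ. A small truth-table sketch (illustrative only, not nanojit code):

    #include <cstdio>

    int main()
    {
        // After SMULL/CMP: Z set <=> no overflow <=> LIR_ov is false.
        for (int z = 0; z <= 1; ++z) {
            bool overflowed  = (z == 0);
            bool moveq_fires = (z == 1); // MOVEQ executes only when Z is set
            printf("Z=%d  LIR_ov=%d  ->  result = %s\n",
                   z, (int)overflowed,
                   moveq_fires ? "iffalse (MOVEQ fired)" : "iftrue");
        }
        return 0;
    }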


@@ -477,6 +477,26 @@ enum {
 // Other operations.
 // --------
 
+// [_d_hi,_d] = _l * _r
+#define SMULL_dont_check_op1(_d, _d_hi, _l, _r) do {                        \
+        underrunProtect(4);                                                 \
+        NanoAssert((ARM_ARCH >= 6) || ((_d) != (_l)));                      \
+        NanoAssert(IsGpReg(_d) && IsGpReg(_d_hi) && IsGpReg(_l) && IsGpReg(_r)); \
+        NanoAssert(((_d) != PC) && ((_d_hi) != PC) && ((_l) != PC) && ((_r) != PC)); \
+        *(--_nIns) = (NIns)( COND_AL | 0xc00090 | (_d_hi)<<16 | (_d)<<12 | (_r)<<8 | (_l) ); \
+        asm_output("smull %s, %s, %s, %s", gpn(_d), gpn(_d_hi), gpn(_l), gpn(_r)); \
+    } while(0)
+
+#if NJ_ARM_ARCH >= NJ_ARM_V6
+#define SMULL(_d, _d_hi, _l, _r) SMULL_dont_check_op1(_d, _d_hi, _l, _r)
+#else
+#define SMULL(_d, _d_hi, _l, _r) do {                                       \
+        NanoAssert(   (_d) != (_l));                                        \
+        NanoAssert((_d_hi) != (_l));                                        \
+        SMULL_dont_check_op1(_d, _d_hi, _l, _r);                            \
+    } while(0)
+#endif
+
 // _d = _l * _r
 #define MUL_dont_check_op1(_d, _l, _r) do {                                 \
         underrunProtect(4);                                                 \
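In the SMULL encoding, 0xc00090 supplies the fixed opcode bits: 110 in bits 23:21 selects SMULL among the long multiplies, and 1001 in bits 7:4 is the multiply signature. A hand-assembly sketch to eyeball the field layout (register numbers are arbitrary examples, not taken from nanojit):

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        // cond | 0000110S | RdHi<<16 | RdLo<<12 | Rs<<8 | 1001 | Rm
        const uint32_t COND_AL = 0xE0000000; // the "always" condition
        uint32_t d = 0, d_hi = 12, l = 1, r = 2; // smull r0, ip, r1, r2
        uint32_t insn = COND_AL | 0xc00090
                      | (d_hi << 16) | (d << 12) | (r << 8) | l;
        printf("0x%08X\n", insn); // prints 0xE0CC0291
        return 0;
    }

The extra asserts in the ARMv5 variant encode the architectural restriction that RdLo (and RdHi) must differ from Rm on pre-v6 cores; ARMv6 lifted it, which is why SMULL_dont_check_op1 is used directly there.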
@@ -727,6 +747,7 @@ enum {
     } while (0)
 
 #define SETEQ(r) SET(r,EQ)
+#define SETNE(r) SET(r,NE)
 #define SETLT(r) SET(r,LT)
 #define SETLE(r) SET(r,LE)
 #define SETGT(r) SET(r,GT)