mirror of
https://github.com/ptitSeb/box64.git
synced 2024-11-23 06:30:22 +00:00
[LA64_DYNAREC] Added more opcodes (#1416)
* [LA64_DYNAREC] Added 0F AF IMUL opcode * Update clang-format rules * Added 08 OR opcode * Added F7 /3 NEG opcode and fixed some potential bugs
This commit is contained in:
parent
2e9b8eff59
commit
4d26021705
@ -12,6 +12,6 @@ MaxEmptyLinesToKeep: 2
|
||||
IndentCaseLabels: true
|
||||
AlignConsecutiveMacros: true
|
||||
WhitespaceSensitiveMacros: ['QUOTE']
|
||||
IfMacros: ['IFX', 'IFX2', 'IF_PEND0R0', 'IFXX', 'IFX2X', 'IFXN', 'UFLAG_IF', 'PASS2IF']
|
||||
IfMacros: ['IFX', 'IFX2', 'IFXA', 'IF_PEND0R0', 'IFXX', 'IFX2X', 'IFXN', 'UFLAG_IF', 'PASS2IF']
|
||||
UseTab: Never
|
||||
---
|
||||
|
@ -102,6 +102,15 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
|
||||
i64 = F32S;
|
||||
emit_add32c(dyn, ninst, rex, xRAX, i64, x3, x4, x5, x6);
|
||||
break;
|
||||
case 0x08:
|
||||
INST_NAME("OR Eb, Gb");
|
||||
SETFLAGS(X_ALL, SF_SET_PENDING);
|
||||
nextop = F8;
|
||||
GETEB(x1, 0);
|
||||
GETGB(x2);
|
||||
emit_or8(dyn, ninst, x1, x2, x4, x5);
|
||||
EBBACK();
|
||||
break;
|
||||
case 0x09:
|
||||
INST_NAME("OR Ed, Gd");
|
||||
SETFLAGS(X_ALL, SF_SET_PENDING);
|
||||
@ -1128,6 +1137,13 @@ uintptr_t dynarec64_00(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
|
||||
ZEROUP(ed);
|
||||
WBACK;
|
||||
break;
|
||||
case 3:
|
||||
INST_NAME("NEG Ed");
|
||||
SETFLAGS(X_ALL, SF_SET_PENDING);
|
||||
GETED(0);
|
||||
emit_neg32(dyn, ninst, rex, ed, x3, x4);
|
||||
WBACK;
|
||||
break;
|
||||
default:
|
||||
DEFAULT;
|
||||
}
|
||||
|
@ -261,6 +261,73 @@ uintptr_t dynarec64_0F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
|
||||
LD_D(xRDX, xEmu, offsetof(x64emu_t, regs[_DX]));
|
||||
LD_D(xRBX, xEmu, offsetof(x64emu_t, regs[_BX]));
|
||||
break;
|
||||
case 0xAF:
|
||||
INST_NAME("IMUL Gd, Ed");
|
||||
SETFLAGS(X_ALL, SF_PENDING);
|
||||
nextop = F8;
|
||||
GETGD;
|
||||
GETED(0);
|
||||
if (box64_dynarec_test) {
|
||||
// avoid noise during test
|
||||
CLEAR_FLAGS(x3);
|
||||
}
|
||||
if (rex.w) {
|
||||
// 64bits imul
|
||||
UFLAG_IF {
|
||||
MULH_D(x3, gd, ed);
|
||||
MUL_D(gd, gd, ed);
|
||||
IFX (X_PEND) {
|
||||
UFLAG_OP1(x3);
|
||||
UFLAG_RES(gd);
|
||||
UFLAG_DF(x3, d_imul64);
|
||||
} else {
|
||||
SET_DFNONE();
|
||||
}
|
||||
IFX (X_CF | X_OF) {
|
||||
SRAI_D(x4, gd, 63);
|
||||
XOR(x3, x3, x4);
|
||||
SNEZ(x3, x3);
|
||||
IFX (X_CF) {
|
||||
BSTRINS_D(xFlags, x3, F_CF, F_CF);
|
||||
}
|
||||
IFX (X_OF) {
|
||||
BSTRINS_D(xFlags, x3, F_OF, F_OF);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
MULxw(gd, gd, ed);
|
||||
}
|
||||
} else {
|
||||
// 32bits imul
|
||||
UFLAG_IF {
|
||||
MUL_D(gd, gd, ed);
|
||||
SRLI_D(x3, gd, 32);
|
||||
SLLI_W(gd, gd, 0);
|
||||
IFX (X_PEND) {
|
||||
UFLAG_RES(gd);
|
||||
UFLAG_OP1(x3);
|
||||
UFLAG_DF(x4, d_imul32);
|
||||
} else IFX (X_CF | X_OF) {
|
||||
SET_DFNONE();
|
||||
}
|
||||
IFX (X_CF | X_OF) {
|
||||
SRAI_W(x4, gd, 31);
|
||||
SUB_D(x3, x3, x4);
|
||||
SNEZ(x3, x3);
|
||||
IFX (X_CF) {
|
||||
BSTRINS_D(xFlags, x3, F_CF, F_CF);
|
||||
}
|
||||
IFX (X_OF) {
|
||||
BSTRINS_D(xFlags, x3, F_OF, F_OF);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
MULxw(gd, gd, ed);
|
||||
}
|
||||
SLLI_D(gd, gd, 32);
|
||||
SRLI_D(gd, gd, 32);
|
||||
}
|
||||
break;
|
||||
case 0xB6:
|
||||
INST_NAME("MOVZX Gd, Eb");
|
||||
nextop = F8;
|
||||
|
@ -245,7 +245,7 @@ void emit_or32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3
|
||||
SET_DFNONE();
|
||||
}
|
||||
|
||||
IFXA(X_ALL, la64_lbt) {
|
||||
IFXA (X_ALL, la64_lbt) {
|
||||
if (rex.w)
|
||||
X64_OR_D(s1, s2);
|
||||
else
|
||||
@ -326,3 +326,40 @@ void emit_or32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, in
|
||||
emit_pf(dyn, ninst, s1, s3, s4);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// emit OR8 instruction, from s1, s2, store result in s1 using s3 and s4 as scratch, s4 can be same as s2 (and so s2 destroyed)
void emit_or8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4)
{
    // Deferred-flags bookkeeping: if flags are pending, record d_or8 as the
    // pending operation; otherwise, if any flag is needed, mark "no deferred op".
    IFX (X_PEND) {
        SET_DF(s3, d_or8);
    } else IFX (X_ALL) {
        SET_DFNONE();
    }

    // With the LBT extension, let the hardware track the x86 flags for this OR.
    IFXA (X_ALL, la64_lbt) {
        X64_OR_B(s1, s2);
    }

    // The actual OR (byte-width semantics; assumes upper bits of s1/s2 are
    // already clean per the GETEB/GETGB contract — TODO confirm).
    OR(s1, s1, s2);

    IFX (X_PEND) {
        // Save the result so deferred flag evaluation can use it later.
        ST_B(s1, xEmu, offsetof(x64emu_t, res));
    }

    // LBT already produced the flags above; no software emulation needed.
    if (la64_lbt) return;

    // Software flag computation. OR clears CF/OF/AF; only SF/ZF/PF depend on
    // the result, so CLEAR_FLAGS then set the relevant bits conditionally.
    CLEAR_FLAGS(s3);
    IFX (X_SF) {
        SRLI_D(s3, s1, 7);              // isolate bit 7 = sign bit of 8-bit result
        BEQZ(s3, 8);                    // branch over the ORI when sign bit clear
        ORI(xFlags, xFlags, 1 << F_SF);
    }
    IFX (X_ZF) {
        BNEZ(s1, 8);                    // branch over the ORI when result non-zero
        ORI(xFlags, xFlags, 1 << F_ZF);
    }
    IFX (X_PF) {
        emit_pf(dyn, ninst, s1, s3, s4);
    }
}
|
@ -39,7 +39,10 @@ void emit_add32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
|
||||
|
||||
if (la64_lbt) {
|
||||
IFX(X_ALL) {
|
||||
X64_ADD_WU(s1, s2);
|
||||
if (rex.w)
|
||||
X64_ADD_DU(s1, s2);
|
||||
else
|
||||
X64_ADD_WU(s1, s2);
|
||||
}
|
||||
ADDxw(s1, s1, s2);
|
||||
if (!rex.w) ZEROUP(s1);
|
||||
@ -465,7 +468,10 @@ void emit_sub32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
|
||||
|
||||
if (la64_lbt) {
|
||||
IFX(X_ALL) {
|
||||
X64_SUB_WU(s1, s2);
|
||||
if (rex.w)
|
||||
X64_SUB_DU(s1, s2);
|
||||
else
|
||||
X64_SUB_WU(s1, s2);
|
||||
}
|
||||
SUBxw(s1, s1, s2);
|
||||
if (!rex.w) ZEROUP(s1);
|
||||
@ -577,3 +583,79 @@ void emit_sub32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, i
|
||||
emit_pf(dyn, ninst, s1, s3, s4);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// emit NEG32 instruction, from s1, store result in s1 using s2 and s3 as scratch
void emit_neg32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3)
{
    // Deferred-flags bookkeeping: save the operand and the pending-op code
    // (d_neg64/d_neg32 by width), or mark "no deferred op" if flags are live.
    IFX (X_PEND) {
        SDxw(s1, xEmu, offsetof(x64emu_t, op1));
        SET_DF(s3, rex.w ? d_neg64 : d_neg32);
    } else IFX (X_ALL) {
        SET_DFNONE();
    }

    // Software flag path needs the original operand later for AF/OF, so keep
    // a copy before s1 is overwritten by the negation.
    if (!la64_lbt) {
        IFX (X_AF | X_OF) {
            MV(s3, s1); // s3 = op1
        }
    }

    // With the LBT extension, compute the x86 flags of 0 - s1 in hardware.
    IFXA (X_ALL, la64_lbt) {
        if (rex.w)
            X64_SUB_DU(xZR, s1);
        else
            X64_SUB_WU(xZR, s1);
    }

    // The actual negation (width selected by rex.w inside NEGxw).
    NEGxw(s1, s1);
    IFX (X_PEND) {
        // Save the result for later deferred flag evaluation.
        SDxw(s1, xEmu, offsetof(x64emu_t, res));
    }

    // LBT already produced the flags; just zero the upper 32 bits for the
    // 32-bit form and we are done.
    if (la64_lbt) {
        if (!rex.w) {
            ZEROUP(s1);
        }
        return;
    }

    // Software flag computation.
    CLEAR_FLAGS(s3);
    IFX (X_CF) {
        // NEG sets CF iff the source was non-zero; the result is zero iff the
        // source was zero, so testing the result is equivalent here.
        BEQZ(s1, 8);                    // branch over the ORI when result == 0
        ORI(xFlags, xFlags, 1 << F_CF);
    }

    IFX (X_AF | X_OF) {
        // Borrow-chain style computation for 0 - op1: bc = res | op1.
        OR(s3, s1, s3); // s3 = res | op1
        IFX (X_AF) {
            /* af = bc & 0x8 */
            ANDI(s2, s3, 8);
            BEQZ(s2, 8);
            ORI(xFlags, xFlags, 1 << F_AF);
        }
        IFX (X_OF) {
            /* of = ((bc >> (width-2)) ^ (bc >> (width-1))) & 0x1; */
            SRLI_D(s2, s3, (rex.w ? 64 : 32) - 2);
            SRLI_D(s3, s2, 1);
            XOR(s2, s2, s3);
            ANDI(s2, s2, 1);
            BEQZ(s2, 8);
            ORI(xFlags, xFlags, 1 << F_OF);
        }
    }
    IFX (X_SF) {
        // Result still sign-extended at this point, so a signed compare with
        // zero gives the x86 sign flag for both widths.
        BGE(s1, xZR, 8);                // branch over the ORI when result >= 0
        ORI(xFlags, xFlags, 1 << F_SF);
    }
    // 32-bit form: clear the upper half of the destination register.
    if (!rex.w) {
        ZEROUP(s1);
    }
    IFX (X_PF) {
        emit_pf(dyn, ninst, s1, s3, s2);
    }
    IFX (X_ZF) {
        BNEZ(s1, 8);                    // branch over the ORI when result non-zero
        ORI(xFlags, xFlags, 1 << F_ZF);
    }
}
|
@ -441,6 +441,19 @@
|
||||
#ifndef SET_HASCALLRET
|
||||
#define SET_HASCALLRET()
|
||||
#endif
|
||||
// UFLAG_* helpers: store the operands / result / deferred-op code into the
// emu context for later flag evaluation, but only when pass analysis decided
// this instruction's flags may actually be consumed (x64.gen_flags);
// otherwise they emit nothing.

// Save first operand for deferred flags evaluation.
#define UFLAG_OP1(A) \
    if (dyn->insts[ninst].x64.gen_flags) { SDxw(A, xEmu, offsetof(x64emu_t, op1)); }
// Save second operand for deferred flags evaluation.
#define UFLAG_OP2(A) \
    if (dyn->insts[ninst].x64.gen_flags) { SDxw(A, xEmu, offsetof(x64emu_t, op2)); }
// Save both operands at once.
#define UFLAG_OP12(A1, A2) \
    if (dyn->insts[ninst].x64.gen_flags) { \
        SDxw(A1, xEmu, offsetof(x64emu_t, op1)); \
        SDxw(A2, xEmu, offsetof(x64emu_t, op2)); \
    }
// Save the result for deferred flags evaluation.
#define UFLAG_RES(A) \
    if (dyn->insts[ninst].x64.gen_flags) { SDxw(A, xEmu, offsetof(x64emu_t, res)); }
// Record which deferred operation (d_*) produced the pending flags.
#define UFLAG_DF(r, A) \
    if (dyn->insts[ninst].x64.gen_flags) { SET_DF(r, A) }
// Guard a statement so it only runs when flags may be consumed.
#define UFLAG_IF if (dyn->insts[ninst].x64.gen_flags)
|
||||
#ifndef DEFAULT
|
||||
#define DEFAULT \
|
||||
@ -542,8 +555,10 @@ void* la64_next(x64emu_t* emu, uintptr_t addr);
|
||||
#define emit_sub32c STEPNAME(emit_sub32c)
|
||||
#define emit_sub8 STEPNAME(emit_sub8)
|
||||
#define emit_sub8c STEPNAME(emit_sub8c)
|
||||
#define emit_neg32 STEPNAME(emit_neg32)
|
||||
#define emit_or32 STEPNAME(emit_or32)
|
||||
#define emit_or32c STEPNAME(emit_or32c)
|
||||
#define emit_or8 STEPNAME(emit_or8)
|
||||
#define emit_xor32 STEPNAME(emit_xor32)
|
||||
#define emit_and8 STEPNAME(emit_and8)
|
||||
#define emit_and8c STEPNAME(emit_and8c)
|
||||
@ -601,8 +616,10 @@ void emit_sub32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s
|
||||
void emit_sub32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, int s2, int s3, int s4, int s5);
|
||||
void emit_sub8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5);
|
||||
void emit_sub8c(dynarec_la64_t* dyn, int ninst, int s1, int32_t c, int s2, int s3, int s4, int s5);
|
||||
void emit_neg32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3);
|
||||
void emit_or32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4);
|
||||
void emit_or32c(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int64_t c, int s3, int s4);
|
||||
void emit_or8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4);
|
||||
void emit_xor32(dynarec_la64_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4);
|
||||
void emit_and8(dynarec_la64_t* dyn, int ninst, int s1, int s2, int s3, int s4);
|
||||
void emit_and8c(dynarec_la64_t* dyn, int ninst, int s1, int32_t c, int s3, int s4);
|
||||
|
@ -309,6 +309,76 @@ f24-f31 fs0-fs7 Static registers Callee
|
||||
|
||||
// Sign-extend the low 32 bits of rs1 into rd (SLLI.W writes a sign-extended
// 32-bit result, so a shift by 0 is a pure sign-extension).
#define SEXT_W(rd, rs1) SLLI_W(rd, rs1, 0)

// --- LoongArch64 multiply / divide / modulo, 3R-type encodings ---

// product = signed(GR[rj][31:0]) * signed(GR[rk][31:0])
// GR[rd] = SignExtend(product[31:0], GRLEN)
#define MUL_W(rd, rj, rk) EMIT(type_3R(0b00000000000111000, rk, rj, rd))

// product = signed(GR[rj][31:0]) * signed(GR[rk][31:0])
// GR[rd] = SignExtend(product[63:32], GRLEN)
#define MULH_W(rd, rj, rk) EMIT(type_3R(0b00000000000111001, rk, rj, rd))

// product = unsigned(GR[rj][31:0]) * unsigned(GR[rk][31:0])
// GR[rd] = SignExtend(product[63:32], GRLEN)
#define MULH_WU(rd, rj, rk) EMIT(type_3R(0b00000000000111010, rk, rj, rd))

// product = signed(GR[rj][63:0]) * signed(GR[rk][63:0])
// GR[rd] = product[63:0]
#define MUL_D(rd, rj, rk) EMIT(type_3R(0b00000000000111011, rk, rj, rd))

// product = signed(GR[rj][63:0]) * signed(GR[rk][63:0])
// GR[rd] = product[127:64]
#define MULH_D(rd, rj, rk) EMIT(type_3R(0b00000000000111100, rk, rj, rd))

// product = unsigned(GR[rj][63:0]) * unsigned(GR[rk][63:0])
// GR[rd] = product[127:64]
#define MULH_DU(rd, rj, rk) EMIT(type_3R(0b00000000000111101, rk, rj, rd))

// product = signed(GR[rj][31:0]) * signed(GR[rk][31:0])
// GR[rd] = product[63:0]
#define MULW_D_W(rd, rj, rk) EMIT(type_3R(0b00000000000111110, rk, rj, rd))

// product = unsigned(GR[rj][31:0]) * unsigned(GR[rk][31:0])
// GR[rd] = product[63:0]
#define MULW_D_WU(rd, rj, rk) EMIT(type_3R(0b00000000000111111, rk, rj, rd))

// quotient = signed(GR[rj][31:0]) / signed(GR[rk][31:0])
// GR[rd] = SignExtend(quotient[31:0], GRLEN)
#define DIV_W(rd, rj, rk) EMIT(type_3R(0b00000000001000000, rk, rj, rd))

// quotient = unsigned(GR[rj][31:0]) / unsigned(GR[rk][31:0])
// GR[rd] = SignExtend(quotient[31:0], GRLEN)
#define DIV_WU(rd, rj, rk) EMIT(type_3R(0b00000000001000010, rk, rj, rd))

// remainder = signed(GR[rj][31:0]) % signed(GR[rk][31:0])
// GR[rd] = SignExtend(remainder[31:0], GRLEN)
#define MOD_W(rd, rj, rk) EMIT(type_3R(0b00000000001000001, rk, rj, rd))

// remainder = unsigned(GR[rj][31:0]) % unsigned(GR[rk][31:0])
// GR[rd] = SignExtend(remainder[31:0], GRLEN)
#define MOD_WU(rd, rj, rk) EMIT(type_3R(0b00000000001000011, rk, rj, rd))

// GR[rd] = signed(GR[rj][63:0]) / signed(GR[rk][63:0])
#define DIV_D(rd, rj, rk) EMIT(type_3R(0b00000000001000100, rk, rj, rd))

// GR[rd] = unsigned(GR[rj][63:0]) / unsigned(GR[rk][63:0])
#define DIV_DU(rd, rj, rk) EMIT(type_3R(0b00000000001000110, rk, rj, rd))

// GR[rd] = signed(GR[rj] [63:0]) % signed(GR[rk] [63:0])
#define MOD_D(rd, rj, rk) EMIT(type_3R(0b00000000001000101, rk, rj, rd))

// GR[rd] = unsigned(GR[rj] [63:0]) % unsigned(GR[rk] [63:0])
#define MOD_DU(rd, rj, rk) EMIT(type_3R(0b00000000001000111, rk, rj, rd))

// Width-dependent multiply: MUL.D for the 64-bit form (rex.w), else MUL.W
// (32-bit multiply, result sign-extended). Relies on a `rex` variable being
// in scope at the expansion site.
#define MULxw(rd, rj, rk)     \
    do {                      \
        if (rex.w) {          \
            MUL_D(rd, rj, rk); \
        } else {              \
            MUL_W(rd, rj, rk); \
        }                     \
    } while (0)
|
||||
|
||||
|
||||
// bstr32[31:msbw+1] = GR[rd][31: msbw+1]
|
||||
// bstr32[msbw:lsbw] = GR[rj][msbw-lsbw:0]
|
||||
// bstr32[lsbw-1:0] = GR[rd][lsbw-1:0]
|
||||
@ -1709,6 +1779,8 @@ LSX instruction starts with V, LASX instruction starts with XV.
|
||||
SUB_W(rd, rj, rk); \
|
||||
} while (0)
|
||||
|
||||
// Width-dependent negate: rd = 0 - rs1 via SUBxw from the zero register
// (32- or 64-bit form selected by rex.w inside SUBxw).
#define NEGxw(rd, rs1) SUBxw(rd, xZR, rs1)
|
||||
|
||||
#define SUBz(rd, rj, rk) \
|
||||
do { \
|
||||
if (rex.is32bits) \
|
||||
|
@ -1445,6 +1445,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
|
||||
}
|
||||
break;
|
||||
case 0xAF:
|
||||
// TODO: Refine this
|
||||
INST_NAME("IMUL Gd, Ed");
|
||||
SETFLAGS(X_ALL, SF_PENDING);
|
||||
nextop = F8;
|
||||
|
Loading…
Reference in New Issue
Block a user