[RV64_DYNAREC] Added more 0F opcodes for vector and optimized some opcodes too (#1816)
Some checks failed
Build and Release Box64 / build (ubuntu-latest, ANDROID, Release) (push) Failing after 0s
Build and Release Box64 / build (ubuntu-latest, ANDROID, Trace) (push) Failing after 0s
Build and Release Box64 / build (ubuntu-latest, ARM64, Box32) (push) Failing after 0s
Build and Release Box64 / build (ubuntu-latest, ARM64, Release) (push) Failing after 0s
Build and Release Box64 / build (ubuntu-latest, ARM64, StaticBuild) (push) Failing after 0s
Build and Release Box64 / build (ubuntu-latest, ARM64, Trace) (push) Failing after 0s
Build and Release Box64 / build (ubuntu-latest, LARCH64, Box32) (push) Failing after 0s
Build and Release Box64 / build (ubuntu-latest, LARCH64, Release) (push) Failing after 0s
Build and Release Box64 / build (ubuntu-latest, LARCH64, StaticBuild) (push) Failing after 0s
Build and Release Box64 / build (ubuntu-latest, LARCH64, Trace) (push) Failing after 0s
Build and Release Box64 / build (ubuntu-latest, RISCV, Box32) (push) Failing after 0s
Build and Release Box64 / build (ubuntu-latest, RISCV, Release) (push) Failing after 0s
Build and Release Box64 / build (ubuntu-latest, RISCV, StaticBuild) (push) Failing after 0s
Build and Release Box64 / build (ubuntu-latest, RISCV, Trace) (push) Failing after 0s
Build and Release Box64 / build (ubuntu-latest, RK3588, Box32) (push) Failing after 0s
Build and Release Box64 / build (ubuntu-latest, RK3588, Release) (push) Failing after 0s
Build and Release Box64 / build (ubuntu-latest, RK3588, StaticBuild) (push) Failing after 0s
Build and Release Box64 / build (ubuntu-latest, RK3588, Trace) (push) Failing after 0s
Build and Release Box64 / build (ubuntu-latest, TERMUX, Release) (push) Failing after 0s
Build and Release Box64 / build (ubuntu-latest, TERMUX, Trace) (push) Failing after 0s
Build and Release Box64 / build (ubuntu-latest, X64, Box32) (push) Failing after 0s
Build and Release Box64 / build (ubuntu-latest, X64, Release) (push) Failing after 0s
Build and Release Box64 / build (ubuntu-latest, X64, Trace) (push) Failing after 0s

* [RV64_DYNAREC] Optimized 66 0F 67 PACKUSWB opcode

* [RV64_DYNAREC] Optimized 66 0F 6C PUNPCKLQDQ opcode

* [RV64_DYNAREC] Added some 0F opcodes for vector

* review
This commit is contained in:
Yang Liu 2024-09-11 16:25:04 +08:00 committed by GitHub
parent fc9900c8f6
commit 4d60b75240
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 180 additions and 34 deletions

View File

@ -910,8 +910,8 @@ if(RV64_DYNAREC)
"${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_00_2.c"
"${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_00_3.c"
"${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_0f.c"
"${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_0f_vector.c"
"${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_64.c"
#"${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_65.c"
"${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_66.c"
"${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_67.c"
"${BOX64_ROOT}/src/dynarec/rv64/dynarec_rv64_67_32.c"

View File

@ -40,6 +40,7 @@ uintptr_t dynarec64_00_0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
int64_t fixedaddress;
int lock;
int cacheupd = 0;
uintptr_t retaddr = 0;
opcode = F8;
MAYUSE(eb1);
@ -177,7 +178,9 @@ uintptr_t dynarec64_00_0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
case 0x0F:
switch(rep) {
case 0:
addr = dynarec64_0F(dyn, addr, ip, ninst, rex, ok, need_epilog);
if (rv64_vector)
retaddr = dynarec64_0F_vector(dyn, addr, ip, ninst, rex, ok, need_epilog);
addr = retaddr ? retaddr : dynarec64_0F(dyn, addr, ip, ninst, rex, ok, need_epilog);
break;
case 1:
addr = dynarec64_F20F(dyn, addr, ip, ninst, rex, ok, need_epilog);

View File

@ -0,0 +1,145 @@
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <errno.h>
#include "debug.h"
#include "box64context.h"
#include "dynarec.h"
#include "emu/x64emu_private.h"
#include "emu/x64run_private.h"
#include "x64run.h"
#include "x64emu.h"
#include "box64stack.h"
#include "callback.h"
#include "emu/x64run_private.h"
#include "x64trace.h"
#include "dynarec_native.h"
#include "my_cpuid.h"
#include "emu/x87emu_private.h"
#include "emu/x64shaext.h"
#include "bitutils.h"
#include "rv64_printer.h"
#include "dynarec_rv64_private.h"
#include "dynarec_rv64_functions.h"
#include "dynarec_rv64_helper.h"
/*
 * Vector-extension code generator for x86-64 two-byte 0F opcodes.
 *
 * Decodes the opcode byte following 0F and, for the few opcodes handled here
 * (0x10, 0x11, 0x16, 0x29), emits RISC-V vector instructions through the
 * helper macros.
 *
 * Parameters:
 *   dyn, ninst   - dynarec state and index of the instruction being built;
 *                  passed through to the helpers.
 *   addr         - address of the next byte to decode; advanced by F8/geted.
 *   ip           - not referenced directly in this body (cast to void).
 *   rex          - REX prefix bits; rex.b extends the ModRM r/m register index.
 *   ok           - not referenced directly in this body.
 *                  NOTE(review): may be used inside helper macros - confirm.
 *   need_epilog  - not referenced directly in this body (cast to void).
 *
 * Returns:
 *   The updated addr once the instruction has been handled, or 0 for the
 *   opcode ranges explicitly listed at the end of the switch.
 *   NOTE(review): the caller appears to treat a 0 return as "not handled by
 *   the vector path" and falls back to the scalar 0F generator - confirm.
 */
uintptr_t dynarec64_0F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog)
{
(void)ip;
(void)need_epilog;
// F8 presumably fetches one byte at addr and advances addr - confirm in the helper header.
uint8_t opcode = F8;
// NOTE(review): many of the locals below are not referenced in the visible
// body. They are presumably expected by helper macros shared with the other
// dynarec64_* generators - verify before removing any of them.
uint8_t nextop, u8;
uint8_t gd, ed;
uint8_t wb1, wback, wb2, gback;
uint8_t eb1, eb2;
uint8_t gb1, gb2;
int32_t i32, i32_;
int cacheupd = 0;
int v0, v1;
int q0, q1;
int d0, d1;
int s0, s1;
uint64_t tmp64u;
int64_t j64;
int64_t fixedaddress, gdoffset;
int unscaled;
// MAYUSE presumably silences unused-variable warnings for these names.
MAYUSE(wb2);
MAYUSE(gback);
MAYUSE(eb1);
MAYUSE(eb2);
MAYUSE(q0);
MAYUSE(q1);
MAYUSE(d0);
MAYUSE(d1);
MAYUSE(s0);
MAYUSE(j64);
MAYUSE(cacheupd);
switch (opcode) {
case 0x10:
// Load form: Gx <- Ex (register or memory source).
INST_NAME("MOVUPS Gx, Ex");
nextop = F8;
GETG;
// VECTOR_SEWANY: any element width is acceptable for a plain copy, so the
// currently configured width is reused when one is already set.
SET_ELEMENT_WIDTH(x1, VECTOR_SEWANY, 1);
if (MODREG) {
// Register source: fetch Ex first, then allocate Gx as a fresh destination.
ed = (nextop & 7) + (rex.b << 3);
v1 = sse_get_reg_vector(dyn, ninst, x1, ed, 0, dyn->vector_eew);
v0 = sse_get_reg_empty_vector(dyn, ninst, x1, gd);
VMV_V_V(v0, v1);
} else {
// Memory source: compute the effective address into ed, then vector-load into Gx.
SMREAD();
v0 = sse_get_reg_empty_vector(dyn, ninst, x1, gd);
addr = geted(dyn, addr, ninst, nextop, &ed, x2, x3, &fixedaddress, rex, NULL, 0, 0);
VLE_V(v0, ed, dyn->vector_eew, VECTOR_UNMASKED, VECTOR_NFIELD1);
}
break;
case 0x11:
// Store form: Ex <- Gx (register or memory destination).
INST_NAME("MOVUPS Ex, Gx");
nextop = F8;
SET_ELEMENT_WIDTH(x1, VECTOR_SEWANY, 1);
// Second argument is 0 here and 1 in case 0x16 where Gx is modified;
// presumably a "will be written" flag - confirm against GETGX_vector.
GETGX_vector(v0, 0, dyn->vector_eew);
if (MODREG) {
ed = (nextop & 7) + (rex.b << 3);
v1 = sse_get_reg_empty_vector(dyn, ninst, x1, ed);
VMV_V_V(v1, v0);
} else {
addr = geted(dyn, addr, ninst, nextop, &ed, x2, x3, &fixedaddress, rex, NULL, 0, 0);
VSE_V(v0, ed, dyn->vector_eew, VECTOR_UNMASKED, VECTOR_NFIELD1);
SMWRITE2();
}
break;
case 0x16:
// Opcode 0x16 is MOVLHPS with a register operand and MOVHPS with a memory
// operand. In both forms the low 64-bit element of Gx is kept and the high
// 64-bit element is replaced by a 64-bit source value.
nextop = F8;
if (MODREG) {
INST_NAME("MOVLHPS Gx, Ex");
SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
GETGX_vector(v0, 1, VECTOR_SEW64);
v1 = sse_get_reg_vector(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), 0, VECTOR_SEW64);
if (v0 == v1) {
// for vslideup.vi, cannot be overlapped
// Same source and destination register: slide from a scratch copy instead.
v1 = fpu_get_scratch(dyn);
VMV_V_V(v1, v0);
}
// Slide the source up by one 64-bit element: Gx element 1 <- Ex element 0,
// Gx element 0 is left untouched.
VSLIDEUP_VI(v0, 1, v1, VECTOR_UNMASKED);
} else {
INST_NAME("MOVHPS Gx, Ex");
SET_ELEMENT_WIDTH(x1, VECTOR_SEW64, 1);
GETGX_vector(v0, 1, VECTOR_SEW64);
// q0 is zeroed and used as the index vector of the indexed load below,
// so every active element reads from offset 0 of ed.
q0 = fpu_get_scratch(dyn);
VXOR_VV(q0, q0, q0, VECTOR_UNMASKED);
// Mask 0b10: only element 1 is active for the masked load.
VMV_V_I(VMASK, 0b10);
SMREAD();
addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 0, 0);
// Masked indexed load: the 64-bit value at [ed] lands in element 1 of Gx.
VLUXEI64_V(v0, ed, q0, VECTOR_MASKED, VECTOR_NFIELD1);
}
break;
case 0x29:
// Same code sequence as the MOVUPS store (case 0x11); only the
// instruction name differs.
INST_NAME("MOVAPS Ex, Gx");
nextop = F8;
SET_ELEMENT_WIDTH(x1, VECTOR_SEWANY, 1);
GETGX_vector(v0, 0, dyn->vector_eew);
if (MODREG) {
ed = (nextop & 7) + (rex.b << 3);
v1 = sse_get_reg_empty_vector(dyn, ninst, x1, ed);
VMV_V_V(v1, v0);
} else {
addr = geted(dyn, addr, ninst, nextop, &ed, x2, x3, &fixedaddress, rex, NULL, 0, 0);
VSE_V(v0, ed, dyn->vector_eew, VECTOR_UNMASKED, VECTOR_NFIELD1);
SMWRITE2();
}
break;
// Opcodes deliberately not handled by the vector path: return 0 without
// emitting anything. The "a ... b" case ranges are a GNU C extension.
case 0x00 ... 0x0F:
case 0x18:
case 0x1F:
case 0x31:
case 0x40 ... 0x4F:
case 0x80 ... 0xBF:
return 0;
default:
// NOTE(review): DEFAULT_VECTOR is defined elsewhere; presumably it handles
// any opcode with no vector implementation yet - confirm its exact effect.
DEFAULT_VECTOR;
}
return addr;
}

View File

@ -131,11 +131,9 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
case 0x0F:
switch(rep) {
case 0: {
if (rv64_vector) {
if (rv64_vector)
retaddr = dynarec64_660F_vector(dyn, addr, ip, ninst, rex, ok, need_epilog);
addr = retaddr ? retaddr : dynarec64_660F(dyn, addr, ip, ninst, rex, ok, need_epilog);
} else
addr = dynarec64_660F(dyn, addr, ip, ninst, rex, ok, need_epilog);
addr = retaddr ? retaddr : dynarec64_660F(dyn, addr, ip, ninst, rex, ok, need_epilog);
break;
}
case 1: addr = dynarec64_66F20F(dyn, addr, ip, ninst, rex, ok, need_epilog); break;

View File

@ -149,14 +149,13 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
fpu_get_scratch(dyn); // HACK: skip v3, for vector register group alignment!
d0 = fpu_get_scratch(dyn);
d1 = fpu_get_scratch(dyn);
VMAX_VX(d0, xZR, q0, VECTOR_UNMASKED);
VMAX_VX(d1, xZR, q1, VECTOR_UNMASKED);
if (rv64_vlen >= 256) {
/* mu tu sew lmul=1 */
vtypei = (0b0 << 7) | (0b0 << 6) | (VECTOR_SEW16 << 3) | 0b000;
ADDI(x1, xZR, 16); // double the vl for slideup.
VSETVLI(xZR, x1, vtypei);
VSLIDEUP_VI(d0, 8, d1, VECTOR_UNMASKED); // splice d0 and d1 here!
vector_vsetvl_emul1(dyn, ninst, x1, VECTOR_SEW16, 2); // double the vl for slideup.
VSLIDEUP_VI(q0, 8, q1, VECTOR_UNMASKED); // splice q0 and q1 here!
VMAX_VX(d0, xZR, q0, VECTOR_UNMASKED);
} else {
VMAX_VX(d0, xZR, q0, VECTOR_UNMASKED);
VMAX_VX(d1, xZR, q1, VECTOR_UNMASKED);
}
SET_ELEMENT_WIDTH(x1, VECTOR_SEW8, 1);
VNCLIPU_WI(q0, 0, d0, VECTOR_UNMASKED);
@ -185,18 +184,18 @@ uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
// GX->q[0] = GX->q[0]; -> unchanged
// GX->q[1] = EX->q[0];
GETGX_vector(v0, 1, VECTOR_SEW64);
q0 = fpu_get_scratch(dyn);
VXOR_VV(q0, q0, q0, VECTOR_UNMASKED);
VMV_V_I(VMASK, 0b10);
if (MODREG) {
v1 = sse_get_reg_vector(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), 0, VECTOR_SEW64);
if (v0 == v1) {
// for vrgather.vv, cannot be overlapped
// for vslideup.vi, cannot be overlapped
v1 = fpu_get_scratch(dyn);
VMV_V_V(v1, v0);
}
VRGATHER_VV(v0, q0, v1, VECTOR_MASKED);
VSLIDEUP_VI(v0, 1, v1, VECTOR_UNMASKED);
} else {
q0 = fpu_get_scratch(dyn);
VXOR_VV(q0, q0, q0, VECTOR_UNMASKED);
VMV_V_I(VMASK, 0b10);
SMREAD();
addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, 0, 0);
VLUXEI64_V(v0, ed, q0, VECTOR_MASKED, VECTOR_NFIELD1);

View File

@ -2434,7 +2434,7 @@ static void sewTransform(dynarec_rv64_t* dyn, int ninst, int s1)
if (jmp < 0) return;
if (dyn->insts[jmp].vector_sew == VECTOR_SEWNA) return;
MESSAGE(LOG_DUMP, "\tSEW changed to %d ---- ninst=%d -> %d\n", dyn->insts[jmp].vector_sew, ninst, jmp);
vector_vsetvl_emul1(dyn, ninst, s1, dyn->insts[jmp].vector_sew);
vector_vsetvl_emul1(dyn, ninst, s1, dyn->insts[jmp].vector_sew, 1);
}
void CacheTransform(dynarec_rv64_t* dyn, int ninst, int cacheupd, int s1, int s2, int s3)
@ -2590,9 +2590,8 @@ void fpu_propagate_stack(dynarec_rv64_t* dyn, int ninst)
dyn->e.swapped = 0;
}
// Use vector extension as like SIMD for now, this function sets the specified element width,
// other configs are set automatically.
int vector_vsetvl_emul1(dynarec_rv64_t* dyn, int ninst, int s1, int sew)
// Simple wrapper for vsetvli
int vector_vsetvl_emul1(dynarec_rv64_t* dyn, int ninst, int s1, int sew, int multiple)
{
if (sew == VECTOR_SEWNA) return VECTOR_SEW8;
if (sew == VECTOR_SEWANY) sew = VECTOR_SEW8;
@ -2603,7 +2602,7 @@ int vector_vsetvl_emul1(dynarec_rv64_t* dyn, int ninst, int s1, int sew)
*
* mu tu sew lmul=1 */
uint32_t vtypei = (0b0 << 7) | (0b0 << 6) | (sew << 3) | 0b000;
ADDI(s1, xZR, 16 >> sew);
ADDI(s1, xZR, (16 >> sew) * multiple);
VSETVLI(xZR, s1, vtypei);
return sew;
}

View File

@ -1081,16 +1081,16 @@
#define MODREG ((nextop & 0xC0) == 0xC0)
#ifndef SET_ELEMENT_WIDTH
#define SET_ELEMENT_WIDTH(s1, sew, set) \
do { \
if (sew == VECTOR_SEWANY && dyn->vector_sew != VECTOR_SEWNA) { \
dyn->vector_eew = dyn->vector_sew; \
} else if (sew == dyn->vector_sew) { \
dyn->vector_eew = dyn->vector_sew; \
} else { \
dyn->vector_eew = vector_vsetvl_emul1(dyn, ninst, s1, sew); \
} \
if (set) dyn->vector_sew = dyn->vector_eew; \
#define SET_ELEMENT_WIDTH(s1, sew, set) \
do { \
if (sew == VECTOR_SEWANY && dyn->vector_sew != VECTOR_SEWNA) { \
dyn->vector_eew = dyn->vector_sew; \
} else if (sew == dyn->vector_sew) { \
dyn->vector_eew = dyn->vector_sew; \
} else { \
dyn->vector_eew = vector_vsetvl_emul1(dyn, ninst, s1, sew, 1); \
} \
if (set) dyn->vector_sew = dyn->vector_eew; \
} while (0)
#endif
@ -1134,6 +1134,7 @@ void* rv64_next(x64emu_t* emu, uintptr_t addr);
#define dynarec64_F20F STEPNAME(dynarec64_F20F)
#define dynarec64_F30F STEPNAME(dynarec64_F30F)
#define dynarec64_0F_vector STEPNAME(dynarec64_0F_vector)
#define dynarec64_660F_vector STEPNAME(dynarec64_660F_vector)
#define geted STEPNAME(geted)
@ -1441,7 +1442,7 @@ void CacheTransform(dynarec_rv64_t* dyn, int ninst, int cacheupd, int s1, int s2
void rv64_move64(dynarec_rv64_t* dyn, int ninst, int reg, int64_t val);
void rv64_move32(dynarec_rv64_t* dyn, int ninst, int reg, int32_t val, int zeroup);
int vector_vsetvl_emul1(dynarec_rv64_t* dyn, int ninst, int s1, int sew);
int vector_vsetvl_emul1(dynarec_rv64_t* dyn, int ninst, int s1, int sew, int multiple);
#if STEP < 2
#define CHECK_CACHE() 0
@ -1546,6 +1547,7 @@ uintptr_t dynarec64_66F0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
uintptr_t dynarec64_F20F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog);
uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog);
uintptr_t dynarec64_0F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog);
uintptr_t dynarec64_660F_vector(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog);
#if STEP < 2