mirror of
https://github.com/FEX-Emu/FEX.git
synced 2025-02-17 05:18:49 +00:00
Merge pull request #3786 from Sonicadvance1/non_temporal_stores
OpcodeDispatcher: Implement support for non-temporal vector stores
This commit is contained in:
commit
472a373861
@ -3321,7 +3321,18 @@ public:
|
||||
|
||||
// SVE Memory - Contiguous Store with Immediate Offset
|
||||
// SVE contiguous non-temporal store (scalar plus immediate)
|
||||
// XXX:
|
||||
// Non-temporal contiguous store, byte-element form (scalar plus immediate).
// Forwards to SVEContiguousNontemporalStore with size selector 0b00.
// Imm is handed through untouched and defaults to 0 (no offset).
void stnt1b(ZRegister zt, PRegister pg, Register rn, int32_t Imm = 0) {
  constexpr uint32_t ByteSelector = 0b00;
  SVEContiguousNontemporalStore(ByteSelector, zt, pg, rn, Imm);
}
|
||||
// Non-temporal contiguous store, halfword-element form (scalar plus immediate).
// Forwards to SVEContiguousNontemporalStore with size selector 0b01.
// Imm is handed through untouched and defaults to 0 (no offset).
void stnt1h(ZRegister zt, PRegister pg, Register rn, int32_t Imm = 0) {
  constexpr uint32_t HalfwordSelector = 0b01;
  SVEContiguousNontemporalStore(HalfwordSelector, zt, pg, rn, Imm);
}
|
||||
// Non-temporal contiguous store, word-element form (scalar plus immediate).
// Forwards to SVEContiguousNontemporalStore with size selector 0b10.
// Imm is handed through untouched and defaults to 0 (no offset).
void stnt1w(ZRegister zt, PRegister pg, Register rn, int32_t Imm = 0) {
  constexpr uint32_t WordSelector = 0b10;
  SVEContiguousNontemporalStore(WordSelector, zt, pg, rn, Imm);
}
|
||||
// Non-temporal contiguous store, doubleword-element form (scalar plus immediate).
// Forwards to SVEContiguousNontemporalStore with size selector 0b11.
// Imm is handed through untouched and defaults to 0 (no offset).
void stnt1d(ZRegister zt, PRegister pg, Register rn, int32_t Imm = 0) {
  constexpr uint32_t DoublewordSelector = 0b11;
  SVEContiguousNontemporalStore(DoublewordSelector, zt, pg, rn, Imm);
}
|
||||
|
||||
// SVE store multiple structures (scalar plus immediate)
|
||||
void st2b(ZRegister zt1, ZRegister zt2, PRegister pg, Register rn, int32_t Imm = 0) {
|
||||
@ -4481,6 +4492,22 @@ private:
|
||||
dc32(Instr);
|
||||
}
|
||||
|
||||
// SVE contiguous non-temporal store (scalar plus immediate)
|
||||
// Builds and emits one "SVE contiguous non-temporal store (scalar plus immediate)" word.
//   msz - element-size selector, shifted into place at bit 23.
//   zt  - vector register to store; its index is OR'd into the low bits.
//   pg  - governing predicate; asserted to be one of p0-p7, index shifted to bit 10.
//   rn  - base address register, folded in through Encode_rn.
//   imm - signed offset; asserted to lie in [-8, 7], low four bits shifted to bit 16.
void SVEContiguousNontemporalStore(uint32_t msz, ZRegister zt, PRegister pg, Register rn, int32_t imm) {
  LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate");
  LOGMAN_THROW_AA_FMT(imm >= -8 && imm <= 7, "Invalid loadstore offset ({}). Must be between [-8, 7]", imm);

  // Fixed opcode bits shared by every element size.
  constexpr uint32_t FixedBits = 0b1110'0100'0001'0000'1110'0000'0000'0000;

  // Two's-complement truncation turns the signed offset into its 4-bit field.
  const auto OffsetField = static_cast<uint32_t>(imm) & 0xF;

  const uint32_t Word = FixedBits | (msz << 23) | (OffsetField << 16) | (pg.Idx() << 10) | Encode_rn(rn) | zt.Idx();
  dc32(Word);
}
|
||||
|
||||
void SVEContiguousLoadImm(bool is_store, uint32_t dtype, int32_t imm, PRegister pg, Register rn, ZRegister zt) {
|
||||
LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate");
|
||||
LOGMAN_THROW_AA_FMT(imm >= -8 && imm <= 7,
|
||||
|
@ -2171,5 +2171,47 @@ DEF_OP(Prefetch) {
|
||||
prfm(PrefetchType[LUT], MemSrc);
|
||||
}
|
||||
|
||||
// Emits host code for the VStoreNonTemporal IR op.
//  - 256-bit ops use stnt1b under the 32-byte temporary predicate (asserts SVE256 support).
//  - 128-bit ops use stnt1b under the 16-byte temporary predicate when SVE128 is available.
//  - Anything else falls back to an ordinary 128-bit vector store.
DEF_OP(VStoreNonTemporal) {
  const auto Op = IROp->C<IR::IROp_VStoreNonTemporal>();
  const auto OpSize = IROp->Size;

  const auto Src = GetVReg(Op->Value.ID());
  const auto Base = GetReg(Op->Addr.ID());
  const auto ByteOffset = Op->Offset;

  if (OpSize == Core::CPUState::XMM_AVX_REG_SIZE) {
    LOGMAN_THROW_A_FMT(HostSupportsSVE256, "Need SVE256 support in order to use VStoreNonTemporal with 256-bit operation");
    const auto Predicate = PRED_TMP_32B.Zeroing();
    // The byte offset is converted to a count of 32-byte units before being handed to stnt1b.
    stnt1b(Src.Z(), Predicate, Base, ByteOffset / 32);
  } else if (OpSize == Core::CPUState::XMM_SSE_REG_SIZE && HostSupportsSVE128) {
    const auto Predicate = PRED_TMP_16B.Zeroing();
    // Same conversion, in 16-byte units.
    stnt1b(Src.Z(), Predicate, Base, ByteOffset / 16);
  } else {
    // Treat the non-temporal store as a regular vector store in this case for compatibility
    str(Src.Q(), Base, ByteOffset);
  }
}
|
||||
|
||||
// Emits host code for the VStoreNonTemporalPair IR op: a single stnp of the
// low and high 128-bit registers at Addr + Offset. Asserts the op is 128-bit wide.
DEF_OP(VStoreNonTemporalPair) {
  const auto Op = IROp->C<IR::IROp_VStoreNonTemporalPair>();

  const auto Is128Bit = IROp->Size == Core::CPUState::XMM_SSE_REG_SIZE;
  LOGMAN_THROW_A_FMT(Is128Bit, "This IR operation only operates at 128-bit wide");

  const auto LowHalf = GetVReg(Op->ValueLow.ID());
  const auto HighHalf = GetVReg(Op->ValueHigh.ID());
  const auto Base = GetReg(Op->Addr.ID());

  stnp(LowHalf.Q(), HighHalf.Q(), Base, Op->Offset);
}
|
||||
|
||||
#undef DEF_OP
|
||||
} // namespace FEXCore::CPU
|
||||
|
@ -816,12 +816,25 @@ void OpDispatchBuilder::AVX128_MOVVectorNT(OpcodeArgs) {
|
||||
const auto SrcSize = GetSrcSize(Op);
|
||||
const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE;
|
||||
|
||||
auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit, MemoryAccessType::STREAM);
|
||||
if (Op->Dest.IsGPR()) {
|
||||
///< MOVNTDQA load non-temporal comes from SSE4.1 and is extended by AVX/AVX2.
|
||||
auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit, MemoryAccessType::STREAM);
|
||||
if (Is128Bit) {
|
||||
Src.High = LoadZeroVector(OpSize::i128Bit);
|
||||
}
|
||||
AVX128_StoreResult_WithOpSize(Op, Op->Dest, Src);
|
||||
} else {
|
||||
auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit, MemoryAccessType::STREAM);
|
||||
Ref Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false});
|
||||
|
||||
if (Op->Dest.IsGPR() && Is128Bit) {
|
||||
Src.High = LoadZeroVector(OpSize::i128Bit);
|
||||
if (Is128Bit) {
|
||||
// Single store non-temporal for 128-bit operations.
|
||||
_VStoreNonTemporal(OpSize::i128Bit, Src.Low, Dest, 0);
|
||||
} else {
|
||||
// For a 256-bit store, use a non-temporal store pair
|
||||
_VStoreNonTemporalPair(OpSize::i128Bit, Src.Low, Src.High, Dest, 0);
|
||||
}
|
||||
}
|
||||
AVX128_StoreResult_WithOpSize(Op, Op->Dest, Src);
|
||||
}
|
||||
|
||||
void OpDispatchBuilder::AVX128_MOVQ(OpcodeArgs) {
|
||||
|
@ -43,8 +43,27 @@ void OpDispatchBuilder::MOVVectorUnalignedOp(OpcodeArgs) {
|
||||
}
|
||||
|
||||
// Handles x86 non-temporal vector moves.
//  - Register destination: a streaming load followed by a normal result store.
//  - Memory destination smaller than 128-bit: a streaming StoreResult.
//  - Memory destination of 128-bit or larger: one _VStoreNonTemporal to the computed address.
//
// Each path performs exactly one load and one store. The unconditional
// LoadSource/StoreResult pair that previously preceded the branch has been removed,
// since it duplicated the access that the selected branch performs anyway.
void OpDispatchBuilder::MOVVectorNTOp(OpcodeArgs) {
  const auto Size = GetDstSize(Op);

  if (Op->Dest.IsGPR()) {
    ///< MOVNTDQA load non-temporal comes from SSE4.1 and is extended by AVX/AVX2.
    Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags, {.Align = 1, .AccessType = MemoryAccessType::STREAM});
    StoreResult(FPRClass, Op, Src, 1, MemoryAccessType::STREAM);
  } else {
    // Always holds in this branch; kept as an explicit statement of the invariant.
    LOGMAN_THROW_A_FMT(!Op->Dest.IsGPR(), "Destination can't be GPR for non-temporal stores");
    Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags, {.Align = 1, .AccessType = MemoryAccessType::STREAM});
    if (Size < OpSize::i128Bit) {
      // Normal streaming store if less than 128-bit
      // XMM Scalar 32-bit and 64-bit comes from SSE4a MOVNTSS, MOVNTSD
      // MMX 64-bit comes from MOVNTQ
      StoreResult(FPRClass, Op, Src, 1, MemoryAccessType::STREAM);
    } else {
      // Only the destination address is needed here, so the load of its data is disabled.
      Ref Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false});

      // Single store non-temporal for larger operations.
      _VStoreNonTemporal(Size, Src, Dest, 0);
    }
  }
}
|
||||
|
||||
void OpDispatchBuilder::VMOVAPS_VMOVAPDOp(OpcodeArgs) {
|
||||
|
@ -653,6 +653,30 @@
|
||||
],
|
||||
"HasSideEffects": true,
|
||||
"DestSize": "8"
|
||||
},
|
||||
"VStoreNonTemporal u8:#RegisterSize, FPR:$Value, GPR:$Addr, i8:$Offset": {
|
||||
"Desc": ["Does a non-temporal memory store of a vector.",
|
||||
"Matches arm64 SVE stnt1b semantics.",
|
||||
"Specifically weak-memory model ordered to match x86 non-temporal stores."
|
||||
],
|
||||
"HasSideEffects": true,
|
||||
"DestSize": "RegisterSize",
|
||||
"EmitValidation": [
|
||||
"_Offset % RegisterSize == 0",
|
||||
"RegisterSize == FEXCore::IR::OpSize::i128Bit || RegisterSize == FEXCore::IR::OpSize::i256Bit"
|
||||
]
|
||||
},
|
||||
"VStoreNonTemporalPair u8:#RegisterSize, FPR:$ValueLow, FPR:$ValueHigh, GPR:$Addr, i8:$Offset": {
|
||||
"Desc": ["Does a non-temporal memory store of two vector registers.",
|
||||
"Matches arm64 stnp semantics.",
|
||||
"Specifically weak-memory model ordered to match x86 non-temporal stores."
|
||||
],
|
||||
"HasSideEffects": true,
|
||||
"DestSize": "RegisterSize",
|
||||
"EmitValidation": [
|
||||
"_Offset % RegisterSize == 0",
|
||||
"RegisterSize == FEXCore::IR::OpSize::i128Bit"
|
||||
]
|
||||
}
|
||||
},
|
||||
"Atomic": {
|
||||
|
@ -4539,6 +4539,24 @@ TEST_CASE_METHOD(TestDisassembler, "Emitter: SVE: SVE store multiple structures
|
||||
"[x29, x30, lsl #3]");
|
||||
}
|
||||
|
||||
// Disassembly checks for stnt1b/h/w/d (scalar plus immediate).
// Each element size is exercised with no offset and with the two extreme
// immediates accepted by the emitter (-8 and 7).
TEST_CASE_METHOD(TestDisassembler, "Emitter: SVE: SVE contiguous non-temporal store (scalar plus immediate)") {
  // Byte elements
  TEST_SINGLE(stnt1b(ZReg::z31, PReg::p6, Reg::r29, 0), "stnt1b {z31.b}, p6, [x29]");
  TEST_SINGLE(stnt1b(ZReg::z31, PReg::p6, Reg::r29, -8), "stnt1b {z31.b}, p6, [x29, #-8, mul vl]");
  TEST_SINGLE(stnt1b(ZReg::z31, PReg::p6, Reg::r29, 7), "stnt1b {z31.b}, p6, [x29, #7, mul vl]");

  // Halfword elements
  TEST_SINGLE(stnt1h(ZReg::z31, PReg::p6, Reg::r29, 0), "stnt1h {z31.h}, p6, [x29]");
  TEST_SINGLE(stnt1h(ZReg::z31, PReg::p6, Reg::r29, -8), "stnt1h {z31.h}, p6, [x29, #-8, mul vl]");
  TEST_SINGLE(stnt1h(ZReg::z31, PReg::p6, Reg::r29, 7), "stnt1h {z31.h}, p6, [x29, #7, mul vl]");

  // Word elements
  TEST_SINGLE(stnt1w(ZReg::z31, PReg::p6, Reg::r29, 0), "stnt1w {z31.s}, p6, [x29]");
  TEST_SINGLE(stnt1w(ZReg::z31, PReg::p6, Reg::r29, -8), "stnt1w {z31.s}, p6, [x29, #-8, mul vl]");
  TEST_SINGLE(stnt1w(ZReg::z31, PReg::p6, Reg::r29, 7), "stnt1w {z31.s}, p6, [x29, #7, mul vl]");

  // Doubleword elements
  TEST_SINGLE(stnt1d(ZReg::z31, PReg::p6, Reg::r29, 0), "stnt1d {z31.d}, p6, [x29]");
  TEST_SINGLE(stnt1d(ZReg::z31, PReg::p6, Reg::r29, -8), "stnt1d {z31.d}, p6, [x29, #-8, mul vl]");
  TEST_SINGLE(stnt1d(ZReg::z31, PReg::p6, Reg::r29, 7), "stnt1d {z31.d}, p6, [x29, #7, mul vl]");
}
|
||||
|
||||
TEST_CASE_METHOD(TestDisassembler, "Emitter: SVE: SVE store multiple structures (scalar plus immediate)") {
|
||||
TEST_SINGLE(st2b(ZReg::z31, ZReg::z0, PReg::p6, Reg::r29, 0), "st2b {z31.b, z0.b}, p6, [x29]");
|
||||
TEST_SINGLE(st2b(ZReg::z26, ZReg::z27, PReg::p6, Reg::r29, 0), "st2b {z26.b, z27.b}, p6, [x29]");
|
||||
|
@ -2853,14 +2853,13 @@
|
||||
]
|
||||
},
|
||||
"vmovntps [rax], ymm0": {
|
||||
"ExpectedInstructionCount": 3,
|
||||
"ExpectedInstructionCount": 2,
|
||||
"Comment": [
|
||||
"Map 1 0b00 0x2B 256-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"ldr q2, [x28, #16]",
|
||||
"str q16, [x4]",
|
||||
"str q2, [x4, #16]"
|
||||
"stnp q16, q2, [x4]"
|
||||
]
|
||||
},
|
||||
"vmovntpd [rax], xmm0": {
|
||||
@ -2873,14 +2872,13 @@
|
||||
]
|
||||
},
|
||||
"vmovntpd [rax], ymm0": {
|
||||
"ExpectedInstructionCount": 3,
|
||||
"ExpectedInstructionCount": 2,
|
||||
"Comment": [
|
||||
"Map 1 0b01 0x2B 256-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"ldr q2, [x28, #16]",
|
||||
"str q16, [x4]",
|
||||
"str q2, [x4, #16]"
|
||||
"stnp q16, q2, [x4]"
|
||||
]
|
||||
},
|
||||
"vcvttss2si eax, xmm0": {
|
||||
@ -4929,14 +4927,13 @@
|
||||
]
|
||||
},
|
||||
"vmovntdq [rax], ymm0": {
|
||||
"ExpectedInstructionCount": 3,
|
||||
"ExpectedInstructionCount": 2,
|
||||
"Comment": [
|
||||
"Map 1 0b01 0xe7 256-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"ldr q2, [x28, #16]",
|
||||
"str q16, [x4]",
|
||||
"str q2, [x4, #16]"
|
||||
"stnp q16, q2, [x4]"
|
||||
]
|
||||
},
|
||||
"vpsubsb xmm0, xmm1, xmm2": {
|
||||
|
73
unittests/InstructionCountCI/AVX128/VEX_map1_SVE128.json
Normal file
73
unittests/InstructionCountCI/AVX128/VEX_map1_SVE128.json
Normal file
@ -0,0 +1,73 @@
|
||||
{
|
||||
"Features": {
|
||||
"Bitness": 64,
|
||||
"EnabledHostFeatures": [
|
||||
"SVE128"
|
||||
],
|
||||
"DisabledHostFeatures": [
|
||||
"AFP",
|
||||
"FLAGM",
|
||||
"FLAGM2",
|
||||
"SVE256"
|
||||
]
|
||||
},
|
||||
"Instructions": {
|
||||
"vmovntps [rax], xmm0": {
|
||||
"ExpectedInstructionCount": 1,
|
||||
"Comment": [
|
||||
"Map 1 0b00 0x2B 128-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"stnt1b {z16.b}, p6, [x4]"
|
||||
]
|
||||
},
|
||||
"vmovntps [rax], ymm0": {
|
||||
"ExpectedInstructionCount": 2,
|
||||
"Comment": [
|
||||
"Map 1 0b00 0x2B 256-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"ldr q2, [x28, #16]",
|
||||
"stnp q16, q2, [x4]"
|
||||
]
|
||||
},
|
||||
"vmovntpd [rax], xmm0": {
|
||||
"ExpectedInstructionCount": 1,
|
||||
"Comment": [
|
||||
"Map 1 0b01 0x2B 128-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"stnt1b {z16.b}, p6, [x4]"
|
||||
]
|
||||
},
|
||||
"vmovntpd [rax], ymm0": {
|
||||
"ExpectedInstructionCount": 2,
|
||||
"Comment": [
|
||||
"Map 1 0b01 0x2B 256-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"ldr q2, [x28, #16]",
|
||||
"stnp q16, q2, [x4]"
|
||||
]
|
||||
},
|
||||
"vmovntdq [rax], xmm0": {
|
||||
"ExpectedInstructionCount": 1,
|
||||
"Comment": [
|
||||
"Map 1 0b01 0xe7 128-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"stnt1b {z16.b}, p6, [x4]"
|
||||
]
|
||||
},
|
||||
"vmovntdq [rax], ymm0": {
|
||||
"ExpectedInstructionCount": 2,
|
||||
"Comment": [
|
||||
"Map 1 0b01 0xe7 256-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"ldr q2, [x28, #16]",
|
||||
"stnp q16, q2, [x4]"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
@ -3246,7 +3246,7 @@
|
||||
"Map 1 0b00 0x2B 128-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"str q16, [x4]"
|
||||
"stnt1b {z16.b}, p6, [x4]"
|
||||
]
|
||||
},
|
||||
"vmovntps [rax], ymm0": {
|
||||
@ -3255,7 +3255,7 @@
|
||||
"Map 1 0b00 0x2B 256-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"st1b {z16.b}, p7, [x4]"
|
||||
"stnt1b {z16.b}, p7, [x4]"
|
||||
]
|
||||
},
|
||||
"vmovntpd [rax], xmm0": {
|
||||
@ -3264,7 +3264,7 @@
|
||||
"Map 1 0b01 0x2B 128-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"str q16, [x4]"
|
||||
"stnt1b {z16.b}, p6, [x4]"
|
||||
]
|
||||
},
|
||||
"vmovntpd [rax], ymm0": {
|
||||
@ -3273,7 +3273,7 @@
|
||||
"Map 1 0b01 0x2B 256-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"st1b {z16.b}, p7, [x4]"
|
||||
"stnt1b {z16.b}, p7, [x4]"
|
||||
]
|
||||
},
|
||||
"vcvttss2si eax, xmm0": {
|
||||
@ -5004,7 +5004,7 @@
|
||||
"Map 1 0b01 0xe7 128-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"str q16, [x4]"
|
||||
"stnt1b {z16.b}, p6, [x4]"
|
||||
]
|
||||
},
|
||||
"vmovntdq [rax], ymm0": {
|
||||
@ -5013,7 +5013,7 @@
|
||||
"Map 1 0b01 0xe7 256-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"st1b {z16.b}, p7, [x4]"
|
||||
"stnt1b {z16.b}, p7, [x4]"
|
||||
]
|
||||
},
|
||||
"vpsubsb xmm0, xmm1, xmm2": {
|
||||
|
Loading…
x
Reference in New Issue
Block a user