Merge pull request #3786 from Sonicadvance1/non_temporal_stores

OpcodeDispatcher: Implement support for non-temporal vector stores
Ryan Houdek 2024-07-01 18:57:38 -07:00 committed by GitHub
commit 472a373861
9 changed files with 235 additions and 22 deletions


@ -3321,7 +3321,18 @@ public:
// SVE Memory - Contiguous Store with Immediate Offset
// SVE contiguous non-temporal store (scalar plus immediate)
void stnt1b(ZRegister zt, PRegister pg, Register rn, int32_t Imm = 0) {
SVEContiguousNontemporalStore(0b00, zt, pg, rn, Imm);
}
void stnt1h(ZRegister zt, PRegister pg, Register rn, int32_t Imm = 0) {
SVEContiguousNontemporalStore(0b01, zt, pg, rn, Imm);
}
void stnt1w(ZRegister zt, PRegister pg, Register rn, int32_t Imm = 0) {
SVEContiguousNontemporalStore(0b10, zt, pg, rn, Imm);
}
void stnt1d(ZRegister zt, PRegister pg, Register rn, int32_t Imm = 0) {
SVEContiguousNontemporalStore(0b11, zt, pg, rn, Imm);
}
// SVE store multiple structures (scalar plus immediate)
void st2b(ZRegister zt1, ZRegister zt2, PRegister pg, Register rn, int32_t Imm = 0) {
@ -4481,6 +4492,22 @@ private:
dc32(Instr);
}
// SVE contiguous non-temporal store (scalar plus immediate)
void SVEContiguousNontemporalStore(uint32_t msz, ZRegister zt, PRegister pg, Register rn, int32_t imm) {
LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate");
LOGMAN_THROW_AA_FMT(imm >= -8 && imm <= 7,
"Invalid loadstore offset ({}). Must be between [-8, 7]", imm);
const auto imm4 = static_cast<uint32_t>(imm) & 0xF;
uint32_t Instr = 0b1110'0100'0001'0000'1110'0000'0000'0000;
Instr |= msz << 23;
Instr |= imm4 << 16;
Instr |= pg.Idx() << 10;
Instr |= Encode_rn(rn);
Instr |= zt.Idx();
dc32(Instr);
}
void SVEContiguousLoadImm(bool is_store, uint32_t dtype, int32_t imm, PRegister pg, Register rn, ZRegister zt) {
LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate");
LOGMAN_THROW_AA_FMT(imm >= -8 && imm <= 7,

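For reference, here is a minimal standalone sketch of the bit packing that SVEContiguousNontemporalStore performs above. The field layout (msz at bits [24:23], imm4 at [19:16], Pg at [12:10], Rn at [9:5], Zt at [4:0]) is taken from the helper itself; the EncodeStnt1 name, the assumption that Encode_rn places the base register index at bits [9:5], and the hand-derived expected word are illustrative only and not part of the FEX emitter API.

#include <cassert>
#include <cstdint>

// Standalone mirror of the field packing for
// "SVE contiguous non-temporal store (scalar plus immediate)".
static uint32_t EncodeStnt1(uint32_t msz, uint32_t zt, uint32_t pg, uint32_t rn, int32_t imm) {
  const uint32_t imm4 = static_cast<uint32_t>(imm) & 0xF;
  uint32_t Instr = 0b1110'0100'0001'0000'1110'0000'0000'0000;
  Instr |= msz << 23;  // 00 = byte, 01 = half, 10 = word, 11 = doubleword
  Instr |= imm4 << 16; // signed vector-length-scaled immediate, [-8, 7]
  Instr |= pg << 10;   // governing predicate p0-p7
  Instr |= rn << 5;    // base register
  Instr |= zt;         // source Z register
  return Instr;
}

int main() {
  // stnt1b {z31.b}, p6, [x29, #-8, mul vl] -- expected word derived by hand from the fields above.
  assert(EncodeStnt1(0b00, 31, 6, 29, -8) == 0xE418FBBF);
  return 0;
}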

@ -2171,5 +2171,47 @@ DEF_OP(Prefetch) {
prfm(PrefetchType[LUT], MemSrc);
}
DEF_OP(VStoreNonTemporal) {
const auto Op = IROp->C<IR::IROp_VStoreNonTemporal>();
const auto OpSize = IROp->Size;
const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE;
const auto Is128Bit = OpSize == Core::CPUState::XMM_SSE_REG_SIZE;
const auto Value = GetVReg(Op->Value.ID());
const auto MemReg = GetReg(Op->Addr.ID());
const auto Offset = Op->Offset;
if (Is256Bit) {
LOGMAN_THROW_A_FMT(HostSupportsSVE256, "Need SVE256 support in order to use VStoreNonTemporal with 256-bit operation");
const auto GoverningPredicate = PRED_TMP_32B.Zeroing();
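// The stnt1 immediate is measured in multiples of the vector length (MUL VL),
// so the byte offset is divided by the 32-byte (SVE256) or 16-byte (SVE128) VL below.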
const auto OffsetScaled = Offset / 32;
stnt1b(Value.Z(), GoverningPredicate, MemReg, OffsetScaled);
} else if (Is128Bit && HostSupportsSVE128) {
const auto GoverningPredicate = PRED_TMP_16B.Zeroing();
const auto OffsetScaled = Offset / 16;
stnt1b(Value.Z(), GoverningPredicate, MemReg, OffsetScaled);
} else {
// Treat the non-temporal store as a regular vector store in this case for compatibility
str(Value.Q(), MemReg, Offset);
}
}
DEF_OP(VStoreNonTemporalPair) {
const auto Op = IROp->C<IR::IROp_VStoreNonTemporalPair>();
const auto OpSize = IROp->Size;
const auto Is128Bit = OpSize == Core::CPUState::XMM_SSE_REG_SIZE;
LOGMAN_THROW_A_FMT(Is128Bit, "This IR operation only operates at 128-bit wide");
const auto ValueLow = GetVReg(Op->ValueLow.ID());
const auto ValueHigh = GetVReg(Op->ValueHigh.ID());
const auto MemReg = GetReg(Op->Addr.ID());
const auto Offset = Op->Offset;
stnp(ValueLow.Q(), ValueHigh.Q(), MemReg, Offset);
}
#undef DEF_OP
} // namespace FEXCore::CPU
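A note on the stnp path above: the A64 STNP (SIMD&FP) form with Q registers encodes its offset as a signed 7-bit immediate scaled by 16, so only 16-byte-aligned offsets in [-1024, 1008] are representable. The dispatcher currently always passes an offset of 0, which trivially fits; a hypothetical guard for the general case (not a FEX helper) could look like:

#include <cstdint>

// Sketch only: does a byte offset fit the "stnp qt1, qt2, [xn, #imm]" form,
// whose immediate is imm7 * 16 with imm7 in [-64, 63]?
static bool IsEncodableStnpOffsetQ(int64_t Offset) {
  return (Offset % 16) == 0 && Offset >= -1024 && Offset <= 1008;
}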


@ -816,12 +816,25 @@ void OpDispatchBuilder::AVX128_MOVVectorNT(OpcodeArgs) {
const auto SrcSize = GetSrcSize(Op);
const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE;
if (Op->Dest.IsGPR()) {
///< MOVNTDQA load non-temporal comes from SSE4.1 and is extended by AVX/AVX2.
auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit, MemoryAccessType::STREAM);
if (Is128Bit) {
Src.High = LoadZeroVector(OpSize::i128Bit);
}
AVX128_StoreResult_WithOpSize(Op, Op->Dest, Src);
} else {
auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit, MemoryAccessType::STREAM);
Ref Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false});
if (Is128Bit) {
// Single store non-temporal for 128-bit operations.
_VStoreNonTemporal(OpSize::i128Bit, Src.Low, Dest, 0);
} else {
// For a 256-bit store, use a non-temporal store pair
_VStoreNonTemporalPair(OpSize::i128Bit, Src.Low, Src.High, Dest, 0);
}
}
}
void OpDispatchBuilder::AVX128_MOVQ(OpcodeArgs) {

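As a guest-side illustration (not part of this change), the 256-bit streaming stores below are the kind of code that reaches the non-GPR branch above; without SVE256 they are expected to lower to a single stnp, as the instruction-count tests later in this diff show. The function name is made up for the example.

#include <immintrin.h>

// Hypothetical guest snippet exercising the 256-bit non-temporal store path.
void StreamStore256(float* dst_ps, __m256 ps, __m256i* dst_dq, __m256i dq) {
  _mm256_stream_ps(dst_ps, ps);    // vmovntps [dst_ps], ymm
  _mm256_stream_si256(dst_dq, dq); // vmovntdq [dst_dq], ymm
}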

@ -43,8 +43,27 @@ void OpDispatchBuilder::MOVVectorUnalignedOp(OpcodeArgs) {
}
void OpDispatchBuilder::MOVVectorNTOp(OpcodeArgs) {
const auto Size = GetDstSize(Op);
if (Op->Dest.IsGPR()) {
///< MOVNTDQA load non-temporal comes from SSE4.1 and is extended by AVX/AVX2.
Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags, {.Align = 1, .AccessType = MemoryAccessType::STREAM});
StoreResult(FPRClass, Op, Src, 1, MemoryAccessType::STREAM);
} else {
LOGMAN_THROW_A_FMT(!Op->Dest.IsGPR(), "Destination can't be GPR for non-temporal stores");
Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags, {.Align = 1, .AccessType = MemoryAccessType::STREAM});
if (Size < OpSize::i128Bit) {
// Normal streaming store if less than 128-bit
// XMM Scalar 32-bit and 64-bit comes from SSE4a MOVNTSS, MOVNTSD
// MMX 64-bit comes from MOVNTQ
StoreResult(FPRClass, Op, Src, 1, MemoryAccessType::STREAM);
} else {
Ref Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false});
// Single store non-temporal for larger operations.
_VStoreNonTemporal(Size, Src, Dest, 0);
}
}
}
void OpDispatchBuilder::VMOVAPS_VMOVAPDOp(OpcodeArgs) {

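For the SSE-level dispatcher above, a guest-side sketch of the two directions it distinguishes: MOVNTDQA is a non-temporal load into a register, while the 128-bit non-temporal stores now flow into _VStoreNonTemporal. Illustrative code only; the function names are made up.

#include <immintrin.h>

// Hypothetical guest snippets for the store and load directions.
void StreamStore128(__m128i* dst, __m128i value) {
  _mm_stream_si128(dst, value); // movntdq [dst], xmm -- store path
}

__m128i StreamLoad128(__m128i* src) {
  return _mm_stream_load_si128(src); // movntdqa xmm, [src] -- load path
}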

@ -653,6 +653,30 @@
],
"HasSideEffects": true,
"DestSize": "8"
},
"VStoreNonTemporal u8:#RegisterSize, FPR:$Value, GPR:$Addr, i8:$Offset": {
"Desc": ["Does a non-temporal memory store of a vector.",
"Matches arm64 SVE stnt1b semantics.",
"Specifically weak-memory model ordered to match x86 non-temporal stores."
],
"HasSideEffects": true,
"DestSize": "RegisterSize",
"EmitValidation": [
"_Offset % RegisterSize == 0",
"RegisterSize == FEXCore::IR::OpSize::i128Bit || RegisterSize == FEXCore::IR::OpSize::i256Bit"
]
},
"VStoreNonTemporalPair u8:#RegisterSize, FPR:$ValueLow, FPR:$ValueHigh, GPR:$Addr, i8:$Offset": {
"Desc": ["Does a non-temporal memory store of two vector registers.",
"Matches arm64 stnp semantics.",
"Specifically weak-memory model ordered to match x86 non-temporal stores."
],
"HasSideEffects": true,
"DestSize": "RegisterSize",
"EmitValidation": [
"_Offset % RegisterSize == 0",
"RegisterSize == FEXCore::IR::OpSize::i128Bit"
]
}
},
"Atomic": {

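The Desc entries above note that these ops only need the weak ordering x86 itself gives non-temporal stores. On the guest side that means publication still requires an explicit SFENCE; an illustrative producer pattern (not part of this change, names made up):

#include <atomic>
#include <immintrin.h>

// Hypothetical guest-side producer: stream out a payload, fence, then publish a flag.
void PublishPayload(__m128i* payload, __m128i data, std::atomic<int>& ready) {
  _mm_stream_si128(payload, data); // weakly ordered non-temporal store
  _mm_sfence();                    // orders the streaming store before the flag store
  ready.store(1, std::memory_order_release);
}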

@ -4539,6 +4539,24 @@ TEST_CASE_METHOD(TestDisassembler, "Emitter: SVE: SVE store multiple structures
"[x29, x30, lsl #3]");
}
TEST_CASE_METHOD(TestDisassembler, "Emitter: SVE: SVE contiguous non-temporal store (scalar plus immediate)") {
TEST_SINGLE(stnt1b(ZReg::z31, PReg::p6, Reg::r29, 0), "stnt1b {z31.b}, p6, [x29]");
TEST_SINGLE(stnt1b(ZReg::z31, PReg::p6, Reg::r29, -8), "stnt1b {z31.b}, p6, [x29, #-8, mul vl]");
TEST_SINGLE(stnt1b(ZReg::z31, PReg::p6, Reg::r29, 7), "stnt1b {z31.b}, p6, [x29, #7, mul vl]");
TEST_SINGLE(stnt1h(ZReg::z31, PReg::p6, Reg::r29, 0), "stnt1h {z31.h}, p6, [x29]");
TEST_SINGLE(stnt1h(ZReg::z31, PReg::p6, Reg::r29, -8), "stnt1h {z31.h}, p6, [x29, #-8, mul vl]");
TEST_SINGLE(stnt1h(ZReg::z31, PReg::p6, Reg::r29, 7), "stnt1h {z31.h}, p6, [x29, #7, mul vl]");
TEST_SINGLE(stnt1w(ZReg::z31, PReg::p6, Reg::r29, 0), "stnt1w {z31.s}, p6, [x29]");
TEST_SINGLE(stnt1w(ZReg::z31, PReg::p6, Reg::r29, -8), "stnt1w {z31.s}, p6, [x29, #-8, mul vl]");
TEST_SINGLE(stnt1w(ZReg::z31, PReg::p6, Reg::r29, 7), "stnt1w {z31.s}, p6, [x29, #7, mul vl]");
TEST_SINGLE(stnt1d(ZReg::z31, PReg::p6, Reg::r29, 0), "stnt1d {z31.d}, p6, [x29]");
TEST_SINGLE(stnt1d(ZReg::z31, PReg::p6, Reg::r29, -8), "stnt1d {z31.d}, p6, [x29, #-8, mul vl]");
TEST_SINGLE(stnt1d(ZReg::z31, PReg::p6, Reg::r29, 7), "stnt1d {z31.d}, p6, [x29, #7, mul vl]");
}
TEST_CASE_METHOD(TestDisassembler, "Emitter: SVE: SVE store multiple structures (scalar plus immediate)") {
TEST_SINGLE(st2b(ZReg::z31, ZReg::z0, PReg::p6, Reg::r29, 0), "st2b {z31.b, z0.b}, p6, [x29]");
TEST_SINGLE(st2b(ZReg::z26, ZReg::z27, PReg::p6, Reg::r29, 0), "st2b {z26.b, z27.b}, p6, [x29]");


@ -2853,14 +2853,13 @@
]
},
"vmovntps [rax], ymm0": {
"ExpectedInstructionCount": 3,
"ExpectedInstructionCount": 2,
"Comment": [
"Map 1 0b00 0x2B 256-bit"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #16]",
"str q16, [x4]",
"str q2, [x4, #16]"
"stnp q16, q2, [x4]"
]
},
"vmovntpd [rax], xmm0": {
@ -2873,14 +2872,13 @@
]
},
"vmovntpd [rax], ymm0": {
"ExpectedInstructionCount": 3,
"ExpectedInstructionCount": 2,
"Comment": [
"Map 1 0b01 0x2B 256-bit"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #16]",
"str q16, [x4]",
"str q2, [x4, #16]"
"stnp q16, q2, [x4]"
]
},
"vcvttss2si eax, xmm0": {
@ -4929,14 +4927,13 @@
]
},
"vmovntdq [rax], ymm0": {
"ExpectedInstructionCount": 3,
"ExpectedInstructionCount": 2,
"Comment": [
"Map 1 0b01 0xe7 256-bit"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #16]",
"str q16, [x4]",
"str q2, [x4, #16]"
"stnp q16, q2, [x4]"
]
},
"vpsubsb xmm0, xmm1, xmm2": {


@ -0,0 +1,73 @@
{
"Features": {
"Bitness": 64,
"EnabledHostFeatures": [
"SVE128"
],
"DisabledHostFeatures": [
"AFP",
"FLAGM",
"FLAGM2",
"SVE256"
]
},
"Instructions": {
"vmovntps [rax], xmm0": {
"ExpectedInstructionCount": 1,
"Comment": [
"Map 1 0b00 0x2B 128-bit"
],
"ExpectedArm64ASM": [
"stnt1b {z16.b}, p6, [x4]"
]
},
"vmovntps [rax], ymm0": {
"ExpectedInstructionCount": 2,
"Comment": [
"Map 1 0b00 0x2B 256-bit"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #16]",
"stnp q16, q2, [x4]"
]
},
"vmovntpd [rax], xmm0": {
"ExpectedInstructionCount": 1,
"Comment": [
"Map 1 0b01 0x2B 128-bit"
],
"ExpectedArm64ASM": [
"stnt1b {z16.b}, p6, [x4]"
]
},
"vmovntpd [rax], ymm0": {
"ExpectedInstructionCount": 2,
"Comment": [
"Map 1 0b01 0x2B 256-bit"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #16]",
"stnp q16, q2, [x4]"
]
},
"vmovntdq [rax], xmm0": {
"ExpectedInstructionCount": 1,
"Comment": [
"Map 1 0b01 0xe7 128-bit"
],
"ExpectedArm64ASM": [
"stnt1b {z16.b}, p6, [x4]"
]
},
"vmovntdq [rax], ymm0": {
"ExpectedInstructionCount": 2,
"Comment": [
"Map 1 0b01 0xe7 256-bit"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #16]",
"stnp q16, q2, [x4]"
]
}
}
}


@ -3246,7 +3246,7 @@
"Map 1 0b00 0x2B 128-bit"
],
"ExpectedArm64ASM": [
"str q16, [x4]"
"stnt1b {z16.b}, p6, [x4]"
]
},
"vmovntps [rax], ymm0": {
@ -3255,7 +3255,7 @@
"Map 1 0b00 0x2B 256-bit"
],
"ExpectedArm64ASM": [
"st1b {z16.b}, p7, [x4]"
"stnt1b {z16.b}, p7, [x4]"
]
},
"vmovntpd [rax], xmm0": {
@ -3264,7 +3264,7 @@
"Map 1 0b01 0x2B 128-bit"
],
"ExpectedArm64ASM": [
"str q16, [x4]"
"stnt1b {z16.b}, p6, [x4]"
]
},
"vmovntpd [rax], ymm0": {
@ -3273,7 +3273,7 @@
"Map 1 0b01 0x2B 256-bit"
],
"ExpectedArm64ASM": [
"st1b {z16.b}, p7, [x4]"
"stnt1b {z16.b}, p7, [x4]"
]
},
"vcvttss2si eax, xmm0": {
@ -5004,7 +5004,7 @@
"Map 1 0b01 0xe7 128-bit"
],
"ExpectedArm64ASM": [
"str q16, [x4]"
"stnt1b {z16.b}, p6, [x4]"
]
},
"vmovntdq [rax], ymm0": {
@ -5013,7 +5013,7 @@
"Map 1 0b01 0xe7 256-bit"
],
"ExpectedArm64ASM": [
"st1b {z16.b}, p7, [x4]"
"stnt1b {z16.b}, p7, [x4]"
]
},
"vpsubsb xmm0, xmm1, xmm2": {