Arm64: Implement support for emulated masked vector loadstores
In order to support `vmaskmov{ps,pd}` on hosts without SVE128, this is required. It's pretty gnarly, but these instructions aren't used often, so that's fine from a compatibility perspective.

Example SVE128 implementation:

```json
"vmaskmovps ymm0, ymm1, [rax]": {
  "ExpectedInstructionCount": 9,
  "Comment": [
    "Map 2 0b01 0x2c 256-bit"
  ],
  "ExpectedArm64ASM": [
    "ldr q2, [x28, #32]",
    "mrs x20, nzcv",
    "cmplt p0.s, p6/z, z17.s, #0",
    "ld1w {z16.s}, p0/z, [x4]",
    "add x21, x4, #0x10 (16)",
    "cmplt p0.s, p6/z, z2.s, #0",
    "ld1w {z2.s}, p0/z, [x21]",
    "str q2, [x28, #16]",
    "msr nzcv, x20"
  ]
},
```

Example ASIMD implementation:

```json
"vmaskmovps ymm0, ymm1, [rax]": {
  "ExpectedInstructionCount": 37,
  "Comment": [
    "Map 2 0b01 0x2c 256-bit"
  ],
  "ExpectedArm64ASM": [
    "ldr q2, [x28, #32]",
    "mrs x20, nzcv",
    "movi v0.2d, #0x0",
    "mov x1, x4",
    "mov x0, v17.d[0]",
    "tbz x0, #63, #+0x8",
    "ld1 {v0.s}[0], [x1]",
    "add x1, x1, #0x4 (4)",
    "tbz w0, #31, #+0x8",
    "ld1 {v0.s}[1], [x1]",
    "add x1, x1, #0x4 (4)",
    "mov x0, v17.d[1]",
    "tbz x0, #63, #+0x8",
    "ld1 {v0.s}[2], [x1]",
    "add x1, x1, #0x4 (4)",
    "tbz w0, #31, #+0x8",
    "ld1 {v0.s}[3], [x1]",
    "mov v16.16b, v0.16b",
    "add x21, x4, #0x10 (16)",
    "movi v0.2d, #0x0",
    "mov x1, x21",
    "mov x0, v2.d[0]",
    "tbz x0, #63, #+0x8",
    "ld1 {v0.s}[0], [x1]",
    "add x1, x1, #0x4 (4)",
    "tbz w0, #31, #+0x8",
    "ld1 {v0.s}[1], [x1]",
    "add x1, x1, #0x4 (4)",
    "mov x0, v2.d[1]",
    "tbz x0, #63, #+0x8",
    "ld1 {v0.s}[2], [x1]",
    "add x1, x1, #0x4 (4)",
    "tbz w0, #31, #+0x8",
    "ld1 {v0.s}[3], [x1]",
    "mov v2.16b, v0.16b",
    "str q2, [x28, #16]",
    "msr nzcv, x20"
  ]
},
```

There's a small potential improvement here: nzcv doesn't actually need to be touched in the ASIMD implementation, but I'll leave that for a future change.
Parent: 3c293b9aed
Commit: e95c8d703c
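For context, the load form of `vmaskmov{ps,pd}` that the fallback emulates behaves roughly like the following scalar model (a minimal sketch for illustration, not FEX code; the helper name is made up):

```cpp
#include <cstddef>
#include <cstdint>

// Each element is loaded only when the sign bit of the corresponding mask
// element is set; masked-out elements are zeroed. Masked-out elements are
// never dereferenced, so they cannot fault -- which is why the non-SVE
// fallback below has to test each mask bit and branch around the load,
// rather than loading the whole vector and blending afterwards.
void MaskedVectorLoad(float* Dst, const int32_t* Mask, const float* Src, size_t NumElements) {
  for (size_t i = 0; i < NumElements; ++i) {
    Dst[i] = (Mask[i] < 0) ? Src[i] : 0.0f;
  }
}
```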
```diff
@@ -752,7 +752,6 @@ DEF_OP(LoadMemTSO) {
 }
 
 DEF_OP(VLoadVectorMasked) {
-  LOGMAN_THROW_A_FMT(HostSupportsSVE128 || HostSupportsSVE256, "Need SVE support in order to use VLoadVectorMasked");
 
   const auto Op = IROp->C<IR::IROp_VLoadVectorMasked>();
   const auto OpSize = IROp->Size;
@@ -769,35 +768,83 @@ DEF_OP(VLoadVectorMasked) {
   const auto Dst = GetVReg(Node);
   const auto MaskReg = GetVReg(Op->Mask.ID());
   const auto MemReg = GetReg(Op->Addr.ID());
-  const auto MemSrc = GenerateSVEMemOperand(OpSize, MemReg, Op->Offset, Op->OffsetType, Op->OffsetScale);
-
-  // Check if the sign bit is set for the given element size.
-  cmplt(SubRegSize, CMPPredicate, GoverningPredicate.Zeroing(), MaskReg.Z(), 0);
-
-  switch (IROp->ElementSize) {
-  case 1: {
-    ld1b<ARMEmitter::SubRegSize::i8Bit>(Dst.Z(), CMPPredicate.Zeroing(), MemSrc);
-    break;
-  }
-  case 2: {
-    ld1h<ARMEmitter::SubRegSize::i16Bit>(Dst.Z(), CMPPredicate.Zeroing(), MemSrc);
-    break;
-  }
-  case 4: {
-    ld1w<ARMEmitter::SubRegSize::i32Bit>(Dst.Z(), CMPPredicate.Zeroing(), MemSrc);
-    break;
-  }
-  case 8: {
-    ld1d(Dst.Z(), CMPPredicate.Zeroing(), MemSrc);
-    break;
-  }
-  default: break;
-  }
+  if (HostSupportsSVE128 || HostSupportsSVE256) {
+    const auto MemSrc = GenerateSVEMemOperand(OpSize, MemReg, Op->Offset, Op->OffsetType, Op->OffsetScale);
+
+    // Check if the sign bit is set for the given element size.
+    cmplt(SubRegSize, CMPPredicate, GoverningPredicate.Zeroing(), MaskReg.Z(), 0);
+
+    switch (IROp->ElementSize) {
+    case 1: {
+      ld1b<ARMEmitter::SubRegSize::i8Bit>(Dst.Z(), CMPPredicate.Zeroing(), MemSrc);
+      break;
+    }
+    case 2: {
+      ld1h<ARMEmitter::SubRegSize::i16Bit>(Dst.Z(), CMPPredicate.Zeroing(), MemSrc);
+      break;
+    }
+    case 4: {
+      ld1w<ARMEmitter::SubRegSize::i32Bit>(Dst.Z(), CMPPredicate.Zeroing(), MemSrc);
+      break;
+    }
+    case 8: {
+      ld1d(Dst.Z(), CMPPredicate.Zeroing(), MemSrc);
+      break;
+    }
+    default: break;
+    }
+  } else {
+    // Prepare yourself, adventurer: a masked load without instructions that implement it.
+    LOGMAN_THROW_A_FMT(OpSize == Core::CPUState::XMM_SSE_REG_SIZE, "Only supports 128-bit without SVE256");
+    size_t NumElements = IROp->Size / IROp->ElementSize;
+
+    // Use VTMP1 as the temporary destination.
+    auto TempDst = VTMP1;
+    auto WorkingReg = TMP1;
+    auto TempMemReg = MemReg;
+    movi(ARMEmitter::SubRegSize::i64Bit, TempDst.Q(), 0);
+    LOGMAN_THROW_A_FMT(Op->Offset.IsInvalid(), "Complex addressing requested and not supported!");
+
+    uint64_t MaskIndex {};
+    const uint64_t ElementSizeInBits = IROp->ElementSize * 8;
+    for (size_t i = 0; i < NumElements; ++i) {
+      if (((i * IROp->ElementSize) % 8) == 0) {
+        // Extract the next 64-bit chunk of the mask.
+        umov<ARMEmitter::SubRegSize::i64Bit>(WorkingReg, MaskReg, MaskIndex);
+        ++MaskIndex;
+      }
+
+      // If the sign bit is zero then skip the load.
+      ARMEmitter::SingleUseForwardLabel Skip {};
+      const size_t ElementOffset = (64 - (i * ElementSizeInBits) % 64) - 1;
+      tbz(WorkingReg, ElementOffset, &Skip);
+      // Do the load for this element into the temporary destination.
+      switch (IROp->ElementSize) {
+      case 1: ld1<ARMEmitter::SubRegSize::i8Bit>(TempDst.Q(), i, TempMemReg); break;
+      case 2: ld1<ARMEmitter::SubRegSize::i16Bit>(TempDst.Q(), i, TempMemReg); break;
+      case 4: ld1<ARMEmitter::SubRegSize::i32Bit>(TempDst.Q(), i, TempMemReg); break;
+      case 8: ld1<ARMEmitter::SubRegSize::i64Bit>(TempDst.Q(), i, TempMemReg); break;
+      case 16: ldr(TempDst.Q(), TempMemReg, 0); break;
+      default: LOGMAN_MSG_A_FMT("Unhandled {} size: {}", __func__, IROp->ElementSize); return;
+      }
+
+      Bind(&Skip);
+
+      if ((i + 1) != NumElements) {
+        // Handle register rename to save a move.
+        auto WorkingReg = TempMemReg;
+        TempMemReg = TMP2;
+        add(ARMEmitter::Size::i64Bit, TempMemReg, WorkingReg, IROp->ElementSize);
+      }
+    }
+
+    // Move the result into the destination register.
+    mov(Dst.Q(), TempDst.Q());
+  }
 }
 
 DEF_OP(VStoreVectorMasked) {
-  LOGMAN_THROW_A_FMT(HostSupportsSVE128 || HostSupportsSVE256, "Need SVE support in order to use VStoreVectorMasked");
 
   const auto Op = IROp->C<IR::IROp_VStoreVectorMasked>();
   const auto OpSize = IROp->Size;
```
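A note on the mask indexing in the non-SVE fallback above: `umov` pulls one 64-bit chunk of the mask register into `WorkingReg` each time the loop crosses an 8-byte boundary, and `ElementOffset` selects which bit of that chunk to test, stepping down from bit 63 in element-sized strides and wrapping every 64 bits. A quick standalone check of the formula for 32-bit elements (an illustrative snippet, not FEX code):

```cpp
#include <cstdio>

int main() {
  // vmaskmovps operates on 32-bit elements; four fit in each 128-bit half.
  const unsigned ElementSizeInBits = 32;
  for (unsigned i = 0; i < 4; ++i) {
    const unsigned ElementOffset = (64 - (i * ElementSizeInBits) % 64) - 1;
    std::printf("element %u -> tbz bit %u\n", i, ElementOffset);
  }
  // Prints bits 63, 31, 63, 31 -- matching the `tbz x0, #63` / `tbz w0, #31`
  // immediates in the ASIMD listing from the commit message.
  return 0;
}
```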
```diff
@@ -813,29 +860,73 @@ DEF_OP(VStoreVectorMasked) {
   const auto RegData = GetVReg(Op->Data.ID());
   const auto MaskReg = GetVReg(Op->Mask.ID());
   const auto MemReg = GetReg(Op->Addr.ID());
-  const auto MemDst = GenerateSVEMemOperand(OpSize, MemReg, Op->Offset, Op->OffsetType, Op->OffsetScale);
-
-  // Check if the sign bit is set for the given element size.
-  cmplt(SubRegSize, CMPPredicate, GoverningPredicate.Zeroing(), MaskReg.Z(), 0);
-
-  switch (IROp->ElementSize) {
-  case 1: {
-    st1b<ARMEmitter::SubRegSize::i8Bit>(RegData.Z(), CMPPredicate.Zeroing(), MemDst);
-    break;
-  }
-  case 2: {
-    st1h<ARMEmitter::SubRegSize::i16Bit>(RegData.Z(), CMPPredicate.Zeroing(), MemDst);
-    break;
-  }
-  case 4: {
-    st1w<ARMEmitter::SubRegSize::i32Bit>(RegData.Z(), CMPPredicate.Zeroing(), MemDst);
-    break;
-  }
-  case 8: {
-    st1d(RegData.Z(), CMPPredicate.Zeroing(), MemDst);
-    break;
-  }
-  default: break;
-  }
+  if (HostSupportsSVE128 || HostSupportsSVE256) {
+    const auto MemDst = GenerateSVEMemOperand(OpSize, MemReg, Op->Offset, Op->OffsetType, Op->OffsetScale);
+
+    // Check if the sign bit is set for the given element size.
+    cmplt(SubRegSize, CMPPredicate, GoverningPredicate.Zeroing(), MaskReg.Z(), 0);
+
+    switch (IROp->ElementSize) {
+    case 1: {
+      st1b<ARMEmitter::SubRegSize::i8Bit>(RegData.Z(), CMPPredicate.Zeroing(), MemDst);
+      break;
+    }
+    case 2: {
+      st1h<ARMEmitter::SubRegSize::i16Bit>(RegData.Z(), CMPPredicate.Zeroing(), MemDst);
+      break;
+    }
+    case 4: {
+      st1w<ARMEmitter::SubRegSize::i32Bit>(RegData.Z(), CMPPredicate.Zeroing(), MemDst);
+      break;
+    }
+    case 8: {
+      st1d(RegData.Z(), CMPPredicate.Zeroing(), MemDst);
+      break;
+    }
+    default: break;
+    }
+  } else {
+    // Prepare yourself, adventurer: a masked store without instructions that implement it.
+    LOGMAN_THROW_A_FMT(OpSize == Core::CPUState::XMM_SSE_REG_SIZE, "Only supports 128-bit without SVE256");
+    size_t NumElements = IROp->Size / IROp->ElementSize;
+
+    auto WorkingReg = TMP1;
+    auto TempMemReg = MemReg;
+    LOGMAN_THROW_A_FMT(Op->Offset.IsInvalid(), "Complex addressing requested and not supported!");
+
+    uint64_t MaskIndex {};
+    const uint64_t ElementSizeInBits = IROp->ElementSize * 8;
+    for (size_t i = 0; i < NumElements; ++i) {
+      if (((i * IROp->ElementSize) % 8) == 0) {
+        // Extract the next 64-bit chunk of the mask.
+        umov<ARMEmitter::SubRegSize::i64Bit>(WorkingReg, MaskReg, MaskIndex);
+        ++MaskIndex;
+      }
+
+      // If the sign bit is zero then skip the store.
+      ARMEmitter::SingleUseForwardLabel Skip {};
+      const size_t ElementOffset = (64 - (i * ElementSizeInBits) % 64) - 1;
+      tbz(WorkingReg, ElementOffset, &Skip);
+      // Do the store for this element from the data register.
+      switch (IROp->ElementSize) {
+      case 1: st1<ARMEmitter::SubRegSize::i8Bit>(RegData.Q(), i, TempMemReg); break;
+      case 2: st1<ARMEmitter::SubRegSize::i16Bit>(RegData.Q(), i, TempMemReg); break;
+      case 4: st1<ARMEmitter::SubRegSize::i32Bit>(RegData.Q(), i, TempMemReg); break;
+      case 8: st1<ARMEmitter::SubRegSize::i64Bit>(RegData.Q(), i, TempMemReg); break;
+      case 16: str(RegData.Q(), TempMemReg, 0); break;
+      default: LOGMAN_MSG_A_FMT("Unhandled {} size: {}", __func__, IROp->ElementSize); return;
+      }
+
+      Bind(&Skip);
+
+      if ((i + 1) != NumElements) {
+        // Handle register rename to save a move.
+        auto WorkingReg = TempMemReg;
+        TempMemReg = TMP2;
+        add(ARMEmitter::Size::i64Bit, TempMemReg, WorkingReg, IROp->ElementSize);
+      }
+    }
+  }
 }
```
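One last observation on the listings in the commit message: the `mrs x20, nzcv` / `msr nzcv, x20` pair around both sequences is presumably there because guest flags live in NZCV and SVE predicated compares such as `cmplt` clobber those flags as a side effect. The ASIMD fallback only emits `tbz`, `ld1`/`st1`, `add`, and `mov`, none of which touch NZCV, so the save/restore is unnecessary on that path; that is the future improvement the commit message alludes to.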