Merge pull request #3757 from Sonicadvance1/avx_16

AVX128: Implement support for gathers
Ryan Houdek 2024-06-26 13:29:58 -07:00 committed by GitHub
commit 6226c7f4f3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 2217 additions and 135 deletions

View File

@@ -940,6 +940,187 @@ DEF_OP(VStoreVectorMasked) {
}
}
DEF_OP(VLoadVectorGatherMasked) {
const auto Op = IROp->C<IR::IROp_VLoadVectorGatherMasked>();
const auto OpSize = IROp->Size;
const auto VectorIndexSize = Op->VectorIndexElementSize;
const auto OffsetScale = Op->OffsetScale;
const auto DataElementOffsetStart = Op->DataElementOffsetStart;
const auto IndexElementOffsetStart = Op->IndexElementOffsetStart;
///< This IR operation handles discontiguous masked gather load instructions. Some things to note about its behaviour:
/// - VSIB behaviour is almost entirely exposed in the IR operation directly.
/// - Displacement is the only value missing, as that can be added directly to AddrBase.
/// - VectorIndex{Low,High} contain the index offsets for each element getting loaded.
/// - These index element sizes are decoupled from the resulting element size. They can be 32-bit or 64-bit.
/// - When the index element size is 32-bit, the value is sign-extended for the full 64-bit address calculation.
/// - When loading a 128-bit result with 64-bit VectorIndex elements, both VectorIndexLow and VectorIndexHigh are required
/// to get enough pointers.
/// - When VectorIndexElementSize and OffsetScale match Arm64 SVE behaviour, the operation becomes more optimal.
/// - When the behaviour doesn't match, it gets decomposed to an ASIMD-style masked load.
/// - AddrBase also doesn't need to exist.
/// - If the instruction uses 64-bit vector indexing, or 32-bit addresses where the top bit isn't set, then this is valid!
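///
/// As a rough sketch (illustrative pseudo-C, not the emitted code), the per-element behaviour is:
///   for (size_t i = 0; i < NumElements; ++i) {
///     if (SignBitSet(Mask[i])) {
///       // Displacement has already been folded into AddrBase by the dispatcher.
///       Result[i] = Load<ElementSize>(AddrBase + SignExtend64(VectorIndex[i]) * OffsetScale);
///     } else {
///       Result[i] = Incoming[i];
///     }
///   }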
const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE;
if (Is256Bit) {
LOGMAN_THROW_A_FMT(HostSupportsSVE256, "Need SVE256 support in order to use VLoadVectorGatherMasked with 256-bit operation");
}
const auto Dst = GetVReg(Node);
const auto IncomingDst = GetVReg(Op->Incoming.ID());
const auto MaskReg = GetVReg(Op->Mask.ID());
std::optional<ARMEmitter::Register> BaseAddr = !Op->AddrBase.IsInvalid() ? std::make_optional(GetReg(Op->AddrBase.ID())) : std::nullopt;
const auto VectorIndexLow = GetVReg(Op->VectorIndexLow.ID());
std::optional<ARMEmitter::VRegister> VectorIndexHigh =
!Op->VectorIndexHigh.IsInvalid() ? std::make_optional(GetVReg(Op->VectorIndexHigh.ID())) : std::nullopt;
///< If the host supports SVE and the offset scale matches SVE limitations then it can do an SVE style load.
const bool SupportsSVELoad = (HostSupportsSVE128 || HostSupportsSVE256) && (OffsetScale == 1 || OffsetScale == VectorIndexSize) &&
(VectorIndexSize == IROp->ElementSize);
const auto PerformSMove = [this](size_t ElementSize, const ARMEmitter::Register Dst, const ARMEmitter::VRegister Vector, int index) {
switch (ElementSize) {
case 1: smov<ARMEmitter::SubRegSize::i8Bit>(Dst.X(), Vector, index); break;
case 2: smov<ARMEmitter::SubRegSize::i16Bit>(Dst.X(), Vector, index); break;
case 4: smov<ARMEmitter::SubRegSize::i32Bit>(Dst.X(), Vector, index); break;
case 8: umov<ARMEmitter::SubRegSize::i64Bit>(Dst.X(), Vector, index); break;
default: LOGMAN_MSG_A_FMT("Unhandled ExtractElementSize: {}", ElementSize); break;
}
};
const auto PerformMove = [this](size_t ElementSize, const ARMEmitter::Register Dst, const ARMEmitter::VRegister Vector, int index) {
switch (ElementSize) {
case 1: umov<ARMEmitter::SubRegSize::i8Bit>(Dst, Vector, index); break;
case 2: umov<ARMEmitter::SubRegSize::i16Bit>(Dst, Vector, index); break;
case 4: umov<ARMEmitter::SubRegSize::i32Bit>(Dst, Vector, index); break;
case 8: umov<ARMEmitter::SubRegSize::i64Bit>(Dst, Vector, index); break;
default: LOGMAN_MSG_A_FMT("Unhandled ExtractElementSize: {}", ElementSize); break;
}
};
if (SupportsSVELoad) {
ARMEmitter::SVEModType ModType = ARMEmitter::SVEModType::MOD_NONE;
uint8_t SVEScale = FEXCore::ilog2(OffsetScale);
if (VectorIndexSize == 4) {
ModType = ARMEmitter::SVEModType::MOD_SXTW;
} else if (VectorIndexSize == 8 && OffsetScale != 1) {
ModType = ARMEmitter::SVEModType::MOD_LSL;
}
ARMEmitter::Register AddrReg = TMP1;
if (BaseAddr.has_value()) {
AddrReg = GetReg(Op->AddrBase.ID());
} else {
///< OpcodeDispatcher didn't provide a Base address while SVE requires one.
LoadConstant(ARMEmitter::Size::i64Bit, AddrReg, 0);
}
const auto MemDst = ARMEmitter::SVEMemOperand(AddrReg.X(), VectorIndexLow.Z(), ModType, SVEScale);
const auto SubRegSize = ConvertSubRegSize8(IROp);
const auto CMPPredicate = ARMEmitter::PReg::p0;
const auto GoverningPredicate = Is256Bit ? PRED_TMP_32B : PRED_TMP_16B;
// Check if the sign bit is set for the given element size.
cmplt(SubRegSize, CMPPredicate, GoverningPredicate.Zeroing(), MaskReg.Z(), 0);
auto TempDst = VTMP1;
switch (IROp->ElementSize) {
case 1: {
ld1b<ARMEmitter::SubRegSize::i8Bit>(TempDst.Z(), CMPPredicate.Zeroing(), MemDst);
break;
}
case 2: {
ld1h<ARMEmitter::SubRegSize::i16Bit>(TempDst.Z(), CMPPredicate.Zeroing(), MemDst);
break;
}
case 4: {
ld1w<ARMEmitter::SubRegSize::i32Bit>(TempDst.Z(), CMPPredicate.Zeroing(), MemDst);
break;
}
case 8: {
ld1d(TempDst.Z(), CMPPredicate.Zeroing(), MemDst);
break;
}
default: break;
}
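// As an illustration of what the switch above emits: assuming a 32-bit gather with 32-bit indices and
// OffsetScale == 4, the ld1w case corresponds to an SVE gather of the form
//   ld1w {z<tmp>.s}, p0/z, [x<base>, z<index>.s, sxtw #2]
// where the predicate p0 holds the mask comparison result from above.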
///< Merge elements based on predicate.
sel(SubRegSize, Dst.Z(), CMPPredicate, TempDst.Z(), IncomingDst.Z());
} else {
LOGMAN_THROW_A_FMT(!Is256Bit, "Can't emulate this gather load in the backend! Programming error!");
///< Adventurers beware, emulated ASIMD style gather masked load operation.
// Number of elements to load is calculated by the number of index elements available.
size_t NumAddrElements = (VectorIndexHigh.has_value() ? 32 : 16) / VectorIndexSize;
// The number of elements is clamped by the resulting register size.
size_t NumDataElements = std::min<size_t>(IROp->Size / IROp->ElementSize, NumAddrElements);
size_t IndexElementsSizeBytes = NumAddrElements * VectorIndexSize;
if (IndexElementsSizeBytes > 16) {
// We must have a high register in this case.
LOGMAN_THROW_A_FMT(VectorIndexHigh.has_value(), "Need High vector index register!");
}
// Use VTMP1 as the temporary destination
auto TempReg = VTMP1;
auto WorkingReg = TMP1;
auto TempMemReg = TMP2;
const uint64_t ElementSizeInBits = IROp->ElementSize * 8;
mov(TempReg.Q(), IncomingDst.Q());
for (size_t i = DataElementOffsetStart, IndexElement = IndexElementOffsetStart; i < NumDataElements; ++i, ++IndexElement) {
ARMEmitter::SingleUseForwardLabel Skip {};
// Extract mask element
PerformMove(IROp->ElementSize, WorkingReg, MaskReg, i);
// Skip if the mask's sign bit isn't set
tbz(WorkingReg, ElementSizeInBits - 1, &Skip);
// Extract Index Element
if ((IndexElement * VectorIndexSize) >= 16) {
// Fetch from the high index register.
PerformSMove(VectorIndexSize, WorkingReg, *VectorIndexHigh, IndexElement - (16 / VectorIndexSize));
} else {
// Fetch from the low index register.
PerformSMove(VectorIndexSize, WorkingReg, VectorIndexLow, IndexElement);
}
// Calculate memory position for this gather load
if (BaseAddr.has_value()) {
if (VectorIndexSize == 4) {
add(ARMEmitter::Size::i64Bit, TempMemReg, *BaseAddr, WorkingReg, ARMEmitter::ExtendedType::SXTW, FEXCore::ilog2(OffsetScale));
} else {
add(ARMEmitter::Size::i64Bit, TempMemReg, *BaseAddr, WorkingReg, ARMEmitter::ShiftType::LSL, FEXCore::ilog2(OffsetScale));
}
} else {
///< In this case we have no base address; all addresses come from the vector register itself.
if (VectorIndexSize == 4) {
// Sign-extend and shift into the 64-bit register.
sbfiz(ARMEmitter::Size::i64Bit, TempMemReg, WorkingReg, FEXCore::ilog2(OffsetScale), 32);
} else {
lsl(ARMEmitter::Size::i64Bit, TempMemReg, WorkingReg, FEXCore::ilog2(OffsetScale));
}
}
// Now that the address is calculated, do the load.
switch (IROp->ElementSize) {
case 1: ld1<ARMEmitter::SubRegSize::i8Bit>(TempReg.Q(), i, TempMemReg); break;
case 2: ld1<ARMEmitter::SubRegSize::i16Bit>(TempReg.Q(), i, TempMemReg); break;
case 4: ld1<ARMEmitter::SubRegSize::i32Bit>(TempReg.Q(), i, TempMemReg); break;
case 8: ld1<ARMEmitter::SubRegSize::i64Bit>(TempReg.Q(), i, TempMemReg); break;
case 16: ldr(TempReg.Q(), TempMemReg, 0); break;
default: LOGMAN_MSG_A_FMT("Unhandled {} size: {}", __func__, IROp->ElementSize); FEX_UNREACHABLE;
}
Bind(&Skip);
}
// Move result.
mov(Dst.Q(), TempReg.Q());
}
}
DEF_OP(VLoadVectorElement) {
const auto Op = IROp->C<IR::IROp_VLoadVectorElement>();
const auto OpSize = IROp->Size;

View File

@@ -5294,6 +5294,11 @@ void OpDispatchBuilder::InstallHostSpecificOpcodeHandlers() {
{OPD(2, 0b01, 0x8C), 1, &OpDispatchBuilder::VPMASKMOVOp<false>},
{OPD(2, 0b01, 0x8E), 1, &OpDispatchBuilder::VPMASKMOVOp<true>},
{OPD(2, 0b01, 0x90), 1, &OpDispatchBuilder::VPGATHER<OpSize::i32Bit>},
{OPD(2, 0b01, 0x91), 1, &OpDispatchBuilder::VPGATHER<OpSize::i64Bit>},
{OPD(2, 0b01, 0x92), 1, &OpDispatchBuilder::VPGATHER<OpSize::i32Bit>},
{OPD(2, 0b01, 0x93), 1, &OpDispatchBuilder::VPGATHER<OpSize::i64Bit>},
{OPD(2, 0b01, 0x96), 1, &OpDispatchBuilder::VFMADDSUB<1, 3, 2>},
{OPD(2, 0b01, 0x97), 1, &OpDispatchBuilder::VFMSUBADD<1, 3, 2>},

View File

@@ -975,6 +975,17 @@ public:
template<uint8_t Src1Idx, uint8_t Src2Idx, uint8_t AddendIdx>
void VFMSUBADD(OpcodeArgs);
struct RefVSIB {
Ref Low, High;
Ref BaseAddr;
int32_t Displacement;
uint8_t Scale;
};
RefVSIB LoadVSIB(const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, uint32_t Flags);
template<size_t AddrElementSize>
void VPGATHER(OpcodeArgs);
template<size_t ElementSize, size_t DstElementSize, bool Signed>
void ExtendVectorElements(OpcodeArgs);
template<size_t ElementSize>
@@ -1019,6 +1030,7 @@ public:
RefPair AVX128_LoadSource_WithOpSize(const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, uint32_t Flags,
bool NeedsHigh, MemoryAccessType AccessType = MemoryAccessType::DEFAULT);
RefVSIB AVX128_LoadVSIB(const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, uint32_t Flags, bool NeedsHigh);
void AVX128_StoreResult_WithOpSize(FEXCore::X86Tables::DecodedOp Op, const FEXCore::X86Tables::DecodedOperand& Operand, const RefPair Src,
MemoryAccessType AccessType = MemoryAccessType::DEFAULT);
void InstallAVX128Handlers();
@@ -1247,6 +1259,12 @@ public:
template<uint8_t Src1Idx, uint8_t Src2Idx, uint8_t AddendIdx>
void AVX128_VFMSUBADD(OpcodeArgs);
template<size_t AddrElementSize>
RefPair AVX128_VPGatherImpl(OpSize Size, OpSize ElementLoadSize, RefPair Dest, RefPair Mask, RefVSIB VSIB);
template<size_t AddrElementSize>
void AVX128_VPGATHER(OpcodeArgs);
// End of AVX 128-bit implementation
void InvalidOp(OpcodeArgs);

View File

@@ -327,6 +327,11 @@ void OpDispatchBuilder::InstallAVX128Handlers() {
{OPD(2, 0b01, 0x8C), 1, &OpDispatchBuilder::AVX128_VPMASKMOV<false>},
{OPD(2, 0b01, 0x8E), 1, &OpDispatchBuilder::AVX128_VPMASKMOV<true>},
{OPD(2, 0b01, 0x90), 1, &OpDispatchBuilder::AVX128_VPGATHER<OpSize::i32Bit>},
{OPD(2, 0b01, 0x91), 1, &OpDispatchBuilder::AVX128_VPGATHER<OpSize::i64Bit>},
{OPD(2, 0b01, 0x92), 1, &OpDispatchBuilder::AVX128_VPGATHER<OpSize::i32Bit>},
{OPD(2, 0b01, 0x93), 1, &OpDispatchBuilder::AVX128_VPGATHER<OpSize::i64Bit>},
{OPD(2, 0b01, 0x96), 1, &OpDispatchBuilder::AVX128_VFMADDSUB<1, 3, 2>},
{OPD(2, 0b01, 0x97), 1, &OpDispatchBuilder::AVX128_VFMSUBADD<1, 3, 2>},
@@ -486,10 +491,9 @@ OpDispatchBuilder::RefPair OpDispatchBuilder::AVX128_LoadSource_WithOpSize(
AddressMode HighA = A;
HighA.Offset += 16;
///< TODO: Implement VSIB once we get there.
if (Operand.IsSIB()) {
const bool IsVSIB = (Op->Flags & X86Tables::DecodeFlags::FLAG_VSIB_BYTE) != 0;
LOGMAN_THROW_AA_FMT(!IsVSIB, "VSIB currently unsupported");
LOGMAN_THROW_AA_FMT(!IsVSIB, "VSIB uses LoadVSIB instead");
}
return {
@@ -499,6 +503,31 @@ OpDispatchBuilder::RefPair OpDispatchBuilder::AVX128_LoadSource_WithOpSize(
}
}
OpDispatchBuilder::RefVSIB
OpDispatchBuilder::AVX128_LoadVSIB(const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, uint32_t Flags, bool NeedsHigh) {
const bool IsVSIB = (Op->Flags & X86Tables::DecodeFlags::FLAG_VSIB_BYTE) != 0;
LOGMAN_THROW_A_FMT(Operand.IsSIB() && IsVSIB, "Trying to load VSIB for something that isn't the correct type!");
// VSIB is a very special case which has a ton of encoded data.
// Get it in a format we can reason about.
const auto Index_gpr = Operand.Data.SIB.Index;
const auto Base_gpr = Operand.Data.SIB.Base;
LOGMAN_THROW_AA_FMT(Index_gpr >= FEXCore::X86State::REG_XMM_0 && Index_gpr <= FEXCore::X86State::REG_XMM_15, "must be AVX reg");
LOGMAN_THROW_AA_FMT(
Base_gpr == FEXCore::X86State::REG_INVALID || (Base_gpr >= FEXCore::X86State::REG_RAX && Base_gpr <= FEXCore::X86State::REG_R15),
"Base must be a GPR.");
const auto Index_XMM_gpr = Index_gpr - X86State::REG_XMM_0;
return {
.Low = AVX128_LoadXMMRegister(Index_XMM_gpr, false),
.High = NeedsHigh ? AVX128_LoadXMMRegister(Index_XMM_gpr, true) : Invalid(),
.BaseAddr = Base_gpr != FEXCore::X86State::REG_INVALID ? LoadGPRRegister(Base_gpr, OpSize::i64Bit, 0, false) : nullptr,
.Displacement = Operand.Data.SIB.Offset,
.Scale = Operand.Data.SIB.Scale,
};
}
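// For reference (illustrative mapping): given an instruction like `vpgatherdd ymm0, [rax + ymm1*4 + 0x20], ymm2`,
// the function above returns Low/High = the two halves of ymm1, BaseAddr = rax, Displacement = 0x20 and Scale = 4.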
void OpDispatchBuilder::AVX128_StoreResult_WithOpSize(FEXCore::X86Tables::DecodedOp Op, const FEXCore::X86Tables::DecodedOperand& Operand,
const RefPair Src, MemoryAccessType AccessType) {
if (Operand.IsGPR()) {
@@ -2007,7 +2036,6 @@ void OpDispatchBuilder::AVX128_VMASKMOVImpl(OpcodeArgs, size_t ElementSize, size
return MakeSegmentAddress(Op, Data, CTX->GetGPRSize());
};
///< TODO: Needs SVE for masked loadstores.
if (IsStore) {
auto Address = MakeAddress(Op->Dest);
@@ -2487,4 +2515,96 @@ void OpDispatchBuilder::AVX128_VFMSUBADD(OpcodeArgs) {
AVX128_VFMAddSubImpl(Op, false, Src1Idx, Src2Idx, AddendIdx);
}
template<size_t AddrElementSize>
OpDispatchBuilder::RefPair OpDispatchBuilder::AVX128_VPGatherImpl(OpSize Size, OpSize ElementLoadSize, RefPair Dest, RefPair Mask, RefVSIB VSIB) {
LOGMAN_THROW_A_FMT(AddrElementSize == OpSize::i32Bit || AddrElementSize == OpSize::i64Bit, "Unknown address element size");
const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;
///< BaseAddr doesn't need to exist; calculate the final base address (with the displacement folded in) here.
Ref BaseAddr = VSIB.BaseAddr;
if (BaseAddr && VSIB.Displacement) {
BaseAddr = _Add(OpSize::i64Bit, BaseAddr, _Constant(VSIB.Displacement));
} else if (VSIB.Displacement) {
BaseAddr = _Constant(VSIB.Displacement);
} else if (!BaseAddr) {
BaseAddr = Invalid();
}
RefPair Result {};
///< Calculate the low-half.
Result.Low = _VLoadVectorGatherMasked(OpSize::i128Bit, ElementLoadSize, Dest.Low, Mask.Low, BaseAddr, VSIB.Low, VSIB.High,
AddrElementSize, VSIB.Scale, 0, 0);
if (Is128Bit) {
Result.High = LoadZeroVector(OpSize::i128Bit);
if (AddrElementSize == OpSize::i64Bit && ElementLoadSize == OpSize::i32Bit) {
// Special case for the 128-bit gather load using 64-bit address indexes with 32-bit results.
// Only loads two 32-bit elements into the lower 64 bits of the first destination.
// Bits [255:64] all become zero.
Result.Low = _VZip(OpSize::i128Bit, OpSize::i64Bit, Result.Low, Result.High);
}
} else {
RefPair AddrAddressing {};
Ref DestReg = Dest.High;
Ref MaskReg = Mask.High;
uint8_t IndexElementOffset {};
uint8_t DataElementOffset {};
if (AddrElementSize == ElementLoadSize) {
// If the address element size matches the load element size, then the low and high halves fetch index elements at the same rate.
AddrAddressing.Low = VSIB.High;
AddrAddressing.High = Invalid();
} else if (AddrElementSize == OpSize::i32Bit && ElementLoadSize == OpSize::i64Bit) {
// If the address element size is half the size of the element load size, then we need to start fetching half-way through the low register.
AddrAddressing.Low = VSIB.Low;
AddrAddressing.High = VSIB.High;
IndexElementOffset = OpSize::i128Bit / AddrElementSize / 2;
} else if (AddrElementSize == OpSize::i64Bit && ElementLoadSize == OpSize::i32Bit) {
AddrAddressing.Low = VSIB.High;
AddrAddressing.High = Invalid();
DestReg = Result.Low; ///< Start mixing with the low register.
MaskReg = Mask.Low; ///< Mask starts with the low mask here.
IndexElementOffset = 0;
DataElementOffset = OpSize::i128Bit / ElementLoadSize / 2;
}
///< Calculate the high-half.
auto ResultHigh = _VLoadVectorGatherMasked(OpSize::i128Bit, ElementLoadSize, DestReg, MaskReg, BaseAddr, AddrAddressing.Low,
AddrAddressing.High, AddrElementSize, VSIB.Scale, DataElementOffset, IndexElementOffset);
if (AddrElementSize == OpSize::i64Bit && ElementLoadSize == OpSize::i32Bit) {
// If we only fetched 128 bits worth of data then the upper result is all zero.
Result = AVX128_Zext(ResultHigh);
} else {
Result.High = ResultHigh;
}
}
return Result;
}
template<size_t AddrElementSize>
void OpDispatchBuilder::AVX128_VPGATHER(OpcodeArgs) {
const auto Size = GetDstSize(Op);
const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;
///< Element size is determined by W flag.
const OpSize ElementLoadSize = Op->Flags & X86Tables::DecodeFlags::FLAG_OPTION_AVX_W ? OpSize::i64Bit : OpSize::i32Bit;
auto Dest = AVX128_LoadSource_WithOpSize(Op, Op->Dest, Op->Flags, !Is128Bit);
auto VSIB = AVX128_LoadVSIB(Op, Op->Src[0], Op->Flags, !Is128Bit);
auto Mask = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, !Is128Bit);
RefPair Result {};
Result = AVX128_VPGatherImpl<AddrElementSize>(SizeToOpSize(Size), ElementLoadSize, Dest, Mask, VSIB);
AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result);
///< Assume non-faulting behaviour and clear the mask register.
RefPair ZeroPair {};
ZeroPair.Low = LoadZeroVector(OpSize::i128Bit);
ZeroPair.High = ZeroPair.Low;
AVX128_StoreResult_WithOpSize(Op, Op->Src[1], ZeroPair);
}
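// Worked example (illustrative): for `vpgatherdd ymm0, [rax + ymm1*4], ymm2`, each dword lane of ymm0 whose
// corresponding lane in ymm2 has its top bit set is replaced by the dword at rax + SignExtend(ymm1 lane) * 4;
// the other lanes keep their previous contents, and ymm2 is then cleared (per the non-faulting assumption above).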
} // namespace FEXCore::IR

View File

@@ -5106,4 +5106,109 @@ template void OpDispatchBuilder::VFMSUBADD<1, 3, 2>(OpcodeArgs);
template void OpDispatchBuilder::VFMSUBADD<2, 1, 3>(OpcodeArgs);
template void OpDispatchBuilder::VFMSUBADD<2, 3, 1>(OpcodeArgs);
OpDispatchBuilder::RefVSIB OpDispatchBuilder::LoadVSIB(const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, uint32_t Flags) {
const bool IsVSIB = (Op->Flags & X86Tables::DecodeFlags::FLAG_VSIB_BYTE) != 0;
LOGMAN_THROW_A_FMT(Operand.IsSIB() && IsVSIB, "Trying to load VSIB for something that isn't the correct type!");
// VSIB is a very special case which has a ton of encoded data.
// Get it in a format we can reason about.
const auto Index_gpr = Operand.Data.SIB.Index;
const auto Base_gpr = Operand.Data.SIB.Base;
LOGMAN_THROW_AA_FMT(Index_gpr >= FEXCore::X86State::REG_XMM_0 && Index_gpr <= FEXCore::X86State::REG_XMM_15, "must be AVX reg");
LOGMAN_THROW_AA_FMT(
Base_gpr == FEXCore::X86State::REG_INVALID || (Base_gpr >= FEXCore::X86State::REG_RAX && Base_gpr <= FEXCore::X86State::REG_R15),
"Base must be a GPR.");
const auto Index_XMM_gpr = Index_gpr - X86State::REG_XMM_0;
return {
.Low = LoadXMMRegister(Index_XMM_gpr),
.BaseAddr = Base_gpr != FEXCore::X86State::REG_INVALID ? LoadGPRRegister(Base_gpr, OpSize::i64Bit, 0, false) : nullptr,
.Displacement = Operand.Data.SIB.Offset,
.Scale = Operand.Data.SIB.Scale,
};
}
template<size_t AddrElementSize>
void OpDispatchBuilder::VPGATHER(OpcodeArgs) {
LOGMAN_THROW_A_FMT(AddrElementSize == OpSize::i32Bit || AddrElementSize == OpSize::i64Bit, "Unknown address element size");
const auto Size = GetDstSize(Op);
const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;
///< Element size is determined by W flag.
const OpSize ElementLoadSize = Op->Flags & X86Tables::DecodeFlags::FLAG_OPTION_AVX_W ? OpSize::i64Bit : OpSize::i32Bit;
auto VSIB = LoadVSIB(Op, Op->Src[0], Op->Flags);
const bool SupportsSVELoad = (VSIB.Scale == 1 || VSIB.Scale == AddrElementSize) && (AddrElementSize == ElementLoadSize);
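// The check above reflects that SVE gather loads only support an unscaled index or an index scaled by the
// element size, and that the index element size must match the data element size; anything else takes the
// fallback path below.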
Ref Dest = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
Ref Mask = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
Ref Result {};
if (!SupportsSVELoad) {
// We need to go down the fallback path when the backend's SVE gather path can't handle this configuration.
RefPair Dest128 {
.Low = Dest,
.High = _VDupElement(OpSize::i256Bit, OpSize::i128Bit, Dest, 1),
};
RefPair Mask128 {
.Low = Mask,
.High = _VDupElement(OpSize::i256Bit, OpSize::i128Bit, Mask, 1),
};
RefVSIB VSIB128 = VSIB;
if (Is128Bit) {
///< Be a bit careful when duplicating the VSIB index register.
VSIB128.High = VSIB128.Low;
} else {
VSIB128.High = _VDupElement(OpSize::i256Bit, OpSize::i128Bit, VSIB128.Low, 1);
}
auto Result128 = AVX128_VPGatherImpl<AddrElementSize>(SizeToOpSize(Size), ElementLoadSize, Dest128, Mask128, VSIB128);
// The registers are currently split; we need to merge them.
Result = _VInsElement(OpSize::i256Bit, OpSize::i128Bit, 1, 0, Result128.Low, Result128.High);
} else {
///< Calculate the full operation.
///< BaseAddr doesn't need to exist; calculate the final base address (with the displacement folded in) here.
Ref BaseAddr = VSIB.BaseAddr;
if (BaseAddr && VSIB.Displacement) {
BaseAddr = _Add(OpSize::i64Bit, BaseAddr, _Constant(VSIB.Displacement));
} else if (VSIB.Displacement) {
BaseAddr = _Constant(VSIB.Displacement);
} else if (!BaseAddr) {
BaseAddr = Invalid();
}
Result = _VLoadVectorGatherMasked(Size, ElementLoadSize, Dest, Mask, BaseAddr, VSIB.Low, Invalid(), AddrElementSize, VSIB.Scale, 0, 0);
}
if (Is128Bit) {
if (AddrElementSize == OpSize::i64Bit && ElementLoadSize == OpSize::i32Bit) {
// Special case for the 128-bit gather load using 64-bit address indexes with 32-bit results.
// Only loads two 32-bit elements into the lower 64 bits of the first destination.
// Bits [255:64] all become zero.
Result = _VMov(OpSize::i64Bit, Result);
} else {
Result = _VMov(OpSize::i128Bit, Result);
}
} else {
if (AddrElementSize == OpSize::i64Bit && ElementLoadSize == OpSize::i32Bit) {
// If we only fetched 128 bits worth of data then the upper result is all zero.
Result = _VMov(OpSize::i128Bit, Result);
}
}
StoreResult(FPRClass, Op, Result, -1);
///< Assume non-faulting behaviour and clear the mask register.
auto Zero = LoadZeroVector(Size);
StoreResult_WithOpSize(FPRClass, Op, Op->Src[1], Zero, Size, -1);
}
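// The two instantiations below cover the dword-indexed forms (VPGATHERDD/VPGATHERDQ, VGATHERDPS/VGATHERDPD)
// with AddrElementSize == 4 and the qword-indexed forms (VPGATHERQD/VPGATHERQQ, VGATHERQPS/VGATHERQPD) with
// AddrElementSize == 8.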
template void OpDispatchBuilder::VPGATHER<4>(OpcodeArgs);
template void OpDispatchBuilder::VPGATHER<8>(OpcodeArgs);
} // namespace FEXCore::IR

View File

@@ -343,10 +343,10 @@ std::array<X86InstInfo, MAX_VEX_TABLE_SIZE> VEXTableOps = []() consteval {
{OPD(2, 0b01, 0x8C), 1, X86InstInfo{"VPMASKMOV", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0, nullptr}},
{OPD(2, 0b01, 0x8E), 1, X86InstInfo{"VPMASKMOV", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_SF_MOD_DST | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0, nullptr}},
{OPD(2, 0b01, 0x90), 1, X86InstInfo{"VPGATHERDD/Q", TYPE_UNDEC, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_2ND_SRC | FLAGS_VEX_VSIB | FLAGS_XMM_FLAGS, 0, nullptr}},
{OPD(2, 0b01, 0x91), 1, X86InstInfo{"VPGATHERQD/Q", TYPE_UNDEC, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_2ND_SRC | FLAGS_VEX_VSIB | FLAGS_XMM_FLAGS, 0, nullptr}},
{OPD(2, 0b01, 0x92), 1, X86InstInfo{"VGATHERDPS/D", TYPE_UNDEC, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_2ND_SRC | FLAGS_VEX_VSIB | FLAGS_XMM_FLAGS, 0, nullptr}},
{OPD(2, 0b01, 0x93), 1, X86InstInfo{"VGATHERQPS/D", TYPE_UNDEC, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_2ND_SRC | FLAGS_VEX_VSIB | FLAGS_XMM_FLAGS, 0, nullptr}},
{OPD(2, 0b01, 0x90), 1, X86InstInfo{"VPGATHERDD/Q", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_2ND_SRC | FLAGS_VEX_VSIB | FLAGS_XMM_FLAGS, 0, nullptr}},
{OPD(2, 0b01, 0x91), 1, X86InstInfo{"VPGATHERQD/Q", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_2ND_SRC | FLAGS_VEX_VSIB | FLAGS_XMM_FLAGS, 0, nullptr}},
{OPD(2, 0b01, 0x92), 1, X86InstInfo{"VGATHERDPS/D", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_2ND_SRC | FLAGS_VEX_VSIB | FLAGS_XMM_FLAGS, 0, nullptr}},
{OPD(2, 0b01, 0x93), 1, X86InstInfo{"VGATHERQPS/D", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_2ND_SRC | FLAGS_VEX_VSIB | FLAGS_XMM_FLAGS, 0, nullptr}},
{OPD(2, 0b01, 0x96), 1, X86InstInfo{"VFMADDSUB132", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0, nullptr}},
{OPD(2, 0b01, 0x97), 1, X86InstInfo{"VFMSUBADD132", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0, nullptr}},

View File

@@ -545,6 +545,19 @@
"DestSize": "RegisterSize",
"NumElements": "RegisterSize / ElementSize"
},
"FPR = VLoadVectorGatherMasked u8:#RegisterSize, u8:#ElementSize, FPR:$Incoming, FPR:$Mask, GPR:$AddrBase, FPR:$VectorIndexLow, FPR:$VectorIndexHigh, u8:$VectorIndexElementSize, u8:$OffsetScale, u8:$DataElementOffsetStart, u8:$IndexElementOffsetStart": {
"Desc": [
"Does a masked load similar to VPGATHERD* where the upper bit of each element",
"determines whether or not that element will be loaded from memory.",
"Most of VSIB encoding is passed directly through to the IR operation."
],
"ImplicitFlagClobber": true,
"DestSize": "RegisterSize",
"NumElements": "RegisterSize / ElementSize",
"EmitValidation": [
"$VectorIndexElementSize == OpSize::i32Bit || $VectorIndexElementSize == OpSize::i64Bit"
]
},
"FPR = VLoadVectorElement u8:#RegisterSize, u8:#ElementSize, FPR:$DstSrc, u8:$Index, GPR:$Addr": {
"Desc": ["Does a memory load to a single element of a vector.",
"Leaves the rest of the vector's data intact.",

File diff suppressed because it is too large