Merge pull request #3757 from Sonicadvance1/avx_16
AVX128: Implement support for gathers
This commit is contained in: commit 6226c7f4f3
@@ -940,6 +940,187 @@ DEF_OP(VStoreVectorMasked) {
  }
}

DEF_OP(VLoadVectorGatherMasked) {
  const auto Op = IROp->C<IR::IROp_VLoadVectorGatherMasked>();
  const auto OpSize = IROp->Size;

  const auto VectorIndexSize = Op->VectorIndexElementSize;
  const auto OffsetScale = Op->OffsetScale;
  const auto DataElementOffsetStart = Op->DataElementOffsetStart;
  const auto IndexElementOffsetStart = Op->IndexElementOffsetStart;

  ///< This IR operation handles discontiguous masked gather load instructions. Some things to note about its behaviour:
  /// - VSIB behaviour is almost entirely exposed in the IR operation directly.
  ///   - Displacement is the only value missing, as it can be added directly to AddrBase.
  /// - VectorIndex{Low,High} contain the index offsets for each element getting loaded.
  ///   - These index element sizes are decoupled from the resulting element size; they can be 32-bit or 64-bit.
  ///   - When the index element size is 32-bit, the value is sign-extended for the full 64-bit address calculation.
  ///   - When loading a 128-bit result with 64-bit VectorIndex elements, both VectorIndexLow and VectorIndexHigh are required
  ///     to provide enough pointers.
  /// - When VectorIndexElementSize and OffsetScale match Arm64 SVE behaviour, the operation becomes more optimal.
  ///   - When the behaviour doesn't match, it gets decomposed into an ASIMD-style masked load.
  /// - AddrBase also doesn't need to exist.
  ///   - This is valid when the instruction uses 64-bit vector indexing, or 32-bit addresses where the top bit isn't set.
  ///< (A standalone scalar sketch of these semantics follows this function.)
  const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE;
  if (Is256Bit) {
    LOGMAN_THROW_A_FMT(HostSupportsSVE256, "Need SVE256 support in order to use VLoadVectorGatherMasked with 256-bit operation");
  }

  const auto Dst = GetVReg(Node);
  const auto IncomingDst = GetVReg(Op->Incoming.ID());

  const auto MaskReg = GetVReg(Op->Mask.ID());
  std::optional<ARMEmitter::Register> BaseAddr = !Op->AddrBase.IsInvalid() ? std::make_optional(GetReg(Op->AddrBase.ID())) : std::nullopt;
  const auto VectorIndexLow = GetVReg(Op->VectorIndexLow.ID());
  std::optional<ARMEmitter::VRegister> VectorIndexHigh =
    !Op->VectorIndexHigh.IsInvalid() ? std::make_optional(GetVReg(Op->VectorIndexHigh.ID())) : std::nullopt;

  ///< If the host supports SVE and the offset scale matches SVE limitations then it can do an SVE style load.
  const bool SupportsSVELoad = (HostSupportsSVE128 || HostSupportsSVE256) && (OffsetScale == 1 || OffsetScale == VectorIndexSize) &&
                               (VectorIndexSize == IROp->ElementSize);

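  // Illustrative sketch only (hypothetical values, not from the FEX source): two concrete
  // gather shapes run through the SupportsSVELoad condition above, assuming an SVE-capable host.
  {
    constexpr bool HostHasSVE = true;                             // stands in for HostSupportsSVE128 || HostSupportsSVE256
    constexpr unsigned Scale1 = 4, IndexSize1 = 4, DataSize1 = 4; // e.g. vpgatherdd: 32-bit indices, 32-bit data, scale 4
    constexpr unsigned Scale2 = 8, IndexSize2 = 8, DataSize2 = 4; // e.g. vgatherqps: 64-bit indices, 32-bit data, scale 8
    static_assert(HostHasSVE && (Scale1 == 1 || Scale1 == IndexSize1) && (IndexSize1 == DataSize1),
                  "sizes and scale line up, so the SVE gather path below is taken");
    static_assert(!(HostHasSVE && (Scale2 == 1 || Scale2 == IndexSize2) && (IndexSize2 == DataSize2)),
                  "index and data sizes differ, so the emulated ASIMD fallback is taken");
  }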
  // Sign-extending element extract from a vector into a GPR; used for the 32-bit and 64-bit index elements.
  const auto PerformSMove = [this](size_t ElementSize, const ARMEmitter::Register Dst, const ARMEmitter::VRegister Vector, int index) {
    switch (ElementSize) {
    case 1: smov<ARMEmitter::SubRegSize::i8Bit>(Dst.X(), Vector, index); break;
    case 2: smov<ARMEmitter::SubRegSize::i16Bit>(Dst.X(), Vector, index); break;
    case 4: smov<ARMEmitter::SubRegSize::i32Bit>(Dst.X(), Vector, index); break;
    case 8: umov<ARMEmitter::SubRegSize::i64Bit>(Dst.X(), Vector, index); break;
    default: LOGMAN_MSG_A_FMT("Unhandled ExtractElementSize: {}", ElementSize); break;
    }
  };

  // Zero-extending element extract from a vector into a GPR; used to inspect the mask elements.
  const auto PerformMove = [this](size_t ElementSize, const ARMEmitter::Register Dst, const ARMEmitter::VRegister Vector, int index) {
    switch (ElementSize) {
    case 1: umov<ARMEmitter::SubRegSize::i8Bit>(Dst, Vector, index); break;
    case 2: umov<ARMEmitter::SubRegSize::i16Bit>(Dst, Vector, index); break;
    case 4: umov<ARMEmitter::SubRegSize::i32Bit>(Dst, Vector, index); break;
    case 8: umov<ARMEmitter::SubRegSize::i64Bit>(Dst, Vector, index); break;
    default: LOGMAN_MSG_A_FMT("Unhandled ExtractElementSize: {}", ElementSize); break;
    }
  };

  if (SupportsSVELoad) {
    ARMEmitter::SVEModType ModType = ARMEmitter::SVEModType::MOD_NONE;
    uint8_t SVEScale = FEXCore::ilog2(OffsetScale);
    if (VectorIndexSize == 4) {
      ModType = ARMEmitter::SVEModType::MOD_SXTW;
    } else if (VectorIndexSize == 8 && OffsetScale != 1) {
      ModType = ARMEmitter::SVEModType::MOD_LSL;
    }

    ARMEmitter::Register AddrReg = TMP1;

    if (BaseAddr.has_value()) {
      AddrReg = GetReg(Op->AddrBase.ID());
    } else {
      ///< OpcodeDispatcher didn't provide a Base address while SVE requires one.
      LoadConstant(ARMEmitter::Size::i64Bit, AddrReg, 0);
    }

    const auto MemDst = ARMEmitter::SVEMemOperand(AddrReg.X(), VectorIndexLow.Z(), ModType, SVEScale);
    const auto SubRegSize = ConvertSubRegSize8(IROp);

    const auto CMPPredicate = ARMEmitter::PReg::p0;
    const auto GoverningPredicate = Is256Bit ? PRED_TMP_32B : PRED_TMP_16B;

    // Check if the sign bit is set for the given element size.
    cmplt(SubRegSize, CMPPredicate, GoverningPredicate.Zeroing(), MaskReg.Z(), 0);
    auto TempDst = VTMP1;

    switch (IROp->ElementSize) {
    case 1: {
      ld1b<ARMEmitter::SubRegSize::i8Bit>(TempDst.Z(), CMPPredicate.Zeroing(), MemDst);
      break;
    }
    case 2: {
      ld1h<ARMEmitter::SubRegSize::i16Bit>(TempDst.Z(), CMPPredicate.Zeroing(), MemDst);
      break;
    }
    case 4: {
      ld1w<ARMEmitter::SubRegSize::i32Bit>(TempDst.Z(), CMPPredicate.Zeroing(), MemDst);
      break;
    }
    case 8: {
      ld1d(TempDst.Z(), CMPPredicate.Zeroing(), MemDst);
      break;
    }
    default: break;
    }

    ///< Merge elements based on predicate.
    sel(SubRegSize, Dst.Z(), CMPPredicate, TempDst.Z(), IncomingDst.Z());
  } else {
    LOGMAN_THROW_A_FMT(!Is256Bit, "Can't emulate this gather load in the backend! Programming error!");
    ///< Adventurers beware, emulated ASIMD-style gather masked load operation.
    // The number of elements to load is determined by the number of index elements available.
    size_t NumAddrElements = (VectorIndexHigh.has_value() ? 32 : 16) / VectorIndexSize;
    // The number of elements is clamped by the resulting register size.
    size_t NumDataElements = std::min<size_t>(IROp->Size / IROp->ElementSize, NumAddrElements);

    size_t IndexElementsSizeBytes = NumAddrElements * VectorIndexSize;
    if (IndexElementsSizeBytes > 16) {
      // We must have a high register in this case.
      LOGMAN_THROW_A_FMT(VectorIndexHigh.has_value(), "Need High vector index register!");
    }

    // Use VTMP1 as the temporary destination.
    auto TempReg = VTMP1;
    auto WorkingReg = TMP1;
    auto TempMemReg = TMP2;
    const uint64_t ElementSizeInBits = IROp->ElementSize * 8;

    mov(TempReg.Q(), IncomingDst.Q());
    for (size_t i = DataElementOffsetStart, IndexElement = IndexElementOffsetStart; i < NumDataElements; ++i, ++IndexElement) {
      ARMEmitter::SingleUseForwardLabel Skip {};
      // Extract mask element
      PerformMove(IROp->ElementSize, WorkingReg, MaskReg, i);

      // Skip if the mask's sign bit isn't set
      tbz(WorkingReg, ElementSizeInBits - 1, &Skip);

      // Extract Index Element
      if ((IndexElement * VectorIndexSize) >= 16) {
        // Fetch from the high index register.
        PerformSMove(VectorIndexSize, WorkingReg, *VectorIndexHigh, IndexElement - (16 / VectorIndexSize));
      } else {
        // Fetch from the low index register.
        PerformSMove(VectorIndexSize, WorkingReg, VectorIndexLow, IndexElement);
      }

      // Calculate the memory position for this gather load.
      if (BaseAddr.has_value()) {
        if (VectorIndexSize == 4) {
          add(ARMEmitter::Size::i64Bit, TempMemReg, *BaseAddr, WorkingReg, ARMEmitter::ExtendedType::SXTW, FEXCore::ilog2(OffsetScale));
        } else {
          add(ARMEmitter::Size::i64Bit, TempMemReg, *BaseAddr, WorkingReg, ARMEmitter::ShiftType::LSL, FEXCore::ilog2(OffsetScale));
        }
      } else {
        ///< In this case we have no base address; all addresses come from the vector register itself.
        if (VectorIndexSize == 4) {
          // Sign extend and shift into the 64-bit register.
          sbfiz(ARMEmitter::Size::i64Bit, TempMemReg, WorkingReg, FEXCore::ilog2(OffsetScale), 32);
        } else {
          lsl(ARMEmitter::Size::i64Bit, TempMemReg, WorkingReg, FEXCore::ilog2(OffsetScale));
        }
      }

      // Now that the address is calculated, do the load.
      switch (IROp->ElementSize) {
      case 1: ld1<ARMEmitter::SubRegSize::i8Bit>(TempReg.Q(), i, TempMemReg); break;
      case 2: ld1<ARMEmitter::SubRegSize::i16Bit>(TempReg.Q(), i, TempMemReg); break;
      case 4: ld1<ARMEmitter::SubRegSize::i32Bit>(TempReg.Q(), i, TempMemReg); break;
      case 8: ld1<ARMEmitter::SubRegSize::i64Bit>(TempReg.Q(), i, TempMemReg); break;
      case 16: ldr(TempReg.Q(), TempMemReg, 0); break;
      default: LOGMAN_MSG_A_FMT("Unhandled {} size: {}", __func__, IROp->ElementSize); FEX_UNREACHABLE;
      }

      Bind(&Skip);
    }
    // Move the result.
    mov(Dst.Q(), TempReg.Q());
  }
}
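For reference, here is a minimal standalone sketch of the semantics described in the comment block at the top of this operation (hypothetical helper, not FEX code): every element whose mask sign bit is set is loaded from `AddrBase + SignExtend(Index[i]) * OffsetScale`, and every other element keeps the incoming destination value.

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <type_traits>

// Hypothetical scalar reference model of VLoadVectorGatherMasked (illustration only).
template<typename DataT, typename IndexT>
void ReferenceGatherMasked(DataT* Dst, const DataT* Incoming, const DataT* Mask, const IndexT* Index,
                           const uint8_t* AddrBase, uint64_t OffsetScale, size_t NumElements) {
  static_assert(std::is_integral_v<DataT> && std::is_signed_v<IndexT>,
                "mask elements are inspected as raw bits; VSIB indices are signed");
  for (size_t i = 0; i < NumElements; ++i) {
    // An element is selected when the sign bit of the corresponding mask element is set.
    const bool Selected = (static_cast<std::make_unsigned_t<DataT>>(Mask[i]) >> (sizeof(DataT) * 8 - 1)) != 0;
    if (!Selected) {
      Dst[i] = Incoming[i]; // Unselected elements keep the incoming destination value.
      continue;
    }
    // VSIB address: base plus the sign-extended, scaled index element.
    const int64_t Offset = static_cast<int64_t>(Index[i]) * static_cast<int64_t>(OffsetScale);
    std::memcpy(&Dst[i], AddrBase + Offset, sizeof(DataT));
  }
}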

DEF_OP(VLoadVectorElement) {
  const auto Op = IROp->C<IR::IROp_VLoadVectorElement>();
  const auto OpSize = IROp->Size;

@@ -5294,6 +5294,11 @@ void OpDispatchBuilder::InstallHostSpecificOpcodeHandlers() {
    {OPD(2, 0b01, 0x8C), 1, &OpDispatchBuilder::VPMASKMOVOp<false>},
    {OPD(2, 0b01, 0x8E), 1, &OpDispatchBuilder::VPMASKMOVOp<true>},

    {OPD(2, 0b01, 0x90), 1, &OpDispatchBuilder::VPGATHER<OpSize::i32Bit>},
    {OPD(2, 0b01, 0x91), 1, &OpDispatchBuilder::VPGATHER<OpSize::i64Bit>},
    {OPD(2, 0b01, 0x92), 1, &OpDispatchBuilder::VPGATHER<OpSize::i32Bit>},
    {OPD(2, 0b01, 0x93), 1, &OpDispatchBuilder::VPGATHER<OpSize::i64Bit>},

    {OPD(2, 0b01, 0x96), 1, &OpDispatchBuilder::VFMADDSUB<1, 3, 2>},
    {OPD(2, 0b01, 0x97), 1, &OpDispatchBuilder::VFMSUBADD<1, 3, 2>},

@@ -975,6 +975,17 @@ public:
  template<uint8_t Src1Idx, uint8_t Src2Idx, uint8_t AddendIdx>
  void VFMSUBADD(OpcodeArgs);

  struct RefVSIB {
    Ref Low, High;
    Ref BaseAddr;
    int32_t Displacement;
    uint8_t Scale;
  };

  RefVSIB LoadVSIB(const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, uint32_t Flags);
  template<size_t AddrElementSize>
  void VPGATHER(OpcodeArgs);

  template<size_t ElementSize, size_t DstElementSize, bool Signed>
  void ExtendVectorElements(OpcodeArgs);
  template<size_t ElementSize>
@@ -1019,6 +1030,7 @@ public:
  RefPair AVX128_LoadSource_WithOpSize(const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, uint32_t Flags,
                                       bool NeedsHigh, MemoryAccessType AccessType = MemoryAccessType::DEFAULT);

  RefVSIB AVX128_LoadVSIB(const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, uint32_t Flags, bool NeedsHigh);
  void AVX128_StoreResult_WithOpSize(FEXCore::X86Tables::DecodedOp Op, const FEXCore::X86Tables::DecodedOperand& Operand, const RefPair Src,
                                     MemoryAccessType AccessType = MemoryAccessType::DEFAULT);
  void InstallAVX128Handlers();
@@ -1247,6 +1259,12 @@ public:
  template<uint8_t Src1Idx, uint8_t Src2Idx, uint8_t AddendIdx>
  void AVX128_VFMSUBADD(OpcodeArgs);

  template<size_t AddrElementSize>
  RefPair AVX128_VPGatherImpl(OpSize Size, OpSize ElementLoadSize, RefPair Dest, RefPair Mask, RefVSIB VSIB);

  template<size_t AddrElementSize>
  void AVX128_VPGATHER(OpcodeArgs);

  // End of AVX 128-bit implementation
  void InvalidOp(OpcodeArgs);

@@ -327,6 +327,11 @@ void OpDispatchBuilder::InstallAVX128Handlers() {
    {OPD(2, 0b01, 0x8C), 1, &OpDispatchBuilder::AVX128_VPMASKMOV<false>},
    {OPD(2, 0b01, 0x8E), 1, &OpDispatchBuilder::AVX128_VPMASKMOV<true>},

    {OPD(2, 0b01, 0x90), 1, &OpDispatchBuilder::AVX128_VPGATHER<OpSize::i32Bit>},
    {OPD(2, 0b01, 0x91), 1, &OpDispatchBuilder::AVX128_VPGATHER<OpSize::i64Bit>},
    {OPD(2, 0b01, 0x92), 1, &OpDispatchBuilder::AVX128_VPGATHER<OpSize::i32Bit>},
    {OPD(2, 0b01, 0x93), 1, &OpDispatchBuilder::AVX128_VPGATHER<OpSize::i64Bit>},

    {OPD(2, 0b01, 0x96), 1, &OpDispatchBuilder::AVX128_VFMADDSUB<1, 3, 2>},
    {OPD(2, 0b01, 0x97), 1, &OpDispatchBuilder::AVX128_VFMSUBADD<1, 3, 2>},

@@ -486,10 +491,9 @@ OpDispatchBuilder::RefPair OpDispatchBuilder::AVX128_LoadSource_WithOpSize(
    AddressMode HighA = A;
    HighA.Offset += 16;

-   ///< TODO: Implement VSIB once we get there.
    if (Operand.IsSIB()) {
      const bool IsVSIB = (Op->Flags & X86Tables::DecodeFlags::FLAG_VSIB_BYTE) != 0;
-     LOGMAN_THROW_AA_FMT(!IsVSIB, "VSIB currently unsupported");
+     LOGMAN_THROW_AA_FMT(!IsVSIB, "VSIB uses LoadVSIB instead");
    }

    return {
@@ -499,6 +503,31 @@ OpDispatchBuilder::RefPair OpDispatchBuilder::AVX128_LoadSource_WithOpSize(
  }
}

OpDispatchBuilder::RefVSIB
OpDispatchBuilder::AVX128_LoadVSIB(const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, uint32_t Flags, bool NeedsHigh) {
  const bool IsVSIB = (Op->Flags & X86Tables::DecodeFlags::FLAG_VSIB_BYTE) != 0;
  LOGMAN_THROW_A_FMT(Operand.IsSIB() && IsVSIB, "Trying to load VSIB for something that isn't the correct type!");

  // VSIB is a very special case which has a ton of encoded data.
  // Get it in a format we can reason about.

  const auto Index_gpr = Operand.Data.SIB.Index;
  const auto Base_gpr = Operand.Data.SIB.Base;
  LOGMAN_THROW_AA_FMT(Index_gpr >= FEXCore::X86State::REG_XMM_0 && Index_gpr <= FEXCore::X86State::REG_XMM_15, "must be AVX reg");
  LOGMAN_THROW_AA_FMT(
    Base_gpr == FEXCore::X86State::REG_INVALID || (Base_gpr >= FEXCore::X86State::REG_RAX && Base_gpr <= FEXCore::X86State::REG_R15),
    "Base must be a GPR.");
  const auto Index_XMM_gpr = Index_gpr - X86State::REG_XMM_0;

  return {
    .Low = AVX128_LoadXMMRegister(Index_XMM_gpr, false),
    .High = NeedsHigh ? AVX128_LoadXMMRegister(Index_XMM_gpr, true) : Invalid(),
    .BaseAddr = Base_gpr != FEXCore::X86State::REG_INVALID ? LoadGPRRegister(Base_gpr, OpSize::i64Bit, 0, false) : nullptr,
    .Displacement = Operand.Data.SIB.Offset,
    .Scale = Operand.Data.SIB.Scale,
  };
}
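As a hedged, concrete illustration of what the decode above produces (the instruction and values are assumed, not taken from the commit): for `vpgatherdd ymm0, [rax + ymm2*4 + 0x40], ymm3`, the index elements come from ymm2, the base from rax, and each element's address follows the usual x86 VSIB formula. A standalone value-level mirror of the fields:

#include <cstdint>

// Hypothetical mirror of RefVSIB for the example instruction above
// (RefVSIB itself holds IR Refs, not raw values).
struct ExampleVSIB {
  int32_t IndexElement[8]; // the eight signed 32-bit indices held in ymm2 (Low/High halves)
  uint64_t BaseAddr;       // value of rax
  int32_t Displacement;    // 0x40
  uint8_t Scale;           // 4
};

// x86 VSIB effective address of element i, which is what the gather IR op consumes:
//   addr(i) = BaseAddr + Displacement + SignExtend(IndexElement[i]) * Scale
constexpr uint64_t GatherAddress(const ExampleVSIB& V, int i) {
  return V.BaseAddr + static_cast<int64_t>(V.Displacement) + static_cast<int64_t>(V.IndexElement[i]) * V.Scale;
}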

void OpDispatchBuilder::AVX128_StoreResult_WithOpSize(FEXCore::X86Tables::DecodedOp Op, const FEXCore::X86Tables::DecodedOperand& Operand,
                                                      const RefPair Src, MemoryAccessType AccessType) {
  if (Operand.IsGPR()) {
@@ -2007,7 +2036,6 @@ void OpDispatchBuilder::AVX128_VMASKMOVImpl(OpcodeArgs, size_t ElementSize, size
    return MakeSegmentAddress(Op, Data, CTX->GetGPRSize());
  };

-  ///< TODO: Needs SVE for masked loadstores.
  if (IsStore) {
    auto Address = MakeAddress(Op->Dest);

@@ -2487,4 +2515,96 @@ void OpDispatchBuilder::AVX128_VFMSUBADD(OpcodeArgs) {
  AVX128_VFMAddSubImpl(Op, false, Src1Idx, Src2Idx, AddendIdx);
}

template<size_t AddrElementSize>
OpDispatchBuilder::RefPair OpDispatchBuilder::AVX128_VPGatherImpl(OpSize Size, OpSize ElementLoadSize, RefPair Dest, RefPair Mask, RefVSIB VSIB) {
  LOGMAN_THROW_A_FMT(AddrElementSize == OpSize::i32Bit || AddrElementSize == OpSize::i64Bit, "Unknown address element size");
  const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;

  ///< BaseAddr doesn't need to exist; fold the displacement into it here.
  Ref BaseAddr = VSIB.BaseAddr;
  if (BaseAddr && VSIB.Displacement) {
    BaseAddr = _Add(OpSize::i64Bit, BaseAddr, _Constant(VSIB.Displacement));
  } else if (VSIB.Displacement) {
    BaseAddr = _Constant(VSIB.Displacement);
  } else if (!BaseAddr) {
    BaseAddr = Invalid();
  }

  RefPair Result {};
  ///< Calculate the low-half.
  Result.Low = _VLoadVectorGatherMasked(OpSize::i128Bit, ElementLoadSize, Dest.Low, Mask.Low, BaseAddr, VSIB.Low, VSIB.High,
                                        AddrElementSize, VSIB.Scale, 0, 0);

  if (Is128Bit) {
    Result.High = LoadZeroVector(OpSize::i128Bit);
    if (AddrElementSize == OpSize::i64Bit && ElementLoadSize == OpSize::i32Bit) {
      // Special case for the 128-bit gather load using 64-bit address indexes with 32-bit results.
      // Only loads two 32-bit elements into the lower 64-bits of the first destination.
      // Bits [255:64] all become zero.
      Result.Low = _VZip(OpSize::i128Bit, OpSize::i64Bit, Result.Low, Result.High);
    }
  } else {
    RefPair AddrAddressing {};

    Ref DestReg = Dest.High;
    Ref MaskReg = Mask.High;
    uint8_t IndexElementOffset {};
    uint8_t DataElementOffset {};
    if (AddrElementSize == ElementLoadSize) {
      // If the address size matches the loading element size then it will be fetching at the same rate between low and high.
      AddrAddressing.Low = VSIB.High;
      AddrAddressing.High = Invalid();
    } else if (AddrElementSize == OpSize::i32Bit && ElementLoadSize == OpSize::i64Bit) {
      // If the address element size is half the size of the element load size then we need to start fetching half-way through the low register.
      AddrAddressing.Low = VSIB.Low;
      AddrAddressing.High = VSIB.High;
      IndexElementOffset = OpSize::i128Bit / AddrElementSize / 2;
    } else if (AddrElementSize == OpSize::i64Bit && ElementLoadSize == OpSize::i32Bit) {
      AddrAddressing.Low = VSIB.High;
      AddrAddressing.High = Invalid();
      DestReg = Result.Low; ///< Start mixing with the low register.
      MaskReg = Mask.Low;   ///< Mask starts with the low mask here.
      IndexElementOffset = 0;
      DataElementOffset = OpSize::i128Bit / ElementLoadSize / 2;
    }

    ///< Calculate the high-half.
    auto ResultHigh = _VLoadVectorGatherMasked(OpSize::i128Bit, ElementLoadSize, DestReg, MaskReg, BaseAddr, AddrAddressing.Low,
                                               AddrAddressing.High, AddrElementSize, VSIB.Scale, DataElementOffset, IndexElementOffset);

    if (AddrElementSize == OpSize::i64Bit && ElementLoadSize == OpSize::i32Bit) {
      // If we only fetched 128-bits worth of data then the upper result is all zero.
      Result = AVX128_Zext(ResultHigh);
    } else {
      Result.High = ResultHigh;
    }
  }

  return Result;
}
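To summarise the high-half handling above (a hedged recap derived from the branches in the code, with element sizes expressed in bytes and `i128BitBytes` as a stand-in for OpSize::i128Bit): the second `_VLoadVectorGatherMasked` call consumes different index registers and offsets depending on the address/data widths. A standalone compile-time check of the offset arithmetic:

// Which index source and offsets the high-half gather uses:
//   Addr == Data (4/4 or 8/8): index = VSIB.High,            IndexElementOffset = 0, DataElementOffset = 0
//   Addr 4, Data 8:            index = VSIB.Low + VSIB.High, IndexElementOffset = 2, DataElementOffset = 0
//   Addr 8, Data 4:            index = VSIB.High, continues writing the low result register from data element 2
constexpr unsigned i128BitBytes = 16;     // OpSize::i128Bit expressed in bytes
constexpr unsigned AddrElementSize32 = 4; // 32-bit address indices
constexpr unsigned ElementLoadSize32 = 4; // 32-bit data elements

// 32-bit addresses with 64-bit data: IndexElementOffset = i128Bit / AddrElementSize / 2
static_assert(i128BitBytes / AddrElementSize32 / 2 == 2, "high half starts at index element 2 of VSIB.Low");
// 64-bit addresses with 32-bit data: DataElementOffset = i128Bit / ElementLoadSize / 2
static_assert(i128BitBytes / ElementLoadSize32 / 2 == 2, "high half starts writing at data element 2 of the low result");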

template<size_t AddrElementSize>
void OpDispatchBuilder::AVX128_VPGATHER(OpcodeArgs) {

  const auto Size = GetDstSize(Op);
  const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;

  ///< Element size is determined by W flag.
  const OpSize ElementLoadSize = Op->Flags & X86Tables::DecodeFlags::FLAG_OPTION_AVX_W ? OpSize::i64Bit : OpSize::i32Bit;

  auto Dest = AVX128_LoadSource_WithOpSize(Op, Op->Dest, Op->Flags, !Is128Bit);
  auto VSIB = AVX128_LoadVSIB(Op, Op->Src[0], Op->Flags, !Is128Bit);
  auto Mask = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, !Is128Bit);

  RefPair Result {};
  Result = AVX128_VPGatherImpl<AddrElementSize>(SizeToOpSize(Size), ElementLoadSize, Dest, Mask, VSIB);
  AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result);

  ///< Assume non-faulting behaviour and clear the mask register.
  RefPair ZeroPair {};
  ZeroPair.Low = LoadZeroVector(OpSize::i128Bit);
  ZeroPair.High = ZeroPair.Low;
  AVX128_StoreResult_WithOpSize(Op, Op->Src[1], ZeroPair);
}

} // namespace FEXCore::IR

@@ -5106,4 +5106,109 @@ template void OpDispatchBuilder::VFMSUBADD<1, 3, 2>(OpcodeArgs);
template void OpDispatchBuilder::VFMSUBADD<2, 1, 3>(OpcodeArgs);
template void OpDispatchBuilder::VFMSUBADD<2, 3, 1>(OpcodeArgs);

OpDispatchBuilder::RefVSIB OpDispatchBuilder::LoadVSIB(const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, uint32_t Flags) {
  const bool IsVSIB = (Op->Flags & X86Tables::DecodeFlags::FLAG_VSIB_BYTE) != 0;
  LOGMAN_THROW_A_FMT(Operand.IsSIB() && IsVSIB, "Trying to load VSIB for something that isn't the correct type!");

  // VSIB is a very special case which has a ton of encoded data.
  // Get it in a format we can reason about.

  const auto Index_gpr = Operand.Data.SIB.Index;
  const auto Base_gpr = Operand.Data.SIB.Base;
  LOGMAN_THROW_AA_FMT(Index_gpr >= FEXCore::X86State::REG_XMM_0 && Index_gpr <= FEXCore::X86State::REG_XMM_15, "must be AVX reg");
  LOGMAN_THROW_AA_FMT(
    Base_gpr == FEXCore::X86State::REG_INVALID || (Base_gpr >= FEXCore::X86State::REG_RAX && Base_gpr <= FEXCore::X86State::REG_R15),
    "Base must be a GPR.");
  const auto Index_XMM_gpr = Index_gpr - X86State::REG_XMM_0;

  return {
    .Low = LoadXMMRegister(Index_XMM_gpr),
    .BaseAddr = Base_gpr != FEXCore::X86State::REG_INVALID ? LoadGPRRegister(Base_gpr, OpSize::i64Bit, 0, false) : nullptr,
    .Displacement = Operand.Data.SIB.Offset,
    .Scale = Operand.Data.SIB.Scale,
  };
}

template<size_t AddrElementSize>
void OpDispatchBuilder::VPGATHER(OpcodeArgs) {
  LOGMAN_THROW_A_FMT(AddrElementSize == OpSize::i32Bit || AddrElementSize == OpSize::i64Bit, "Unknown address element size");

  const auto Size = GetDstSize(Op);
  const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;

  ///< Element size is determined by W flag.
  const OpSize ElementLoadSize = Op->Flags & X86Tables::DecodeFlags::FLAG_OPTION_AVX_W ? OpSize::i64Bit : OpSize::i32Bit;

  auto VSIB = LoadVSIB(Op, Op->Src[0], Op->Flags);

  const bool SupportsSVELoad = (VSIB.Scale == 1 || VSIB.Scale == AddrElementSize) && (AddrElementSize == ElementLoadSize);

  Ref Dest = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
  Ref Mask = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);

  Ref Result {};
  if (!SupportsSVELoad) {
    // We need to go down the fallback path in the case that we don't hit the backend's SVE mode.
    RefPair Dest128 {
      .Low = Dest,
      .High = _VDupElement(OpSize::i256Bit, OpSize::i128Bit, Dest, 1),
    };

    RefPair Mask128 {
      .Low = Mask,
      .High = _VDupElement(OpSize::i256Bit, OpSize::i128Bit, Mask, 1),
    };

    RefVSIB VSIB128 = VSIB;
    if (Is128Bit) {
      ///< Need to be a bit careful with the VSIB index register duplication.
      VSIB128.High = VSIB128.Low;
    } else {
      VSIB128.High = _VDupElement(OpSize::i256Bit, OpSize::i128Bit, VSIB128.Low, 1);
    }

    auto Result128 = AVX128_VPGatherImpl<AddrElementSize>(SizeToOpSize(Size), ElementLoadSize, Dest128, Mask128, VSIB128);
    // The registers are currently split; they need to be merged.
    Result = _VInsElement(OpSize::i256Bit, OpSize::i128Bit, 1, 0, Result128.Low, Result128.High);
  } else {
    ///< Calculate the full operation.
    ///< BaseAddr doesn't need to exist; fold the displacement into it here.
    Ref BaseAddr = VSIB.BaseAddr;
    if (BaseAddr && VSIB.Displacement) {
      BaseAddr = _Add(OpSize::i64Bit, BaseAddr, _Constant(VSIB.Displacement));
    } else if (VSIB.Displacement) {
      BaseAddr = _Constant(VSIB.Displacement);
    } else if (!BaseAddr) {
      BaseAddr = Invalid();
    }

    Result = _VLoadVectorGatherMasked(Size, ElementLoadSize, Dest, Mask, BaseAddr, VSIB.Low, Invalid(), AddrElementSize, VSIB.Scale, 0, 0);
  }

  if (Is128Bit) {
    if (AddrElementSize == OpSize::i64Bit && ElementLoadSize == OpSize::i32Bit) {
      // Special case for the 128-bit gather load using 64-bit address indexes with 32-bit results.
      // Only loads two 32-bit elements into the lower 64-bits of the first destination.
      // Bits [255:64] all become zero.
      Result = _VMov(OpSize::i64Bit, Result);
    } else if (Is128Bit) {
      Result = _VMov(OpSize::i128Bit, Result);
    }
  } else {
    if (AddrElementSize == OpSize::i64Bit && ElementLoadSize == OpSize::i32Bit) {
      // If we only fetched 128-bits worth of data then the upper result is all zero.
      Result = _VMov(OpSize::i128Bit, Result);
    }
  }

  StoreResult(FPRClass, Op, Result, -1);

  ///< Assume non-faulting behaviour and clear the mask register.
  auto Zero = LoadZeroVector(Size);
  StoreResult_WithOpSize(FPRClass, Op, Op->Src[1], Zero, Size, -1);
}

template void OpDispatchBuilder::VPGATHER<4>(OpcodeArgs);
template void OpDispatchBuilder::VPGATHER<8>(OpcodeArgs);

} // namespace FEXCore::IR

@@ -343,10 +343,10 @@ std::array<X86InstInfo, MAX_VEX_TABLE_SIZE> VEXTableOps = []() consteval {
    {OPD(2, 0b01, 0x8C), 1, X86InstInfo{"VPMASKMOV", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0, nullptr}},
    {OPD(2, 0b01, 0x8E), 1, X86InstInfo{"VPMASKMOV", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_SF_MOD_DST | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0, nullptr}},

-   {OPD(2, 0b01, 0x90), 1, X86InstInfo{"VPGATHERDD/Q", TYPE_UNDEC, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_2ND_SRC | FLAGS_VEX_VSIB | FLAGS_XMM_FLAGS, 0, nullptr}},
-   {OPD(2, 0b01, 0x91), 1, X86InstInfo{"VPGATHERQD/Q", TYPE_UNDEC, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_2ND_SRC | FLAGS_VEX_VSIB | FLAGS_XMM_FLAGS, 0, nullptr}},
-   {OPD(2, 0b01, 0x92), 1, X86InstInfo{"VGATHERDPS/D", TYPE_UNDEC, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_2ND_SRC | FLAGS_VEX_VSIB | FLAGS_XMM_FLAGS, 0, nullptr}},
-   {OPD(2, 0b01, 0x93), 1, X86InstInfo{"VGATHERQPS/D", TYPE_UNDEC, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_2ND_SRC | FLAGS_VEX_VSIB | FLAGS_XMM_FLAGS, 0, nullptr}},
+   {OPD(2, 0b01, 0x90), 1, X86InstInfo{"VPGATHERDD/Q", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_2ND_SRC | FLAGS_VEX_VSIB | FLAGS_XMM_FLAGS, 0, nullptr}},
+   {OPD(2, 0b01, 0x91), 1, X86InstInfo{"VPGATHERQD/Q", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_2ND_SRC | FLAGS_VEX_VSIB | FLAGS_XMM_FLAGS, 0, nullptr}},
+   {OPD(2, 0b01, 0x92), 1, X86InstInfo{"VGATHERDPS/D", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_2ND_SRC | FLAGS_VEX_VSIB | FLAGS_XMM_FLAGS, 0, nullptr}},
+   {OPD(2, 0b01, 0x93), 1, X86InstInfo{"VGATHERQPS/D", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_2ND_SRC | FLAGS_VEX_VSIB | FLAGS_XMM_FLAGS, 0, nullptr}},

    {OPD(2, 0b01, 0x96), 1, X86InstInfo{"VFMADDSUB132", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0, nullptr}},
    {OPD(2, 0b01, 0x97), 1, X86InstInfo{"VFMSUBADD132", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0, nullptr}},

@@ -545,6 +545,19 @@
    "DestSize": "RegisterSize",
    "NumElements": "RegisterSize / ElementSize"
  },
  "FPR = VLoadVectorGatherMasked u8:#RegisterSize, u8:#ElementSize, FPR:$Incoming, FPR:$Mask, GPR:$AddrBase, FPR:$VectorIndexLow, FPR:$VectorIndexHigh, u8:$VectorIndexElementSize, u8:$OffsetScale, u8:$DataElementOffsetStart, u8:$IndexElementOffsetStart": {
    "Desc": [
      "Does a masked load similar to VPGATHERD* where the upper bit of each element",
      "determines whether or not that element will be loaded from memory.",
      "Most of VSIB encoding is passed directly through to the IR operation."
    ],
    "ImplicitFlagClobber": true,
    "DestSize": "RegisterSize",
    "NumElements": "RegisterSize / ElementSize",
    "EmitValidation": [
      "$VectorIndexElementSize == OpSize::i32Bit || $VectorIndexElementSize == OpSize::i64Bit"
    ]
  },
  "FPR = VLoadVectorElement u8:#RegisterSize, u8:#ElementSize, FPR:$DstSrc, u8:$Index, GPR:$Addr": {
    "Desc": ["Does a memory load to a single element of a vector.",
      "Leaves the rest of the vector's data intact.",
The diff of one additional file is suppressed because it is too large.