OpcodeDispatcher: Implement AVX gathers with SVE256

Just to ensure we still have feature parity.
Ryan Houdek 2024-06-22 06:46:55 -07:00 committed by Alyssa Rosenzweig
parent 77ba708933
commit a4fa3a460e
3 changed files with 114 additions and 0 deletions

@@ -5294,6 +5294,11 @@ void OpDispatchBuilder::InstallHostSpecificOpcodeHandlers() {
{OPD(2, 0b01, 0x8C), 1, &OpDispatchBuilder::VPMASKMOVOp<false>},
{OPD(2, 0b01, 0x8E), 1, &OpDispatchBuilder::VPMASKMOVOp<true>},
{OPD(2, 0b01, 0x90), 1, &OpDispatchBuilder::VPGATHER<OpSize::i32Bit>},
{OPD(2, 0b01, 0x91), 1, &OpDispatchBuilder::VPGATHER<OpSize::i64Bit>},
{OPD(2, 0b01, 0x92), 1, &OpDispatchBuilder::VPGATHER<OpSize::i32Bit>},
{OPD(2, 0b01, 0x93), 1, &OpDispatchBuilder::VPGATHER<OpSize::i64Bit>},
{OPD(2, 0b01, 0x96), 1, &OpDispatchBuilder::VFMADDSUB<1, 3, 2>},
{OPD(2, 0b01, 0x97), 1, &OpDispatchBuilder::VFMSUBADD<1, 3, 2>},
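For context, the template argument here selects the VSIB index element size, while VEX.W (checked in the handler) selects the loaded data size, so these four entries cover all eight AVX2 gather forms. The mapping, per the x86 opcode map (an annotation, not part of the diff):

// 0x90: VPGATHERDD (W0) / VPGATHERDQ (W1) - 32-bit indices
// 0x91: VPGATHERQD (W0) / VPGATHERQQ (W1) - 64-bit indices
// 0x92: VGATHERDPS (W0) / VGATHERDPD (W1) - 32-bit indices
// 0x93: VGATHERQPS (W0) / VGATHERQPD (W1) - 64-bit indices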

@@ -982,6 +982,10 @@ public:
uint8_t Scale;
};
RefVSIB LoadVSIB(const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, uint32_t Flags);
template<size_t AddrElementSize>
void VPGATHER(OpcodeArgs);
template<size_t ElementSize, size_t DstElementSize, bool Signed>
void ExtendVectorElements(OpcodeArgs);
template<size_t ElementSize>

@@ -5106,4 +5106,109 @@ template void OpDispatchBuilder::VFMSUBADD<1, 3, 2>(OpcodeArgs);
template void OpDispatchBuilder::VFMSUBADD<2, 1, 3>(OpcodeArgs);
template void OpDispatchBuilder::VFMSUBADD<2, 3, 1>(OpcodeArgs);
OpDispatchBuilder::RefVSIB OpDispatchBuilder::LoadVSIB(const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, uint32_t Flags) {
const bool IsVSIB = (Op->Flags & X86Tables::DecodeFlags::FLAG_VSIB_BYTE) != 0;
LOGMAN_THROW_A_FMT(Operand.IsSIB() && IsVSIB, "Trying to load VSIB for something that isn't the correct type!");
// VSIB is a very special case which has a ton of encoded data.
// Get it in a format we can reason about.
const auto Index_gpr = Operand.Data.SIB.Index;
const auto Base_gpr = Operand.Data.SIB.Base;
LOGMAN_THROW_AA_FMT(Index_gpr >= FEXCore::X86State::REG_XMM_0 && Index_gpr <= FEXCore::X86State::REG_XMM_15, "must be AVX reg");
LOGMAN_THROW_AA_FMT(
Base_gpr == FEXCore::X86State::REG_INVALID || (Base_gpr >= FEXCore::X86State::REG_RAX && Base_gpr <= FEXCore::X86State::REG_R15),
"Base must be a GPR.");
const auto Index_XMM_gpr = Index_gpr - X86State::REG_XMM_0;
return {
.Low = LoadXMMRegister(Index_XMM_gpr),
.BaseAddr = Base_gpr != FEXCore::X86State::REG_INVALID ? LoadGPRRegister(Base_gpr, OpSize::i64Bit, 0, false) : nullptr,
.Displacement = Operand.Data.SIB.Offset,
.Scale = Operand.Data.SIB.Scale,
};
}
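Each gathered element addresses memory with the usual base + scaled-index + displacement rule. A minimal scalar sketch of the computation the gather paths below must reproduce (a hypothetical helper, not part of this change):

#include <cstdint>

// Effective address of gathered element i: Base + SignExtended(Index[i]) * Scale + Disp.
// Scale is one of 1, 2, 4, or 8, straight from the SIB byte.
uint64_t VSIBElementAddress(uint64_t Base, int64_t Index, uint8_t Scale, int32_t Disp) {
  return Base + static_cast<uint64_t>(Index) * Scale + static_cast<int64_t>(Disp);
}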
template<size_t AddrElementSize>
void OpDispatchBuilder::VPGATHER(OpcodeArgs) {
LOGMAN_THROW_A_FMT(AddrElementSize == OpSize::i32Bit || AddrElementSize == OpSize::i64Bit, "Unknown address element size");
const auto Size = GetDstSize(Op);
const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;
///< Element size is determined by W flag.
const OpSize ElementLoadSize = Op->Flags & X86Tables::DecodeFlags::FLAG_OPTION_AVX_W ? OpSize::i64Bit : OpSize::i32Bit;
auto VSIB = LoadVSIB(Op, Op->Src[0], Op->Flags);
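// SVE gather loads only support an unscaled vector offset or one scaled by the
// element size, and need the index element width to match the loaded element
// width; anything else takes the split 128-bit fallback below.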
const bool SupportsSVELoad = (VSIB.Scale == 1 || VSIB.Scale == AddrElementSize) && (AddrElementSize == ElementLoadSize);
Ref Dest = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
Ref Mask = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
Ref Result {};
if (!SupportsSVELoad) {
// Fall back to the split 128-bit path when this addressing mode can't map to the backend's SVE gather.
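// Split each 256-bit register into two 128-bit halves so the helper shared
// with the AVX128 implementation can gather each half independently.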
RefPair Dest128 {
.Low = Dest,
.High = _VDupElement(OpSize::i256Bit, OpSize::i128Bit, Dest, 1),
};
RefPair Mask128 {
.Low = Mask,
.High = _VDupElement(OpSize::i256Bit, OpSize::i128Bit, Mask, 1),
};
RefVSIB VSIB128 = VSIB;
if (Is128Bit) {
///< A bit careful for the VSIB index register duplicating.
VSIB128.High = VSIB128.Low;
} else {
VSIB128.High = _VDupElement(OpSize::i256Bit, OpSize::i128Bit, VSIB128.Low, 1);
}
auto Result128 = AVX128_VPGatherImpl<AddrElementSize>(SizeToOpSize(Size), ElementLoadSize, Dest128, Mask128, VSIB128);
// The result is currently split into two 128-bit halves; merge them back into one 256-bit register.
Result = _VInsElement(OpSize::i256Bit, OpSize::i128Bit, 1, 0, Result128.Low, Result128.High);
} else {
///< SVE path: perform the full-width gather in one operation.
///< BaseAddr doesn't need to exist; fold the displacement into it here.
Ref BaseAddr = VSIB.BaseAddr;
if (BaseAddr && VSIB.Displacement) {
BaseAddr = _Add(OpSize::i64Bit, BaseAddr, _Constant(VSIB.Displacement));
} else if (VSIB.Displacement) {
BaseAddr = _Constant(VSIB.Displacement);
} else if (!BaseAddr) {
BaseAddr = Invalid();
}
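// Per AVX gather semantics: lanes whose mask element has its MSB set are loaded
// from BaseAddr + Index * Scale; unset lanes keep their prior value from Dest.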
Result = _VLoadVectorGatherMasked(Size, ElementLoadSize, Dest, Mask, BaseAddr, VSIB.Low, Invalid(), AddrElementSize, VSIB.Scale, 0, 0);
}
if (Is128Bit) {
if (AddrElementSize == OpSize::i64Bit && ElementLoadSize == OpSize::i32Bit) {
// Special case: a 128-bit gather using 64-bit address indices with 32-bit results
// only loads two 32-bit elements, into the lower 64 bits of the destination.
// Bits [255:64] all become zero.
Result = _VMov(OpSize::i64Bit, Result);
} else {
Result = _VMov(OpSize::i128Bit, Result);
}
} else {
if (AddrElementSize == OpSize::i64Bit && ElementLoadSize == OpSize::i32Bit) {
// If we only fetched 128-bits worth of data then the upper-result is all zero.
Result = _VMov(OpSize::i128Bit, Result);
}
}
StoreResult(FPRClass, Op, Result, -1);
///< Assume non-faulting behaviour and clear the mask register.
auto Zero = LoadZeroVector(Size);
StoreResult_WithOpSize(FPRClass, Op, Op->Src[1], Zero, Size, -1);
}
template void OpDispatchBuilder::VPGATHER<4>(OpcodeArgs);
template void OpDispatchBuilder::VPGATHER<8>(OpcodeArgs);
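As a reference for the behaviour both paths implement (including the mask clearing above), here is a minimal scalar model of a 32-bit-index, 32-bit-element gather; a sketch under assumed names, not FEX code:

#include <cstdint>
#include <cstring>

// Scalar model of an 8-element VPGATHERDD: dest, [base + index*scale + disp], mask.
void GatherReferenceModel(uint32_t Dest[8], const int32_t Index[8], uint32_t Mask[8],
                          const uint8_t* Base, int32_t Disp, uint8_t Scale) {
  for (int i = 0; i < 8; ++i) {
    if (Mask[i] & 0x80000000u) { // MSB of each mask element selects the lane.
      uint32_t Value;
      std::memcpy(&Value, Base + static_cast<int64_t>(Index[i]) * Scale + Disp, sizeof(Value));
      Dest[i] = Value;
    } // Unselected lanes keep their previous Dest value.
    Mask[i] = 0; // The mask register is zeroed once the gather completes without faulting.
  }
}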
} // namespace FEXCore::IR