Merge pull request #3200 from lioncash/mov

OpcodeDispatcher: Remove unnecessary 128-bit truncating moves from StoreResult
This commit is contained in:
Ryan Houdek 2023-10-17 12:12:48 +02:00 committed by GitHub
commit ef321e4bf8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 1107 additions and 1118 deletions

View File

@ -1520,13 +1520,13 @@ DEF_OP(VBroadcastFromMem) {
ElementSize == 4 || ElementSize == 8 ||
ElementSize == 16, "Invalid element size");
if (HostSupportsSVE128 || HostSupportsSVE256) {
if (Is256Bit) {
LOGMAN_THROW_A_FMT(HostSupportsSVE256, "Need SVE256 support in order to use SVE 256-bit broadcast");
}
if (Is256Bit && !HostSupportsSVE256) {
LOGMAN_MSG_A_FMT("{}: 256-bit vectors must support SVE256", __func__);
return;
}
const auto GoverningPredicate = Is256Bit ? PRED_TMP_32B.Zeroing()
: PRED_TMP_16B.Zeroing();
if (Is256Bit && HostSupportsSVE256) {
const auto GoverningPredicate = PRED_TMP_32B.Zeroing();
switch (ElementSize) {
case 1:

View File

@ -5260,11 +5260,8 @@ void OpDispatchBuilder::StoreResult_WithOpSize(FEXCore::IR::RegisterClassType Cl
LOGMAN_THROW_A_FMT(Class != IR::GPRClass, "Partial writes from GPR not allowed. Instruction: {}",
Op->TableInfo->Name);
// OpSize of 16 is special in that it is expected to zero the upper bits of the 256-bit operation.
// TODO: Longer term we should enforce the difference between zero and insert.
if (VectorSize == Core::CPUState::XMM_AVX_REG_SIZE && OpSize == Core::CPUState::XMM_SSE_REG_SIZE) {
Result = _VMov(OpSize, Src);
} else {
// XMM-size is handled in implementations.
if (VectorSize != Core::CPUState::XMM_AVX_REG_SIZE || OpSize != Core::CPUState::XMM_SSE_REG_SIZE) {
auto SrcVector = LoadXMMRegister(gprIndex);
Result = _VInsElement(VectorSize, OpSize, 0, 0, SrcVector, Src);
}
@ -5884,12 +5881,12 @@ void OpDispatchBuilder::InstallHostSpecificOpcodeHandlers() {
#define OPD(map_select, pp, opcode) (((map_select - 1) << 10) | (pp << 8) | (opcode))
static constexpr std::tuple<uint16_t, uint8_t, FEXCore::X86Tables::OpDispatchPtr> AVXTable[] = {
{OPD(1, 0b00, 0x10), 1, &OpDispatchBuilder::MOVUPS_MOVUPDOp},
{OPD(1, 0b01, 0x10), 1, &OpDispatchBuilder::MOVUPS_MOVUPDOp},
{OPD(1, 0b00, 0x10), 1, &OpDispatchBuilder::VMOVUPS_VMOVUPDOp},
{OPD(1, 0b01, 0x10), 1, &OpDispatchBuilder::VMOVUPS_VMOVUPDOp},
{OPD(1, 0b10, 0x10), 1, &OpDispatchBuilder::VMOVSSOp},
{OPD(1, 0b11, 0x10), 1, &OpDispatchBuilder::VMOVSDOp},
{OPD(1, 0b00, 0x11), 1, &OpDispatchBuilder::MOVUPS_MOVUPDOp},
{OPD(1, 0b01, 0x11), 1, &OpDispatchBuilder::MOVUPS_MOVUPDOp},
{OPD(1, 0b00, 0x11), 1, &OpDispatchBuilder::VMOVUPS_VMOVUPDOp},
{OPD(1, 0b01, 0x11), 1, &OpDispatchBuilder::VMOVUPS_VMOVUPDOp},
{OPD(1, 0b10, 0x11), 1, &OpDispatchBuilder::VMOVSSOp},
{OPD(1, 0b11, 0x11), 1, &OpDispatchBuilder::VMOVSDOp},
@ -5912,10 +5909,10 @@ void OpDispatchBuilder::InstallHostSpecificOpcodeHandlers() {
{OPD(1, 0b00, 0x17), 1, &OpDispatchBuilder::VMOVHPOp},
{OPD(1, 0b01, 0x17), 1, &OpDispatchBuilder::VMOVHPOp},
{OPD(1, 0b00, 0x28), 1, &OpDispatchBuilder::MOVAPS_MOVAPDOp},
{OPD(1, 0b01, 0x28), 1, &OpDispatchBuilder::MOVAPS_MOVAPDOp},
{OPD(1, 0b00, 0x29), 1, &OpDispatchBuilder::MOVAPS_MOVAPDOp},
{OPD(1, 0b01, 0x29), 1, &OpDispatchBuilder::MOVAPS_MOVAPDOp},
{OPD(1, 0b00, 0x28), 1, &OpDispatchBuilder::VMOVAPS_VMOVAPDOp},
{OPD(1, 0b01, 0x28), 1, &OpDispatchBuilder::VMOVAPS_VMOVAPDOp},
{OPD(1, 0b00, 0x29), 1, &OpDispatchBuilder::VMOVAPS_VMOVAPDOp},
{OPD(1, 0b01, 0x29), 1, &OpDispatchBuilder::VMOVAPS_VMOVAPDOp},
{OPD(1, 0b10, 0x2A), 1, &OpDispatchBuilder::AVXInsertCVTGPR_To_FPR<4>},
{OPD(1, 0b11, 0x2A), 1, &OpDispatchBuilder::AVXInsertCVTGPR_To_FPR<8>},
@ -5970,8 +5967,8 @@ void OpDispatchBuilder::InstallHostSpecificOpcodeHandlers() {
{OPD(1, 0b10, 0x59), 1, &OpDispatchBuilder::AVXVectorScalarInsertALUOp<IR::OP_VFMULSCALARINSERT, 4>},
{OPD(1, 0b11, 0x59), 1, &OpDispatchBuilder::AVXVectorScalarInsertALUOp<IR::OP_VFMULSCALARINSERT, 8>},
{OPD(1, 0b00, 0x5A), 1, &OpDispatchBuilder::Vector_CVT_Float_To_Float<8, 4>},
{OPD(1, 0b01, 0x5A), 1, &OpDispatchBuilder::Vector_CVT_Float_To_Float<4, 8>},
{OPD(1, 0b00, 0x5A), 1, &OpDispatchBuilder::AVXVector_CVT_Float_To_Float<8, 4>},
{OPD(1, 0b01, 0x5A), 1, &OpDispatchBuilder::AVXVector_CVT_Float_To_Float<4, 8>},
{OPD(1, 0b10, 0x5A), 1, &OpDispatchBuilder::AVXInsertScalar_CVT_Float_To_Float<8, 4>},
{OPD(1, 0b11, 0x5A), 1, &OpDispatchBuilder::AVXInsertScalar_CVT_Float_To_Float<4, 8>},
@ -6015,8 +6012,8 @@ void OpDispatchBuilder::InstallHostSpecificOpcodeHandlers() {
{OPD(1, 0b01, 0x6D), 1, &OpDispatchBuilder::VPUNPCKHOp<8>},
{OPD(1, 0b01, 0x6E), 1, &OpDispatchBuilder::MOVBetweenGPR_FPR},
{OPD(1, 0b01, 0x6F), 1, &OpDispatchBuilder::MOVAPS_MOVAPDOp},
{OPD(1, 0b10, 0x6F), 1, &OpDispatchBuilder::MOVUPS_MOVUPDOp},
{OPD(1, 0b01, 0x6F), 1, &OpDispatchBuilder::VMOVAPS_VMOVAPDOp},
{OPD(1, 0b10, 0x6F), 1, &OpDispatchBuilder::VMOVUPS_VMOVUPDOp},
{OPD(1, 0b01, 0x70), 1, &OpDispatchBuilder::VPSHUFWOp<4, true>},
{OPD(1, 0b10, 0x70), 1, &OpDispatchBuilder::VPSHUFWOp<2, false>},
@ -6036,8 +6033,8 @@ void OpDispatchBuilder::InstallHostSpecificOpcodeHandlers() {
{OPD(1, 0b01, 0x7E), 1, &OpDispatchBuilder::MOVBetweenGPR_FPR},
{OPD(1, 0b10, 0x7E), 1, &OpDispatchBuilder::MOVQOp},
{OPD(1, 0b01, 0x7F), 1, &OpDispatchBuilder::MOVAPS_MOVAPDOp},
{OPD(1, 0b10, 0x7F), 1, &OpDispatchBuilder::MOVUPS_MOVUPDOp},
{OPD(1, 0b01, 0x7F), 1, &OpDispatchBuilder::VMOVAPS_VMOVAPDOp},
{OPD(1, 0b10, 0x7F), 1, &OpDispatchBuilder::VMOVUPS_VMOVUPDOp},
{OPD(1, 0b00, 0xC2), 1, &OpDispatchBuilder::AVXVFCMPOp<4>},
{OPD(1, 0b01, 0xC2), 1, &OpDispatchBuilder::AVXVFCMPOp<8>},

View File

@ -482,6 +482,9 @@ public:
template<FEXCore::IR::IROps IROp, size_t ElementSize>
void AVXVectorScalarUnaryInsertALUOp(OpcodeArgs);
template<size_t DstElementSize, size_t SrcElementSize>
void AVXVector_CVT_Float_To_Float(OpcodeArgs);
void InsertMMX_To_XMM_Vector_CVT_Int_To_Float(OpcodeArgs);
template<size_t DstElementSize>
void InsertCVTGPR_To_FPR(OpcodeArgs);
@ -552,6 +555,9 @@ public:
void VMOVSDOp(OpcodeArgs);
void VMOVSSOp(OpcodeArgs);
void VMOVAPS_VMOVAPDOp(OpcodeArgs);
void VMOVUPS_VMOVUPDOp(OpcodeArgs);
void VMPSADBWOp(OpcodeArgs);
template <size_t ElementSize>
@ -1121,7 +1127,7 @@ private:
const X86Tables::DecodedOperand& Src1Op,
const X86Tables::DecodedOperand& Src2Op);
void Vector_CVT_Float_To_FloatImpl(OpcodeArgs, size_t DstElementSize, size_t SrcElementSize);
void Vector_CVT_Float_To_FloatImpl(OpcodeArgs, size_t DstElementSize, size_t SrcElementSize, bool IsAVX);
OrderedNode* Vector_CVT_Float_To_IntImpl(OpcodeArgs, size_t SrcElementSize, bool Narrow, bool HostRoundingMode);

View File

@ -45,11 +45,35 @@ void OpDispatchBuilder::MOVAPS_MOVAPDOp(OpcodeArgs) {
StoreResult(FPRClass, Op, Src, -1);
}
// Dispatch handler for VMOVAPS/VMOVAPD: loads the first source operand as a
// vector and writes it to the destination operand.
void OpDispatchBuilder::VMOVAPS_VMOVAPDOp(OpcodeArgs) {
  const bool IsXMMSized = GetSrcSize(Op) == Core::CPUState::XMM_SSE_REG_SIZE;

  OrderedNode *Loaded = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);

  // Only an XMM-sized move whose destination is a register gets the explicit
  // 16-byte vector move.
  // NOTE(review): presumably this move is what clears the upper half of the
  // 256-bit register - confirm against the _VMov IR op definition.
  const bool NeedsExplicitMove = IsXMMSized && Op->Dest.IsGPR();
  OrderedNode *Value = NeedsExplicitMove ? _VMov(16, Loaded) : Loaded;

  StoreResult(FPRClass, Op, Value, -1);
}
// Dispatch handler for MOVUPS/MOVUPD: loads the first source operand with an
// alignment of 1 and stores it to the destination with the same alignment.
void OpDispatchBuilder::MOVUPS_MOVUPDOp(OpcodeArgs) {
  OrderedNode *const Value = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags, {.Align = 1});

  StoreResult(FPRClass, Op, Value, 1);
}
// Dispatch handler for VMOVUPS/VMOVUPD: loads the first source operand with an
// alignment of 1 and writes it to the destination operand.
void OpDispatchBuilder::VMOVUPS_VMOVUPDOp(OpcodeArgs) {
  const bool IsXMMSized = GetSrcSize(Op) == Core::CPUState::XMM_SSE_REG_SIZE;

  OrderedNode *Loaded = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags, {.Align = 1});

  // Only an XMM-sized move whose destination is a register gets the explicit
  // 16-byte vector move.
  // NOTE(review): presumably this move is what clears the upper half of the
  // 256-bit register - confirm against the _VMov IR op definition.
  const bool NeedsExplicitMove = IsXMMSized && Op->Dest.IsGPR();
  OrderedNode *Value = NeedsExplicitMove ? _VMov(16, Loaded) : Loaded;

  StoreResult(FPRClass, Op, Value, 1);
}
void OpDispatchBuilder::MOVHPDOp(OpcodeArgs) {
if (Op->Dest.IsGPR()) {
if (Op->Src[0].IsGPR()) {
@ -1810,18 +1834,26 @@ void OpDispatchBuilder::PINSROp<8>(OpcodeArgs);
// Dispatch handler for VPINSRB: performs the insert through PINSROpImpl with
// an element size of 1 and stores the resulting vector.
void OpDispatchBuilder::VPINSRBOp(OpcodeArgs) {
  OrderedNode *Inserted = PINSROpImpl(Op, 1, Op->Src[0], Op->Src[1], Op->Src[2]);

  // When the destination and the first source name the same register, the
  // result is routed through an explicit 16-byte vector move before the store.
  const bool DestAliasesSource = Op->Dest.Data.GPR.GPR == Op->Src[0].Data.GPR.GPR;
  OrderedNode *Value = DestAliasesSource ? _VMov(16, Inserted) : Inserted;

  StoreResult(FPRClass, Op, Value, -1);
}
// Dispatch handler for VPINSRD/VPINSRQ: performs the insert through
// PINSROpImpl, using the operation's source size as the element size, and
// stores the resulting vector.
void OpDispatchBuilder::VPINSRDQOp(OpcodeArgs) {
  const auto ElementBytes = GetSrcSize(Op);

  OrderedNode *Inserted = PINSROpImpl(Op, ElementBytes, Op->Src[0], Op->Src[1], Op->Src[2]);

  // When the destination and the first source name the same register, the
  // result is routed through an explicit 16-byte vector move before the store.
  const bool DestAliasesSource = Op->Dest.Data.GPR.GPR == Op->Src[0].Data.GPR.GPR;
  OrderedNode *Value = DestAliasesSource ? _VMov(16, Inserted) : Inserted;

  StoreResult(FPRClass, Op, Value, -1);
}
// Dispatch handler for VPINSRW: performs the insert through PINSROpImpl with
// an element size of 2 and stores the resulting vector.
void OpDispatchBuilder::VPINSRWOp(OpcodeArgs) {
  OrderedNode *Inserted = PINSROpImpl(Op, 2, Op->Src[0], Op->Src[1], Op->Src[2]);

  // When the destination and the first source name the same register, the
  // result is routed through an explicit 16-byte vector move before the store.
  const bool DestAliasesSource = Op->Dest.Data.GPR.GPR == Op->Src[0].Data.GPR.GPR;
  OrderedNode *Value = DestAliasesSource ? _VMov(16, Inserted) : Inserted;

  StoreResult(FPRClass, Op, Value, -1);
}
@ -2026,10 +2058,16 @@ void OpDispatchBuilder::PSRLDOp<8>(OpcodeArgs);
// Dispatch handler for the VEX-encoded PSRLD-family operation: both operands
// are loaded as vectors and combined by PSRLDOpImpl at the given ElementSize.
template <size_t ElementSize>
void OpDispatchBuilder::VPSRLDOp(OpcodeArgs) {
  const bool IsXMMSized = GetDstSize(Op) == Core::CPUState::XMM_SSE_REG_SIZE;

  OrderedNode *Value = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
  OrderedNode *ShiftAmount = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);

  OrderedNode *Shifted = PSRLDOpImpl(Op, ElementSize, Value, ShiftAmount);

  // XMM-sized results pass through an explicit 16-byte vector move.
  // NOTE(review): presumably this clears the upper half of the 256-bit
  // register - confirm against the _VMov IR op definition.
  OrderedNode *Final = IsXMMSized ? _VMov(16, Shifted) : Shifted;

  StoreResult(FPRClass, Op, Final, -1);
}
@ -2066,6 +2104,7 @@ void OpDispatchBuilder::PSRLI<8>(OpcodeArgs);
template <size_t ElementSize>
void OpDispatchBuilder::VPSRLIOp(OpcodeArgs) {
const auto Size = GetSrcSize(Op);
const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;
LOGMAN_THROW_A_FMT(Op->Src[1].IsLiteral(), "Src1 needs to be literal here");
const uint64_t ShiftConstant = Op->Src[1].Data.Literal.Value;
@ -2075,6 +2114,10 @@ void OpDispatchBuilder::VPSRLIOp(OpcodeArgs) {
if (ShiftConstant != 0) [[likely]] {
Result = _VUShrI(Size, ElementSize, Src, ShiftConstant);
} else {
if (Is128Bit) {
Result = _VMov(16, Result);
}
}
StoreResult(FPRClass, Op, Result, -1);
@ -2123,9 +2166,15 @@ template <size_t ElementSize>
// Dispatch handler for the VEX-encoded shift-left-by-immediate operation.
// The second source operand must be a literal; it supplies the shift count
// that is passed to PSLLIImpl.
void OpDispatchBuilder::VPSLLIOp(OpcodeArgs) {
  LOGMAN_THROW_A_FMT(Op->Src[1].IsLiteral(), "Src1 needs to be literal here");
  const uint64_t ShiftCount = Op->Src[1].Data.Literal.Value;

  const bool IsXMMSized = GetDstSize(Op) == Core::CPUState::XMM_SSE_REG_SIZE;

  OrderedNode *Value = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
  OrderedNode *Shifted = PSLLIImpl(Op, ElementSize, Value, ShiftCount);

  // The explicit 16-byte vector move is only added for an XMM-sized operation
  // with a shift count of zero.
  // NOTE(review): presumably PSLLIImpl returns the source unchanged for a zero
  // shift, so nothing else would touch the upper half - confirm in PSLLIImpl.
  const bool NeedsExplicitMove = ShiftCount == 0 && IsXMMSized;
  OrderedNode *Final = NeedsExplicitMove ? _VMov(16, Shifted) : Shifted;

  StoreResult(FPRClass, Op, Final, -1);
}
@ -2162,10 +2211,16 @@ void OpDispatchBuilder::PSLL<8>(OpcodeArgs);
// Dispatch handler for the VEX-encoded PSLL-family operation: the first
// operand is loaded at the operation's size, the second with an explicit
// 16-byte size, and PSLLImpl combines them at the given ElementSize.
template <size_t ElementSize>
void OpDispatchBuilder::VPSLLOp(OpcodeArgs) {
  const bool IsXMMSized = GetDstSize(Op) == Core::CPUState::XMM_SSE_REG_SIZE;

  OrderedNode *Value = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
  OrderedNode *ShiftAmount = LoadSource_WithOpSize(FPRClass, Op, Op->Src[1], 16, Op->Flags);

  OrderedNode *Shifted = PSLLImpl(Op, ElementSize, Value, ShiftAmount);

  // XMM-sized results pass through an explicit 16-byte vector move.
  // NOTE(review): presumably this clears the upper half of the 256-bit
  // register - confirm against the _VMov IR op definition.
  OrderedNode *Final = IsXMMSized ? _VMov(16, Shifted) : Shifted;

  StoreResult(FPRClass, Op, Final, -1);
}
@ -2200,10 +2255,16 @@ void OpDispatchBuilder::PSRAOp<4>(OpcodeArgs);
// Dispatch handler for the VEX-encoded PSRA-family operation: both operands
// are loaded as vectors and combined by PSRAOpImpl at the given ElementSize.
template <size_t ElementSize>
void OpDispatchBuilder::VPSRAOp(OpcodeArgs) {
  const bool IsXMMSized = GetDstSize(Op) == Core::CPUState::XMM_SSE_REG_SIZE;

  OrderedNode *Value = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
  OrderedNode *ShiftAmount = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);

  OrderedNode *Shifted = PSRAOpImpl(Op, ElementSize, Value, ShiftAmount);

  // XMM-sized results pass through an explicit 16-byte vector move.
  // NOTE(review): presumably this clears the upper half of the 256-bit
  // register - confirm against the _VMov IR op definition.
  OrderedNode *Final = IsXMMSized ? _VMov(16, Shifted) : Shifted;

  StoreResult(FPRClass, Op, Final, -1);
}
@ -2242,7 +2303,11 @@ void OpDispatchBuilder::VPSRLDQOp(OpcodeArgs) {
OrderedNode *Result{};
if (Shift == 0) [[unlikely]] {
Result = Src;
if (Is128Bit) {
Result = _VMov(16, Src);
} else {
Result = Src;
}
} else {
Result = LoadAndCacheNamedVectorConstant(DstSize, FEXCore::IR::NamedVectorConstant::NAMED_VECTOR_ZERO);
@ -2292,7 +2357,12 @@ void OpDispatchBuilder::VPSLLDQOp(OpcodeArgs) {
OrderedNode *Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
OrderedNode *Result = Src;
if (Shift != 0) {
if (Shift == 0) {
if (Is128Bit) {
Result = _VMov(16, Result);
}
} else {
Result = LoadAndCacheNamedVectorConstant(DstSize, FEXCore::IR::NamedVectorConstant::NAMED_VECTOR_ZERO);
if (Is128Bit) {
if (Shift < DstSize) {
@ -2336,12 +2406,17 @@ void OpDispatchBuilder::VPSRAIOp(OpcodeArgs) {
LOGMAN_THROW_A_FMT(Op->Src[1].IsLiteral(), "Src1 needs to be literal here");
const uint64_t Shift = Op->Src[1].Data.Literal.Value;
const auto Size = GetDstSize(Op);
const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;
OrderedNode *Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
OrderedNode *Result = Src;
if (Shift != 0) [[likely]] {
Result = _VSShrI(Size, ElementSize, Src, Shift);
} else {
if (Is128Bit) {
Result = _VMov(16, Result);
}
}
StoreResult(FPRClass, Op, Result, -1);
@ -2637,12 +2712,12 @@ void OpDispatchBuilder::AVXScalar_CVT_Float_To_Float<4, 8>(OpcodeArgs);
template
void OpDispatchBuilder::AVXScalar_CVT_Float_To_Float<8, 4>(OpcodeArgs);
void OpDispatchBuilder::Vector_CVT_Float_To_FloatImpl(OpcodeArgs, size_t DstElementSize, size_t SrcElementSize) {
const auto IsFloatSrc = SrcElementSize == 4;
void OpDispatchBuilder::Vector_CVT_Float_To_FloatImpl(OpcodeArgs, size_t DstElementSize, size_t SrcElementSize, bool IsAVX) {
const auto SrcSize = GetSrcSize(Op);
const auto StoreSize = IsFloatSrc ? SrcSize
: 16;
const auto IsFloatSrc = SrcElementSize == 4;
const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE;
const auto LoadSize = IsFloatSrc && !Op->Src[0].IsGPR() ?
SrcSize / 2 :
SrcSize;
@ -2656,19 +2731,36 @@ void OpDispatchBuilder::Vector_CVT_Float_To_FloatImpl(OpcodeArgs, size_t DstElem
Result = _Vector_FToF(SrcSize, SrcElementSize >> 1, Src, SrcElementSize);
}
StoreResult_WithOpSize(FPRClass, Op, Op->Dest, Result, StoreSize, -1);
if (IsAVX) {
if (!IsFloatSrc && !Is128Bit) {
// VCVTPD2PS path
Result = _VMov(16, Result);
} else if (IsFloatSrc && Is128Bit) {
// VCVTPS2PD path
Result = _VMov(16, Result);
}
}
StoreResult(FPRClass, Op, Result, -1);
}
template<size_t DstElementSize, size_t SrcElementSize>
void OpDispatchBuilder::Vector_CVT_Float_To_Float(OpcodeArgs) {
Vector_CVT_Float_To_FloatImpl(Op, DstElementSize, SrcElementSize);
Vector_CVT_Float_To_FloatImpl(Op, DstElementSize, SrcElementSize, false);
}
template
void OpDispatchBuilder::Vector_CVT_Float_To_Float<4, 8>(OpcodeArgs);
template
void OpDispatchBuilder::Vector_CVT_Float_To_Float<8, 4>(OpcodeArgs);
// AVX entry point for the vector float-to-float conversion: forwards to the
// shared implementation with its IsAVX argument set.
template<size_t DstElementSize, size_t SrcElementSize>
void OpDispatchBuilder::AVXVector_CVT_Float_To_Float(OpcodeArgs) {
  constexpr bool IsAVX = true;
  Vector_CVT_Float_To_FloatImpl(Op, DstElementSize, SrcElementSize, IsAVX);
}
template
void OpDispatchBuilder::AVXVector_CVT_Float_To_Float<4, 8>(OpcodeArgs);
template
void OpDispatchBuilder::AVXVector_CVT_Float_To_Float<8, 4>(OpcodeArgs);
void OpDispatchBuilder::MMX_To_XMM_Vector_CVT_Int_To_Float(OpcodeArgs) {
OrderedNode *Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
@ -2760,8 +2852,14 @@ void OpDispatchBuilder::VMASKMOVOpImpl(OpcodeArgs, size_t ElementSize, size_t Da
OrderedNode *Address = MakeAddress(Op->Dest);
_VStoreVectorMasked(DataSize, ElementSize, Mask, Data, Address, Invalid(), MEM_OFFSET_SXTX, 1);
} else {
const auto Is128Bit = GetDstSize(Op) == Core::CPUState::XMM_SSE_REG_SIZE;
OrderedNode *Address = MakeAddress(DataOp);
OrderedNode *Result = _VLoadVectorMasked(DataSize, ElementSize, Mask, Address, Invalid(), MEM_OFFSET_SXTX, 1);
if (Is128Bit) {
Result = _VMov(16, Result);
}
StoreResult(FPRClass, Op, Result, -1);
}
}
@ -3823,10 +3921,16 @@ void OpDispatchBuilder::PMULHW<true>(OpcodeArgs);
// Dispatch handler for the VEX-encoded PMULHW/PMULHUW operation: loads both
// source operands and combines them through PMULHWOpImpl, with the Signed
// template argument passed straight through.
template <bool Signed>
void OpDispatchBuilder::VPMULHWOp(OpcodeArgs) {
  const bool IsXMMSized = GetDstSize(Op) == Core::CPUState::XMM_SSE_REG_SIZE;

  OrderedNode *Lhs = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
  OrderedNode *Rhs = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);

  OrderedNode *Product = PMULHWOpImpl(Op, Signed, Lhs, Rhs);

  // XMM-sized results pass through an explicit 16-byte vector move.
  // NOTE(review): presumably this clears the upper half of the 256-bit
  // register - confirm against the _VMov IR op definition.
  OrderedNode *Final = IsXMMSized ? _VMov(16, Product) : Product;

  StoreResult(FPRClass, Op, Final, -1);
}
@ -4906,12 +5010,14 @@ void OpDispatchBuilder::VBLENDPDOp(OpcodeArgs) {
const auto Selector = Op->Src[2].Data.Literal.Value;
if (Selector == 0) {
StoreResult(FPRClass, Op, Src1, -1);
OrderedNode *Result = Is256Bit ? Src1 : _VMov(16, Src1);
StoreResult(FPRClass, Op, Result, -1);
return;
}
// Only the first four bits of the 8-bit immediate are used, so only check them.
if (((Selector & 0b11) == 0b11 && !Is256Bit) || (Selector & 0b1111) == 0b1111) {
StoreResult(FPRClass, Op, Src2, -1);
OrderedNode *Result = Is256Bit ? Src2 : _VMov(16, Src2);
StoreResult(FPRClass, Op, Result, -1);
return;
}
@ -4940,11 +5046,13 @@ void OpDispatchBuilder::VPBLENDDOp(OpcodeArgs) {
// silly is happening, we have your back.
if (Selector == 0) {
StoreResult(FPRClass, Op, Src1, -1);
OrderedNode* Result = Is256Bit ? Src1 : _VMov(16, Src1);
StoreResult(FPRClass, Op, Result, -1);
return;
}
if (Selector == 0xFF && Is256Bit) {
StoreResult(FPRClass, Op, Src2, -1);
OrderedNode* Result = Is256Bit ? Src2 : _VMov(16, Src2);
StoreResult(FPRClass, Op, Result, -1);
return;
}
// The only bits we care about from the 8-bit immediate for 128-bit operations
@ -4952,17 +5060,21 @@ void OpDispatchBuilder::VPBLENDDOp(OpcodeArgs) {
// silliness is going on and the upper bits are being set even when they'll
// be ignored
if ((Selector & 0xF) == 0xF && !Is256Bit) {
StoreResult(FPRClass, Op, Src2, -1);
StoreResult(FPRClass, Op, _VMov(16, Src2), -1);
return;
}
const auto ZeroRegister = LoadAndCacheNamedVectorConstant(DstSize, FEXCore::IR::NamedVectorConstant::NAMED_VECTOR_ZERO);
OrderedNode *Result = VBLENDOpImpl(*this, DstSize, 4, Src1, Src2, ZeroRegister, Selector);
if (!Is256Bit) {
Result = _VMov(16, Result);
}
StoreResult(FPRClass, Op, Result, -1);
}
void OpDispatchBuilder::VPBLENDWOp(OpcodeArgs) {
const auto DstSize = GetDstSize(Op);
const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE;
OrderedNode *Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
OrderedNode *Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
@ -4971,11 +5083,13 @@ void OpDispatchBuilder::VPBLENDWOp(OpcodeArgs) {
const auto Selector = Op->Src[2].Data.Literal.Value;
if (Selector == 0) {
StoreResult(FPRClass, Op, Src1, -1);
OrderedNode *Result = Is128Bit ? _VMov(16, Src1) : Src1;
StoreResult(FPRClass, Op, Result, -1);
return;
}
if (Selector == 0xFF) {
StoreResult(FPRClass, Op, Src2, -1);
OrderedNode *Result = Is128Bit ? _VMov(16, Src2) : Src2;
StoreResult(FPRClass, Op, Result, -1);
return;
}
@ -4986,6 +5100,9 @@ void OpDispatchBuilder::VPBLENDWOp(OpcodeArgs) {
const auto ZeroRegister = LoadAndCacheNamedVectorConstant(DstSize, FEXCore::IR::NamedVectorConstant::NAMED_VECTOR_ZERO);
OrderedNode *Result = VBLENDOpImpl(*this, DstSize, 2, Src1, Src2, ZeroRegister, NewSelector);
if (Is128Bit) {
Result = _VMov(16, Result);
}
StoreResult(FPRClass, Op, Result, -1);
}

View File

@ -435,6 +435,58 @@
"mov v16.16b, v17.16b",
"fmax d16, d17, d18"
]
},
"vminps xmm0, xmm1, xmm2": {
"ExpectedInstructionCount": 3,
"Optimal": "No",
"Comment": [
"Map 1 0b00 0x5d 128-bit"
],
"ExpectedArm64ASM": [
"fcmgt v0.4s, v18.4s, v17.4s",
"mov v16.16b, v17.16b",
"bif v16.16b, v18.16b, v0.16b"
]
},
"vminps ymm0, ymm1, ymm2": {
"ExpectedInstructionCount": 5,
"Optimal": "No",
"Comment": [
"Map 1 0b00 0x5d 256-bit"
],
"ExpectedArm64ASM": [
"fcmgt p0.s, p7/z, z18.s, z17.s",
"not p0.b, p7/z, p0.b",
"mov z0.d, z17.d",
"mov z0.s, p0/m, z18.s",
"mov z16.d, z0.d"
]
},
"vminpd xmm0, xmm1, xmm2": {
"ExpectedInstructionCount": 3,
"Optimal": "No",
"Comment": [
"Map 1 0b01 0x5d 128-bit"
],
"ExpectedArm64ASM": [
"fcmgt v0.2d, v18.2d, v17.2d",
"mov v16.16b, v17.16b",
"bif v16.16b, v18.16b, v0.16b"
]
},
"vminpd ymm0, ymm1, ymm2": {
"ExpectedInstructionCount": 5,
"Optimal": "No",
"Comment": [
"Map 1 0b01 0x5d 256-bit"
],
"ExpectedArm64ASM": [
"fcmgt p0.d, p7/z, z18.d, z17.d",
"not p0.b, p7/z, p0.b",
"mov z0.d, z17.d",
"mov z0.d, p0/m, z18.d",
"mov z16.d, z0.d"
]
}
}
}

View File

@ -12,14 +12,13 @@
},
"Instructions": {
"vrsqrtps xmm0, xmm1": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"Map 1 0b00 0x52 128-bit"
],
"ExpectedArm64ASM": [
"frsqrte v2.4s, v17.4s",
"mov v16.16b, v2.16b"
"frsqrte v16.4s, v17.4s"
]
},
"vrsqrtps ymm0, ymm1": {
@ -46,14 +45,13 @@
]
},
"vrcpps xmm0, xmm1": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"Map 1 0b00 0x53 128-bit"
],
"ExpectedArm64ASM": [
"frecpe v2.4s, v17.4s",
"mov v16.16b, v2.16b"
"frecpe v16.4s, v17.4s"
]
},
"vrcpps ymm0, ymm1": {

View File

@ -11,14 +11,13 @@
},
"Instructions": {
"vrsqrtps xmm0, xmm1": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"Map 1 0b00 0x52 128-bit"
],
"ExpectedArm64ASM": [
"frsqrte v2.4s, v17.4s",
"mov v16.16b, v2.16b"
"frsqrte v16.4s, v17.4s"
]
},
"vrsqrtps ymm0, ymm1": {
@ -44,14 +43,13 @@
]
},
"vrcpps xmm0, xmm1": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"Map 1 0b00 0x53 128-bit"
],
"ExpectedArm64ASM": [
"frecpe v2.4s, v17.4s",
"mov v16.16b, v2.16b"
"frecpe v16.4s, v17.4s"
]
},
"vrcpps ymm0, ymm1": {

View File

@ -9,29 +9,25 @@
},
"Instructions": {
"pmulhuw xmm0, xmm1": {
"ExpectedInstructionCount": 3,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"SVE-256bit changes behaviour slightly",
"0x66 0x0f 0xe4"
],
"ExpectedArm64ASM": [
"movprfx z2, z16",
"umulh z2.h, p6/m, z2.h, z17.h",
"mov v16.16b, v2.16b"
"umulh z16.h, p6/m, z16.h, z17.h"
]
},
"pmulhw xmm0, xmm1": {
"ExpectedInstructionCount": 3,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"SVE-256bit changes behaviour slightly",
"0x66 0x0f 0xe5"
],
"ExpectedArm64ASM": [
"movprfx z2, z16",
"smulh z2.h, p6/m, z2.h, z17.h",
"mov v16.16b, v2.16b"
"smulh z16.h, p6/m, z16.h, z17.h"
]
}
}

File diff suppressed because it is too large Load Diff

View File

@ -10,15 +10,14 @@
},
"Instructions": {
"vaddsubpd xmm0, xmm1, xmm2": {
"ExpectedInstructionCount": 3,
"Optimal": "No",
"ExpectedInstructionCount": 2,
"Optimal": "Yes",
"Comment": [
"Map 1 0b01 0xd0 128-bit"
],
"ExpectedArm64ASM": [
"ext v2.16b, v18.16b, v18.16b, #8",
"fcadd v2.2d, v17.2d, v2.2d, #90",
"mov v16.16b, v2.16b"
"fcadd v16.2d, v17.2d, v2.2d, #90"
]
},
"vaddsubpd ymm0, ymm1, ymm2": {
@ -37,15 +36,14 @@
]
},
"vaddsubps xmm0, xmm1, xmm2": {
"ExpectedInstructionCount": 3,
"ExpectedInstructionCount": 2,
"Optimal": "No",
"Comment": [
"Map 1 0b11 0xd0 128-bit"
],
"ExpectedArm64ASM": [
"rev64 v2.4s, v18.4s",
"fcadd v2.4s, v17.4s, v2.4s, #90",
"mov v16.16b, v2.16b"
"fcadd v16.4s, v17.4s, v2.4s, #90"
]
},
"vaddsubps ymm0, ymm1, ymm2": {

File diff suppressed because it is too large Load Diff

View File

@ -437,8 +437,8 @@
"mov v2.s[0], v17.s[0]",
"mov v2.s[1], v17.s[0]",
"mov v2.s[2], v17.s[0]",
"mov v2.s[3], v17.s[0]",
"mov v16.16b, v2.16b"
"mov v16.16b, v2.16b",
"mov v16.s[3], v17.s[0]"
]
},
"vpermilps xmm0, xmm1, 01010101b": {
@ -452,8 +452,8 @@
"mov v2.s[0], v17.s[1]",
"mov v2.s[1], v17.s[1]",
"mov v2.s[2], v17.s[1]",
"mov v2.s[3], v17.s[1]",
"mov v16.16b, v2.16b"
"mov v16.16b, v2.16b",
"mov v16.s[3], v17.s[1]"
]
},
"vpermilps xmm0, xmm1, 10101010b": {
@ -467,8 +467,8 @@
"mov v2.s[0], v17.s[2]",
"mov v2.s[1], v17.s[2]",
"mov v2.s[2], v17.s[2]",
"mov v2.s[3], v17.s[2]",
"mov v16.16b, v2.16b"
"mov v16.16b, v2.16b",
"mov v16.s[3], v17.s[2]"
]
},
"vpermilps xmm0, xmm1, 11111111b": {
@ -482,8 +482,8 @@
"mov v2.s[0], v17.s[3]",
"mov v2.s[1], v17.s[3]",
"mov v2.s[2], v17.s[3]",
"mov v2.s[3], v17.s[3]",
"mov v16.16b, v2.16b"
"mov v16.16b, v2.16b",
"mov v16.s[3], v17.s[3]"
]
},
"vpermilps ymm0, ymm1, 00000000b": {
@ -667,8 +667,8 @@
"ExpectedArm64ASM": [
"movi v2.2d, #0x0",
"mov v2.d[0], v17.d[0]",
"mov v2.d[1], v17.d[0]",
"mov v16.16b, v2.16b"
"mov v16.16b, v2.16b",
"mov v16.d[1], v17.d[0]"
]
},
"vpermilpd xmm0, xmm1, 01b": {
@ -680,8 +680,8 @@
"ExpectedArm64ASM": [
"movi v2.2d, #0x0",
"mov v2.d[0], v17.d[1]",
"mov v2.d[1], v17.d[0]",
"mov v16.16b, v2.16b"
"mov v16.16b, v2.16b",
"mov v16.d[1], v17.d[0]"
]
},
"vpermilpd xmm0, xmm1, 10b": {
@ -693,8 +693,8 @@
"ExpectedArm64ASM": [
"movi v2.2d, #0x0",
"mov v2.d[0], v17.d[0]",
"mov v2.d[1], v17.d[1]",
"mov v16.16b, v2.16b"
"mov v16.16b, v2.16b",
"mov v16.d[1], v17.d[1]"
]
},
"vpermilpd xmm0, xmm1, 11b": {
@ -706,8 +706,8 @@
"ExpectedArm64ASM": [
"movi v2.2d, #0x0",
"mov v2.d[0], v17.d[1]",
"mov v2.d[1], v17.d[1]",
"mov v16.16b, v2.16b"
"mov v16.16b, v2.16b",
"mov v16.d[1], v17.d[1]"
]
},
"vpermilpd ymm0, ymm1, 0000b": {
@ -1517,63 +1517,58 @@
]
},
"vroundps xmm0, xmm1, 00000000b": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"nearest rounding",
"Map 3 0b01 0x08 128-bit"
],
"ExpectedArm64ASM": [
"frintn v2.4s, v17.4s",
"mov v16.16b, v2.16b"
"frintn v16.4s, v17.4s"
]
},
"vroundps xmm0, xmm1, 00000001b": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"-inf rounding",
"Map 3 0b01 0x08 128-bit"
],
"ExpectedArm64ASM": [
"frintm v2.4s, v17.4s",
"mov v16.16b, v2.16b"
"frintm v16.4s, v17.4s"
]
},
"vroundps xmm0, xmm1, 00000010b": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"+inf rounding",
"Map 3 0b01 0x08 128-bit"
],
"ExpectedArm64ASM": [
"frintp v2.4s, v17.4s",
"mov v16.16b, v2.16b"
"frintp v16.4s, v17.4s"
]
},
"vroundps xmm0, xmm1, 00000011b": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"truncate rounding",
"Map 3 0b01 0x08 128-bit"
],
"ExpectedArm64ASM": [
"frintz v2.4s, v17.4s",
"mov v16.16b, v2.16b"
"frintz v16.4s, v17.4s"
]
},
"vroundps xmm0, xmm1, 00000100b": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"host mode rounding",
"Map 3 0b01 0x08 128-bit"
],
"ExpectedArm64ASM": [
"frinti v2.4s, v17.4s",
"mov v16.16b, v2.16b"
"frinti v16.4s, v17.4s"
]
},
"vroundps ymm0, ymm1, 00000000b": {
@ -1632,63 +1627,58 @@
]
},
"vroundpd xmm0, xmm1, 00000000b": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"nearest rounding",
"Map 3 0b01 0x09 128-bit"
],
"ExpectedArm64ASM": [
"frintn v2.2d, v17.2d",
"mov v16.16b, v2.16b"
"frintn v16.2d, v17.2d"
]
},
"vroundpd xmm0, xmm1, 00000001b": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"-inf rounding",
"Map 3 0b01 0x09 128-bit"
],
"ExpectedArm64ASM": [
"frintm v2.2d, v17.2d",
"mov v16.16b, v2.16b"
"frintm v16.2d, v17.2d"
]
},
"vroundpd xmm0, xmm1, 00000010b": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"+inf rounding",
"Map 3 0b01 0x09 128-bit"
],
"ExpectedArm64ASM": [
"frintp v2.2d, v17.2d",
"mov v16.16b, v2.16b"
"frintp v16.2d, v17.2d"
]
},
"vroundpd xmm0, xmm1, 00000011b": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"truncate rounding",
"Map 3 0b01 0x09 128-bit"
],
"ExpectedArm64ASM": [
"frintz v2.2d, v17.2d",
"mov v16.16b, v2.16b"
"frintz v16.2d, v17.2d"
]
},
"vroundpd xmm0, xmm1, 00000100b": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"host mode rounding",
"Map 3 0b01 0x09 128-bit"
],
"ExpectedArm64ASM": [
"frinti v2.2d, v17.2d",
"mov v16.16b, v2.16b"
"frinti v16.2d, v17.2d"
]
},
"vroundpd ymm0, ymm1, 00000000b": {
@ -1748,7 +1738,7 @@
},
"vroundss xmm0, xmm1, 00000000b": {
"ExpectedInstructionCount": 3,
"Optimal": "No",
"Optimal": "Yes",
"Comment": [
"nearest rounding",
"Map 3 0b01 0x0a 128-bit"
@ -1761,7 +1751,7 @@
},
"vroundss xmm0, xmm1, 00000001b": {
"ExpectedInstructionCount": 3,
"Optimal": "No",
"Optimal": "Yes",
"Comment": [
"-inf rounding",
"Map 3 0b01 0x0a 128-bit"
@ -1774,7 +1764,7 @@
},
"vroundss xmm0, xmm1, 00000010b": {
"ExpectedInstructionCount": 3,
"Optimal": "No",
"Optimal": "Yes",
"Comment": [
"+inf rounding",
"Map 3 0b01 0x0a 128-bit"
@ -1787,7 +1777,7 @@
},
"vroundss xmm0, xmm1, 00000011b": {
"ExpectedInstructionCount": 3,
"Optimal": "No",
"Optimal": "Yes",
"Comment": [
"truncate rounding",
"Map 3 0b01 0x0a 128-bit"
@ -1800,7 +1790,7 @@
},
"vroundss xmm0, xmm1, 00000100b": {
"ExpectedInstructionCount": 3,
"Optimal": "No",
"Optimal": "Yes",
"Comment": [
"host mode rounding",
"Map 3 0b01 0x0a 128-bit"
@ -1813,7 +1803,7 @@
},
"vroundsd xmm0, xmm1, 00000000b": {
"ExpectedInstructionCount": 3,
"Optimal": "No",
"Optimal": "Yes",
"Comment": [
"nearest rounding",
"Map 3 0b01 0x0b 128-bit"
@ -1826,7 +1816,7 @@
},
"vroundsd xmm0, xmm1, 00000001b": {
"ExpectedInstructionCount": 3,
"Optimal": "No",
"Optimal": "Yes",
"Comment": [
"-inf rounding",
"Map 3 0b01 0x0b 128-bit"
@ -1839,7 +1829,7 @@
},
"vroundsd xmm0, xmm1, 00000010b": {
"ExpectedInstructionCount": 3,
"Optimal": "No",
"Optimal": "Yes",
"Comment": [
"+inf rounding",
"Map 3 0b01 0x0b 128-bit"
@ -1852,7 +1842,7 @@
},
"vroundsd xmm0, xmm1, 00000011b": {
"ExpectedInstructionCount": 3,
"Optimal": "No",
"Optimal": "Yes",
"Comment": [
"truncate rounding",
"Map 3 0b01 0x0b 128-bit"
@ -1865,7 +1855,7 @@
},
"vroundsd xmm0, xmm1, 00000100b": {
"ExpectedInstructionCount": 3,
"Optimal": "No",
"Optimal": "Yes",
"Comment": [
"host mode rounding",
"Map 3 0b01 0x0b 128-bit"
@ -1993,8 +1983,8 @@
"ExpectedArm64ASM": [
"movi v2.2d, #0x0",
"mov v2.d[0], v18.d[0]",
"mov v2.d[1], v17.d[1]",
"mov v16.16b, v2.16b"
"mov v16.16b, v2.16b",
"mov v16.d[1], v17.d[1]"
]
},
"vblendpd xmm0, xmm1, xmm2, 10b": {
@ -2006,8 +1996,8 @@
"ExpectedArm64ASM": [
"movi v2.2d, #0x0",
"mov v2.d[0], v17.d[0]",
"mov v2.d[1], v18.d[1]",
"mov v16.16b, v2.16b"
"mov v16.16b, v2.16b",
"mov v16.d[1], v18.d[1]"
]
},
"vblendpd xmm0, xmm1, xmm2, 11b": {
@ -2553,48 +2543,44 @@
]
},
"vpalignr xmm0, xmm1, xmm2, 0": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"Map 3 0b01 0x0f 128-bit"
],
"ExpectedArm64ASM": [
"ext v2.16b, v18.16b, v17.16b, #0",
"mov v16.16b, v2.16b"
"ext v16.16b, v18.16b, v17.16b, #0"
]
},
"vpalignr xmm0, xmm1, xmm2, 1": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"Map 3 0b01 0x0f 128-bit"
],
"ExpectedArm64ASM": [
"ext v2.16b, v18.16b, v17.16b, #1",
"mov v16.16b, v2.16b"
"ext v16.16b, v18.16b, v17.16b, #1"
]
},
"vpalignr xmm0, xmm1, xmm2, 15": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"Map 3 0b01 0x0f 128-bit"
],
"ExpectedArm64ASM": [
"ext v2.16b, v18.16b, v17.16b, #15",
"mov v16.16b, v2.16b"
"ext v16.16b, v18.16b, v17.16b, #15"
]
},
"vpalignr xmm0, xmm1, xmm2, 16": {
"ExpectedInstructionCount": 3,
"Optimal": "No",
"ExpectedInstructionCount": 2,
"Optimal": "Yes",
"Comment": [
"Map 3 0b01 0x0f 128-bit"
],
"ExpectedArm64ASM": [
"movi v0.2d, #0x0",
"ext v2.16b, v17.16b, v0.16b, #0",
"mov v16.16b, v2.16b"
"ext v16.16b, v17.16b, v0.16b, #0"
]
},
"vpalignr ymm0, ymm1, ymm2, 0": {
@ -2959,111 +2945,138 @@
"Map 3 0b01 0x1D 256-bit"
]
},
"vpinsrb xmm0, xmm1, eax, 0": {
"vpinsrb xmm0, xmm0, eax, 0": {
"ExpectedInstructionCount": 3,
"Optimal": "No",
"Comment": [
"Map 3 0b01 0x20 128-bit"
],
"ExpectedArm64ASM": [
"mov v2.16b, v17.16b",
"mov v2.16b, v16.16b",
"mov v2.b[0], w4",
"mov v16.16b, v2.16b"
]
},
"vpinsrb xmm0, xmm1, eax, 15": {
"ExpectedInstructionCount": 3,
"Optimal": "No",
"vpinsrb xmm0, xmm1, eax, 0": {
"ExpectedInstructionCount": 2,
"Optimal": "Yes",
"Comment": [
"Map 3 0b01 0x20 128-bit"
],
"ExpectedArm64ASM": [
"mov v2.16b, v17.16b",
"mov v2.b[15], w4",
"mov v16.16b, v2.16b"
"mov v16.16b, v17.16b",
"mov v16.b[0], w4"
]
},
"vpinsrb xmm0, xmm1, eax, 15": {
"ExpectedInstructionCount": 2,
"Optimal": "Yes",
"Comment": [
"Map 3 0b01 0x20 128-bit"
],
"ExpectedArm64ASM": [
"mov v16.16b, v17.16b",
"mov v16.b[15], w4"
]
},
"vinsertps xmm0, xmm1, xmm2, ((0b00 << 6) | (0b00 << 4) | (0b0000))": {
"ExpectedInstructionCount": 3,
"Optimal": "No",
"ExpectedInstructionCount": 2,
"Optimal": "Yes",
"Comment": [
"Map 3 0b01 0x21 128-bit"
],
"ExpectedArm64ASM": [
"mov v2.16b, v17.16b",
"mov v2.s[0], v18.s[0]",
"mov v16.16b, v2.16b"
"mov v16.16b, v17.16b",
"mov v16.s[0], v18.s[0]"
]
},
"vinsertps xmm0, xmm1, xmm2, ((0b00 << 6) | (0b00 << 4) | (0b1111))": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"Map 3 0b01 0x21 128-bit"
],
"ExpectedArm64ASM": [
"movi v2.2d, #0x0",
"mov v16.16b, v2.16b"
"movi v16.2d, #0x0"
]
},
"vinsertps xmm0, xmm1, xmm2, ((0b11 << 6) | (0b11 << 4) | (0b0000))": {
"ExpectedInstructionCount": 3,
"Optimal": "No",
"ExpectedInstructionCount": 2,
"Optimal": "Yes",
"Comment": [
"Map 3 0b01 0x21 128-bit"
],
"ExpectedArm64ASM": [
"mov v2.16b, v17.16b",
"mov v2.s[3], v18.s[3]",
"mov v16.16b, v2.16b"
"mov v16.16b, v17.16b",
"mov v16.s[3], v18.s[3]"
]
},
"vpinsrd xmm0, xmm1, eax, 0": {
"vpinsrd xmm0, xmm0, eax, 0": {
"ExpectedInstructionCount": 3,
"Optimal": "No",
"Comment": [
"Map 3 0b01 0x22 128-bit"
],
"ExpectedArm64ASM": [
"mov v2.16b, v17.16b",
"mov v2.16b, v16.16b",
"mov v2.s[0], w4",
"mov v16.16b, v2.16b"
]
},
"vpinsrd xmm0, xmm1, eax, 3": {
"ExpectedInstructionCount": 3,
"Optimal": "No",
"vpinsrd xmm0, xmm1, eax, 0": {
"ExpectedInstructionCount": 2,
"Optimal": "Yes",
"Comment": [
"Map 3 0b01 0x22 128-bit"
],
"ExpectedArm64ASM": [
"mov v2.16b, v17.16b",
"mov v2.s[3], w4",
"mov v16.16b, v2.16b"
"mov v16.16b, v17.16b",
"mov v16.s[0], w4"
]
},
"vpinsrq xmm0, xmm1, rax, 0": {
"vpinsrd xmm0, xmm1, eax, 3": {
"ExpectedInstructionCount": 2,
"Optimal": "Yes",
"Comment": [
"Map 3 0b01 0x22 128-bit"
],
"ExpectedArm64ASM": [
"mov v16.16b, v17.16b",
"mov v16.s[3], w4"
]
},
"vpinsrq xmm0, xmm0, rax, 0": {
"ExpectedInstructionCount": 3,
"Optimal": "No",
"Comment": [
"Map 3 0b01 0x22 128-bit"
],
"ExpectedArm64ASM": [
"mov v2.16b, v17.16b",
"mov v2.16b, v16.16b",
"mov v2.d[0], x4",
"mov v16.16b, v2.16b"
]
},
"vpinsrq xmm0, xmm1, rax, 1": {
"ExpectedInstructionCount": 3,
"Optimal": "No",
"vpinsrq xmm0, xmm1, rax, 0": {
"ExpectedInstructionCount": 2,
"Optimal": "Yes",
"Comment": [
"Map 3 0b01 0x22 128-bit"
],
"ExpectedArm64ASM": [
"mov v2.16b, v17.16b",
"mov v2.d[1], x4",
"mov v16.16b, v2.16b"
"mov v16.16b, v17.16b",
"mov v16.d[0], x4"
]
},
"vpinsrq xmm0, xmm1, rax, 1": {
"ExpectedInstructionCount": 2,
"Optimal": "Yes",
"Comment": [
"Map 3 0b01 0x22 128-bit"
],
"ExpectedArm64ASM": [
"mov v16.16b, v17.16b",
"mov v16.d[1], x4"
]
},
"vinserti128 ymm0, ymm1, xmm2, 0": {
@ -3113,14 +3126,13 @@
]
},
"vdpps xmm0, xmm1, xmm2, 00000000b": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"Map 3 0b01 0x40 128-bit"
],
"ExpectedArm64ASM": [
"movi v2.2d, #0x0",
"mov v16.16b, v2.16b"
"movi v16.2d, #0x0"
]
},
"vdpps xmm0, xmm1, xmm2, 00001111b": {
@ -3141,19 +3153,18 @@
"mov v2.s[0], v3.s[0]",
"mov v2.s[1], v3.s[0]",
"mov v2.s[2], v3.s[0]",
"mov v2.s[3], v3.s[0]",
"mov v16.16b, v2.16b"
"mov v16.16b, v2.16b",
"mov v16.s[3], v3.s[0]"
]
},
"vdpps xmm0, xmm1, xmm2, 11110000b": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"Map 3 0b01 0x40 128-bit"
],
"ExpectedArm64ASM": [
"movi v2.2d, #0x0",
"mov v16.16b, v2.16b"
"movi v16.2d, #0x0"
]
},
"vdpps xmm0, xmm1, xmm2, 11111111b": {
@ -3170,8 +3181,8 @@
"mov v2.s[0], v3.s[0]",
"mov v2.s[1], v3.s[0]",
"mov v2.s[2], v3.s[0]",
"mov v2.s[3], v3.s[0]",
"mov v16.16b, v2.16b"
"mov v16.16b, v2.16b",
"mov v16.s[3], v3.s[0]"
]
},
"vdpps ymm0, ymm1, ymm2, 00000000b": {
@ -3335,14 +3346,13 @@
]
},
"vdppd xmm0, xmm1, xmm2, 00000000b": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"Map 3 0b01 0x41 128-bit"
],
"ExpectedArm64ASM": [
"movi v2.2d, #0x0",
"mov v16.16b, v2.16b"
"movi v16.2d, #0x0"
]
},
"vdppd xmm0, xmm1, xmm2, 00001111b": {
@ -3358,19 +3368,18 @@
"mov v3.d[1], v2.d[0]",
"faddp v3.2d, v3.2d, v2.2d",
"mov v2.d[0], v3.d[0]",
"mov v2.d[1], v3.d[0]",
"mov v16.16b, v2.16b"
"mov v16.16b, v2.16b",
"mov v16.d[1], v3.d[0]"
]
},
"vdppd xmm0, xmm1, xmm2, 11110000b": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"Map 3 0b01 0x41 128-bit"
],
"ExpectedArm64ASM": [
"movi v2.2d, #0x0",
"mov v16.16b, v2.16b"
"movi v16.2d, #0x0"
]
},
"vdppd xmm0, xmm1, xmm2, 11111111b": {
@ -3384,12 +3393,12 @@
"fmul v3.2d, v17.2d, v18.2d",
"faddp v3.2d, v3.2d, v2.2d",
"mov v2.d[0], v3.d[0]",
"mov v2.d[1], v3.d[0]",
"mov v16.16b, v2.16b"
"mov v16.16b, v2.16b",
"mov v16.d[1], v3.d[0]"
]
},
"vmpsadbw xmm0, xmm1, xmm2, 000b": {
"ExpectedInstructionCount": 15,
"ExpectedInstructionCount": 14,
"Optimal": "No",
"Comment": [
"Map 3 0b01 0x42 128-bit"
@ -3408,12 +3417,11 @@
"addp v2.8h, v4.8h, v2.8h",
"trn1 v4.4s, v3.4s, v2.4s",
"trn2 v2.4s, v3.4s, v2.4s",
"addp v2.8h, v4.8h, v2.8h",
"mov v16.16b, v2.16b"
"addp v16.8h, v4.8h, v2.8h"
]
},
"vmpsadbw xmm0, xmm1, xmm2, 001b": {
"ExpectedInstructionCount": 15,
"ExpectedInstructionCount": 14,
"Optimal": "No",
"Comment": [
"Map 3 0b01 0x42 128-bit"
@ -3432,12 +3440,11 @@
"addp v2.8h, v4.8h, v2.8h",
"trn1 v4.4s, v3.4s, v2.4s",
"trn2 v2.4s, v3.4s, v2.4s",
"addp v2.8h, v4.8h, v2.8h",
"mov v16.16b, v2.16b"
"addp v16.8h, v4.8h, v2.8h"
]
},
"vmpsadbw xmm0, xmm1, xmm2, 010b": {
"ExpectedInstructionCount": 15,
"ExpectedInstructionCount": 14,
"Optimal": "No",
"Comment": [
"Map 3 0b01 0x42 128-bit"
@ -3456,12 +3463,11 @@
"addp v2.8h, v4.8h, v2.8h",
"trn1 v4.4s, v3.4s, v2.4s",
"trn2 v2.4s, v3.4s, v2.4s",
"addp v2.8h, v4.8h, v2.8h",
"mov v16.16b, v2.16b"
"addp v16.8h, v4.8h, v2.8h"
]
},
"vmpsadbw xmm0, xmm1, xmm2, 011b": {
"ExpectedInstructionCount": 15,
"ExpectedInstructionCount": 14,
"Optimal": "No",
"Comment": [
"Map 3 0b01 0x42 128-bit"
@ -3480,12 +3486,11 @@
"addp v2.8h, v4.8h, v2.8h",
"trn1 v4.4s, v3.4s, v2.4s",
"trn2 v2.4s, v3.4s, v2.4s",
"addp v2.8h, v4.8h, v2.8h",
"mov v16.16b, v2.16b"
"addp v16.8h, v4.8h, v2.8h"
]
},
"vmpsadbw xmm0, xmm1, xmm2, 100b": {
"ExpectedInstructionCount": 15,
"ExpectedInstructionCount": 14,
"Optimal": "No",
"Comment": [
"Map 3 0b01 0x42 128-bit"
@ -3504,12 +3509,11 @@
"addp v2.8h, v4.8h, v2.8h",
"trn1 v4.4s, v3.4s, v2.4s",
"trn2 v2.4s, v3.4s, v2.4s",
"addp v2.8h, v4.8h, v2.8h",
"mov v16.16b, v2.16b"
"addp v16.8h, v4.8h, v2.8h"
]
},
"vmpsadbw xmm0, xmm1, xmm2, 101b": {
"ExpectedInstructionCount": 15,
"ExpectedInstructionCount": 14,
"Optimal": "No",
"Comment": [
"Map 3 0b01 0x42 128-bit"
@ -3528,12 +3532,11 @@
"addp v2.8h, v4.8h, v2.8h",
"trn1 v4.4s, v3.4s, v2.4s",
"trn2 v2.4s, v3.4s, v2.4s",
"addp v2.8h, v4.8h, v2.8h",
"mov v16.16b, v2.16b"
"addp v16.8h, v4.8h, v2.8h"
]
},
"vmpsadbw xmm0, xmm1, xmm2, 110b": {
"ExpectedInstructionCount": 15,
"ExpectedInstructionCount": 14,
"Optimal": "No",
"Comment": [
"Map 3 0b01 0x42 128-bit"
@ -3552,12 +3555,11 @@
"addp v2.8h, v4.8h, v2.8h",
"trn1 v4.4s, v3.4s, v2.4s",
"trn2 v2.4s, v3.4s, v2.4s",
"addp v2.8h, v4.8h, v2.8h",
"mov v16.16b, v2.16b"
"addp v16.8h, v4.8h, v2.8h"
]
},
"vmpsadbw xmm0, xmm1, xmm2, 111b": {
"ExpectedInstructionCount": 15,
"ExpectedInstructionCount": 14,
"Optimal": "No",
"Comment": [
"Map 3 0b01 0x42 128-bit"
@ -3576,8 +3578,7 @@
"addp v2.8h, v4.8h, v2.8h",
"trn1 v4.4s, v3.4s, v2.4s",
"trn2 v2.4s, v3.4s, v2.4s",
"addp v2.8h, v4.8h, v2.8h",
"mov v16.16b, v2.16b"
"addp v16.8h, v4.8h, v2.8h"
]
},
"vmpsadbw ymm0, ymm1, ymm2, 000b": {
@ -3925,49 +3926,45 @@
]
},
"vpclmulqdq xmm0, xmm1, xmm2, 00000b": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"Map 3 0b01 0x44 128-bit"
],
"ExpectedArm64ASM": [
"unallocated (Unallocated)",
"mov v16.16b, v2.16b"
"unallocated (Unallocated)"
]
},
"vpclmulqdq xmm0, xmm1, xmm2, 00001b": {
"ExpectedInstructionCount": 3,
"ExpectedInstructionCount": 2,
"Optimal": "No",
"Comment": [
"Map 3 0b01 0x44 128-bit"
],
"ExpectedArm64ASM": [
"dup v0.2d, v17.d[1]",
"unallocated (Unallocated)",
"mov v16.16b, v2.16b"
"unallocated (Unallocated)"
]
},
"vpclmulqdq xmm0, xmm1, xmm2, 10000b": {
"ExpectedInstructionCount": 3,
"Optimal": "No",
"Comment": [
"Map 3 0b01 0x44 128-bit"
],
"ExpectedArm64ASM": [
"dup v0.2d, v18.d[1]",
"unallocated (Unallocated)",
"mov v16.16b, v2.16b"
]
},
"vpclmulqdq xmm0, xmm1, xmm2, 10001b": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"Comment": [
"Map 3 0b01 0x44 128-bit"
],
"ExpectedArm64ASM": [
"unallocated (Unallocated)",
"mov v16.16b, v2.16b"
"dup v0.2d, v18.d[1]",
"unallocated (Unallocated)"
]
},
"vpclmulqdq xmm0, xmm1, xmm2, 10001b": {
"ExpectedInstructionCount": 1,
"Optimal": "No",
"Comment": [
"Map 3 0b01 0x44 128-bit"
],
"ExpectedArm64ASM": [
"unallocated (Unallocated)"
]
},
"vpclmulqdq ymm0, ymm1, ymm2, 00000b": {
@ -4384,8 +4381,8 @@
],
"ExpectedArm64ASM": [
"sshr v2.4s, v19.4s, #31",
"bsl v2.16b, v18.16b, v17.16b",
"mov v16.16b, v2.16b"
"mov v16.16b, v2.16b",
"bsl v16.16b, v18.16b, v17.16b"
]
},
"vblendvps ymm0, ymm1, ymm2, ymm3": {
@ -4410,8 +4407,8 @@
],
"ExpectedArm64ASM": [
"sshr v2.2d, v19.2d, #63",
"bsl v2.16b, v18.16b, v17.16b",
"mov v16.16b, v2.16b"
"mov v16.16b, v2.16b",
"bsl v16.16b, v18.16b, v17.16b"
]
},
"vblendvpd ymm0, ymm1, ymm2, ymm3": {
@ -4436,8 +4433,8 @@
],
"ExpectedArm64ASM": [
"sshr v2.16b, v19.16b, #7",
"bsl v2.16b, v18.16b, v17.16b",
"mov v16.16b, v2.16b"
"mov v16.16b, v2.16b",
"bsl v16.16b, v18.16b, v17.16b"
]
},
"vpblendvb ymm0, ymm1, ymm2, ymm3": {
@ -4695,7 +4692,7 @@
]
},
"vaeskeygenassist xmm0, xmm1, 0": {
"ExpectedInstructionCount": 6,
"ExpectedInstructionCount": 5,
"Optimal": "No",
"Comment": [
"Map 3 0b01 0xdf 128-bit"
@ -4703,14 +4700,13 @@
"ExpectedArm64ASM": [
"ldr q2, [x28, #2000]",
"movi v3.2d, #0x0",
"mov v2.16b, v17.16b",
"mov v16.16b, v17.16b",
"unimplemented (Unimplemented)",
"tbl v2.16b, {v2.16b}, v2.16b",
"mov v16.16b, v2.16b"
"tbl v16.16b, {v16.16b}, v2.16b"
]
},
"vaeskeygenassist xmm0, xmm1, 0xFF": {
"ExpectedInstructionCount": 9,
"ExpectedInstructionCount": 8,
"Optimal": "No",
"Comment": [
"Map 3 0b01 0xdf 128-bit"
@ -4718,13 +4714,12 @@
"ExpectedArm64ASM": [
"ldr q2, [x28, #2000]",
"movi v3.2d, #0x0",
"mov v2.16b, v17.16b",
"mov v16.16b, v17.16b",
"unimplemented (Unimplemented)",
"tbl v2.16b, {v2.16b}, v2.16b",
"tbl v16.16b, {v16.16b}, v2.16b",
"mov x0, #0xff00000000",
"dup v1.2d, x0",
"eor v2.16b, v2.16b, v1.16b",
"mov v16.16b, v2.16b"
"eor v16.16b, v16.16b, v1.16b"
]
},
"rorx eax, ebx, 0": {

View File

@ -18,25 +18,23 @@
]
},
"vpsrlw xmm0, xmm1, 15": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"Map group 12 0b010 128-bit"
],
"ExpectedArm64ASM": [
"ushr v2.8h, v17.8h, #15",
"mov v16.16b, v2.16b"
"ushr v16.8h, v17.8h, #15"
]
},
"vpsrlw xmm0, xmm1, 16": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"Map group 12 0b010 128-bit"
],
"ExpectedArm64ASM": [
"movi v2.2d, #0x0",
"mov v16.16b, v2.16b"
"movi v16.2d, #0x0"
]
},
"vpsrlw ymm0, ymm1, 0": {
@ -81,25 +79,23 @@
]
},
"vpsraw xmm0, xmm1, 15": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"Map group 12 0b100 128-bit"
],
"ExpectedArm64ASM": [
"sshr v2.8h, v17.8h, #15",
"mov v16.16b, v2.16b"
"sshr v16.8h, v17.8h, #15"
]
},
"vpsraw xmm0, xmm1, 16": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"Map group 12 0b100 128-bit"
],
"ExpectedArm64ASM": [
"sshr v2.8h, v17.8h, #15",
"mov v16.16b, v2.16b"
"sshr v16.8h, v17.8h, #15"
]
},
"vpsraw ymm0, ymm1, 0": {
@ -145,25 +141,23 @@
]
},
"vpsllw xmm0, xmm1, 15": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"Map group 12 0b110 128-bit"
],
"ExpectedArm64ASM": [
"shl v2.8h, v17.8h, #15",
"mov v16.16b, v2.16b"
"shl v16.8h, v17.8h, #15"
]
},
"vpsllw xmm0, xmm1, 16": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"Map group 12 0b110 128-bit"
],
"ExpectedArm64ASM": [
"movi v2.2d, #0x0",
"mov v16.16b, v2.16b"
"movi v16.2d, #0x0"
]
},
"vpsllw ymm0, ymm1, 0": {
@ -208,25 +202,23 @@
]
},
"vpsrld xmm0, xmm1, 31": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"Map group 13 0b010 128-bit"
],
"ExpectedArm64ASM": [
"ushr v2.4s, v17.4s, #31",
"mov v16.16b, v2.16b"
"ushr v16.4s, v17.4s, #31"
]
},
"vpsrld xmm0, xmm1, 32": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"Map group 13 0b010 128-bit"
],
"ExpectedArm64ASM": [
"movi v2.2d, #0x0",
"mov v16.16b, v2.16b"
"movi v16.2d, #0x0"
]
},
"vpsrld ymm0, ymm1, 0": {
@ -271,25 +263,23 @@
]
},
"vpsrad xmm0, xmm1, 31": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"Map group 13 0b100 128-bit"
],
"ExpectedArm64ASM": [
"sshr v2.4s, v17.4s, #31",
"mov v16.16b, v2.16b"
"sshr v16.4s, v17.4s, #31"
]
},
"vpsrad xmm0, xmm1, 32": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"Map group 13 0b100 128-bit"
],
"ExpectedArm64ASM": [
"sshr v2.4s, v17.4s, #31",
"mov v16.16b, v2.16b"
"sshr v16.4s, v17.4s, #31"
]
},
"vpsrad ymm0, ymm1, 0": {
@ -335,25 +325,23 @@
]
},
"vpslld xmm0, xmm1, 31": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"Map group 13 0b110 128-bit"
],
"ExpectedArm64ASM": [
"shl v2.4s, v17.4s, #31",
"mov v16.16b, v2.16b"
"shl v16.4s, v17.4s, #31"
]
},
"vpslld xmm0, xmm1, 32": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"Map group 13 0b110 128-bit"
],
"ExpectedArm64ASM": [
"movi v2.2d, #0x0",
"mov v16.16b, v2.16b"
"movi v16.2d, #0x0"
]
},
"vpslld ymm0, ymm1, 0": {
@ -398,25 +386,23 @@
]
},
"vpsrlq xmm0, xmm1, 63": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"Map group 14 0b010 128-bit"
],
"ExpectedArm64ASM": [
"ushr v2.2d, v17.2d, #63",
"mov v16.16b, v2.16b"
"ushr v16.2d, v17.2d, #63"
]
},
"vpsrlq xmm0, xmm1, 64": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"Map group 14 0b010 128-bit"
],
"ExpectedArm64ASM": [
"movi v2.2d, #0x0",
"mov v16.16b, v2.16b"
"movi v16.2d, #0x0"
]
},
"vpsrlq ymm0, ymm1, 0": {
@ -461,18 +447,6 @@
]
},
"vpsrldq xmm0, xmm1, 15": {
"ExpectedInstructionCount": 3,
"Optimal": "No",
"Comment": [
"Map group 14 0b011 128-bit"
],
"ExpectedArm64ASM": [
"movi v2.2d, #0x0",
"ext v2.16b, v17.16b, v2.16b, #15",
"mov v16.16b, v2.16b"
]
},
"vpsrldq xmm0, xmm1, 16": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"Comment": [
@ -480,7 +454,17 @@
],
"ExpectedArm64ASM": [
"movi v2.2d, #0x0",
"mov v16.16b, v2.16b"
"ext v16.16b, v17.16b, v2.16b, #15"
]
},
"vpsrldq xmm0, xmm1, 16": {
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"Map group 14 0b011 128-bit"
],
"ExpectedArm64ASM": [
"movi v16.2d, #0x0"
]
},
"vpsrldq ymm0, ymm1, 0": {
@ -532,25 +516,23 @@
]
},
"vpsllq xmm0, xmm1, 63": {
"ExpectedInstructionCount": 2,
"ExpectedInstructionCount": 1,
"Optimal": "No",
"Comment": [
"Map group 14 0b110 128-bit"
],
"ExpectedArm64ASM": [
"shl v2.2d, v17.2d, #63",
"mov v16.16b, v2.16b"
"shl v16.2d, v17.2d, #63"
]
},
"vpsllq xmm0, xmm1, 64": {
"ExpectedInstructionCount": 2,
"ExpectedInstructionCount": 1,
"Optimal": "No",
"Comment": [
"Map group 14 0b110 128-bit"
],
"ExpectedArm64ASM": [
"movi v2.2d, #0x0",
"mov v16.16b, v2.16b"
"movi v16.2d, #0x0"
]
},
"vpsllq ymm0, ymm1, 0": {
@ -595,18 +577,6 @@
]
},
"vpslldq xmm0, xmm1, 15": {
"ExpectedInstructionCount": 3,
"Optimal": "No",
"Comment": [
"Map group 14 0b111 128-bit"
],
"ExpectedArm64ASM": [
"movi v2.2d, #0x0",
"ext v2.16b, v2.16b, v17.16b, #1",
"mov v16.16b, v2.16b"
]
},
"vpslldq xmm0, xmm1, 16": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"Comment": [
@ -614,7 +584,17 @@
],
"ExpectedArm64ASM": [
"movi v2.2d, #0x0",
"mov v16.16b, v2.16b"
"ext v16.16b, v2.16b, v17.16b, #1"
]
},
"vpslldq xmm0, xmm1, 16": {
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"Map group 14 0b111 128-bit"
],
"ExpectedArm64ASM": [
"movi v16.2d, #0x0"
]
},
"vpslldq ymm0, ymm1, 0": {