Merge pull request #3980 from alyssarosenzweig/opt/avx

Optimize AVX load/store with ldp/stp
This commit is contained in:
Ryan Houdek 2024-08-20 16:29:55 -07:00 committed by GitHub
commit 5ac7d5dfcd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 836 additions and 548 deletions

View File

@ -48,6 +48,31 @@ DEF_OP(LoadContext) {
}
}
// Emits one LDP that loads two adjacent values from the CPU context (addressed
// relative to STATE at Op->Offset) into the op's two destination registers.
// GPR-class pairs support 4- and 8-byte elements; every other class takes the
// vector-register path and supports 4-, 8- and 16-byte elements.
DEF_OP(LoadContextPair) {
  const auto Op = IROp->C<IR::IROp_LoadContextPair>();

  if (Op->Class == FEXCore::IR::GPRClass) {
    const auto Dst1 = GetReg(Op->OutValue1.ID());
    const auto Dst2 = GetReg(Op->OutValue2.ID());

    switch (IROp->Size) {
    case 4: ldp<ARMEmitter::IndexType::OFFSET>(Dst1.W(), Dst2.W(), STATE, Op->Offset); break;
    case 8: ldp<ARMEmitter::IndexType::OFFSET>(Dst1.X(), Dst2.X(), STATE, Op->Offset); break;
    // Name this op in the message so a failed assertion points at the right handler.
    default: LOGMAN_MSG_A_FMT("Unhandled LoadContextPair size: {}", IROp->Size); break;
    }
  } else {
    const auto Dst1 = GetVReg(Op->OutValue1.ID());
    const auto Dst2 = GetVReg(Op->OutValue2.ID());

    switch (IROp->Size) {
    case 4: ldp<ARMEmitter::IndexType::OFFSET>(Dst1.S(), Dst2.S(), STATE, Op->Offset); break;
    case 8: ldp<ARMEmitter::IndexType::OFFSET>(Dst1.D(), Dst2.D(), STATE, Op->Offset); break;
    case 16: ldp<ARMEmitter::IndexType::OFFSET>(Dst1.Q(), Dst2.Q(), STATE, Op->Offset); break;
    default: LOGMAN_MSG_A_FMT("Unhandled LoadContextPair size: {}", IROp->Size); break;
    }
  }
}
DEF_OP(StoreContext) {
const auto Op = IROp->C<IR::IROp_StoreContext>();
const auto OpSize = IROp->Size;
@ -80,6 +105,32 @@ DEF_OP(StoreContext) {
}
}
// Emits one STP that writes two values to adjacent slots of the CPU context
// (addressed relative to STATE at Op->Offset).
// GPR-class pairs support 4- and 8-byte elements; every other class takes the
// vector-register path and supports 4-, 8- and 16-byte elements.
DEF_OP(StoreContextPair) {
  const auto Op = IROp->C<IR::IROp_StoreContextPair>();
  const auto OpSize = IROp->Size;

  if (Op->Class == FEXCore::IR::GPRClass) {
    // NOTE(review): GetZeroableReg presumably lets a constant-zero source use
    // the zero register instead of a materialized value — confirm.
    auto Src1 = GetZeroableReg(Op->Value1);
    auto Src2 = GetZeroableReg(Op->Value2);

    switch (OpSize) {
    case 4: stp<ARMEmitter::IndexType::OFFSET>(Src1.W(), Src2.W(), STATE, Op->Offset); break;
    case 8: stp<ARMEmitter::IndexType::OFFSET>(Src1.X(), Src2.X(), STATE, Op->Offset); break;
    // Name this op in the message so a failed assertion points at the right handler.
    default: LOGMAN_MSG_A_FMT("Unhandled StoreContextPair size: {}", OpSize); break;
    }
  } else {
    const auto Src1 = GetVReg(Op->Value1.ID());
    const auto Src2 = GetVReg(Op->Value2.ID());

    switch (OpSize) {
    case 4: stp<ARMEmitter::IndexType::OFFSET>(Src1.S(), Src2.S(), STATE, Op->Offset); break;
    case 8: stp<ARMEmitter::IndexType::OFFSET>(Src1.D(), Src2.D(), STATE, Op->Offset); break;
    case 16: stp<ARMEmitter::IndexType::OFFSET>(Src1.Q(), Src2.Q(), STATE, Op->Offset); break;
    default: LOGMAN_MSG_A_FMT("Unhandled StoreContextPair size: {}", OpSize); break;
    }
  }
}
DEF_OP(LoadRegister) {
const auto Op = IROp->C<IR::IROp_LoadRegister>();
const auto OpSize = IROp->Size;
@ -597,6 +648,32 @@ DEF_OP(LoadMem) {
}
}
// Emits one LDP that loads two adjacent values from [Addr + Offset] into the
// op's two destination registers. GPR-class pairs accept 4- and 8-byte
// elements; all other classes use vector registers and accept 4, 8 or 16 bytes.
DEF_OP(LoadMemPair) {
  const auto PairOp = IROp->C<IR::IROp_LoadMemPair>();
  const auto Base = GetReg(PairOp->Addr.ID());
  const bool IsGPRPair = PairOp->Class == FEXCore::IR::GPRClass;

  if (IsGPRPair) {
    const auto Low = GetReg(PairOp->OutValue1.ID());
    const auto High = GetReg(PairOp->OutValue2.ID());

    if (IROp->Size == 4) {
      ldp<ARMEmitter::IndexType::OFFSET>(Low.W(), High.W(), Base, PairOp->Offset);
    } else if (IROp->Size == 8) {
      ldp<ARMEmitter::IndexType::OFFSET>(Low.X(), High.X(), Base, PairOp->Offset);
    } else {
      LOGMAN_MSG_A_FMT("Unhandled LoadMemPair size: {}", IROp->Size);
    }
  } else {
    const auto Low = GetVReg(PairOp->OutValue1.ID());
    const auto High = GetVReg(PairOp->OutValue2.ID());

    if (IROp->Size == 4) {
      ldp<ARMEmitter::IndexType::OFFSET>(Low.S(), High.S(), Base, PairOp->Offset);
    } else if (IROp->Size == 8) {
      ldp<ARMEmitter::IndexType::OFFSET>(Low.D(), High.D(), Base, PairOp->Offset);
    } else if (IROp->Size == 16) {
      ldp<ARMEmitter::IndexType::OFFSET>(Low.Q(), High.Q(), Base, PairOp->Offset);
    } else {
      LOGMAN_MSG_A_FMT("Unhandled LoadMemPair size: {}", IROp->Size);
    }
  }
}
DEF_OP(LoadMemTSO) {
const auto Op = IROp->C<IR::IROp_LoadMemTSO>();
const auto OpSize = IROp->Size;
@ -1443,6 +1520,32 @@ DEF_OP(StoreMem) {
}
}
// Emits one STP that writes two values to adjacent memory at [Addr + Offset].
// GPR-class pairs support 4- and 8-byte elements; every other class takes the
// vector-register path and supports 4-, 8- and 16-byte elements.
DEF_OP(StoreMemPair) {
  const auto Op = IROp->C<IR::IROp_StoreMemPair>();
  const auto OpSize = IROp->Size;
  const auto Addr = GetReg(Op->Addr.ID());

  if (Op->Class == FEXCore::IR::GPRClass) {
    const auto Src1 = GetReg(Op->Value1.ID());
    const auto Src2 = GetReg(Op->Value2.ID());

    switch (OpSize) {
    case 4: stp<ARMEmitter::IndexType::OFFSET>(Src1.W(), Src2.W(), Addr, Op->Offset); break;
    case 8: stp<ARMEmitter::IndexType::OFFSET>(Src1.X(), Src2.X(), Addr, Op->Offset); break;
    // Name this op in the message so a failed assertion points at the right handler.
    default: LOGMAN_MSG_A_FMT("Unhandled StoreMemPair size: {}", OpSize); break;
    }
  } else {
    const auto Src1 = GetVReg(Op->Value1.ID());
    const auto Src2 = GetVReg(Op->Value2.ID());

    switch (OpSize) {
    case 4: stp<ARMEmitter::IndexType::OFFSET>(Src1.S(), Src2.S(), Addr, Op->Offset); break;
    case 8: stp<ARMEmitter::IndexType::OFFSET>(Src1.D(), Src2.D(), Addr, Op->Offset); break;
    case 16: stp<ARMEmitter::IndexType::OFFSET>(Src1.Q(), Src2.Q(), Addr, Op->Offset); break;
    default: LOGMAN_MSG_A_FMT("Unhandled StoreMemPair size: {}", OpSize); break;
    }
  }
}
DEF_OP(StoreMemTSO) {
const auto Op = IROp->C<IR::IROp_StoreMemTSO>();
const auto OpSize = IROp->Size;

View File

@ -20,6 +20,7 @@ namespace FEXCore::CPU {
// Handlers for the Allocate* IR ops. Their bodies are empty, so the backend
// emits no host instructions when it encounters one of these ops.
DEF_OP(AllocateGPR) {}
DEF_OP(AllocateGPRAfter) {}
DEF_OP(AllocateFPR) {}
DEF_OP(GuestOpcode) {
auto Op = IROp->C<IR::IROp_GuestOpcode>();

View File

@ -1268,7 +1268,23 @@ public:
} else {
bool Partial = RegCache.Partial & (1ull << Index);
unsigned Size = Partial ? 8 : CacheIndexToSize(Index);
_StoreContext(Size, CacheIndexClass(Index), Value, CacheIndexToContextOffset(Index));
uint64_t NextBit = (1ull << (Index - 1));
uint32_t Offset = CacheIndexToContextOffset(Index);
auto Class = CacheIndexClass(Index);
// Use stp where possible to store multiple values at a time. This accelerates AVX.
// TODO: this is all really confusing because of backwards iteration,
// can we peel back that hack?
if ((Bits & NextBit) && !Partial && Size >= 4 && CacheIndexToContextOffset(Index - 1) == Offset - Size && (Offset - Size) / Size < 64) {
LOGMAN_THROW_A_FMT(CacheIndexClass(Index - 1) == Class, "construction");
LOGMAN_THROW_A_FMT((Offset % Size) == 0, "construction");
Ref ValueNext = RegCache.Value[Index - 1];
_StoreContextPair(Size, Class, ValueNext, Value, Offset - Size);
Bits &= ~NextBit;
} else {
_StoreContext(Size, Class, Value, Offset);
}
}
Bits &= ~(1ull << Index);
@ -1901,6 +1917,43 @@ private:
return RegCache.Value[Index];
}
// Allocates two fresh destination values of the requested register class, to be
// filled in by a pair-load op. For FPRs each value is allocated with a register
// size and element size of `Size`; GPRs are allocated without the pair hint.
RefPair AllocatePair(FEXCore::IR::RegisterClassType Class, uint8_t Size) {
  if (Class != FPRClass) {
    return {_AllocateGPR(false), _AllocateGPR(false)};
  }

  return {_AllocateFPR(Size, Size), _AllocateFPR(Size, Size)};
}
// Emits a LoadContextPair for the two `Size`-byte values at context `Offset`
// and `Offset + Size`, without consulting or updating the register cache.
// Returns the freshly allocated destinations that receive the loaded values.
RefPair LoadContextPair_Uncached(FEXCore::IR::RegisterClassType Class, uint8_t Size, unsigned Offset) {
  RefPair Loaded = AllocatePair(Class, Size);

  _LoadContextPair(Size, Class, Offset, Loaded.Low, Loaded.High);

  return Loaded;
}
// Loads the cache entries `Index` and `Index + 1` as a pair.
// When neither entry is tracked yet (not cached, not partial) and the offset
// expressed in elements is below 64, a single paired context load fills both
// cache slots. Otherwise each entry is loaded through the regular
// single-register cache path.
RefPair LoadRegCachePair(uint64_t Offset, uint8_t Index, RegisterClassType RegClass, uint8_t Size) {
  LOGMAN_THROW_AA_FMT(Index != DFIndex, "must be pairable");

  // Mask covering both entries of the pair.
  const uint64_t PairMask = (3ull << (uint64_t)Index);
  const bool AnyTracked = ((RegCache.Partial | RegCache.Cached) & PairMask) != 0;

  if (AnyTracked || (Offset / Size) >= 64) {
    // Fallback on a pair of loads
    Ref Low = LoadRegCache(Offset, Index, RegClass, Size);
    Ref High = LoadRegCache(Offset + Size, Index + 1, RegClass, Size);
    return {
      .Low = Low,
      .High = High,
    };
  }

  // Load the pair straight into the cache.
  auto Loaded = LoadContextPair_Uncached(RegClass, Size, Offset);
  RegCache.Value[Index] = Loaded.Low;
  RegCache.Value[Index + 1] = Loaded.High;
  RegCache.Cached |= PairMask;

  // 8-byte loads are recorded as partial entries.
  if (Size == 8) {
    RegCache.Partial |= PairMask;
  }

  return Loaded;
}
// Loads guest GPR `Reg` through the register cache at the context's GPR size.
Ref LoadGPR(uint8_t Reg) {
  const auto GPRSize = CTX->GetGPRSize();
  return LoadRegCache(Reg, GPR0Index + Reg, GPRClass, GPRSize);
}
@ -1909,6 +1962,10 @@ private:
return LoadRegCache(CacheIndexToContextOffset(Index), Index, CacheIndexClass(Index), Size);
}
// Loads the cache entries `Index` and `Index + 1` as a pair of `Size`-byte
// values, deriving the context offset and register class from `Index`.
RefPair LoadContextPair(uint8_t Size, uint8_t Index) {
  const auto ContextOffset = CacheIndexToContextOffset(Index);
  const auto RegClass = CacheIndexClass(Index);
  return LoadRegCachePair(ContextOffset, Index, RegClass, Size);
}
// Loads cache entry `Index` using the size reported by CacheIndexToSize.
Ref LoadContext(uint8_t Index) {
  const auto EntrySize = CacheIndexToSize(Index);
  return LoadContext(EntrySize, Index);
}
@ -2342,7 +2399,7 @@ private:
IROp_IRHeader* CurrentHeader {};
Ref _StoreMemAutoTSO(FEXCore::IR::RegisterClassType Class, uint8_t Size, Ref Addr, Ref Value, uint8_t Align = 1) {
if (CTX->IsAtomicTSOEnabled()) {
if (Class == FPRClass ? CTX->IsVectorAtomicTSOEnabled() : CTX->IsAtomicTSOEnabled()) {
return _StoreMemTSO(Class, Size, Value, Addr, Invalid(), Align, MEM_OFFSET_SXTX, 1);
} else {
return _StoreMem(Class, Size, Value, Addr, Invalid(), Align, MEM_OFFSET_SXTX, 1);
@ -2350,7 +2407,7 @@ private:
}
Ref _LoadMemAutoTSO(FEXCore::IR::RegisterClassType Class, uint8_t Size, Ref ssa0, uint8_t Align = 1) {
if (CTX->IsAtomicTSOEnabled()) {
if (Class == FPRClass ? CTX->IsVectorAtomicTSOEnabled() : CTX->IsAtomicTSOEnabled()) {
return _LoadMemTSO(Class, Size, ssa0, Invalid(), Align, MEM_OFFSET_SXTX, 1);
} else {
return _LoadMem(Class, Size, ssa0, Invalid(), Align, MEM_OFFSET_SXTX, 1);
@ -2368,6 +2425,44 @@ private:
}
}
// Reduces an address mode to the (base, immediate offset) form used by the
// pair load/store ops. The constant offset is kept as an immediate only when it
// is a multiple of `Size` and lies within [-64, 64) elements; otherwise it is
// left in `A` and folded into the computed base address.
AddressMode SelectPairAddressMode(AddressMode A, uint8_t Size) {
  AddressMode Result {};

  const signed ScaledOffset = A.Offset / Size;
  const bool IsAligned = (A.Offset % Size) == 0;
  const bool IsInRange = ScaledOffset >= -64 && ScaledOffset < 64;

  if (IsAligned && IsInRange) {
    // Keep the offset as an immediate and drop it from the address calculation.
    Result.Offset = A.Offset;
    A.Offset = 0;
  }

  Result.Base = LoadEffectiveAddress(A, true, false);
  return Result;
}
// Emits a LoadMemPair that reads two adjacent `Size`-byte values starting at
// `Base + Offset` and returns the freshly allocated destinations.
RefPair LoadMemPair(FEXCore::IR::RegisterClassType Class, uint8_t Size, Ref Base, unsigned Offset) {
  RefPair Loaded = AllocatePair(Class, Size);

  _LoadMemPair(Class, Size, Base, Offset, Loaded.Low, Loaded.High);

  return Loaded;
}
// Loads two adjacent `Size`-byte values from guest memory described by `A`.
// Low comes from the address `A` describes and High from `Size` bytes after it.
//
// A single paired load is used when the access does not need TSO semantics,
// has no segment override, and `Size` is between 4 and 16 bytes. Otherwise
// the two halves are loaded separately through _LoadMemAutoTSO, passing `Align`
// through.
RefPair _LoadMemPairAutoTSO(FEXCore::IR::RegisterClassType Class, uint8_t Size, AddressMode A, uint8_t Align = 1) {
  bool AtomicTSO = CTX->IsAtomicTSOEnabled() && !A.NonTSO;

  // Use ldp if possible, otherwise fallback on two loads.
  if (!AtomicTSO && !A.Segment && Size >= 4 && Size <= 16) {
    A = SelectPairAddressMode(A, Size);
    return LoadMemPair(Class, Size, A.Base, A.Offset);
  } else {
    // The second element sits one element past the first, matching the
    // paired-load path and _StoreMemPairAutoTSO.
    AddressMode HighA = A;
    HighA.Offset += Size;

    return {
      .Low = _LoadMemAutoTSO(Class, Size, A, Align),
      .High = _LoadMemAutoTSO(Class, Size, HighA, Align),
    };
  }
}
Ref _StoreMemAutoTSO(FEXCore::IR::RegisterClassType Class, uint8_t Size, AddressMode A, Ref Value, uint8_t Align = 1) {
bool AtomicTSO = CTX->IsAtomicTSOEnabled() && !A.NonTSO;
A = SelectAddressMode(A, AtomicTSO, Class != GPRClass, Size);
@ -2379,6 +2474,20 @@ private:
}
}
// Stores two adjacent `Size`-byte values to guest memory described by `A`.
// Value1 goes to the address `A` describes and Value2 to `Size` bytes after it.
//
// A single paired store is used when the access does not need TSO semantics,
// has no segment override, and `Size` is between 4 and 16 bytes. Otherwise
// the two halves are stored separately through _StoreMemAutoTSO, passing
// `Align` through as _LoadMemPairAutoTSO does.
void _StoreMemPairAutoTSO(FEXCore::IR::RegisterClassType Class, uint8_t Size, AddressMode A, Ref Value1, Ref Value2, uint8_t Align = 1) {
  bool AtomicTSO = CTX->IsAtomicTSOEnabled() && !A.NonTSO;

  // Use stp if possible, otherwise fallback on two stores.
  if (!AtomicTSO && !A.Segment && Size >= 4 && Size <= 16) {
    A = SelectPairAddressMode(A, Size);
    _StoreMemPair(Class, Size, Value1, Value2, A.Base, A.Offset);
  } else {
    _StoreMemAutoTSO(Class, Size, A, Value1, Align);
    A.Offset += Size;
    _StoreMemAutoTSO(Class, Size, A, Value2, Align);
  }
}
// Convenience wrapper around _Prefetch for the address in `ssa0`, passing an
// invalid offset operand, MEM_OFFSET_SXTX and a trailing argument of 1.
Ref Prefetch(bool ForStore, bool Stream, uint8_t CacheLevel, Ref ssa0) {
  const auto NoOffset = Invalid();
  return _Prefetch(ForStore, Stream, CacheLevel, ssa0, NoOffset, MEM_OFFSET_SXTX, 1);
}

View File

@ -508,10 +508,11 @@ OpDispatchBuilder::RefPair OpDispatchBuilder::AVX128_LoadSource_WithOpSize(
LOGMAN_THROW_AA_FMT(!IsVSIB, "VSIB uses LoadVSIB instead");
}
return {
.Low = _LoadMemAutoTSO(FPRClass, 16, A, 1),
.High = NeedsHigh ? _LoadMemAutoTSO(FPRClass, 16, HighA, 1) : nullptr,
};
if (NeedsHigh) {
return _LoadMemPairAutoTSO(FPRClass, 16, A, 1);
} else {
return {.Low = _LoadMemAutoTSO(FPRClass, 16, A, 1)};
}
}
}
@ -557,13 +558,10 @@ void OpDispatchBuilder::AVX128_StoreResult_WithOpSize(FEXCore::X86Tables::Decode
} else {
AddressMode A = DecodeAddress(Op, Operand, AccessType, false /* IsLoad */);
_StoreMemAutoTSO(FPRClass, 16, A, Src.Low, 1);
if (Src.High) {
AddressMode HighA = A;
HighA.Offset += 16;
_StoreMemAutoTSO(FPRClass, 16, HighA, Src.High, 1);
_StoreMemPairAutoTSO(FPRClass, 16, A, Src.Low, Src.High, 1);
} else {
_StoreMemAutoTSO(FPRClass, 16, A, Src.Low, 1);
}
}
}
@ -2173,18 +2171,20 @@ void OpDispatchBuilder::AVX128_VectorVariableBlend(OpcodeArgs) {
void OpDispatchBuilder::AVX128_SaveAVXState(Ref MemBase) {
const auto NumRegs = CTX->Config.Is64BitMode ? 16U : 8U;
for (uint32_t i = 0; i < NumRegs; ++i) {
Ref Upper = AVX128_LoadXMMRegister(i, true);
_StoreMem(FPRClass, 16, Upper, MemBase, _Constant(i * 16 + 576), 16, MEM_OFFSET_SXTX, 1);
for (uint32_t i = 0; i < NumRegs; i += 2) {
RefPair Pair = LoadContextPair(16, AVXHigh0Index + i);
_StoreMemPair(FPRClass, 16, Pair.Low, Pair.High, MemBase, i * 16 + 576);
}
}
void OpDispatchBuilder::AVX128_RestoreAVXState(Ref MemBase) {
const auto NumRegs = CTX->Config.Is64BitMode ? 16U : 8U;
for (uint32_t i = 0; i < NumRegs; ++i) {
Ref YMMHReg = _LoadMem(FPRClass, 16, MemBase, _Constant(i * 16 + 576), 16, MEM_OFFSET_SXTX, 1);
AVX128_StoreXMMRegister(i, YMMHReg, true);
for (uint32_t i = 0; i < NumRegs; i += 2) {
auto YMMHRegs = LoadMemPair(FPRClass, 16, MemBase, i * 16 + 576);
AVX128_StoreXMMRegister(i, YMMHRegs.Low, true);
AVX128_StoreXMMRegister(i + 1, YMMHRegs.High, true);
}
}

View File

@ -2736,37 +2736,33 @@ void OpDispatchBuilder::SaveX87State(OpcodeArgs, Ref MemBase) {
// MXCSR_MASK: Mask for writes to the MXCSR register
// If OSFXSR bit in CR4 is not set than FXSAVE /may/ not save the XMM registers
// This is implementation dependent
for (uint32_t i = 0; i < Core::CPUState::NUM_MMS; ++i) {
Ref MMReg = LoadContext(MM0Index + i);
_StoreMem(FPRClass, 16, MMReg, MemBase, _Constant(i * 16 + 32), 16, MEM_OFFSET_SXTX, 1);
for (uint32_t i = 0; i < Core::CPUState::NUM_MMS; i += 2) {
RefPair MMRegs = LoadContextPair(16, MM0Index + i);
_StoreMemPair(FPRClass, 16, MMRegs.Low, MMRegs.High, MemBase, i * 16 + 32);
}
}
void OpDispatchBuilder::SaveSSEState(Ref MemBase) {
const auto NumRegs = CTX->Config.Is64BitMode ? 16U : 8U;
for (uint32_t i = 0; i < NumRegs; ++i) {
Ref XMMReg = LoadXMMRegister(i);
_StoreMem(FPRClass, 16, XMMReg, MemBase, _Constant(i * 16 + 160), 16, MEM_OFFSET_SXTX, 1);
for (uint32_t i = 0; i < NumRegs; i += 2) {
_StoreMemPair(FPRClass, 16, LoadXMMRegister(i), LoadXMMRegister(i + 1), MemBase, i * 16 + 160);
}
}
void OpDispatchBuilder::SaveMXCSRState(Ref MemBase) {
_StoreMem(GPRClass, 4, GetMXCSR(), MemBase, _Constant(24), 4, MEM_OFFSET_SXTX, 1);
// Store the mask for all bits.
_StoreMem(GPRClass, 4, _Constant(0xFFFF), MemBase, _Constant(28), 4, MEM_OFFSET_SXTX, 1);
// Store MXCSR and the mask for all bits.
_StoreMemPair(GPRClass, 4, GetMXCSR(), _Constant(0xFFFF), MemBase, 24);
}
void OpDispatchBuilder::SaveAVXState(Ref MemBase) {
const auto NumRegs = CTX->Config.Is64BitMode ? 16U : 8U;
for (uint32_t i = 0; i < NumRegs; ++i) {
Ref Upper = _VDupElement(32, 16, LoadXMMRegister(i), 1);
for (uint32_t i = 0; i < NumRegs; i += 2) {
Ref Upper0 = _VDupElement(32, 16, LoadXMMRegister(i + 0), 1);
Ref Upper1 = _VDupElement(32, 16, LoadXMMRegister(i + 1), 1);
_StoreMem(FPRClass, 16, Upper, MemBase, _Constant(i * 16 + 576), 16, MEM_OFFSET_SXTX, 1);
_StoreMemPair(FPRClass, 16, Upper0, Upper1, MemBase, i * 16 + 576);
}
}
@ -2868,18 +2864,22 @@ void OpDispatchBuilder::RestoreX87State(Ref MemBase) {
StoreContext(AbridgedFTWIndex, _LoadMem(GPRClass, 1, MemBase, _Constant(4), 2, MEM_OFFSET_SXTX, 1));
}
for (uint32_t i = 0; i < Core::CPUState::NUM_MMS; ++i) {
auto MMReg = _LoadMem(FPRClass, 16, MemBase, _Constant(i * 16 + 32), 16, MEM_OFFSET_SXTX, 1);
StoreContext(MM0Index + i, MMReg);
for (uint32_t i = 0; i < Core::CPUState::NUM_MMS; i += 2) {
auto MMRegs = LoadMemPair(FPRClass, 16, MemBase, i * 16 + 32);
StoreContext(MM0Index + i, MMRegs.Low);
StoreContext(MM0Index + i + 1, MMRegs.High);
}
}
void OpDispatchBuilder::RestoreSSEState(Ref MemBase) {
const auto NumRegs = CTX->Config.Is64BitMode ? 16U : 8U;
for (uint32_t i = 0; i < NumRegs; ++i) {
Ref XMMReg = _LoadMem(FPRClass, 16, MemBase, _Constant(i * 16 + 160), 16, MEM_OFFSET_SXTX, 1);
StoreXMMRegister(i, XMMReg);
for (uint32_t i = 0; i < NumRegs; i += 2) {
auto XMMRegs = LoadMemPair(FPRClass, 16, MemBase, i * 16 + 160);
StoreXMMRegister(i, XMMRegs.Low);
StoreXMMRegister(i + 1, XMMRegs.High);
}
}
@ -2896,11 +2896,12 @@ void OpDispatchBuilder::RestoreMXCSRState(Ref MXCSR) {
void OpDispatchBuilder::RestoreAVXState(Ref MemBase) {
const auto NumRegs = CTX->Config.Is64BitMode ? 16U : 8U;
for (uint32_t i = 0; i < NumRegs; ++i) {
Ref XMMReg = LoadXMMRegister(i);
Ref YMMHReg = _LoadMem(FPRClass, 16, MemBase, _Constant(i * 16 + 576), 16, MEM_OFFSET_SXTX, 1);
Ref YMM = _VInsElement(32, 16, 1, 0, XMMReg, YMMHReg);
StoreXMMRegister(i, YMM);
for (uint32_t i = 0; i < NumRegs; i += 2) {
Ref XMMReg0 = LoadXMMRegister(i + 0);
Ref XMMReg1 = LoadXMMRegister(i + 1);
auto YMMHRegs = LoadMemPair(FPRClass, 16, MemBase, i * 16 + 576);
StoreXMMRegister(i + 0, _VInsElement(32, 16, 1, 0, XMMReg0, YMMHRegs.Low));
StoreXMMRegister(i + 1, _VInsElement(32, 16, 1, 0, XMMReg1, YMMHRegs.High));
}
}

View File

@ -253,6 +253,11 @@
"If ForPair is set, RA will try to allocate the base of a register pair"],
"DestSize": "8"
},
"FPR = AllocateFPR u8:#RegisterSize, u8:#ElementSize": {
"Desc": ["Like AllocateGPR, but for FPR"],
"DestSize": "RegisterSize",
"NumElements": "RegisterSize / ElementSize"
},
"GPR = AllocateGPRAfter GPR:$After": {
"Desc": ["Silly pseudo-instruction to allocate a register for a future destination",
"This is a kludge to deal with the IR's lack of multiple destinations",
@ -386,6 +391,20 @@
]
},
"SSA:$Value1, SSA:$Value2 = LoadContextPair u8:#ByteSize, RegisterClass:$Class, u32:$Offset": {
"Desc": ["Loads a pair of values from the context with offset",
"Value1 = Ctx[Offset], Value2 = Ctx[Offset + ByteSize]"
],
"HasSideEffects": true,
"DestSize": "ByteSize",
"EmitValidation": [
"($Class == GPRClass && (#ByteSize == 1 || #ByteSize == 2 || #ByteSize == 4 || #ByteSize == 8)) || $Class == FPRClass",
"($Class == FPRClass && (#ByteSize == 1 || #ByteSize == 2 || #ByteSize == 4 || #ByteSize == 8 || #ByteSize == 16 || #ByteSize == 32)) || $Class == GPRClass",
"!($Offset >= offsetof(Core::CPUState, gregs[0]) && $Offset < offsetof(Core::CPUState, gregs[16])) && \"Can't LoadContext to GPR\"",
"!($Offset >= offsetof(Core::CPUState, xmm.avx.data[0]) && $Offset < offsetof(Core::CPUState, xmm.avx.data[16])) && \"Can't LoadContext to XMM\""
]
},
"StoreContext u8:#ByteSize, RegisterClass:$Class, SSA:$Value, u32:$Offset": {
"Desc": ["Stores a value to the context with offset",
"Ctx[Offset] = Value",
@ -403,6 +422,24 @@
]
},
"StoreContextPair u8:#ByteSize, RegisterClass:$Class, SSA:$Value1, SSA:$Value2, u32:$Offset": {
"Desc": ["Stores a pair of values to the context with offset",
"Ctx[Offset] = Value1, Ctx[Offset + ByteSize] = Value2",
"Zero Extends if value's type is too small",
"Truncates if value's type is too large"
],
"HasSideEffects": true,
"DestSize": "ByteSize",
"EmitValidation": [
"WalkFindRegClass($Value1) == $Class",
"WalkFindRegClass($Value2) == $Class",
"($Class == GPRClass && (#ByteSize == 1 || #ByteSize == 2 || #ByteSize == 4 || #ByteSize == 8)) || $Class == FPRClass",
"($Class == FPRClass && (#ByteSize == 1 || #ByteSize == 2 || #ByteSize == 4 || #ByteSize == 8 || #ByteSize == 16 || #ByteSize == 32)) || $Class == GPRClass",
"!($Offset >= offsetof(Core::CPUState, gregs[0]) && $Offset < offsetof(Core::CPUState, gregs[16])) && \"Can't StoreContext to GPR\"",
"!($Offset >= offsetof(Core::CPUState, xmm.avx.data[0]) && $Offset < offsetof(Core::CPUState, xmm.avx.data[16])) && \"Can't StoreContext to XMM\""
]
},
"SSA = LoadContextIndexed GPR:$Index, u8:#ByteSize, u32:$BaseOffset, u32:$Stride, RegisterClass:$Class": {
"Desc": ["Loads a value from the context with offset and indexed by SSA value",
"Dest = Ctx[BaseOffset + Index * Stride]"
@ -476,6 +513,12 @@
"DestSize": "Size"
},
"SSA:$Value1, SSA:$Value2 = LoadMemPair RegisterClass:$Class, u8:#Size, GPR:$Addr, u32:$Offset": {
"Desc": ["Load a pair of values from memory."],
"DestSize": "Size",
"HasSideEffects": true
},
"StoreMem RegisterClass:$Class, u8:#Size, SSA:$Value, GPR:$Addr, GPR:$Offset, u8:$Align, MemOffsetType:$OffsetType, u8:$OffsetScale": {
"Desc": [ "Stores a value to memory.",
"Zero Extends if value's type is too small",
@ -488,6 +531,19 @@
]
},
"StoreMemPair RegisterClass:$Class, u8:#Size, SSA:$Value1, SSA:$Value2, GPR:$Addr, u32:$Offset": {
"Desc": [ "Stores a pair of values to memory.",
"Zero Extends if value's type is too small",
"Truncates if value's type is too large"
],
"HasSideEffects": true,
"DestSize": "Size",
"EmitValidation": [
"WalkFindRegClass($Value1) == $Class",
"WalkFindRegClass($Value2) == $Class"
]
},
"SSA = LoadMemTSO RegisterClass:$Class, u8:#Size, GPR:$Addr, GPR:$Offset, u8:$Align, MemOffsetType:$OffsetType, u8:$OffsetScale": {
"Desc": ["Does a x86 TSO compatible load from memory. Offset must be Invalid()."
],

View File

@ -46,13 +46,12 @@
]
},
"vmovups ymm0, [rax]": {
"ExpectedInstructionCount": 3,
"ExpectedInstructionCount": 2,
"Comment": [
"Map 1 0b00 0x10 256-bit"
],
"ExpectedArm64ASM": [
"ldr q16, [x4]",
"ldr q2, [x4, #16]",
"ldp q16, q2, [x4]",
"str q2, [x28, #16]"
]
},
@ -89,13 +88,12 @@
]
},
"vmovupd ymm0, [rax]": {
"ExpectedInstructionCount": 3,
"ExpectedInstructionCount": 2,
"Comment": [
"Map 1 0b01 0x10 256-bit"
],
"ExpectedArm64ASM": [
"ldr q16, [x4]",
"ldr q2, [x4, #16]",
"ldp q16, q2, [x4]",
"str q2, [x28, #16]"
]
},
@ -156,14 +154,13 @@
]
},
"vmovups [rax], ymm0": {
"ExpectedInstructionCount": 3,
"ExpectedInstructionCount": 2,
"Comment": [
"Map 1 0b00 0x11 256-bit"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #16]",
"str q16, [x4]",
"str q2, [x4, #16]"
"stp q16, q2, [x4]"
]
},
"vmovupd [rax], xmm0": {
@ -176,14 +173,13 @@
]
},
"vmovupd [rax], ymm0": {
"ExpectedInstructionCount": 3,
"ExpectedInstructionCount": 2,
"Comment": [
"Map 1 0b01 0x11 256-bit"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #16]",
"str q16, [x4]",
"str q2, [x4, #16]"
"stp q16, q2, [x4]"
]
},
"vmovss [rax], xmm0": {
@ -272,13 +268,12 @@
]
},
"vmovsldup ymm0, [rax]": {
"ExpectedInstructionCount": 5,
"ExpectedInstructionCount": 4,
"Comment": [
"Map 1 0b10 0x12 256-bit"
],
"ExpectedArm64ASM": [
"ldr q2, [x4]",
"ldr q3, [x4, #16]",
"ldp q2, q3, [x4]",
"trn1 v16.4s, v2.4s, v2.4s",
"trn1 v2.4s, v3.4s, v3.4s",
"str q2, [x28, #16]"
@ -297,13 +292,12 @@
]
},
"vmovddup ymm0, [rax]": {
"ExpectedInstructionCount": 5,
"ExpectedInstructionCount": 4,
"Comment": [
"Map 1 0b11 0x12 256-bit"
],
"ExpectedArm64ASM": [
"ldr q2, [x4]",
"ldr q3, [x4, #16]",
"ldp q2, q3, [x4]",
"dup v16.2d, v2.d[0]",
"dup v2.2d, v3.d[0]",
"str q2, [x28, #16]"
@ -340,14 +334,13 @@
]
},
"vunpcklps ymm0, ymm1, [rax]": {
"ExpectedInstructionCount": 6,
"ExpectedInstructionCount": 5,
"Comment": [
"Map 1 0b00 0x14 256-bit"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #32]",
"ldr q3, [x4]",
"ldr q4, [x4, #16]",
"ldp q3, q4, [x4]",
"zip1 v16.4s, v17.4s, v3.4s",
"zip1 v2.4s, v2.4s, v4.4s",
"str q2, [x28, #16]"
@ -366,14 +359,13 @@
]
},
"vunpcklpd ymm0, ymm1, [rax]": {
"ExpectedInstructionCount": 6,
"ExpectedInstructionCount": 5,
"Comment": [
"Map 1 0b01 0x14 256-bit"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #32]",
"ldr q3, [x4]",
"ldr q4, [x4, #16]",
"ldp q3, q4, [x4]",
"zip1 v16.2d, v17.2d, v3.2d",
"zip1 v2.2d, v2.2d, v4.2d",
"str q2, [x28, #16]"
@ -392,14 +384,13 @@
]
},
"vunpckhps ymm0, ymm1, [rax]": {
"ExpectedInstructionCount": 6,
"ExpectedInstructionCount": 5,
"Comment": [
"Map 1 0b00 0x15 256-bit"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #32]",
"ldr q3, [x4]",
"ldr q4, [x4, #16]",
"ldp q3, q4, [x4]",
"zip2 v16.4s, v17.4s, v3.4s",
"zip2 v2.4s, v2.4s, v4.4s",
"str q2, [x28, #16]"
@ -418,14 +409,13 @@
]
},
"vunpckhpd ymm0, ymm1, [rax]": {
"ExpectedInstructionCount": 6,
"ExpectedInstructionCount": 5,
"Comment": [
"Map 1 0b01 0x15 256-bit"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #32]",
"ldr q3, [x4]",
"ldr q4, [x4, #16]",
"ldp q3, q4, [x4]",
"zip2 v16.2d, v17.2d, v3.2d",
"zip2 v2.2d, v2.2d, v4.2d",
"str q2, [x28, #16]"
@ -479,13 +469,12 @@
]
},
"vmovshdup ymm0, [rax]": {
"ExpectedInstructionCount": 5,
"ExpectedInstructionCount": 4,
"Comment": [
"Map 1 0b10 0x16 256-bit"
],
"ExpectedArm64ASM": [
"ldr q2, [x4]",
"ldr q3, [x4, #16]",
"ldp q2, q3, [x4]",
"trn2 v16.4s, v2.4s, v2.4s",
"trn2 v2.4s, v3.4s, v3.4s",
"str q2, [x28, #16]"
@ -1634,33 +1623,25 @@
]
},
"vzeroupper": {
"ExpectedInstructionCount": 17,
"ExpectedInstructionCount": 9,
"Comment": [
"Might be able to use DC ZVA",
"Map 1 0b01 0x77 L=0"
],
"ExpectedArm64ASM": [
"movi v2.2d, #0x0",
"str q2, [x28, #256]",
"str q2, [x28, #240]",
"str q2, [x28, #224]",
"str q2, [x28, #208]",
"str q2, [x28, #192]",
"str q2, [x28, #176]",
"str q2, [x28, #160]",
"str q2, [x28, #144]",
"str q2, [x28, #128]",
"str q2, [x28, #112]",
"str q2, [x28, #96]",
"str q2, [x28, #80]",
"str q2, [x28, #64]",
"str q2, [x28, #48]",
"str q2, [x28, #32]",
"str q2, [x28, #16]"
"stp q2, q2, [x28, #240]",
"stp q2, q2, [x28, #208]",
"stp q2, q2, [x28, #176]",
"stp q2, q2, [x28, #144]",
"stp q2, q2, [x28, #112]",
"stp q2, q2, [x28, #80]",
"stp q2, q2, [x28, #48]",
"stp q2, q2, [x28, #16]"
]
},
"vzeroall": {
"ExpectedInstructionCount": 32,
"ExpectedInstructionCount": 24,
"Comment": [
"Might be able to use DC ZVA",
"Map 1 0b01 0x77 L=1"
@ -1682,22 +1663,14 @@
"movi v29.2d, #0x0",
"movi v30.2d, #0x0",
"movi v31.2d, #0x0",
"str q31, [x28, #256]",
"str q31, [x28, #240]",
"str q31, [x28, #224]",
"str q31, [x28, #208]",
"str q31, [x28, #192]",
"str q31, [x28, #176]",
"str q31, [x28, #160]",
"str q31, [x28, #144]",
"str q31, [x28, #128]",
"str q31, [x28, #112]",
"str q31, [x28, #96]",
"str q31, [x28, #80]",
"str q31, [x28, #64]",
"str q31, [x28, #48]",
"str q31, [x28, #32]",
"str q31, [x28, #16]"
"stp q31, q31, [x28, #240]",
"stp q31, q31, [x28, #208]",
"stp q31, q31, [x28, #176]",
"stp q31, q31, [x28, #144]",
"stp q31, q31, [x28, #112]",
"stp q31, q31, [x28, #80]",
"stp q31, q31, [x28, #48]",
"stp q31, q31, [x28, #16]"
]
},
"vcmpps xmm0, xmm1, xmm2, 0x00": {
@ -2631,13 +2604,12 @@
]
},
"vmovaps ymm0, [rax]": {
"ExpectedInstructionCount": 3,
"ExpectedInstructionCount": 2,
"Comment": [
"Map 1 0b00 0x28 256-bit"
],
"ExpectedArm64ASM": [
"ldr q16, [x4]",
"ldr q2, [x4, #16]",
"ldp q16, q2, [x4]",
"str q2, [x28, #16]"
]
},
@ -2675,13 +2647,12 @@
]
},
"vmovapd ymm0, [rax]": {
"ExpectedInstructionCount": 3,
"ExpectedInstructionCount": 2,
"Comment": [
"Map 1 0b01 0x28 256-bit"
],
"ExpectedArm64ASM": [
"ldr q16, [x4]",
"ldr q2, [x4, #16]",
"ldp q16, q2, [x4]",
"str q2, [x28, #16]"
]
},
@ -2717,14 +2688,13 @@
]
},
"vmovaps [rax], ymm0": {
"ExpectedInstructionCount": 3,
"ExpectedInstructionCount": 2,
"Comment": [
"Map 1 0b00 0x29 256-bit"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #16]",
"str q16, [x4]",
"str q2, [x4, #16]"
"stp q16, q2, [x4]"
]
},
"vmovapd [rax], xmm0": {
@ -2737,14 +2707,13 @@
]
},
"vmovapd [rax], ymm0": {
"ExpectedInstructionCount": 3,
"ExpectedInstructionCount": 2,
"Comment": [
"Map 1 0b01 0x29 256-bit"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #16]",
"str q16, [x4]",
"str q2, [x4, #16]"
"stp q16, q2, [x4]"
]
},
"vcvtsi2ss xmm0, xmm1, eax": {
@ -3161,13 +3130,12 @@
]
},
"vcvtpd2ps xmm0, yword [rax]": {
"ExpectedInstructionCount": 8,
"ExpectedInstructionCount": 7,
"Comment": [
"Map 1 0b01 0x5a 128-bit"
],
"ExpectedArm64ASM": [
"ldr q2, [x4]",
"ldr q3, [x4, #16]",
"ldp q2, q3, [x4]",
"fcvtn v2.2s, v2.2d",
"fcvtn v3.2s, v3.2d",
"mov v16.16b, v2.16b",
@ -4005,47 +3973,43 @@
]
},
"vmovdqa ymm0, [rax]": {
"ExpectedInstructionCount": 3,
"ExpectedInstructionCount": 2,
"Comment": [
"Map 1 0b01 0x7f 128-bit"
],
"ExpectedArm64ASM": [
"ldr q16, [x4]",
"ldr q2, [x4, #16]",
"ldp q16, q2, [x4]",
"str q2, [x28, #16]"
]
},
"vmovdqa [rax], ymm0": {
"ExpectedInstructionCount": 3,
"ExpectedInstructionCount": 2,
"Comment": [
"Map 1 0b01 0x7f 128-bit"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #16]",
"str q16, [x4]",
"str q2, [x4, #16]"
"stp q16, q2, [x4]"
]
},
"vmovdqu ymm0, [rax]": {
"ExpectedInstructionCount": 3,
"ExpectedInstructionCount": 2,
"Comment": [
"Map 1 0b10 0x7f 128-bit"
],
"ExpectedArm64ASM": [
"ldr q16, [x4]",
"ldr q2, [x4, #16]",
"ldp q16, q2, [x4]",
"str q2, [x28, #16]"
]
},
"vmovdqu [rax], ymm0": {
"ExpectedInstructionCount": 3,
"ExpectedInstructionCount": 2,
"Comment": [
"Map 1 0b10 0x7f 128-bit"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #16]",
"str q16, [x4]",
"str q2, [x4, #16]"
"stp q16, q2, [x4]"
]
},
"vaddsubpd xmm0, xmm1, xmm2": {
@ -5178,13 +5142,12 @@
]
},
"vlddqu ymm0, [rax]": {
"ExpectedInstructionCount": 3,
"ExpectedInstructionCount": 2,
"Comment": [
"Map 1 0b11 0xf0 256-bit"
],
"ExpectedArm64ASM": [
"ldr q16, [x4]",
"ldr q2, [x4, #16]",
"ldp q16, q2, [x4]",
"str q2, [x28, #16]"
]
},

View File

@ -1052,6 +1052,86 @@
"ldr x20, [x28, #960]",
"ldur x7, [x20, #20]"
]
},
"vmovdqu ymm7,yword [rsi+0x60]": {
"ExpectedInstructionCount": 2,
"ExpectedArm64ASM": [
"ldp q23, q2, [x10, #96]",
"str q2, [x28, #128]"
]
},
"vmovdqu ymm7,yword [rsi+0x120]": {
"ExpectedInstructionCount": 2,
"ExpectedArm64ASM": [
"ldp q23, q2, [x10, #288]",
"str q2, [x28, #128]"
]
},
"vmovdqu ymm7,yword [rsi-0x60]": {
"ExpectedInstructionCount": 2,
"ExpectedArm64ASM": [
"ldp q23, q2, [x10, #-96]",
"str q2, [x28, #128]"
]
},
"vmovdqu ymm7,yword [rsi-0x400]": {
"ExpectedInstructionCount": 2,
"ExpectedArm64ASM": [
"ldp q23, q2, [x10, #-1024]",
"str q2, [x28, #128]"
]
},
"vmovdqu ymm7,yword [rsi-0x420]": {
"ExpectedInstructionCount": 3,
"ExpectedArm64ASM": [
"sub x20, x10, #0x420 (1056)",
"ldp q23, q2, [x20]",
"str q2, [x28, #128]"
]
},
"vmovdqu ymm7,yword [rsi+0x3d0]": {
"ExpectedInstructionCount": 2,
"ExpectedArm64ASM": [
"ldp q23, q2, [x10, #976]",
"str q2, [x28, #128]"
]
},
"vmovdqu ymm7,yword [rsi+0x400]": {
"ExpectedInstructionCount": 3,
"ExpectedArm64ASM": [
"add x20, x10, #0x400 (1024)",
"ldp q23, q2, [x20]",
"str q2, [x28, #128]"
]
},
"vmovdqa yword [rcx+0x60],ymm1": {
"ExpectedInstructionCount": 2,
"ExpectedArm64ASM": [
"ldr q2, [x28, #32]",
"stp q17, q2, [x5, #96]"
]
},
"vmovdqa yword [rcx+0x3d0],ymm1": {
"ExpectedInstructionCount": 2,
"ExpectedArm64ASM": [
"ldr q2, [x28, #32]",
"stp q17, q2, [x5, #976]"
]
},
"vmovdqa yword [rcx-0x3d0],ymm1": {
"ExpectedInstructionCount": 2,
"ExpectedArm64ASM": [
"ldr q2, [x28, #32]",
"stp q17, q2, [x5, #-976]"
]
},
"vmovdqa yword [rcx+rsi-0x3d0],ymm1": {
"ExpectedInstructionCount": 3,
"ExpectedArm64ASM": [
"ldr q2, [x28, #32]",
"add x20, x5, x10",
"stp q17, q2, [x20, #-976]"
]
}
}
}

View File

@ -428,6 +428,161 @@
"mov x11, x26"
]
},
"glibc AVX memcpy block 1": {
"ExpectedInstructionCount": 26,
"x86Insts": [
"vmovdqu ymm5,yword [rsi+0x20]",
"vmovdqu ymm6,yword [rsi+0x40]",
"lea rcx,[rdi+rdx*1-0x81]",
"vmovdqu ymm7,yword [rsi+0x60]",
"vmovdqu ymm8,yword [rsi+rdx*1-0x20]",
"sub rsi,rdi",
"and rcx,0xffffffffffffffe0",
"add rsi,rcx",
"nop dword [rax+0x0]",
"vmovdqu ymm1,yword [rsi+0x60]",
"vmovdqu ymm2,yword [rsi+0x40]",
"vmovdqu ymm3,yword [rsi+0x20]",
"vmovdqu ymm4,yword [rsi]",
"add rsi,0xffffffffffffff80",
"vmovdqa yword [rcx+0x60],ymm1",
"vmovdqa yword [rcx+0x40],ymm2",
"vmovdqa yword [rcx+0x20],ymm3",
"vmovdqa yword [rcx],ymm4",
"add rcx,0xffffffffffffff80",
"cmp rdi,rcx"
],
"ExpectedArm64ASM": [
"ldp q21, q2, [x10, #32]",
"ldp q22, q3, [x10, #64]",
"sub x20, x11, #0x81 (129)",
"add x5, x20, x6",
"ldp q23, q4, [x10, #96]",
"add x20, x10, x6",
"ldp q24, q5, [x20, #-32]",
"sub x10, x10, x11",
"and x5, x5, #0xffffffffffffffe0",
"add x10, x10, x5",
"ldp q17, q6, [x10, #96]",
"ldp q18, q7, [x10, #64]",
"ldp q19, q8, [x10, #32]",
"ldp q20, q9, [x10]",
"sub x10, x10, #0x80 (128)",
"stp q17, q6, [x5, #96]",
"stp q18, q7, [x5, #64]",
"stp q19, q8, [x5, #32]",
"stp q20, q9, [x5]",
"sub x5, x5, #0x80 (128)",
"eor w27, w11, w5",
"subs x26, x11, x5",
"stp q4, q5, [x28, #128]",
"stp q2, q3, [x28, #96]",
"stp q8, q9, [x28, #64]",
"stp q6, q7, [x28, #32]"
]
},
"glibc AVX memcpy block 2": {
"ExpectedInstructionCount": 31,
"x86Insts": [
"vmovdqu ymm5,yword [rsi+rdx*1-0x20]",
"vmovdqu ymm6,yword [rsi+rdx*1-0x40]",
"mov rcx,rdi",
"or rdi,0x1f",
"vmovdqu ymm7,yword [rsi+rdx*1-0x60]",
"vmovdqu ymm8,yword [rsi+rdx*1-0x80]",
"sub rsi,rcx",
"inc rdi",
"add rsi,rdi",
"lea rdx,[rcx+rdx*1-0x80]",
"nop dword [rax+rax*1+0x0]",
"vmovdqu ymm1,yword [rsi]",
"vmovdqu ymm2,yword [rsi+0x20]",
"vmovdqu ymm3,yword [rsi+0x40]",
"vmovdqu ymm4,yword [rsi+0x60]",
"sub rsi,0xffffffffffffff80",
"vmovdqa yword [rdi],ymm1",
"vmovdqa yword [rdi+0x20],ymm2",
"vmovdqa yword [rdi+0x40],ymm3",
"vmovdqa yword [rdi+0x60],ymm4",
"sub rdi,0xffffffffffffff80",
"cmp rdx,rdi"
],
"ExpectedArm64ASM": [
"add x20, x10, x6",
"ldp q21, q2, [x20, #-32]",
"add x20, x10, x6",
"ldp q22, q3, [x20, #-64]",
"mov x5, x11",
"orr x11, x11, #0x1f",
"add x20, x10, x6",
"ldp q23, q4, [x20, #-96]",
"add x20, x10, x6",
"ldp q24, q5, [x20, #-128]",
"sub x10, x10, x5",
"add x11, x11, #0x1 (1)",
"add x10, x10, x11",
"sub x20, x5, #0x80 (128)",
"add x6, x20, x6",
"ldp q17, q6, [x10]",
"ldp q18, q7, [x10, #32]",
"ldp q19, q8, [x10, #64]",
"ldp q20, q9, [x10, #96]",
"add x10, x10, #0x80 (128)",
"stp q17, q6, [x11]",
"stp q18, q7, [x11, #32]",
"stp q19, q8, [x11, #64]",
"stp q20, q9, [x11, #96]",
"add x11, x11, #0x80 (128)",
"eor w27, w6, w11",
"subs x26, x6, x11",
"stp q4, q5, [x28, #128]",
"stp q2, q3, [x28, #96]",
"stp q8, q9, [x28, #64]",
"stp q6, q7, [x28, #32]"
]
},
"bytemark strsift": {
"ExpectedInstructionCount": 20,
"x86Insts": [
"mov rsi,rdx",
"and rsi,0xfffffffffffffffc",
"movq xmm0,rcx",
"pshufd xmm0,xmm0,0x44",
"mov rdi,qword [rsp+0x20]",
"lea rdi,[rdi+r13*8]",
"xor r8d,r8d",
"movdqu xmm1,oword [rdi+r8*8-0x10]",
"movdqu xmm2,oword [rdi+r8*8]",
"paddq xmm1,xmm0",
"paddq xmm2,xmm0",
"movdqu oword [rdi+r8*8-0x10],xmm1",
"movdqu oword [rdi+r8*8],xmm2",
"add r8,0x4",
"cmp rsi,r8"
],
"ExpectedArm64ASM": [
"mov x10, x6",
"and x10, x10, #0xfffffffffffffffc",
"fmov d16, x5",
"dup v16.2d, v16.d[0]",
"ldr x11, [x8, #32]",
"add x11, x11, x17, lsl #3",
"mov w12, #0x0",
"add x20, x11, x12, lsl #3",
"ldur q17, [x20, #-16]",
"add x20, x11, x12, lsl #3",
"ldr q18, [x20]",
"add v17.2d, v17.2d, v16.2d",
"add v18.2d, v18.2d, v16.2d",
"add x20, x11, x12, lsl #3",
"stur q17, [x20, #-16]",
"add x20, x11, x12, lsl #3",
"str q18, [x20]",
"add x12, x12, #0x4 (4)",
"eor w27, w10, w12",
"subs x26, x10, x12"
]
},
"pcmpistri xmm0, xmm1, 0_0_00_11_01b": {
"ExpectedInstructionCount": 41,
"Comment": [

View File

@ -1216,7 +1216,7 @@
]
},
"fxsave [rax]": {
"ExpectedInstructionCount": 52,
"ExpectedInstructionCount": 39,
"Comment": "GROUP15 0x0F 0xAE /0",
"ExpectedArm64ASM": [
"ldrh w20, [x28, #1296]",
@ -1235,42 +1235,29 @@
"ldrb w20, [x28, #1298]",
"strb w20, [x4, #4]",
"ldr q2, [x28, #1040]",
"str q2, [x4, #32]",
"ldr q2, [x28, #1056]",
"str q2, [x4, #48]",
"ldr q3, [x28, #1056]",
"stp q2, q3, [x4, #32]",
"ldr q2, [x28, #1072]",
"str q2, [x4, #64]",
"ldr q2, [x28, #1088]",
"str q2, [x4, #80]",
"ldr q3, [x28, #1088]",
"stp q2, q3, [x4, #64]",
"ldr q2, [x28, #1104]",
"str q2, [x4, #96]",
"ldr q2, [x28, #1120]",
"str q2, [x4, #112]",
"ldr q3, [x28, #1120]",
"stp q2, q3, [x4, #96]",
"ldr q2, [x28, #1136]",
"str q2, [x4, #128]",
"ldr q2, [x28, #1152]",
"str q2, [x4, #144]",
"str q16, [x4, #160]",
"str q17, [x4, #176]",
"str q18, [x4, #192]",
"str q19, [x4, #208]",
"str q20, [x4, #224]",
"str q21, [x4, #240]",
"str q22, [x4, #256]",
"str q23, [x4, #272]",
"str q24, [x4, #288]",
"str q25, [x4, #304]",
"str q26, [x4, #320]",
"str q27, [x4, #336]",
"str q28, [x4, #352]",
"str q29, [x4, #368]",
"str q30, [x4, #384]",
"str q31, [x4, #400]",
"ldr q3, [x28, #1152]",
"stp q2, q3, [x4, #128]",
"stp q16, q17, [x4, #160]",
"stp q18, q19, [x4, #192]",
"stp q20, q21, [x4, #224]",
"stp q22, q23, [x4, #256]",
"stp q24, q25, [x4, #288]",
"stp q26, q27, [x4, #320]",
"stp q28, q29, [x4, #352]",
"stp q30, q31, [x4, #384]",
"ldr w20, [x28, #940]",
"and w20, w20, #0xffc0",
"str w20, [x4, #24]",
"mov w20, #0xffff",
"str w20, [x4, #28]"
"mov w21, #0xffff",
"stp w20, w21, [x4, #24]"
]
},
"rdfsbase eax": {
@ -1288,7 +1275,7 @@
]
},
"fxrstor [rax]": {
"ExpectedInstructionCount": 58,
"ExpectedInstructionCount": 46,
"Comment": "GROUP15 0x0F 0xAE /1",
"ExpectedArm64ASM": [
"ldrh w20, [x4]",
@ -1305,30 +1292,18 @@
"strb w23, [x28, #1018]",
"strb w20, [x28, #1022]",
"ldrb w20, [x4, #4]",
"ldr q2, [x4, #32]",
"ldr q3, [x4, #48]",
"ldr q4, [x4, #64]",
"ldr q5, [x4, #80]",
"ldr q6, [x4, #96]",
"ldr q7, [x4, #112]",
"ldr q8, [x4, #128]",
"ldr q9, [x4, #144]",
"ldr q16, [x4, #160]",
"ldr q17, [x4, #176]",
"ldr q18, [x4, #192]",
"ldr q19, [x4, #208]",
"ldr q20, [x4, #224]",
"ldr q21, [x4, #240]",
"ldr q22, [x4, #256]",
"ldr q23, [x4, #272]",
"ldr q24, [x4, #288]",
"ldr q25, [x4, #304]",
"ldr q26, [x4, #320]",
"ldr q27, [x4, #336]",
"ldr q28, [x4, #352]",
"ldr q29, [x4, #368]",
"ldr q30, [x4, #384]",
"ldr q31, [x4, #400]",
"ldp q2, q3, [x4, #32]",
"ldp q4, q5, [x4, #64]",
"ldp q6, q7, [x4, #96]",
"ldp q8, q9, [x4, #128]",
"ldp q16, q17, [x4, #160]",
"ldp q18, q19, [x4, #192]",
"ldp q20, q21, [x4, #224]",
"ldp q22, q23, [x4, #256]",
"ldp q24, q25, [x4, #288]",
"ldp q26, q27, [x4, #320]",
"ldp q28, q29, [x4, #352]",
"ldp q30, q31, [x4, #384]",
"ldr w21, [x4, #24]",
"and w21, w21, #0xffc0",
"str w21, [x28, #940]",
@ -1422,12 +1397,12 @@
]
},
"xsave [rax]": {
"ExpectedInstructionCount": 98,
"ExpectedInstructionCount": 69,
"Comment": "GROUP15 0x0F 0xAE /4",
"ExpectedArm64ASM": [
"ubfx x20, x4, #0, #1",
"cbnz x20, #+0x8",
"b #+0x80",
"b #+0x70",
"ldrh w20, [x28, #1296]",
"strh w20, [x4]",
"ldrb w20, [x28, #1019]",
@ -1444,83 +1419,54 @@
"ldrb w20, [x28, #1298]",
"strb w20, [x4, #4]",
"ldr q2, [x28, #1040]",
"str q2, [x4, #32]",
"ldr q2, [x28, #1056]",
"str q2, [x4, #48]",
"ldr q3, [x28, #1056]",
"stp q2, q3, [x4, #32]",
"ldr q2, [x28, #1072]",
"str q2, [x4, #64]",
"ldr q2, [x28, #1088]",
"str q2, [x4, #80]",
"ldr q3, [x28, #1088]",
"stp q2, q3, [x4, #64]",
"ldr q2, [x28, #1104]",
"str q2, [x4, #96]",
"ldr q2, [x28, #1120]",
"str q2, [x4, #112]",
"ldr q3, [x28, #1120]",
"stp q2, q3, [x4, #96]",
"ldr q2, [x28, #1136]",
"str q2, [x4, #128]",
"ldr q2, [x28, #1152]",
"str q2, [x4, #144]",
"ldr q3, [x28, #1152]",
"stp q2, q3, [x4, #128]",
"ubfx x20, x4, #1, #1",
"cbnz x20, #+0x8",
"b #+0x44",
"str q16, [x4, #160]",
"str q17, [x4, #176]",
"str q18, [x4, #192]",
"str q19, [x4, #208]",
"str q20, [x4, #224]",
"str q21, [x4, #240]",
"str q22, [x4, #256]",
"str q23, [x4, #272]",
"str q24, [x4, #288]",
"str q25, [x4, #304]",
"str q26, [x4, #320]",
"str q27, [x4, #336]",
"str q28, [x4, #352]",
"str q29, [x4, #368]",
"str q30, [x4, #384]",
"str q31, [x4, #400]",
"b #+0x24",
"stp q16, q17, [x4, #160]",
"stp q18, q19, [x4, #192]",
"stp q20, q21, [x4, #224]",
"stp q22, q23, [x4, #256]",
"stp q24, q25, [x4, #288]",
"stp q26, q27, [x4, #320]",
"stp q28, q29, [x4, #352]",
"stp q30, q31, [x4, #384]",
"ubfx x20, x4, #2, #1",
"cbnz x20, #+0x8",
"b #+0x84",
"ldr q2, [x28, #16]",
"str q2, [x4, #576]",
"ldr q2, [x28, #32]",
"str q2, [x4, #592]",
"ldr q2, [x28, #48]",
"str q2, [x4, #608]",
"ldr q2, [x28, #64]",
"str q2, [x4, #624]",
"ldr q2, [x28, #80]",
"str q2, [x4, #640]",
"ldr q2, [x28, #96]",
"str q2, [x4, #656]",
"ldr q2, [x28, #112]",
"str q2, [x4, #672]",
"ldr q2, [x28, #128]",
"str q2, [x4, #688]",
"ldr q2, [x28, #144]",
"str q2, [x4, #704]",
"ldr q2, [x28, #160]",
"str q2, [x4, #720]",
"ldr q2, [x28, #176]",
"str q2, [x4, #736]",
"ldr q2, [x28, #192]",
"str q2, [x4, #752]",
"ldr q2, [x28, #208]",
"str q2, [x4, #768]",
"ldr q2, [x28, #224]",
"str q2, [x4, #784]",
"ldr q2, [x28, #240]",
"str q2, [x4, #800]",
"ldr q2, [x28, #256]",
"str q2, [x4, #816]",
"b #+0x44",
"ldp q2, q3, [x28, #16]",
"stp q2, q3, [x4, #576]",
"ldp q2, q3, [x28, #48]",
"stp q2, q3, [x4, #608]",
"ldp q2, q3, [x28, #80]",
"stp q2, q3, [x4, #640]",
"ldp q2, q3, [x28, #112]",
"stp q2, q3, [x4, #672]",
"ldp q2, q3, [x28, #144]",
"stp q2, q3, [x4, #704]",
"ldp q2, q3, [x28, #176]",
"stp q2, q3, [x4, #736]",
"ldp q2, q3, [x28, #208]",
"stp q2, q3, [x4, #768]",
"ldp q2, q3, [x28, #240]",
"stp q2, q3, [x4, #800]",
"ubfx x20, x4, #1, #2",
"cbnz x20, #+0x8",
"b #+0x18",
"b #+0x14",
"ldr w20, [x28, #940]",
"and w20, w20, #0xffc0",
"str w20, [x4, #24]",
"mov w20, #0xffff",
"str w20, [x4, #28]",
"mov w21, #0xffff",
"stp w20, w21, [x4, #24]",
"ubfx x20, x4, #0, #3",
"str x20, [x4, #512]"
]
@ -1533,14 +1479,14 @@
]
},
"xrstor [rax]": {
"ExpectedInstructionCount": 166,
"ExpectedInstructionCount": 130,
"Comment": "GROUP15 0x0F 0xAE /5",
"ExpectedArm64ASM": [
"sub sp, sp, #0x40 (64)",
"ldr x20, [x4, #512]",
"ubfx x20, x20, #0, #1",
"cbnz x20, #+0x8",
"b #+0x84",
"b #+0x74",
"ldrh w20, [x4]",
"strh w20, [x28, #1296]",
"ldrh w20, [x4, #2]",
@ -1555,14 +1501,10 @@
"strb w23, [x28, #1018]",
"strb w20, [x28, #1022]",
"ldrb w20, [x4, #4]",
"ldr q2, [x4, #32]",
"ldr q3, [x4, #48]",
"ldr q4, [x4, #64]",
"ldr q5, [x4, #80]",
"ldr q6, [x4, #96]",
"ldr q7, [x4, #112]",
"ldr q8, [x4, #128]",
"ldr q9, [x4, #144]",
"ldp q2, q3, [x4, #32]",
"ldp q4, q5, [x4, #64]",
"ldp q6, q7, [x4, #96]",
"ldp q8, q9, [x4, #128]",
"strb w20, [x28, #1298]",
"str q9, [x28, #1152]",
"str q8, [x28, #1136]",
@ -1593,23 +1535,15 @@
"ldr x20, [x4, #512]",
"ubfx x20, x20, #1, #1",
"cbnz x20, #+0x8",
"b #+0x48",
"ldr q16, [x4, #160]",
"ldr q17, [x4, #176]",
"ldr q18, [x4, #192]",
"ldr q19, [x4, #208]",
"ldr q20, [x4, #224]",
"ldr q21, [x4, #240]",
"ldr q22, [x4, #256]",
"ldr q23, [x4, #272]",
"ldr q24, [x4, #288]",
"ldr q25, [x4, #304]",
"ldr q26, [x4, #320]",
"ldr q27, [x4, #336]",
"ldr q28, [x4, #352]",
"ldr q29, [x4, #368]",
"ldr q30, [x4, #384]",
"ldr q31, [x4, #400]",
"b #+0x28",
"ldp q16, q17, [x4, #160]",
"ldp q18, q19, [x4, #192]",
"ldp q20, q21, [x4, #224]",
"ldp q22, q23, [x4, #256]",
"ldp q24, q25, [x4, #288]",
"ldp q26, q27, [x4, #320]",
"ldp q28, q29, [x4, #352]",
"ldp q30, q31, [x4, #384]",
"b #+0x44",
"movi v31.2d, #0x0",
"mov v30.16b, v31.16b",
@ -1630,61 +1564,37 @@
"ldr x20, [x4, #512]",
"ubfx x20, x20, #2, #1",
"cbnz x20, #+0x8",
"b #+0x98",
"ldr q2, [x4, #576]",
"ldr q3, [x4, #592]",
"ldr q4, [x4, #608]",
"ldr q5, [x4, #624]",
"ldr q6, [x4, #640]",
"ldr q7, [x4, #656]",
"ldr q8, [x4, #672]",
"ldr q9, [x4, #688]",
"ldr q10, [x4, #704]",
"ldr q11, [x4, #720]",
"ldr q12, [x4, #736]",
"ldr q13, [x4, #752]",
"ldr q14, [x4, #768]",
"ldr q15, [x4, #784]",
"b #+0x58",
"ldp q2, q3, [x4, #576]",
"ldp q4, q5, [x4, #608]",
"ldp q6, q7, [x4, #640]",
"ldp q8, q9, [x4, #672]",
"ldp q10, q11, [x4, #704]",
"ldp q12, q13, [x4, #736]",
"ldp q14, q15, [x4, #768]",
"str q2, [sp]",
"ldr q2, [x4, #800]",
"str q3, [sp, #32]",
"ldr q3, [x4, #816]",
"str q3, [x28, #256]",
"str q2, [x28, #240]",
"str q15, [x28, #224]",
"str q14, [x28, #208]",
"str q13, [x28, #192]",
"str q12, [x28, #176]",
"str q11, [x28, #160]",
"str q10, [x28, #144]",
"str q9, [x28, #128]",
"str q8, [x28, #112]",
"str q7, [x28, #96]",
"str q6, [x28, #80]",
"str q5, [x28, #64]",
"str q4, [x28, #48]",
"ldr q2, [sp, #32]",
"str q2, [x28, #32]",
"ldp q2, q3, [x4, #800]",
"stp q2, q3, [x28, #240]",
"stp q14, q15, [x28, #208]",
"stp q12, q13, [x28, #176]",
"stp q10, q11, [x28, #144]",
"stp q8, q9, [x28, #112]",
"stp q6, q7, [x28, #80]",
"stp q4, q5, [x28, #48]",
"ldr q2, [sp]",
"str q2, [x28, #16]",
"b #+0x48",
"ldr q3, [sp, #32]",
"stp q2, q3, [x28, #16]",
"b #+0x28",
"movi v2.2d, #0x0",
"str q2, [x28, #256]",
"str q2, [x28, #240]",
"str q2, [x28, #224]",
"str q2, [x28, #208]",
"str q2, [x28, #192]",
"str q2, [x28, #176]",
"str q2, [x28, #160]",
"str q2, [x28, #144]",
"str q2, [x28, #128]",
"str q2, [x28, #112]",
"str q2, [x28, #96]",
"str q2, [x28, #80]",
"str q2, [x28, #64]",
"str q2, [x28, #48]",
"str q2, [x28, #32]",
"str q2, [x28, #16]",
"stp q2, q2, [x28, #240]",
"stp q2, q2, [x28, #208]",
"stp q2, q2, [x28, #176]",
"stp q2, q2, [x28, #144]",
"stp q2, q2, [x28, #112]",
"stp q2, q2, [x28, #80]",
"stp q2, q2, [x28, #48]",
"stp q2, q2, [x28, #16]",
"ldr x20, [x4, #512]",
"ubfx x20, x20, #1, #2",
"cbnz x20, #+0x8",

View File

@ -1406,7 +1406,7 @@
]
},
"fxsave [rax]": {
"ExpectedInstructionCount": 52,
"ExpectedInstructionCount": 39,
"Comment": "GROUP15 0x0F 0xAE /0",
"ExpectedArm64ASM": [
"ldrh w20, [x28, #1296]",
@ -1425,42 +1425,29 @@
"ldrb w20, [x28, #1298]",
"strb w20, [x4, #4]",
"ldr q2, [x28, #1040]",
"str q2, [x4, #32]",
"ldr q2, [x28, #1056]",
"str q2, [x4, #48]",
"ldr q3, [x28, #1056]",
"stp q2, q3, [x4, #32]",
"ldr q2, [x28, #1072]",
"str q2, [x4, #64]",
"ldr q2, [x28, #1088]",
"str q2, [x4, #80]",
"ldr q3, [x28, #1088]",
"stp q2, q3, [x4, #64]",
"ldr q2, [x28, #1104]",
"str q2, [x4, #96]",
"ldr q2, [x28, #1120]",
"str q2, [x4, #112]",
"ldr q3, [x28, #1120]",
"stp q2, q3, [x4, #96]",
"ldr q2, [x28, #1136]",
"str q2, [x4, #128]",
"ldr q2, [x28, #1152]",
"str q2, [x4, #144]",
"str q16, [x4, #160]",
"str q17, [x4, #176]",
"str q18, [x4, #192]",
"str q19, [x4, #208]",
"str q20, [x4, #224]",
"str q21, [x4, #240]",
"str q22, [x4, #256]",
"str q23, [x4, #272]",
"str q24, [x4, #288]",
"str q25, [x4, #304]",
"str q26, [x4, #320]",
"str q27, [x4, #336]",
"str q28, [x4, #352]",
"str q29, [x4, #368]",
"str q30, [x4, #384]",
"str q31, [x4, #400]",
"ldr q3, [x28, #1152]",
"stp q2, q3, [x4, #128]",
"stp q16, q17, [x4, #160]",
"stp q18, q19, [x4, #192]",
"stp q20, q21, [x4, #224]",
"stp q22, q23, [x4, #256]",
"stp q24, q25, [x4, #288]",
"stp q26, q27, [x4, #320]",
"stp q28, q29, [x4, #352]",
"stp q30, q31, [x4, #384]",
"ldr w20, [x28, #940]",
"and w20, w20, #0xffc0",
"str w20, [x4, #24]",
"mov w20, #0xffff",
"str w20, [x4, #28]"
"mov w21, #0xffff",
"stp w20, w21, [x4, #24]"
]
},
"rdfsbase eax": {
@ -1478,7 +1465,7 @@
]
},
"fxrstor [rax]": {
"ExpectedInstructionCount": 58,
"ExpectedInstructionCount": 46,
"Comment": "GROUP15 0x0F 0xAE /1",
"ExpectedArm64ASM": [
"ldrh w20, [x4]",
@ -1495,30 +1482,18 @@
"strb w23, [x28, #1018]",
"strb w20, [x28, #1022]",
"ldrb w20, [x4, #4]",
"ldr q2, [x4, #32]",
"ldr q3, [x4, #48]",
"ldr q4, [x4, #64]",
"ldr q5, [x4, #80]",
"ldr q6, [x4, #96]",
"ldr q7, [x4, #112]",
"ldr q8, [x4, #128]",
"ldr q9, [x4, #144]",
"ldr q16, [x4, #160]",
"ldr q17, [x4, #176]",
"ldr q18, [x4, #192]",
"ldr q19, [x4, #208]",
"ldr q20, [x4, #224]",
"ldr q21, [x4, #240]",
"ldr q22, [x4, #256]",
"ldr q23, [x4, #272]",
"ldr q24, [x4, #288]",
"ldr q25, [x4, #304]",
"ldr q26, [x4, #320]",
"ldr q27, [x4, #336]",
"ldr q28, [x4, #352]",
"ldr q29, [x4, #368]",
"ldr q30, [x4, #384]",
"ldr q31, [x4, #400]",
"ldp q2, q3, [x4, #32]",
"ldp q4, q5, [x4, #64]",
"ldp q6, q7, [x4, #96]",
"ldp q8, q9, [x4, #128]",
"ldp q16, q17, [x4, #160]",
"ldp q18, q19, [x4, #192]",
"ldp q20, q21, [x4, #224]",
"ldp q22, q23, [x4, #256]",
"ldp q24, q25, [x4, #288]",
"ldp q26, q27, [x4, #320]",
"ldp q28, q29, [x4, #352]",
"ldp q30, q31, [x4, #384]",
"ldr w21, [x4, #24]",
"and w21, w21, #0xffc0",
"str w21, [x28, #940]",
@ -1612,12 +1587,12 @@
]
},
"xsave [rax]": {
"ExpectedInstructionCount": 98,
"ExpectedInstructionCount": 69,
"Comment": "GROUP15 0x0F 0xAE /4",
"ExpectedArm64ASM": [
"ubfx x20, x4, #0, #1",
"cbnz x20, #+0x8",
"b #+0x80",
"b #+0x70",
"ldrh w20, [x28, #1296]",
"strh w20, [x4]",
"ldrb w20, [x28, #1019]",
@ -1634,83 +1609,54 @@
"ldrb w20, [x28, #1298]",
"strb w20, [x4, #4]",
"ldr q2, [x28, #1040]",
"str q2, [x4, #32]",
"ldr q2, [x28, #1056]",
"str q2, [x4, #48]",
"ldr q3, [x28, #1056]",
"stp q2, q3, [x4, #32]",
"ldr q2, [x28, #1072]",
"str q2, [x4, #64]",
"ldr q2, [x28, #1088]",
"str q2, [x4, #80]",
"ldr q3, [x28, #1088]",
"stp q2, q3, [x4, #64]",
"ldr q2, [x28, #1104]",
"str q2, [x4, #96]",
"ldr q2, [x28, #1120]",
"str q2, [x4, #112]",
"ldr q3, [x28, #1120]",
"stp q2, q3, [x4, #96]",
"ldr q2, [x28, #1136]",
"str q2, [x4, #128]",
"ldr q2, [x28, #1152]",
"str q2, [x4, #144]",
"ldr q3, [x28, #1152]",
"stp q2, q3, [x4, #128]",
"ubfx x20, x4, #1, #1",
"cbnz x20, #+0x8",
"b #+0x44",
"str q16, [x4, #160]",
"str q17, [x4, #176]",
"str q18, [x4, #192]",
"str q19, [x4, #208]",
"str q20, [x4, #224]",
"str q21, [x4, #240]",
"str q22, [x4, #256]",
"str q23, [x4, #272]",
"str q24, [x4, #288]",
"str q25, [x4, #304]",
"str q26, [x4, #320]",
"str q27, [x4, #336]",
"str q28, [x4, #352]",
"str q29, [x4, #368]",
"str q30, [x4, #384]",
"str q31, [x4, #400]",
"b #+0x24",
"stp q16, q17, [x4, #160]",
"stp q18, q19, [x4, #192]",
"stp q20, q21, [x4, #224]",
"stp q22, q23, [x4, #256]",
"stp q24, q25, [x4, #288]",
"stp q26, q27, [x4, #320]",
"stp q28, q29, [x4, #352]",
"stp q30, q31, [x4, #384]",
"ubfx x20, x4, #2, #1",
"cbnz x20, #+0x8",
"b #+0x84",
"ldr q2, [x28, #16]",
"str q2, [x4, #576]",
"ldr q2, [x28, #32]",
"str q2, [x4, #592]",
"ldr q2, [x28, #48]",
"str q2, [x4, #608]",
"ldr q2, [x28, #64]",
"str q2, [x4, #624]",
"ldr q2, [x28, #80]",
"str q2, [x4, #640]",
"ldr q2, [x28, #96]",
"str q2, [x4, #656]",
"ldr q2, [x28, #112]",
"str q2, [x4, #672]",
"ldr q2, [x28, #128]",
"str q2, [x4, #688]",
"ldr q2, [x28, #144]",
"str q2, [x4, #704]",
"ldr q2, [x28, #160]",
"str q2, [x4, #720]",
"ldr q2, [x28, #176]",
"str q2, [x4, #736]",
"ldr q2, [x28, #192]",
"str q2, [x4, #752]",
"ldr q2, [x28, #208]",
"str q2, [x4, #768]",
"ldr q2, [x28, #224]",
"str q2, [x4, #784]",
"ldr q2, [x28, #240]",
"str q2, [x4, #800]",
"ldr q2, [x28, #256]",
"str q2, [x4, #816]",
"b #+0x44",
"ldp q2, q3, [x28, #16]",
"stp q2, q3, [x4, #576]",
"ldp q2, q3, [x28, #48]",
"stp q2, q3, [x4, #608]",
"ldp q2, q3, [x28, #80]",
"stp q2, q3, [x4, #640]",
"ldp q2, q3, [x28, #112]",
"stp q2, q3, [x4, #672]",
"ldp q2, q3, [x28, #144]",
"stp q2, q3, [x4, #704]",
"ldp q2, q3, [x28, #176]",
"stp q2, q3, [x4, #736]",
"ldp q2, q3, [x28, #208]",
"stp q2, q3, [x4, #768]",
"ldp q2, q3, [x28, #240]",
"stp q2, q3, [x4, #800]",
"ubfx x20, x4, #1, #2",
"cbnz x20, #+0x8",
"b #+0x18",
"b #+0x14",
"ldr w20, [x28, #940]",
"and w20, w20, #0xffc0",
"str w20, [x4, #24]",
"mov w20, #0xffff",
"str w20, [x4, #28]",
"mov w21, #0xffff",
"stp w20, w21, [x4, #24]",
"ubfx x20, x4, #0, #3",
"str x20, [x4, #512]"
]
@ -1723,14 +1669,14 @@
]
},
"xrstor [rax]": {
"ExpectedInstructionCount": 166,
"ExpectedInstructionCount": 130,
"Comment": "GROUP15 0x0F 0xAE /5",
"ExpectedArm64ASM": [
"sub sp, sp, #0x40 (64)",
"ldr x20, [x4, #512]",
"ubfx x20, x20, #0, #1",
"cbnz x20, #+0x8",
"b #+0x84",
"b #+0x74",
"ldrh w20, [x4]",
"strh w20, [x28, #1296]",
"ldrh w20, [x4, #2]",
@ -1745,14 +1691,10 @@
"strb w23, [x28, #1018]",
"strb w20, [x28, #1022]",
"ldrb w20, [x4, #4]",
"ldr q2, [x4, #32]",
"ldr q3, [x4, #48]",
"ldr q4, [x4, #64]",
"ldr q5, [x4, #80]",
"ldr q6, [x4, #96]",
"ldr q7, [x4, #112]",
"ldr q8, [x4, #128]",
"ldr q9, [x4, #144]",
"ldp q2, q3, [x4, #32]",
"ldp q4, q5, [x4, #64]",
"ldp q6, q7, [x4, #96]",
"ldp q8, q9, [x4, #128]",
"strb w20, [x28, #1298]",
"str q9, [x28, #1152]",
"str q8, [x28, #1136]",
@ -1783,23 +1725,15 @@
"ldr x20, [x4, #512]",
"ubfx x20, x20, #1, #1",
"cbnz x20, #+0x8",
"b #+0x48",
"ldr q16, [x4, #160]",
"ldr q17, [x4, #176]",
"ldr q18, [x4, #192]",
"ldr q19, [x4, #208]",
"ldr q20, [x4, #224]",
"ldr q21, [x4, #240]",
"ldr q22, [x4, #256]",
"ldr q23, [x4, #272]",
"ldr q24, [x4, #288]",
"ldr q25, [x4, #304]",
"ldr q26, [x4, #320]",
"ldr q27, [x4, #336]",
"ldr q28, [x4, #352]",
"ldr q29, [x4, #368]",
"ldr q30, [x4, #384]",
"ldr q31, [x4, #400]",
"b #+0x28",
"ldp q16, q17, [x4, #160]",
"ldp q18, q19, [x4, #192]",
"ldp q20, q21, [x4, #224]",
"ldp q22, q23, [x4, #256]",
"ldp q24, q25, [x4, #288]",
"ldp q26, q27, [x4, #320]",
"ldp q28, q29, [x4, #352]",
"ldp q30, q31, [x4, #384]",
"b #+0x44",
"movi v31.2d, #0x0",
"mov v30.16b, v31.16b",
@ -1820,61 +1754,37 @@
"ldr x20, [x4, #512]",
"ubfx x20, x20, #2, #1",
"cbnz x20, #+0x8",
"b #+0x98",
"ldr q2, [x4, #576]",
"ldr q3, [x4, #592]",
"ldr q4, [x4, #608]",
"ldr q5, [x4, #624]",
"ldr q6, [x4, #640]",
"ldr q7, [x4, #656]",
"ldr q8, [x4, #672]",
"ldr q9, [x4, #688]",
"ldr q10, [x4, #704]",
"ldr q11, [x4, #720]",
"ldr q12, [x4, #736]",
"ldr q13, [x4, #752]",
"ldr q14, [x4, #768]",
"ldr q15, [x4, #784]",
"b #+0x58",
"ldp q2, q3, [x4, #576]",
"ldp q4, q5, [x4, #608]",
"ldp q6, q7, [x4, #640]",
"ldp q8, q9, [x4, #672]",
"ldp q10, q11, [x4, #704]",
"ldp q12, q13, [x4, #736]",
"ldp q14, q15, [x4, #768]",
"str q2, [sp]",
"ldr q2, [x4, #800]",
"str q3, [sp, #32]",
"ldr q3, [x4, #816]",
"str q3, [x28, #256]",
"str q2, [x28, #240]",
"str q15, [x28, #224]",
"str q14, [x28, #208]",
"str q13, [x28, #192]",
"str q12, [x28, #176]",
"str q11, [x28, #160]",
"str q10, [x28, #144]",
"str q9, [x28, #128]",
"str q8, [x28, #112]",
"str q7, [x28, #96]",
"str q6, [x28, #80]",
"str q5, [x28, #64]",
"str q4, [x28, #48]",
"ldr q2, [sp, #32]",
"str q2, [x28, #32]",
"ldp q2, q3, [x4, #800]",
"stp q2, q3, [x28, #240]",
"stp q14, q15, [x28, #208]",
"stp q12, q13, [x28, #176]",
"stp q10, q11, [x28, #144]",
"stp q8, q9, [x28, #112]",
"stp q6, q7, [x28, #80]",
"stp q4, q5, [x28, #48]",
"ldr q2, [sp]",
"str q2, [x28, #16]",
"b #+0x48",
"ldr q3, [sp, #32]",
"stp q2, q3, [x28, #16]",
"b #+0x28",
"movi v2.2d, #0x0",
"str q2, [x28, #256]",
"str q2, [x28, #240]",
"str q2, [x28, #224]",
"str q2, [x28, #208]",
"str q2, [x28, #192]",
"str q2, [x28, #176]",
"str q2, [x28, #160]",
"str q2, [x28, #144]",
"str q2, [x28, #128]",
"str q2, [x28, #112]",
"str q2, [x28, #96]",
"str q2, [x28, #80]",
"str q2, [x28, #64]",
"str q2, [x28, #48]",
"str q2, [x28, #32]",
"str q2, [x28, #16]",
"stp q2, q2, [x28, #240]",
"stp q2, q2, [x28, #208]",
"stp q2, q2, [x28, #176]",
"stp q2, q2, [x28, #144]",
"stp q2, q2, [x28, #112]",
"stp q2, q2, [x28, #80]",
"stp q2, q2, [x28, #48]",
"stp q2, q2, [x28, #16]",
"ldr x20, [x4, #512]",
"ubfx x20, x20, #1, #2",
"cbnz x20, #+0x8",