diff --git a/FEXCore/Source/Interface/Core/JIT/Arm64/MemoryOps.cpp b/FEXCore/Source/Interface/Core/JIT/Arm64/MemoryOps.cpp index cbd471e23..323b41d02 100644 --- a/FEXCore/Source/Interface/Core/JIT/Arm64/MemoryOps.cpp +++ b/FEXCore/Source/Interface/Core/JIT/Arm64/MemoryOps.cpp @@ -48,6 +48,31 @@ DEF_OP(LoadContext) { } } +DEF_OP(LoadContextPair) { + const auto Op = IROp->C(); + + if (Op->Class == FEXCore::IR::GPRClass) { + const auto Dst1 = GetReg(Op->OutValue1.ID()); + const auto Dst2 = GetReg(Op->OutValue2.ID()); + + switch (IROp->Size) { + case 4: ldp(Dst1.W(), Dst2.W(), STATE, Op->Offset); break; + case 8: ldp(Dst1.X(), Dst2.X(), STATE, Op->Offset); break; + default: LOGMAN_MSG_A_FMT("Unhandled LoadContextPair size: {}", IROp->Size); break; + } + } else { + const auto Dst1 = GetVReg(Op->OutValue1.ID()); + const auto Dst2 = GetVReg(Op->OutValue2.ID()); + + switch (IROp->Size) { + case 4: ldp(Dst1.S(), Dst2.S(), STATE, Op->Offset); break; + case 8: ldp(Dst1.D(), Dst2.D(), STATE, Op->Offset); break; + case 16: ldp(Dst1.Q(), Dst2.Q(), STATE, Op->Offset); break; + default: LOGMAN_MSG_A_FMT("Unhandled LoadContextPair size: {}", IROp->Size); break; + } + } +} + DEF_OP(StoreContext) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; @@ -80,6 +105,32 @@ DEF_OP(StoreContext) { } } +DEF_OP(StoreContextPair) { + const auto Op = IROp->C(); + const auto OpSize = IROp->Size; + + if (Op->Class == FEXCore::IR::GPRClass) { + auto Src1 = GetZeroableReg(Op->Value1); + auto Src2 = GetZeroableReg(Op->Value2); + + switch (OpSize) { + case 4: stp(Src1.W(), Src2.W(), STATE, Op->Offset); break; + case 8: stp(Src1.X(), Src2.X(), STATE, Op->Offset); break; + default: LOGMAN_MSG_A_FMT("Unhandled StoreContextPair size: {}", OpSize); break; + } + } else { + const auto Src1 = GetVReg(Op->Value1.ID()); + const auto Src2 = GetVReg(Op->Value2.ID()); + + switch (OpSize) { + case 4: stp(Src1.S(), Src2.S(), STATE, Op->Offset); break; + case 8: stp(Src1.D(), Src2.D(), STATE, Op->Offset); break; + case 
16: stp(Src1.Q(), Src2.Q(), STATE, Op->Offset); break; + default: LOGMAN_MSG_A_FMT("Unhandled StoreContextPair size: {}", OpSize); break; + } + } +} + DEF_OP(LoadRegister) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; @@ -597,6 +648,32 @@ DEF_OP(LoadMem) { } } +DEF_OP(LoadMemPair) { + const auto Op = IROp->C(); + const auto Addr = GetReg(Op->Addr.ID()); + + if (Op->Class == FEXCore::IR::GPRClass) { + const auto Dst1 = GetReg(Op->OutValue1.ID()); + const auto Dst2 = GetReg(Op->OutValue2.ID()); + + switch (IROp->Size) { + case 4: ldp(Dst1.W(), Dst2.W(), Addr, Op->Offset); break; + case 8: ldp(Dst1.X(), Dst2.X(), Addr, Op->Offset); break; + default: LOGMAN_MSG_A_FMT("Unhandled LoadMemPair size: {}", IROp->Size); break; + } + } else { + const auto Dst1 = GetVReg(Op->OutValue1.ID()); + const auto Dst2 = GetVReg(Op->OutValue2.ID()); + + switch (IROp->Size) { + case 4: ldp(Dst1.S(), Dst2.S(), Addr, Op->Offset); break; + case 8: ldp(Dst1.D(), Dst2.D(), Addr, Op->Offset); break; + case 16: ldp(Dst1.Q(), Dst2.Q(), Addr, Op->Offset); break; + default: LOGMAN_MSG_A_FMT("Unhandled LoadMemPair size: {}", IROp->Size); break; + } + } +} + DEF_OP(LoadMemTSO) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; @@ -1443,6 +1520,32 @@ DEF_OP(StoreMem) { } } +DEF_OP(StoreMemPair) { + const auto Op = IROp->C(); + const auto OpSize = IROp->Size; + const auto Addr = GetReg(Op->Addr.ID()); + + if (Op->Class == FEXCore::IR::GPRClass) { + const auto Src1 = GetReg(Op->Value1.ID()); + const auto Src2 = GetReg(Op->Value2.ID()); + switch (OpSize) { + case 4: stp(Src1.W(), Src2.W(), Addr, Op->Offset); break; + case 8: stp(Src1.X(), Src2.X(), Addr, Op->Offset); break; + default: LOGMAN_MSG_A_FMT("Unhandled StoreMemPair size: {}", OpSize); break; + } + } else { + const auto Src1 = GetVReg(Op->Value1.ID()); + const auto Src2 = GetVReg(Op->Value2.ID()); + + switch (OpSize) { + case 4: stp(Src1.S(), Src2.S(), Addr, Op->Offset); break; + case 8: stp(Src1.D(), Src2.D(), Addr, 
Op->Offset); break; + case 16: stp(Src1.Q(), Src2.Q(), Addr, Op->Offset); break; + default: LOGMAN_MSG_A_FMT("Unhandled StoreMemPair size: {}", OpSize); break; + } + } +} + DEF_OP(StoreMemTSO) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; diff --git a/FEXCore/Source/Interface/Core/JIT/Arm64/MiscOps.cpp b/FEXCore/Source/Interface/Core/JIT/Arm64/MiscOps.cpp index 009320564..d593605b7 100644 --- a/FEXCore/Source/Interface/Core/JIT/Arm64/MiscOps.cpp +++ b/FEXCore/Source/Interface/Core/JIT/Arm64/MiscOps.cpp @@ -20,6 +20,7 @@ namespace FEXCore::CPU { DEF_OP(AllocateGPR) {} DEF_OP(AllocateGPRAfter) {} +DEF_OP(AllocateFPR) {} DEF_OP(GuestOpcode) { auto Op = IROp->C(); diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h index c8fe042ef..c0d6e0e2c 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h @@ -1268,7 +1268,23 @@ public: } else { bool Partial = RegCache.Partial & (1ull << Index); unsigned Size = Partial ? 8 : CacheIndexToSize(Index); - _StoreContext(Size, CacheIndexClass(Index), Value, CacheIndexToContextOffset(Index)); + uint64_t NextBit = (1ull << (Index - 1)); + uint32_t Offset = CacheIndexToContextOffset(Index); + auto Class = CacheIndexClass(Index); + + // Use stp where possible to store multiple values at a time. This accelerates AVX. + // TODO: this is all really confusing because of backwards iteration, + // can we peel back that hack? 
+ if ((Bits & NextBit) && !Partial && Size >= 4 && CacheIndexToContextOffset(Index - 1) == Offset - Size && (Offset - Size) / Size < 64) { + LOGMAN_THROW_A_FMT(CacheIndexClass(Index - 1) == Class, "construction"); + LOGMAN_THROW_A_FMT((Offset % Size) == 0, "construction"); + Ref ValueNext = RegCache.Value[Index - 1]; + + _StoreContextPair(Size, Class, ValueNext, Value, Offset - Size); + Bits &= ~NextBit; + } else { + _StoreContext(Size, Class, Value, Offset); + } } Bits &= ~(1ull << Index); @@ -1901,6 +1917,43 @@ private: return RegCache.Value[Index]; } + RefPair AllocatePair(FEXCore::IR::RegisterClassType Class, uint8_t Size) { + if (Class == FPRClass) { + return {_AllocateFPR(Size, Size), _AllocateFPR(Size, Size)}; + } else { + return {_AllocateGPR(false), _AllocateGPR(false)}; + } + } + + RefPair LoadContextPair_Uncached(FEXCore::IR::RegisterClassType Class, uint8_t Size, unsigned Offset) { + RefPair Values = AllocatePair(Class, Size); + _LoadContextPair(Size, Class, Offset, Values.Low, Values.High); + return Values; + } + + RefPair LoadRegCachePair(uint64_t Offset, uint8_t Index, RegisterClassType RegClass, uint8_t Size) { + LOGMAN_THROW_AA_FMT(Index != DFIndex, "must be pairable"); + + // Try to load a pair into the cache + uint64_t Bits = (3ull << (uint64_t)Index); + if (((RegCache.Partial | RegCache.Cached) & Bits) == 0 && ((Offset / Size) < 64)) { + auto Values = LoadContextPair_Uncached(RegClass, Size, Offset); + RegCache.Value[Index] = Values.Low; + RegCache.Value[Index + 1] = Values.High; + RegCache.Cached |= Bits; + if (Size == 8) { + RegCache.Partial |= Bits; + } + return Values; + } + + // Fallback on a pair of loads + return { + .Low = LoadRegCache(Offset, Index, RegClass, Size), + .High = LoadRegCache(Offset + Size, Index + 1, RegClass, Size), + }; + } + Ref LoadGPR(uint8_t Reg) { return LoadRegCache(Reg, GPR0Index + Reg, GPRClass, CTX->GetGPRSize()); } @@ -1909,6 +1962,10 @@ private: return LoadRegCache(CacheIndexToContextOffset(Index), Index, 
CacheIndexClass(Index), Size); } + RefPair LoadContextPair(uint8_t Size, uint8_t Index) { + return LoadRegCachePair(CacheIndexToContextOffset(Index), Index, CacheIndexClass(Index), Size); + } + Ref LoadContext(uint8_t Index) { return LoadContext(CacheIndexToSize(Index), Index); } @@ -2342,7 +2399,7 @@ private: IROp_IRHeader* CurrentHeader {}; Ref _StoreMemAutoTSO(FEXCore::IR::RegisterClassType Class, uint8_t Size, Ref Addr, Ref Value, uint8_t Align = 1) { - if (CTX->IsAtomicTSOEnabled()) { + if (Class == FPRClass ? CTX->IsVectorAtomicTSOEnabled() : CTX->IsAtomicTSOEnabled()) { return _StoreMemTSO(Class, Size, Value, Addr, Invalid(), Align, MEM_OFFSET_SXTX, 1); } else { return _StoreMem(Class, Size, Value, Addr, Invalid(), Align, MEM_OFFSET_SXTX, 1); @@ -2350,7 +2407,7 @@ private: } Ref _LoadMemAutoTSO(FEXCore::IR::RegisterClassType Class, uint8_t Size, Ref ssa0, uint8_t Align = 1) { - if (CTX->IsAtomicTSOEnabled()) { + if (Class == FPRClass ? CTX->IsVectorAtomicTSOEnabled() : CTX->IsAtomicTSOEnabled()) { return _LoadMemTSO(Class, Size, ssa0, Invalid(), Align, MEM_OFFSET_SXTX, 1); } else { return _LoadMem(Class, Size, ssa0, Invalid(), Align, MEM_OFFSET_SXTX, 1); @@ -2368,6 +2425,44 @@ private: } } + AddressMode SelectPairAddressMode(AddressMode A, uint8_t Size) { + AddressMode Out {}; + + signed OffsetEl = A.Offset / Size; + if ((A.Offset % Size) == 0 && OffsetEl >= -64 && OffsetEl < 64) { + Out.Offset = A.Offset; + A.Offset = 0; + } + + Out.Base = LoadEffectiveAddress(A, true, false); + return Out; + } + + + RefPair LoadMemPair(FEXCore::IR::RegisterClassType Class, uint8_t Size, Ref Base, unsigned Offset) { + RefPair Values = AllocatePair(Class, Size); + _LoadMemPair(Class, Size, Base, Offset, Values.Low, Values.High); + return Values; + } + + RefPair _LoadMemPairAutoTSO(FEXCore::IR::RegisterClassType Class, uint8_t Size, AddressMode A, uint8_t Align = 1) { + bool AtomicTSO = CTX->IsAtomicTSOEnabled() && !A.NonTSO; + + // Use ldp if possible, otherwise fallback on 
two loads. + if (!AtomicTSO && !A.Segment && Size >= 4 && Size <= 16) { + A = SelectPairAddressMode(A, Size); + return LoadMemPair(Class, Size, A.Base, A.Offset); + } else { + AddressMode HighA = A; + HighA.Offset += Size; + + return { + .Low = _LoadMemAutoTSO(Class, Size, A, Align), + .High = _LoadMemAutoTSO(Class, Size, HighA, Align), + }; + } + } + Ref _StoreMemAutoTSO(FEXCore::IR::RegisterClassType Class, uint8_t Size, AddressMode A, Ref Value, uint8_t Align = 1) { bool AtomicTSO = CTX->IsAtomicTSOEnabled() && !A.NonTSO; A = SelectAddressMode(A, AtomicTSO, Class != GPRClass, Size); @@ -2379,6 +2474,20 @@ private: } } + void _StoreMemPairAutoTSO(FEXCore::IR::RegisterClassType Class, uint8_t Size, AddressMode A, Ref Value1, Ref Value2, uint8_t Align = 1) { + bool AtomicTSO = CTX->IsAtomicTSOEnabled() && !A.NonTSO; + + // Use stp if possible, otherwise fallback on two stores. + if (!AtomicTSO && !A.Segment && Size >= 4 && Size <= 16) { + A = SelectPairAddressMode(A, Size); + _StoreMemPair(Class, Size, Value1, Value2, A.Base, A.Offset); + } else { + _StoreMemAutoTSO(Class, Size, A, Value1, 1); + A.Offset += Size; + _StoreMemAutoTSO(Class, Size, A, Value2, 1); + } + } + Ref Prefetch(bool ForStore, bool Stream, uint8_t CacheLevel, Ref ssa0) { return _Prefetch(ForStore, Stream, CacheLevel, ssa0, Invalid(), MEM_OFFSET_SXTX, 1); } diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp index 60be2df5a..5f4305c5a 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp @@ -508,10 +508,11 @@ OpDispatchBuilder::RefPair OpDispatchBuilder::AVX128_LoadSource_WithOpSize( LOGMAN_THROW_AA_FMT(!IsVSIB, "VSIB uses LoadVSIB instead"); } - return { - .Low = _LoadMemAutoTSO(FPRClass, 16, A, 1), - .High = NeedsHigh ? 
_LoadMemAutoTSO(FPRClass, 16, HighA, 1) : nullptr, - }; + if (NeedsHigh) { + return _LoadMemPairAutoTSO(FPRClass, 16, A, 1); + } else { + return {.Low = _LoadMemAutoTSO(FPRClass, 16, A, 1)}; + } } } @@ -557,13 +558,10 @@ void OpDispatchBuilder::AVX128_StoreResult_WithOpSize(FEXCore::X86Tables::Decode } else { AddressMode A = DecodeAddress(Op, Operand, AccessType, false /* IsLoad */); - _StoreMemAutoTSO(FPRClass, 16, A, Src.Low, 1); - if (Src.High) { - AddressMode HighA = A; - HighA.Offset += 16; - - _StoreMemAutoTSO(FPRClass, 16, HighA, Src.High, 1); + _StoreMemPairAutoTSO(FPRClass, 16, A, Src.Low, Src.High, 1); + } else { + _StoreMemAutoTSO(FPRClass, 16, A, Src.Low, 1); } } } @@ -2173,18 +2171,20 @@ void OpDispatchBuilder::AVX128_VectorVariableBlend(OpcodeArgs) { void OpDispatchBuilder::AVX128_SaveAVXState(Ref MemBase) { const auto NumRegs = CTX->Config.Is64BitMode ? 16U : 8U; - for (uint32_t i = 0; i < NumRegs; ++i) { - Ref Upper = AVX128_LoadXMMRegister(i, true); - _StoreMem(FPRClass, 16, Upper, MemBase, _Constant(i * 16 + 576), 16, MEM_OFFSET_SXTX, 1); + for (uint32_t i = 0; i < NumRegs; i += 2) { + RefPair Pair = LoadContextPair(16, AVXHigh0Index + i); + _StoreMemPair(FPRClass, 16, Pair.Low, Pair.High, MemBase, i * 16 + 576); } } void OpDispatchBuilder::AVX128_RestoreAVXState(Ref MemBase) { const auto NumRegs = CTX->Config.Is64BitMode ? 
16U : 8U; - for (uint32_t i = 0; i < NumRegs; ++i) { - Ref YMMHReg = _LoadMem(FPRClass, 16, MemBase, _Constant(i * 16 + 576), 16, MEM_OFFSET_SXTX, 1); - AVX128_StoreXMMRegister(i, YMMHReg, true); + for (uint32_t i = 0; i < NumRegs; i += 2) { + auto YMMHRegs = LoadMemPair(FPRClass, 16, MemBase, i * 16 + 576); + + AVX128_StoreXMMRegister(i, YMMHRegs.Low, true); + AVX128_StoreXMMRegister(i + 1, YMMHRegs.High, true); } } diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp index f7da91e02..7b82f5135 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp @@ -2736,37 +2736,33 @@ void OpDispatchBuilder::SaveX87State(OpcodeArgs, Ref MemBase) { // MXCSR_MASK: Mask for writes to the MXCSR register // If OSFXSR bit in CR4 is not set than FXSAVE /may/ not save the XMM registers // This is implementation dependent - for (uint32_t i = 0; i < Core::CPUState::NUM_MMS; ++i) { - Ref MMReg = LoadContext(MM0Index + i); - - _StoreMem(FPRClass, 16, MMReg, MemBase, _Constant(i * 16 + 32), 16, MEM_OFFSET_SXTX, 1); + for (uint32_t i = 0; i < Core::CPUState::NUM_MMS; i += 2) { + RefPair MMRegs = LoadContextPair(16, MM0Index + i); + _StoreMemPair(FPRClass, 16, MMRegs.Low, MMRegs.High, MemBase, i * 16 + 32); } } void OpDispatchBuilder::SaveSSEState(Ref MemBase) { const auto NumRegs = CTX->Config.Is64BitMode ? 16U : 8U; - for (uint32_t i = 0; i < NumRegs; ++i) { - Ref XMMReg = LoadXMMRegister(i); - - _StoreMem(FPRClass, 16, XMMReg, MemBase, _Constant(i * 16 + 160), 16, MEM_OFFSET_SXTX, 1); + for (uint32_t i = 0; i < NumRegs; i += 2) { + _StoreMemPair(FPRClass, 16, LoadXMMRegister(i), LoadXMMRegister(i + 1), MemBase, i * 16 + 160); } } void OpDispatchBuilder::SaveMXCSRState(Ref MemBase) { - _StoreMem(GPRClass, 4, GetMXCSR(), MemBase, _Constant(24), 4, MEM_OFFSET_SXTX, 1); - - // Store the mask for all bits. 
- _StoreMem(GPRClass, 4, _Constant(0xFFFF), MemBase, _Constant(28), 4, MEM_OFFSET_SXTX, 1); + // Store MXCSR and the mask for all bits. + _StoreMemPair(GPRClass, 4, GetMXCSR(), _Constant(0xFFFF), MemBase, 24); } void OpDispatchBuilder::SaveAVXState(Ref MemBase) { const auto NumRegs = CTX->Config.Is64BitMode ? 16U : 8U; - for (uint32_t i = 0; i < NumRegs; ++i) { - Ref Upper = _VDupElement(32, 16, LoadXMMRegister(i), 1); + for (uint32_t i = 0; i < NumRegs; i += 2) { + Ref Upper0 = _VDupElement(32, 16, LoadXMMRegister(i + 0), 1); + Ref Upper1 = _VDupElement(32, 16, LoadXMMRegister(i + 1), 1); - _StoreMem(FPRClass, 16, Upper, MemBase, _Constant(i * 16 + 576), 16, MEM_OFFSET_SXTX, 1); + _StoreMemPair(FPRClass, 16, Upper0, Upper1, MemBase, i * 16 + 576); } } @@ -2868,18 +2864,22 @@ void OpDispatchBuilder::RestoreX87State(Ref MemBase) { StoreContext(AbridgedFTWIndex, _LoadMem(GPRClass, 1, MemBase, _Constant(4), 2, MEM_OFFSET_SXTX, 1)); } - for (uint32_t i = 0; i < Core::CPUState::NUM_MMS; ++i) { - auto MMReg = _LoadMem(FPRClass, 16, MemBase, _Constant(i * 16 + 32), 16, MEM_OFFSET_SXTX, 1); - StoreContext(MM0Index + i, MMReg); + for (uint32_t i = 0; i < Core::CPUState::NUM_MMS; i += 2) { + auto MMRegs = LoadMemPair(FPRClass, 16, MemBase, i * 16 + 32); + + StoreContext(MM0Index + i, MMRegs.Low); + StoreContext(MM0Index + i + 1, MMRegs.High); } } void OpDispatchBuilder::RestoreSSEState(Ref MemBase) { const auto NumRegs = CTX->Config.Is64BitMode ? 
16U : 8U; - for (uint32_t i = 0; i < NumRegs; ++i) { - Ref XMMReg = _LoadMem(FPRClass, 16, MemBase, _Constant(i * 16 + 160), 16, MEM_OFFSET_SXTX, 1); - StoreXMMRegister(i, XMMReg); + for (uint32_t i = 0; i < NumRegs; i += 2) { + auto XMMRegs = LoadMemPair(FPRClass, 16, MemBase, i * 16 + 160); + + StoreXMMRegister(i, XMMRegs.Low); + StoreXMMRegister(i + 1, XMMRegs.High); } } @@ -2896,11 +2896,12 @@ void OpDispatchBuilder::RestoreMXCSRState(Ref MXCSR) { void OpDispatchBuilder::RestoreAVXState(Ref MemBase) { const auto NumRegs = CTX->Config.Is64BitMode ? 16U : 8U; - for (uint32_t i = 0; i < NumRegs; ++i) { - Ref XMMReg = LoadXMMRegister(i); - Ref YMMHReg = _LoadMem(FPRClass, 16, MemBase, _Constant(i * 16 + 576), 16, MEM_OFFSET_SXTX, 1); - Ref YMM = _VInsElement(32, 16, 1, 0, XMMReg, YMMHReg); - StoreXMMRegister(i, YMM); + for (uint32_t i = 0; i < NumRegs; i += 2) { + Ref XMMReg0 = LoadXMMRegister(i + 0); + Ref XMMReg1 = LoadXMMRegister(i + 1); + auto YMMHRegs = LoadMemPair(FPRClass, 16, MemBase, i * 16 + 576); + StoreXMMRegister(i + 0, _VInsElement(32, 16, 1, 0, XMMReg0, YMMHRegs.Low)); + StoreXMMRegister(i + 1, _VInsElement(32, 16, 1, 0, XMMReg1, YMMHRegs.High)); } } diff --git a/FEXCore/Source/Interface/IR/IR.json b/FEXCore/Source/Interface/IR/IR.json index 5233db745..54379abc7 100644 --- a/FEXCore/Source/Interface/IR/IR.json +++ b/FEXCore/Source/Interface/IR/IR.json @@ -253,6 +253,11 @@ "If ForPair is set, RA will try to allocate the base of a register pair"], "DestSize": "8" }, + "FPR = AllocateFPR u8:#RegisterSize, u8:#ElementSize": { + "Desc": ["Like AllocateGPR, but for FPR"], + "DestSize": "RegisterSize", + "NumElements": "RegisterSize / ElementSize" + }, "GPR = AllocateGPRAfter GPR:$After": { "Desc": ["Silly pseudo-instruction to allocate a register for a future destination", "This is a kludge to deal with the IR's lack of multiple destinations", @@ -386,6 +391,20 @@ ] }, + "SSA:$Value1, SSA:$Value2 = LoadContextPair u8:#ByteSize, RegisterClass:$Class, 
u32:$Offset": { + "Desc": ["Loads a pair of values from the context with offset", + "Value1 = Ctx[Offset], Value2 = Ctx[Offset + ByteSize]" + ], + "HasSideEffects": true, + "DestSize": "ByteSize", + "EmitValidation": [ + "($Class == GPRClass && (#ByteSize == 1 || #ByteSize == 2 || #ByteSize == 4 || #ByteSize == 8)) || $Class == FPRClass", + "($Class == FPRClass && (#ByteSize == 1 || #ByteSize == 2 || #ByteSize == 4 || #ByteSize == 8 || #ByteSize == 16 || #ByteSize == 32)) || $Class == GPRClass", + "!($Offset >= offsetof(Core::CPUState, gregs[0]) && $Offset < offsetof(Core::CPUState, gregs[16])) && \"Can't LoadContext to GPR\"", + "!($Offset >= offsetof(Core::CPUState, xmm.avx.data[0]) && $Offset < offsetof(Core::CPUState, xmm.avx.data[16])) && \"Can't LoadContext to XMM\"" + ] + }, + "StoreContext u8:#ByteSize, RegisterClass:$Class, SSA:$Value, u32:$Offset": { "Desc": ["Stores a value to the context with offset", "Ctx[Offset] = Value", @@ -403,6 +422,24 @@ ] }, + "StoreContextPair u8:#ByteSize, RegisterClass:$Class, SSA:$Value1, SSA:$Value2, u32:$Offset": { + "Desc": ["Stores a pair of values to the context with offset", + "Ctx[Offset] = Value1, Ctx[Offset + ByteSize] = Value2", + "Zero Extends if value's type is too small", + "Truncates if value's type is too large" + ], + "HasSideEffects": true, + "DestSize": "ByteSize", + "EmitValidation": [ + "WalkFindRegClass($Value1) == $Class", + "WalkFindRegClass($Value2) == $Class", + "($Class == GPRClass && (#ByteSize == 1 || #ByteSize == 2 || #ByteSize == 4 || #ByteSize == 8)) || $Class == FPRClass", + "($Class == FPRClass && (#ByteSize == 1 || #ByteSize == 2 || #ByteSize == 4 || #ByteSize == 8 || #ByteSize == 16 || #ByteSize == 32)) || $Class == GPRClass", + "!($Offset >= offsetof(Core::CPUState, gregs[0]) && $Offset < offsetof(Core::CPUState, gregs[16])) && \"Can't StoreContext to GPR\"", + "!($Offset >= offsetof(Core::CPUState, xmm.avx.data[0]) && $Offset < offsetof(Core::CPUState, xmm.avx.data[16])) && \"Can't 
StoreContext to XMM\"" + ] + }, + "SSA = LoadContextIndexed GPR:$Index, u8:#ByteSize, u32:$BaseOffset, u32:$Stride, RegisterClass:$Class": { "Desc": ["Loads a value from the context with offset and indexed by SSA value", "Dest = Ctx[BaseOffset + Index * Stride]" @@ -476,6 +513,12 @@ "DestSize": "Size" }, + "SSA:$Value1, SSA:$Value2 = LoadMemPair RegisterClass:$Class, u8:#Size, GPR:$Addr, u32:$Offset": { + "Desc": ["Load a pair of values from memory."], + "DestSize": "Size", + "HasSideEffects": true + }, + "StoreMem RegisterClass:$Class, u8:#Size, SSA:$Value, GPR:$Addr, GPR:$Offset, u8:$Align, MemOffsetType:$OffsetType, u8:$OffsetScale": { "Desc": [ "Stores a value to memory.", "Zero Extends if value's type is too small", @@ -488,6 +531,19 @@ ] }, + "StoreMemPair RegisterClass:$Class, u8:#Size, SSA:$Value1, SSA:$Value2, GPR:$Addr, u32:$Offset": { + "Desc": [ "Stores a pair of values to memory.", + "Zero Extends if value's type is too small", + "Truncates if value's type is too large" + ], + "HasSideEffects": true, + "DestSize": "Size", + "EmitValidation": [ + "WalkFindRegClass($Value1) == $Class", + "WalkFindRegClass($Value2) == $Class" + ] + }, + "SSA = LoadMemTSO RegisterClass:$Class, u8:#Size, GPR:$Addr, GPR:$Offset, u8:$Align, MemOffsetType:$OffsetType, u8:$OffsetScale": { "Desc": ["Does a x86 TSO compatible load from memory. Offset must be Invalid()." 
], diff --git a/unittests/InstructionCountCI/AVX128/VEX_map1.json b/unittests/InstructionCountCI/AVX128/VEX_map1.json index c0460c262..57a0edb0e 100644 --- a/unittests/InstructionCountCI/AVX128/VEX_map1.json +++ b/unittests/InstructionCountCI/AVX128/VEX_map1.json @@ -46,13 +46,12 @@ ] }, "vmovups ymm0, [rax]": { - "ExpectedInstructionCount": 3, + "ExpectedInstructionCount": 2, "Comment": [ "Map 1 0b00 0x10 256-bit" ], "ExpectedArm64ASM": [ - "ldr q16, [x4]", - "ldr q2, [x4, #16]", + "ldp q16, q2, [x4]", "str q2, [x28, #16]" ] }, @@ -89,13 +88,12 @@ ] }, "vmovupd ymm0, [rax]": { - "ExpectedInstructionCount": 3, + "ExpectedInstructionCount": 2, "Comment": [ "Map 1 0b01 0x10 256-bit" ], "ExpectedArm64ASM": [ - "ldr q16, [x4]", - "ldr q2, [x4, #16]", + "ldp q16, q2, [x4]", "str q2, [x28, #16]" ] }, @@ -156,14 +154,13 @@ ] }, "vmovups [rax], ymm0": { - "ExpectedInstructionCount": 3, + "ExpectedInstructionCount": 2, "Comment": [ "Map 1 0b00 0x11 256-bit" ], "ExpectedArm64ASM": [ "ldr q2, [x28, #16]", - "str q16, [x4]", - "str q2, [x4, #16]" + "stp q16, q2, [x4]" ] }, "vmovupd [rax], xmm0": { @@ -176,14 +173,13 @@ ] }, "vmovupd [rax], ymm0": { - "ExpectedInstructionCount": 3, + "ExpectedInstructionCount": 2, "Comment": [ "Map 1 0b01 0x11 256-bit" ], "ExpectedArm64ASM": [ "ldr q2, [x28, #16]", - "str q16, [x4]", - "str q2, [x4, #16]" + "stp q16, q2, [x4]" ] }, "vmovss [rax], xmm0": { @@ -272,13 +268,12 @@ ] }, "vmovsldup ymm0, [rax]": { - "ExpectedInstructionCount": 5, + "ExpectedInstructionCount": 4, "Comment": [ "Map 1 0b10 0x12 256-bit" ], "ExpectedArm64ASM": [ - "ldr q2, [x4]", - "ldr q3, [x4, #16]", + "ldp q2, q3, [x4]", "trn1 v16.4s, v2.4s, v2.4s", "trn1 v2.4s, v3.4s, v3.4s", "str q2, [x28, #16]" @@ -297,13 +292,12 @@ ] }, "vmovddup ymm0, [rax]": { - "ExpectedInstructionCount": 5, + "ExpectedInstructionCount": 4, "Comment": [ "Map 1 0b11 0x12 256-bit" ], "ExpectedArm64ASM": [ - "ldr q2, [x4]", - "ldr q3, [x4, #16]", + "ldp q2, q3, [x4]", "dup v16.2d, v2.d[0]", "dup 
v2.2d, v3.d[0]", "str q2, [x28, #16]" @@ -340,14 +334,13 @@ ] }, "vunpcklps ymm0, ymm1, [rax]": { - "ExpectedInstructionCount": 6, + "ExpectedInstructionCount": 5, "Comment": [ "Map 1 0b00 0x14 256-bit" ], "ExpectedArm64ASM": [ "ldr q2, [x28, #32]", - "ldr q3, [x4]", - "ldr q4, [x4, #16]", + "ldp q3, q4, [x4]", "zip1 v16.4s, v17.4s, v3.4s", "zip1 v2.4s, v2.4s, v4.4s", "str q2, [x28, #16]" @@ -366,14 +359,13 @@ ] }, "vunpcklpd ymm0, ymm1, [rax]": { - "ExpectedInstructionCount": 6, + "ExpectedInstructionCount": 5, "Comment": [ "Map 1 0b01 0x14 256-bit" ], "ExpectedArm64ASM": [ "ldr q2, [x28, #32]", - "ldr q3, [x4]", - "ldr q4, [x4, #16]", + "ldp q3, q4, [x4]", "zip1 v16.2d, v17.2d, v3.2d", "zip1 v2.2d, v2.2d, v4.2d", "str q2, [x28, #16]" @@ -392,14 +384,13 @@ ] }, "vunpckhps ymm0, ymm1, [rax]": { - "ExpectedInstructionCount": 6, + "ExpectedInstructionCount": 5, "Comment": [ "Map 1 0b00 0x15 256-bit" ], "ExpectedArm64ASM": [ "ldr q2, [x28, #32]", - "ldr q3, [x4]", - "ldr q4, [x4, #16]", + "ldp q3, q4, [x4]", "zip2 v16.4s, v17.4s, v3.4s", "zip2 v2.4s, v2.4s, v4.4s", "str q2, [x28, #16]" @@ -418,14 +409,13 @@ ] }, "vunpckhpd ymm0, ymm1, [rax]": { - "ExpectedInstructionCount": 6, + "ExpectedInstructionCount": 5, "Comment": [ "Map 1 0b01 0x15 256-bit" ], "ExpectedArm64ASM": [ "ldr q2, [x28, #32]", - "ldr q3, [x4]", - "ldr q4, [x4, #16]", + "ldp q3, q4, [x4]", "zip2 v16.2d, v17.2d, v3.2d", "zip2 v2.2d, v2.2d, v4.2d", "str q2, [x28, #16]" @@ -479,13 +469,12 @@ ] }, "vmovshdup ymm0, [rax]": { - "ExpectedInstructionCount": 5, + "ExpectedInstructionCount": 4, "Comment": [ "Map 1 0b10 0x16 256-bit" ], "ExpectedArm64ASM": [ - "ldr q2, [x4]", - "ldr q3, [x4, #16]", + "ldp q2, q3, [x4]", "trn2 v16.4s, v2.4s, v2.4s", "trn2 v2.4s, v3.4s, v3.4s", "str q2, [x28, #16]" @@ -1634,33 +1623,25 @@ ] }, "vzeroupper": { - "ExpectedInstructionCount": 17, + "ExpectedInstructionCount": 9, "Comment": [ "Might be able to use DZ ZVA", "Map 1 0b01 0x77 L=0" ], "ExpectedArm64ASM": [ "movi v2.2d, 
#0x0", - "str q2, [x28, #256]", - "str q2, [x28, #240]", - "str q2, [x28, #224]", - "str q2, [x28, #208]", - "str q2, [x28, #192]", - "str q2, [x28, #176]", - "str q2, [x28, #160]", - "str q2, [x28, #144]", - "str q2, [x28, #128]", - "str q2, [x28, #112]", - "str q2, [x28, #96]", - "str q2, [x28, #80]", - "str q2, [x28, #64]", - "str q2, [x28, #48]", - "str q2, [x28, #32]", - "str q2, [x28, #16]" + "stp q2, q2, [x28, #240]", + "stp q2, q2, [x28, #208]", + "stp q2, q2, [x28, #176]", + "stp q2, q2, [x28, #144]", + "stp q2, q2, [x28, #112]", + "stp q2, q2, [x28, #80]", + "stp q2, q2, [x28, #48]", + "stp q2, q2, [x28, #16]" ] }, "vzeroall": { - "ExpectedInstructionCount": 32, + "ExpectedInstructionCount": 24, "Comment": [ "Might be able to use DZ ZVA", "Map 1 0b01 0x77 L=1" @@ -1682,22 +1663,14 @@ "movi v29.2d, #0x0", "movi v30.2d, #0x0", "movi v31.2d, #0x0", - "str q31, [x28, #256]", - "str q31, [x28, #240]", - "str q31, [x28, #224]", - "str q31, [x28, #208]", - "str q31, [x28, #192]", - "str q31, [x28, #176]", - "str q31, [x28, #160]", - "str q31, [x28, #144]", - "str q31, [x28, #128]", - "str q31, [x28, #112]", - "str q31, [x28, #96]", - "str q31, [x28, #80]", - "str q31, [x28, #64]", - "str q31, [x28, #48]", - "str q31, [x28, #32]", - "str q31, [x28, #16]" + "stp q31, q31, [x28, #240]", + "stp q31, q31, [x28, #208]", + "stp q31, q31, [x28, #176]", + "stp q31, q31, [x28, #144]", + "stp q31, q31, [x28, #112]", + "stp q31, q31, [x28, #80]", + "stp q31, q31, [x28, #48]", + "stp q31, q31, [x28, #16]" ] }, "vcmpps xmm0, xmm1, xmm2, 0x00": { @@ -2631,13 +2604,12 @@ ] }, "vmovaps ymm0, [rax]": { - "ExpectedInstructionCount": 3, + "ExpectedInstructionCount": 2, "Comment": [ "Map 1 0b00 0x28 256-bit" ], "ExpectedArm64ASM": [ - "ldr q16, [x4]", - "ldr q2, [x4, #16]", + "ldp q16, q2, [x4]", "str q2, [x28, #16]" ] }, @@ -2675,13 +2647,12 @@ ] }, "vmovapd ymm0, [rax]": { - "ExpectedInstructionCount": 3, + "ExpectedInstructionCount": 2, "Comment": [ "Map 1 0b01 0x28 256-bit" ], 
"ExpectedArm64ASM": [ - "ldr q16, [x4]", - "ldr q2, [x4, #16]", + "ldp q16, q2, [x4]", "str q2, [x28, #16]" ] }, @@ -2717,14 +2688,13 @@ ] }, "vmovaps [rax], ymm0": { - "ExpectedInstructionCount": 3, + "ExpectedInstructionCount": 2, "Comment": [ "Map 1 0b00 0x29 256-bit" ], "ExpectedArm64ASM": [ "ldr q2, [x28, #16]", - "str q16, [x4]", - "str q2, [x4, #16]" + "stp q16, q2, [x4]" ] }, "vmovapd [rax], xmm0": { @@ -2737,14 +2707,13 @@ ] }, "vmovapd [rax], ymm0": { - "ExpectedInstructionCount": 3, + "ExpectedInstructionCount": 2, "Comment": [ "Map 1 0b01 0x29 256-bit" ], "ExpectedArm64ASM": [ "ldr q2, [x28, #16]", - "str q16, [x4]", - "str q2, [x4, #16]" + "stp q16, q2, [x4]" ] }, "vcvtsi2ss xmm0, xmm1, eax": { @@ -3161,13 +3130,12 @@ ] }, "vcvtpd2ps xmm0, yword [rax]": { - "ExpectedInstructionCount": 8, + "ExpectedInstructionCount": 7, "Comment": [ "Map 1 0b01 0x5a 128-bit" ], "ExpectedArm64ASM": [ - "ldr q2, [x4]", - "ldr q3, [x4, #16]", + "ldp q2, q3, [x4]", "fcvtn v2.2s, v2.2d", "fcvtn v3.2s, v3.2d", "mov v16.16b, v2.16b", @@ -4005,47 +3973,43 @@ ] }, "vmovdqa ymm0, [rax]": { - "ExpectedInstructionCount": 3, + "ExpectedInstructionCount": 2, "Comment": [ "Map 1 0b01 0x7f 128-bit" ], "ExpectedArm64ASM": [ - "ldr q16, [x4]", - "ldr q2, [x4, #16]", + "ldp q16, q2, [x4]", "str q2, [x28, #16]" ] }, "vmovdqa [rax], ymm0": { - "ExpectedInstructionCount": 3, + "ExpectedInstructionCount": 2, "Comment": [ "Map 1 0b01 0x7f 128-bit" ], "ExpectedArm64ASM": [ "ldr q2, [x28, #16]", - "str q16, [x4]", - "str q2, [x4, #16]" + "stp q16, q2, [x4]" ] }, "vmovdqu ymm0, [rax]": { - "ExpectedInstructionCount": 3, + "ExpectedInstructionCount": 2, "Comment": [ "Map 1 0b10 0x7f 128-bit" ], "ExpectedArm64ASM": [ - "ldr q16, [x4]", - "ldr q2, [x4, #16]", + "ldp q16, q2, [x4]", "str q2, [x28, #16]" ] }, "vmovdqu [rax], ymm0": { - "ExpectedInstructionCount": 3, + "ExpectedInstructionCount": 2, "Comment": [ "Map 1 0b10 0x7f 128-bit" ], "ExpectedArm64ASM": [ "ldr q2, [x28, #16]", - "str q16, 
[x4]", - "str q2, [x4, #16]" + "stp q16, q2, [x4]" ] }, "vaddsubpd xmm0, xmm1, xmm2": { @@ -5178,13 +5142,12 @@ ] }, "vlddqu ymm0, [rax]": { - "ExpectedInstructionCount": 3, + "ExpectedInstructionCount": 2, "Comment": [ "Map 1 0b11 0xf0 256-bit" ], "ExpectedArm64ASM": [ - "ldr q16, [x4]", - "ldr q2, [x4, #16]", + "ldp q16, q2, [x4]", "str q2, [x28, #16]" ] }, diff --git a/unittests/InstructionCountCI/FEXOpt/AddressingLimitations.json b/unittests/InstructionCountCI/FEXOpt/AddressingLimitations.json index 8c0f0fef9..17602d097 100644 --- a/unittests/InstructionCountCI/FEXOpt/AddressingLimitations.json +++ b/unittests/InstructionCountCI/FEXOpt/AddressingLimitations.json @@ -1052,6 +1052,86 @@ "ldr x20, [x28, #960]", "ldur x7, [x20, #20]" ] + }, + "vmovdqu ymm7,yword [rsi+0x60]": { + "ExpectedInstructionCount": 2, + "ExpectedArm64ASM": [ + "ldp q23, q2, [x10, #96]", + "str q2, [x28, #128]" + ] + }, + "vmovdqu ymm7,yword [rsi+0x120]": { + "ExpectedInstructionCount": 2, + "ExpectedArm64ASM": [ + "ldp q23, q2, [x10, #288]", + "str q2, [x28, #128]" + ] + }, + "vmovdqu ymm7,yword [rsi-0x60]": { + "ExpectedInstructionCount": 2, + "ExpectedArm64ASM": [ + "ldp q23, q2, [x10, #-96]", + "str q2, [x28, #128]" + ] + }, + "vmovdqu ymm7,yword [rsi-0x400]": { + "ExpectedInstructionCount": 2, + "ExpectedArm64ASM": [ + "ldp q23, q2, [x10, #-1024]", + "str q2, [x28, #128]" + ] + }, + "vmovdqu ymm7,yword [rsi-0x420]": { + "ExpectedInstructionCount": 3, + "ExpectedArm64ASM": [ + "sub x20, x10, #0x420 (1056)", + "ldp q23, q2, [x20]", + "str q2, [x28, #128]" + ] + }, + "vmovdqu ymm7,yword [rsi+0x3d0]": { + "ExpectedInstructionCount": 2, + "ExpectedArm64ASM": [ + "ldp q23, q2, [x10, #976]", + "str q2, [x28, #128]" + ] + }, + "vmovdqu ymm7,yword [rsi+0x400]": { + "ExpectedInstructionCount": 3, + "ExpectedArm64ASM": [ + "add x20, x10, #0x400 (1024)", + "ldp q23, q2, [x20]", + "str q2, [x28, #128]" + ] + }, + "vmovdqa yword [rcx+0x60],ymm1": { + "ExpectedInstructionCount": 2, + 
"ExpectedArm64ASM": [ + "ldr q2, [x28, #32]", + "stp q17, q2, [x5, #96]" + ] + }, + "vmovdqa yword [rcx+0x3d0],ymm1": { + "ExpectedInstructionCount": 2, + "ExpectedArm64ASM": [ + "ldr q2, [x28, #32]", + "stp q17, q2, [x5, #976]" + ] + }, + "vmovdqa yword [rcx-0x3d0],ymm1": { + "ExpectedInstructionCount": 2, + "ExpectedArm64ASM": [ + "ldr q2, [x28, #32]", + "stp q17, q2, [x5, #-976]" + ] + }, + "vmovdqa yword [rcx+rsi-0x3d0],ymm1": { + "ExpectedInstructionCount": 3, + "ExpectedArm64ASM": [ + "ldr q2, [x28, #32]", + "add x20, x5, x10", + "stp q17, q2, [x20, #-976]" + ] } } } diff --git a/unittests/InstructionCountCI/FlagM/HotBlocks.json b/unittests/InstructionCountCI/FlagM/HotBlocks.json index c12cd6a5e..98e31707f 100644 --- a/unittests/InstructionCountCI/FlagM/HotBlocks.json +++ b/unittests/InstructionCountCI/FlagM/HotBlocks.json @@ -428,6 +428,161 @@ "mov x11, x26" ] }, + "glibc AVX memcpy block 1": { + "ExpectedInstructionCount": 26, + "x86Insts": [ + "vmovdqu ymm5,yword [rsi+0x20]", + "vmovdqu ymm6,yword [rsi+0x40]", + "lea rcx,[rdi+rdx*1-0x81]", + "vmovdqu ymm7,yword [rsi+0x60]", + "vmovdqu ymm8,yword [rsi+rdx*1-0x20]", + "sub rsi,rdi", + "and rcx,0xffffffffffffffe0", + "add rsi,rcx", + "nop dword [rax+0x0]", + "vmovdqu ymm1,yword [rsi+0x60]", + "vmovdqu ymm2,yword [rsi+0x40]", + "vmovdqu ymm3,yword [rsi+0x20]", + "vmovdqu ymm4,yword [rsi]", + "add rsi,0xffffffffffffff80", + "vmovdqa yword [rcx+0x60],ymm1", + "vmovdqa yword [rcx+0x40],ymm2", + "vmovdqa yword [rcx+0x20],ymm3", + "vmovdqa yword [rcx],ymm4", + "add rcx,0xffffffffffffff80", + "cmp rdi,rcx" + ], + "ExpectedArm64ASM": [ + "ldp q21, q2, [x10, #32]", + "ldp q22, q3, [x10, #64]", + "sub x20, x11, #0x81 (129)", + "add x5, x20, x6", + "ldp q23, q4, [x10, #96]", + "add x20, x10, x6", + "ldp q24, q5, [x20, #-32]", + "sub x10, x10, x11", + "and x5, x5, #0xffffffffffffffe0", + "add x10, x10, x5", + "ldp q17, q6, [x10, #96]", + "ldp q18, q7, [x10, #64]", + "ldp q19, q8, [x10, #32]", + "ldp q20, q9, [x10]", + 
"sub x10, x10, #0x80 (128)", + "stp q17, q6, [x5, #96]", + "stp q18, q7, [x5, #64]", + "stp q19, q8, [x5, #32]", + "stp q20, q9, [x5]", + "sub x5, x5, #0x80 (128)", + "eor w27, w11, w5", + "subs x26, x11, x5", + "stp q4, q5, [x28, #128]", + "stp q2, q3, [x28, #96]", + "stp q8, q9, [x28, #64]", + "stp q6, q7, [x28, #32]" + ] + }, + "glibc AVX memcpy block 2": { + "ExpectedInstructionCount": 31, + "x86Insts": [ + "vmovdqu ymm5,yword [rsi+rdx*1-0x20]", + "vmovdqu ymm6,yword [rsi+rdx*1-0x40]", + "mov rcx,rdi", + "or rdi,0x1f", + "vmovdqu ymm7,yword [rsi+rdx*1-0x60]", + "vmovdqu ymm8,yword [rsi+rdx*1-0x80]", + "sub rsi,rcx", + "inc rdi", + "add rsi,rdi", + "lea rdx,[rcx+rdx*1-0x80]", + "nop dword [rax+rax*1+0x0]", + "vmovdqu ymm1,yword [rsi]", + "vmovdqu ymm2,yword [rsi+0x20]", + "vmovdqu ymm3,yword [rsi+0x40]", + "vmovdqu ymm4,yword [rsi+0x60]", + "sub rsi,0xffffffffffffff80", + "vmovdqa yword [rdi],ymm1", + "vmovdqa yword [rdi+0x20],ymm2", + "vmovdqa yword [rdi+0x40],ymm3", + "vmovdqa yword [rdi+0x60],ymm4", + "sub rdi,0xffffffffffffff80", + "cmp rdx,rdi" + ], + "ExpectedArm64ASM": [ + "add x20, x10, x6", + "ldp q21, q2, [x20, #-32]", + "add x20, x10, x6", + "ldp q22, q3, [x20, #-64]", + "mov x5, x11", + "orr x11, x11, #0x1f", + "add x20, x10, x6", + "ldp q23, q4, [x20, #-96]", + "add x20, x10, x6", + "ldp q24, q5, [x20, #-128]", + "sub x10, x10, x5", + "add x11, x11, #0x1 (1)", + "add x10, x10, x11", + "sub x20, x5, #0x80 (128)", + "add x6, x20, x6", + "ldp q17, q6, [x10]", + "ldp q18, q7, [x10, #32]", + "ldp q19, q8, [x10, #64]", + "ldp q20, q9, [x10, #96]", + "add x10, x10, #0x80 (128)", + "stp q17, q6, [x11]", + "stp q18, q7, [x11, #32]", + "stp q19, q8, [x11, #64]", + "stp q20, q9, [x11, #96]", + "add x11, x11, #0x80 (128)", + "eor w27, w6, w11", + "subs x26, x6, x11", + "stp q4, q5, [x28, #128]", + "stp q2, q3, [x28, #96]", + "stp q8, q9, [x28, #64]", + "stp q6, q7, [x28, #32]" + ] + }, + "bytemark strsift": { + "ExpectedInstructionCount": 20, + "x86Insts": [ + 
"mov rsi,rdx", + "and rsi,0xfffffffffffffffc", + "movq xmm0,rcx", + "pshufd xmm0,xmm0,0x44", + "mov rdi,qword [rsp+0x20]", + "lea rdi,[rdi+r13*8]", + "xor r8d,r8d", + "movdqu xmm1,oword [rdi+r8*8-0x10]", + "movdqu xmm2,oword [rdi+r8*8]", + "paddq xmm1,xmm0", + "paddq xmm2,xmm0", + "movdqu oword [rdi+r8*8-0x10],xmm1", + "movdqu oword [rdi+r8*8],xmm2", + "add r8,0x4", + "cmp rsi,r8" + ], + "ExpectedArm64ASM": [ + "mov x10, x6", + "and x10, x10, #0xfffffffffffffffc", + "fmov d16, x5", + "dup v16.2d, v16.d[0]", + "ldr x11, [x8, #32]", + "add x11, x11, x17, lsl #3", + "mov w12, #0x0", + "add x20, x11, x12, lsl #3", + "ldur q17, [x20, #-16]", + "add x20, x11, x12, lsl #3", + "ldr q18, [x20]", + "add v17.2d, v17.2d, v16.2d", + "add v18.2d, v18.2d, v16.2d", + "add x20, x11, x12, lsl #3", + "stur q17, [x20, #-16]", + "add x20, x11, x12, lsl #3", + "str q18, [x20]", + "add x12, x12, #0x4 (4)", + "eor w27, w10, w12", + "subs x26, x10, x12" + ] + }, "pcmpistri xmm0, xmm1, 0_0_00_11_01b": { "ExpectedInstructionCount": 41, "Comment": [ diff --git a/unittests/InstructionCountCI/FlagM/SecondaryGroup.json b/unittests/InstructionCountCI/FlagM/SecondaryGroup.json index 9502cf5b4..27c3e36fb 100644 --- a/unittests/InstructionCountCI/FlagM/SecondaryGroup.json +++ b/unittests/InstructionCountCI/FlagM/SecondaryGroup.json @@ -1216,7 +1216,7 @@ ] }, "fxsave [rax]": { - "ExpectedInstructionCount": 52, + "ExpectedInstructionCount": 39, "Comment": "GROUP15 0x0F 0xAE /0", "ExpectedArm64ASM": [ "ldrh w20, [x28, #1296]", @@ -1235,42 +1235,29 @@ "ldrb w20, [x28, #1298]", "strb w20, [x4, #4]", "ldr q2, [x28, #1040]", - "str q2, [x4, #32]", - "ldr q2, [x28, #1056]", - "str q2, [x4, #48]", + "ldr q3, [x28, #1056]", + "stp q2, q3, [x4, #32]", "ldr q2, [x28, #1072]", - "str q2, [x4, #64]", - "ldr q2, [x28, #1088]", - "str q2, [x4, #80]", + "ldr q3, [x28, #1088]", + "stp q2, q3, [x4, #64]", "ldr q2, [x28, #1104]", - "str q2, [x4, #96]", - "ldr q2, [x28, #1120]", - "str q2, [x4, #112]", + "ldr q3, [x28, 
#1120]", + "stp q2, q3, [x4, #96]", "ldr q2, [x28, #1136]", - "str q2, [x4, #128]", - "ldr q2, [x28, #1152]", - "str q2, [x4, #144]", - "str q16, [x4, #160]", - "str q17, [x4, #176]", - "str q18, [x4, #192]", - "str q19, [x4, #208]", - "str q20, [x4, #224]", - "str q21, [x4, #240]", - "str q22, [x4, #256]", - "str q23, [x4, #272]", - "str q24, [x4, #288]", - "str q25, [x4, #304]", - "str q26, [x4, #320]", - "str q27, [x4, #336]", - "str q28, [x4, #352]", - "str q29, [x4, #368]", - "str q30, [x4, #384]", - "str q31, [x4, #400]", + "ldr q3, [x28, #1152]", + "stp q2, q3, [x4, #128]", + "stp q16, q17, [x4, #160]", + "stp q18, q19, [x4, #192]", + "stp q20, q21, [x4, #224]", + "stp q22, q23, [x4, #256]", + "stp q24, q25, [x4, #288]", + "stp q26, q27, [x4, #320]", + "stp q28, q29, [x4, #352]", + "stp q30, q31, [x4, #384]", "ldr w20, [x28, #940]", "and w20, w20, #0xffc0", - "str w20, [x4, #24]", - "mov w20, #0xffff", - "str w20, [x4, #28]" + "mov w21, #0xffff", + "stp w20, w21, [x4, #24]" ] }, "rdfsbase eax": { @@ -1288,7 +1275,7 @@ ] }, "fxrstor [rax]": { - "ExpectedInstructionCount": 58, + "ExpectedInstructionCount": 46, "Comment": "GROUP15 0x0F 0xAE /1", "ExpectedArm64ASM": [ "ldrh w20, [x4]", @@ -1305,30 +1292,18 @@ "strb w23, [x28, #1018]", "strb w20, [x28, #1022]", "ldrb w20, [x4, #4]", - "ldr q2, [x4, #32]", - "ldr q3, [x4, #48]", - "ldr q4, [x4, #64]", - "ldr q5, [x4, #80]", - "ldr q6, [x4, #96]", - "ldr q7, [x4, #112]", - "ldr q8, [x4, #128]", - "ldr q9, [x4, #144]", - "ldr q16, [x4, #160]", - "ldr q17, [x4, #176]", - "ldr q18, [x4, #192]", - "ldr q19, [x4, #208]", - "ldr q20, [x4, #224]", - "ldr q21, [x4, #240]", - "ldr q22, [x4, #256]", - "ldr q23, [x4, #272]", - "ldr q24, [x4, #288]", - "ldr q25, [x4, #304]", - "ldr q26, [x4, #320]", - "ldr q27, [x4, #336]", - "ldr q28, [x4, #352]", - "ldr q29, [x4, #368]", - "ldr q30, [x4, #384]", - "ldr q31, [x4, #400]", + "ldp q2, q3, [x4, #32]", + "ldp q4, q5, [x4, #64]", + "ldp q6, q7, [x4, #96]", + "ldp q8, q9, [x4, 
#128]", + "ldp q16, q17, [x4, #160]", + "ldp q18, q19, [x4, #192]", + "ldp q20, q21, [x4, #224]", + "ldp q22, q23, [x4, #256]", + "ldp q24, q25, [x4, #288]", + "ldp q26, q27, [x4, #320]", + "ldp q28, q29, [x4, #352]", + "ldp q30, q31, [x4, #384]", "ldr w21, [x4, #24]", "and w21, w21, #0xffc0", "str w21, [x28, #940]", @@ -1422,12 +1397,12 @@ ] }, "xsave [rax]": { - "ExpectedInstructionCount": 98, + "ExpectedInstructionCount": 69, "Comment": "GROUP15 0x0F 0xAE /4", "ExpectedArm64ASM": [ "ubfx x20, x4, #0, #1", "cbnz x20, #+0x8", - "b #+0x80", + "b #+0x70", "ldrh w20, [x28, #1296]", "strh w20, [x4]", "ldrb w20, [x28, #1019]", @@ -1444,83 +1419,54 @@ "ldrb w20, [x28, #1298]", "strb w20, [x4, #4]", "ldr q2, [x28, #1040]", - "str q2, [x4, #32]", - "ldr q2, [x28, #1056]", - "str q2, [x4, #48]", + "ldr q3, [x28, #1056]", + "stp q2, q3, [x4, #32]", "ldr q2, [x28, #1072]", - "str q2, [x4, #64]", - "ldr q2, [x28, #1088]", - "str q2, [x4, #80]", + "ldr q3, [x28, #1088]", + "stp q2, q3, [x4, #64]", "ldr q2, [x28, #1104]", - "str q2, [x4, #96]", - "ldr q2, [x28, #1120]", - "str q2, [x4, #112]", + "ldr q3, [x28, #1120]", + "stp q2, q3, [x4, #96]", "ldr q2, [x28, #1136]", - "str q2, [x4, #128]", - "ldr q2, [x28, #1152]", - "str q2, [x4, #144]", + "ldr q3, [x28, #1152]", + "stp q2, q3, [x4, #128]", "ubfx x20, x4, #1, #1", "cbnz x20, #+0x8", - "b #+0x44", - "str q16, [x4, #160]", - "str q17, [x4, #176]", - "str q18, [x4, #192]", - "str q19, [x4, #208]", - "str q20, [x4, #224]", - "str q21, [x4, #240]", - "str q22, [x4, #256]", - "str q23, [x4, #272]", - "str q24, [x4, #288]", - "str q25, [x4, #304]", - "str q26, [x4, #320]", - "str q27, [x4, #336]", - "str q28, [x4, #352]", - "str q29, [x4, #368]", - "str q30, [x4, #384]", - "str q31, [x4, #400]", + "b #+0x24", + "stp q16, q17, [x4, #160]", + "stp q18, q19, [x4, #192]", + "stp q20, q21, [x4, #224]", + "stp q22, q23, [x4, #256]", + "stp q24, q25, [x4, #288]", + "stp q26, q27, [x4, #320]", + "stp q28, q29, [x4, #352]", + "stp q30, 
q31, [x4, #384]", "ubfx x20, x4, #2, #1", "cbnz x20, #+0x8", - "b #+0x84", - "ldr q2, [x28, #16]", - "str q2, [x4, #576]", - "ldr q2, [x28, #32]", - "str q2, [x4, #592]", - "ldr q2, [x28, #48]", - "str q2, [x4, #608]", - "ldr q2, [x28, #64]", - "str q2, [x4, #624]", - "ldr q2, [x28, #80]", - "str q2, [x4, #640]", - "ldr q2, [x28, #96]", - "str q2, [x4, #656]", - "ldr q2, [x28, #112]", - "str q2, [x4, #672]", - "ldr q2, [x28, #128]", - "str q2, [x4, #688]", - "ldr q2, [x28, #144]", - "str q2, [x4, #704]", - "ldr q2, [x28, #160]", - "str q2, [x4, #720]", - "ldr q2, [x28, #176]", - "str q2, [x4, #736]", - "ldr q2, [x28, #192]", - "str q2, [x4, #752]", - "ldr q2, [x28, #208]", - "str q2, [x4, #768]", - "ldr q2, [x28, #224]", - "str q2, [x4, #784]", - "ldr q2, [x28, #240]", - "str q2, [x4, #800]", - "ldr q2, [x28, #256]", - "str q2, [x4, #816]", + "b #+0x44", + "ldp q2, q3, [x28, #16]", + "stp q2, q3, [x4, #576]", + "ldp q2, q3, [x28, #48]", + "stp q2, q3, [x4, #608]", + "ldp q2, q3, [x28, #80]", + "stp q2, q3, [x4, #640]", + "ldp q2, q3, [x28, #112]", + "stp q2, q3, [x4, #672]", + "ldp q2, q3, [x28, #144]", + "stp q2, q3, [x4, #704]", + "ldp q2, q3, [x28, #176]", + "stp q2, q3, [x4, #736]", + "ldp q2, q3, [x28, #208]", + "stp q2, q3, [x4, #768]", + "ldp q2, q3, [x28, #240]", + "stp q2, q3, [x4, #800]", "ubfx x20, x4, #1, #2", "cbnz x20, #+0x8", - "b #+0x18", + "b #+0x14", "ldr w20, [x28, #940]", "and w20, w20, #0xffc0", - "str w20, [x4, #24]", - "mov w20, #0xffff", - "str w20, [x4, #28]", + "mov w21, #0xffff", + "stp w20, w21, [x4, #24]", "ubfx x20, x4, #0, #3", "str x20, [x4, #512]" ] @@ -1533,14 +1479,14 @@ ] }, "xrstor [rax]": { - "ExpectedInstructionCount": 166, + "ExpectedInstructionCount": 130, "Comment": "GROUP15 0x0F 0xAE /5", "ExpectedArm64ASM": [ "sub sp, sp, #0x40 (64)", "ldr x20, [x4, #512]", "ubfx x20, x20, #0, #1", "cbnz x20, #+0x8", - "b #+0x84", + "b #+0x74", "ldrh w20, [x4]", "strh w20, [x28, #1296]", "ldrh w20, [x4, #2]", @@ -1555,14 +1501,10 @@ "strb 
w23, [x28, #1018]", "strb w20, [x28, #1022]", "ldrb w20, [x4, #4]", - "ldr q2, [x4, #32]", - "ldr q3, [x4, #48]", - "ldr q4, [x4, #64]", - "ldr q5, [x4, #80]", - "ldr q6, [x4, #96]", - "ldr q7, [x4, #112]", - "ldr q8, [x4, #128]", - "ldr q9, [x4, #144]", + "ldp q2, q3, [x4, #32]", + "ldp q4, q5, [x4, #64]", + "ldp q6, q7, [x4, #96]", + "ldp q8, q9, [x4, #128]", "strb w20, [x28, #1298]", "str q9, [x28, #1152]", "str q8, [x28, #1136]", @@ -1593,23 +1535,15 @@ "ldr x20, [x4, #512]", "ubfx x20, x20, #1, #1", "cbnz x20, #+0x8", - "b #+0x48", - "ldr q16, [x4, #160]", - "ldr q17, [x4, #176]", - "ldr q18, [x4, #192]", - "ldr q19, [x4, #208]", - "ldr q20, [x4, #224]", - "ldr q21, [x4, #240]", - "ldr q22, [x4, #256]", - "ldr q23, [x4, #272]", - "ldr q24, [x4, #288]", - "ldr q25, [x4, #304]", - "ldr q26, [x4, #320]", - "ldr q27, [x4, #336]", - "ldr q28, [x4, #352]", - "ldr q29, [x4, #368]", - "ldr q30, [x4, #384]", - "ldr q31, [x4, #400]", + "b #+0x28", + "ldp q16, q17, [x4, #160]", + "ldp q18, q19, [x4, #192]", + "ldp q20, q21, [x4, #224]", + "ldp q22, q23, [x4, #256]", + "ldp q24, q25, [x4, #288]", + "ldp q26, q27, [x4, #320]", + "ldp q28, q29, [x4, #352]", + "ldp q30, q31, [x4, #384]", "b #+0x44", "movi v31.2d, #0x0", "mov v30.16b, v31.16b", @@ -1630,61 +1564,37 @@ "ldr x20, [x4, #512]", "ubfx x20, x20, #2, #1", "cbnz x20, #+0x8", - "b #+0x98", - "ldr q2, [x4, #576]", - "ldr q3, [x4, #592]", - "ldr q4, [x4, #608]", - "ldr q5, [x4, #624]", - "ldr q6, [x4, #640]", - "ldr q7, [x4, #656]", - "ldr q8, [x4, #672]", - "ldr q9, [x4, #688]", - "ldr q10, [x4, #704]", - "ldr q11, [x4, #720]", - "ldr q12, [x4, #736]", - "ldr q13, [x4, #752]", - "ldr q14, [x4, #768]", - "ldr q15, [x4, #784]", + "b #+0x58", + "ldp q2, q3, [x4, #576]", + "ldp q4, q5, [x4, #608]", + "ldp q6, q7, [x4, #640]", + "ldp q8, q9, [x4, #672]", + "ldp q10, q11, [x4, #704]", + "ldp q12, q13, [x4, #736]", + "ldp q14, q15, [x4, #768]", "str q2, [sp]", - "ldr q2, [x4, #800]", "str q3, [sp, #32]", - "ldr q3, [x4, 
#816]", - "str q3, [x28, #256]", - "str q2, [x28, #240]", - "str q15, [x28, #224]", - "str q14, [x28, #208]", - "str q13, [x28, #192]", - "str q12, [x28, #176]", - "str q11, [x28, #160]", - "str q10, [x28, #144]", - "str q9, [x28, #128]", - "str q8, [x28, #112]", - "str q7, [x28, #96]", - "str q6, [x28, #80]", - "str q5, [x28, #64]", - "str q4, [x28, #48]", - "ldr q2, [sp, #32]", - "str q2, [x28, #32]", + "ldp q2, q3, [x4, #800]", + "stp q2, q3, [x28, #240]", + "stp q14, q15, [x28, #208]", + "stp q12, q13, [x28, #176]", + "stp q10, q11, [x28, #144]", + "stp q8, q9, [x28, #112]", + "stp q6, q7, [x28, #80]", + "stp q4, q5, [x28, #48]", "ldr q2, [sp]", - "str q2, [x28, #16]", - "b #+0x48", + "ldr q3, [sp, #32]", + "stp q2, q3, [x28, #16]", + "b #+0x28", "movi v2.2d, #0x0", - "str q2, [x28, #256]", - "str q2, [x28, #240]", - "str q2, [x28, #224]", - "str q2, [x28, #208]", - "str q2, [x28, #192]", - "str q2, [x28, #176]", - "str q2, [x28, #160]", - "str q2, [x28, #144]", - "str q2, [x28, #128]", - "str q2, [x28, #112]", - "str q2, [x28, #96]", - "str q2, [x28, #80]", - "str q2, [x28, #64]", - "str q2, [x28, #48]", - "str q2, [x28, #32]", - "str q2, [x28, #16]", + "stp q2, q2, [x28, #240]", + "stp q2, q2, [x28, #208]", + "stp q2, q2, [x28, #176]", + "stp q2, q2, [x28, #144]", + "stp q2, q2, [x28, #112]", + "stp q2, q2, [x28, #80]", + "stp q2, q2, [x28, #48]", + "stp q2, q2, [x28, #16]", "ldr x20, [x4, #512]", "ubfx x20, x20, #1, #2", "cbnz x20, #+0x8", diff --git a/unittests/InstructionCountCI/SecondaryGroup.json b/unittests/InstructionCountCI/SecondaryGroup.json index 12ccf3012..edd304915 100644 --- a/unittests/InstructionCountCI/SecondaryGroup.json +++ b/unittests/InstructionCountCI/SecondaryGroup.json @@ -1406,7 +1406,7 @@ ] }, "fxsave [rax]": { - "ExpectedInstructionCount": 52, + "ExpectedInstructionCount": 39, "Comment": "GROUP15 0x0F 0xAE /0", "ExpectedArm64ASM": [ "ldrh w20, [x28, #1296]", @@ -1425,42 +1425,29 @@ "ldrb w20, [x28, #1298]", "strb w20, [x4, #4]", 
"ldr q2, [x28, #1040]", - "str q2, [x4, #32]", - "ldr q2, [x28, #1056]", - "str q2, [x4, #48]", + "ldr q3, [x28, #1056]", + "stp q2, q3, [x4, #32]", "ldr q2, [x28, #1072]", - "str q2, [x4, #64]", - "ldr q2, [x28, #1088]", - "str q2, [x4, #80]", + "ldr q3, [x28, #1088]", + "stp q2, q3, [x4, #64]", "ldr q2, [x28, #1104]", - "str q2, [x4, #96]", - "ldr q2, [x28, #1120]", - "str q2, [x4, #112]", + "ldr q3, [x28, #1120]", + "stp q2, q3, [x4, #96]", "ldr q2, [x28, #1136]", - "str q2, [x4, #128]", - "ldr q2, [x28, #1152]", - "str q2, [x4, #144]", - "str q16, [x4, #160]", - "str q17, [x4, #176]", - "str q18, [x4, #192]", - "str q19, [x4, #208]", - "str q20, [x4, #224]", - "str q21, [x4, #240]", - "str q22, [x4, #256]", - "str q23, [x4, #272]", - "str q24, [x4, #288]", - "str q25, [x4, #304]", - "str q26, [x4, #320]", - "str q27, [x4, #336]", - "str q28, [x4, #352]", - "str q29, [x4, #368]", - "str q30, [x4, #384]", - "str q31, [x4, #400]", + "ldr q3, [x28, #1152]", + "stp q2, q3, [x4, #128]", + "stp q16, q17, [x4, #160]", + "stp q18, q19, [x4, #192]", + "stp q20, q21, [x4, #224]", + "stp q22, q23, [x4, #256]", + "stp q24, q25, [x4, #288]", + "stp q26, q27, [x4, #320]", + "stp q28, q29, [x4, #352]", + "stp q30, q31, [x4, #384]", "ldr w20, [x28, #940]", "and w20, w20, #0xffc0", - "str w20, [x4, #24]", - "mov w20, #0xffff", - "str w20, [x4, #28]" + "mov w21, #0xffff", + "stp w20, w21, [x4, #24]" ] }, "rdfsbase eax": { @@ -1478,7 +1465,7 @@ ] }, "fxrstor [rax]": { - "ExpectedInstructionCount": 58, + "ExpectedInstructionCount": 46, "Comment": "GROUP15 0x0F 0xAE /1", "ExpectedArm64ASM": [ "ldrh w20, [x4]", @@ -1495,30 +1482,18 @@ "strb w23, [x28, #1018]", "strb w20, [x28, #1022]", "ldrb w20, [x4, #4]", - "ldr q2, [x4, #32]", - "ldr q3, [x4, #48]", - "ldr q4, [x4, #64]", - "ldr q5, [x4, #80]", - "ldr q6, [x4, #96]", - "ldr q7, [x4, #112]", - "ldr q8, [x4, #128]", - "ldr q9, [x4, #144]", - "ldr q16, [x4, #160]", - "ldr q17, [x4, #176]", - "ldr q18, [x4, #192]", - "ldr q19, [x4, 
#208]", - "ldr q20, [x4, #224]", - "ldr q21, [x4, #240]", - "ldr q22, [x4, #256]", - "ldr q23, [x4, #272]", - "ldr q24, [x4, #288]", - "ldr q25, [x4, #304]", - "ldr q26, [x4, #320]", - "ldr q27, [x4, #336]", - "ldr q28, [x4, #352]", - "ldr q29, [x4, #368]", - "ldr q30, [x4, #384]", - "ldr q31, [x4, #400]", + "ldp q2, q3, [x4, #32]", + "ldp q4, q5, [x4, #64]", + "ldp q6, q7, [x4, #96]", + "ldp q8, q9, [x4, #128]", + "ldp q16, q17, [x4, #160]", + "ldp q18, q19, [x4, #192]", + "ldp q20, q21, [x4, #224]", + "ldp q22, q23, [x4, #256]", + "ldp q24, q25, [x4, #288]", + "ldp q26, q27, [x4, #320]", + "ldp q28, q29, [x4, #352]", + "ldp q30, q31, [x4, #384]", "ldr w21, [x4, #24]", "and w21, w21, #0xffc0", "str w21, [x28, #940]", @@ -1612,12 +1587,12 @@ ] }, "xsave [rax]": { - "ExpectedInstructionCount": 98, + "ExpectedInstructionCount": 69, "Comment": "GROUP15 0x0F 0xAE /4", "ExpectedArm64ASM": [ "ubfx x20, x4, #0, #1", "cbnz x20, #+0x8", - "b #+0x80", + "b #+0x70", "ldrh w20, [x28, #1296]", "strh w20, [x4]", "ldrb w20, [x28, #1019]", @@ -1634,83 +1609,54 @@ "ldrb w20, [x28, #1298]", "strb w20, [x4, #4]", "ldr q2, [x28, #1040]", - "str q2, [x4, #32]", - "ldr q2, [x28, #1056]", - "str q2, [x4, #48]", + "ldr q3, [x28, #1056]", + "stp q2, q3, [x4, #32]", "ldr q2, [x28, #1072]", - "str q2, [x4, #64]", - "ldr q2, [x28, #1088]", - "str q2, [x4, #80]", + "ldr q3, [x28, #1088]", + "stp q2, q3, [x4, #64]", "ldr q2, [x28, #1104]", - "str q2, [x4, #96]", - "ldr q2, [x28, #1120]", - "str q2, [x4, #112]", + "ldr q3, [x28, #1120]", + "stp q2, q3, [x4, #96]", "ldr q2, [x28, #1136]", - "str q2, [x4, #128]", - "ldr q2, [x28, #1152]", - "str q2, [x4, #144]", + "ldr q3, [x28, #1152]", + "stp q2, q3, [x4, #128]", "ubfx x20, x4, #1, #1", "cbnz x20, #+0x8", - "b #+0x44", - "str q16, [x4, #160]", - "str q17, [x4, #176]", - "str q18, [x4, #192]", - "str q19, [x4, #208]", - "str q20, [x4, #224]", - "str q21, [x4, #240]", - "str q22, [x4, #256]", - "str q23, [x4, #272]", - "str q24, [x4, #288]", - 
"str q25, [x4, #304]", - "str q26, [x4, #320]", - "str q27, [x4, #336]", - "str q28, [x4, #352]", - "str q29, [x4, #368]", - "str q30, [x4, #384]", - "str q31, [x4, #400]", + "b #+0x24", + "stp q16, q17, [x4, #160]", + "stp q18, q19, [x4, #192]", + "stp q20, q21, [x4, #224]", + "stp q22, q23, [x4, #256]", + "stp q24, q25, [x4, #288]", + "stp q26, q27, [x4, #320]", + "stp q28, q29, [x4, #352]", + "stp q30, q31, [x4, #384]", "ubfx x20, x4, #2, #1", "cbnz x20, #+0x8", - "b #+0x84", - "ldr q2, [x28, #16]", - "str q2, [x4, #576]", - "ldr q2, [x28, #32]", - "str q2, [x4, #592]", - "ldr q2, [x28, #48]", - "str q2, [x4, #608]", - "ldr q2, [x28, #64]", - "str q2, [x4, #624]", - "ldr q2, [x28, #80]", - "str q2, [x4, #640]", - "ldr q2, [x28, #96]", - "str q2, [x4, #656]", - "ldr q2, [x28, #112]", - "str q2, [x4, #672]", - "ldr q2, [x28, #128]", - "str q2, [x4, #688]", - "ldr q2, [x28, #144]", - "str q2, [x4, #704]", - "ldr q2, [x28, #160]", - "str q2, [x4, #720]", - "ldr q2, [x28, #176]", - "str q2, [x4, #736]", - "ldr q2, [x28, #192]", - "str q2, [x4, #752]", - "ldr q2, [x28, #208]", - "str q2, [x4, #768]", - "ldr q2, [x28, #224]", - "str q2, [x4, #784]", - "ldr q2, [x28, #240]", - "str q2, [x4, #800]", - "ldr q2, [x28, #256]", - "str q2, [x4, #816]", + "b #+0x44", + "ldp q2, q3, [x28, #16]", + "stp q2, q3, [x4, #576]", + "ldp q2, q3, [x28, #48]", + "stp q2, q3, [x4, #608]", + "ldp q2, q3, [x28, #80]", + "stp q2, q3, [x4, #640]", + "ldp q2, q3, [x28, #112]", + "stp q2, q3, [x4, #672]", + "ldp q2, q3, [x28, #144]", + "stp q2, q3, [x4, #704]", + "ldp q2, q3, [x28, #176]", + "stp q2, q3, [x4, #736]", + "ldp q2, q3, [x28, #208]", + "stp q2, q3, [x4, #768]", + "ldp q2, q3, [x28, #240]", + "stp q2, q3, [x4, #800]", "ubfx x20, x4, #1, #2", "cbnz x20, #+0x8", - "b #+0x18", + "b #+0x14", "ldr w20, [x28, #940]", "and w20, w20, #0xffc0", - "str w20, [x4, #24]", - "mov w20, #0xffff", - "str w20, [x4, #28]", + "mov w21, #0xffff", + "stp w20, w21, [x4, #24]", "ubfx x20, x4, #0, #3", "str 
x20, [x4, #512]" ] @@ -1723,14 +1669,14 @@ ] }, "xrstor [rax]": { - "ExpectedInstructionCount": 166, + "ExpectedInstructionCount": 130, "Comment": "GROUP15 0x0F 0xAE /5", "ExpectedArm64ASM": [ "sub sp, sp, #0x40 (64)", "ldr x20, [x4, #512]", "ubfx x20, x20, #0, #1", "cbnz x20, #+0x8", - "b #+0x84", + "b #+0x74", "ldrh w20, [x4]", "strh w20, [x28, #1296]", "ldrh w20, [x4, #2]", @@ -1745,14 +1691,10 @@ "strb w23, [x28, #1018]", "strb w20, [x28, #1022]", "ldrb w20, [x4, #4]", - "ldr q2, [x4, #32]", - "ldr q3, [x4, #48]", - "ldr q4, [x4, #64]", - "ldr q5, [x4, #80]", - "ldr q6, [x4, #96]", - "ldr q7, [x4, #112]", - "ldr q8, [x4, #128]", - "ldr q9, [x4, #144]", + "ldp q2, q3, [x4, #32]", + "ldp q4, q5, [x4, #64]", + "ldp q6, q7, [x4, #96]", + "ldp q8, q9, [x4, #128]", "strb w20, [x28, #1298]", "str q9, [x28, #1152]", "str q8, [x28, #1136]", @@ -1783,23 +1725,15 @@ "ldr x20, [x4, #512]", "ubfx x20, x20, #1, #1", "cbnz x20, #+0x8", - "b #+0x48", - "ldr q16, [x4, #160]", - "ldr q17, [x4, #176]", - "ldr q18, [x4, #192]", - "ldr q19, [x4, #208]", - "ldr q20, [x4, #224]", - "ldr q21, [x4, #240]", - "ldr q22, [x4, #256]", - "ldr q23, [x4, #272]", - "ldr q24, [x4, #288]", - "ldr q25, [x4, #304]", - "ldr q26, [x4, #320]", - "ldr q27, [x4, #336]", - "ldr q28, [x4, #352]", - "ldr q29, [x4, #368]", - "ldr q30, [x4, #384]", - "ldr q31, [x4, #400]", + "b #+0x28", + "ldp q16, q17, [x4, #160]", + "ldp q18, q19, [x4, #192]", + "ldp q20, q21, [x4, #224]", + "ldp q22, q23, [x4, #256]", + "ldp q24, q25, [x4, #288]", + "ldp q26, q27, [x4, #320]", + "ldp q28, q29, [x4, #352]", + "ldp q30, q31, [x4, #384]", "b #+0x44", "movi v31.2d, #0x0", "mov v30.16b, v31.16b", @@ -1820,61 +1754,37 @@ "ldr x20, [x4, #512]", "ubfx x20, x20, #2, #1", "cbnz x20, #+0x8", - "b #+0x98", - "ldr q2, [x4, #576]", - "ldr q3, [x4, #592]", - "ldr q4, [x4, #608]", - "ldr q5, [x4, #624]", - "ldr q6, [x4, #640]", - "ldr q7, [x4, #656]", - "ldr q8, [x4, #672]", - "ldr q9, [x4, #688]", - "ldr q10, [x4, #704]", - "ldr q11, 
[x4, #720]", - "ldr q12, [x4, #736]", - "ldr q13, [x4, #752]", - "ldr q14, [x4, #768]", - "ldr q15, [x4, #784]", + "b #+0x58", + "ldp q2, q3, [x4, #576]", + "ldp q4, q5, [x4, #608]", + "ldp q6, q7, [x4, #640]", + "ldp q8, q9, [x4, #672]", + "ldp q10, q11, [x4, #704]", + "ldp q12, q13, [x4, #736]", + "ldp q14, q15, [x4, #768]", "str q2, [sp]", - "ldr q2, [x4, #800]", "str q3, [sp, #32]", - "ldr q3, [x4, #816]", - "str q3, [x28, #256]", - "str q2, [x28, #240]", - "str q15, [x28, #224]", - "str q14, [x28, #208]", - "str q13, [x28, #192]", - "str q12, [x28, #176]", - "str q11, [x28, #160]", - "str q10, [x28, #144]", - "str q9, [x28, #128]", - "str q8, [x28, #112]", - "str q7, [x28, #96]", - "str q6, [x28, #80]", - "str q5, [x28, #64]", - "str q4, [x28, #48]", - "ldr q2, [sp, #32]", - "str q2, [x28, #32]", + "ldp q2, q3, [x4, #800]", + "stp q2, q3, [x28, #240]", + "stp q14, q15, [x28, #208]", + "stp q12, q13, [x28, #176]", + "stp q10, q11, [x28, #144]", + "stp q8, q9, [x28, #112]", + "stp q6, q7, [x28, #80]", + "stp q4, q5, [x28, #48]", "ldr q2, [sp]", - "str q2, [x28, #16]", - "b #+0x48", + "ldr q3, [sp, #32]", + "stp q2, q3, [x28, #16]", + "b #+0x28", "movi v2.2d, #0x0", - "str q2, [x28, #256]", - "str q2, [x28, #240]", - "str q2, [x28, #224]", - "str q2, [x28, #208]", - "str q2, [x28, #192]", - "str q2, [x28, #176]", - "str q2, [x28, #160]", - "str q2, [x28, #144]", - "str q2, [x28, #128]", - "str q2, [x28, #112]", - "str q2, [x28, #96]", - "str q2, [x28, #80]", - "str q2, [x28, #64]", - "str q2, [x28, #48]", - "str q2, [x28, #32]", - "str q2, [x28, #16]", + "stp q2, q2, [x28, #240]", + "stp q2, q2, [x28, #208]", + "stp q2, q2, [x28, #176]", + "stp q2, q2, [x28, #144]", + "stp q2, q2, [x28, #112]", + "stp q2, q2, [x28, #80]", + "stp q2, q2, [x28, #48]", + "stp q2, q2, [x28, #16]", "ldr x20, [x4, #512]", "ubfx x20, x20, #1, #2", "cbnz x20, #+0x8",