Merge pull request #3528 from alyssarosenzweig/ra/xsave-xrstor

Eliminate crossblock liveness in xsave/xrstor
This commit is contained in:
Alyssa Rosenzweig 2024-03-30 14:11:25 -04:00 committed by GitHub
commit d25ace43aa
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 249 additions and 254 deletions

View File

@ -809,6 +809,7 @@ public:
void FXSaveOp(OpcodeArgs);
void FXRStoreOp(OpcodeArgs);
// Computes the base address of the XSAVE area from Op's destination operand
// (with the segment offset appended). Shared by the XSAVE/XRSTOR implementations.
OrderedNode *XSaveBase(X86Tables::DecodedOp Op);
void XSaveOp(OpcodeArgs);
void PAlignrOp(OpcodeArgs);

View File

@ -3001,16 +3001,15 @@ void OpDispatchBuilder::XSaveOp(OpcodeArgs) {
XSaveOpImpl(Op);
}
void OpDispatchBuilder::XSaveOpImpl(OpcodeArgs) {
const auto XSaveBase = [this, Op] {
OrderedNode *Mem = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false});
return AppendSegmentOffset(Mem, Op->Flags);
};
// Computes the base address of the XSAVE area named by the instruction's
// destination operand. Callers invoke this again wherever the address is
// needed instead of sharing one node between them.
OrderedNode *OpDispatchBuilder::XSaveBase(X86Tables::DecodedOp Op) {
  // LoadData = false: presumably yields the operand's address rather than the
  // value stored there -- confirm against LoadSource.
  OrderedNode *DestAddress = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false});
  // Append the segment offset according to Op->Flags.
  return AppendSegmentOffset(DestAddress, Op->Flags);
}
void OpDispatchBuilder::XSaveOpImpl(OpcodeArgs) {
// NOTE: Mask should be EAX and EDX concatenated, but we only need to test
// for features that are in the lower 32 bits, so EAX only is sufficient.
OrderedNode *Mask = LoadGPRRegister(X86State::REG_RAX);
OrderedNode *Base = XSaveBase();
const auto OpSize = IR::SizeToOpSize(CTX->GetGPRSize());
const auto StoreIfFlagSet = [&](uint32_t BitIndex, auto fn, uint32_t FieldSize = 1){
@ -3034,25 +3033,26 @@ void OpDispatchBuilder::XSaveOpImpl(OpcodeArgs) {
// x87
{
StoreIfFlagSet(0, [this, Op, Base] { SaveX87State(Op, Base); });
StoreIfFlagSet(0, [this, Op] { SaveX87State(Op, XSaveBase(Op)); });
}
// SSE
{
StoreIfFlagSet(1, [this, Base] { SaveSSEState(Base); });
StoreIfFlagSet(1, [this, Op] { SaveSSEState(XSaveBase(Op)); });
}
// AVX
if (CTX->HostFeatures.SupportsAVX)
{
StoreIfFlagSet(2, [this, Base] { SaveAVXState(Base); });
StoreIfFlagSet(2, [this, Op] { SaveAVXState(XSaveBase(Op)); });
}
// We need to save MXCSR and MXCSR_MASK if either SSE or AVX are requested to be saved
{
StoreIfFlagSet(1, [this, Base] { SaveMXCSRState(Base); }, 2);
StoreIfFlagSet(1, [this, Op] { SaveMXCSRState(XSaveBase(Op)); }, 2);
}
// Update XSTATE_BV region of the XSAVE header
{
OrderedNode *Base = XSaveBase(Op);
OrderedNode *HeaderOffset = _Add(OpSize, Base, _Constant(512));
// NOTE: We currently only support the first 3 bits (x87, SSE, and AVX)
@ -3210,14 +3210,11 @@ void OpDispatchBuilder::FXRStoreOp(OpcodeArgs) {
void OpDispatchBuilder::XRstorOpImpl(OpcodeArgs) {
const auto OpSize = IR::SizeToOpSize(CTX->GetGPRSize());
const auto XSaveBase = [this, Op] {
OrderedNode *Mem = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false});
return AppendSegmentOffset(Mem, Op->Flags);
};
// Set up base address for the XSAVE region to restore from, and also read the
// XSTATE_BV bit flags out of the XSTATE header.
OrderedNode *Base = XSaveBase();
//
// Note: we rematerialize Base in each block to avoid crossblock liveness.
OrderedNode *Base = XSaveBase(Op);
OrderedNode *Mask = _LoadMem(GPRClass, 8, _Add(OpSize, Base, _Constant(512)), 8);
// If a bit in our XSTATE_BV is set, then we restore from that region of the XSAVE area,
@ -3253,27 +3250,28 @@ void OpDispatchBuilder::XRstorOpImpl(OpcodeArgs) {
// x87
{
RestoreIfFlagSetOrDefault(0,
[this, Base] { RestoreX87State(Base); },
[this, Op] { RestoreX87State(XSaveBase(Op)); },
[this, Op] { DefaultX87State(Op); });
}
// SSE
{
RestoreIfFlagSetOrDefault(1,
[this, Base] { RestoreSSEState(Base); },
[this, Op] { RestoreSSEState(XSaveBase(Op)); },
[this] { DefaultSSEState(); });
}
// AVX
if (CTX->HostFeatures.SupportsAVX)
{
RestoreIfFlagSetOrDefault(2,
[this, Base] { RestoreAVXState(Base); },
[this, Op] { RestoreAVXState(XSaveBase(Op)); },
[this] { DefaultAVXState(); });
}
{
// We need to restore the MXCSR if either SSE or AVX are requested to be restored
RestoreIfFlagSetOrDefault(1,
[this, Base, OpSize] {
[this, Op, OpSize] {
OrderedNode *Base = XSaveBase(Op);
OrderedNode *MXCSRLocation = _Add(OpSize, Base, _Constant(24));
OrderedNode *MXCSR = _LoadMem(GPRClass, 4, MXCSRLocation, 4);
RestoreMXCSRState(MXCSR);

View File

@ -1407,80 +1407,79 @@
]
},
"xsave [rax]": {
"ExpectedInstructionCount": 71,
"ExpectedInstructionCount": 70,
"Comment": "GROUP15 0x0F 0xAE /4",
"ExpectedArm64ASM": [
"mov x20, x4",
"mov x21, x4",
"ubfx x22, x20, #0, #1",
"cbnz x22, #+0x8",
"ubfx x21, x20, #0, #1",
"cbnz x21, #+0x8",
"b #+0x84",
"ldrh w22, [x28, #1024]",
"strh w22, [x21]",
"mov w22, #0x0",
"ldrb w23, [x28, #747]",
"bfi x22, x23, #11, #3",
"ldrb w23, [x28, #744]",
"ldrb w24, [x28, #745]",
"ldrb w25, [x28, #746]",
"ldrb w30, [x28, #750]",
"orr x22, x22, x23, lsl #8",
"orr x22, x22, x24, lsl #9",
"orr x22, x22, x25, lsl #10",
"orr x22, x22, x30, lsl #14",
"strh w22, [x21, #2]",
"ldrb w22, [x28, #1026]",
"strb w22, [x21, #4]",
"ldrh w21, [x28, #1024]",
"strh w21, [x4]",
"mov w21, #0x0",
"ldrb w22, [x28, #747]",
"bfi x21, x22, #11, #3",
"ldrb w22, [x28, #744]",
"ldrb w23, [x28, #745]",
"ldrb w24, [x28, #746]",
"ldrb w25, [x28, #750]",
"orr x21, x21, x22, lsl #8",
"orr x21, x21, x23, lsl #9",
"orr x21, x21, x24, lsl #10",
"orr x21, x21, x25, lsl #14",
"strh w21, [x4, #2]",
"ldrb w21, [x28, #1026]",
"strb w21, [x4, #4]",
"ldr q2, [x28, #768]",
"str q2, [x21, #32]",
"str q2, [x4, #32]",
"ldr q2, [x28, #784]",
"str q2, [x21, #48]",
"str q2, [x4, #48]",
"ldr q2, [x28, #800]",
"str q2, [x21, #64]",
"str q2, [x4, #64]",
"ldr q2, [x28, #816]",
"str q2, [x21, #80]",
"str q2, [x4, #80]",
"ldr q2, [x28, #832]",
"str q2, [x21, #96]",
"str q2, [x4, #96]",
"ldr q2, [x28, #848]",
"str q2, [x21, #112]",
"str q2, [x4, #112]",
"ldr q2, [x28, #864]",
"str q2, [x21, #128]",
"str q2, [x4, #128]",
"ldr q2, [x28, #880]",
"str q2, [x21, #144]",
"ubfx x22, x20, #1, #1",
"cbnz x22, #+0x8",
"str q2, [x4, #144]",
"ubfx x21, x20, #1, #1",
"cbnz x21, #+0x8",
"b #+0x44",
"str q16, [x21, #160]",
"str q17, [x21, #176]",
"str q18, [x21, #192]",
"str q19, [x21, #208]",
"str q20, [x21, #224]",
"str q21, [x21, #240]",
"str q22, [x21, #256]",
"str q23, [x21, #272]",
"str q24, [x21, #288]",
"str q25, [x21, #304]",
"str q26, [x21, #320]",
"str q27, [x21, #336]",
"str q28, [x21, #352]",
"str q29, [x21, #368]",
"str q30, [x21, #384]",
"str q31, [x21, #400]",
"ubfx x22, x20, #1, #2",
"cbnz x22, #+0x8",
"str q16, [x4, #160]",
"str q17, [x4, #176]",
"str q18, [x4, #192]",
"str q19, [x4, #208]",
"str q20, [x4, #224]",
"str q21, [x4, #240]",
"str q22, [x4, #256]",
"str q23, [x4, #272]",
"str q24, [x4, #288]",
"str q25, [x4, #304]",
"str q26, [x4, #320]",
"str q27, [x4, #336]",
"str q28, [x4, #352]",
"str q29, [x4, #368]",
"str q30, [x4, #384]",
"str q31, [x4, #400]",
"ubfx x21, x20, #1, #2",
"cbnz x21, #+0x8",
"b #+0x2c",
"mov w22, #0x1f80",
"mrs x23, fpcr",
"ubfx x23, x23, #22, #3",
"rbit w0, w23",
"bfi x23, x0, #30, #2",
"bfi w22, w23, #13, #3",
"add x23, x21, #0x18 (24)",
"str w22, [x21, #24]",
"mov w22, #0xffff",
"str w22, [x23, #4]",
"mov w21, #0x1f80",
"mrs x22, fpcr",
"ubfx x22, x22, #22, #3",
"rbit w0, w22",
"bfi x22, x0, #30, #2",
"bfi w21, w22, #13, #3",
"add x22, x4, #0x18 (24)",
"str w21, [x4, #24]",
"mov w21, #0xffff",
"str w21, [x22, #4]",
"ubfx x20, x20, #0, #3",
"str x20, [x21, #512]"
"str x20, [x4, #512]"
]
},
"lfence": {
@ -1491,55 +1490,54 @@
]
},
"xrstor [rax]": {
"ExpectedInstructionCount": 104,
"ExpectedInstructionCount": 103,
"Comment": "GROUP15 0x0F 0xAE /5",
"ExpectedArm64ASM": [
"mov x20, x4",
"ldr x21, [x20, #512]",
"ubfx x22, x21, #0, #1",
"cbnz x22, #+0x8",
"ldr x20, [x4, #512]",
"ubfx x21, x20, #0, #1",
"cbnz x21, #+0x8",
"b #+0x84",
"ldrh w22, [x20]",
"strh w22, [x28, #1024]",
"ldrh w22, [x20, #2]",
"ubfx w23, w22, #11, #3",
"strb w23, [x28, #747]",
"ubfx w23, w22, #8, #1",
"ubfx w24, w22, #9, #1",
"ubfx w25, w22, #10, #1",
"ubfx w22, w22, #14, #1",
"strb w23, [x28, #744]",
"strb w24, [x28, #745]",
"strb w25, [x28, #746]",
"strb w22, [x28, #750]",
"ldrb w22, [x20, #4]",
"strb w22, [x28, #1026]",
"ldr q2, [x20, #32]",
"ldrh w21, [x4]",
"strh w21, [x28, #1024]",
"ldrh w21, [x4, #2]",
"ubfx w22, w21, #11, #3",
"strb w22, [x28, #747]",
"ubfx w22, w21, #8, #1",
"ubfx w23, w21, #9, #1",
"ubfx w24, w21, #10, #1",
"ubfx w21, w21, #14, #1",
"strb w22, [x28, #744]",
"strb w23, [x28, #745]",
"strb w24, [x28, #746]",
"strb w21, [x28, #750]",
"ldrb w21, [x4, #4]",
"strb w21, [x28, #1026]",
"ldr q2, [x4, #32]",
"str q2, [x28, #768]",
"ldr q2, [x20, #48]",
"ldr q2, [x4, #48]",
"str q2, [x28, #784]",
"ldr q2, [x20, #64]",
"ldr q2, [x4, #64]",
"str q2, [x28, #800]",
"ldr q2, [x20, #80]",
"ldr q2, [x4, #80]",
"str q2, [x28, #816]",
"ldr q2, [x20, #96]",
"ldr q2, [x4, #96]",
"str q2, [x28, #832]",
"ldr q2, [x20, #112]",
"ldr q2, [x4, #112]",
"str q2, [x28, #848]",
"ldr q2, [x20, #128]",
"ldr q2, [x4, #128]",
"str q2, [x28, #864]",
"ldr q2, [x20, #144]",
"ldr q2, [x4, #144]",
"str q2, [x28, #880]",
"b #+0x4c",
"mov w22, #0x0",
"mov w23, #0x37f",
"strh w23, [x28, #1024]",
"strb w22, [x28, #747]",
"strb w22, [x28, #744]",
"strb w22, [x28, #745]",
"strb w22, [x28, #746]",
"strb w22, [x28, #750]",
"strb w22, [x28, #1026]",
"mov w21, #0x0",
"mov w22, #0x37f",
"strh w22, [x28, #1024]",
"strb w21, [x28, #747]",
"strb w21, [x28, #744]",
"strb w21, [x28, #745]",
"strb w21, [x28, #746]",
"strb w21, [x28, #750]",
"strb w21, [x28, #1026]",
"movi v2.2d, #0x0",
"str q2, [x28, #768]",
"str q2, [x28, #784]",
@ -1549,25 +1547,25 @@
"str q2, [x28, #848]",
"str q2, [x28, #864]",
"str q2, [x28, #880]",
"ubfx x22, x21, #1, #1",
"cbnz x22, #+0x8",
"ubfx x21, x20, #1, #1",
"cbnz x21, #+0x8",
"b #+0x48",
"ldr q16, [x20, #160]",
"ldr q17, [x20, #176]",
"ldr q18, [x20, #192]",
"ldr q19, [x20, #208]",
"ldr q20, [x20, #224]",
"ldr q21, [x20, #240]",
"ldr q22, [x20, #256]",
"ldr q23, [x20, #272]",
"ldr q24, [x20, #288]",
"ldr q25, [x20, #304]",
"ldr q26, [x20, #320]",
"ldr q27, [x20, #336]",
"ldr q28, [x20, #352]",
"ldr q29, [x20, #368]",
"ldr q30, [x20, #384]",
"ldr q31, [x20, #400]",
"ldr q16, [x4, #160]",
"ldr q17, [x4, #176]",
"ldr q18, [x4, #192]",
"ldr q19, [x4, #208]",
"ldr q20, [x4, #224]",
"ldr q21, [x4, #240]",
"ldr q22, [x4, #256]",
"ldr q23, [x4, #272]",
"ldr q24, [x4, #288]",
"ldr q25, [x4, #304]",
"ldr q26, [x4, #320]",
"ldr q27, [x4, #336]",
"ldr q28, [x4, #352]",
"ldr q29, [x4, #368]",
"ldr q30, [x4, #384]",
"ldr q31, [x4, #400]",
"b #+0x44",
"movi v16.2d, #0x0",
"mov v17.16b, v16.16b",
@ -1585,10 +1583,10 @@
"mov v29.16b, v16.16b",
"mov v30.16b, v16.16b",
"mov v31.16b, v16.16b",
"ubfx x21, x21, #1, #2",
"cbnz x21, #+0x8",
"ubfx x20, x20, #1, #2",
"cbnz x20, #+0x8",
"b #+0x2c",
"ldr w20, [x20, #24]",
"ldr w20, [x4, #24]",
"ubfx w20, w20, #13, #3",
"rbit w1, w20",
"lsr w1, w1, #30",

View File

@ -1587,80 +1587,79 @@
]
},
"xsave [rax]": {
"ExpectedInstructionCount": 71,
"ExpectedInstructionCount": 70,
"Comment": "GROUP15 0x0F 0xAE /4",
"ExpectedArm64ASM": [
"mov x20, x4",
"mov x21, x4",
"ubfx x22, x20, #0, #1",
"cbnz x22, #+0x8",
"ubfx x21, x20, #0, #1",
"cbnz x21, #+0x8",
"b #+0x84",
"ldrh w22, [x28, #1024]",
"strh w22, [x21]",
"mov w22, #0x0",
"ldrb w23, [x28, #747]",
"bfi x22, x23, #11, #3",
"ldrb w23, [x28, #744]",
"ldrb w24, [x28, #745]",
"ldrb w25, [x28, #746]",
"ldrb w30, [x28, #750]",
"orr x22, x22, x23, lsl #8",
"orr x22, x22, x24, lsl #9",
"orr x22, x22, x25, lsl #10",
"orr x22, x22, x30, lsl #14",
"strh w22, [x21, #2]",
"ldrb w22, [x28, #1026]",
"strb w22, [x21, #4]",
"ldrh w21, [x28, #1024]",
"strh w21, [x4]",
"mov w21, #0x0",
"ldrb w22, [x28, #747]",
"bfi x21, x22, #11, #3",
"ldrb w22, [x28, #744]",
"ldrb w23, [x28, #745]",
"ldrb w24, [x28, #746]",
"ldrb w25, [x28, #750]",
"orr x21, x21, x22, lsl #8",
"orr x21, x21, x23, lsl #9",
"orr x21, x21, x24, lsl #10",
"orr x21, x21, x25, lsl #14",
"strh w21, [x4, #2]",
"ldrb w21, [x28, #1026]",
"strb w21, [x4, #4]",
"ldr q2, [x28, #768]",
"str q2, [x21, #32]",
"str q2, [x4, #32]",
"ldr q2, [x28, #784]",
"str q2, [x21, #48]",
"str q2, [x4, #48]",
"ldr q2, [x28, #800]",
"str q2, [x21, #64]",
"str q2, [x4, #64]",
"ldr q2, [x28, #816]",
"str q2, [x21, #80]",
"str q2, [x4, #80]",
"ldr q2, [x28, #832]",
"str q2, [x21, #96]",
"str q2, [x4, #96]",
"ldr q2, [x28, #848]",
"str q2, [x21, #112]",
"str q2, [x4, #112]",
"ldr q2, [x28, #864]",
"str q2, [x21, #128]",
"str q2, [x4, #128]",
"ldr q2, [x28, #880]",
"str q2, [x21, #144]",
"ubfx x22, x20, #1, #1",
"cbnz x22, #+0x8",
"str q2, [x4, #144]",
"ubfx x21, x20, #1, #1",
"cbnz x21, #+0x8",
"b #+0x44",
"str q16, [x21, #160]",
"str q17, [x21, #176]",
"str q18, [x21, #192]",
"str q19, [x21, #208]",
"str q20, [x21, #224]",
"str q21, [x21, #240]",
"str q22, [x21, #256]",
"str q23, [x21, #272]",
"str q24, [x21, #288]",
"str q25, [x21, #304]",
"str q26, [x21, #320]",
"str q27, [x21, #336]",
"str q28, [x21, #352]",
"str q29, [x21, #368]",
"str q30, [x21, #384]",
"str q31, [x21, #400]",
"ubfx x22, x20, #1, #2",
"cbnz x22, #+0x8",
"str q16, [x4, #160]",
"str q17, [x4, #176]",
"str q18, [x4, #192]",
"str q19, [x4, #208]",
"str q20, [x4, #224]",
"str q21, [x4, #240]",
"str q22, [x4, #256]",
"str q23, [x4, #272]",
"str q24, [x4, #288]",
"str q25, [x4, #304]",
"str q26, [x4, #320]",
"str q27, [x4, #336]",
"str q28, [x4, #352]",
"str q29, [x4, #368]",
"str q30, [x4, #384]",
"str q31, [x4, #400]",
"ubfx x21, x20, #1, #2",
"cbnz x21, #+0x8",
"b #+0x2c",
"mov w22, #0x1f80",
"mrs x23, fpcr",
"ubfx x23, x23, #22, #3",
"rbit w0, w23",
"bfi x23, x0, #30, #2",
"bfi w22, w23, #13, #3",
"add x23, x21, #0x18 (24)",
"str w22, [x21, #24]",
"mov w22, #0xffff",
"str w22, [x23, #4]",
"mov w21, #0x1f80",
"mrs x22, fpcr",
"ubfx x22, x22, #22, #3",
"rbit w0, w22",
"bfi x22, x0, #30, #2",
"bfi w21, w22, #13, #3",
"add x22, x4, #0x18 (24)",
"str w21, [x4, #24]",
"mov w21, #0xffff",
"str w21, [x22, #4]",
"ubfx x20, x20, #0, #3",
"str x20, [x21, #512]"
"str x20, [x4, #512]"
]
},
"lfence": {
@ -1671,55 +1670,54 @@
]
},
"xrstor [rax]": {
"ExpectedInstructionCount": 104,
"ExpectedInstructionCount": 103,
"Comment": "GROUP15 0x0F 0xAE /5",
"ExpectedArm64ASM": [
"mov x20, x4",
"ldr x21, [x20, #512]",
"ubfx x22, x21, #0, #1",
"cbnz x22, #+0x8",
"ldr x20, [x4, #512]",
"ubfx x21, x20, #0, #1",
"cbnz x21, #+0x8",
"b #+0x84",
"ldrh w22, [x20]",
"strh w22, [x28, #1024]",
"ldrh w22, [x20, #2]",
"ubfx w23, w22, #11, #3",
"strb w23, [x28, #747]",
"ubfx w23, w22, #8, #1",
"ubfx w24, w22, #9, #1",
"ubfx w25, w22, #10, #1",
"ubfx w22, w22, #14, #1",
"strb w23, [x28, #744]",
"strb w24, [x28, #745]",
"strb w25, [x28, #746]",
"strb w22, [x28, #750]",
"ldrb w22, [x20, #4]",
"strb w22, [x28, #1026]",
"ldr q2, [x20, #32]",
"ldrh w21, [x4]",
"strh w21, [x28, #1024]",
"ldrh w21, [x4, #2]",
"ubfx w22, w21, #11, #3",
"strb w22, [x28, #747]",
"ubfx w22, w21, #8, #1",
"ubfx w23, w21, #9, #1",
"ubfx w24, w21, #10, #1",
"ubfx w21, w21, #14, #1",
"strb w22, [x28, #744]",
"strb w23, [x28, #745]",
"strb w24, [x28, #746]",
"strb w21, [x28, #750]",
"ldrb w21, [x4, #4]",
"strb w21, [x28, #1026]",
"ldr q2, [x4, #32]",
"str q2, [x28, #768]",
"ldr q2, [x20, #48]",
"ldr q2, [x4, #48]",
"str q2, [x28, #784]",
"ldr q2, [x20, #64]",
"ldr q2, [x4, #64]",
"str q2, [x28, #800]",
"ldr q2, [x20, #80]",
"ldr q2, [x4, #80]",
"str q2, [x28, #816]",
"ldr q2, [x20, #96]",
"ldr q2, [x4, #96]",
"str q2, [x28, #832]",
"ldr q2, [x20, #112]",
"ldr q2, [x4, #112]",
"str q2, [x28, #848]",
"ldr q2, [x20, #128]",
"ldr q2, [x4, #128]",
"str q2, [x28, #864]",
"ldr q2, [x20, #144]",
"ldr q2, [x4, #144]",
"str q2, [x28, #880]",
"b #+0x4c",
"mov w22, #0x0",
"mov w23, #0x37f",
"strh w23, [x28, #1024]",
"strb w22, [x28, #747]",
"strb w22, [x28, #744]",
"strb w22, [x28, #745]",
"strb w22, [x28, #746]",
"strb w22, [x28, #750]",
"strb w22, [x28, #1026]",
"mov w21, #0x0",
"mov w22, #0x37f",
"strh w22, [x28, #1024]",
"strb w21, [x28, #747]",
"strb w21, [x28, #744]",
"strb w21, [x28, #745]",
"strb w21, [x28, #746]",
"strb w21, [x28, #750]",
"strb w21, [x28, #1026]",
"movi v2.2d, #0x0",
"str q2, [x28, #768]",
"str q2, [x28, #784]",
@ -1729,25 +1727,25 @@
"str q2, [x28, #848]",
"str q2, [x28, #864]",
"str q2, [x28, #880]",
"ubfx x22, x21, #1, #1",
"cbnz x22, #+0x8",
"ubfx x21, x20, #1, #1",
"cbnz x21, #+0x8",
"b #+0x48",
"ldr q16, [x20, #160]",
"ldr q17, [x20, #176]",
"ldr q18, [x20, #192]",
"ldr q19, [x20, #208]",
"ldr q20, [x20, #224]",
"ldr q21, [x20, #240]",
"ldr q22, [x20, #256]",
"ldr q23, [x20, #272]",
"ldr q24, [x20, #288]",
"ldr q25, [x20, #304]",
"ldr q26, [x20, #320]",
"ldr q27, [x20, #336]",
"ldr q28, [x20, #352]",
"ldr q29, [x20, #368]",
"ldr q30, [x20, #384]",
"ldr q31, [x20, #400]",
"ldr q16, [x4, #160]",
"ldr q17, [x4, #176]",
"ldr q18, [x4, #192]",
"ldr q19, [x4, #208]",
"ldr q20, [x4, #224]",
"ldr q21, [x4, #240]",
"ldr q22, [x4, #256]",
"ldr q23, [x4, #272]",
"ldr q24, [x4, #288]",
"ldr q25, [x4, #304]",
"ldr q26, [x4, #320]",
"ldr q27, [x4, #336]",
"ldr q28, [x4, #352]",
"ldr q29, [x4, #368]",
"ldr q30, [x4, #384]",
"ldr q31, [x4, #400]",
"b #+0x44",
"movi v16.2d, #0x0",
"mov v17.16b, v16.16b",
@ -1765,10 +1763,10 @@
"mov v29.16b, v16.16b",
"mov v30.16b, v16.16b",
"mov v31.16b, v16.16b",
"ubfx x21, x21, #1, #2",
"cbnz x21, #+0x8",
"ubfx x20, x20, #1, #2",
"cbnz x20, #+0x8",
"b #+0x2c",
"ldr w20, [x20, #24]",
"ldr w20, [x4, #24]",
"ubfx w20, w20, #13, #3",
"rbit w1, w20",
"lsr w1, w1, #30",