Merge pull request #3491 from alyssarosenzweig/rclse/waw

RCLSE: Optimize store-after-store
This commit is contained in:
Ryan Houdek 2024-03-14 03:23:05 -07:00 committed by GitHub
commit ca6b2e43e6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
14 changed files with 1934 additions and 2089 deletions

View File

@ -201,7 +201,7 @@ DEF_OP(VSha256U0) {
else {
mov(VTMP1.Q(), Src1.Q());
sha256su0(VTMP1, Src2);
mov(Dst.Q(), Src1.Q());
mov(Dst.Q(), VTMP1.Q());
}
}

View File

@ -547,10 +547,20 @@ bool RCLSE::ClassifyContextLoad(FEXCore::IR::IREmitter *IREmit, ContextInfo *Loc
// Records a context store in the tracking entry returned by FindMemberInfo
// and, when the entry's previous access was a WRITE with the same register
// class, offset and size, removes that earlier store node through IREmit.
//
// Returns true when an earlier store node was removed, false otherwise.
bool RCLSE::ClassifyContextStore(FEXCore::IR::IREmitter *IREmit, ContextInfo *LocalInfo, FEXCore::IR::RegisterClassType Class, uint32_t Offset, uint8_t Size, FEXCore::IR::OrderedNode *CodeNode, FEXCore::IR::OrderedNode *ValueNode) {
  auto Info = FindMemberInfo(LocalInfo, Offset, Size);

  // Snapshot the entry before RecordAccess is called for this store, so the
  // comparison below still sees what was recorded prior to the call.
  ContextMemberInfo Prior = *Info;
  RecordAccess(Info, Class, Offset, Size, LastAccessType::WRITE, ValueNode, CodeNode);

  // The earlier access must have been a write covering exactly the same
  // class/offset/size, i.e. a store with no intervening load.
  bool OverwritesPriorStore =
    Prior.AccessRegClass == Info->AccessRegClass &&
    Prior.AccessOffset == Info->AccessOffset &&
    Prior.AccessSize == Size &&
    Prior.Accessed == LastAccessType::WRITE;

  if (!OverwritesPriorStore) {
    // TODO: Optimize the case of partial stores.
    return false;
  }

  // The previous store is redundant: drop it.
  IREmit->Remove(Prior.StoreNode);
  return true;
}

View File

@ -83,3 +83,6 @@ Test_VEX/vroundpd.asm
Test_VEX/vroundps.asm
Test_VEX/vroundsd.asm
Test_VEX/vroundss.asm
# Simulator doesn't support cycle counter reading
Test_TwoByte/0F_31.asm

View File

@ -14,7 +14,7 @@
],
"Instructions": {
"push ax, bx": {
"ExpectedInstructionCount": 2,
"ExpectedInstructionCount": 4,
"Comment": [
"Mergeable 16-bit pushes. May or may not be an optimization."
],
@ -23,12 +23,14 @@
"push bx"
],
"ExpectedArm64ASM": [
"strh w4, [x8, #-2]!",
"mov x20, x8",
"strh w4, [x20, #-2]!",
"mov x8, x20",
"strh w7, [x8, #-2]!"
]
},
"push rax, rbx": {
"ExpectedInstructionCount": 2,
"ExpectedInstructionCount": 4,
"Comment": [
"Mergeable 64-bit pushes"
],
@ -37,12 +39,14 @@
"push rbx"
],
"ExpectedArm64ASM": [
"str x4, [x8, #-8]!",
"mov x20, x8",
"str x4, [x20, #-8]!",
"mov x8, x20",
"str x7, [x8, #-8]!"
]
},
"adds xmm0, xmm1, xmm2": {
"ExpectedInstructionCount": 4,
"ExpectedInstructionCount": 6,
"Comment": [
"Redundant scalar adds that can get eliminated without AFP."
],
@ -51,9 +55,11 @@
"addss xmm0, xmm2"
],
"ExpectedArm64ASM": [
"mov v2.16b, v16.16b",
"fadd s0, s16, s17",
"mov v16.s[0], v0.s[0]",
"fadd s0, s16, s18",
"mov v2.s[0], v0.s[0]",
"mov v16.16b, v2.16b",
"fadd s0, s2, s18",
"mov v16.s[0], v0.s[0]"
]
},

View File

@ -15,7 +15,7 @@
],
"Instructions": {
"adds xmm0, xmm1, xmm2": {
"ExpectedInstructionCount": 2,
"ExpectedInstructionCount": 4,
"Comment": [
"Redundant scalar operations should get eliminated with AFP"
],
@ -24,8 +24,10 @@
"addss xmm0, xmm2"
],
"ExpectedArm64ASM": [
"fadd s16, s16, s17",
"fadd s16, s16, s18"
"mov v2.16b, v16.16b",
"fadd s2, s16, s17",
"mov v16.16b, v2.16b",
"fadd s16, s2, s18"
]
}
}

File diff suppressed because it is too large Load Diff

View File

@ -86,16 +86,15 @@
]
},
"INC consumed": {
"ExpectedInstructionCount": 6,
"ExpectedInstructionCount": 5,
"x86Insts": [
"add rax, rbx",
"inc rax"
],
"ExpectedArm64ASM": [
"adds x4, x4, x7",
"adds x27, x4, x7",
"cset w20, hs",
"mov x27, x4",
"adds x26, x4, #0x1 (1)",
"adds x26, x27, #0x1 (1)",
"rmif x20, #63, #nzCv",
"mov x4, x26"
]
@ -108,23 +107,22 @@
"test rax, rdx"
],
"ExpectedArm64ASM": [
"add x4, x4, x7",
"add x4, x4, #0x1 (1)",
"add x20, x4, x7",
"add x4, x20, #0x1 (1)",
"ands x26, x4, x6"
]
},
"DEC consumed": {
"ExpectedInstructionCount": 7,
"ExpectedInstructionCount": 6,
"x86Insts": [
"sub rax, rbx",
"dec rax"
],
"ExpectedArm64ASM": [
"subs x4, x4, x7",
"subs x27, x4, x7",
"cfinv",
"cset w20, hs",
"mov x27, x4",
"subs x26, x4, #0x1 (1)",
"subs x26, x27, #0x1 (1)",
"rmif x20, #63, #nzCv",
"mov x4, x26"
]
@ -137,13 +135,13 @@
"test rax, rcx"
],
"ExpectedArm64ASM": [
"sub x4, x4, x7",
"sub x4, x4, #0x1 (1)",
"sub x20, x4, x7",
"sub x4, x20, #0x1 (1)",
"ands x26, x4, x5"
]
},
"8-bit DEC consumed": {
"ExpectedInstructionCount": 12,
"ExpectedInstructionCount": 15,
"x86Insts": [
"sub al, ah",
"dec al"
@ -154,17 +152,20 @@
"cmp w0, w20, lsl #24",
"sub w20, w4, w20",
"cfinv",
"bfxil x4, x20, #0, #8",
"uxtb w27, w4",
"mov x0, x4",
"bfxil x0, x20, #0, #8",
"mov x20, x0",
"uxtb w27, w20",
"sub w26, w27, #0x1 (1)",
"setf8 w26",
"bic w20, w27, w26",
"rmif x20, #7, #nzcV",
"bic w21, w27, w26",
"rmif x21, #7, #nzcV",
"mov x4, x20",
"bfxil x4, x26, #0, #8"
]
},
"8-bit DEC dead": {
"ExpectedInstructionCount": 8,
"ExpectedInstructionCount": 11,
"x86Insts": [
"sub al, ah",
"dec al",
@ -173,10 +174,13 @@
"ExpectedArm64ASM": [
"lsr w20, w4, #8",
"sub w20, w4, w20",
"bfxil x4, x20, #0, #8",
"uxtb w20, w4",
"sub w20, w20, #0x1 (1)",
"bfxil x4, x20, #0, #8",
"mov x0, x4",
"bfxil x0, x20, #0, #8",
"mov x20, x0",
"uxtb w21, w20",
"sub w21, w21, #0x1 (1)",
"mov x4, x20",
"bfxil x4, x21, #0, #8",
"mov x26, x4",
"cmn wzr, w26, lsl #24"
]

View File

@ -13,7 +13,7 @@
},
"Instructions": {
"The Witcher 3": {
"ExpectedInstructionCount": 9,
"ExpectedInstructionCount": 8,
"x86Insts": [
"mov eax, 0x1",
"lock xadd qword [rcx], rax",
@ -24,19 +24,18 @@
"add rdx, rcx"
],
"ExpectedArm64ASM": [
"mov w4, #0x1",
"ldaddal x4, x4, [x5]",
"mov x6, x4",
"and w6, w4, #0x1f",
"add x6, x6, #0x1 (1)",
"lsl x6, x6, #6",
"eor w27, w6, w5",
"adds x26, x6, x5",
"mov w20, #0x1",
"ldaddal x20, x4, [x5]",
"and w20, w4, #0x1f",
"add x20, x20, #0x1 (1)",
"lsl x20, x20, #6",
"eor w27, w20, w5",
"adds x26, x20, x5",
"mov x6, x26"
]
},
"FMOD scalar loop": {
"ExpectedInstructionCount": 88,
"ExpectedInstructionCount": 86,
"x86Insts": [
"mov esi, ecx",
"mov rdx, rbp",
@ -78,92 +77,90 @@
"sub esi, 0x1"
],
"ExpectedArm64ASM": [
"mov w10, w5",
"mov x6, x9",
"mov x4, x7",
"ldr s18, [x6]",
"add x4, x4, #0x20 (32)",
"fmul s0, s18, s16",
"mov v18.s[0], v0.s[0]",
"add x6, x6, #0x20 (32)",
"mov w27, w5",
"ldr s2, [x9]",
"add x4, x7, #0x20 (32)",
"fmul s0, s2, s16",
"mov v2.s[0], v0.s[0]",
"add x6, x9, #0x20 (32)",
"sub x20, x4, #0x20 (32)",
"ldr s2, [x20]",
"fadd s0, s18, s2",
"mov v18.s[0], v0.s[0]",
"ldr s3, [x20]",
"fadd s0, s2, s3",
"mov v2.s[0], v0.s[0]",
"sub x20, x4, #0x20 (32)",
"str s18, [x20]",
"str s2, [x20]",
"sub x20, x6, #0x1c (28)",
"ldr s18, [x20]",
"fmul s0, s18, s17",
"mov v18.s[0], v0.s[0]",
"sub x20, x4, #0x1c (28)",
"ldr s2, [x20]",
"fadd s0, s18, s2",
"mov v18.s[0], v0.s[0]",
"fmul s0, s2, s17",
"mov v2.s[0], v0.s[0]",
"sub x20, x4, #0x1c (28)",
"str s18, [x20]",
"ldr s3, [x20]",
"fadd s0, s2, s3",
"mov v2.s[0], v0.s[0]",
"sub x20, x4, #0x1c (28)",
"str s2, [x20]",
"sub x20, x6, #0x18 (24)",
"ldr s18, [x20]",
"fmul s0, s18, s16",
"mov v18.s[0], v0.s[0]",
"sub x20, x4, #0x18 (24)",
"ldr s2, [x20]",
"fadd s0, s18, s2",
"mov v18.s[0], v0.s[0]",
"fmul s0, s2, s16",
"mov v2.s[0], v0.s[0]",
"sub x20, x4, #0x18 (24)",
"str s18, [x20]",
"ldr s3, [x20]",
"fadd s0, s2, s3",
"mov v2.s[0], v0.s[0]",
"sub x20, x4, #0x18 (24)",
"str s2, [x20]",
"sub x20, x6, #0x14 (20)",
"ldr s18, [x20]",
"fmul s0, s18, s17",
"mov v18.s[0], v0.s[0]",
"sub x20, x4, #0x14 (20)",
"ldr s2, [x20]",
"fadd s0, s18, s2",
"mov v18.s[0], v0.s[0]",
"fmul s0, s2, s17",
"mov v2.s[0], v0.s[0]",
"sub x20, x4, #0x14 (20)",
"str s18, [x20]",
"ldr s3, [x20]",
"fadd s0, s2, s3",
"mov v2.s[0], v0.s[0]",
"sub x20, x4, #0x14 (20)",
"str s2, [x20]",
"sub x20, x6, #0x10 (16)",
"ldr s18, [x20]",
"fmul s0, s18, s16",
"mov v18.s[0], v0.s[0]",
"sub x20, x4, #0x10 (16)",
"ldr s2, [x20]",
"fadd s0, s18, s2",
"mov v18.s[0], v0.s[0]",
"fmul s0, s2, s16",
"mov v2.s[0], v0.s[0]",
"sub x20, x4, #0x10 (16)",
"str s18, [x20]",
"ldr s3, [x20]",
"fadd s0, s2, s3",
"mov v2.s[0], v0.s[0]",
"sub x20, x4, #0x10 (16)",
"str s2, [x20]",
"sub x20, x6, #0xc (12)",
"ldr s18, [x20]",
"fmul s0, s18, s17",
"mov v18.s[0], v0.s[0]",
"sub x20, x4, #0xc (12)",
"ldr s2, [x20]",
"fadd s0, s18, s2",
"mov v18.s[0], v0.s[0]",
"fmul s0, s2, s17",
"mov v2.s[0], v0.s[0]",
"sub x20, x4, #0xc (12)",
"str s18, [x20]",
"ldr s3, [x20]",
"fadd s0, s2, s3",
"mov v2.s[0], v0.s[0]",
"sub x20, x4, #0xc (12)",
"str s2, [x20]",
"sub x20, x6, #0x8 (8)",
"ldr s18, [x20]",
"fmul s0, s18, s16",
"mov v18.s[0], v0.s[0]",
"sub x20, x4, #0x8 (8)",
"ldr s2, [x20]",
"fadd s0, s18, s2",
"mov v18.s[0], v0.s[0]",
"fmul s0, s2, s16",
"mov v2.s[0], v0.s[0]",
"sub x20, x4, #0x8 (8)",
"str s18, [x20]",
"ldr s3, [x20]",
"fadd s0, s2, s3",
"mov v2.s[0], v0.s[0]",
"sub x20, x4, #0x8 (8)",
"str s2, [x20]",
"sub x20, x6, #0x4 (4)",
"ldr s18, [x20]",
"fmul s0, s18, s17",
"mov v18.s[0], v0.s[0]",
"sub x20, x4, #0x4 (4)",
"ldr s2, [x20]",
"fadd s0, s18, s2",
"fmul s0, s2, s17",
"mov v2.s[0], v0.s[0]",
"sub x20, x4, #0x4 (4)",
"ldr s3, [x20]",
"mov v18.16b, v2.16b",
"fadd s0, s2, s3",
"mov v18.s[0], v0.s[0]",
"sub x20, x4, #0x4 (4)",
"str s18, [x20]",
"mov x27, x10",
"subs w26, w10, #0x1 (1)",
"subs w26, w27, #0x1 (1)",
"cfinv",
"mov x10, x26"
]
@ -181,8 +178,8 @@
"cmp rsi, rax"
],
"ExpectedArm64ASM": [
"ldr q16, [x16, x4, sxtx]",
"add v16.2d, v16.2d, v17.2d",
"ldr q2, [x16, x4, sxtx]",
"add v16.2d, v2.2d, v17.2d",
"str q16, [x16, x4, sxtx]",
"add x4, x4, #0x10 (16)",
"eor w27, w10, w4",
@ -191,7 +188,7 @@
]
},
"bytemark data xor loop": {
"ExpectedInstructionCount": 16,
"ExpectedInstructionCount": 13,
"Comment": [
"Saw this in bytemark"
],
@ -208,13 +205,10 @@
],
"ExpectedArm64ASM": [
"mov x20, x4",
"mov x6, x20",
"mov x5, x20",
"mov x19, x10",
"add x4, x20, #0x1 (1)",
"lsr x6, x20, #6",
"and w5, w20, #0x3f",
"lsl x19, x19, x5",
"lsl x19, x10, x5",
"add x20, x7, x6, lsl #3",
"ldr x20, [x20]",
"eor x20, x20, x19",

View File

@ -147,7 +147,7 @@
]
},
"Psychonauts matrix swizzle": {
"ExpectedInstructionCount": 2426,
"ExpectedInstructionCount": 2364,
"Comment": [
"Hottest block in Windows Psychonauts",
"Doing a 4x4 32-bit float matrix swizzle",
@ -343,10 +343,8 @@
"ldrb w23, [x28, #1026]",
"lsl w24, w22, w20",
"bic w23, w23, w24",
"strb w23, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"strb w20, [x28, #747]",
"ldr w5, [x9, w21, sxtw]",
"ldr s2, [x5, #16]",
"mrs x0, nzcv",
@ -418,10 +416,8 @@
"ldrb w23, [x28, #1026]",
"lsl w24, w22, w20",
"bic w23, w23, w24",
"strb w23, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"strb w20, [x28, #747]",
"ldr w6, [x9, w21, sxtw]",
"ldr s2, [x6, #32]",
"mrs x0, nzcv",
@ -493,10 +489,8 @@
"ldrb w23, [x28, #1026]",
"lsl w24, w22, w20",
"bic w23, w23, w24",
"strb w23, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"strb w20, [x28, #747]",
"ldr w4, [x9, w21, sxtw]",
"ldr s2, [x4, #48]",
"mrs x0, nzcv",
@ -568,10 +562,8 @@
"ldrb w23, [x28, #1026]",
"lsl w24, w22, w20",
"bic w23, w23, w24",
"strb w23, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"strb w20, [x28, #747]",
"ldr w5, [x9, w21, sxtw]",
"ldr s2, [x5, #4]",
"mrs x0, nzcv",
@ -643,10 +635,8 @@
"ldrb w21, [x28, #1026]",
"lsl w22, w22, w20",
"bic w21, w21, w22",
"strb w21, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"strb w20, [x28, #747]",
"mov w22, #0xffffffbc",
"ldr w6, [x9, w22, sxtw]",
"ldr s2, [x6, #20]",
@ -720,10 +710,8 @@
"ldrb w21, [x28, #1026]",
"lsl w24, w23, w20",
"bic w21, w21, w24",
"strb w21, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"strb w20, [x28, #747]",
"ldr w4, [x9, w22, sxtw]",
"ldr s2, [x4, #36]",
"mrs x0, nzcv",
@ -795,10 +783,8 @@
"ldrb w21, [x28, #1026]",
"lsl w24, w23, w20",
"bic w21, w21, w24",
"strb w21, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"strb w20, [x28, #747]",
"ldr w5, [x9, w22, sxtw]",
"ldr s2, [x5, #52]",
"mrs x0, nzcv",
@ -870,10 +856,8 @@
"ldrb w21, [x28, #1026]",
"lsl w24, w23, w20",
"bic w21, w21, w24",
"strb w21, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"strb w20, [x28, #747]",
"ldr w6, [x9, w22, sxtw]",
"ldr s2, [x6, #8]",
"mrs x0, nzcv",
@ -945,10 +929,8 @@
"ldrb w21, [x28, #1026]",
"lsl w24, w23, w20",
"bic w21, w21, w24",
"strb w21, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"strb w20, [x28, #747]",
"ldr w4, [x9, w22, sxtw]",
"ldr s2, [x4, #24]",
"mrs x0, nzcv",
@ -1020,10 +1002,8 @@
"ldrb w21, [x28, #1026]",
"lsl w22, w23, w20",
"bic w21, w21, w22",
"strb w21, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"strb w20, [x28, #747]",
"mov w22, #0xffffffbc",
"ldr w5, [x9, w22, sxtw]",
"ldr s2, [x5, #40]",
@ -1097,10 +1077,8 @@
"ldrb w21, [x28, #1026]",
"lsl w24, w23, w20",
"bic w21, w21, w24",
"strb w21, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"strb w20, [x28, #747]",
"ldr w6, [x9, w22, sxtw]",
"ldr s2, [x6, #56]",
"mrs x0, nzcv",
@ -1172,10 +1150,8 @@
"ldrb w21, [x28, #1026]",
"lsl w24, w23, w20",
"bic w21, w21, w24",
"strb w21, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"strb w20, [x28, #747]",
"ldr w4, [x9, w22, sxtw]",
"ldr s2, [x4, #12]",
"mrs x0, nzcv",
@ -1247,10 +1223,8 @@
"ldrb w21, [x28, #1026]",
"lsl w24, w23, w20",
"bic w21, w21, w24",
"strb w21, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"strb w20, [x28, #747]",
"ldr w5, [x9, w22, sxtw]",
"ldr s2, [x5, #28]",
"mrs x0, nzcv",
@ -1322,10 +1296,8 @@
"ldrb w21, [x28, #1026]",
"lsl w24, w23, w20",
"bic w21, w21, w24",
"strb w21, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"strb w20, [x28, #747]",
"ldr w6, [x9, w22, sxtw]",
"ldr s2, [x6, #44]",
"mrs x0, nzcv",
@ -1397,10 +1369,8 @@
"ldrb w21, [x28, #1026]",
"lsl w22, w23, w20",
"bic w21, w21, w22",
"strb w21, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"strb w20, [x28, #747]",
"mov w22, #0xffffffbc",
"ldr w4, [x9, w22, sxtw]",
"ldr s2, [x4, #60]",
@ -1474,10 +1444,8 @@
"ldrb w21, [x28, #1026]",
"lsl w23, w22, w20",
"bic w21, w21, w23",
"strb w21, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"strb w20, [x28, #747]",
"ldr w5, [x9, #8]",
"mov w23, #0xffffffc0",
"ldr s2, [x9, w23, sxtw]",
@ -1549,10 +1517,8 @@
"ldrb w21, [x28, #1026]",
"lsl w23, w22, w20",
"bic w21, w21, w23",
"strb w21, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"strb w20, [x28, #747]",
"ldr w6, [x9, #8]",
"mov w23, #0xffffffc4",
"ldr s2, [x9, w23, sxtw]",
@ -1624,10 +1590,8 @@
"ldrb w21, [x28, #1026]",
"lsl w23, w22, w20",
"bic w21, w21, w23",
"strb w21, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"strb w20, [x28, #747]",
"ldr w4, [x9, #8]",
"mov w23, #0xffffffc8",
"ldr s2, [x9, w23, sxtw]",
@ -1699,10 +1663,8 @@
"ldrb w21, [x28, #1026]",
"lsl w23, w22, w20",
"bic w21, w21, w23",
"strb w21, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"strb w20, [x28, #747]",
"ldr w5, [x9, #8]",
"mov w23, #0xffffffcc",
"ldr s2, [x9, w23, sxtw]",
@ -1774,10 +1736,8 @@
"ldrb w21, [x28, #1026]",
"lsl w22, w22, w20",
"bic w21, w21, w22",
"strb w21, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"strb w20, [x28, #747]",
"ldr w6, [x9, #8]",
"mov w22, #0xffffffd0",
"ldr s2, [x9, w22, sxtw]",
@ -1850,10 +1810,8 @@
"ldrb w21, [x28, #1026]",
"lsl w23, w22, w20",
"bic w21, w21, w23",
"strb w21, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"strb w20, [x28, #747]",
"ldr w4, [x9, #8]",
"mov w23, #0xffffffd4",
"ldr s2, [x9, w23, sxtw]",
@ -1925,10 +1883,8 @@
"ldrb w21, [x28, #1026]",
"lsl w23, w22, w20",
"bic w21, w21, w23",
"strb w21, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"strb w20, [x28, #747]",
"ldr w5, [x9, #8]",
"mov w23, #0xffffffd8",
"ldr s2, [x9, w23, sxtw]",
@ -2000,10 +1956,8 @@
"ldrb w21, [x28, #1026]",
"lsl w23, w22, w20",
"bic w21, w21, w23",
"strb w21, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"strb w20, [x28, #747]",
"ldr w6, [x9, #8]",
"mov w23, #0xffffffdc",
"ldr s2, [x9, w23, sxtw]",
@ -2075,10 +2029,8 @@
"ldrb w21, [x28, #1026]",
"lsl w23, w22, w20",
"bic w21, w21, w23",
"strb w21, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"strb w20, [x28, #747]",
"ldr w4, [x9, #8]",
"mov w23, #0xffffffe0",
"ldr s2, [x9, w23, sxtw]",
@ -2150,10 +2102,8 @@
"ldrb w21, [x28, #1026]",
"lsl w22, w22, w20",
"bic w21, w21, w22",
"strb w21, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"strb w20, [x28, #747]",
"ldr w5, [x9, #8]",
"mov w22, #0xffffffe4",
"ldr s2, [x9, w22, sxtw]",
@ -2226,10 +2176,8 @@
"ldrb w21, [x28, #1026]",
"lsl w23, w22, w20",
"bic w21, w21, w23",
"strb w21, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"strb w20, [x28, #747]",
"ldr w6, [x9, #8]",
"mov w23, #0xffffffe8",
"ldr s2, [x9, w23, sxtw]",
@ -2301,10 +2249,8 @@
"ldrb w21, [x28, #1026]",
"lsl w23, w22, w20",
"bic w21, w21, w23",
"strb w21, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"strb w20, [x28, #747]",
"ldr w4, [x9, #8]",
"mov w23, #0xffffffec",
"ldr s2, [x9, w23, sxtw]",
@ -2376,10 +2322,8 @@
"ldrb w21, [x28, #1026]",
"lsl w23, w22, w20",
"bic w21, w21, w23",
"strb w21, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"strb w20, [x28, #747]",
"ldr w5, [x9, #8]",
"mov w23, #0xfffffff0",
"ldr s2, [x9, w23, sxtw]",
@ -2451,10 +2395,8 @@
"ldrb w21, [x28, #1026]",
"lsl w23, w22, w20",
"bic w21, w21, w23",
"strb w21, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"strb w20, [x28, #747]",
"ldr w6, [x9, #8]",
"mov w23, #0xfffffff4",
"ldr s2, [x9, w23, sxtw]",
@ -2526,10 +2468,8 @@
"ldrb w21, [x28, #1026]",
"lsl w22, w22, w20",
"bic w21, w21, w22",
"strb w21, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"strb w20, [x28, #747]",
"ldr w4, [x9, #8]",
"mov w22, #0xfffffff8",
"ldr s2, [x9, w22, sxtw]",
@ -2602,10 +2542,8 @@
"ldrb w21, [x28, #1026]",
"lsl w23, w22, w20",
"bic w21, w21, w23",
"strb w21, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"strb w20, [x28, #747]",
"ldr w5, [x9, #8]",
"mov w23, #0xfffffffc",
"ldr s2, [x9, w23, sxtw]",

View File

@ -13,7 +13,7 @@
},
"Instructions": {
"FMOD scalar loop": {
"ExpectedInstructionCount": 72,
"ExpectedInstructionCount": 70,
"x86Insts": [
"mov esi, ecx",
"mov rdx, rbp",
@ -55,76 +55,74 @@
"sub esi, 0x1"
],
"ExpectedArm64ASM": [
"mov w10, w5",
"mov x6, x9",
"mov x4, x7",
"ldr s18, [x6]",
"add x4, x4, #0x20 (32)",
"fmul s18, s18, s16",
"add x6, x6, #0x20 (32)",
"mov w27, w5",
"ldr s2, [x9]",
"add x4, x7, #0x20 (32)",
"fmul s2, s2, s16",
"add x6, x9, #0x20 (32)",
"sub x20, x4, #0x20 (32)",
"ldr s2, [x20]",
"fadd s18, s18, s2",
"ldr s3, [x20]",
"fadd s2, s2, s3",
"sub x20, x4, #0x20 (32)",
"str s18, [x20]",
"str s2, [x20]",
"sub x20, x6, #0x1c (28)",
"ldr s18, [x20]",
"fmul s18, s18, s17",
"sub x20, x4, #0x1c (28)",
"ldr s2, [x20]",
"fadd s18, s18, s2",
"fmul s2, s2, s17",
"sub x20, x4, #0x1c (28)",
"str s18, [x20]",
"ldr s3, [x20]",
"fadd s2, s2, s3",
"sub x20, x4, #0x1c (28)",
"str s2, [x20]",
"sub x20, x6, #0x18 (24)",
"ldr s18, [x20]",
"fmul s18, s18, s16",
"sub x20, x4, #0x18 (24)",
"ldr s2, [x20]",
"fadd s18, s18, s2",
"fmul s2, s2, s16",
"sub x20, x4, #0x18 (24)",
"str s18, [x20]",
"ldr s3, [x20]",
"fadd s2, s2, s3",
"sub x20, x4, #0x18 (24)",
"str s2, [x20]",
"sub x20, x6, #0x14 (20)",
"ldr s18, [x20]",
"fmul s18, s18, s17",
"sub x20, x4, #0x14 (20)",
"ldr s2, [x20]",
"fadd s18, s18, s2",
"fmul s2, s2, s17",
"sub x20, x4, #0x14 (20)",
"str s18, [x20]",
"ldr s3, [x20]",
"fadd s2, s2, s3",
"sub x20, x4, #0x14 (20)",
"str s2, [x20]",
"sub x20, x6, #0x10 (16)",
"ldr s18, [x20]",
"fmul s18, s18, s16",
"sub x20, x4, #0x10 (16)",
"ldr s2, [x20]",
"fadd s18, s18, s2",
"fmul s2, s2, s16",
"sub x20, x4, #0x10 (16)",
"str s18, [x20]",
"ldr s3, [x20]",
"fadd s2, s2, s3",
"sub x20, x4, #0x10 (16)",
"str s2, [x20]",
"sub x20, x6, #0xc (12)",
"ldr s18, [x20]",
"fmul s18, s18, s17",
"sub x20, x4, #0xc (12)",
"ldr s2, [x20]",
"fadd s18, s18, s2",
"fmul s2, s2, s17",
"sub x20, x4, #0xc (12)",
"str s18, [x20]",
"ldr s3, [x20]",
"fadd s2, s2, s3",
"sub x20, x4, #0xc (12)",
"str s2, [x20]",
"sub x20, x6, #0x8 (8)",
"ldr s18, [x20]",
"fmul s18, s18, s16",
"sub x20, x4, #0x8 (8)",
"ldr s2, [x20]",
"fadd s18, s18, s2",
"fmul s2, s2, s16",
"sub x20, x4, #0x8 (8)",
"str s18, [x20]",
"ldr s3, [x20]",
"fadd s2, s2, s3",
"sub x20, x4, #0x8 (8)",
"str s2, [x20]",
"sub x20, x6, #0x4 (4)",
"ldr s18, [x20]",
"fmul s18, s18, s17",
"sub x20, x4, #0x4 (4)",
"ldr s2, [x20]",
"fadd s18, s18, s2",
"fmul s2, s2, s17",
"sub x20, x4, #0x4 (4)",
"ldr s3, [x20]",
"mov v18.16b, v2.16b",
"fadd s18, s2, s3",
"sub x20, x4, #0x4 (4)",
"str s18, [x20]",
"mov x27, x10",
"subs w26, w10, #0x1 (1)",
"subs w26, w27, #0x1 (1)",
"cfinv",
"mov x10, x26"
]

View File

@ -6757,7 +6757,7 @@
]
},
"fucompp": {
"ExpectedInstructionCount": 58,
"ExpectedInstructionCount": 57,
"Comment": [
"0xda 11b 0xe9 /5"
],
@ -6811,7 +6811,6 @@
"ldrb w22, [x28, #1026]",
"lsl w23, w21, w20",
"bic w22, w22, w23",
"strb w22, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"lsl w21, w21, w20",
@ -14681,7 +14680,7 @@
]
},
"fcompp": {
"ExpectedInstructionCount": 58,
"ExpectedInstructionCount": 57,
"Comment": [
"0xde 11b 0xd9 /3"
],
@ -14735,7 +14734,6 @@
"ldrb w22, [x28, #1026]",
"lsl w23, w21, w20",
"bic w22, w22, w23",
"strb w22, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"lsl w21, w21, w20",

View File

@ -4071,7 +4071,7 @@
]
},
"fucompp": {
"ExpectedInstructionCount": 30,
"ExpectedInstructionCount": 29,
"Comment": [
"0xda 11b 0xe9 /5"
],
@ -4097,7 +4097,6 @@
"ldrb w22, [x28, #1026]",
"lsl w23, w21, w20",
"bic w22, w22, w23",
"strb w22, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"lsl w21, w21, w20",
@ -6404,7 +6403,7 @@
]
},
"frstor [rax]": {
"ExpectedInstructionCount": 325,
"ExpectedInstructionCount": 324,
"Comment": [
"0xdd !11b /4"
],
@ -6420,7 +6419,6 @@
"bfi x0, x1, #24, #1",
"msr fpcr, x0",
"strh w20, [x28, #1024]",
"strh w20, [x28, #1024]",
"ldr w20, [x4, #4]",
"ubfx w21, w20, #11, #3",
"strb w21, [x28, #747]",
@ -8570,7 +8568,7 @@
]
},
"fcompp": {
"ExpectedInstructionCount": 30,
"ExpectedInstructionCount": 29,
"Comment": [
"0xde 11b 0xd9 /3"
],
@ -8596,7 +8594,6 @@
"ldrb w22, [x28, #1026]",
"lsl w23, w21, w20",
"bic w22, w22, w23",
"strb w22, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"lsl w21, w21, w20",

View File

@ -6756,7 +6756,7 @@
]
},
"fucompp": {
"ExpectedInstructionCount": 58,
"ExpectedInstructionCount": 57,
"Comment": [
"0xda 11b 0xe9 /5"
],
@ -6810,7 +6810,6 @@
"ldrb w22, [x28, #1026]",
"lsl w23, w21, w20",
"bic w22, w22, w23",
"strb w22, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"lsl w21, w21, w20",
@ -14696,7 +14695,7 @@
]
},
"fcompp": {
"ExpectedInstructionCount": 58,
"ExpectedInstructionCount": 57,
"Comment": [
"0xde 11b 0xd9 /3"
],
@ -14750,7 +14749,6 @@
"ldrb w22, [x28, #1026]",
"lsl w23, w21, w20",
"bic w22, w22, w23",
"strb w22, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"lsl w21, w21, w20",

View File

@ -4091,7 +4091,7 @@
]
},
"fucompp": {
"ExpectedInstructionCount": 31,
"ExpectedInstructionCount": 30,
"Comment": [
"0xda 11b 0xe9 /5"
],
@ -4118,7 +4118,6 @@
"ldrb w22, [x28, #1026]",
"lsl w23, w21, w20",
"bic w22, w22, w23",
"strb w22, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"lsl w21, w21, w20",
@ -6523,7 +6522,7 @@
]
},
"frstor [rax]": {
"ExpectedInstructionCount": 325,
"ExpectedInstructionCount": 324,
"Comment": [
"0xdd !11b /4"
],
@ -6539,7 +6538,6 @@
"bfi x0, x1, #24, #1",
"msr fpcr, x0",
"strh w20, [x28, #1024]",
"strh w20, [x28, #1024]",
"ldr w20, [x4, #4]",
"ubfx w21, w20, #11, #3",
"strb w21, [x28, #747]",
@ -8707,7 +8705,7 @@
]
},
"fcompp": {
"ExpectedInstructionCount": 31,
"ExpectedInstructionCount": 30,
"Comment": [
"0xde 11b 0xd9 /3"
],
@ -8734,7 +8732,6 @@
"ldrb w22, [x28, #1026]",
"lsl w23, w21, w20",
"bic w22, w22, w23",
"strb w22, [x28, #1026]",
"add w20, w20, #0x1 (1)",
"and w20, w20, #0x7",
"lsl w21, w21, w20",