Merge pull request #3669 from Sonicadvance1/fix_addshift_operation

ConstProp fixes for Darwinia
Ryan Houdek 2024-05-29 19:43:13 -07:00 committed by GitHub
commit ab0a6bbe9f
5 changed files with 301 additions and 51 deletions

View File

@@ -1001,7 +1001,8 @@
},
"GPR = AddShift OpSize:#Size, GPR:$Src1, GPR:$Src2, ShiftType:$Shift{ShiftType::LSL}, u8:$ShiftAmount{0}": {
"Desc": [ "Integer Add with shifted register",
"Will truncate to 64 or 32bits"
"Will truncate to 64 or 32bits",
"Dest = Src1 + (Src2 << ShiftAmount)"
],
"DestSize": "Size",
"EmitValidation": [

View File

@@ -89,25 +89,32 @@ static bool IsTSOImm9(uint64_t imm) {
}
}
using MemExtendedAddrResult = std::tuple<MemOffsetType, uint8_t, OrderedNode*, OrderedNode*>;
struct MemExtendedAddrResult {
MemOffsetType OffsetType;
uint8_t OffsetScale;
OrderedNode* Base;
OrderedNode* OffsetReg;
};
static inline std::optional<MemExtendedAddrResult> TryAddShiftScale(IREmitter* IREmit, uint8_t AccessSize, IROp_Header* AddressHeader) {
auto AddShift = AddressHeader->C<IROp_AddShift>();
if (AddShift->Shift == IR::ShiftType::LSL) {
auto Scale = 1U << AddShift->ShiftAmount;
if (IsMemoryScale(Scale, AccessSize)) {
// remove shift as it can be folded to the mem op
return MemExtendedAddrResult {MEM_OFFSET_SXTX, (uint8_t)Scale, IREmit->UnwrapNode(AddShift->Src1), IREmit->UnwrapNode(AddShift->Src2)};
} else if (Scale == 1) {
return MemExtendedAddrResult {MEM_OFFSET_SXTX, 1, IREmit->UnwrapNode(AddShift->Src1), IREmit->UnwrapNode(AddShift->Src2)};
}
}
return std::nullopt;
}
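// Minimal standalone sketch (hypothetical Sketch* names, not FEX APIs) of the
// hazard the named struct above removes: the old std::tuple return carried two
// indistinguishable OrderedNode* slots, so transposing Base and OffsetReg
// compiled without complaint, while designated initializers keep the roles
// visible at the return site.
struct SketchNode {};
struct SketchResult {
  uint8_t OffsetScale;
  SketchNode* Base;
  SketchNode* OffsetReg;
};
static SketchResult SketchAddressing(SketchNode* Base, SketchNode* Offset) {
  return {.OffsetScale = 1, .Base = Base, .OffsetReg = Offset};
}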
// If this optimization doesn't succeed, it returns std::nullopt
static std::optional<MemExtendedAddrResult> MemExtendedAddressing(IREmitter* IREmit, uint8_t AccessSize, IROp_Header* AddressHeader) {
// Try to optimize: AddShift Base, LSHL(Offset, Scale)
if (AddressHeader->Op == OP_ADDSHIFT) {
auto AddShift = AddressHeader->C<IROp_AddShift>();
if (AddShift->Shift == IR::ShiftType::LSL) {
auto Scale = 1U << AddShift->ShiftAmount;
if (IsMemoryScale(Scale, AccessSize)) {
// remove shift as it can be folded to the mem op
return std::make_optional(
std::make_tuple(MEM_OFFSET_SXTX, (uint8_t)Scale, IREmit->UnwrapNode(AddShift->Src2), IREmit->UnwrapNode(AddShift->Src1)));
} else if (Scale == 1) {
return std::make_optional(std::make_tuple(MEM_OFFSET_SXTX, 1, IREmit->UnwrapNode(AddShift->Src2), IREmit->UnwrapNode(AddShift->Src1)));
}
}
return std::nullopt;
return TryAddShiftScale(IREmit, AccessSize, AddressHeader);
}
LOGMAN_THROW_A_FMT(AddressHeader->Op == OP_ADD, "Invalid address Op");
@@ -119,12 +126,11 @@ static std::optional<MemExtendedAddrResult> MemExtendedAddressing(IREmitter* IRE
if (IREmit->IsValueConstant(Src0Header->Args[1], &Scale)) {
if (IsMemoryScale(Scale, AccessSize)) {
// remove mul as it can be folded to the mem op
return std::make_optional(std::make_tuple(MEM_OFFSET_SXTX, (uint8_t)Scale, IREmit->UnwrapNode(AddressHeader->Args[1]),
IREmit->UnwrapNode(Src0Header->Args[0])));
return MemExtendedAddrResult {MEM_OFFSET_SXTX, (uint8_t)Scale, IREmit->UnwrapNode(AddressHeader->Args[1]),
IREmit->UnwrapNode(Src0Header->Args[0])};
} else if (Scale == 1) {
// remove nop mul
return std::make_optional(
std::make_tuple(MEM_OFFSET_SXTX, 1, IREmit->UnwrapNode(AddressHeader->Args[1]), IREmit->UnwrapNode(Src0Header->Args[0])));
return MemExtendedAddrResult {MEM_OFFSET_SXTX, 1, IREmit->UnwrapNode(AddressHeader->Args[1]), IREmit->UnwrapNode(Src0Header->Args[0])};
}
}
}
@@ -132,15 +138,14 @@ static std::optional<MemExtendedAddrResult> MemExtendedAddressing(IREmitter* IRE
else if (Src0Header->Op == OP_LSHL) {
uint64_t Constant2;
if (IREmit->IsValueConstant(Src0Header->Args[1], &Constant2)) {
uint64_t Scale = 1 << Constant2;
uint8_t Scale = 1 << Constant2;
if (IsMemoryScale(Scale, AccessSize)) {
// remove shift as it can be folded to the mem op
return std::make_optional(
std::make_tuple(MEM_OFFSET_SXTX, Scale, IREmit->UnwrapNode(AddressHeader->Args[1]), IREmit->UnwrapNode(Src0Header->Args[0])));
return MemExtendedAddrResult {MEM_OFFSET_SXTX, Scale, IREmit->UnwrapNode(AddressHeader->Args[1]),
IREmit->UnwrapNode(Src0Header->Args[0])};
} else if (Scale == 1) {
// remove nop shift
return std::make_optional(
std::make_tuple(MEM_OFFSET_SXTX, 1, IREmit->UnwrapNode(AddressHeader->Args[1]), IREmit->UnwrapNode(Src0Header->Args[0])));
return MemExtendedAddrResult {MEM_OFFSET_SXTX, 1, IREmit->UnwrapNode(AddressHeader->Args[1]), IREmit->UnwrapNode(Src0Header->Args[0])};
}
}
}
@@ -149,8 +154,7 @@ static std::optional<MemExtendedAddrResult> MemExtendedAddressing(IREmitter* IRE
auto Bfe = Src0Header->C<IROp_Bfe>();
if (Bfe->lsb == 0 && Bfe->Width == 32) {
// todo: arm can also scale here
return std::make_optional(
std::make_tuple(MEM_OFFSET_UXTW, 1, IREmit->UnwrapNode(AddressHeader->Args[1]), IREmit->UnwrapNode(Src0Header->Args[0])));
return MemExtendedAddrResult {MEM_OFFSET_UXTW, 1, IREmit->UnwrapNode(AddressHeader->Args[1]), IREmit->UnwrapNode(Src0Header->Args[0])};
}
}
// Try to optimize: Base + (s32)Offset
@@ -158,8 +162,7 @@ static std::optional<MemExtendedAddrResult> MemExtendedAddressing(IREmitter* IRE
auto Sbfe = Src0Header->C<IROp_Sbfe>();
if (Sbfe->lsb == 0 && Sbfe->Width == 32) {
// todo: arm can also scale here
return std::make_optional(
std::make_tuple(MEM_OFFSET_SXTW, 1, IREmit->UnwrapNode(AddressHeader->Args[1]), IREmit->UnwrapNode(Src0Header->Args[0])));
return MemExtendedAddrResult {MEM_OFFSET_SXTW, 1, IREmit->UnwrapNode(AddressHeader->Args[1]), IREmit->UnwrapNode(Src0Header->Args[0])};
}
}
}
@@ -181,9 +184,9 @@ static std::optional<MemExtendedAddrResult> MemExtendedAddressing(IREmitter* IRE
int32_t Val32 = (int32_t)ConstVal;
if (Val32 > -16384 && Val32 < 0) {
return std::make_optional(std::make_tuple(MEM_OFFSET_SXTW, 1, Base, Cnt));
return MemExtendedAddrResult {MEM_OFFSET_SXTW, 1, Base, Cnt};
} else if (Val32 >= 0 && Val32 < 16384) {
return std::make_optional(std::make_tuple(MEM_OFFSET_SXTX, 1, Base, Cnt));
return MemExtendedAddrResult {MEM_OFFSET_SXTX, 1, Base, Cnt};
}
} else if (AddressHeader->Size == 4) {
// Do not optimize 32bit reg+reg.
@@ -195,11 +198,28 @@ static std::optional<MemExtendedAddrResult> MemExtendedAddressing(IREmitter* IRE
// ldr w7, [x5, w7, sxtx]
return std::nullopt;
} else {
return std::make_optional(std::make_tuple(MEM_OFFSET_SXTX, 1, Arg0, Arg1));
return MemExtendedAddrResult {MEM_OFFSET_SXTX, 1, Arg0, Arg1};
}
return std::nullopt;
}
static std::optional<MemExtendedAddrResult> MemVectorAtomicExtendedAddressing(IREmitter* IREmit, uint8_t AccessSize, IROp_Header* AddressHeader) {
// Atomic TSO emulation of vectors uses half-barriers, so it gets the full addressing support of vector loadstores.
// Addressing capabilities:
// - LDR, [Reg, Reg, LSL <Size>]
// - LDR, [Reg], imm12 Scaled <Size> ///< TODO: Implement this
// - LDUR, [Reg], imm9 (Signed [-256,256)) ///< TODO: Implement this
// TODO: Implement support for FEAT_LRCPC3.
// - LDAPUR [reg], imm9 (Signed [-256,256))
// Try to optimize: AddShift Base, LSHL(Offset, Scale)
if (AddressHeader->Op == OP_ADDSHIFT) {
return TryAddShiftScale(IREmit, AccessSize, AddressHeader);
}
return std::nullopt;
}
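// Minimal sketch (assumed semantics of IsMemoryScale, hypothetical Sketch*
// name) of the scale rule the helpers above depend on: an ARM64
// register-offset load can only scale the index by the access size, so
// LSL #n folds into the load when (1 << n) == AccessSize, and a scale of 1
// is always representable as a plain extended register offset.
static bool SketchIsMemoryScale(uint64_t Scale, uint8_t AccessSize) {
  return Scale == AccessSize;
}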
static bool IsBfeAlreadyDone(IREmitter* IREmit, OrderedNodeWrapper src, uint64_t Width) {
auto IROp = IREmit->GetOpHeader(src);
if (IROp->Op == OP_BFE) {
@@ -323,18 +343,18 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current
auto Op = IROp->CW<IR::IROp_LoadMemTSO>();
auto AddressHeader = IREmit->GetOpHeader(Op->Addr);
if (Op->Class == FEXCore::IR::FPRClass && AddressHeader->Op == OP_ADD && AddressHeader->Size == 8) {
if (Op->Class == FEXCore::IR::FPRClass && AddressHeader->Size == 8) {
// TODO: LRCPC3 supports a vector unscaled offset like LRCPC2.
// Support once hardware is available to use this.
auto MaybeMemAddr = MemExtendedAddressing(IREmit, IROp->Size, AddressHeader);
auto MaybeMemAddr = MemVectorAtomicExtendedAddressing(IREmit, IROp->Size, AddressHeader);
if (!MaybeMemAddr) {
break;
}
auto [OffsetType, OffsetScale, Arg0, Arg1] = *MaybeMemAddr;
auto [OffsetType, OffsetScale, Base, OffsetReg] = *MaybeMemAddr;
Op->OffsetType = OffsetType;
Op->OffsetScale = OffsetScale;
IREmit->ReplaceNodeArgument(CodeNode, Op->Addr_Index, Arg0); // Addr
IREmit->ReplaceNodeArgument(CodeNode, Op->Offset_Index, Arg1); // Offset
IREmit->ReplaceNodeArgument(CodeNode, Op->Addr_Index, Base); // Addr
IREmit->ReplaceNodeArgument(CodeNode, Op->Offset_Index, OffsetReg); // Offset
}
break;
}
@@ -343,18 +363,18 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current
auto Op = IROp->CW<IR::IROp_StoreMemTSO>();
auto AddressHeader = IREmit->GetOpHeader(Op->Addr);
if (Op->Class == FEXCore::IR::FPRClass && AddressHeader->Op == OP_ADD && AddressHeader->Size == 8) {
if (Op->Class == FEXCore::IR::FPRClass && AddressHeader->Size == 8) {
// TODO: LRCPC3 supports a vector unscaled offset like LRCPC2.
// Support once hardware is available to use this.
auto MaybeMemAddr = MemExtendedAddressing(IREmit, IROp->Size, AddressHeader);
auto MaybeMemAddr = MemVectorAtomicExtendedAddressing(IREmit, IROp->Size, AddressHeader);
if (!MaybeMemAddr) {
break;
}
auto [OffsetType, OffsetScale, Arg0, Arg1] = *MaybeMemAddr;
auto [OffsetType, OffsetScale, Base, OffsetReg] = *MaybeMemAddr;
Op->OffsetType = OffsetType;
Op->OffsetScale = OffsetScale;
IREmit->ReplaceNodeArgument(CodeNode, Op->Addr_Index, Arg0); // Addr
IREmit->ReplaceNodeArgument(CodeNode, Op->Offset_Index, Arg1); // Offset
IREmit->ReplaceNodeArgument(CodeNode, Op->Addr_Index, Base); // Addr
IREmit->ReplaceNodeArgument(CodeNode, Op->Offset_Index, OffsetReg); // Offset
}
break;
}
@@ -368,12 +388,12 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current
if (!MaybeMemAddr) {
break;
}
auto [OffsetType, OffsetScale, Arg0, Arg1] = *MaybeMemAddr;
auto [OffsetType, OffsetScale, Base, OffsetReg] = *MaybeMemAddr;
Op->OffsetType = OffsetType;
Op->OffsetScale = OffsetScale;
IREmit->ReplaceNodeArgument(CodeNode, Op->Addr_Index, Arg0); // Addr
IREmit->ReplaceNodeArgument(CodeNode, Op->Offset_Index, Arg1); // Offset
IREmit->ReplaceNodeArgument(CodeNode, Op->Addr_Index, Base); // Addr
IREmit->ReplaceNodeArgument(CodeNode, Op->Offset_Index, OffsetReg); // Offset
}
break;
}
@@ -387,12 +407,12 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current
if (!MaybeMemAddr) {
break;
}
auto [OffsetType, OffsetScale, Arg0, Arg1] = *MaybeMemAddr;
auto [OffsetType, OffsetScale, Base, OffsetReg] = *MaybeMemAddr;
Op->OffsetType = OffsetType;
Op->OffsetScale = OffsetScale;
IREmit->ReplaceNodeArgument(CodeNode, Op->Addr_Index, Arg0); // Addr
IREmit->ReplaceNodeArgument(CodeNode, Op->Offset_Index, Arg1); // Offset
IREmit->ReplaceNodeArgument(CodeNode, Op->Addr_Index, Base); // Addr
IREmit->ReplaceNodeArgument(CodeNode, Op->Offset_Index, OffsetReg); // Offset
}
break;
}
@@ -408,12 +428,12 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current
if (!MaybeMemAddr) {
break;
}
auto [OffsetType, OffsetScale, Arg0, Arg1] = *MaybeMemAddr;
auto [OffsetType, OffsetScale, Base, OffsetReg] = *MaybeMemAddr;
Op->OffsetType = OffsetType;
Op->OffsetScale = OffsetScale;
IREmit->ReplaceNodeArgument(CodeNode, Op->Addr_Index, Arg0); // Addr
IREmit->ReplaceNodeArgument(CodeNode, Op->Offset_Index, Arg1); // Offset
IREmit->ReplaceNodeArgument(CodeNode, Op->Addr_Index, Base); // Addr
IREmit->ReplaceNodeArgument(CodeNode, Op->Offset_Index, OffsetReg); // Offset
}
break;
}

View File

@@ -0,0 +1,95 @@
%ifdef CONFIG
{
"RegData": {
"RAX": "0x5152535455565758",
"RBX": "0x5152535455565758",
"RCX": "0x5152535455565758",
"RDX": "0x5152535455565758",
"RDI": "0x5152535455565758",
"XMM0": ["0x5152535455565758", "0x0"],
"XMM1": ["0x5152535455565758", "0x0"],
"XMM2": ["0x5152535455565758", "0x0"],
"XMM3": ["0x5152535455565758", "0x0"],
"XMM4": ["0x5152535455565758", "0x0"],
"MM0": "0x5152535455565758",
"MM1": "0x5152535455565758",
"MM2": "0x5152535455565758",
"MM3": "0x5152535455565758",
"MM4": "0x5152535455565758"
},
"MemoryRegions": {
"0x00000000a0000000": "4096",
"0x0000000110000000": "4096"
},
"MemoryData": {
"0x00000000a0000000": "0x4142434445464748",
"0x0000000110000000": "0x5152535455565758"
}
}
%endif
; FEX had a bug in its const-prop pass where the x86 SIB scaled index register would accidentally be transposed with the base register.
; This test exercises SIB in a way where a transposed load would read data from the wrong address.
; Basic layout is [r14 + (r15 * 8)]
; r14 will be the base
mov r14, 0x1000_0000
; r15 will be the index
mov r15, 0x2000_0000
; A correct load will read from 0x0000000110000000
; An incorrectly transposed load would read from 0x00000000a0000000
; Break the block
jmp .test
.test:
; Basic GPR SIB test
mov rax, [r14 + (r15 * 8)]
; Basic Vector SIB test
movq xmm0, [r14 + (r15 * 8)]
; Basic MMX SIB test
movq mm0, [r14 + (r15 * 8)]
; Break the block now
jmp .test2
.test2:
; FEX GPR/XMM LoadMem const-prop may only trigger with a disjoint add + mul, so check that case too
; Need to be able to const-prop the multiply
imul r13, r15, 8
; Test base + offset transposed both ways, for all three types
mov rbx, [r14 + r13]
mov rcx, [r13 + r14]
movq xmm1, [r14 + r13]
movq xmm2, [r13 + r14]
movq mm1, [r14 + r13]
movq mm2, [r13 + r14]
; Break the block now
jmp .test3
.test3:
; FEX GPR/XMM LoadMem const-prop may only trigger with a disjoint add + lshl, so check that case too
; Need to be able to const-prop the lshl
mov r13, r15
shl r13, 3
; Test base + offset transposed both ways, for all three types
mov rdx, [r14 + r13]
mov rdi, [r13 + r14]
movq xmm3, [r14 + r13]
movq xmm4, [r13 + r14]
movq mm3, [r14 + r13]
movq mm4, [r13 + r14]
hlt
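
As a quick cross-check of the two addresses this test distinguishes, a minimal C++ sketch of the arithmetic (plain integer math, no FEX types):

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t Base = 0x10000000;  // r14
  const uint64_t Index = 0x20000000; // r15
  // Correct SIB address: Base + (Index << 3) == 0x0000000110000000
  printf("correct:    0x%llx\n", (unsigned long long)(Base + (Index << 3)));
  // Transposed address: Index + (Base << 3) == 0x00000000a0000000
  printf("transposed: 0x%llx\n", (unsigned long long)(Index + (Base << 3)));
  return 0;
}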

View File

@@ -0,0 +1,26 @@
%ifdef CONFIG
{
"RegData": {
"XMM5": ["0x0000000000000048", "0x0000000000000047"]
}
}
%endif
; FEX-Emu had a bug where a vector load using SIB addressing could overflow to an offset larger than what ARM can encode.
; Test that here.
; The original bug came from the Darwinia Linux binary, in the function `HUF_readDTableX1_wksp`.
mov rbx, 0
lea r15, [rel .data - 0x3d4]
; Break the block
jmp .test
.test:
pmovzxbq xmm5, word [rbx+r15+0x3d4]
hlt
.data:
dq 0x4142434445464748, 0x5152535455565758
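
To see why XMM5 ends up holding [0x48, 0x47]: the -0x3d4 in the lea cancels the +0x3d4 displacement, so the word load reads the first two little-endian bytes of .data, and pmovzxbq zero-extends each byte into its own 64-bit lane. A minimal sketch of that expansion (plain C++, no FEX types):

#include <cstdint>
#include <cstdio>

int main() {
  const uint8_t Src[2] = {0x48, 0x47}; // first two bytes of 0x4142434445464748
  uint64_t Lanes[2];
  for (int i = 0; i < 2; ++i) {
    Lanes[i] = Src[i]; // pmovzxbq: zero-extend each byte to a 64-bit lane
  }
  printf("0x%llx 0x%llx\n", (unsigned long long)Lanes[0], (unsigned long long)Lanes[1]);
  return 0;
}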

View File

@@ -918,7 +918,115 @@
"prefetch [rax + rcx*8]": {
"ExpectedInstructionCount": 1,
"ExpectedArm64ASM": [
"prfm pldl1keep, [x5, x4, sxtx #3]"
"prfm pldl1keep, [x4, x5, sxtx #3]"
]
},
"movzx ebx, byte [rax + rcx*1]": {
"ExpectedInstructionCount": 1,
"ExpectedArm64ASM": [
"ldrb w7, [x4, x5, sxtx]"
]
},
"movzx ebx, byte [rax + rcx*2]": {
"ExpectedInstructionCount": 2,
"ExpectedArm64ASM": [
"add x20, x4, x5, lsl #1",
"ldrb w7, [x20]"
]
},
"movzx ebx, byte [rax + rcx*4]": {
"ExpectedInstructionCount": 2,
"ExpectedArm64ASM": [
"add x20, x4, x5, lsl #2",
"ldrb w7, [x20]"
]
},
"movzx ebx, byte [rax + rcx*8]": {
"ExpectedInstructionCount": 2,
"ExpectedArm64ASM": [
"add x20, x4, x5, lsl #3",
"ldrb w7, [x20]"
]
},
"movzx ebx, word [rax + rcx*1]": {
"ExpectedInstructionCount": 1,
"ExpectedArm64ASM": [
"ldrh w7, [x4, x5, sxtx]"
]
},
"movzx ebx, word [rax + rcx*2]": {
"ExpectedInstructionCount": 2,
"ExpectedArm64ASM": [
"add x20, x4, x5, lsl #1",
"ldrh w7, [x20]"
]
},
"movzx ebx, word [rax + rcx*4]": {
"ExpectedInstructionCount": 2,
"ExpectedArm64ASM": [
"add x20, x4, x5, lsl #2",
"ldrh w7, [x20]"
]
},
"movzx ebx, word [rax + rcx*8]": {
"ExpectedInstructionCount": 2,
"ExpectedArm64ASM": [
"add x20, x4, x5, lsl #3",
"ldrh w7, [x20]"
]
},
"mov ebx, [rax + rcx*1]": {
"ExpectedInstructionCount": 1,
"ExpectedArm64ASM": [
"ldr w7, [x4, x5, sxtx]"
]
},
"mov ebx, [rax + rcx*2]": {
"ExpectedInstructionCount": 2,
"ExpectedArm64ASM": [
"add x20, x4, x5, lsl #1",
"ldr w7, [x20]"
]
},
"mov ebx, [rax + rcx*4]": {
"ExpectedInstructionCount": 2,
"ExpectedArm64ASM": [
"add x20, x4, x5, lsl #2",
"ldr w7, [x20]"
]
},
"mov ebx, [rax + rcx*8]": {
"ExpectedInstructionCount": 2,
"ExpectedArm64ASM": [
"add x20, x4, x5, lsl #3",
"ldr w7, [x20]"
]
},
"mov rbx, [rax + rcx*1]": {
"ExpectedInstructionCount": 1,
"ExpectedArm64ASM": [
"ldr x7, [x4, x5, sxtx]"
]
},
"mov rbx, [rax + rcx*2]": {
"ExpectedInstructionCount": 2,
"ExpectedArm64ASM": [
"add x20, x4, x5, lsl #1",
"ldr x7, [x20]"
]
},
"mov rbx, [rax + rcx*4]": {
"ExpectedInstructionCount": 2,
"ExpectedArm64ASM": [
"add x20, x4, x5, lsl #2",
"ldr x7, [x20]"
]
},
"mov rbx, [rax + rcx*8]": {
"ExpectedInstructionCount": 2,
"ExpectedArm64ASM": [
"add x20, x4, x5, lsl #3",
"ldr x7, [x20]"
]
}
}