[ARM] Extract shifts out of multiply-by-constant

Turning (op x (mul y k)) into (op x (lsl (mul y k>>n) n)) is beneficial when we can do the lsl as a shifted operand and the resulting multiply constant is simpler to generate. Do this by doing the transformation when trying to select a shifted operand, as that ensures that it actually turns out better (the alternative would be to do it in PreprocessISelDAG, but we don't know for sure there if extracting the shift would allow a shifted operand to be used). Differential Revision: http://reviews.llvm.org/D12196 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@247569 91177308-0d34-0410-b5e6-96231b3b80d8
2024-12-26 14:15:53 +00:00 · 2015-09-14 15:19:41 +00:00 · 2015-09-14 15:19:41 +00:00 · e50e6f3e3d
commit e50e6f3e3d
parent 74e437e7b1
2 changed files with 293 additions and 50 deletions
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@ -271,6 +271,22 @@ private:
  // Get the alignment operand for a NEON VLD or VST instruction.
  SDValue GetVLDSTAlign(SDValue Align, SDLoc dl, unsigned NumVecs,
                        bool is64BitVector);
+
+  /// Returns the number of instructions required to materialize the given
+  /// constant in a register, or 3 if a literal pool load is needed.
+  unsigned ConstantMaterializationCost(unsigned Val) const;
+
+  /// Checks if N is a multiplication by a constant where we can extract out a
+  /// power of two from the constant so that it can be used in a shift, but only
+  /// if it simplifies the materialization of the constant. Returns true if it
+  /// is, and assigns to PowerOfTwo the power of two that should be extracted
+  /// out and to NewMulConst the new constant to be multiplied by.
+  bool canExtractShiftFromMul(const SDValue &N, unsigned MaxShift,
+                              unsigned &PowerOfTwo, SDValue &NewMulConst) const;
+
+  /// Replace N with M in CurDAG, in a way that also ensures that M gets
+  /// selected when N would have been selected.
+  void replaceDAGValue(const SDValue &N, SDValue M);
 };
 }

@ -464,6 +480,61 @@ bool ARMDAGToDAGISel::isShifterOpProfitable(const SDValue &Shift,
         (ShAmt == 2 || (Subtarget->isSwift() && ShAmt == 1));
 }

+unsigned ARMDAGToDAGISel::ConstantMaterializationCost(unsigned Val) const {
+  if (Subtarget->isThumb()) {
+    if (Val <= 255) return 1;                               // MOV
+    if (Subtarget->hasV6T2Ops() && Val <= 0xffff) return 1; // MOVW
+    if (~Val <= 255) return 2;                              // MOV + MVN
+    if (ARM_AM::isThumbImmShiftedVal(Val)) return 2;        // MOV + LSL
+  } else {
+    if (ARM_AM::getSOImmVal(Val) != -1) return 1;           // MOV
+    if (ARM_AM::getSOImmVal(~Val) != -1) return 1;          // MVN
+    if (Subtarget->hasV6T2Ops() && Val <= 0xffff) return 1; // MOVW
+    if (ARM_AM::isSOImmTwoPartVal(Val)) return 2;           // two instrs
+  }
+  if (Subtarget->useMovt(*MF)) return 2; // MOVW + MOVT
+  return 3; // Literal pool load
+}
+
+bool ARMDAGToDAGISel::canExtractShiftFromMul(const SDValue &N,
+                                             unsigned MaxShift,
+                                             unsigned &PowerOfTwo,
+                                             SDValue &NewMulConst) const {
+  assert(N.getOpcode() == ISD::MUL);
+  assert(MaxShift > 0);
+
+  // If the multiply is used in more than one place then changing the constant
+  // will make other uses incorrect, so don't.
+  if (!N.hasOneUse()) return false;
+  // Check if the multiply is by a constant
+  ConstantSDNode *MulConst = dyn_cast<ConstantSDNode>(N.getOperand(1));
+  if (!MulConst) return false;
+  // If the constant is used in more than one place then modifying it will mean
+  // we need to materialize two constants instead of one, which is a bad idea.
+  if (!MulConst->hasOneUse()) return false;
+  unsigned MulConstVal = MulConst->getZExtValue();
+  if (MulConstVal == 0) return false;
+
+  // Find the largest power of 2 that MulConstVal is a multiple of
+  PowerOfTwo = MaxShift;
+  while ((MulConstVal % (1 << PowerOfTwo)) != 0) {
+    --PowerOfTwo;
+    if (PowerOfTwo == 0) return false;
+  }
+
+  // Only optimise if the new cost is better
+  unsigned NewMulConstVal = MulConstVal / (1 << PowerOfTwo);
+  NewMulConst = CurDAG->getConstant(NewMulConstVal, SDLoc(N), MVT::i32);
+  unsigned OldCost = ConstantMaterializationCost(MulConstVal);
+  unsigned NewCost = ConstantMaterializationCost(NewMulConstVal);
+  return NewCost < OldCost;
+}
+
+void ARMDAGToDAGISel::replaceDAGValue(const SDValue &N, SDValue M) {
+  CurDAG->RepositionNode(N.getNode(), M.getNode());
+  CurDAG->ReplaceAllUsesWith(N, M);
+}
+
 bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N,
                                              SDValue &BaseReg,
                                              SDValue &Opc,
@ -471,6 +542,21 @@ bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N,
  if (DisableShifterOp)
    return false;

+  // If N is a multiply-by-constant and it's profitable to extract a shift and
+  // use it in a shifted operand do so.
+  if (N.getOpcode() == ISD::MUL) {
+    unsigned PowerOfTwo = 0;
+    SDValue NewMulConst;
+    if (canExtractShiftFromMul(N, 31, PowerOfTwo, NewMulConst)) {
+      replaceDAGValue(N.getOperand(1), NewMulConst);
+      BaseReg = N;
+      Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ARM_AM::lsl,
+                                                          PowerOfTwo),
+                                      SDLoc(N), MVT::i32);
+      return true;
+    }
+  }
+
  ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode());

  // Don't match base register only case. That is matched to a separate
@ -655,6 +741,18 @@ bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset,
    }
  }

+  // If Offset is a multiply-by-constant and it's profitable to extract a shift
+  // and use it in a shifted operand do so.
+  if (Offset.getOpcode() == ISD::MUL) {
+    unsigned PowerOfTwo = 0;
+    SDValue NewMulConst;
+    if (canExtractShiftFromMul(Offset, 31, PowerOfTwo, NewMulConst)) {
+      replaceDAGValue(Offset.getOperand(1), NewMulConst);
+      ShAmt = PowerOfTwo;
+      ShOpcVal = ARM_AM::lsl;
+    }
+  }
+
  Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal),
                                  SDLoc(N), MVT::i32);
  return true;
@ -1314,6 +1412,17 @@ bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N,
    }
  }

+  // If OffReg is a multiply-by-constant and it's profitable to extract a shift
+  // and use it in a shifted operand do so.
+  if (OffReg.getOpcode() == ISD::MUL) {
+    unsigned PowerOfTwo = 0;
+    SDValue NewMulConst;
+    if (canExtractShiftFromMul(OffReg, 3, PowerOfTwo, NewMulConst)) {
+      replaceDAGValue(OffReg.getOperand(1), NewMulConst);
+      ShAmt = PowerOfTwo;
+    }
+  }
+
  ShImm = CurDAG->getTargetConstant(ShAmt, SDLoc(N), MVT::i32);

  return true;
@ -2392,25 +2501,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
  }
  case ISD::Constant: {
    unsigned Val = cast<ConstantSDNode>(N)->getZExtValue();
-    bool UseCP = true;
-    if (Subtarget->useMovt(*MF))
-      // Thumb2-aware targets have the MOVT instruction, so all immediates can
-      // be done with MOV + MOVT, at worst.
-      UseCP = false;
-    else {
-      if (Subtarget->isThumb()) {
-        UseCP = (Val > 255 &&                                  // MOV
-                 ~Val > 255 &&                                 // MOV + MVN
-                 !ARM_AM::isThumbImmShiftedVal(Val) &&         // MOV + LSL
-                 !(Subtarget->hasV6T2Ops() && Val <= 0xffff)); // MOVW
-      } else
-        UseCP = (ARM_AM::getSOImmVal(Val) == -1 &&             // MOV
-                 ARM_AM::getSOImmVal(~Val) == -1 &&            // MVN
-                 !ARM_AM::isSOImmTwoPartVal(Val) &&            // two instrs.
-                 !(Subtarget->hasV6T2Ops() && Val <= 0xffff)); // MOVW
-    }
-
-    if (UseCP) {
+    // If we can't materialize the constant we need to use a literal pool
+    if (ConstantMaterializationCost(Val) > 2) {
      SDValue CPIdx = CurDAG->getTargetConstantPool(
          ConstantInt::get(Type::getInt32Ty(*CurDAG->getContext()), Val),
          TLI->getPointerTy(CurDAG->getDataLayout()));
--- a/test/CodeGen/ARM/shifter_operand.ll
+++ b/test/CodeGen/ARM/shifter_operand.ll
@ -1,14 +1,14 @@
-; RUN: llc < %s -mtriple=armv7-apple-darwin -mcpu=cortex-a8 | FileCheck %s -check-prefix=A8
-; RUN: llc < %s -mtriple=armv7-apple-darwin -mcpu=cortex-a9 | FileCheck %s -check-prefix=A9
+; RUN: llc < %s -mtriple=armv7-apple-darwin -mcpu=cortex-a8 | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-ARM
+; RUN: llc < %s -mtriple=armv7-apple-darwin -mcpu=cortex-a9 | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-ARM
+; RUN: llc < %s -mtriple=thumbv7m-none-eabi | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-THUMB
 ; rdar://8576755


 define i32 @test1(i32 %X, i32 %Y, i8 %sh) {
-; A8-LABEL: test1:
-; A8: add r0, r0, r1, lsl r2
-
-; A9-LABEL: test1:
-; A9: add r0, r0, r1, lsl r2
+; CHECK-LABEL: test1:
+; CHECK-ARM: add r0, r0, r1, lsl r2
+; CHECK-THUMB: lsls r1, r2
+; CHECK-THUMB: add r0, r1
        %shift.upgrd.1 = zext i8 %sh to i32
        %A = shl i32 %Y, %shift.upgrd.1
        %B = add i32 %X, %A
@ -16,11 +16,10 @@ define i32 @test1(i32 %X, i32 %Y, i8 %sh) {
 }

 define i32 @test2(i32 %X, i32 %Y, i8 %sh) {
-; A8-LABEL: test2:
-; A8: bic r0, r0, r1, asr r2
-
-; A9-LABEL: test2:
-; A9: bic r0, r0, r1, asr r2
+; CHECK-LABEL: test2:
+; CHECK-ARM: bic r0, r0, r1, asr r2
+; CHECK-THUMB: asrs r1, r2
+; CHECK-THUMB: bics r0, r1
        %shift.upgrd.2 = zext i8 %sh to i32
        %A = ashr i32 %Y, %shift.upgrd.2
        %B = xor i32 %A, -1
@ -30,14 +29,9 @@ define i32 @test2(i32 %X, i32 %Y, i8 %sh) {

 define i32 @test3(i32 %base, i32 %base2, i32 %offset) {
 entry:
-; A8-LABEL: test3:
-; A8: ldr r0, [r0, r2, lsl #2]
-; A8: ldr r1, [r1, r2, lsl #2]
-
-; lsl #2 is free
-; A9-LABEL: test3:
-; A9: ldr r0, [r0, r2, lsl #2]
-; A9: ldr r1, [r1, r2, lsl #2]
+; CHECK-LABEL: test3:
+; CHECK: ldr{{(.w)?}} r0, [r0, r2, lsl #2]
+; CHECK: ldr{{(.w)?}} r1, [r1, r2, lsl #2]
        %tmp1 = shl i32 %offset, 2
        %tmp2 = add i32 %base, %tmp1
        %tmp3 = inttoptr i32 %tmp2 to i32*
@ -53,17 +47,11 @@ declare i8* @malloc(...)

 define fastcc void @test4(i16 %addr) nounwind {
 entry:
-; A8-LABEL: test4:
-; A8: ldr [[REG:r[0-9]+]], [r0, r1, lsl #2]
-; A8-NOT: ldr [[REG:r[0-9]+]], [r0, r1, lsl #2]!
-; A8: str [[REG]], [r0, r1, lsl #2]
-; A8-NOT: str [[REG]], [r0]
-
-; A9-LABEL: test4:
-; A9: ldr [[REG:r[0-9]+]], [r0, r1, lsl #2]
-; A9-NOT: ldr [[REG:r[0-9]+]], [r0, r1, lsl #2]!
-; A9: str [[REG]], [r0, r1, lsl #2]
-; A9-NOT: str [[REG]], [r0]
+; CHECK-LABEL: test4:
+; CHECK: ldr{{(.w)?}} [[REG:r[0-9]+]], [r0, r1, lsl #2]
+; CHECK-NOT: ldr{{(.w)?}} [[REG:r[0-9]+]], [r0, r1, lsl #2]!
+; CHECK: str{{(.w)?}} [[REG]], [r0, r1, lsl #2]
+; CHECK-NOT: str{{(.w)?}} [[REG]], [r0]
  %0 = tail call i8* (...) @malloc(i32 undef) nounwind
  %1 = bitcast i8* %0 to i32*
  %2 = sext i16 %addr to i32
@ -73,3 +61,166 @@ entry:
  store i32 %5, i32* %3, align 4
  ret void
 }
+
+define i32 @test_orr_extract_from_mul_1(i32 %x, i32 %y) {
+entry:
+; CHECK-LABEL: test_orr_extract_from_mul_1
+; CHECK: movw r2, #63767
+; CHECK-ARM: mul r1, r1, r2
+; CHECK-ARM: orr r0, r1, r0
+; CHECK-THUMB: muls r1, r2, r1
+; CHECk-THUMB: orrs r0, r1
+  %mul = mul i32 %y, 63767
+  %or = or i32 %mul, %x
+  ret i32 %or
+}
+
+define i32 @test_orr_extract_from_mul_2(i32 %x, i32 %y) {
+; CHECK-LABEL: test_orr_extract_from_mul_2
+; CHECK: movw r2, #63767
+; CHECK-ARM: mul r1, r1, r2
+; CHECK-THUMB: muls r1, r2, r1
+; CHECK: orr{{(.w)?}} r0, r0, r1, lsl #1
+entry:
+  %mul1 = mul i32 %y, 127534
+  %or = or i32 %mul1, %x
+  ret i32 %or
+}
+
+define i32 @test_orr_extract_from_mul_3(i32 %x, i32 %y) {
+; CHECK-LABEL: test_orr_extract_from_mul_3
+; CHECK: movw r2, #63767
+; CHECK-ARM: mul r1, r1, r2
+; CHECK-THUMB: muls r1, r2, r1
+; CHECK: orr{{(.w)?}} r0, r0, r1, lsl #2
+entry:
+  %mul1 = mul i32 %y, 255068
+  %or = or i32 %mul1, %x
+  ret i32 %or
+}
+
+define i32 @test_orr_extract_from_mul_4(i32 %x, i32 %y) {
+; CHECK-LABEL: test_orr_extract_from_mul_4
+; CHECK: movw r2, #63767
+; CHECK-ARM: mul r1, r1, r2
+; CHECK-THUMB: muls r1, r2, r1
+; CHECK: orr{{(.w)?}} r0, r0, r1, lsl #3
+entry:
+  %mul1 = mul i32 %y, 510136
+  %or = or i32 %mul1, %x
+  ret i32 %or
+}
+
+define i32 @test_orr_extract_from_mul_5(i32 %x, i32 %y) {
+; CHECK-LABEL: test_orr_extract_from_mul_5
+; CHECK: movw r2, #63767
+; CHECK-ARM: mul r1, r1, r2
+; CHECK-THUMB: muls r1, r2, r1
+; CHECK: orr{{(.w)?}} r0, r0, r1, lsl #4
+entry:
+  %mul1 = mul i32 %y, 1020272
+  %or = or i32 %mul1, %x
+  ret i32 %or
+}
+
+define i32 @test_orr_extract_from_mul_6(i32 %x, i32 %y) {
+; CHECK-LABEL: test_orr_extract_from_mul_6
+; CHECK: movw r2, #63767
+; CHECK-ARM: mul r1, r1, r2
+; CHECK-THUMB: muls r1, r2, r1
+; CHECK: orr{{(.w)?}} r0, r0, r1, lsl #16
+entry:
+  %mul = mul i32 %y, -115933184
+  %or = or i32 %mul, %x
+  ret i32 %or
+}
+
+define i32 @test_load_extract_from_mul_1(i8* %x, i32 %y) {
+; CHECK-LABEL: test_load_extract_from_mul_1
+; CHECK: movw r2, #63767
+; CHECK-ARM: mul r1, r1, r2
+; CHECK-THUMB: muls r1, r2, r1
+; CHECK: ldrb r0, [r0, r1]
+entry:
+  %mul = mul i32 %y, 63767
+  %arrayidx = getelementptr inbounds i8, i8* %x, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  ret i32 %conv
+}
+
+define i32 @test_load_extract_from_mul_2(i8* %x, i32 %y) {
+; CHECK-LABEL: test_load_extract_from_mul_2
+; CHECK: movw r2, #63767
+; CHECK-ARM: mul r1, r1, r2
+; CHECK-THUMB: muls r1, r2, r1
+; CHECK: ldrb{{(.w)?}} r0, [r0, r1, lsl #1]
+entry:
+  %mul1 = mul i32 %y, 127534
+  %arrayidx = getelementptr inbounds i8, i8* %x, i32 %mul1
+  %0 = load i8, i8* %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  ret i32 %conv
+}
+
+define i32 @test_load_extract_from_mul_3(i8* %x, i32 %y) {
+; CHECK-LABEL: test_load_extract_from_mul_3
+; CHECK: movw r2, #63767
+; CHECK-ARM: mul r1, r1, r2
+; CHECK-THUMB: muls r1, r2, r1
+; CHECK: ldrb{{(.w)?}} r0, [r0, r1, lsl #2]
+entry:
+  %mul1 = mul i32 %y, 255068
+  %arrayidx = getelementptr inbounds i8, i8* %x, i32 %mul1
+  %0 = load i8, i8* %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  ret i32 %conv
+}
+
+define i32 @test_load_extract_from_mul_4(i8* %x, i32 %y) {
+; CHECK-LABEL: test_load_extract_from_mul_4
+; CHECK: movw r2, #63767
+; CHECK-ARM: mul r1, r1, r2
+; CHECK-THUMB: muls r1, r2, r1
+; CHECK: ldrb{{(.w)?}} r0, [r0, r1, lsl #3]
+entry:
+  %mul1 = mul i32 %y, 510136
+  %arrayidx = getelementptr inbounds i8, i8* %x, i32 %mul1
+  %0 = load i8, i8* %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  ret i32 %conv
+}
+
+define i32 @test_load_extract_from_mul_5(i8* %x, i32 %y) {
+; CHECK-LABEL: test_load_extract_from_mul_5
+; CHECK-ARM: movw r2, #63767
+; CHECK-ARM: mul r1, r1, r2
+; CHECK-ARM: ldrb r0, [r0, r1, lsl #4]
+; CHECK-THUMB: movw r2, #37232
+; CHECK-THUMB: movt r2, #15
+; CHECK-THUMB: muls r1, r2, r1
+; CHECK-THUMB: ldrb r0, [r0, r1]
+entry:
+  %mul1 = mul i32 %y, 1020272
+  %arrayidx = getelementptr inbounds i8, i8* %x, i32 %mul1
+  %0 = load i8, i8* %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  ret i32 %conv
+}
+
+define i32 @test_load_extract_from_mul_6(i8* %x, i32 %y) {
+; CHECK-LABEL: test_load_extract_from_mul_6
+; CHECK-ARM: movw r2, #63767
+; CHECK-ARM: mul r1, r1, r2
+; CHECK-ARM: ldrb r0, [r0, r1, lsl #16]
+; CHECK-THUMB: movs r2, #0
+; CHECK-THUMB: movt r2, #63767
+; CHECK-THUMB: muls r1, r2, r1
+; CHECK-THUMB: ldrb r0, [r0, r1]
+entry:
+  %mul = mul i32 %y, -115933184
+  %arrayidx = getelementptr inbounds i8, i8* %x, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  ret i32 %conv
+}