[ARM] Extract shifts out of multiply-by-constant

Turning (op x (mul y k)) into (op x (lsl (mul y k>>n) n)) is beneficial when
we can do the lsl as a shifted operand and the resulting multiply constant is
simpler to generate.

Do this by doing the transformation when trying to select a shifted operand,
as that ensures that it actually turns out better (the alternative would be to
do it in PreprocessISelDAG, but we don't know for sure there if extracting the
shift would allow a shifted operand to be used).

Differential Revision: http://reviews.llvm.org/D12196


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@247569 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
John Brawn 2015-09-14 15:19:41 +00:00
parent 74e437e7b1
commit e50e6f3e3d
2 changed files with 293 additions and 50 deletions

View File

@ -271,6 +271,22 @@ private:
// Get the alignment operand for a NEON VLD or VST instruction.
SDValue GetVLDSTAlign(SDValue Align, SDLoc dl, unsigned NumVecs,
bool is64BitVector);
/// Returns the number of instructions required to materialize the given
/// constant in a register, or 3 if a literal pool load is needed.
unsigned ConstantMaterializationCost(unsigned Val) const;
/// Checks if N is a multiplication by a constant where we can extract out a
/// power of two from the constant so that it can be used in a shift, but only
/// if it simplifies the materialization of the constant. Returns true if it
/// is, and assigns to PowerOfTwo the power of two that should be extracted
/// out and to NewMulConst the new constant to be multiplied by.
bool canExtractShiftFromMul(const SDValue &N, unsigned MaxShift,
unsigned &PowerOfTwo, SDValue &NewMulConst) const;
/// Replace N with M in CurDAG, in a way that also ensures that M gets
/// selected when N would have been selected.
void replaceDAGValue(const SDValue &N, SDValue M);
};
}
@ -464,6 +480,61 @@ bool ARMDAGToDAGISel::isShifterOpProfitable(const SDValue &Shift,
(ShAmt == 2 || (Subtarget->isSwift() && ShAmt == 1));
}
unsigned ARMDAGToDAGISel::ConstantMaterializationCost(unsigned Val) const {
if (Subtarget->isThumb()) {
if (Val <= 255) return 1; // MOV
if (Subtarget->hasV6T2Ops() && Val <= 0xffff) return 1; // MOVW
if (~Val <= 255) return 2; // MOV + MVN
if (ARM_AM::isThumbImmShiftedVal(Val)) return 2; // MOV + LSL
} else {
if (ARM_AM::getSOImmVal(Val) != -1) return 1; // MOV
if (ARM_AM::getSOImmVal(~Val) != -1) return 1; // MVN
if (Subtarget->hasV6T2Ops() && Val <= 0xffff) return 1; // MOVW
if (ARM_AM::isSOImmTwoPartVal(Val)) return 2; // two instrs
}
if (Subtarget->useMovt(*MF)) return 2; // MOVW + MOVT
return 3; // Literal pool load
}
bool ARMDAGToDAGISel::canExtractShiftFromMul(const SDValue &N,
unsigned MaxShift,
unsigned &PowerOfTwo,
SDValue &NewMulConst) const {
assert(N.getOpcode() == ISD::MUL);
assert(MaxShift > 0);
// If the multiply is used in more than one place then changing the constant
// will make other uses incorrect, so don't.
if (!N.hasOneUse()) return false;
// Check if the multiply is by a constant
ConstantSDNode *MulConst = dyn_cast<ConstantSDNode>(N.getOperand(1));
if (!MulConst) return false;
// If the constant is used in more than one place then modifying it will mean
// we need to materialize two constants instead of one, which is a bad idea.
if (!MulConst->hasOneUse()) return false;
unsigned MulConstVal = MulConst->getZExtValue();
if (MulConstVal == 0) return false;
// Find the largest power of 2 that MulConstVal is a multiple of
PowerOfTwo = MaxShift;
while ((MulConstVal % (1 << PowerOfTwo)) != 0) {
--PowerOfTwo;
if (PowerOfTwo == 0) return false;
}
// Only optimise if the new cost is better
unsigned NewMulConstVal = MulConstVal / (1 << PowerOfTwo);
NewMulConst = CurDAG->getConstant(NewMulConstVal, SDLoc(N), MVT::i32);
unsigned OldCost = ConstantMaterializationCost(MulConstVal);
unsigned NewCost = ConstantMaterializationCost(NewMulConstVal);
return NewCost < OldCost;
}
void ARMDAGToDAGISel::replaceDAGValue(const SDValue &N, SDValue M) {
CurDAG->RepositionNode(N.getNode(), M.getNode());
CurDAG->ReplaceAllUsesWith(N, M);
}
bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N,
SDValue &BaseReg,
SDValue &Opc,
@ -471,6 +542,21 @@ bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N,
if (DisableShifterOp)
return false;
// If N is a multiply-by-constant and it's profitable to extract a shift and
// use it in a shifted operand do so.
if (N.getOpcode() == ISD::MUL) {
unsigned PowerOfTwo = 0;
SDValue NewMulConst;
if (canExtractShiftFromMul(N, 31, PowerOfTwo, NewMulConst)) {
replaceDAGValue(N.getOperand(1), NewMulConst);
BaseReg = N;
Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ARM_AM::lsl,
PowerOfTwo),
SDLoc(N), MVT::i32);
return true;
}
}
ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode());
// Don't match base register only case. That is matched to a separate
@ -655,6 +741,18 @@ bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset,
}
}
// If Offset is a multiply-by-constant and it's profitable to extract a shift
// and use it in a shifted operand do so.
if (Offset.getOpcode() == ISD::MUL) {
unsigned PowerOfTwo = 0;
SDValue NewMulConst;
if (canExtractShiftFromMul(Offset, 31, PowerOfTwo, NewMulConst)) {
replaceDAGValue(Offset.getOperand(1), NewMulConst);
ShAmt = PowerOfTwo;
ShOpcVal = ARM_AM::lsl;
}
}
Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal),
SDLoc(N), MVT::i32);
return true;
@ -1314,6 +1412,17 @@ bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N,
}
}
// If OffReg is a multiply-by-constant and it's profitable to extract a shift
// and use it in a shifted operand do so.
if (OffReg.getOpcode() == ISD::MUL) {
unsigned PowerOfTwo = 0;
SDValue NewMulConst;
if (canExtractShiftFromMul(OffReg, 3, PowerOfTwo, NewMulConst)) {
replaceDAGValue(OffReg.getOperand(1), NewMulConst);
ShAmt = PowerOfTwo;
}
}
ShImm = CurDAG->getTargetConstant(ShAmt, SDLoc(N), MVT::i32);
return true;
@ -2392,25 +2501,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
}
case ISD::Constant: {
unsigned Val = cast<ConstantSDNode>(N)->getZExtValue();
bool UseCP = true;
if (Subtarget->useMovt(*MF))
// Thumb2-aware targets have the MOVT instruction, so all immediates can
// be done with MOV + MOVT, at worst.
UseCP = false;
else {
if (Subtarget->isThumb()) {
UseCP = (Val > 255 && // MOV
~Val > 255 && // MOV + MVN
!ARM_AM::isThumbImmShiftedVal(Val) && // MOV + LSL
!(Subtarget->hasV6T2Ops() && Val <= 0xffff)); // MOVW
} else
UseCP = (ARM_AM::getSOImmVal(Val) == -1 && // MOV
ARM_AM::getSOImmVal(~Val) == -1 && // MVN
!ARM_AM::isSOImmTwoPartVal(Val) && // two instrs.
!(Subtarget->hasV6T2Ops() && Val <= 0xffff)); // MOVW
}
if (UseCP) {
// If we can't materialize the constant we need to use a literal pool
if (ConstantMaterializationCost(Val) > 2) {
SDValue CPIdx = CurDAG->getTargetConstantPool(
ConstantInt::get(Type::getInt32Ty(*CurDAG->getContext()), Val),
TLI->getPointerTy(CurDAG->getDataLayout()));

View File

@ -1,14 +1,14 @@
; RUN: llc < %s -mtriple=armv7-apple-darwin -mcpu=cortex-a8 | FileCheck %s -check-prefix=A8
; RUN: llc < %s -mtriple=armv7-apple-darwin -mcpu=cortex-a9 | FileCheck %s -check-prefix=A9
; RUN: llc < %s -mtriple=armv7-apple-darwin -mcpu=cortex-a8 | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-ARM
; RUN: llc < %s -mtriple=armv7-apple-darwin -mcpu=cortex-a9 | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-ARM
; RUN: llc < %s -mtriple=thumbv7m-none-eabi | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-THUMB
; rdar://8576755
define i32 @test1(i32 %X, i32 %Y, i8 %sh) {
; A8-LABEL: test1:
; A8: add r0, r0, r1, lsl r2
; A9-LABEL: test1:
; A9: add r0, r0, r1, lsl r2
; CHECK-LABEL: test1:
; CHECK-ARM: add r0, r0, r1, lsl r2
; CHECK-THUMB: lsls r1, r2
; CHECK-THUMB: add r0, r1
%shift.upgrd.1 = zext i8 %sh to i32
%A = shl i32 %Y, %shift.upgrd.1
%B = add i32 %X, %A
@ -16,11 +16,10 @@ define i32 @test1(i32 %X, i32 %Y, i8 %sh) {
}
define i32 @test2(i32 %X, i32 %Y, i8 %sh) {
; A8-LABEL: test2:
; A8: bic r0, r0, r1, asr r2
; A9-LABEL: test2:
; A9: bic r0, r0, r1, asr r2
; CHECK-LABEL: test2:
; CHECK-ARM: bic r0, r0, r1, asr r2
; CHECK-THUMB: asrs r1, r2
; CHECK-THUMB: bics r0, r1
%shift.upgrd.2 = zext i8 %sh to i32
%A = ashr i32 %Y, %shift.upgrd.2
%B = xor i32 %A, -1
@ -30,14 +29,9 @@ define i32 @test2(i32 %X, i32 %Y, i8 %sh) {
define i32 @test3(i32 %base, i32 %base2, i32 %offset) {
entry:
; A8-LABEL: test3:
; A8: ldr r0, [r0, r2, lsl #2]
; A8: ldr r1, [r1, r2, lsl #2]
; lsl #2 is free
; A9-LABEL: test3:
; A9: ldr r0, [r0, r2, lsl #2]
; A9: ldr r1, [r1, r2, lsl #2]
; CHECK-LABEL: test3:
; CHECK: ldr{{(.w)?}} r0, [r0, r2, lsl #2]
; CHECK: ldr{{(.w)?}} r1, [r1, r2, lsl #2]
%tmp1 = shl i32 %offset, 2
%tmp2 = add i32 %base, %tmp1
%tmp3 = inttoptr i32 %tmp2 to i32*
@ -53,17 +47,11 @@ declare i8* @malloc(...)
define fastcc void @test4(i16 %addr) nounwind {
entry:
; A8-LABEL: test4:
; A8: ldr [[REG:r[0-9]+]], [r0, r1, lsl #2]
; A8-NOT: ldr [[REG:r[0-9]+]], [r0, r1, lsl #2]!
; A8: str [[REG]], [r0, r1, lsl #2]
; A8-NOT: str [[REG]], [r0]
; A9-LABEL: test4:
; A9: ldr [[REG:r[0-9]+]], [r0, r1, lsl #2]
; A9-NOT: ldr [[REG:r[0-9]+]], [r0, r1, lsl #2]!
; A9: str [[REG]], [r0, r1, lsl #2]
; A9-NOT: str [[REG]], [r0]
; CHECK-LABEL: test4:
; CHECK: ldr{{(.w)?}} [[REG:r[0-9]+]], [r0, r1, lsl #2]
; CHECK-NOT: ldr{{(.w)?}} [[REG:r[0-9]+]], [r0, r1, lsl #2]!
; CHECK: str{{(.w)?}} [[REG]], [r0, r1, lsl #2]
; CHECK-NOT: str{{(.w)?}} [[REG]], [r0]
%0 = tail call i8* (...) @malloc(i32 undef) nounwind
%1 = bitcast i8* %0 to i32*
%2 = sext i16 %addr to i32
@ -73,3 +61,166 @@ entry:
store i32 %5, i32* %3, align 4
ret void
}
define i32 @test_orr_extract_from_mul_1(i32 %x, i32 %y) {
entry:
; CHECK-LABEL: test_orr_extract_from_mul_1
; CHECK: movw r2, #63767
; CHECK-ARM: mul r1, r1, r2
; CHECK-ARM: orr r0, r1, r0
; CHECK-THUMB: muls r1, r2, r1
; CHECk-THUMB: orrs r0, r1
%mul = mul i32 %y, 63767
%or = or i32 %mul, %x
ret i32 %or
}
define i32 @test_orr_extract_from_mul_2(i32 %x, i32 %y) {
; CHECK-LABEL: test_orr_extract_from_mul_2
; CHECK: movw r2, #63767
; CHECK-ARM: mul r1, r1, r2
; CHECK-THUMB: muls r1, r2, r1
; CHECK: orr{{(.w)?}} r0, r0, r1, lsl #1
entry:
%mul1 = mul i32 %y, 127534
%or = or i32 %mul1, %x
ret i32 %or
}
define i32 @test_orr_extract_from_mul_3(i32 %x, i32 %y) {
; CHECK-LABEL: test_orr_extract_from_mul_3
; CHECK: movw r2, #63767
; CHECK-ARM: mul r1, r1, r2
; CHECK-THUMB: muls r1, r2, r1
; CHECK: orr{{(.w)?}} r0, r0, r1, lsl #2
entry:
%mul1 = mul i32 %y, 255068
%or = or i32 %mul1, %x
ret i32 %or
}
define i32 @test_orr_extract_from_mul_4(i32 %x, i32 %y) {
; CHECK-LABEL: test_orr_extract_from_mul_4
; CHECK: movw r2, #63767
; CHECK-ARM: mul r1, r1, r2
; CHECK-THUMB: muls r1, r2, r1
; CHECK: orr{{(.w)?}} r0, r0, r1, lsl #3
entry:
%mul1 = mul i32 %y, 510136
%or = or i32 %mul1, %x
ret i32 %or
}
define i32 @test_orr_extract_from_mul_5(i32 %x, i32 %y) {
; CHECK-LABEL: test_orr_extract_from_mul_5
; CHECK: movw r2, #63767
; CHECK-ARM: mul r1, r1, r2
; CHECK-THUMB: muls r1, r2, r1
; CHECK: orr{{(.w)?}} r0, r0, r1, lsl #4
entry:
%mul1 = mul i32 %y, 1020272
%or = or i32 %mul1, %x
ret i32 %or
}
define i32 @test_orr_extract_from_mul_6(i32 %x, i32 %y) {
; CHECK-LABEL: test_orr_extract_from_mul_6
; CHECK: movw r2, #63767
; CHECK-ARM: mul r1, r1, r2
; CHECK-THUMB: muls r1, r2, r1
; CHECK: orr{{(.w)?}} r0, r0, r1, lsl #16
entry:
%mul = mul i32 %y, -115933184
%or = or i32 %mul, %x
ret i32 %or
}
define i32 @test_load_extract_from_mul_1(i8* %x, i32 %y) {
; CHECK-LABEL: test_load_extract_from_mul_1
; CHECK: movw r2, #63767
; CHECK-ARM: mul r1, r1, r2
; CHECK-THUMB: muls r1, r2, r1
; CHECK: ldrb r0, [r0, r1]
entry:
%mul = mul i32 %y, 63767
%arrayidx = getelementptr inbounds i8, i8* %x, i32 %mul
%0 = load i8, i8* %arrayidx, align 1
%conv = zext i8 %0 to i32
ret i32 %conv
}
define i32 @test_load_extract_from_mul_2(i8* %x, i32 %y) {
; CHECK-LABEL: test_load_extract_from_mul_2
; CHECK: movw r2, #63767
; CHECK-ARM: mul r1, r1, r2
; CHECK-THUMB: muls r1, r2, r1
; CHECK: ldrb{{(.w)?}} r0, [r0, r1, lsl #1]
entry:
%mul1 = mul i32 %y, 127534
%arrayidx = getelementptr inbounds i8, i8* %x, i32 %mul1
%0 = load i8, i8* %arrayidx, align 1
%conv = zext i8 %0 to i32
ret i32 %conv
}
define i32 @test_load_extract_from_mul_3(i8* %x, i32 %y) {
; CHECK-LABEL: test_load_extract_from_mul_3
; CHECK: movw r2, #63767
; CHECK-ARM: mul r1, r1, r2
; CHECK-THUMB: muls r1, r2, r1
; CHECK: ldrb{{(.w)?}} r0, [r0, r1, lsl #2]
entry:
%mul1 = mul i32 %y, 255068
%arrayidx = getelementptr inbounds i8, i8* %x, i32 %mul1
%0 = load i8, i8* %arrayidx, align 1
%conv = zext i8 %0 to i32
ret i32 %conv
}
define i32 @test_load_extract_from_mul_4(i8* %x, i32 %y) {
; CHECK-LABEL: test_load_extract_from_mul_4
; CHECK: movw r2, #63767
; CHECK-ARM: mul r1, r1, r2
; CHECK-THUMB: muls r1, r2, r1
; CHECK: ldrb{{(.w)?}} r0, [r0, r1, lsl #3]
entry:
%mul1 = mul i32 %y, 510136
%arrayidx = getelementptr inbounds i8, i8* %x, i32 %mul1
%0 = load i8, i8* %arrayidx, align 1
%conv = zext i8 %0 to i32
ret i32 %conv
}
define i32 @test_load_extract_from_mul_5(i8* %x, i32 %y) {
; CHECK-LABEL: test_load_extract_from_mul_5
; CHECK-ARM: movw r2, #63767
; CHECK-ARM: mul r1, r1, r2
; CHECK-ARM: ldrb r0, [r0, r1, lsl #4]
; CHECK-THUMB: movw r2, #37232
; CHECK-THUMB: movt r2, #15
; CHECK-THUMB: muls r1, r2, r1
; CHECK-THUMB: ldrb r0, [r0, r1]
entry:
%mul1 = mul i32 %y, 1020272
%arrayidx = getelementptr inbounds i8, i8* %x, i32 %mul1
%0 = load i8, i8* %arrayidx, align 1
%conv = zext i8 %0 to i32
ret i32 %conv
}
define i32 @test_load_extract_from_mul_6(i8* %x, i32 %y) {
; CHECK-LABEL: test_load_extract_from_mul_6
; CHECK-ARM: movw r2, #63767
; CHECK-ARM: mul r1, r1, r2
; CHECK-ARM: ldrb r0, [r0, r1, lsl #16]
; CHECK-THUMB: movs r2, #0
; CHECK-THUMB: movt r2, #63767
; CHECK-THUMB: muls r1, r2, r1
; CHECK-THUMB: ldrb r0, [r0, r1]
entry:
%mul = mul i32 %y, -115933184
%arrayidx = getelementptr inbounds i8, i8* %x, i32 %mul
%0 = load i8, i8* %arrayidx, align 1
%conv = zext i8 %0 to i32
ret i32 %conv
}