Add combiner patterns to more effectively utilize the BFI (bitfield insert)
instruction for non-constant operands. This includes the case referenced
in the README.txt regarding a bitfield copy.

llvm-svn: 108608
This commit is contained in:
Jim Grosbach 2010-07-17 03:30:54 +00:00
parent e31ee27cbe
commit 270540da7b
4 changed files with 114 additions and 37 deletions

View File

@ -4240,21 +4240,33 @@ static SDValue PerformMULCombine(SDNode *N,
static SDValue PerformORCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
// Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
// reasonable.
// BFI is only available on V6T2+
if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
return SDValue();
SelectionDAG &DAG = DCI.DAG;
SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
// or (and A, mask), val => ARMbfi A, val, mask
// iff (val & mask) == val
if (N0->getOpcode() != ISD::AND)
DebugLoc DL = N->getDebugLoc();
// 1) or (and A, mask), val => ARMbfi A, val, mask
// iff (val & mask) == val
//
// 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
// 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
// && CountPopulation_32(mask) == CountPopulation_32(~mask2)
// 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
// && CountPopulation_32(~mask) == CountPopulation_32(mask2)
// (in this case the roles swap: ARMbfi B, (lsr A, amt), mask2)
// (i.e., copy a bitfield value into another bitfield of the same width)
if (N0.getOpcode() != ISD::AND)
return SDValue();
EVT VT = N->getValueType(0);
if (VT != MVT::i32)
return SDValue();
// The value and the mask need to be constants so we can verify this is
// actually a bitfield set. If the mask is 0xffff, we can do better
// via a movt instruction, so don't use BFI in that case.
@ -4264,21 +4276,61 @@ static SDValue PerformORCombine(SDNode *N,
unsigned Mask = C->getZExtValue();
if (Mask == 0xffff)
return SDValue();
C = dyn_cast<ConstantSDNode>(N1);
if (!C)
return SDValue();
unsigned Val = C->getZExtValue();
if (!ARM::isBitFieldInvertedMask(Mask) || (Val & ~Mask) != Val)
return SDValue();
Val >>= CountTrailingZeros_32(~Mask);
SDValue Res;
// Case (1): or (and A, mask), val => ARMbfi A, val, mask
if ((C = dyn_cast<ConstantSDNode>(N1))) {
unsigned Val = C->getZExtValue();
if (!ARM::isBitFieldInvertedMask(Mask) || (Val & ~Mask) != Val)
return SDValue();
Val >>= CountTrailingZeros_32(~Mask);
DebugLoc DL = N->getDebugLoc();
SDValue Res = DAG.getNode(ARMISD::BFI, DL, VT, N0.getOperand(0),
DAG.getConstant(Val, MVT::i32),
DAG.getConstant(Mask, MVT::i32));
Res = DAG.getNode(ARMISD::BFI, DL, VT, N0.getOperand(0),
DAG.getConstant(Val, MVT::i32),
DAG.getConstant(Mask, MVT::i32));
// Do not add new nodes to DAG combiner worklist.
DCI.CombineTo(N, Res, false);
// Do not add new nodes to DAG combiner worklist.
DCI.CombineTo(N, Res, false);
} else if (N1.getOpcode() == ISD::AND) {
// case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
if (!C)
return SDValue();
unsigned Mask2 = C->getZExtValue();
if (ARM::isBitFieldInvertedMask(Mask) &&
ARM::isBitFieldInvertedMask(~Mask2) &&
(CountPopulation_32(Mask) == CountPopulation_32(~Mask2))) {
// The pack halfword instruction works better for masks that fit it,
// so use that when it's available.
if (Subtarget->hasT2ExtractPack() &&
(Mask == 0xffff || Mask == 0xffff0000))
return SDValue();
// 2a
unsigned lsb = CountTrailingZeros_32(Mask2);
Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
DAG.getConstant(lsb, MVT::i32));
Res = DAG.getNode(ARMISD::BFI, DL, VT, N0.getOperand(0), Res,
DAG.getConstant(Mask, MVT::i32));
// Do not add new nodes to DAG combiner worklist.
DCI.CombineTo(N, Res, false);
} else if (ARM::isBitFieldInvertedMask(~Mask) &&
ARM::isBitFieldInvertedMask(Mask2) &&
(CountPopulation_32(~Mask) == CountPopulation_32(Mask2))) {
// The pack halfword instruction works better for masks that fit it,
// so use that when it's available.
if (Subtarget->hasT2ExtractPack() &&
(Mask2 == 0xffff || Mask2 == 0xffff0000))
return SDValue();
// 2b
unsigned lsb = CountTrailingZeros_32(Mask);
Res = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0),
DAG.getConstant(lsb, MVT::i32));
Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
DAG.getConstant(Mask2, MVT::i32));
// Do not add new nodes to DAG combiner worklist.
DCI.CombineTo(N, Res, false);
}
}
return SDValue();
}

View File

@ -609,27 +609,6 @@ We currently generate:
We should be able to replace the second ldr+and with a bic (i.e. reuse the
constant which was already loaded). Not sure what's necessary to do that.
//===---------------------------------------------------------------------===//
Given the following on ARMv7:
int test1(int A, int B) {
return (A&-8388481)|(B&8388480);
}
We currently generate:
bfc r0, #7, #16
movw r2, #:lower16:8388480
movt r2, #:upper16:8388480
and r1, r1, r2
orr r0, r1, r0
bx lr
The following is much shorter:
lsr r1, r1, #7
bfi r0, r1, #7, #16
bx lr
//===---------------------------------------------------------------------===//
The code generated for bswap on armv4/5 (CPUs without rev) is less than ideal:

View File

@ -15,3 +15,26 @@ entry:
store i32 %2, i32* bitcast (%struct.F* @X to i32*), align 4
ret void
}
; f2: bitfield copy between two non-constant values.
; %A keeps every bit outside bits [22:7] (mask -8388481 = 0xFF80007F) and %B
; contributes only bits [22:7] (mask 8388480 = 0x007FFF80); the two masks are
; bitwise complements. The masked-%B value is the first operand of the 'or'.
; The directives below expect r1 to be shifted right by 7 and the result to be
; inserted into r0 with a single bfi at lsb 7, width 16.
; NOTE(review): presumably r0 holds %A and r1 holds %B per the calling
; convention -- confirm against the RUN line of this test file.
define i32 @f2(i32 %A, i32 %B) nounwind readnone optsize {
entry:
; CHECK: f2
; CHECK: mov r1, r1, lsr #7
; CHECK: bfi r0, r1, #7, #16
%and = and i32 %A, -8388481 ; <i32> [#uses=1]
%and2 = and i32 %B, 8388480 ; <i32> [#uses=1]
%or = or i32 %and2, %and ; <i32> [#uses=1]
ret i32 %or
}
; f3: same bitfield copy as f2 but with the argument roles reversed.
; Here %A contributes only bits [22:7] (mask 8388480 = 0x007FFF80) and %B keeps
; every bit outside that field (mask -8388481 = 0xFF80007F), so the field is
; taken from %A and placed into %B.
; The directives below expect r0 shifted right by 7 into r2, r1 copied into r0
; (the return register), and then a bfi of r2 into r0 at lsb 7, width 16.
; NOTE(review): presumably r0 holds %A and r1 holds %B on entry -- confirm
; against the RUN line of this test file.
define i32 @f3(i32 %A, i32 %B) nounwind readnone optsize {
entry:
; CHECK: f3
; CHECK: mov r2, r0, lsr #7
; CHECK: mov r0, r1
; CHECK: bfi r0, r2, #7, #16
%and = and i32 %A, 8388480 ; <i32> [#uses=1]
%and2 = and i32 %B, -8388481 ; <i32> [#uses=1]
%or = or i32 %and2, %and ; <i32> [#uses=1]
ret i32 %or
}

View File

@ -15,3 +15,26 @@ entry:
store i32 %2, i32* bitcast (%struct.F* @X to i32*), align 4
ret void
}
; f2: bitfield copy between two non-constant values.
; %A keeps every bit outside bits [22:7] (mask -8388481 = 0xFF80007F) and %B
; contributes only bits [22:7] (mask 8388480 = 0x007FFF80); the two masks are
; bitwise complements. The masked-%B value is the first operand of the 'or'.
; The directives below expect an 'lsrs' of r1 by 7 followed by a single bfi
; of r1 into r0 at lsb 7, width 16.
; NOTE(review): presumably r0 holds %A and r1 holds %B per the calling
; convention -- confirm against the RUN line of this test file.
define i32 @f2(i32 %A, i32 %B) nounwind readnone optsize {
entry:
; CHECK: f2
; CHECK: lsrs r1, r1, #7
; CHECK: bfi r0, r1, #7, #16
%and = and i32 %A, -8388481 ; <i32> [#uses=1]
%and2 = and i32 %B, 8388480 ; <i32> [#uses=1]
%or = or i32 %and2, %and ; <i32> [#uses=1]
ret i32 %or
}
; f3: same bitfield copy as f2 but with the argument roles reversed.
; Here %A contributes only bits [22:7] (mask 8388480 = 0x007FFF80) and %B keeps
; every bit outside that field (mask -8388481 = 0xFF80007F), so the field is
; taken from %A and placed into %B.
; The directives below expect an 'lsrs' of r0 by 7 into r2, r1 copied into r0
; (the return register), and then a bfi of r2 into r0 at lsb 7, width 16.
; NOTE(review): presumably r0 holds %A and r1 holds %B on entry -- confirm
; against the RUN line of this test file.
define i32 @f3(i32 %A, i32 %B) nounwind readnone optsize {
entry:
; CHECK: f3
; CHECK: lsrs r2, r0, #7
; CHECK: mov r0, r1
; CHECK: bfi r0, r2, #7, #16
%and = and i32 %A, 8388480 ; <i32> [#uses=1]
%and2 = and i32 %B, -8388481 ; <i32> [#uses=1]
%or = or i32 %and2, %and ; <i32> [#uses=1]
ret i32 %or
}