mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-26 21:00:29 +00:00
AArch64/ARM64: implement BFI optimisation
ARM64 was not producing pure BFI instructions for bitfield insertion operations, unlike AArch64. The approach had to be a little different (in ISelDAGToDAG rather than ISelLowering), and the outcomes aren't identical but hopefully this gives it similar power. This should address PR19424. llvm-svn: 207102
This commit is contained in:
parent
5b13e67220
commit
586e442827
@ -1394,28 +1394,23 @@ SDNode *ARM64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) {
|
||||
return CurDAG->SelectNodeTo(N, Opc, VT, Ops, 3);
|
||||
}
|
||||
|
||||
// Is mask a i32 or i64 binary sequence 1..10..0 and
|
||||
// CountTrailingZeros(mask) == ExpectedTrailingZeros
|
||||
static bool isHighMask(uint64_t Mask, unsigned ExpectedTrailingZeros,
|
||||
unsigned NumberOfIgnoredHighBits, EVT VT) {
|
||||
/// Does DstMask form a complementary pair with the mask provided by
|
||||
/// BitsToBeInserted, suitable for use in a BFI instruction. Roughly speaking,
|
||||
/// this asks whether DstMask zeroes precisely those bits that will be set by
|
||||
/// the other half.
|
||||
static bool isBitfieldDstMask(uint64_t DstMask, APInt BitsToBeInserted,
|
||||
unsigned NumberOfIgnoredHighBits, EVT VT) {
|
||||
assert((VT == MVT::i32 || VT == MVT::i64) &&
|
||||
"i32 or i64 mask type expected!");
|
||||
unsigned BitWidth = VT.getSizeInBits() - NumberOfIgnoredHighBits;
|
||||
APInt SignificantBits =
|
||||
~APInt::getHighBitsSet(BitWidth, NumberOfIgnoredHighBits);
|
||||
|
||||
uint64_t ExpectedMask;
|
||||
if (VT == MVT::i32) {
|
||||
uint32_t ExpectedMaski32 = ~0 << ExpectedTrailingZeros;
|
||||
ExpectedMask = ExpectedMaski32;
|
||||
if (NumberOfIgnoredHighBits) {
|
||||
uint32_t highMask = ~0 << (32 - NumberOfIgnoredHighBits);
|
||||
Mask |= highMask;
|
||||
}
|
||||
} else {
|
||||
ExpectedMask = ((uint64_t) ~0) << ExpectedTrailingZeros;
|
||||
if (NumberOfIgnoredHighBits)
|
||||
Mask |= ((uint64_t) ~0) << (64 - NumberOfIgnoredHighBits);
|
||||
}
|
||||
APInt SignificantDstMask = APInt(BitWidth, DstMask);
|
||||
APInt SignificantBitsToBeInserted = BitsToBeInserted.zextOrTrunc(BitWidth);
|
||||
|
||||
return Mask == ExpectedMask;
|
||||
return (SignificantDstMask & SignificantBitsToBeInserted) == 0 &&
|
||||
(SignificantDstMask | SignificantBitsToBeInserted).isAllOnesValue();
|
||||
}
|
||||
|
||||
// Look for bits that will be useful for later uses.
|
||||
@ -1593,6 +1588,79 @@ static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth) {
|
||||
UsefulBits &= UsersUsefulBits;
|
||||
}
|
||||
|
||||
/// Create a machine node performing a notional SHL of Op by ShlAmount. If
|
||||
/// ShlAmount is negative, do a (logical) right-shift instead. If ShlAmount is
|
||||
/// 0, return Op unchanged.
|
||||
static SDValue getLeftShift(SelectionDAG *CurDAG, SDValue Op, int ShlAmount) {
|
||||
if (ShlAmount == 0)
|
||||
return Op;
|
||||
|
||||
EVT VT = Op.getValueType();
|
||||
unsigned BitWidth = VT.getSizeInBits();
|
||||
unsigned UBFMOpc = BitWidth == 32 ? ARM64::UBFMWri : ARM64::UBFMXri;
|
||||
|
||||
SDNode *ShiftNode;
|
||||
if (ShlAmount > 0) {
|
||||
// LSL wD, wN, #Amt == UBFM wD, wN, #32-Amt, #31-Amt
|
||||
ShiftNode = CurDAG->getMachineNode(
|
||||
UBFMOpc, SDLoc(Op), VT, Op,
|
||||
CurDAG->getTargetConstant(BitWidth - ShlAmount, VT),
|
||||
CurDAG->getTargetConstant(BitWidth - 1 - ShlAmount, VT));
|
||||
} else {
|
||||
// LSR wD, wN, #Amt == UBFM wD, wN, #Amt, #32-1
|
||||
assert(ShlAmount < 0 && "expected right shift");
|
||||
int ShrAmount = -ShlAmount;
|
||||
ShiftNode = CurDAG->getMachineNode(
|
||||
UBFMOpc, SDLoc(Op), VT, Op, CurDAG->getTargetConstant(ShrAmount, VT),
|
||||
CurDAG->getTargetConstant(BitWidth - 1, VT));
|
||||
}
|
||||
|
||||
return SDValue(ShiftNode, 0);
|
||||
}
|
||||
|
||||
/// Does this tree qualify as an attempt to move a bitfield into position,
|
||||
/// essentially "(and (shl VAL, N), Mask)".
|
||||
static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op,
|
||||
SDValue &Src, int &ShiftAmount,
|
||||
int &MaskWidth) {
|
||||
EVT VT = Op.getValueType();
|
||||
unsigned BitWidth = VT.getSizeInBits();
|
||||
assert(BitWidth == 32 || BitWidth == 64);
|
||||
|
||||
APInt KnownZero, KnownOne;
|
||||
CurDAG->ComputeMaskedBits(Op, KnownZero, KnownOne);
|
||||
|
||||
// Non-zero in the sense that they're not provably zero, which is the key
|
||||
// point if we want to use this value
|
||||
uint64_t NonZeroBits = (~KnownZero).getZExtValue();
|
||||
|
||||
// Discard a constant AND mask if present. It's safe because the node will
|
||||
// already have been factored into the ComputeMaskedBits calculation above.
|
||||
uint64_t AndImm;
|
||||
if (isOpcWithIntImmediate(Op.getNode(), ISD::AND, AndImm)) {
|
||||
assert((~APInt(BitWidth, AndImm) & ~KnownZero) == 0);
|
||||
Op = Op.getOperand(0);
|
||||
}
|
||||
|
||||
uint64_t ShlImm;
|
||||
if (!isOpcWithIntImmediate(Op.getNode(), ISD::SHL, ShlImm))
|
||||
return false;
|
||||
Op = Op.getOperand(0);
|
||||
|
||||
if (!isShiftedMask_64(NonZeroBits))
|
||||
return false;
|
||||
|
||||
ShiftAmount = countTrailingZeros(NonZeroBits);
|
||||
MaskWidth = CountTrailingOnes_64(NonZeroBits >> ShiftAmount);
|
||||
|
||||
// BFI encompasses sufficiently many nodes that it's worth inserting an extra
|
||||
// LSL/LSR if the mask in NonZeroBits doesn't quite match up with the ISD::SHL
|
||||
// amount.
|
||||
Src = getLeftShift(CurDAG, Op, ShlImm - ShiftAmount);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Given a OR operation, check if we have the following pattern
|
||||
// ubfm c, b, imm, imm2 (or something that does the same jobs, see
|
||||
// isBitfieldExtractOp)
|
||||
@ -1602,9 +1670,9 @@ static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth) {
|
||||
// if yes, given reference arguments will be update so that one can replace
|
||||
// the OR instruction with:
|
||||
// f = Opc Opd0, Opd1, LSB, MSB ; where Opc is a BFM, LSB = imm, and MSB = imm2
|
||||
static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Opd0,
|
||||
SDValue &Opd1, unsigned &LSB,
|
||||
unsigned &MSB, SelectionDAG *CurDAG) {
|
||||
static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst,
|
||||
SDValue &Src, unsigned &ImmR,
|
||||
unsigned &ImmS, SelectionDAG *CurDAG) {
|
||||
assert(N->getOpcode() == ISD::OR && "Expect a OR operation");
|
||||
|
||||
// Set Opc
|
||||
@ -1632,30 +1700,36 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Opd0,
|
||||
for (int i = 0; i < 2;
|
||||
++i, std::swap(OrOpd0, OrOpd1), OrOpd1Val = N->getOperand(0)) {
|
||||
unsigned BFXOpc;
|
||||
// Set Opd1, LSB and MSB arguments by looking for
|
||||
// c = ubfm b, imm, imm2
|
||||
if (!isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Opd1, LSB, MSB,
|
||||
NumberOfIgnoredLowBits, true))
|
||||
continue;
|
||||
int DstLSB, Width;
|
||||
if (isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Src, ImmR, ImmS,
|
||||
NumberOfIgnoredLowBits, true)) {
|
||||
// Check that the returned opcode is compatible with the pattern,
|
||||
// i.e., same type and zero extended (U and not S)
|
||||
if ((BFXOpc != ARM64::UBFMXri && VT == MVT::i64) ||
|
||||
(BFXOpc != ARM64::UBFMWri && VT == MVT::i32))
|
||||
continue;
|
||||
|
||||
// Check that the returned opcode is compatible with the pattern,
|
||||
// i.e., same type and zero extended (U and not S)
|
||||
if ((BFXOpc != ARM64::UBFMXri && VT == MVT::i64) ||
|
||||
(BFXOpc != ARM64::UBFMWri && VT == MVT::i32))
|
||||
continue;
|
||||
// Compute the width of the bitfield insertion
|
||||
DstLSB = 0;
|
||||
Width = ImmS - ImmR + 1;
|
||||
// FIXME: This constraint is to catch bitfield insertion we may
|
||||
// want to widen the pattern if we want to grab general bitfied
|
||||
// move case
|
||||
if (Width <= 0)
|
||||
continue;
|
||||
|
||||
// Compute the width of the bitfield insertion
|
||||
int sMSB = MSB - LSB + 1;
|
||||
// FIXME: This constraints is to catch bitfield insertion we may
|
||||
// want to widen the pattern if we want to grab general bitfied
|
||||
// move case
|
||||
if (sMSB <= 0)
|
||||
// If the mask on the insertee is correct, we have a BFXIL operation. We
|
||||
// can share the ImmR and ImmS values from the already-computed UBFM.
|
||||
} else if (isBitfieldPositioningOp(CurDAG, SDValue(OrOpd0, 0), Src,
|
||||
DstLSB, Width)) {
|
||||
ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits();
|
||||
ImmS = Width - 1;
|
||||
} else
|
||||
continue;
|
||||
|
||||
// Check the second part of the pattern
|
||||
EVT VT = OrOpd1->getValueType(0);
|
||||
if (VT != MVT::i32 && VT != MVT::i64)
|
||||
continue;
|
||||
assert((VT == MVT::i32 || VT == MVT::i64) && "unexpected OR operand");
|
||||
|
||||
// Compute the Known Zero for the candidate of the first operand.
|
||||
// This allows to catch more general case than just looking for
|
||||
@ -1666,19 +1740,22 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Opd0,
|
||||
|
||||
// Check if there is enough room for the second operand to appear
|
||||
// in the first one
|
||||
if (KnownZero.countTrailingOnes() < (unsigned)sMSB)
|
||||
APInt BitsToBeInserted =
|
||||
APInt::getBitsSet(KnownZero.getBitWidth(), DstLSB, DstLSB + Width);
|
||||
|
||||
if ((BitsToBeInserted & ~KnownZero) != 0)
|
||||
continue;
|
||||
|
||||
// Set the first operand
|
||||
uint64_t Imm;
|
||||
if (isOpcWithIntImmediate(OrOpd1, ISD::AND, Imm) &&
|
||||
isHighMask(Imm, sMSB, NumberOfIgnoredHighBits, VT))
|
||||
isBitfieldDstMask(Imm, BitsToBeInserted, NumberOfIgnoredHighBits, VT))
|
||||
// In that case, we can eliminate the AND
|
||||
Opd0 = OrOpd1->getOperand(0);
|
||||
Dst = OrOpd1->getOperand(0);
|
||||
else
|
||||
// Maybe the AND has been removed by simplify-demanded-bits
|
||||
// or is useful because it discards more bits
|
||||
Opd0 = OrOpd1Val;
|
||||
Dst = OrOpd1Val;
|
||||
|
||||
// both parts match
|
||||
return true;
|
||||
|
@ -1,4 +1,5 @@
|
||||
; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
|
||||
; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AARCH64
|
||||
; RUN: llc -mtriple=arm64-none-linux-gnu < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64
|
||||
|
||||
; First, a simple example from Clang. The registers could plausibly be
|
||||
; different, but probably won't be.
|
||||
@ -7,8 +8,10 @@
|
||||
|
||||
define [1 x i64] @from_clang([1 x i64] %f.coerce, i32 %n) nounwind readnone {
|
||||
; CHECK-LABEL: from_clang:
|
||||
; CHECK: bfi w0, w1, #3, #4
|
||||
; CHECK-NEXT: ret
|
||||
; CHECK-AARCH64: bfi w0, w1, #3, #4
|
||||
; CHECK-ARCH64-NEXT: ret
|
||||
|
||||
; CHECK-ARM64: bfm {{w[0-9]+}}, {{w[0-9]+}}, #29, #3
|
||||
|
||||
entry:
|
||||
%f.coerce.fca.0.extract = extractvalue [1 x i64] %f.coerce, 0
|
||||
@ -26,7 +29,9 @@ entry:
|
||||
|
||||
define void @test_whole32(i32* %existing, i32* %new) {
|
||||
; CHECK-LABEL: test_whole32:
|
||||
; CHECK: bfi {{w[0-9]+}}, {{w[0-9]+}}, #26, #5
|
||||
|
||||
; CHECK-AARCH64: bfi {{w[0-9]+}}, {{w[0-9]+}}, #26, #5
|
||||
; CHECK-ARM64: bfm {{w[0-9]+}}, {{w[0-9]+}}, #6, #4
|
||||
|
||||
%oldval = load volatile i32* %existing
|
||||
%oldval_keep = and i32 %oldval, 2214592511 ; =0x83ffffff
|
||||
@ -43,7 +48,8 @@ define void @test_whole32(i32* %existing, i32* %new) {
|
||||
|
||||
define void @test_whole64(i64* %existing, i64* %new) {
|
||||
; CHECK-LABEL: test_whole64:
|
||||
; CHECK: bfi {{x[0-9]+}}, {{x[0-9]+}}, #26, #14
|
||||
; CHECK-AARCH64: bfi {{x[0-9]+}}, {{x[0-9]+}}, #26, #14
|
||||
; CHECK-ARM64: bfm {{x[0-9]+}}, {{x[0-9]+}}, #38, #13
|
||||
; CHECK-NOT: and
|
||||
; CHECK: ret
|
||||
|
||||
@ -62,8 +68,11 @@ define void @test_whole64(i64* %existing, i64* %new) {
|
||||
|
||||
define void @test_whole32_from64(i64* %existing, i64* %new) {
|
||||
; CHECK-LABEL: test_whole32_from64:
|
||||
; CHECK: bfi {{w[0-9]+}}, {{w[0-9]+}}, #{{0|16}}, #16
|
||||
; CHECK-NOT: and
|
||||
|
||||
; CHECK-AARCH64: bfi {{w[0-9]+}}, {{w[0-9]+}}, #{{0|16}}, #16
|
||||
; CHECK-AARCH64-NOT: and
|
||||
; CHECK-ARM64: bfm {{x[0-9]+}}, {{x[0-9]+}}, #0, #15
|
||||
|
||||
; CHECK: ret
|
||||
|
||||
%oldval = load volatile i64* %existing
|
||||
@ -80,8 +89,12 @@ define void @test_whole32_from64(i64* %existing, i64* %new) {
|
||||
|
||||
define void @test_32bit_masked(i32 *%existing, i32 *%new) {
|
||||
; CHECK-LABEL: test_32bit_masked:
|
||||
; CHECK: bfi [[INSERT:w[0-9]+]], {{w[0-9]+}}, #3, #4
|
||||
; CHECK: and {{w[0-9]+}}, [[INSERT]], #0xff
|
||||
|
||||
; CHECK-AARCH64: bfi [[INSERT:w[0-9]+]], {{w[0-9]+}}, #3, #4
|
||||
; CHECK-AARCH64: and {{w[0-9]+}}, [[INSERT]], #0xff
|
||||
|
||||
; CHECK-ARM64: and
|
||||
; CHECK-ARM64: bfm {{w[0-9]+}}, {{w[0-9]+}}, #29, #3
|
||||
|
||||
%oldval = load volatile i32* %existing
|
||||
%oldval_keep = and i32 %oldval, 135 ; = 0x87
|
||||
@ -98,8 +111,11 @@ define void @test_32bit_masked(i32 *%existing, i32 *%new) {
|
||||
|
||||
define void @test_64bit_masked(i64 *%existing, i64 *%new) {
|
||||
; CHECK-LABEL: test_64bit_masked:
|
||||
; CHECK: bfi [[INSERT:x[0-9]+]], {{x[0-9]+}}, #40, #8
|
||||
; CHECK: and {{x[0-9]+}}, [[INSERT]], #0xffff00000000
|
||||
; CHECK-AARCH64: bfi [[INSERT:x[0-9]+]], {{x[0-9]+}}, #40, #8
|
||||
; CHECK-AARCH64: and {{x[0-9]+}}, [[INSERT]], #0xffff00000000
|
||||
|
||||
; CHECK-ARM64: and
|
||||
; CHECK-ARM64: bfm {{x[0-9]+}}, {{x[0-9]+}}, #24, #7
|
||||
|
||||
%oldval = load volatile i64* %existing
|
||||
%oldval_keep = and i64 %oldval, 1095216660480 ; = 0xff_0000_0000
|
||||
@ -117,8 +133,12 @@ define void @test_64bit_masked(i64 *%existing, i64 *%new) {
|
||||
; Mask is too complicated for literal ANDwwi, make sure other avenues are tried.
|
||||
define void @test_32bit_complexmask(i32 *%existing, i32 *%new) {
|
||||
; CHECK-LABEL: test_32bit_complexmask:
|
||||
; CHECK: bfi {{w[0-9]+}}, {{w[0-9]+}}, #3, #4
|
||||
; CHECK: and {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
|
||||
|
||||
; CHECK-AARCH64: bfi {{w[0-9]+}}, {{w[0-9]+}}, #3, #4
|
||||
; CHECK-AARCH64: and {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
|
||||
|
||||
; CHECK-ARM64: and
|
||||
; CHECK-ARM64: bfm {{w[0-9]+}}, {{w[0-9]+}}, #29, #3
|
||||
|
||||
%oldval = load volatile i32* %existing
|
||||
%oldval_keep = and i32 %oldval, 647 ; = 0x287
|
||||
@ -137,6 +157,7 @@ define void @test_32bit_complexmask(i32 *%existing, i32 *%new) {
|
||||
define void @test_32bit_badmask(i32 *%existing, i32 *%new) {
|
||||
; CHECK-LABEL: test_32bit_badmask:
|
||||
; CHECK-NOT: bfi
|
||||
; CHECK-NOT: bfm
|
||||
; CHECK: ret
|
||||
|
||||
%oldval = load volatile i32* %existing
|
||||
@ -156,6 +177,7 @@ define void @test_32bit_badmask(i32 *%existing, i32 *%new) {
|
||||
define void @test_64bit_badmask(i64 *%existing, i64 *%new) {
|
||||
; CHECK-LABEL: test_64bit_badmask:
|
||||
; CHECK-NOT: bfi
|
||||
; CHECK-NOT: bfm
|
||||
; CHECK: ret
|
||||
|
||||
%oldval = load volatile i64* %existing
|
||||
@ -186,8 +208,8 @@ define void @test_32bit_with_shr(i32* %existing, i32* %new) {
|
||||
%combined = or i32 %oldval_keep, %newval_masked
|
||||
store volatile i32 %combined, i32* %existing
|
||||
; CHECK: lsr [[BIT:w[0-9]+]], {{w[0-9]+}}, #14
|
||||
; CHECK: bfi {{w[0-9]}}, [[BIT]], #26, #5
|
||||
; CHECK-AARCH64: bfi {{w[0-9]+}}, [[BIT]], #26, #5
|
||||
; CHECK-ARM64: bfm {{w[0-9]+}}, [[BIT]], #6, #4
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
|
@ -4,7 +4,7 @@
|
||||
define i32 @f0(i32* nocapture %p) nounwind {
|
||||
; CHECK-STRICT: ldrh [[HIGH:w[0-9]+]], [x0, #2]
|
||||
; CHECK-STRICT: ldrh [[LOW:w[0-9]+]], [x0]
|
||||
; CHECK-STRICT: orr w0, [[LOW]], [[HIGH]], lsl #16
|
||||
; CHECK-STRICT: bfm [[LOW]], [[HIGH]], #16, #15
|
||||
; CHECK-STRICT: ret
|
||||
|
||||
; CHECK: ldr w0, [x0]
|
||||
@ -15,7 +15,7 @@ define i32 @f0(i32* nocapture %p) nounwind {
|
||||
|
||||
define i64 @f1(i64* nocapture %p) nounwind {
|
||||
; CHECK-STRICT: ldp w[[LOW:[0-9]+]], w[[HIGH:[0-9]+]], [x0]
|
||||
; CHECK-STRICT: orr x0, x[[LOW]], x[[HIGH]], lsl #32
|
||||
; CHECK-STRICT: bfm x[[LOW]], x[[HIGH]], #32, #31
|
||||
; CHECK-STRICT: ret
|
||||
|
||||
; CHECK: ldr x0, [x0]
|
||||
|
Loading…
Reference in New Issue
Block a user