AArch64/ARM64: implement BFI optimisation

ARM64 was not producing pure BFI instructions for bitfield insertion operations, unlike AArch64. The approach had to be a little different (in ISelDAGToDAG rather than ISelLowering), and the outcomes aren't identical but hopefully this gives it similar power. This should address PR19424. llvm-svn: 207102
2024-11-26 21:00:29 +00:00 · 2014-04-24 12:11:53 +00:00 · 2014-04-24 12:11:53 +00:00 · 586e442827
commit 586e442827
parent 5b13e67220
3 changed files with 159 additions and 60 deletions
--- a/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp
+++ b/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp
@ -1394,28 +1394,23 @@ SDNode *ARM64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) {
  return CurDAG->SelectNodeTo(N, Opc, VT, Ops, 3);
 }

-// Is mask a i32 or i64 binary sequence 1..10..0 and
-// CountTrailingZeros(mask) == ExpectedTrailingZeros
-static bool isHighMask(uint64_t Mask, unsigned ExpectedTrailingZeros,
-                       unsigned NumberOfIgnoredHighBits, EVT VT) {
+/// Does DstMask form a complementary pair with the mask provided by
+/// BitsToBeInserted, suitable for use in a BFI instruction. Roughly speaking,
+/// this asks whether DstMask zeroes precisely those bits that will be set by
+/// the other half.
+static bool isBitfieldDstMask(uint64_t DstMask, APInt BitsToBeInserted,
+                              unsigned NumberOfIgnoredHighBits, EVT VT) {
  assert((VT == MVT::i32 || VT == MVT::i64) &&
         "i32 or i64 mask type expected!");
+  unsigned BitWidth = VT.getSizeInBits() - NumberOfIgnoredHighBits;
+  APInt SignificantBits =
+      ~APInt::getHighBitsSet(BitWidth, NumberOfIgnoredHighBits);

-  uint64_t ExpectedMask;
-  if (VT == MVT::i32) {
-    uint32_t ExpectedMaski32 = ~0 << ExpectedTrailingZeros;
-    ExpectedMask = ExpectedMaski32;
-    if (NumberOfIgnoredHighBits) {
-      uint32_t highMask = ~0 << (32 - NumberOfIgnoredHighBits);
-      Mask |= highMask;
-    }
-  } else {
-    ExpectedMask = ((uint64_t) ~0) << ExpectedTrailingZeros;
-    if (NumberOfIgnoredHighBits)
-      Mask |= ((uint64_t) ~0) << (64 - NumberOfIgnoredHighBits);
-  }
+  APInt SignificantDstMask = APInt(BitWidth, DstMask);
+  APInt SignificantBitsToBeInserted = BitsToBeInserted.zextOrTrunc(BitWidth);

-  return Mask == ExpectedMask;
+  return (SignificantDstMask & SignificantBitsToBeInserted) == 0 &&
+         (SignificantDstMask | SignificantBitsToBeInserted).isAllOnesValue();
 }

 // Look for bits that will be useful for later uses.
@ -1593,6 +1588,79 @@ static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth) {
  UsefulBits &= UsersUsefulBits;
 }

+/// Create a machine node performing a notional SHL of Op by ShlAmount. If
+/// ShlAmount is negative, do a (logical) right-shift instead. If ShlAmount is
+/// 0, return Op unchanged.
+static SDValue getLeftShift(SelectionDAG *CurDAG, SDValue Op, int ShlAmount) {
+  if (ShlAmount == 0)
+    return Op;
+
+  EVT VT = Op.getValueType();
+  unsigned BitWidth = VT.getSizeInBits();
+  unsigned UBFMOpc = BitWidth == 32 ? ARM64::UBFMWri : ARM64::UBFMXri;
+
+  SDNode *ShiftNode;
+  if (ShlAmount > 0) {
+    // LSL wD, wN, #Amt == UBFM wD, wN, #32-Amt, #31-Amt
+    ShiftNode = CurDAG->getMachineNode(
+        UBFMOpc, SDLoc(Op), VT, Op,
+        CurDAG->getTargetConstant(BitWidth - ShlAmount, VT),
+        CurDAG->getTargetConstant(BitWidth - 1 - ShlAmount, VT));
+  } else {
+    // LSR wD, wN, #Amt == UBFM wD, wN, #Amt, #32-1
+    assert(ShlAmount < 0 && "expected right shift");
+    int ShrAmount = -ShlAmount;
+    ShiftNode = CurDAG->getMachineNode(
+        UBFMOpc, SDLoc(Op), VT, Op, CurDAG->getTargetConstant(ShrAmount, VT),
+        CurDAG->getTargetConstant(BitWidth - 1, VT));
+  }
+
+  return SDValue(ShiftNode, 0);
+}
+
+/// Does this tree qualify as an attempt to move a bitfield into position,
+/// essentially "(and (shl VAL, N), Mask)".
+static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op,
+                                    SDValue &Src, int &ShiftAmount,
+                                    int &MaskWidth) {
+  EVT VT = Op.getValueType();
+  unsigned BitWidth = VT.getSizeInBits();
+  assert(BitWidth == 32 || BitWidth == 64);
+
+  APInt KnownZero, KnownOne;
+  CurDAG->ComputeMaskedBits(Op, KnownZero, KnownOne);
+
+  // Non-zero in the sense that they're not provably zero, which is the key
+  // point if we want to use this value
+  uint64_t NonZeroBits = (~KnownZero).getZExtValue();
+
+  // Discard a constant AND mask if present. It's safe because the node will
+  // already have been factored into the ComputeMaskedBits calculation above.
+  uint64_t AndImm;
+  if (isOpcWithIntImmediate(Op.getNode(), ISD::AND, AndImm)) {
+    assert((~APInt(BitWidth, AndImm) & ~KnownZero) == 0);
+    Op = Op.getOperand(0);
+  }
+
+  uint64_t ShlImm;
+  if (!isOpcWithIntImmediate(Op.getNode(), ISD::SHL, ShlImm))
+    return false;
+  Op = Op.getOperand(0);
+
+  if (!isShiftedMask_64(NonZeroBits))
+    return false;
+
+  ShiftAmount = countTrailingZeros(NonZeroBits);
+  MaskWidth = CountTrailingOnes_64(NonZeroBits >> ShiftAmount);
+
+  // BFI encompasses sufficiently many nodes that it's worth inserting an extra
+  // LSL/LSR if the mask in NonZeroBits doesn't quite match up with the ISD::SHL
+  // amount.
+  Src = getLeftShift(CurDAG, Op, ShlImm - ShiftAmount);
+
+  return true;
+}
+
 // Given a OR operation, check if we have the following pattern
 // ubfm c, b, imm, imm2 (or something that does the same jobs, see
 //                       isBitfieldExtractOp)
@ -1602,9 +1670,9 @@ static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth) {
 // if yes, given reference arguments will be update so that one can replace
 // the OR instruction with:
 // f = Opc Opd0, Opd1, LSB, MSB ; where Opc is a BFM, LSB = imm, and MSB = imm2
-static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Opd0,
-                                     SDValue &Opd1, unsigned &LSB,
-                                     unsigned &MSB, SelectionDAG *CurDAG) {
+static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst,
+                                     SDValue &Src, unsigned &ImmR,
+                                     unsigned &ImmS, SelectionDAG *CurDAG) {
  assert(N->getOpcode() == ISD::OR && "Expect a OR operation");

  // Set Opc
@ -1632,30 +1700,36 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Opd0,
  for (int i = 0; i < 2;
       ++i, std::swap(OrOpd0, OrOpd1), OrOpd1Val = N->getOperand(0)) {
    unsigned BFXOpc;
-    // Set Opd1, LSB and MSB arguments by looking for
-    // c = ubfm b, imm, imm2
-    if (!isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Opd1, LSB, MSB,
-                             NumberOfIgnoredLowBits, true))
-      continue;
+    int DstLSB, Width;
+    if (isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Src, ImmR, ImmS,
+                            NumberOfIgnoredLowBits, true)) {
+      // Check that the returned opcode is compatible with the pattern,
+      // i.e., same type and zero extended (U and not S)
+      if ((BFXOpc != ARM64::UBFMXri && VT == MVT::i64) ||
+          (BFXOpc != ARM64::UBFMWri && VT == MVT::i32))
+        continue;

-    // Check that the returned opcode is compatible with the pattern,
-    // i.e., same type and zero extended (U and not S)
-    if ((BFXOpc != ARM64::UBFMXri && VT == MVT::i64) ||
-        (BFXOpc != ARM64::UBFMWri && VT == MVT::i32))
-      continue;
+      // Compute the width of the bitfield insertion
+      DstLSB = 0;
+      Width = ImmS - ImmR + 1;
+      // FIXME: This constraint is to catch bitfield insertion we may
+      // want to widen the pattern if we want to grab general bitfied
+      // move case
+      if (Width <= 0)
+        continue;

-    // Compute the width of the bitfield insertion
-    int sMSB = MSB - LSB + 1;
-    // FIXME: This constraints is to catch bitfield insertion we may
-    // want to widen the pattern if we want to grab general bitfied
-    // move case
-    if (sMSB <= 0)
+      // If the mask on the insertee is correct, we have a BFXIL operation. We
+      // can share the ImmR and ImmS values from the already-computed UBFM.
+    } else if (isBitfieldPositioningOp(CurDAG, SDValue(OrOpd0, 0), Src,
+                                       DstLSB, Width)) {
+      ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits();
+      ImmS = Width - 1;
+    } else
      continue;

    // Check the second part of the pattern
    EVT VT = OrOpd1->getValueType(0);
-    if (VT != MVT::i32 && VT != MVT::i64)
-      continue;
+    assert((VT == MVT::i32 || VT == MVT::i64) && "unexpected OR operand");

    // Compute the Known Zero for the candidate of the first operand.
    // This allows to catch more general case than just looking for
@ -1666,19 +1740,22 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Opd0,

    // Check if there is enough room for the second operand to appear
    // in the first one
-    if (KnownZero.countTrailingOnes() < (unsigned)sMSB)
+    APInt BitsToBeInserted =
+        APInt::getBitsSet(KnownZero.getBitWidth(), DstLSB, DstLSB + Width);
+
+    if ((BitsToBeInserted & ~KnownZero) != 0)
      continue;

    // Set the first operand
    uint64_t Imm;
    if (isOpcWithIntImmediate(OrOpd1, ISD::AND, Imm) &&
-        isHighMask(Imm, sMSB, NumberOfIgnoredHighBits, VT))
+        isBitfieldDstMask(Imm, BitsToBeInserted, NumberOfIgnoredHighBits, VT))
      // In that case, we can eliminate the AND
-      Opd0 = OrOpd1->getOperand(0);
+      Dst = OrOpd1->getOperand(0);
    else
      // Maybe the AND has been removed by simplify-demanded-bits
      // or is useful because it discards more bits
-      Opd0 = OrOpd1Val;
+      Dst = OrOpd1Val;

    // both parts match
    return true;
--- a/test/CodeGen/AArch64/bitfield-insert.ll
+++ b/test/CodeGen/AArch64/bitfield-insert.ll
@ -1,4 +1,5 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AARCH64
+; RUN: llc -mtriple=arm64-none-linux-gnu < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64

 ; First, a simple example from Clang. The registers could plausibly be
 ; different, but probably won't be.
@ -7,8 +8,10 @@

 define [1 x i64] @from_clang([1 x i64] %f.coerce, i32 %n) nounwind readnone {
 ; CHECK-LABEL: from_clang:
-; CHECK: bfi w0, w1, #3, #4
-; CHECK-NEXT: ret
+; CHECK-AARCH64: bfi w0, w1, #3, #4
+; CHECK-ARCH64-NEXT: ret
+
+; CHECK-ARM64: bfm {{w[0-9]+}}, {{w[0-9]+}}, #29, #3

 entry:
  %f.coerce.fca.0.extract = extractvalue [1 x i64] %f.coerce, 0
@ -26,7 +29,9 @@ entry:

 define void @test_whole32(i32* %existing, i32* %new) {
 ; CHECK-LABEL: test_whole32:
-; CHECK: bfi {{w[0-9]+}}, {{w[0-9]+}}, #26, #5
+
+; CHECK-AARCH64: bfi {{w[0-9]+}}, {{w[0-9]+}}, #26, #5
+; CHECK-ARM64: bfm {{w[0-9]+}}, {{w[0-9]+}}, #6, #4

  %oldval = load volatile i32* %existing
  %oldval_keep = and i32 %oldval, 2214592511 ; =0x83ffffff
@ -43,7 +48,8 @@ define void @test_whole32(i32* %existing, i32* %new) {

 define void @test_whole64(i64* %existing, i64* %new) {
 ; CHECK-LABEL: test_whole64:
-; CHECK: bfi {{x[0-9]+}}, {{x[0-9]+}}, #26, #14
+; CHECK-AARCH64: bfi {{x[0-9]+}}, {{x[0-9]+}}, #26, #14
+; CHECK-ARM64: bfm {{x[0-9]+}}, {{x[0-9]+}}, #38, #13
 ; CHECK-NOT: and
 ; CHECK: ret

@ -62,8 +68,11 @@ define void @test_whole64(i64* %existing, i64* %new) {

 define void @test_whole32_from64(i64* %existing, i64* %new) {
 ; CHECK-LABEL: test_whole32_from64:
-; CHECK: bfi {{w[0-9]+}}, {{w[0-9]+}}, #{{0|16}}, #16
-; CHECK-NOT: and
+
+; CHECK-AARCH64: bfi {{w[0-9]+}}, {{w[0-9]+}}, #{{0|16}}, #16
+; CHECK-AARCH64-NOT: and
+; CHECK-ARM64: bfm {{x[0-9]+}}, {{x[0-9]+}}, #0, #15
+
 ; CHECK: ret

  %oldval = load volatile i64* %existing
@ -80,8 +89,12 @@ define void @test_whole32_from64(i64* %existing, i64* %new) {

 define void @test_32bit_masked(i32 *%existing, i32 *%new) {
 ; CHECK-LABEL: test_32bit_masked:
-; CHECK: bfi [[INSERT:w[0-9]+]], {{w[0-9]+}}, #3, #4
-; CHECK: and {{w[0-9]+}}, [[INSERT]], #0xff
+
+; CHECK-AARCH64: bfi [[INSERT:w[0-9]+]], {{w[0-9]+}}, #3, #4
+; CHECK-AARCH64: and {{w[0-9]+}}, [[INSERT]], #0xff
+
+; CHECK-ARM64: and
+; CHECK-ARM64: bfm {{w[0-9]+}}, {{w[0-9]+}}, #29, #3

  %oldval = load volatile i32* %existing
  %oldval_keep = and i32 %oldval, 135 ; = 0x87
@ -98,8 +111,11 @@ define void @test_32bit_masked(i32 *%existing, i32 *%new) {

 define void @test_64bit_masked(i64 *%existing, i64 *%new) {
 ; CHECK-LABEL: test_64bit_masked:
-; CHECK: bfi [[INSERT:x[0-9]+]], {{x[0-9]+}}, #40, #8
-; CHECK: and {{x[0-9]+}}, [[INSERT]], #0xffff00000000
+; CHECK-AARCH64: bfi [[INSERT:x[0-9]+]], {{x[0-9]+}}, #40, #8
+; CHECK-AARCH64: and {{x[0-9]+}}, [[INSERT]], #0xffff00000000
+
+; CHECK-ARM64: and
+; CHECK-ARM64: bfm {{x[0-9]+}}, {{x[0-9]+}}, #24, #7

  %oldval = load volatile i64* %existing
  %oldval_keep = and i64 %oldval, 1095216660480 ; = 0xff_0000_0000
@ -117,8 +133,12 @@ define void @test_64bit_masked(i64 *%existing, i64 *%new) {
 ; Mask is too complicated for literal ANDwwi, make sure other avenues are tried.
 define void @test_32bit_complexmask(i32 *%existing, i32 *%new) {
 ; CHECK-LABEL: test_32bit_complexmask:
-; CHECK: bfi {{w[0-9]+}}, {{w[0-9]+}}, #3, #4
-; CHECK: and {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+
+; CHECK-AARCH64: bfi {{w[0-9]+}}, {{w[0-9]+}}, #3, #4
+; CHECK-AARCH64: and {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+
+; CHECK-ARM64: and
+; CHECK-ARM64: bfm {{w[0-9]+}}, {{w[0-9]+}}, #29, #3

  %oldval = load volatile i32* %existing
  %oldval_keep = and i32 %oldval, 647 ; = 0x287
@ -137,6 +157,7 @@ define void @test_32bit_complexmask(i32 *%existing, i32 *%new) {
 define void @test_32bit_badmask(i32 *%existing, i32 *%new) {
 ; CHECK-LABEL: test_32bit_badmask:
 ; CHECK-NOT: bfi
+; CHECK-NOT: bfm
 ; CHECK: ret

  %oldval = load volatile i32* %existing
@ -156,6 +177,7 @@ define void @test_32bit_badmask(i32 *%existing, i32 *%new) {
 define void @test_64bit_badmask(i64 *%existing, i64 *%new) {
 ; CHECK-LABEL: test_64bit_badmask:
 ; CHECK-NOT: bfi
+; CHECK-NOT: bfm
 ; CHECK: ret

  %oldval = load volatile i64* %existing
@ -186,8 +208,8 @@ define void @test_32bit_with_shr(i32* %existing, i32* %new) {
  %combined = or i32 %oldval_keep, %newval_masked
  store volatile i32 %combined, i32* %existing
 ; CHECK: lsr [[BIT:w[0-9]+]], {{w[0-9]+}}, #14
-; CHECK: bfi {{w[0-9]}}, [[BIT]], #26, #5
+; CHECK-AARCH64: bfi {{w[0-9]+}}, [[BIT]], #26, #5
+; CHECK-ARM64: bfm {{w[0-9]+}}, [[BIT]], #6, #4

  ret void
 }
-
--- a/test/CodeGen/ARM64/strict-align.ll
+++ b/test/CodeGen/ARM64/strict-align.ll
@ -4,7 +4,7 @@
 define i32 @f0(i32* nocapture %p) nounwind {
 ; CHECK-STRICT: ldrh [[HIGH:w[0-9]+]], [x0, #2]
 ; CHECK-STRICT: ldrh [[LOW:w[0-9]+]], [x0]
-; CHECK-STRICT: orr w0, [[LOW]], [[HIGH]], lsl #16
+; CHECK-STRICT: bfm [[LOW]], [[HIGH]], #16, #15
 ; CHECK-STRICT: ret

 ; CHECK: ldr w0, [x0]
@ -15,7 +15,7 @@ define i32 @f0(i32* nocapture %p) nounwind {

 define i64 @f1(i64* nocapture %p) nounwind {
 ; CHECK-STRICT:	ldp	w[[LOW:[0-9]+]], w[[HIGH:[0-9]+]], [x0]
-; CHECK-STRICT:	orr	x0, x[[LOW]], x[[HIGH]], lsl #32
+; CHECK-STRICT: bfm x[[LOW]], x[[HIGH]], #32, #31
 ; CHECK-STRICT:	ret

 ; CHECK: ldr x0, [x0]