[AArch64] Improved bitfield instruction selection.

Summary: For bitfield insert OR matching, check both operands for larger pattern first before checking for smaller pattern. Add pattern for unsigned bitfield insert-in-zero done with SHL+AND. Resolves PR21631. Reviewers: jmolloy, t.p.northover Subscribers: aemerson, rengolin, llvm-commits, mcrosier Differential Revision: http://reviews.llvm.org/D12908 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@248006 91177308-0d34-0410-b5e6-96231b3b80d8
2025-01-23 12:45:47 +00:00 · 2015-09-18 17:11:53 +00:00 · 2015-09-18 17:11:53 +00:00 · 068d9ce827
commit 068d9ce827
parent e016c66756
3 changed files with 116 additions and 11 deletions
--- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@ -166,6 +166,7 @@ public:

  SDNode *SelectBitfieldExtractOp(SDNode *N);
  SDNode *SelectBitfieldInsertOp(SDNode *N);
+  SDNode *SelectBitfieldInsertInZeroOp(SDNode *N);

  SDNode *SelectLIBM(SDNode *N);
  SDNode *SelectFPConvertWithRound(SDNode *N);
@ -1918,6 +1919,7 @@ static SDValue getLeftShift(SelectionDAG *CurDAG, SDValue Op, int ShlAmount) {
 /// Does this tree qualify as an attempt to move a bitfield into position,
 /// essentially "(and (shl VAL, N), Mask)".
 static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op,
+                                    bool BiggerPattern,
                                    SDValue &Src, int &ShiftAmount,
                                    int &MaskWidth) {
  EVT VT = Op.getValueType();
@ -1940,6 +1942,11 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op,
    Op = Op.getOperand(0);
  }

+  // Don't match if the SHL has more than one use, since then we'll end up
+  // generating SHL+UBFIZ instead of just keeping SHL+AND.
+  if (!BiggerPattern && !Op.hasOneUse())
+    return false;
+
  uint64_t ShlImm;
  if (!isOpcWithIntImmediate(Op.getNode(), ISD::SHL, ShlImm))
    return false;
@ -1953,7 +1960,11 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op,

  // BFI encompasses sufficiently many nodes that it's worth inserting an extra
  // LSL/LSR if the mask in NonZeroBits doesn't quite match up with the ISD::SHL
-  // amount.
+  // amount.  BiggerPattern is true when this pattern is being matched for BFI,
+  // BiggerPattern is false when this pattern is being matched for UBFIZ, in
+  // which case it is not profitable to insert an extra shift.
+  if (ShlImm - ShiftAmount != 0 && !BiggerPattern)
+    return false;
  Src = getLeftShift(CurDAG, Op, ShlImm - ShiftAmount);

  return true;
@ -1990,17 +2001,26 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst,
  unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros();
  unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros();

-  // OR is commutative, check both possibilities (does llvm provide a
-  // way to do that directely, e.g., via code matcher?)
-  SDValue OrOpd1Val = N->getOperand(1);
-  SDNode *OrOpd0 = N->getOperand(0).getNode();
-  SDNode *OrOpd1 = N->getOperand(1).getNode();
-  for (int i = 0; i < 2;
-       ++i, std::swap(OrOpd0, OrOpd1), OrOpd1Val = N->getOperand(0)) {
+  // OR is commutative, check all combinations of operand order and values of
+  // BiggerPattern, i.e.
+  //     Opd0, Opd1, BiggerPattern=false
+  //     Opd1, Opd0, BiggerPattern=false
+  //     Opd0, Opd1, BiggerPattern=true
+  //     Opd1, Opd0, BiggerPattern=true
+  // Several of these combinations may match, so check with BiggerPattern=false
+  // first since that will produce better results by matching more instructions
+  // and/or inserting fewer extra instructions.
+  for (int I = 0; I < 4; ++I) {
+
+    bool BiggerPattern = I / 2;
+    SDNode *OrOpd0 = N->getOperand(I % 2).getNode();
+    SDValue OrOpd1Val = N->getOperand((I + 1) % 2);
+    SDNode *OrOpd1 = OrOpd1Val.getNode();
+
    unsigned BFXOpc;
    int DstLSB, Width;
    if (isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Src, ImmR, ImmS,
-                            NumberOfIgnoredLowBits, true)) {
+                            NumberOfIgnoredLowBits, BiggerPattern)) {
      // Check that the returned opcode is compatible with the pattern,
      // i.e., same type and zero extended (U and not S)
      if ((BFXOpc != AArch64::UBFMXri && VT == MVT::i64) ||
@ -2018,8 +2038,9 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst,

      // If the mask on the insertee is correct, we have a BFXIL operation. We
      // can share the ImmR and ImmS values from the already-computed UBFM.
-    } else if (isBitfieldPositioningOp(CurDAG, SDValue(OrOpd0, 0), Src,
-                                       DstLSB, Width)) {
+    } else if (isBitfieldPositioningOp(CurDAG, SDValue(OrOpd0, 0),
+                                       BiggerPattern,
+                                       Src, DstLSB, Width)) {
      ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits();
      ImmS = Width - 1;
    } else
@ -2082,6 +2103,39 @@ SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertOp(SDNode *N) {
  return CurDAG->SelectNodeTo(N, Opc, VT, Ops);
 }

+/// SelectBitfieldInsertInZeroOp - Match a UBFIZ instruction that is the
+/// equivalent of a left shift by a constant amount followed by an and masking
+/// out a contiguous set of bits.
+SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertInZeroOp(SDNode *N) {
+  if (N->getOpcode() != ISD::AND)
+    return nullptr;
+
+  EVT VT = N->getValueType(0);
+  unsigned Opc;
+  if (VT == MVT::i32)
+    Opc = AArch64::UBFMWri;
+  else if (VT == MVT::i64)
+    Opc = AArch64::UBFMXri;
+  else
+    return nullptr;
+
+  SDValue Op0;
+  int DstLSB, Width;
+  if (!isBitfieldPositioningOp(CurDAG, SDValue(N, 0), /*BiggerPattern=*/false,
+                               Op0, DstLSB, Width))
+    return nullptr;
+
+  // ImmR is the rotate right amount.
+  unsigned ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits();
+  // ImmS is the most significant bit of the source to be moved.
+  unsigned ImmS = Width - 1;
+
+  SDLoc DL(N);
+  SDValue Ops[] = {Op0, CurDAG->getTargetConstant(ImmR, DL, VT),
+                   CurDAG->getTargetConstant(ImmS, DL, VT)};
+  return CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+}
+
 /// GenerateInexactFlagIfNeeded - Insert FRINTX instruction to generate inexact
 /// signal on round-to-integer operations if needed. C11 leaves it
 /// implementation-defined whether these operations trigger an inexact
@ -2443,6 +2497,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
  case ISD::SRA:
    if (SDNode *I = SelectBitfieldExtractOp(Node))
      return I;
+    if (SDNode *I = SelectBitfieldInsertInZeroOp(Node))
+      return I;
    break;

  case ISD::OR:
--- a/test/CodeGen/AArch64/bitfield-insert.ll
+++ b/test/CodeGen/AArch64/bitfield-insert.ll
@ -196,3 +196,22 @@ define void @test_32bit_with_shr(i32* %existing, i32* %new) {

  ret void
 }
+
+; Bitfield insert where the second or operand is a better match to be folded into the BFM
+define void @test_32bit_opnd1_better(i32* %existing, i32* %new) {
+; CHECK-LABEL: test_32bit_opnd1_better:
+
+  %oldval = load volatile i32, i32* %existing
+  %oldval_keep = and i32 %oldval, 65535 ; 0x0000ffff
+
+  %newval = load i32, i32* %new
+  %newval_shifted = shl i32 %newval, 16
+  %newval_masked = and i32 %newval_shifted, 16711680 ; 0x00ff0000
+
+  %combined = or i32 %oldval_keep, %newval_masked
+  store volatile i32 %combined, i32* %existing
+; CHECK: and [[BIT:w[0-9]+]], {{w[0-9]+}}, #0xffff
+; CHECK: bfi [[BIT]], {{w[0-9]+}}, #16, #8
+
+  ret void
+}
--- a/test/CodeGen/AArch64/xbfiz.ll
+++ b/test/CodeGen/AArch64/xbfiz.ll
@ -31,3 +31,33 @@ define i32 @ubfiz32(i32 %v) {
  %shr = lshr i32 %shl, 2
  ret i32 %shr
 }
+
+define i64 @ubfiz64and(i64 %v) {
+; CHECK-LABEL: ubfiz64and:
+; CHECK: ubfiz	x0, x0, #36, #11
+  %shl = shl i64 %v, 36
+  %and = and i64 %shl, 140668768878592
+  ret i64 %and
+}
+
+define i32 @ubfiz32and(i32 %v) {
+; CHECK-LABEL: ubfiz32and:
+; CHECK: ubfiz	w0, w0, #6, #24
+  %shl = shl i32 %v, 6
+  %and = and i32 %shl, 1073741760
+  ret i32 %and
+}
+
+; Check that we don't generate a ubfiz if the lsl has more than one
+; use, since we'd just be replacing an and with a ubfiz.
+define i32 @noubfiz32(i32 %v) {
+; CHECK-LABEL: noubfiz32:
+; CHECK: lsl	w[[REG1:[0-9]+]], w0, #6
+; CHECK: and	w[[REG2:[0-9]+]], w[[REG1]], #0x3fffffc0
+; CHECK: add	w0, w[[REG1]], w[[REG2]]
+; CHECK: ret
+  %shl = shl i32 %v, 6
+  %and = and i32 %shl, 1073741760
+  %add = add i32 %shl, %and
+  ret i32 %add
+}