[AArch64] Improve code generation for logical instructions taking

immediate operands. This commit adds an AArch64 dag-combine that optimizes code generation for logical instructions taking immediate operands. The optimization uses demanded bits to change a logical instruction's immediate operand so that the immediate can be folded into the immediate field of the instruction. This recommits r300932 and r300930, which were causing dag-combine to loop forever. The problem was that optimizeLogicalImm was returning true even when there was no change to the immediate node (which happened when the immediate was all zeros or ones), which caused dag-combine to push and pop the same node to the work list over and over again without making any progress. This commit fixes the bug by returning false early in optimizeLogicalImm if the immediate is all zeros or ones. Also, it changes the code to compare the immediate with 0 or Mask rather than calling countPopulation. rdar://problem/18231627 Differential Revision: https://reviews.llvm.org/D5591 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@301019 91177308-0d34-0410-b5e6-96231b3b80d8
2024-11-24 12:19:53 +00:00 · 2017-04-21 18:53:12 +00:00 · 2017-04-21 18:53:12 +00:00 · 586c752a82
commit 586c752a82
parent bbc50e81b0
9 changed files with 283 additions and 59 deletions
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@ -2388,30 +2388,39 @@ public:
      New = N;
      return true;
    }
-
-    /// Check to see if the specified operand of the specified instruction is a
-    /// constant integer.  If so, check to see if there are any bits set in the
-    /// constant that are not demanded.  If so, shrink the constant and return
-    /// true.
-    bool ShrinkDemandedConstant(SDValue Op, const APInt &Demanded);
-
-    /// Convert x+y to (VT)((SmallVT)x+(SmallVT)y) if the casts are free.  This
-    /// uses isZExtFree and ZERO_EXTEND for the widening cast, but it could be
-    /// generalized for targets with other types of implicit widening casts.
-    bool ShrinkDemandedOp(SDValue Op, unsigned BitWidth, const APInt &Demanded,
-                          const SDLoc &dl);
-
-    /// Helper for SimplifyDemandedBits that can simplify an operation with
-    /// multiple uses.  This function uses TLI.SimplifyDemandedBits to
-    /// simplify Operand \p OpIdx of \p User and then updated \p User with
-    /// the simplified version.  No other uses of \p OpIdx are updated.
-    /// If \p User is the only user of \p OpIdx, this function behaves exactly
-    /// like TLI.SimplifyDemandedBits except that it also updates the DAG by
-    /// calling DCI.CommitTargetLoweringOpt.
-    bool SimplifyDemandedBits(SDNode *User, unsigned OpIdx,
-                              const APInt &Demanded, DAGCombinerInfo &DCI);
  };

+  /// Check to see if the specified operand of the specified instruction is a
+  /// constant integer.  If so, check to see if there are any bits set in the
+  /// constant that are not demanded.  If so, shrink the constant and return
+  /// true.
+  bool ShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
+                              TargetLoweringOpt &TLO) const;
+
+  // Target hook to do target-specific const optimization, which is called by
+  // ShrinkDemandedConstant. This function should return true if the target
+  // doesn't want ShrinkDemandedConstant to further optimize the constant.
+  virtual bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
+                                            TargetLoweringOpt &TLO) const {
+    return false;
+  }
+
+  /// Convert x+y to (VT)((SmallVT)x+(SmallVT)y) if the casts are free.  This
+  /// uses isZExtFree and ZERO_EXTEND for the widening cast, but it could be
+  /// generalized for targets with other types of implicit widening casts.
+  bool ShrinkDemandedOp(SDValue Op, unsigned BitWidth, const APInt &Demanded,
+                        TargetLoweringOpt &TLO) const;
+
+  /// Helper for SimplifyDemandedBits that can simplify an operation with
+  /// multiple uses.  This function simplifies operand \p OpIdx of \p User and
+  /// then updates \p User with the simplified version. No other uses of
+  /// \p OpIdx are updated. If \p User is the only user of \p OpIdx, this
+  /// function behaves exactly like function SimplifyDemandedBits declared
+  /// below except that it also updates the DAG by calling
+  /// DCI.CommitTargetLoweringOpt.
+  bool SimplifyDemandedBits(SDNode *User, unsigned OpIdx, const APInt &Demanded,
+                            DAGCombinerInfo &DCI, TargetLoweringOpt &TLO) const;
+
  /// Look at Op.  At this point, we know that only the DemandedMask bits of the
  /// result of Op are ever used downstream.  If we can use this information to
  /// simplify Op, create a new simplified DAG node and return true, returning
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@ -342,11 +342,16 @@ TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
 /// If the specified instruction has a constant integer operand and there are
 /// bits set in that constant that are not demanded, then clear those bits and
 /// return true.
-bool TargetLowering::TargetLoweringOpt::ShrinkDemandedConstant(
-    SDValue Op, const APInt &Demanded) {
+bool TargetLowering::ShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
+                                            TargetLoweringOpt &TLO) const {
+  SelectionDAG &DAG = TLO.DAG;
  SDLoc DL(Op);
  unsigned Opcode = Op.getOpcode();

+  // Do target-specific constant optimization.
+  if (targetShrinkDemandedConstant(Op, Demanded, TLO))
+    return TLO.New.getNode();
+
  // FIXME: ISD::SELECT, ISD::SELECT_CC
  switch (Opcode) {
  default:
@ -367,7 +372,7 @@ bool TargetLowering::TargetLoweringOpt::ShrinkDemandedConstant(
      EVT VT = Op.getValueType();
      SDValue NewC = DAG.getConstant(Demanded & C, DL, VT);
      SDValue NewOp = DAG.getNode(Opcode, DL, VT, Op.getOperand(0), NewC);
-      return CombineTo(Op, NewOp);
+      return TLO.CombineTo(Op, NewOp);
    }

    break;
@ -380,15 +385,17 @@ bool TargetLowering::TargetLoweringOpt::ShrinkDemandedConstant(
 /// Convert x+y to (VT)((SmallVT)x+(SmallVT)y) if the casts are free.
 /// This uses isZExtFree and ZERO_EXTEND for the widening cast, but it could be
 /// generalized for targets with other types of implicit widening casts.
-bool TargetLowering::TargetLoweringOpt::ShrinkDemandedOp(SDValue Op,
-                                                         unsigned BitWidth,
-                                                         const APInt &Demanded,
-                                                         const SDLoc &dl) {
+bool TargetLowering::ShrinkDemandedOp(SDValue Op, unsigned BitWidth,
+                                      const APInt &Demanded,
+                                      TargetLoweringOpt &TLO) const {
  assert(Op.getNumOperands() == 2 &&
         "ShrinkDemandedOp only supports binary operators!");
  assert(Op.getNode()->getNumValues() == 1 &&
         "ShrinkDemandedOp only supports nodes with one result!");

+  SelectionDAG &DAG = TLO.DAG;
+  SDLoc dl(Op);
+
  // Early return, as this function cannot handle vector types.
  if (Op.getValueType().isVector())
    return false;
@ -418,23 +425,22 @@ bool TargetLowering::TargetLoweringOpt::ShrinkDemandedOp(SDValue Op,
      bool NeedZext = DemandedSize > SmallVTBits;
      SDValue Z = DAG.getNode(NeedZext ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND,
                              dl, Op.getValueType(), X);
-      return CombineTo(Op, Z);
+      return TLO.CombineTo(Op, Z);
    }
  }
  return false;
 }

 bool
-TargetLowering::TargetLoweringOpt::SimplifyDemandedBits(SDNode *User,
-                                                        unsigned OpIdx,
-                                                        const APInt &Demanded,
-                                                        DAGCombinerInfo &DCI) {
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+TargetLowering::SimplifyDemandedBits(SDNode *User, unsigned OpIdx,
+                                     const APInt &Demanded,
+                                     DAGCombinerInfo &DCI,
+                                     TargetLoweringOpt &TLO) const {
  SDValue Op = User->getOperand(OpIdx);
  APInt KnownZero, KnownOne;

-  if (!TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne,
-                                *this, 0, true))
+  if (!SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne,
+                            TLO, 0, true))
    return false;


@ -446,9 +452,9 @@ TargetLowering::TargetLoweringOpt::SimplifyDemandedBits(SDNode *User,
  // with the value 'x', which will give us:
  // Old = i32 and x, 0xffffff
  // New = x
-  if (Old.hasOneUse()) {
+  if (TLO.Old.hasOneUse()) {
    // For the one use case, we just commit the change.
-    DCI.CommitTargetLoweringOpt(*this);
+    DCI.CommitTargetLoweringOpt(TLO);
    return true;
  }

@ -456,17 +462,17 @@ TargetLowering::TargetLoweringOpt::SimplifyDemandedBits(SDNode *User,
  // AssumeSingleUse flag is not propogated to recursive calls of
  // SimplifyDemanded bits, so the only node with multiple use that
  // it will attempt to combine will be opt.
-  assert(Old == Op);
+  assert(TLO.Old == Op);

  SmallVector <SDValue, 4> NewOps;
  for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
    if (i == OpIdx) {
-      NewOps.push_back(New);
+      NewOps.push_back(TLO.New);
      continue;
    }
    NewOps.push_back(User->getOperand(i));
  }
-  DAG.UpdateNodeOperands(User, NewOps);
+  TLO.DAG.UpdateNodeOperands(User, NewOps);
  // Op has less users now, so we may be able to perform additional combines
  // with it.
  DCI.AddToWorklist(Op.getNode());
@ -585,7 +591,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,

      // If any of the set bits in the RHS are known zero on the LHS, shrink
      // the constant.
-      if (TLO.ShrinkDemandedConstant(Op, ~LHSZero & NewMask))
+      if (ShrinkDemandedConstant(Op, ~LHSZero & NewMask, TLO))
        return true;

      // Bitwise-not (xor X, -1) is a special case: we don't usually shrink its
@ -620,10 +626,10 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
    if ((NewMask & (KnownZero|KnownZero2)) == NewMask)
      return TLO.CombineTo(Op, TLO.DAG.getConstant(0, dl, Op.getValueType()));
    // If the RHS is a constant, see if we can simplify it.
-    if (TLO.ShrinkDemandedConstant(Op, ~KnownZero2 & NewMask))
+    if (ShrinkDemandedConstant(Op, ~KnownZero2 & NewMask, TLO))
      return true;
    // If the operation can be done in a smaller type, do so.
-    if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
+    if (ShrinkDemandedOp(Op, BitWidth, NewMask, TLO))
      return true;

    // Output known-1 bits are only known if set in both the LHS & RHS.
@ -654,10 +660,10 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
    if ((NewMask & ~KnownZero2 & KnownOne) == (~KnownZero2 & NewMask))
      return TLO.CombineTo(Op, Op.getOperand(1));
    // If the RHS is a constant, see if we can simplify it.
-    if (TLO.ShrinkDemandedConstant(Op, NewMask))
+    if (ShrinkDemandedConstant(Op, NewMask, TLO))
      return true;
    // If the operation can be done in a smaller type, do so.
-    if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
+    if (ShrinkDemandedOp(Op, BitWidth, NewMask, TLO))
      return true;

    // Output known-0 bits are only known if clear in both the LHS & RHS.
@ -682,7 +688,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
    if ((KnownZero2 & NewMask) == NewMask)
      return TLO.CombineTo(Op, Op.getOperand(1));
    // If the operation can be done in a smaller type, do so.
-    if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
+    if (ShrinkDemandedOp(Op, BitWidth, NewMask, TLO))
      return true;

    // If all of the unknown bits are known to be zero on one side or the other
@ -727,7 +733,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
        }
        // If it already has all the bits set, nothing to change
        // but don't shrink either!
-      } else if (TLO.ShrinkDemandedConstant(Op, NewMask)) {
+      } else if (ShrinkDemandedConstant(Op, NewMask, TLO)) {
        return true;
      }
    }
@ -746,7 +752,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");

    // If the operands are constants, see if we can simplify them.
-    if (TLO.ShrinkDemandedConstant(Op, NewMask))
+    if (ShrinkDemandedConstant(Op, NewMask, TLO))
      return true;

    // Only known if known in both the LHS and RHS.
@ -764,7 +770,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");

    // If the operands are constants, see if we can simplify them.
-    if (TLO.ShrinkDemandedConstant(Op, NewMask))
+    if (ShrinkDemandedConstant(Op, NewMask, TLO))
      return true;

    // Only known if known in both the LHS and RHS.
@ -1284,7 +1290,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
        SimplifyDemandedBits(Op.getOperand(1), LoMask, KnownZero2,
                             KnownOne2, TLO, Depth+1) ||
        // See if the operation should be performed at a smaller bit width.
-        TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl)) {
+        ShrinkDemandedOp(Op, BitWidth, NewMask, TLO)) {
      const SDNodeFlags *Flags = Op.getNode()->getFlags();
      if (Flags->hasNoSignedWrap() || Flags->hasNoUnsignedWrap()) {
        // Disable the nsw and nuw flags. We can no longer guarantee that we
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@ -91,6 +91,7 @@ using namespace llvm;

 STATISTIC(NumTailCalls, "Number of tail calls");
 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
+STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");

 static cl::opt<bool>
 EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
@ -105,6 +106,12 @@ cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
    cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
    cl::init(false));

+static cl::opt<bool>
+EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
+                         cl::desc("Enable AArch64 logical imm instruction "
+                                  "optimization"),
+                         cl::init(true));
+
 /// Value type used for condition codes.
 static const MVT MVT_CC = MVT::i32;

@ -787,6 +794,140 @@ EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
  return VT.changeVectorElementTypeToInteger();
 }

+static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
+                               const APInt &Demanded,
+                               TargetLowering::TargetLoweringOpt &TLO,
+                               unsigned NewOpc) {
+  uint64_t OldImm = Imm, NewImm, Enc;
+  uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
+
+  // Return if the immediate is already all zeros, all ones, a bimm32 or a
+  // bimm64.
+  if (Imm == 0 || Imm == Mask ||
+      AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
+    return false;
+
+  unsigned EltSize = Size;
+  uint64_t DemandedBits = Demanded.getZExtValue();
+
+  // Clear bits that are not demanded.
+  Imm &= DemandedBits;
+
+  while (true) {
+    // The goal here is to set the non-demanded bits in a way that minimizes
+    // the number of switching between 0 and 1. In order to achieve this goal,
+    // we set the non-demanded bits to the value of the preceding demanded bits.
+    // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
+    // non-demanded bit), we copy bit0 (1) to the least significant 'x',
+    // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
+    // The final result is 0b11000011.
+    uint64_t NonDemandedBits = ~DemandedBits;
+    uint64_t InvertedImm = ~Imm & DemandedBits;
+    uint64_t RotatedImm =
+        ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
+        NonDemandedBits;
+    uint64_t Sum = RotatedImm + NonDemandedBits;
+    bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
+    uint64_t Ones = (Sum + Carry) & NonDemandedBits;
+    NewImm = (Imm | Ones) & Mask;
+
+    // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
+    // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
+    // we halve the element size and continue the search.
+    if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
+      break;
+
+    // We cannot shrink the element size any further if it is 2-bits.
+    if (EltSize == 2)
+      return false;
+
+    EltSize /= 2;
+    Mask >>= EltSize;
+    uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
+
+    // Return if there is mismatch in any of the demanded bits of Imm and Hi.
+    if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
+      return false;
+
+    // Merge the upper and lower halves of Imm and DemandedBits.
+    Imm |= Hi;
+    DemandedBits |= DemandedBitsHi;
+  }
+
+  ++NumOptimizedImms;
+
+  // Replicate the element across the register width.
+  while (EltSize < Size) {
+    NewImm |= NewImm << EltSize;
+    EltSize *= 2;
+  }
+
+  (void)OldImm;
+  assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
+         "demanded bits should never be altered");
+  assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
+
+  // Create the new constant immediate node.
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+
+  // If the new constant immediate is all-zeros or all-ones, let the target
+  // independent DAG combine optimize this node.
+  if (NewImm == 0 || NewImm == OrigMask)
+    return TLO.CombineTo(Op.getOperand(1), TLO.DAG.getConstant(NewImm, DL, VT));
+
+  // Otherwise, create a machine node so that target independent DAG combine
+  // doesn't undo this optimization.
+  Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
+  SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
+  SDValue New(
+      TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
+
+  return TLO.CombineTo(Op, New);
+}
+
+bool AArch64TargetLowering::targetShrinkDemandedConstant(
+    SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const {
+  // Delay this optimization to as late as possible.
+  if (!TLO.LegalOps)
+    return false;
+
+  if (!EnableOptimizeLogicalImm)
+    return false;
+
+  EVT VT = Op.getValueType();
+  if (VT.isVector())
+    return false;
+
+  unsigned Size = VT.getSizeInBits();
+  assert((Size == 32 || Size == 64) &&
+         "i32 or i64 is expected after legalization.");
+
+  // Exit early if we demand all bits.
+  if (Demanded.countPopulation() == Size)
+    return false;
+
+  unsigned NewOpc;
+  switch (Op.getOpcode()) {
+  default:
+    return false;
+  case ISD::AND:
+    NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
+    break;
+  case ISD::OR:
+    NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
+    break;
+  case ISD::XOR:
+    NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
+    break;
+  }
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+  if (!C)
+    return false;
+  uint64_t Imm = C->getZExtValue();
+  return optimizeLogicalImm(Op, Size, Imm, Demanded, TLO, NewOpc);
+}
+
 /// computeKnownBitsForTargetNode - Determine which of the bits specified in
 /// Mask are known to be either zero or one and return them in the
 /// KnownZero/KnownOne bitsets.
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@ -255,6 +255,9 @@ public:
                                     const SelectionDAG &DAG,
                                     unsigned Depth = 0) const override;

+  bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
+                                    TargetLoweringOpt &TLO) const override;
+
  MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override;

  /// Returns true if the target allows unaligned memory accesses of the
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@ -2315,12 +2315,13 @@ static bool simplifyI24(SDNode *Node24, unsigned OpIdx,

  SelectionDAG &DAG = DCI.DAG;
  SDValue Op = Node24->getOperand(OpIdx);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT VT = Op.getValueType();

  APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24);
  APInt KnownZero, KnownOne;
  TargetLowering::TargetLoweringOpt TLO(DAG, true, true);
-  if (TLO.SimplifyDemandedBits(Node24, OpIdx, Demanded, DCI))
+  if (TLI.SimplifyDemandedBits(Node24, OpIdx, Demanded, DCI, TLO))
    return true;

  return false;
@ -3361,7 +3362,7 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
      TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                            !DCI.isBeforeLegalizeOps());
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-      if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) ||
+      if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
          TLI.SimplifyDemandedBits(BitsFrom, Demanded,
                                   KnownZero, KnownOne, TLO)) {
        DCI.CommitTargetLoweringOpt(TLO);
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@ -4696,7 +4696,7 @@ SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                        !DCI.isBeforeLegalizeOps());
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  if (TLO.ShrinkDemandedConstant(Src, Demanded) ||
+  if (TLI.ShrinkDemandedConstant(Src, Demanded, TLO) ||
      TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) {
    DCI.CommitTargetLoweringOpt(TLO);
  }
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@ -30207,7 +30207,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
    APInt KnownZero, KnownOne;
    TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
                                          DCI.isBeforeLegalizeOps());
-    if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
+    if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) ||
        TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
                                 TLO)) {
      // If we changed the computation somewhere in the DAG, this change will
@ -33777,7 +33777,7 @@ static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                          !DCI.isBeforeLegalizeOps());
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-    if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
+    if (TLI.ShrinkDemandedConstant(Op1, DemandedMask, TLO) ||
        TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
      DCI.CommitTargetLoweringOpt(TLO);
  }
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@ -1605,7 +1605,7 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
        TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                              !DCI.isBeforeLegalizeOps());
        const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-        if (TLO.ShrinkDemandedConstant(OutVal, DemandedMask) ||
+        if (TLI.ShrinkDemandedConstant(OutVal, DemandedMask, TLO) ||
            TLI.SimplifyDemandedBits(OutVal, DemandedMask, KnownZero, KnownOne,
                                     TLO))
          DCI.CommitTargetLoweringOpt(TLO);
@ -1622,7 +1622,7 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
        TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                              !DCI.isBeforeLegalizeOps());
        const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-        if (TLO.ShrinkDemandedConstant(Time, DemandedMask) ||
+        if (TLI.ShrinkDemandedConstant(Time, DemandedMask, TLO) ||
            TLI.SimplifyDemandedBits(Time, DemandedMask, KnownZero, KnownOne,
                                     TLO))
          DCI.CommitTargetLoweringOpt(TLO);
--- a/test/CodeGen/AArch64/optimize-imm.ll
+++ b/test/CodeGen/AArch64/optimize-imm.ll
@ -0,0 +1,64 @@
+; RUN: llc -o - %s -mtriple=aarch64-- | FileCheck %s
+
+; CHECK-LABEL: and1:
+; CHECK: and {{w[0-9]+}}, w0, #0xfffffffd
+
+define void @and1(i32 %a, i8* nocapture %p) {
+entry:
+  %and = and i32 %a, 253
+  %conv = trunc i32 %and to i8
+  store i8 %conv, i8* %p, align 1
+  ret void
+}
+
+; (a & 0x3dfd) | 0xffffc000
+;
+; CHECK-LABEL: and2:
+; CHECK: and {{w[0-9]+}}, w0, #0xfdfdfdfd
+
+define i32 @and2(i32 %a) {
+entry:
+  %and = and i32 %a, 15869
+  %or = or i32 %and, -16384
+  ret i32 %or
+}
+
+; (a & 0x19) | 0xffffffc0
+;
+; CHECK-LABEL: and3:
+; CHECK: and {{w[0-9]+}}, w0, #0x99999999
+
+define i32 @and3(i32 %a) {
+entry:
+  %and = and i32 %a, 25
+  %or = or i32 %and, -64
+  ret i32 %or
+}
+
+; (a & 0xc5600) | 0xfff1f1ff
+;
+; CHECK-LABEL: and4:
+; CHECK: and {{w[0-9]+}}, w0, #0xfffc07ff
+
+define i32 @and4(i32 %a) {
+entry:
+  %and = and i32 %a, 787968
+  %or = or i32 %and, -921089
+  ret i32 %or
+}
+
+; Make sure we don't shrink or optimize an XOR's immediate operand if the
+; immediate is -1. Instruction selection turns (and ((xor $mask, -1), $v0)) into
+; a BIC.
+
+; CHECK-LABEL: xor1:
+; CHECK: orr [[R0:w[0-9]+]], wzr, #0x38
+; CHECK: bic {{w[0-9]+}}, [[R0]], w0, lsl #3
+
+define i32 @xor1(i32 %a) {
+entry:
+  %shl = shl i32 %a, 3
+  %xor = and i32 %shl, 56
+  %and = xor i32 %xor, 56
+  ret i32 %and
+}