[X86] Add an override of targetShrinkDemandedConstant to limit the damage that shrinkdemandedbits can do to zext_in_reg operations

Summary: This patch adds an implementation of targetShrinkDemandedConstant that tries to keep shrinkdemandedbits from removing bits that would otherwise have been recognized as a movzx. We still need a follow patch to stop moving ands across srl if the and could be represented as a movzx before the shift but not after. I think this should help with some of the cases that D42088 ended up removing during isel. Reviewers: spatel, RKSimon Reviewed By: spatel Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D42265 llvm-svn: 323048
2025-03-02 17:29:03 +00:00 · 2018-01-20 18:50:09 +00:00 · 2018-01-20 18:50:09 +00:00 · 293879ad3d
commit 293879ad3d
parent 6320b01ed3
7 changed files with 88 additions and 26 deletions
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@ -27884,6 +27884,65 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
 //                           X86 Optimization Hooks
 //===----------------------------------------------------------------------===//

+bool
+X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
+                                                const APInt &Demanded,
+                                                TargetLoweringOpt &TLO) const {
+  // Only optimize Ands to prevent shrinking a constant that could be
+  // matched by movzx.
+  if (Op.getOpcode() != ISD::AND)
+    return false;
+
+  EVT VT = Op.getValueType();
+
+  // Ignore vectors.
+  if (VT.isVector())
+    return false;
+
+  unsigned Size = VT.getSizeInBits();
+
+  // Make sure the RHS really is a constant.
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+  if (!C)
+    return false;
+
+  const APInt &Mask = C->getAPIntValue();
+
+  // Clear all non-demanded bits initially.
+  APInt ShrunkMask = Mask & Demanded;
+
+  // Find the width of the shrunk mask.
+  unsigned Width = ShrunkMask.getActiveBits();
+
+  // If the mask is all 0s there's nothing to do here.
+  if (Width == 0)
+    return false;
+
+  // Find the next power of 2 width, rounding up to a byte.
+  Width = PowerOf2Ceil(std::max(Width, 8U));
+  // Truncate the width to size to handle illegal types.
+  Width = std::min(Width, Size);
+
+  // Calculate a possible zero extend mask for this constant.
+  APInt ZeroExtendMask = APInt::getLowBitsSet(Size, Width);
+
+  // If we aren't changing the mask, just return true to keep it and prevent
+  // the caller from optimizing.
+  if (ZeroExtendMask == Mask)
+    return true;
+
+  // Make sure the bits in the ZeroExtendMask are also set in the original mask.
+  // TODO: We should be able to set bits that aren't demanded too.
+  if (!ZeroExtendMask.isSubsetOf(Mask))
+    return false;
+
+  // Replace the constant with the zero extend mask.
+  SDLoc DL(Op);
+  SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
+  SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
+  return TLO.CombineTo(Op, NewOp);
+}
+
 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                      KnownBits &Known,
                                                      const APInt &DemandedElts,
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@ -835,6 +835,9 @@ namespace llvm {
    EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
                           EVT VT) const override;

+    bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
+                                      TargetLoweringOpt &TLO) const override;
+
    /// Determine which of the bits specified in Mask are known to be either
    /// zero or one and return them in the KnownZero/KnownOne bitsets.
    void computeKnownBitsForTargetNode(const SDValue Op,
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@ -1514,6 +1514,10 @@ def : Pat<(i8 (trunc GR16:$src)),
          (EXTRACT_SUBREG GR16:$src, sub_8bit)>,
      Requires<[In64BitMode]>;

+def immff00_ffff  : ImmLeaf<i32, [{
+  return Imm >= 0xff00 && Imm <= 0xffff;
+}]>;
+
 // h-register tricks
 def : Pat<(i8 (trunc (srl_su GR16:$src, (i8 8)))),
          (EXTRACT_SUBREG GR16:$src, sub_8bit_hi)>,
@ -1534,7 +1538,7 @@ def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))),
          (MOVZX32_NOREXrr8 (EXTRACT_SUBREG GR16:$src, sub_8bit_hi))>;
 def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
          (MOVZX32_NOREXrr8 (EXTRACT_SUBREG GR32:$src, sub_8bit_hi))>;
-def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)),
+def : Pat<(srl (and_su GR32:$src, immff00_ffff), (i8 8)),
          (MOVZX32_NOREXrr8 (EXTRACT_SUBREG GR32:$src, sub_8bit_hi))>;

 // h-register tricks.
--- a/test/CodeGen/X86/3addr-or.ll
+++ b/test/CodeGen/X86/3addr-or.ll
@ -14,16 +14,18 @@ define i32 @test1(i32 %x) nounwind ssp {
  ret i32 %t1
 }

+; This test no longer requires or to be converted to 3 addr form because we are
+; are able to use a zero extend instead of an 'and' which gives the register
+; allocator freedom.
 define i64 @test2(i8 %A, i8 %B) nounwind {
 ; CHECK-LABEL: test2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def %esi killed %esi def %rsi
 ; CHECK-NEXT:    # kill: def %edi killed %edi def %rdi
 ; CHECK-NEXT:    shll $4, %edi
 ; CHECK-NEXT:    andl $48, %edi
-; CHECK-NEXT:    andl $240, %esi
-; CHECK-NEXT:    shrq $4, %rsi
-; CHECK-NEXT:    leaq (%rsi,%rdi), %rax
+; CHECK-NEXT:    movzbl %sil, %eax
+; CHECK-NEXT:    shrq $4, %rax
+; CHECK-NEXT:    orq %rdi, %rax
 ; CHECK-NEXT:    retq
  %C = zext i8 %A to i64
  %D = shl i64 %C, 4
--- a/test/CodeGen/X86/popcnt.ll
+++ b/test/CodeGen/X86/popcnt.ll
@ -71,7 +71,6 @@ define i16 @cnt16(i16 %x) nounwind readnone {
 ; X32-NEXT:    andl $13107, %eax # imm = 0x3333
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    andl $-16, %ecx
 ; X32-NEXT:    shrl $4, %ecx
 ; X32-NEXT:    addl %eax, %ecx
 ; X32-NEXT:    andl $3855, %ecx # imm = 0xF0F
@ -94,7 +93,6 @@ define i16 @cnt16(i16 %x) nounwind readnone {
 ; X64-NEXT:    andl $13107, %edi # imm = 0x3333
 ; X64-NEXT:    addl %eax, %edi
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    andl $-16, %eax
 ; X64-NEXT:    shrl $4, %eax
 ; X64-NEXT:    addl %edi, %eax
 ; X64-NEXT:    andl $3855, %eax # imm = 0xF0F
--- a/test/CodeGen/X86/pr21792.ll
+++ b/test/CodeGen/X86/pr21792.ll
@ -12,19 +12,18 @@ define void @func(<4 x float> %vx) {
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    pextrq $1, %xmm0, %rdx
-; CHECK-NEXT:    movq %rdx, %rcx
-; CHECK-NEXT:    shrq $32, %rcx
-; CHECK-NEXT:    movq %xmm0, %rax
-; CHECK-NEXT:    movq %rax, %r9
+; CHECK-NEXT:    pextrq $1, %xmm0, %rax
+; CHECK-NEXT:    movzwl %ax, %ecx
+; CHECK-NEXT:    shrq $32, %rax
+; CHECK-NEXT:    movq %xmm0, %rdx
+; CHECK-NEXT:    movzwl %dx, %r8d
+; CHECK-NEXT:    movq %rdx, %r9
 ; CHECK-NEXT:    shrq $32, %r9
-; CHECK-NEXT:    andl $2032, %eax # imm = 0x7F0
-; CHECK-NEXT:    leaq stuff(%rax), %rdi
+; CHECK-NEXT:    leaq stuff(%r8), %rdi
 ; CHECK-NEXT:    leaq stuff(%r9), %rsi
-; CHECK-NEXT:    andl $2032, %edx # imm = 0x7F0
-; CHECK-NEXT:    leaq stuff(%rdx), %rdx
-; CHECK-NEXT:    leaq stuff(%rcx), %rcx
-; CHECK-NEXT:    leaq stuff+8(%rax), %r8
+; CHECK-NEXT:    leaq stuff(%rcx), %rdx
+; CHECK-NEXT:    leaq stuff(%rax), %rcx
+; CHECK-NEXT:    leaq stuff+8(%r8), %r8
 ; CHECK-NEXT:    leaq stuff+8(%r9), %r9
 ; CHECK-NEXT:    callq toto
 ; CHECK-NEXT:    popq %rax
--- a/test/CodeGen/X86/zext-demanded.ll
+++ b/test/CodeGen/X86/zext-demanded.ll
@ -5,25 +5,22 @@
 ; demanded bits shortcomings.

 ; The backend will insert a zext to promote the shift to i32.
-; TODO: we should be able to use movzx here.
 define i16 @test1(i16 %x) {
 ; CHECK-LABEL: test1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andl $65534, %edi # imm = 0xFFFE
-; CHECK-NEXT:    shrl %edi
-; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    movzwl %di, %eax
+; CHECK-NEXT:    shrl %eax
+; CHECK-NEXT:    # kill: def %ax killed %ax killed %eax
 ; CHECK-NEXT:    retq
  %y = lshr i16 %x, 1
  ret i16 %y
 }

-; TODO: we should be able to use movzx here.
 define i32 @test2(i32 %x) {
 ; CHECK-LABEL: test2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andl $65534, %edi # imm = 0xFFFE
-; CHECK-NEXT:    shrl %edi
-; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    movzwl %di, %eax
+; CHECK-NEXT:    shrl %eax
 ; CHECK-NEXT:    retq
  %y = and i32 %x, 65535
  %z = lshr i32 %y, 1