[DAG] Don't permit EXTLOAD when combining FSHL/FSHR consecutive loads (PR45265)

Technically we can permit EXTLOAD of the LHS operand, but only if all the extended bits are shifted out. Until we have test coverage for that case, I'm just disabling this to fix PR45265.
This commit is contained in:
Simon Pilgrim 2020-03-21 10:33:53 +00:00
parent c6d799156a
commit c5fd9e3888
2 changed files with 67 additions and 1 deletions
llvm
lib/CodeGen/SelectionDAG
test/CodeGen/X86

@@ -8325,13 +8325,15 @@ SDValue DAGCombiner::visitFunnelShift(SDNode *N) {
// fold (fshr ld1, ld0, c) -> (ld0[ofs]) iff ld0 and ld1 are consecutive.
// TODO - bigendian support once we have test coverage.
// TODO - can we merge this with CombineConseutiveLoads/MatchLoadCombine?
// TODO - permit LHS EXTLOAD if extensions are shifted out.
if ((BitWidth % 8) == 0 && (ShAmt % 8) == 0 && !VT.isVector() &&
!DAG.getDataLayout().isBigEndian()) {
auto *LHS = dyn_cast<LoadSDNode>(N0);
auto *RHS = dyn_cast<LoadSDNode>(N1);
if (LHS && RHS && LHS->isSimple() && RHS->isSimple() &&
LHS->getAddressSpace() == RHS->getAddressSpace() &&
(LHS->hasOneUse() || RHS->hasOneUse()) && ISD::isNON_EXTLoad(RHS)) {
(LHS->hasOneUse() || RHS->hasOneUse()) && ISD::isNON_EXTLoad(RHS) &&
ISD::isNON_EXTLoad(LHS)) {
if (DAG.areNonVolatileConsecutiveLoads(LHS, RHS, BitWidth / 8, 1)) {
SDLoc DL(RHS);
uint64_t PtrOff =

@@ -918,3 +918,67 @@ define <4 x i32> @fshr_v4i32_shift_by_bitwidth(<4 x i32> %x, <4 x i32> %y) nounw
ret <4 x i32> %f
}
%struct.S = type { [11 x i8], i8 }
; Regression test for PR45265: an i88 load + ashr 40 + trunc lowers to a
; funnel shift across two loads, where the high 3-byte part is assembled
; from a zero-extended i16 load plus a SIGN-extended i8 load (see the
; movsbl/movsbq in the checks below).  The DAG combine that folds
; consecutive fshl/fshr loads into a single load must therefore reject
; the extending LHS load here -- its extended bits are not all shifted
; out, so folding would miscompile.  Checks were presumably generated by
; update_llc_test_checks.py -- regenerate rather than hand-edit.
define void @PR45265(i32 %0, %struct.S* nocapture readonly %1) nounwind {
; X32-SSE2-LABEL: PR45265:
; X32-SSE2:       # %bb.0:
; X32-SSE2-NEXT:    pushl %edi
; X32-SSE2-NEXT:    pushl %esi
; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-SSE2-NEXT:    leal (%eax,%eax,2), %edx
; X32-SSE2-NEXT:    movzwl 8(%ecx,%edx,4), %esi
; X32-SSE2-NEXT:    movsbl 10(%ecx,%edx,4), %edi
; X32-SSE2-NEXT:    shll $16, %edi
; X32-SSE2-NEXT:    orl %edi, %esi
; X32-SSE2-NEXT:    movl 4(%ecx,%edx,4), %ecx
; X32-SSE2-NEXT:    shrdl $8, %esi, %ecx
; X32-SSE2-NEXT:    xorl %eax, %ecx
; X32-SSE2-NEXT:    sarl $31, %eax
; X32-SSE2-NEXT:    sarl $31, %edi
; X32-SSE2-NEXT:    shldl $24, %esi, %edi
; X32-SSE2-NEXT:    xorl %eax, %edi
; X32-SSE2-NEXT:    orl %edi, %ecx
; X32-SSE2-NEXT:    jne .LBB44_1
; X32-SSE2-NEXT:  # %bb.2:
; X32-SSE2-NEXT:    popl %esi
; X32-SSE2-NEXT:    popl %edi
; X32-SSE2-NEXT:    jmp _Z3foov # TAILCALL
; X32-SSE2-NEXT:  .LBB44_1:
; X32-SSE2-NEXT:    popl %esi
; X32-SSE2-NEXT:    popl %edi
; X32-SSE2-NEXT:    retl
;
; X64-AVX2-LABEL: PR45265:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    movslq %edi, %rax
; X64-AVX2-NEXT:    leaq (%rax,%rax,2), %rcx
; X64-AVX2-NEXT:    movsbq 10(%rsi,%rcx,4), %rdx
; X64-AVX2-NEXT:    shlq $16, %rdx
; X64-AVX2-NEXT:    movzwl 8(%rsi,%rcx,4), %edi
; X64-AVX2-NEXT:    orq %rdx, %rdi
; X64-AVX2-NEXT:    movq (%rsi,%rcx,4), %rcx
; X64-AVX2-NEXT:    shrdq $40, %rdi, %rcx
; X64-AVX2-NEXT:    cmpq %rax, %rcx
; X64-AVX2-NEXT:    jne .LBB44_1
; X64-AVX2-NEXT:  # %bb.2:
; X64-AVX2-NEXT:    jmp _Z3foov # TAILCALL
; X64-AVX2-NEXT:  .LBB44_1:
; X64-AVX2-NEXT:    retq
; Index the 12-byte %struct.S array and reinterpret the first 11 bytes
; as a single unaligned i88 load.
%3 = sext i32 %0 to i64
%4 = getelementptr inbounds %struct.S, %struct.S* %1, i64 %3
%5 = bitcast %struct.S* %4 to i88*
%6 = load i88, i88* %5, align 1
; Arithmetic shift by 40 then trunc to i64: keeps bits [40,104) of the
; sign-extended i88, i.e. the upper bytes supplied by the extending load.
%7 = ashr i88 %6, 40
%8 = trunc i88 %7 to i64
%9 = icmp eq i64 %8, %3
br i1 %9, label %10, label %11
10:
tail call void @_Z3foov()
br label %11
11:
ret void
}
declare dso_local void @_Z3foov()