// Patch summary (unified diff, two files touched). This preamble precedes the
// first "diff --git" header and is not part of any hunk.
//
// NOTE(review): this copy of the patch has lost its original line breaks (all
// hunk lines are run together on two physical lines), so it will not apply
// as-is; restore one hunk line per physical line before use.
//
// 1) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
//    Hunk "@@ -8325,13 +8325,15 @@" inside DAGCombiner::visitFunnelShift.
//    - The fold "(fshr ld1, ld0, c) -> (ld0[ofs]) iff ld0 and ld1 are
//      consecutive" used to require ISD::isNON_EXTLoad(RHS) only; the patch
//      additionally requires ISD::isNON_EXTLoad(LHS).
//    - Adds the comment "TODO - permit LHS EXTLOAD if extensions are shifted
//      out."
//    - NOTE(review): "dyn_cast(N0)" / "dyn_cast(N1)" carry no template
//      argument in this copy; presumably "dyn_cast<LoadSDNode>(...)" whose
//      angle-bracket text was stripped - confirm against the upstream file.
//    - The context-line spelling "CombineConseutiveLoads" is kept verbatim
//      because context lines have to match the file being patched.
//
// 2) llvm/test/CodeGen/X86/funnel-shift.ll
//    Hunk "@@ -918,3 +918,67 @@" appends a new test after
//    @fshr_v4i32_shift_by_bitwidth.
//    - Adds "%struct.S = type { [11 x i8], i8 }", the function @PR45265 and a
//      declaration of @_Z3foov.
//    - @PR45265 loads an i88 (align 1) from the %struct.S element indexed by
//      the sign-extended i32 argument, computes "ashr 40", truncates to i64,
//      compares with the sign-extended index and tail-calls @_Z3foov on
//      equality.
//    - Expectations are given for the X32-SSE2 and X64-AVX2 prefixes. Both
//      keep a sign-extending byte load at offset 10 (movsbl / movsbq) and a
//      double-shift instruction (shrdl $8 plus shldl $24 on X32, shrdq $40
//      on X64).
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 7a68d18ef9b5..a6537a0bf97e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -8325,13 +8325,15 @@ SDValue DAGCombiner::visitFunnelShift(SDNode *N) { // fold (fshr ld1, ld0, c) -> (ld0[ofs]) iff ld0 and ld1 are consecutive. // TODO - bigendian support once we have test coverage. // TODO - can we merge this with CombineConseutiveLoads/MatchLoadCombine? + // TODO - permit LHS EXTLOAD if extensions are shifted out. if ((BitWidth % 8) == 0 && (ShAmt % 8) == 0 && !VT.isVector() && !DAG.getDataLayout().isBigEndian()) { auto *LHS = dyn_cast(N0); auto *RHS = dyn_cast(N1); if (LHS && RHS && LHS->isSimple() && RHS->isSimple() && LHS->getAddressSpace() == RHS->getAddressSpace() && - (LHS->hasOneUse() || RHS->hasOneUse()) && ISD::isNON_EXTLoad(RHS)) { + (LHS->hasOneUse() || RHS->hasOneUse()) && ISD::isNON_EXTLoad(RHS) && + ISD::isNON_EXTLoad(LHS)) { if (DAG.areNonVolatileConsecutiveLoads(LHS, RHS, BitWidth / 8, 1)) { SDLoc DL(RHS); uint64_t PtrOff = diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll index 517880fb88e5..f78fe2c00eb3 100644 --- a/llvm/test/CodeGen/X86/funnel-shift.ll +++ b/llvm/test/CodeGen/X86/funnel-shift.ll @@ -918,3 +918,67 @@ define <4 x i32> @fshr_v4i32_shift_by_bitwidth(<4 x i32> %x, <4 x i32> %y) nounw ret <4 x i32> %f } +%struct.S = type { [11 x i8], i8 } +define void @PR45265(i32 %0, %struct.S* nocapture readonly %1) nounwind { +; X32-SSE2-LABEL: PR45265: +; X32-SSE2: # %bb.0: +; X32-SSE2-NEXT: pushl %edi +; X32-SSE2-NEXT: pushl %esi +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-SSE2-NEXT: leal (%eax,%eax,2), %edx +; X32-SSE2-NEXT: movzwl 8(%ecx,%edx,4), %esi +; X32-SSE2-NEXT: movsbl 10(%ecx,%edx,4), %edi +; X32-SSE2-NEXT: shll $16, %edi +; X32-SSE2-NEXT: orl %edi, %esi +; X32-SSE2-NEXT: 
movl 4(%ecx,%edx,4), %ecx +; X32-SSE2-NEXT: shrdl $8, %esi, %ecx +; X32-SSE2-NEXT: xorl %eax, %ecx +; X32-SSE2-NEXT: sarl $31, %eax +; X32-SSE2-NEXT: sarl $31, %edi +; X32-SSE2-NEXT: shldl $24, %esi, %edi +; X32-SSE2-NEXT: xorl %eax, %edi +; X32-SSE2-NEXT: orl %edi, %ecx +; X32-SSE2-NEXT: jne .LBB44_1 +; X32-SSE2-NEXT: # %bb.2: +; X32-SSE2-NEXT: popl %esi +; X32-SSE2-NEXT: popl %edi +; X32-SSE2-NEXT: jmp _Z3foov # TAILCALL +; X32-SSE2-NEXT: .LBB44_1: +; X32-SSE2-NEXT: popl %esi +; X32-SSE2-NEXT: popl %edi +; X32-SSE2-NEXT: retl +; +; X64-AVX2-LABEL: PR45265: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: movslq %edi, %rax +; X64-AVX2-NEXT: leaq (%rax,%rax,2), %rcx +; X64-AVX2-NEXT: movsbq 10(%rsi,%rcx,4), %rdx +; X64-AVX2-NEXT: shlq $16, %rdx +; X64-AVX2-NEXT: movzwl 8(%rsi,%rcx,4), %edi +; X64-AVX2-NEXT: orq %rdx, %rdi +; X64-AVX2-NEXT: movq (%rsi,%rcx,4), %rcx +; X64-AVX2-NEXT: shrdq $40, %rdi, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: jne .LBB44_1 +; X64-AVX2-NEXT: # %bb.2: +; X64-AVX2-NEXT: jmp _Z3foov # TAILCALL +; X64-AVX2-NEXT: .LBB44_1: +; X64-AVX2-NEXT: retq + %3 = sext i32 %0 to i64 + %4 = getelementptr inbounds %struct.S, %struct.S* %1, i64 %3 + %5 = bitcast %struct.S* %4 to i88* + %6 = load i88, i88* %5, align 1 + %7 = ashr i88 %6, 40 + %8 = trunc i88 %7 to i64 + %9 = icmp eq i64 %8, %3 + br i1 %9, label %10, label %11 + +10: + tail call void @_Z3foov() + br label %11 + +11: + ret void +} +declare dso_local void @_Z3foov()