From 55a060641fb07cbf7dfc93d2ae620b27b73c5ad3 Mon Sep 17 00:00:00 2001 From: Ahmed Bougacha Date: Mon, 9 Mar 2015 22:51:05 +0000 Subject: [PATCH] [CodeGen] Replace the reused stores' chain for extractelt expansion. This fixes a subtle issue that was introduced in r205153. When reusing a store for the extractelement expansion (to load directly from it, instead of going through the stack), later stores to the same location might have overwritten the data we were expecting to extract from. To fix that, we need to explicitly replace the chain going out of the reused store, so that later stores also have an explicit dependency on the generated element-extracting loads, and can't clobber them. rdar://20066785 Differential Revision: http://reviews.llvm.org/D8180 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@231721 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 26 +- ...ractelement-legalization-store-ordering.ll | 57 ++++ test/CodeGen/X86/vector-idiv.ll | 250 +++++++++--------- 3 files changed, 205 insertions(+), 128 deletions(-) create mode 100644 test/CodeGen/X86/extractelement-legalization-store-ordering.ll diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 3dd73155488..ece38f33a8c 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -1442,13 +1442,27 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) { Idx = DAG.getZExtOrTrunc(Idx, dl, TLI.getPointerTy()); StackPtr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx, StackPtr); + SDValue NewLoad; + if (Op.getValueType().isVector()) - return DAG.getLoad(Op.getValueType(), dl, Ch, StackPtr,MachinePointerInfo(), - false, false, false, 0); - return DAG.getExtLoad(ISD::EXTLOAD, dl, Op.getValueType(), Ch, StackPtr, - MachinePointerInfo(), - Vec.getValueType().getVectorElementType(), - false, false, false, 0); + NewLoad = 
DAG.getLoad(Op.getValueType(), dl, Ch, StackPtr, + MachinePointerInfo(), false, false, false, 0); + else + NewLoad = DAG.getExtLoad( + ISD::EXTLOAD, dl, Op.getValueType(), Ch, StackPtr, MachinePointerInfo(), + Vec.getValueType().getVectorElementType(), false, false, false, 0); + + // Replace the chain going out of the store, by the one out of the load. + DAG.ReplaceAllUsesOfValueWith(Ch, SDValue(NewLoad.getNode(), 1)); + + // We introduced a cycle though, so update the loads operands, making sure + // to use the original store's chain as an incoming chain. + SmallVector NewLoadOperands(NewLoad->op_begin(), + NewLoad->op_end()); + NewLoadOperands[0] = Ch; + NewLoad = + SDValue(DAG.UpdateNodeOperands(NewLoad.getNode(), NewLoadOperands), 0); + return NewLoad; } SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) { diff --git a/test/CodeGen/X86/extractelement-legalization-store-ordering.ll b/test/CodeGen/X86/extractelement-legalization-store-ordering.ll new file mode 100644 index 00000000000..946516c8a46 --- /dev/null +++ b/test/CodeGen/X86/extractelement-legalization-store-ordering.ll @@ -0,0 +1,57 @@ +; RUN: llc < %s -mtriple i386-apple-darwin -mcpu=yonah | FileCheck %s + +target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" + +; Make sure we don't break load/store ordering when turning an extractelement +; into loads, off the stack or a previous store. +; Be very explicit about the ordering/stack offsets. 
+ +; CHECK-LABEL: test_extractelement_legalization_storereuse: +; CHECK: # BB#0 +; CHECK-NEXT: pushl %ebx +; CHECK-NEXT: pushl %edi +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: movl 16(%esp), %eax +; CHECK-NEXT: movl 24(%esp), %ecx +; CHECK-NEXT: movl 20(%esp), %edx +; CHECK-NEXT: paddd (%edx), %xmm0 +; CHECK-NEXT: movdqa %xmm0, (%edx) +; CHECK-NEXT: shll $4, %ecx +; CHECK-NEXT: movl (%ecx,%edx), %esi +; CHECK-NEXT: movl 12(%ecx,%edx), %edi +; CHECK-NEXT: movl 8(%ecx,%edx), %ebx +; CHECK-NEXT: movl 4(%ecx,%edx), %edx +; CHECK-NEXT: movl %esi, 12(%eax,%ecx) +; CHECK-NEXT: movl %edx, (%eax,%ecx) +; CHECK-NEXT: movl %ebx, 8(%eax,%ecx) +; CHECK-NEXT: movl %edi, 4(%eax,%ecx) +; CHECK-NEXT: popl %esi +; CHECK-NEXT: popl %edi +; CHECK-NEXT: popl %ebx +; CHECK-NEXT: retl +define void @test_extractelement_legalization_storereuse(<4 x i32> %a, i32* nocapture %x, i32* nocapture readonly %y, i32 %i) #0 { +entry: + %0 = bitcast i32* %y to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 16 + %am = add <4 x i32> %a, %1 + store <4 x i32> %am, <4 x i32>* %0, align 16 + %ip0 = shl nsw i32 %i, 2 + %ip1 = or i32 %ip0, 1 + %ip2 = or i32 %ip0, 2 + %ip3 = or i32 %ip0, 3 + %vecext = extractelement <4 x i32> %am, i32 %ip0 + %arrayidx = getelementptr inbounds i32, i32* %x, i32 %ip3 + store i32 %vecext, i32* %arrayidx, align 4 + %vecext5 = extractelement <4 x i32> %am, i32 %ip1 + %arrayidx8 = getelementptr inbounds i32, i32* %x, i32 %ip0 + store i32 %vecext5, i32* %arrayidx8, align 4 + %vecext11 = extractelement <4 x i32> %am, i32 %ip2 + %arrayidx14 = getelementptr inbounds i32, i32* %x, i32 %ip2 + store i32 %vecext11, i32* %arrayidx14, align 4 + %vecext17 = extractelement <4 x i32> %am, i32 %ip3 + %arrayidx20 = getelementptr inbounds i32, i32* %x, i32 %ip1 + store i32 %vecext17, i32* %arrayidx20, align 4 + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/X86/vector-idiv.ll b/test/CodeGen/X86/vector-idiv.ll index b95da1304a7..2e482a0f143 100644 --- 
a/test/CodeGen/X86/vector-idiv.ll +++ b/test/CodeGen/X86/vector-idiv.ll @@ -460,6 +460,9 @@ define <16 x i8> @test7(<16 x i8> %a) #0 { ; ; SSE-LABEL: test7: ; SSE: # BB#0: +; SSE-NEXT: pushq %rbp +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %rbx ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; SSE-NEXT: imull $-109, %eax, %ecx @@ -471,156 +474,156 @@ define <16 x i8> @test7(<16 x i8> %a) #0 { ; SSE-NEXT: addb %al, %cl ; SSE-NEXT: movzbl %cl, %eax ; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %r14d +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %edx +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %r9d ; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax -; SSE-NEXT: imull $-109, %eax, %ecx -; SSE-NEXT: shrl $8, %ecx -; SSE-NEXT: addb %al, %cl -; SSE-NEXT: movb %cl, %al -; SSE-NEXT: shrb $7, %al -; SSE-NEXT: sarb $2, %cl -; SSE-NEXT: addb %al, %cl -; SSE-NEXT: movzbl %cl, %eax -; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %r11d +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %ecx +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %r8d +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %esi +; SSE-NEXT: imull $-109, %esi, %edi +; SSE-NEXT: shrl $8, %edi +; SSE-NEXT: addb %sil, %dil +; SSE-NEXT: movb %dil, %bl +; SSE-NEXT: shrb $7, %bl +; SSE-NEXT: sarb $2, %dil +; SSE-NEXT: addb %bl, %dil +; SSE-NEXT: movzbl %dil, %esi +; SSE-NEXT: movd %esi, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax -; SSE-NEXT: imull $-109, %eax, %ecx -; SSE-NEXT: shrl $8, %ecx -; SSE-NEXT: addb %al, %cl -; SSE-NEXT: movb %cl, %al +; SSE-NEXT: imull $-109, %eax, %esi +; SSE-NEXT: shrl $8, %esi +; SSE-NEXT: addb %al, %sil +; SSE-NEXT: movb %sil, %al ; SSE-NEXT: shrb $7, %al -; SSE-NEXT: sarb $2, %cl -; SSE-NEXT: addb %al, %cl -; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: sarb $2, %sil +; SSE-NEXT: addb %al, 
%sil +; SSE-NEXT: movzbl %sil, %eax ; SSE-NEXT: movd %eax, %xmm2 -; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax -; SSE-NEXT: imull $-109, %eax, %ecx -; SSE-NEXT: shrl $8, %ecx -; SSE-NEXT: addb %al, %cl -; SSE-NEXT: movb %cl, %al +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %ebp +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %esi +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %r10d +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %edi +; SSE-NEXT: imull $-109, %edi, %ebx +; SSE-NEXT: shrl $8, %ebx +; SSE-NEXT: addb %dil, %bl +; SSE-NEXT: movb %bl, %al ; SSE-NEXT: shrb $7, %al -; SSE-NEXT: sarb $2, %cl -; SSE-NEXT: addb %al, %cl -; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: sarb $2, %bl +; SSE-NEXT: addb %al, %bl +; SSE-NEXT: movzbl %bl, %eax ; SSE-NEXT: movd %eax, %xmm0 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax -; SSE-NEXT: imull $-109, %eax, %ecx -; SSE-NEXT: shrl $8, %ecx -; SSE-NEXT: addb %al, %cl -; SSE-NEXT: movb %cl, %al -; SSE-NEXT: shrb $7, %al -; SSE-NEXT: sarb $2, %cl -; SSE-NEXT: addb %al, %cl -; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: imull $-109, %edx, %eax +; SSE-NEXT: shrl $8, %eax +; SSE-NEXT: addb %dl, %al +; SSE-NEXT: movb %al, %dl +; SSE-NEXT: shrb $7, %dl +; SSE-NEXT: sarb $2, %al +; SSE-NEXT: addb %dl, %al +; SSE-NEXT: movzbl %al, %eax ; SSE-NEXT: movd %eax, %xmm1 -; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax -; SSE-NEXT: imull $-109, %eax, %ecx -; SSE-NEXT: shrl $8, %ecx -; SSE-NEXT: addb %al, %cl -; SSE-NEXT: movb %cl, %al -; SSE-NEXT: shrb $7, %al -; SSE-NEXT: sarb $2, %cl -; SSE-NEXT: addb %al, %cl -; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: imull $-109, %esi, %eax +; SSE-NEXT: shrl $8, %eax +; SSE-NEXT: addb %sil, %al +; SSE-NEXT: movb %al, %dl +; 
SSE-NEXT: shrb $7, %dl +; SSE-NEXT: sarb $2, %al +; SSE-NEXT: addb %dl, %al +; SSE-NEXT: movzbl %al, %eax ; SSE-NEXT: movd %eax, %xmm2 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax -; SSE-NEXT: imull $-109, %eax, %ecx -; SSE-NEXT: shrl $8, %ecx -; SSE-NEXT: addb %al, %cl -; SSE-NEXT: movb %cl, %al -; SSE-NEXT: shrb $7, %al -; SSE-NEXT: sarb $2, %cl -; SSE-NEXT: addb %al, %cl -; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: imull $-109, %ecx, %eax +; SSE-NEXT: shrl $8, %eax +; SSE-NEXT: addb %cl, %al +; SSE-NEXT: movb %al, %cl +; SSE-NEXT: shrb $7, %cl +; SSE-NEXT: sarb $2, %al +; SSE-NEXT: addb %cl, %al +; SSE-NEXT: movzbl %al, %eax ; SSE-NEXT: movd %eax, %xmm3 +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %ecx ; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax -; SSE-NEXT: imull $-109, %eax, %ecx -; SSE-NEXT: shrl $8, %ecx -; SSE-NEXT: addb %al, %cl -; SSE-NEXT: movb %cl, %al +; SSE-NEXT: imull $-109, %eax, %edx +; SSE-NEXT: shrl $8, %edx +; SSE-NEXT: addb %al, %dl +; SSE-NEXT: movb %dl, %al ; SSE-NEXT: shrb $7, %al -; SSE-NEXT: sarb $2, %cl -; SSE-NEXT: addb %al, %cl -; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: sarb $2, %dl +; SSE-NEXT: addb %al, %dl +; SSE-NEXT: movzbl %dl, %eax ; SSE-NEXT: movd %eax, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax -; SSE-NEXT: imull $-109, %eax, %ecx -; SSE-NEXT: shrl $8, %ecx -; SSE-NEXT: addb %al, 
%cl -; SSE-NEXT: movb %cl, %al -; SSE-NEXT: shrb $7, %al -; SSE-NEXT: sarb $2, %cl -; SSE-NEXT: addb %al, %cl -; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: imull $-109, %r14d, %eax +; SSE-NEXT: shrl $8, %eax +; SSE-NEXT: addb %r14b, %al +; SSE-NEXT: movb %al, %dl +; SSE-NEXT: shrb $7, %dl +; SSE-NEXT: sarb $2, %al +; SSE-NEXT: addb %dl, %al +; SSE-NEXT: movzbl %al, %eax ; SSE-NEXT: movd %eax, %xmm2 -; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax -; SSE-NEXT: imull $-109, %eax, %ecx -; SSE-NEXT: shrl $8, %ecx -; SSE-NEXT: addb %al, %cl -; SSE-NEXT: movb %cl, %al -; SSE-NEXT: shrb $7, %al -; SSE-NEXT: sarb $2, %cl -; SSE-NEXT: addb %al, %cl -; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: imull $-109, %ebp, %eax +; SSE-NEXT: shrl $8, %eax +; SSE-NEXT: addb %bpl, %al +; SSE-NEXT: movb %al, %dl +; SSE-NEXT: shrb $7, %dl +; SSE-NEXT: sarb $2, %al +; SSE-NEXT: addb %dl, %al +; SSE-NEXT: movzbl %al, %eax ; SSE-NEXT: movd %eax, %xmm0 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax -; SSE-NEXT: imull $-109, %eax, %ecx -; SSE-NEXT: shrl $8, %ecx -; SSE-NEXT: addb %al, %cl -; SSE-NEXT: movb %cl, %al -; SSE-NEXT: shrb $7, %al -; SSE-NEXT: sarb $2, %cl -; SSE-NEXT: addb %al, %cl -; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: imull $-109, %r11d, %eax +; SSE-NEXT: shrl $8, %eax +; SSE-NEXT: addb %r11b, %al +; SSE-NEXT: movb %al, %dl +; SSE-NEXT: shrb $7, %dl +; SSE-NEXT: sarb $2, %al +; SSE-NEXT: addb %dl, %al +; SSE-NEXT: movzbl %al, %eax ; SSE-NEXT: movd %eax, %xmm3 -; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax -; SSE-NEXT: imull $-109, %eax, %ecx -; SSE-NEXT: shrl $8, %ecx -; SSE-NEXT: addb %al, %cl -; SSE-NEXT: movb %cl, %al -; SSE-NEXT: shrb $7, %al -; SSE-NEXT: sarb $2, %cl -; SSE-NEXT: addb %al, %cl -; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: imull $-109, %ecx, %eax +; SSE-NEXT: shrl $8, %eax +; SSE-NEXT: addb %cl, %al +; SSE-NEXT: 
movb %al, %cl +; SSE-NEXT: shrb $7, %cl +; SSE-NEXT: sarb $2, %al +; SSE-NEXT: addb %cl, %al +; SSE-NEXT: movzbl %al, %eax ; SSE-NEXT: movd %eax, %xmm2 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax -; SSE-NEXT: imull $-109, %eax, %ecx -; SSE-NEXT: shrl $8, %ecx -; SSE-NEXT: addb %al, %cl -; SSE-NEXT: movb %cl, %al -; SSE-NEXT: shrb $7, %al -; SSE-NEXT: sarb $2, %cl -; SSE-NEXT: addb %al, %cl -; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: imull $-109, %r9d, %eax +; SSE-NEXT: shrl $8, %eax +; SSE-NEXT: addb %r9b, %al +; SSE-NEXT: movb %al, %cl +; SSE-NEXT: shrb $7, %cl +; SSE-NEXT: sarb $2, %al +; SSE-NEXT: addb %cl, %al +; SSE-NEXT: movzbl %al, %eax ; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax -; SSE-NEXT: imull $-109, %eax, %ecx -; SSE-NEXT: shrl $8, %ecx -; SSE-NEXT: addb %al, %cl -; SSE-NEXT: movb %cl, %al -; SSE-NEXT: shrb $7, %al -; SSE-NEXT: sarb $2, %cl -; SSE-NEXT: addb %al, %cl -; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: imull $-109, %r10d, %eax +; SSE-NEXT: shrl $8, %eax +; SSE-NEXT: addb %r10b, %al +; SSE-NEXT: movb %al, %cl +; SSE-NEXT: shrb $7, %cl +; SSE-NEXT: sarb $2, %al +; SSE-NEXT: addb %cl, %al +; SSE-NEXT: movzbl %al, %eax ; SSE-NEXT: movd %eax, %xmm3 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax -; SSE-NEXT: imull $-109, %eax, %ecx -; SSE-NEXT: shrl $8, %ecx -; SSE-NEXT: addb %al, %cl -; SSE-NEXT: movb %cl, %al -; SSE-NEXT: shrb $7, %al -; SSE-NEXT: sarb $2, %cl -; SSE-NEXT: addb %al, %cl -; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: imull 
$-109, %r8d, %eax +; SSE-NEXT: shrl $8, %eax +; SSE-NEXT: addb %r8b, %al +; SSE-NEXT: movb %al, %cl +; SSE-NEXT: shrb $7, %cl +; SSE-NEXT: sarb $2, %al +; SSE-NEXT: addb %cl, %al +; SSE-NEXT: movzbl %al, %eax ; SSE-NEXT: movd %eax, %xmm4 ; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; SSE-NEXT: imull $-109, %eax, %ecx @@ -636,6 +639,9 @@ define <16 x i8> @test7(<16 x i8> %a) #0 { ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %rbp ; SSE-NEXT: retq ; ; AVX-LABEL: test7: