[X86][SSE] Don't coalesce v4i32 extracts

We currently coalesce v4i32 extracts from all 4 elements to 2 v2i64 extracts + shifts/sign-extends.

This seems to have been added back in the days when we tended to spill vectors and reload scalars, or ended up with repeated shuffles moving everything down to the 0th index. I don't think either of these is likely these days, as we have better EXTRACT_VECTOR_ELT and VECTOR_SHUFFLE handling, and the existing code tends to make things very difficult for various vector and load combines.

Differential Revision: https://reviews.llvm.org/D42308

llvm-svn: 323541
This commit is contained in:
Simon Pilgrim 2018-01-26 17:11:34 +00:00
parent a0702cbb69
commit 1d50b0c236
7 changed files with 255 additions and 428 deletions

View File

@ -31239,102 +31239,7 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
return MinMax;
// Only operate on vectors of 4 elements, where the alternative shuffling
// gets to be more expensive.
if (SrcVT != MVT::v4i32)
return SDValue();
// Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
// single use which is a sign-extend or zero-extend, and all elements are
// used.
SmallVector<SDNode *, 4> Uses;
unsigned ExtractedElements = 0;
for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
if (UI.getUse().getResNo() != InputVector.getResNo())
return SDValue();
SDNode *Extract = *UI;
if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
if (Extract->getValueType(0) != MVT::i32)
return SDValue();
if (!Extract->hasOneUse())
return SDValue();
if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
return SDValue();
if (!isa<ConstantSDNode>(Extract->getOperand(1)))
return SDValue();
// Record which element was extracted.
ExtractedElements |= 1 << Extract->getConstantOperandVal(1);
Uses.push_back(Extract);
}
// If not all the elements were used, this may not be worthwhile.
if (ExtractedElements != 15)
return SDValue();
// Ok, we've now decided to do the transformation.
// If 64-bit shifts are legal, use the extract-shift sequence,
// otherwise bounce the vector off the cache.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue Vals[4];
if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
auto &DL = DAG.getDataLayout();
EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
DAG.getConstant(0, dl, VecIdxTy));
SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
DAG.getConstant(1, dl, VecIdxTy));
SDValue ShAmt = DAG.getConstant(
32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
} else {
// Store the value to a temporary stack slot.
SDValue StackPtr = DAG.CreateStackTemporary(SrcVT);
SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
MachinePointerInfo());
EVT ElementType = SrcVT.getVectorElementType();
unsigned EltSize = ElementType.getSizeInBits() / 8;
// Replace each use (extract) with a load of the appropriate element.
for (unsigned i = 0; i < 4; ++i) {
uint64_t Offset = EltSize * i;
auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
SDValue ScalarAddr =
DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
// Load the scalar.
Vals[i] =
DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
}
}
// Replace the extracts
for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
UE = Uses.end(); UI != UE; ++UI) {
SDNode *Extract = *UI;
uint64_t IdxVal = Extract->getConstantOperandVal(1);
DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
}
// The replacement was made in place; return N so it won't be revisited.
return SDValue(N, 0);
return SDValue();
}
/// If a vector select has an operand that is -1 or 0, try to simplify the

View File

@ -7,21 +7,24 @@
; rdar://7398554
; When doing vector gather-scatter index calculation with 32-bit indices,
; use an efficient mov/shift sequence rather than shuffling each individual
; element out of the index vector.
; minimize shuffling of each individual element out of the index vector.
define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind {
; LIN-SSE2-LABEL: foo:
; LIN-SSE2: # %bb.0:
; LIN-SSE2-NEXT: movdqa (%rsi), %xmm0
; LIN-SSE2-NEXT: pand (%rdx), %xmm0
; LIN-SSE2-NEXT: movd %xmm0, %eax
; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; LIN-SSE2-NEXT: movd %xmm1, %ecx
; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; LIN-SSE2-NEXT: movq %xmm1, %rax
; LIN-SSE2-NEXT: movq %xmm0, %rcx
; LIN-SSE2-NEXT: movslq %ecx, %rdx
; LIN-SSE2-NEXT: sarq $32, %rcx
; LIN-SSE2-NEXT: movslq %eax, %rsi
; LIN-SSE2-NEXT: sarq $32, %rax
; LIN-SSE2-NEXT: movd %xmm1, %edx
; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; LIN-SSE2-NEXT: movd %xmm0, %esi
; LIN-SSE2-NEXT: cltq
; LIN-SSE2-NEXT: movslq %ecx, %rcx
; LIN-SSE2-NEXT: movslq %edx, %rdx
; LIN-SSE2-NEXT: movslq %esi, %rsi
; LIN-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; LIN-SSE2-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; LIN-SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
@ -32,14 +35,16 @@ define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind {
; LIN-SSE4: # %bb.0:
; LIN-SSE4-NEXT: movdqa (%rsi), %xmm0
; LIN-SSE4-NEXT: pand (%rdx), %xmm0
; LIN-SSE4-NEXT: pextrq $1, %xmm0, %rax
; LIN-SSE4-NEXT: movq %xmm0, %rcx
; LIN-SSE4-NEXT: movslq %ecx, %rdx
; LIN-SSE4-NEXT: sarq $32, %rcx
; LIN-SSE4-NEXT: movslq %eax, %rsi
; LIN-SSE4-NEXT: movd %xmm0, %eax
; LIN-SSE4-NEXT: pextrd $1, %xmm0, %ecx
; LIN-SSE4-NEXT: pextrd $2, %xmm0, %edx
; LIN-SSE4-NEXT: pextrd $3, %xmm0, %esi
; LIN-SSE4-NEXT: cltq
; LIN-SSE4-NEXT: movslq %ecx, %rcx
; LIN-SSE4-NEXT: movslq %edx, %rdx
; LIN-SSE4-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; LIN-SSE4-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; LIN-SSE4-NEXT: sarq $32, %rax
; LIN-SSE4-NEXT: movslq %esi, %rax
; LIN-SSE4-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; LIN-SSE4-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; LIN-SSE4-NEXT: retq
@ -48,13 +53,17 @@ define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind {
; WIN-SSE2: # %bb.0:
; WIN-SSE2-NEXT: movdqa (%rdx), %xmm0
; WIN-SSE2-NEXT: pand (%r8), %xmm0
; WIN-SSE2-NEXT: movd %xmm0, %r8d
; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; WIN-SSE2-NEXT: movd %xmm1, %r9d
; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; WIN-SSE2-NEXT: movq %xmm1, %rax
; WIN-SSE2-NEXT: movq %xmm0, %rdx
; WIN-SSE2-NEXT: movslq %edx, %r8
; WIN-SSE2-NEXT: sarq $32, %rdx
; WIN-SSE2-NEXT: movslq %eax, %r9
; WIN-SSE2-NEXT: sarq $32, %rax
; WIN-SSE2-NEXT: movd %xmm1, %r10d
; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; WIN-SSE2-NEXT: movd %xmm0, %edx
; WIN-SSE2-NEXT: movslq %r8d, %rax
; WIN-SSE2-NEXT: movslq %r9d, %r8
; WIN-SSE2-NEXT: movslq %r10d, %r9
; WIN-SSE2-NEXT: movslq %edx, %rdx
; WIN-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; WIN-SSE2-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; WIN-SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
@ -65,14 +74,16 @@ define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind {
; WIN-SSE4: # %bb.0:
; WIN-SSE4-NEXT: movdqa (%rdx), %xmm0
; WIN-SSE4-NEXT: pand (%r8), %xmm0
; WIN-SSE4-NEXT: pextrq $1, %xmm0, %rax
; WIN-SSE4-NEXT: movq %xmm0, %rdx
; WIN-SSE4-NEXT: movslq %edx, %r8
; WIN-SSE4-NEXT: sarq $32, %rdx
; WIN-SSE4-NEXT: movslq %eax, %r9
; WIN-SSE4-NEXT: movd %xmm0, %eax
; WIN-SSE4-NEXT: pextrd $1, %xmm0, %edx
; WIN-SSE4-NEXT: pextrd $2, %xmm0, %r8d
; WIN-SSE4-NEXT: pextrd $3, %xmm0, %r9d
; WIN-SSE4-NEXT: cltq
; WIN-SSE4-NEXT: movslq %edx, %rdx
; WIN-SSE4-NEXT: movslq %r8d, %r8
; WIN-SSE4-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; WIN-SSE4-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; WIN-SSE4-NEXT: sarq $32, %rax
; WIN-SSE4-NEXT: movslq %r9d, %rax
; WIN-SSE4-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; WIN-SSE4-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; WIN-SSE4-NEXT: retq
@ -127,22 +138,22 @@ define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind
; LIN-SSE2: # %bb.0:
; LIN-SSE2-NEXT: movdqa (%rsi), %xmm0
; LIN-SSE2-NEXT: pand (%rdx), %xmm0
; LIN-SSE2-NEXT: movd %xmm0, %eax
; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; LIN-SSE2-NEXT: movd %xmm1, %edx
; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; LIN-SSE2-NEXT: movq %xmm1, %rax
; LIN-SSE2-NEXT: movq %rax, %rdx
; LIN-SSE2-NEXT: shrq $32, %rdx
; LIN-SSE2-NEXT: movq %xmm0, %rsi
; LIN-SSE2-NEXT: movq %rsi, %rdi
; LIN-SSE2-NEXT: shrq $32, %rdi
; LIN-SSE2-NEXT: andl %ecx, %esi
; LIN-SSE2-NEXT: andl %ecx, %eax
; LIN-SSE2-NEXT: andq %rcx, %rdi
; LIN-SSE2-NEXT: movd %xmm1, %esi
; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; LIN-SSE2-NEXT: movd %xmm0, %edi
; LIN-SSE2-NEXT: andq %rcx, %rax
; LIN-SSE2-NEXT: andq %rcx, %rdx
; LIN-SSE2-NEXT: movq %rdi, %xmm1
; LIN-SSE2-NEXT: movq %rsi, %xmm0
; LIN-SSE2-NEXT: andq %rcx, %rsi
; LIN-SSE2-NEXT: andq %rcx, %rdi
; LIN-SSE2-NEXT: movq %rax, %xmm0
; LIN-SSE2-NEXT: movq %rdx, %xmm1
; LIN-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; LIN-SSE2-NEXT: movq %rdx, %xmm2
; LIN-SSE2-NEXT: movq %rax, %xmm1
; LIN-SSE2-NEXT: movq %rdi, %xmm2
; LIN-SSE2-NEXT: movq %rsi, %xmm1
; LIN-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; LIN-SSE2-NEXT: retq
;
@ -150,21 +161,19 @@ define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind
; LIN-SSE4: # %bb.0:
; LIN-SSE4-NEXT: movdqa (%rsi), %xmm0
; LIN-SSE4-NEXT: pand (%rdx), %xmm0
; LIN-SSE4-NEXT: pextrq $1, %xmm0, %rax
; LIN-SSE4-NEXT: movq %rax, %rdx
; LIN-SSE4-NEXT: shrq $32, %rdx
; LIN-SSE4-NEXT: movq %xmm0, %rsi
; LIN-SSE4-NEXT: movq %rsi, %rdi
; LIN-SSE4-NEXT: shrq $32, %rdi
; LIN-SSE4-NEXT: andl %ecx, %esi
; LIN-SSE4-NEXT: andl %ecx, %eax
; LIN-SSE4-NEXT: andq %rcx, %rdi
; LIN-SSE4-NEXT: movd %xmm0, %eax
; LIN-SSE4-NEXT: pextrd $1, %xmm0, %edx
; LIN-SSE4-NEXT: pextrd $2, %xmm0, %esi
; LIN-SSE4-NEXT: pextrd $3, %xmm0, %edi
; LIN-SSE4-NEXT: andq %rcx, %rax
; LIN-SSE4-NEXT: andq %rcx, %rdx
; LIN-SSE4-NEXT: movq %rdi, %xmm1
; LIN-SSE4-NEXT: movq %rsi, %xmm0
; LIN-SSE4-NEXT: andq %rcx, %rsi
; LIN-SSE4-NEXT: andq %rcx, %rdi
; LIN-SSE4-NEXT: movq %rdx, %xmm1
; LIN-SSE4-NEXT: movq %rax, %xmm0
; LIN-SSE4-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; LIN-SSE4-NEXT: movq %rdx, %xmm2
; LIN-SSE4-NEXT: movq %rax, %xmm1
; LIN-SSE4-NEXT: movq %rdi, %xmm2
; LIN-SSE4-NEXT: movq %rsi, %xmm1
; LIN-SSE4-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; LIN-SSE4-NEXT: retq
;
@ -172,21 +181,21 @@ define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind
; WIN-SSE2: # %bb.0:
; WIN-SSE2-NEXT: movdqa (%rdx), %xmm0
; WIN-SSE2-NEXT: pand (%r8), %xmm0
; WIN-SSE2-NEXT: movd %xmm0, %eax
; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; WIN-SSE2-NEXT: movd %xmm1, %ecx
; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; WIN-SSE2-NEXT: movq %xmm1, %r8
; WIN-SSE2-NEXT: movq %r8, %rcx
; WIN-SSE2-NEXT: shrq $32, %rcx
; WIN-SSE2-NEXT: movq %xmm0, %rax
; WIN-SSE2-NEXT: movq %rax, %rdx
; WIN-SSE2-NEXT: shrq $32, %rdx
; WIN-SSE2-NEXT: andl %r9d, %eax
; WIN-SSE2-NEXT: andl %r9d, %r8d
; WIN-SSE2-NEXT: andq %r9, %rdx
; WIN-SSE2-NEXT: movd %xmm1, %r8d
; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; WIN-SSE2-NEXT: movd %xmm0, %edx
; WIN-SSE2-NEXT: andq %r9, %rax
; WIN-SSE2-NEXT: andq %r9, %rcx
; WIN-SSE2-NEXT: movq %rdx, %xmm1
; WIN-SSE2-NEXT: andq %r9, %r8
; WIN-SSE2-NEXT: andq %r9, %rdx
; WIN-SSE2-NEXT: movq %rax, %xmm0
; WIN-SSE2-NEXT: movq %rcx, %xmm1
; WIN-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; WIN-SSE2-NEXT: movq %rcx, %xmm2
; WIN-SSE2-NEXT: movq %rdx, %xmm2
; WIN-SSE2-NEXT: movq %r8, %xmm1
; WIN-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; WIN-SSE2-NEXT: retq
@ -195,53 +204,47 @@ define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind
; WIN-SSE4: # %bb.0:
; WIN-SSE4-NEXT: movdqa (%rdx), %xmm0
; WIN-SSE4-NEXT: pand (%r8), %xmm0
; WIN-SSE4-NEXT: pextrq $1, %xmm0, %r8
; WIN-SSE4-NEXT: movq %r8, %rcx
; WIN-SSE4-NEXT: shrq $32, %rcx
; WIN-SSE4-NEXT: movq %xmm0, %rax
; WIN-SSE4-NEXT: movq %rax, %rdx
; WIN-SSE4-NEXT: shrq $32, %rdx
; WIN-SSE4-NEXT: andl %r9d, %eax
; WIN-SSE4-NEXT: andl %r9d, %r8d
; WIN-SSE4-NEXT: andq %r9, %rdx
; WIN-SSE4-NEXT: movd %xmm0, %eax
; WIN-SSE4-NEXT: pextrd $1, %xmm0, %ecx
; WIN-SSE4-NEXT: pextrd $2, %xmm0, %r8d
; WIN-SSE4-NEXT: pextrd $3, %xmm0, %edx
; WIN-SSE4-NEXT: andq %r9, %rax
; WIN-SSE4-NEXT: andq %r9, %rcx
; WIN-SSE4-NEXT: movq %rdx, %xmm1
; WIN-SSE4-NEXT: andq %r9, %r8
; WIN-SSE4-NEXT: andq %r9, %rdx
; WIN-SSE4-NEXT: movq %rcx, %xmm1
; WIN-SSE4-NEXT: movq %rax, %xmm0
; WIN-SSE4-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; WIN-SSE4-NEXT: movq %rcx, %xmm2
; WIN-SSE4-NEXT: movq %rdx, %xmm2
; WIN-SSE4-NEXT: movq %r8, %xmm1
; WIN-SSE4-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; WIN-SSE4-NEXT: retq
;
; LIN32-LABEL: old:
; LIN32: # %bb.0:
; LIN32-NEXT: pushl %ebp
; LIN32-NEXT: movl %esp, %ebp
; LIN32-NEXT: pushl %edi
; LIN32-NEXT: pushl %esi
; LIN32-NEXT: andl $-16, %esp
; LIN32-NEXT: subl $32, %esp
; LIN32-NEXT: movl 20(%ebp), %eax
; LIN32-NEXT: movl 16(%ebp), %ecx
; LIN32-NEXT: movl 12(%ebp), %edx
; LIN32-NEXT: movaps (%edx), %xmm0
; LIN32-NEXT: andps (%ecx), %xmm0
; LIN32-NEXT: movaps %xmm0, (%esp)
; LIN32-NEXT: movl (%esp), %ecx
; LIN32-NEXT: andl %eax, %ecx
; LIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
; LIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; LIN32-NEXT: movl {{[0-9]+}}(%esp), %edx
; LIN32-NEXT: movdqa (%edx), %xmm0
; LIN32-NEXT: pand (%ecx), %xmm0
; LIN32-NEXT: movd %xmm0, %ecx
; LIN32-NEXT: pextrd $1, %xmm0, %edx
; LIN32-NEXT: pextrd $2, %xmm0, %esi
; LIN32-NEXT: pextrd $3, %xmm0, %edi
; LIN32-NEXT: andl %eax, %ecx
; LIN32-NEXT: andl %eax, %edx
; LIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
; LIN32-NEXT: andl %eax, %esi
; LIN32-NEXT: andl {{[0-9]+}}(%esp), %eax
; LIN32-NEXT: andl %eax, %edi
; LIN32-NEXT: movd %edx, %xmm1
; LIN32-NEXT: movd %ecx, %xmm0
; LIN32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; LIN32-NEXT: movd %eax, %xmm2
; LIN32-NEXT: movd %edi, %xmm2
; LIN32-NEXT: movd %esi, %xmm1
; LIN32-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; LIN32-NEXT: leal -4(%ebp), %esp
; LIN32-NEXT: popl %esi
; LIN32-NEXT: popl %ebp
; LIN32-NEXT: popl %edi
; LIN32-NEXT: retl
%a = load <4 x i32>, <4 x i32>* %i
%b = load <4 x i32>, <4 x i32>* %h

View File

@ -153,109 +153,51 @@ define <4 x i32> @_mul4xi32b(<4 x i32>, <4 x i32>) {
define <4 x i64> @_mul4xi32toi64a(<4 x i32>, <4 x i32>) {
; SSE2-LABEL: _mul4xi32toi64a:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT: movq %xmm1, %rcx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: shrq $32, %rcx
; SSE2-NEXT: movq %xmm0, %rdx
; SSE2-NEXT: movd %edx, %xmm2
; SSE2-NEXT: shrq $32, %rdx
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: movq %xmm0, %rsi
; SSE2-NEXT: movd %esi, %xmm3
; SSE2-NEXT: shrq $32, %rsi
; SSE2-NEXT: movd %esi, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; SSE2-NEXT: movd %edx, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE2-NEXT: movd %ecx, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: shrq $32, %rax
; SSE2-NEXT: pmuludq %xmm3, %xmm1
; SSE2-NEXT: movd %eax, %xmm3
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; SSE2-NEXT: pmuludq %xmm4, %xmm2
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE42-LABEL: _mul4xi32toi64a:
; SSE42: # %bb.0:
; SSE42-NEXT: movq %xmm1, %rax
; SSE42-NEXT: pextrq $1, %xmm1, %rcx
; SSE42-NEXT: movd %ecx, %xmm1
; SSE42-NEXT: shrq $32, %rcx
; SSE42-NEXT: movq %xmm0, %rdx
; SSE42-NEXT: movd %edx, %xmm2
; SSE42-NEXT: shrq $32, %rdx
; SSE42-NEXT: pextrq $1, %xmm0, %rsi
; SSE42-NEXT: movd %esi, %xmm3
; SSE42-NEXT: shrq $32, %rsi
; SSE42-NEXT: movd %esi, %xmm0
; SSE42-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; SSE42-NEXT: movd %edx, %xmm0
; SSE42-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE42-NEXT: movd %ecx, %xmm0
; SSE42-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE42-NEXT: movd %eax, %xmm0
; SSE42-NEXT: shrq $32, %rax
; SSE42-NEXT: pmuludq %xmm3, %xmm1
; SSE42-NEXT: movd %eax, %xmm3
; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE42-NEXT: pmuludq %xmm2, %xmm0
; SSE42-NEXT: pxor %xmm3, %xmm3
; SSE42-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
; SSE42-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE42-NEXT: pmuludq %xmm0, %xmm1
; SSE42-NEXT: pmuludq %xmm4, %xmm2
; SSE42-NEXT: movdqa %xmm2, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: _mul4xi32toi64a:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vmovd %eax, %xmm2
; AVX1-NEXT: shrq $32, %rax
; AVX1-NEXT: vmovq %xmm1, %rcx
; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
; AVX1-NEXT: vmovd %edx, %xmm0
; AVX1-NEXT: shrq $32, %rdx
; AVX1-NEXT: vpextrq $1, %xmm1, %rsi
; AVX1-NEXT: vmovd %esi, %xmm1
; AVX1-NEXT: shrq $32, %rsi
; AVX1-NEXT: vmovd %esi, %xmm3
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX1-NEXT: vmovd %edx, %xmm3
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; AVX1-NEXT: vmovd %ecx, %xmm3
; AVX1-NEXT: shrq $32, %rcx
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %ecx, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
; AVX1-NEXT: vmovd %eax, %xmm3
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: _mul4xi32toi64a:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovq %xmm1, %rax
; AVX2-NEXT: vmovd %eax, %xmm2
; AVX2-NEXT: shrq $32, %rax
; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
; AVX2-NEXT: vmovq %xmm0, %rdx
; AVX2-NEXT: vmovd %edx, %xmm1
; AVX2-NEXT: shrq $32, %rdx
; AVX2-NEXT: vpextrq $1, %xmm0, %rsi
; AVX2-NEXT: vmovd %esi, %xmm0
; AVX2-NEXT: shrq $32, %rsi
; AVX2-NEXT: vmovd %esi, %xmm3
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; AVX2-NEXT: vmovd %edx, %xmm3
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-NEXT: vmovd %ecx, %xmm3
; AVX2-NEXT: shrq $32, %rcx
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: vmovd %ecx, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
; AVX2-NEXT: vmovd %eax, %xmm3
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%f00 = extractelement <4 x i32> %0, i32 0

View File

@ -36,12 +36,14 @@ define void @FFT(%v4_varying_complex* noalias nocapture %destination, float* noa
; X64: # %bb.0: # %begin
; X64-NEXT: movdqu (%rdx), %xmm0
; X64-NEXT: pslld $4, %xmm0
; X64-NEXT: movq %xmm0, %rax
; X64-NEXT: movd %xmm0, %eax
; X64-NEXT: movslq %eax, %r8
; X64-NEXT: sarq $32, %rax
; X64-NEXT: pextrq $1, %xmm0, %rdx
; X64-NEXT: movslq %edx, %rcx
; X64-NEXT: sarq $32, %rdx
; X64-NEXT: pextrd $1, %xmm0, %ecx
; X64-NEXT: movslq %ecx, %rcx
; X64-NEXT: pextrd $2, %xmm0, %edx
; X64-NEXT: movslq %edx, %rdx
; X64-NEXT: pextrd $3, %xmm0, %eax
; X64-NEXT: cltq
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero

View File

@ -12,19 +12,16 @@ define void @func(<4 x float> %vx) {
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
; CHECK-NEXT: pextrq $1, %xmm0, %rax
; CHECK-NEXT: movzwl %ax, %ecx
; CHECK-NEXT: shrq $32, %rax
; CHECK-NEXT: movq %xmm0, %rdx
; CHECK-NEXT: movzwl %dx, %r8d
; CHECK-NEXT: movq %rdx, %r9
; CHECK-NEXT: shrq $32, %r9
; CHECK-NEXT: movd %xmm0, %r8d
; CHECK-NEXT: leaq stuff(%r8), %rdi
; CHECK-NEXT: leaq stuff(%r9), %rsi
; CHECK-NEXT: leaq stuff(%rcx), %rdx
; CHECK-NEXT: leaq stuff(%rax), %rcx
; CHECK-NEXT: pextrd $1, %xmm0, %eax
; CHECK-NEXT: leaq stuff(%rax), %rsi
; CHECK-NEXT: pextrd $2, %xmm0, %edx
; CHECK-NEXT: pextrd $3, %xmm0, %ecx
; CHECK-NEXT: leaq stuff(%rdx), %rdx
; CHECK-NEXT: leaq stuff(%rcx), %rcx
; CHECK-NEXT: leaq stuff+8(%r8), %r8
; CHECK-NEXT: leaq stuff+8(%r9), %r9
; CHECK-NEXT: leaq stuff+8(%rax), %r9
; CHECK-NEXT: callq toto
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq

View File

@ -37,44 +37,42 @@ define <2 x i64> @var_shuffle_v2i64(<2 x i64> %v, <2 x i64> %indices) nounwind {
define <4 x i32> @var_shuffle_v4i32(<4 x i32> %v, <4 x i32> %indices) nounwind {
; SSSE3-LABEL: var_shuffle_v4i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movd %xmm1, %eax
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; SSSE3-NEXT: movd %xmm2, %ecx
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSSE3-NEXT: movq %xmm2, %rax
; SSSE3-NEXT: movq %rax, %rcx
; SSSE3-NEXT: sarq $32, %rcx
; SSSE3-NEXT: movq %xmm1, %rdx
; SSSE3-NEXT: movq %rdx, %rsi
; SSSE3-NEXT: sarq $32, %rsi
; SSSE3-NEXT: andl $3, %edx
; SSSE3-NEXT: movd %xmm2, %edx
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSSE3-NEXT: movd %xmm1, %esi
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT: andl $3, %esi
; SSSE3-NEXT: andl $3, %eax
; SSSE3-NEXT: andl $3, %ecx
; SSSE3-NEXT: andl $3, %edx
; SSSE3-NEXT: andl $3, %esi
; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: retq
;
; AVX-LABEL: var_shuffle_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpextrq $1, %xmm1, %rax
; AVX-NEXT: movq %rax, %rcx
; AVX-NEXT: sarq $32, %rcx
; AVX-NEXT: vmovq %xmm1, %rdx
; AVX-NEXT: movq %rdx, %rsi
; AVX-NEXT: sarq $32, %rsi
; AVX-NEXT: andl $3, %edx
; AVX-NEXT: vmovd %xmm1, %eax
; AVX-NEXT: vpextrd $1, %xmm1, %ecx
; AVX-NEXT: vpextrd $2, %xmm1, %edx
; AVX-NEXT: vpextrd $3, %xmm1, %esi
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX-NEXT: andl $3, %esi
; AVX-NEXT: andl $3, %eax
; AVX-NEXT: andl $3, %ecx
; AVX-NEXT: andl $3, %edx
; AVX-NEXT: andl $3, %esi
; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vpinsrd $1, -24(%rsp,%rsi,4), %xmm0, %xmm0
; AVX-NEXT: vpinsrd $2, -24(%rsp,%rax,4), %xmm0, %xmm0
; AVX-NEXT: vpinsrd $3, -24(%rsp,%rcx,4), %xmm0, %xmm0
; AVX-NEXT: vpinsrd $1, -24(%rsp,%rcx,4), %xmm0, %xmm0
; AVX-NEXT: vpinsrd $2, -24(%rsp,%rdx,4), %xmm0, %xmm0
; AVX-NEXT: vpinsrd $3, -24(%rsp,%rsi,4), %xmm0, %xmm0
; AVX-NEXT: retq
%index0 = extractelement <4 x i32> %indices, i32 0
%index1 = extractelement <4 x i32> %indices, i32 1
@ -287,40 +285,38 @@ define <2 x double> @var_shuffle_v2f64(<2 x double> %v, <2 x i64> %indices) noun
define <4 x float> @var_shuffle_v4f32(<4 x float> %v, <4 x i32> %indices) nounwind {
; SSSE3-LABEL: var_shuffle_v4f32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movd %xmm1, %eax
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; SSSE3-NEXT: movd %xmm2, %ecx
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSSE3-NEXT: movq %xmm2, %rax
; SSSE3-NEXT: movq %rax, %rcx
; SSSE3-NEXT: sarq $32, %rcx
; SSSE3-NEXT: movq %xmm1, %rdx
; SSSE3-NEXT: movq %rdx, %rsi
; SSSE3-NEXT: sarq $32, %rsi
; SSSE3-NEXT: andl $3, %edx
; SSSE3-NEXT: movd %xmm2, %edx
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSSE3-NEXT: movd %xmm1, %esi
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT: andl $3, %esi
; SSSE3-NEXT: andl $3, %eax
; SSSE3-NEXT: andl $3, %ecx
; SSSE3-NEXT: andl $3, %edx
; SSSE3-NEXT: andl $3, %esi
; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: retq
;
; AVX-LABEL: var_shuffle_v4f32:
; AVX: # %bb.0:
; AVX-NEXT: vpextrq $1, %xmm1, %rax
; AVX-NEXT: movq %rax, %rcx
; AVX-NEXT: sarq $32, %rcx
; AVX-NEXT: vmovq %xmm1, %rdx
; AVX-NEXT: movq %rdx, %rsi
; AVX-NEXT: sarq $32, %rsi
; AVX-NEXT: andl $3, %edx
; AVX-NEXT: vmovd %xmm1, %eax
; AVX-NEXT: vpextrd $1, %xmm1, %ecx
; AVX-NEXT: vpextrd $2, %xmm1, %edx
; AVX-NEXT: vpextrd $3, %xmm1, %esi
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX-NEXT: andl $3, %esi
; AVX-NEXT: andl $3, %eax
; AVX-NEXT: andl $3, %ecx
; AVX-NEXT: andl $3, %edx
; AVX-NEXT: andl $3, %esi
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]

View File

@ -119,36 +119,32 @@ define <8 x i32> @var_shuffle_v8i32(<8 x i32> %v, <8 x i32> %indices) nounwind {
; AVX1-NEXT: movq %rsp, %rbp
; AVX1-NEXT: andq $-32, %rsp
; AVX1-NEXT: subq $64, %rsp
; AVX1-NEXT: vpextrq $1, %xmm1, %r8
; AVX1-NEXT: movq %r8, %rcx
; AVX1-NEXT: shrq $30, %rcx
; AVX1-NEXT: vmovq %xmm1, %r9
; AVX1-NEXT: movq %r9, %rsi
; AVX1-NEXT: shrq $30, %rsi
; AVX1-NEXT: vmovd %xmm1, %r8d
; AVX1-NEXT: vpextrd $1, %xmm1, %r9d
; AVX1-NEXT: vpextrd $2, %xmm1, %r10d
; AVX1-NEXT: vpextrd $3, %xmm1, %esi
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpextrq $1, %xmm1, %r10
; AVX1-NEXT: movq %r10, %rdi
; AVX1-NEXT: shrq $30, %rdi
; AVX1-NEXT: vmovq %xmm1, %rax
; AVX1-NEXT: movq %rax, %rdx
; AVX1-NEXT: shrq $30, %rdx
; AVX1-NEXT: vmovd %xmm1, %edi
; AVX1-NEXT: vpextrd $1, %xmm1, %eax
; AVX1-NEXT: vpextrd $2, %xmm1, %ecx
; AVX1-NEXT: vpextrd $3, %xmm1, %edx
; AVX1-NEXT: vmovaps %ymm0, (%rsp)
; AVX1-NEXT: andl $7, %r9d
; AVX1-NEXT: andl $28, %esi
; AVX1-NEXT: andl $7, %r8d
; AVX1-NEXT: andl $28, %ecx
; AVX1-NEXT: andl $7, %eax
; AVX1-NEXT: andl $28, %edx
; AVX1-NEXT: andl $7, %r9d
; AVX1-NEXT: andl $7, %r10d
; AVX1-NEXT: andl $28, %edi
; AVX1-NEXT: andl $7, %esi
; AVX1-NEXT: andl $7, %edi
; AVX1-NEXT: andl $7, %eax
; AVX1-NEXT: andl $7, %ecx
; AVX1-NEXT: andl $7, %edx
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vpinsrd $1, (%rsp,%rdx), %xmm0, %xmm0
; AVX1-NEXT: vpinsrd $2, (%rsp,%r10,4), %xmm0, %xmm0
; AVX1-NEXT: vpinsrd $3, (%rsp,%rdi), %xmm0, %xmm0
; AVX1-NEXT: vpinsrd $1, (%rsp,%rax,4), %xmm0, %xmm0
; AVX1-NEXT: vpinsrd $2, (%rsp,%rcx,4), %xmm0, %xmm0
; AVX1-NEXT: vpinsrd $3, (%rsp,%rdx,4), %xmm0, %xmm0
; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX1-NEXT: vpinsrd $1, (%rsp,%rsi), %xmm1, %xmm1
; AVX1-NEXT: vpinsrd $2, (%rsp,%r8,4), %xmm1, %xmm1
; AVX1-NEXT: vpinsrd $3, (%rsp,%rcx), %xmm1, %xmm1
; AVX1-NEXT: vpinsrd $1, (%rsp,%r9,4), %xmm1, %xmm1
; AVX1-NEXT: vpinsrd $2, (%rsp,%r10,4), %xmm1, %xmm1
; AVX1-NEXT: vpinsrd $3, (%rsp,%rsi,4), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: movq %rbp, %rsp
; AVX1-NEXT: popq %rbp
@ -1212,28 +1208,24 @@ define <8 x float> @var_shuffle_v8f32(<8 x float> %v, <8 x i32> %indices) nounwi
; AVX1-NEXT: movq %rsp, %rbp
; AVX1-NEXT: andq $-32, %rsp
; AVX1-NEXT: subq $64, %rsp
; AVX1-NEXT: vpextrq $1, %xmm1, %r8
; AVX1-NEXT: movq %r8, %rcx
; AVX1-NEXT: shrq $30, %rcx
; AVX1-NEXT: vmovq %xmm1, %r9
; AVX1-NEXT: movq %r9, %rdx
; AVX1-NEXT: shrq $30, %rdx
; AVX1-NEXT: vmovd %xmm1, %esi
; AVX1-NEXT: vpextrd $1, %xmm1, %r8d
; AVX1-NEXT: vpextrd $2, %xmm1, %r9d
; AVX1-NEXT: vpextrd $3, %xmm1, %r10d
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpextrq $1, %xmm1, %r10
; AVX1-NEXT: movq %r10, %rdi
; AVX1-NEXT: shrq $30, %rdi
; AVX1-NEXT: vmovq %xmm1, %rax
; AVX1-NEXT: movq %rax, %rsi
; AVX1-NEXT: shrq $30, %rsi
; AVX1-NEXT: vmovd %xmm1, %edx
; AVX1-NEXT: vpextrd $1, %xmm1, %edi
; AVX1-NEXT: vpextrd $2, %xmm1, %eax
; AVX1-NEXT: vpextrd $3, %xmm1, %ecx
; AVX1-NEXT: vmovaps %ymm0, (%rsp)
; AVX1-NEXT: andl $7, %r9d
; AVX1-NEXT: andl $28, %edx
; AVX1-NEXT: andl $7, %esi
; AVX1-NEXT: andl $7, %r8d
; AVX1-NEXT: andl $28, %ecx
; AVX1-NEXT: andl $7, %eax
; AVX1-NEXT: andl $28, %esi
; AVX1-NEXT: andl $7, %r9d
; AVX1-NEXT: andl $7, %r10d
; AVX1-NEXT: andl $28, %edi
; AVX1-NEXT: andl $7, %edx
; AVX1-NEXT: andl $7, %edi
; AVX1-NEXT: andl $7, %eax
; AVX1-NEXT: andl $7, %ecx
; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
@ -1375,36 +1367,32 @@ define <4 x i64> @var_shuffle_v4i64_from_v2i64(<2 x i64> %v, <4 x i64> %indices)
define <8 x i32> @var_shuffle_v8i32_from_v4i32(<4 x i32> %v, <8 x i32> %indices) unnamed_addr nounwind {
; AVX1-LABEL: var_shuffle_v8i32_from_v4i32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpextrq $1, %xmm1, %r8
; AVX1-NEXT: movq %r8, %r10
; AVX1-NEXT: shrq $30, %r10
; AVX1-NEXT: vmovq %xmm1, %r9
; AVX1-NEXT: movq %r9, %rsi
; AVX1-NEXT: shrq $30, %rsi
; AVX1-NEXT: vmovd %xmm1, %r8d
; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX1-NEXT: andl $3, %r9d
; AVX1-NEXT: andl $12, %esi
; AVX1-NEXT: andl $3, %r8d
; AVX1-NEXT: andl $12, %r10d
; AVX1-NEXT: vpextrd $1, %xmm1, %r9d
; AVX1-NEXT: andl $3, %r9d
; AVX1-NEXT: vpextrd $2, %xmm1, %r10d
; AVX1-NEXT: andl $3, %r10d
; AVX1-NEXT: vpextrd $3, %xmm1, %esi
; AVX1-NEXT: andl $3, %esi
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: movq %rax, %rdi
; AVX1-NEXT: shrq $30, %rdi
; AVX1-NEXT: vmovq %xmm0, %rcx
; AVX1-NEXT: movq %rcx, %rdx
; AVX1-NEXT: shrq $30, %rdx
; AVX1-NEXT: andl $3, %ecx
; AVX1-NEXT: andl $12, %edx
; AVX1-NEXT: vmovd %xmm0, %edi
; AVX1-NEXT: andl $3, %edi
; AVX1-NEXT: vpextrd $1, %xmm0, %eax
; AVX1-NEXT: andl $3, %eax
; AVX1-NEXT: andl $12, %edi
; AVX1-NEXT: vpextrd $2, %xmm0, %ecx
; AVX1-NEXT: andl $3, %ecx
; AVX1-NEXT: vpextrd $3, %xmm0, %edx
; AVX1-NEXT: andl $3, %edx
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vpinsrd $1, -24(%rsp,%rdx), %xmm0, %xmm0
; AVX1-NEXT: vpinsrd $2, -24(%rsp,%rax,4), %xmm0, %xmm0
; AVX1-NEXT: vpinsrd $3, -24(%rsp,%rdi), %xmm0, %xmm0
; AVX1-NEXT: vpinsrd $1, -24(%rsp,%rax,4), %xmm0, %xmm0
; AVX1-NEXT: vpinsrd $2, -24(%rsp,%rcx,4), %xmm0, %xmm0
; AVX1-NEXT: vpinsrd $3, -24(%rsp,%rdx,4), %xmm0, %xmm0
; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX1-NEXT: vpinsrd $1, -24(%rsp,%rsi), %xmm1, %xmm1
; AVX1-NEXT: vpinsrd $2, -24(%rsp,%r8,4), %xmm1, %xmm1
; AVX1-NEXT: vpinsrd $3, -24(%rsp,%r10), %xmm1, %xmm1
; AVX1-NEXT: vpinsrd $1, -24(%rsp,%r9,4), %xmm1, %xmm1
; AVX1-NEXT: vpinsrd $2, -24(%rsp,%r10,4), %xmm1, %xmm1
; AVX1-NEXT: vpinsrd $3, -24(%rsp,%rsi,4), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@ -2402,28 +2390,24 @@ define <4 x double> @var_shuffle_v4f64_from_v2f64(<2 x double> %v, <4 x i64> %in
define <8 x float> @var_shuffle_v8f32_from_v4f32(<4 x float> %v, <8 x i32> %indices) unnamed_addr nounwind {
; AVX1-LABEL: var_shuffle_v8f32_from_v4f32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpextrq $1, %xmm1, %r8
; AVX1-NEXT: movq %r8, %r10
; AVX1-NEXT: shrq $30, %r10
; AVX1-NEXT: vmovq %xmm1, %r9
; AVX1-NEXT: movq %r9, %rdx
; AVX1-NEXT: shrq $30, %rdx
; AVX1-NEXT: vmovd %xmm1, %r8d
; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX1-NEXT: andl $3, %r9d
; AVX1-NEXT: andl $12, %edx
; AVX1-NEXT: andl $3, %r8d
; AVX1-NEXT: andl $12, %r10d
; AVX1-NEXT: vpextrd $1, %xmm1, %r9d
; AVX1-NEXT: andl $3, %r9d
; AVX1-NEXT: vpextrd $2, %xmm1, %r10d
; AVX1-NEXT: andl $3, %r10d
; AVX1-NEXT: vpextrd $3, %xmm1, %esi
; AVX1-NEXT: andl $3, %esi
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: movq %rax, %rdi
; AVX1-NEXT: shrq $30, %rdi
; AVX1-NEXT: vmovq %xmm0, %rcx
; AVX1-NEXT: movq %rcx, %rsi
; AVX1-NEXT: shrq $30, %rsi
; AVX1-NEXT: andl $3, %ecx
; AVX1-NEXT: andl $12, %esi
; AVX1-NEXT: vmovd %xmm0, %edi
; AVX1-NEXT: andl $3, %edi
; AVX1-NEXT: vpextrd $1, %xmm0, %eax
; AVX1-NEXT: andl $3, %eax
; AVX1-NEXT: andl $12, %edi
; AVX1-NEXT: vpextrd $2, %xmm0, %ecx
; AVX1-NEXT: andl $3, %ecx
; AVX1-NEXT: vpextrd $3, %xmm0, %edx
; AVX1-NEXT: andl $3, %edx
; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
@ -2475,19 +2459,17 @@ define <4 x i32> @var_shuffle_v4i32_from_v8i32(<8 x i32> %v, <4 x i32> %indices)
; AVX-NEXT: movq %rsp, %rbp
; AVX-NEXT: andq $-32, %rsp
; AVX-NEXT: subq $64, %rsp
; AVX-NEXT: vmovq %xmm1, %rax
; AVX-NEXT: movq %rax, %rcx
; AVX-NEXT: shrq $30, %rcx
; AVX-NEXT: andl $28, %ecx
; AVX-NEXT: vpextrq $1, %xmm1, %rdx
; AVX-NEXT: movq %rdx, %rsi
; AVX-NEXT: sarq $32, %rsi
; AVX-NEXT: andl $7, %eax
; AVX-NEXT: andl $7, %edx
; AVX-NEXT: vmovd %xmm1, %eax
; AVX-NEXT: vmovaps %ymm0, (%rsp)
; AVX-NEXT: andl $7, %eax
; AVX-NEXT: vpextrd $1, %xmm1, %ecx
; AVX-NEXT: andl $7, %ecx
; AVX-NEXT: vpextrd $2, %xmm1, %edx
; AVX-NEXT: andl $7, %edx
; AVX-NEXT: vpextrd $3, %xmm1, %esi
; AVX-NEXT: andl $7, %esi
; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vpinsrd $1, (%rsp,%rcx), %xmm0, %xmm0
; AVX-NEXT: vpinsrd $1, (%rsp,%rcx,4), %xmm0, %xmm0
; AVX-NEXT: vpinsrd $2, (%rsp,%rdx,4), %xmm0, %xmm0
; AVX-NEXT: vpinsrd $3, (%rsp,%rsi,4), %xmm0, %xmm0
; AVX-NEXT: movq %rbp, %rsp