[X86][AVX1] Enable *_EXTEND_VECTOR_INREG lowering of 256-bit vectors
As discussed on D52964, this adds 256-bit *_EXTEND_VECTOR_INREG lowering support for AVX1 targets to help improve SimplifyDemandedBits handling.

Differential Revision: https://reviews.llvm.org/D52980

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344019 91177308-0d34-0410-b5e6-96231b3b80d8
parent 7f9eb168a9
commit 729310ff09
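In outline, the change lets 256-bit SIGN/ZERO_EXTEND_VECTOR_INREG nodes reach LowerEXTEND_VECTOR_INREG on plain AVX1, where they are split into two 128-bit extends: the low half of the result comes from extending the source directly, the high half from shuffling the upper source elements down first, and the two halves are concatenated. The standalone C++ sketch below models that shape for a v16i8 -> v16i16 zero-extend; it is illustrative only (plain arrays instead of the SelectionDAG API, and the helper names are invented).

// Illustrative model of the AVX1 split; not LLVM code.
#include <array>
#include <cstdint>
#include <cstdio>

using V16i8  = std::array<int8_t, 16>;   // 128-bit source vector
using V8i16  = std::array<int16_t, 8>;   // one 128-bit half of the result
using V16i16 = std::array<int16_t, 16>;  // 256-bit result

// Models a 128-bit ZERO_EXTEND_VECTOR_INREG (vpmovzxbw): the low 8 bytes of
// the source are zero-extended to 8 x i16.
static V8i16 ZextInReg128(const V16i8 &In) {
  V8i16 Out{};
  for (int I = 0; I != 8; ++I)
    Out[I] = static_cast<uint8_t>(In[I]);
  return Out;
}

// Models the new 256-bit path: extend the low half directly, shuffle the upper
// source elements down (the HiMask shuffle in LowerEXTEND_VECTOR_INREG), extend
// them too, then concatenate the two 128-bit halves.
static V16i16 ZextInReg256(const V16i8 &In) {
  V16i8 Hi{};                              // HiMask[i] = HalfNumElts + i
  for (int I = 0; I != 8; ++I)
    Hi[I] = In[8 + I];                     // remaining lanes are "don't care"
  V8i16 Lo16 = ZextInReg128(In);
  V8i16 Hi16 = ZextInReg128(Hi);
  V16i16 Out{};                            // CONCAT_VECTORS of the two halves
  for (int I = 0; I != 8; ++I) {
    Out[I] = Lo16[I];
    Out[8 + I] = Hi16[I];
  }
  return Out;
}

int main() {
  V16i8 In{};
  for (int I = 0; I != 16; ++I)
    In[I] = static_cast<int8_t>(0xF0 + I); // high bit set, so zero-extension is visible
  for (int16_t V : ZextInReg256(In))
    std::printf("%d ", V);                 // prints 240 241 ... 255
  std::printf("\n");
}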
@@ -1124,12 +1124,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
     }
 
-    if (HasInt256) {
-      for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
-        setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
-        setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
-      }
+    for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
+      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
+      setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
+    }
 
+    if (HasInt256) {
       // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
       // when we have a 256bit-wide blend with immediate.
       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
@@ -19713,18 +19713,20 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
   if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
     return SDValue();
   if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
-      !(VT.is256BitVector() && Subtarget.hasInt256()) &&
+      !(VT.is256BitVector() && Subtarget.hasAVX()) &&
       !(VT.is512BitVector() && Subtarget.hasAVX512()))
     return SDValue();
 
   SDLoc dl(Op);
+  unsigned Opc = Op.getOpcode();
+  unsigned NumElts = VT.getVectorNumElements();
 
   // For 256-bit vectors, we only need the lower (128-bit) half of the input.
   // For 512-bit vectors, we need 128-bits or 256-bits.
   if (VT.getSizeInBits() > 128) {
     // Input needs to be at least the same number of elements as output, and
     // at least 128-bits.
-    int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
+    int InSize = InSVT.getSizeInBits() * NumElts;
     In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
   }
 
@@ -19733,14 +19735,31 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
   // need to be handled here for 256/512-bit results.
   if (Subtarget.hasInt256()) {
     assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
-    unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
-                        X86ISD::VSEXT : X86ISD::VZEXT;
+    unsigned ExtOpc =
+        Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? X86ISD::VSEXT : X86ISD::VZEXT;
     return DAG.getNode(ExtOpc, dl, VT, In);
   }
 
+  // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
+  if (Subtarget.hasAVX()) {
+    assert(VT.is256BitVector() && "256-bit vector expected");
+    int HalfNumElts = NumElts / 2;
+    MVT HalfVT = MVT::getVectorVT(SVT, HalfNumElts);
+
+    InVT = In.getSimpleValueType();
+    unsigned NumSrcElts = InVT.getVectorNumElements();
+    SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
+    for (int i = 0; i != HalfNumElts; ++i)
+      HiMask[i] = HalfNumElts + i;
+
+    SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
+    SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
+    Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
+    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
+  }
+
   // We should only get here for sign extend.
-  assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
-         "Unexpected opcode!");
+  assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
 
   // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
   SDValue Curr = In;
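For a concrete picture of the HiMask built in the new AVX1 branch above: with a v16i16 result extended in-reg from a v16i8 source, NumElts is 16, HalfNumElts is 8 and NumSrcElts is 16, so the mask pulls source elements 8..15 into the low half and leaves the rest undefined. A minimal standalone snippet, for illustration only (SM_SentinelUndef is LLVM's -1 "don't care" shuffle index; the rest is assumed values for this one case):

#include <cstdio>
#include <vector>

int main() {
  // v16i16 result extended in-reg from a v16i8 source.
  const int NumElts = 16;                 // result elements
  const int HalfNumElts = NumElts / 2;    // 8
  const int NumSrcElts = 16;              // source elements
  const int SM_SentinelUndef = -1;        // "don't care" shuffle index

  std::vector<int> HiMask(NumSrcElts, SM_SentinelUndef);
  for (int I = 0; I != HalfNumElts; ++I)
    HiMask[I] = HalfNumElts + I;          // pull source elements 8..15 down

  for (int M : HiMask)
    std::printf("%d ", M);                // 8 9 10 11 12 13 14 15 -1 ... -1
  std::printf("\n");
}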
@@ -38346,11 +38365,11 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
                        DAG.getIntPtrConstant(0, DL));
   }
 
-  // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to
+  // If target-size is 128-bits (or 256-bits on AVX target), then convert to
   // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
   // Also use this if we don't have SSE41 to allow the legalizer do its job.
   if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
-      (VT.is256BitVector() && Subtarget.hasInt256()) ||
+      (VT.is256BitVector() && Subtarget.hasAVX()) ||
       (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
     SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
     return Opcode == ISD::SIGN_EXTEND
@@ -38377,9 +38396,9 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
   };
 
-  // On pre-AVX2 targets, split into 128-bit nodes of
+  // On pre-AVX targets, split into 128-bit nodes of
   // ISD::*_EXTEND_VECTOR_INREG.
-  if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
+  if (!Subtarget.hasAVX() && !(VT.getSizeInBits() % 128))
     return SplitAndExtendInReg(128);
 
   // On pre-AVX512 targets, split into 256-bit nodes of
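Most of the AVX1 test updates below just swap a vpshufd immediate: [3,1,2,3] becomes [3,3,0,1]. Both variants place source element 3 in lane 0, which is the only part the following vpmovzx/vpmovsx in-reg extension reads, so the generated code is equivalent. A small decoder for the two immediates, for illustration only (DecodePshufd is an invented helper, not an LLVM API):

#include <array>
#include <cstdio>

// Decode a pshufd immediate: each 2-bit field selects the 32-bit source
// element for one destination lane, lane 0 in the lowest two bits.
static std::array<int, 4> DecodePshufd(unsigned Imm) {
  return {static_cast<int>(Imm & 3), static_cast<int>((Imm >> 2) & 3),
          static_cast<int>((Imm >> 4) & 3), static_cast<int>((Imm >> 6) & 3)};
}

int main() {
  const unsigned Imms[] = {0xE7, 0x4F};   // 0xE7 encodes [3,1,2,3], 0x4F encodes [3,3,0,1]
  for (unsigned Imm : Imms) {
    std::array<int, 4> E = DecodePshufd(Imm);
    std::printf("imm=0x%02X -> [%d,%d,%d,%d]\n", Imm, E[0], E[1], E[2], E[3]);
  }
  // Both immediates place source element 3 in lane 0, which is all the
  // vpmovzx/vpmovsx that follows actually reads.
}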
@@ -275,21 +275,21 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[3,1,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[3,3,0,1]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,2,3]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm12 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[3,1,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[3,3,0,1]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm13 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,2,3]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,3,0,1]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[3,1,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[3,3,0,1]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm10 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[1,1,2,3]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
@@ -301,7 +301,7 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
 ; AVX1-NEXT: vpaddd %xmm5, %xmm7, %xmm5
 ; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[3,1,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[3,3,0,1]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
 ; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm9
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[1,1,2,3]
@@ -312,7 +312,7 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,3,0,1]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
 ; AVX1-NEXT: vpaddd %xmm7, %xmm12, %xmm12
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[3,1,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[3,3,0,1]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
 ; AVX1-NEXT: vpaddd %xmm5, %xmm13, %xmm13
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,3]
@@ -323,7 +323,7 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
 ; AVX1-NEXT: vpaddd %xmm4, %xmm14, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[3,1,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[3,3,0,1]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
 ; AVX1-NEXT: vpaddd %xmm6, %xmm10, %xmm6
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
@@ -332,33 +332,31 @@ define <8 x i16> @pmaddubsw_bad_extend(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa (%rsi), %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm5
; AVX1-NEXT: vpmovsxbd %xmm5, %xmm5
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX1-NEXT: vpmulld %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX1-NEXT: vpmulld %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm3
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; AVX1-NEXT: vpmulld %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <9,11,13,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm5
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <1,3,5,7,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm4
; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4
; AVX1-NEXT: vpmulld %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm3
; AVX1-NEXT: vpmulld %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpackssdw %xmm0, %xmm3, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: pmaddubsw_bad_extend:
@@ -463,29 +461,29 @@ define <8 x i16> @pmaddubsw_bad_indices(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa (%rsi), %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[9,10,13,14,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,2,5,6,9,10,13,14,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpmovsxbd %xmm2, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,2,5,6,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
; AVX1-NEXT: vpmulld %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
; AVX1-NEXT: vpmulld %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[8,11,12,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[9,11,13,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; AVX1-NEXT: vpmulld %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpackssdw %xmm0, %xmm3, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: pmaddubsw_bad_indices:
@@ -768,7 +768,7 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
 ; AVX1-NEXT: vpmaxud %xmm0, %xmm6, %xmm7
@@ -1772,10 +1772,6 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
@@ -345,7 +345,7 @@ define <16 x i32> @sext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ss
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
 ; AVX1-NEXT: vmovaps %ymm2, %ymm0
@@ -576,9 +576,9 @@ define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp
 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2
 ; AVX1-NEXT: vpmovsxbq %xmm2, %xmm2
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmovsxbq %xmm1, %xmm1
-; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovsxbq %xmm0, %xmm1
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
 ; AVX1-NEXT: vpmovsxbq %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
 ; AVX1-NEXT: vmovaps %ymm2, %ymm0
@@ -971,7 +971,7 @@ define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vpmovsxwq %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
 ; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
 ; AVX1-NEXT: vmovaps %ymm2, %ymm0
@@ -293,7 +293,7 @@ define <16 x i32> @zext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ss
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
 ; AVX1-NEXT: vmovaps %ymm2, %ymm0
@@ -445,9 +445,9 @@ define <8 x i64> @zext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp
 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2
 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
 ; AVX1-NEXT: vmovaps %ymm2, %ymm0
@@ -747,7 +747,7 @@ define <8 x i64> @zext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
 ; AVX1-NEXT: vmovaps %ymm2, %ymm0
@@ -2225,12 +2225,12 @@ define <32 x i32> @zext_32i8_to_32i32(<32 x i8> %x) {
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,0,1]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
 ; AVX1-NEXT: vmovaps %ymm4, %ymm0