[DAGCombiner] shrink/widen a vselect to match its condition operand size (PR14657)
We discussed shrinking/widening of selects in IR in D26556, and I'll try to get back to that patch eventually. But I'm hoping that this transform is less iffy in the DAG, where we can check legality of the select that we want to produce. A few things to note:

1. We can't wait until after legalization and do this generically because (at least in the x86 tests from PR14657) we'll have PACKSS and bitcasts in the pattern.
2. This might benefit more of the SSE codegen if we lifted the legal-or-custom requirement, but that requires a closer look to make sure we don't end up worse.
3. There's a 'vblendv' opportunity that we're missing that results in andn/and/or in some cases. That should be fixed next.
4. I'm assuming that AVX1 offers the worst of all worlds wrt uneven ISA support with multiple legal vector sizes, but if there are other targets like that, we should add more tests.
5. There's a codegen miracle in the multi-BB tests from PR14657 (the gcc auto-vectorization tests): despite IR that is terrible for the target, this patch allows us to generate the optimal loop code because something post-ISEL is hoisting the splat extends above the vector loops.

Differential Revision: https://reviews.llvm.org/D32620

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@301781 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent 21ade9ba1e
commit 7c77a6cf1a
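To make the rewrite concrete, here is the widening case from the sext test updated below, written as LLVM IR; the trailing sext/ret lines are reconstructed from the test's function signature, and the before/after DAG shape is an illustrative sketch rather than a verbatim dump:

; A setcc-fed vselect whose result is widened: the select operands are
; <8 x i16> (128 bits) while the compare inputs and the final result are 256 bits.
define <8 x i32> @sext(<8 x float> %a, <8 x float> %b, <8 x i16> %c, <8 x i16> %d) {
  %cmp = fcmp olt <8 x float> %a, %b
  %sel = select <8 x i1> %cmp, <8 x i16> %c, <8 x i16> %d
  %ext = sext <8 x i16> %sel to <8 x i32>
  ret <8 x i32> %ext
}

; DAG-level rewrite performed by matchVSelectOpSizesWithSetCC:
;   sign_extend (vselect (setcc %a, %b), %c, %d)
;     --> vselect (setcc %a, %b), (sign_extend %c), (sign_extend %d)
; With the condition and both operands now the same 256-bit size, AVX can emit
; a single vblendvps instead of packing the mask down to 128 bits and blending
; with pandn/pand/por.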
@@ -399,6 +399,7 @@ namespace {
    SDValue createBuildVecShuffle(const SDLoc &DL, SDNode *N,
                                  ArrayRef<int> VectorMask, SDValue VecIn1,
                                  SDValue VecIn2, unsigned LeftIdx);
    SDValue matchVSelectOpSizesWithSetCC(SDNode *N);

    SDValue GetDemandedBits(SDValue V, const APInt &Mask);
@@ -6942,6 +6943,51 @@ SDValue DAGCombiner::CombineExtLoad(SDNode *N) {
  return SDValue(N, 0); // Return N so it doesn't get rechecked!
}

/// If we're narrowing or widening the result of a vector select and the final
/// size is the same size as a setcc (compare) feeding the select, then try to
/// apply the cast operation to the select's operands because matching vector
/// sizes for a select condition and other operands should be more efficient.
SDValue DAGCombiner::matchVSelectOpSizesWithSetCC(SDNode *Cast) {
  unsigned CastOpcode = Cast->getOpcode();
  assert((CastOpcode == ISD::SIGN_EXTEND || CastOpcode == ISD::ZERO_EXTEND ||
          CastOpcode == ISD::TRUNCATE || CastOpcode == ISD::FP_EXTEND ||
          CastOpcode == ISD::FP_ROUND) &&
         "Unexpected opcode for vector select narrowing/widening");

  // We only do this transform before legal ops because the pattern may be
  // obfuscated by target-specific operations after legalization. Do not create
  // an illegal select op, however, because that may be difficult to lower.
  EVT VT = Cast->getValueType(0);
  if (LegalOperations || !TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
    return SDValue();

  SDValue VSel = Cast->getOperand(0);
  if (VSel.getOpcode() != ISD::VSELECT || !VSel.hasOneUse() ||
      VSel.getOperand(0).getOpcode() != ISD::SETCC)
    return SDValue();

  // Does the setcc have the same vector size as the casted select?
  SDValue SetCC = VSel.getOperand(0);
  EVT SetCCVT = getSetCCResultType(SetCC.getOperand(0).getValueType());
  if (SetCCVT.getSizeInBits() != VT.getSizeInBits())
    return SDValue();

  // cast (vsel (setcc X), A, B) --> vsel (setcc X), (cast A), (cast B)
  SDValue A = VSel.getOperand(1);
  SDValue B = VSel.getOperand(2);
  SDValue CastA, CastB;
  SDLoc DL(Cast);
  if (CastOpcode == ISD::FP_ROUND) {
    // FP_ROUND (fptrunc) has an extra flag operand to pass along.
    CastA = DAG.getNode(CastOpcode, DL, VT, A, Cast->getOperand(1));
    CastB = DAG.getNode(CastOpcode, DL, VT, B, Cast->getOperand(1));
  } else {
    CastA = DAG.getNode(CastOpcode, DL, VT, A);
    CastB = DAG.getNode(CastOpcode, DL, VT, B);
  }
  return DAG.getNode(ISD::VSELECT, DL, VT, SetCC, CastA, CastB);
}

SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
@@ -7165,6 +7211,9 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
      DAG.SignBitIsZero(N0))
    return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0);

  if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
    return NewVSel;

  return SDValue();
}
@@ -7498,6 +7547,9 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
                       ShAmt);
  }

  if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
    return NewVSel;

  return SDValue();
}
@@ -8292,6 +8344,9 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
    return DAG.getNode(N0.getOpcode(), SL, VTs, X, Y, N0.getOperand(2));
  }

  if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
    return NewVSel;

  return SDValue();
}
@@ -10243,6 +10298,9 @@ SDValue DAGCombiner::visitFP_ROUND(SDNode *N) {
                         Tmp, N0.getOperand(1));
  }

  if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
    return NewVSel;

  return SDValue();
}
@@ -10309,6 +10367,9 @@ SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) {
    return SDValue(N, 0); // Return N so it doesn't get rechecked!
  }

  if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
    return NewVSel;

  return SDValue();
}
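The remaining hunks update the accompanying x86 regression test. The narrowing direction is exercised as well; the fptrunc function shown further down is the interesting case because its cast becomes an FP_ROUND node, whose extra flag operand the combine forwards to both new casts. Reconstructed from the test's IR lines (the final ret is inferred from the signature), the pattern and rewrite look like:

define <4 x float> @fptrunc(<4 x float> %a, <4 x float> %b, <4 x double> %c, <4 x double> %d) {
  %cmp = fcmp olt <4 x float> %a, %b
  %sel = select <4 x i1> %cmp, <4 x double> %c, <4 x double> %d
  %tr = fptrunc <4 x double> %sel to <4 x float>
  ret <4 x float> %tr
}

; DAG-level rewrite, with FP_ROUND's flag operand passed along:
;   fp_round (vselect (setcc %a, %b), %c, %d), flag
;     --> vselect (setcc %a, %b), (fp_round %c, flag), (fp_round %d, flag)
; The v4f32 condition now matches the v4f32 operands, so the updated AVX output
; below is a single 128-bit vblendvps on the already-truncated values.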
@@ -49,26 +49,23 @@ define <8 x i32> @sext(<8 x float> %a, <8 x float> %b, <8 x i16> %c, <8 x i16> %
; AVX1-LABEL: sext:
; AVX1: # BB#0:
; AVX1-NEXT: vcmpltps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpandn %xmm3, %xmm0, %xmm1
; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vpmovsxwd %xmm2, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: vpmovsxwd %xmm3, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; AVX1-NEXT: vpmovsxwd %xmm3, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT: vblendvps %ymm0, %ymm1, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: sext:
; AVX2: # BB#0:
; AVX2-NEXT: vcmpltps %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpandn %xmm3, %xmm0, %xmm1
; AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vpmovsxwd %xmm2, %ymm1
; AVX2-NEXT: vpmovsxwd %xmm3, %ymm2
; AVX2-NEXT: vblendvps %ymm0, %ymm1, %ymm2, %ymm0
; AVX2-NEXT: retq
%cmp = fcmp olt <8 x float> %a, %b
%sel = select <8 x i1> %cmp, <8 x i16> %c, <8 x i16> %d
@@ -117,26 +114,23 @@ define <8 x i32> @zext(<8 x float> %a, <8 x float> %b, <8 x i16> %c, <8 x i16> %
; AVX1-LABEL: zext:
; AVX1: # BB#0:
; AVX1-NEXT: vcmpltps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpandn %xmm3, %xmm0, %xmm1
; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT: vblendvps %ymm0, %ymm1, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: zext:
; AVX2: # BB#0:
; AVX2-NEXT: vcmpltps %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpandn %xmm3, %xmm0, %xmm1
; AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; AVX2-NEXT: vblendvps %ymm0, %ymm1, %ymm2, %ymm0
; AVX2-NEXT: retq
%cmp = fcmp olt <8 x float> %a, %b
%sel = select <8 x i1> %cmp, <8 x i16> %c, <8 x i16> %d
@@ -173,10 +167,9 @@ define <4 x double> @fpext(<4 x double> %a, <4 x double> %b, <4 x float> %c, <4
; AVX-LABEL: fpext:
; AVX: # BB#0:
; AVX-NEXT: vcmpltpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
; AVX-NEXT: vcvtps2pd %xmm0, %ymm0
; AVX-NEXT: vcvtps2pd %xmm2, %ymm1
; AVX-NEXT: vcvtps2pd %xmm3, %ymm2
; AVX-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0
; AVX-NEXT: retq
%cmp = fcmp olt <4 x double> %a, %b
%sel = select <4 x i1> %cmp, <4 x float> %c, <4 x float> %d
@@ -188,64 +181,65 @@ define <8 x i16> @trunc(<8 x i16> %a, <8 x i16> %b, <8 x i32> %c, <8 x i32> %d)
; SSE2-LABEL: trunc:
; SSE2: # BB#0:
; SSE2-NEXT: pcmpeqw %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: pslld $16, %xmm5
; SSE2-NEXT: psrad $16, %xmm5
; SSE2-NEXT: pslld $16, %xmm4
; SSE2-NEXT: psrad $16, %xmm4
; SSE2-NEXT: packssdw %xmm5, %xmm4
; SSE2-NEXT: pslld $16, %xmm3
; SSE2-NEXT: psrad $16, %xmm3
; SSE2-NEXT: pslld $16, %xmm2
; SSE2-NEXT: psrad $16, %xmm2
; SSE2-NEXT: packssdw %xmm3, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: pandn %xmm4, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: pandn %xmm5, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: pslld $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: pslld $16, %xmm0
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: trunc:
; SSE41: # BB#0:
; SSE41-NEXT: pcmpeqw %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm4
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm5
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSE41-NEXT: pshufb %xmm0, %xmm5
; SSE41-NEXT: pshufb %xmm0, %xmm4
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSE41-NEXT: pshufb %xmm1, %xmm5
; SSE41-NEXT: pshufb %xmm1, %xmm4
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: pshufb %xmm1, %xmm3
; SSE41-NEXT: pshufb %xmm1, %xmm2
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE41-NEXT: pand %xmm0, %xmm2
; SSE41-NEXT: pandn %xmm4, %xmm0
; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc:
; AVX1: # BB#0:
; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm3, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc:
; AVX2: # BB#0:
; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm3, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm3
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
; AVX2-NEXT: vpandn %xmm3, %xmm0, %xmm3
; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
%cmp = icmp eq <8 x i16> %a, %b
@@ -258,61 +252,38 @@ define <4 x float> @fptrunc(<4 x float> %a, <4 x float> %b, <4 x double> %c, <4
; SSE2-LABEL: fptrunc:
; SSE2: # BB#0:
; SSE2-NEXT: cmpltps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: xorps %xmm6, %xmm6
; SSE2-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: movaps %xmm6, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE2-NEXT: psrad $31, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: pandn %xmm5, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: pandn %xmm4, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: cvtpd2ps %xmm0, %xmm0
; SSE2-NEXT: cvtpd2ps %xmm1, %xmm1
; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: cvtpd2ps %xmm5, %xmm1
; SSE2-NEXT: cvtpd2ps %xmm4, %xmm4
; SSE2-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm1[0]
; SSE2-NEXT: cvtpd2ps %xmm3, %xmm1
; SSE2-NEXT: cvtpd2ps %xmm2, %xmm2
; SSE2-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE2-NEXT: andpd %xmm0, %xmm2
; SSE2-NEXT: andnpd %xmm4, %xmm0
; SSE2-NEXT: orpd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: fptrunc:
; SSE41: # BB#0:
; SSE41-NEXT: cmpltps %xmm1, %xmm0
; SSE41-NEXT: xorps %xmm1, %xmm1
; SSE41-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE41-NEXT: pmovsxdq %xmm0, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4
; SSE41-NEXT: cvtpd2ps %xmm3, %xmm1
; SSE41-NEXT: cvtpd2ps %xmm2, %xmm2
; SSE41-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE41-NEXT: cvtpd2ps %xmm5, %xmm3
; SSE41-NEXT: cvtpd2ps %xmm4, %xmm1
; SSE41-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5
; SSE41-NEXT: cvtpd2ps %xmm5, %xmm1
; SSE41-NEXT: cvtpd2ps %xmm4, %xmm0
; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: retq
;
; AVX1-LABEL: fptrunc:
; AVX1: # BB#0:
; AVX1-NEXT: vcmpltps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm3, %ymm0
; AVX1-NEXT: vcvtpd2ps %ymm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: fptrunc:
; AVX2: # BB#0:
; AVX2-NEXT: vcmpltps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm3, %ymm0
; AVX2-NEXT: vcvtpd2ps %ymm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX-LABEL: fptrunc:
; AVX: # BB#0:
; AVX-NEXT: vcmpltps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vcvtpd2ps %ymm2, %xmm1
; AVX-NEXT: vcvtpd2ps %ymm3, %xmm2
; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
%cmp = fcmp olt <4 x float> %a, %b
%sel = select <4 x i1> %cmp, <4 x double> %c, <4 x double> %d
%tr = fptrunc <4 x double> %sel to <4 x float>
@@ -567,20 +538,20 @@ define void @example24(i16 signext %x, i16 signext %y) nounwind {
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: movq $-4096, %rax # imm = 0xF000
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vpmovsxwd %xmm1, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vpmovsxwd %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB6_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vmovups da+4096(%rax), %ymm2
; AVX1-NEXT: vcmpltps db+4096(%rax), %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vpacksswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpandn %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovsxwd %xmm2, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm2
; AVX1-NEXT: vmovups %ymm2, dj+4096(%rax)
; AVX1-NEXT: addq $32, %rax
; AVX1-NEXT: jne .LBB6_1
@@ -595,18 +566,15 @@ define void @example24(i16 signext %x, i16 signext %y) nounwind {
; AVX2-NEXT: vmovd %esi, %xmm1
; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX2-NEXT: movq $-4096, %rax # imm = 0xF000
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vpmovsxwd %xmm1, %ymm1
; AVX2-NEXT: .p2align 4, 0x90
; AVX2-NEXT: .LBB6_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vmovups da+4096(%rax), %ymm2
; AVX2-NEXT: vcmpltps db+4096(%rax), %ymm2, %ymm2
; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX2-NEXT: vpacksswb %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpandn %xmm1, %xmm2, %xmm3
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpmovsxwd %xmm2, %ymm2
; AVX2-NEXT: vmovdqu %ymm2, dj+4096(%rax)
; AVX2-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vmovups %ymm2, dj+4096(%rax)
; AVX2-NEXT: addq $32, %rax
; AVX2-NEXT: jne .LBB6_1
; AVX2-NEXT: # BB#2: # %for.end