mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-11 22:29:37 +00:00
[X86][SSE] combineTruncateWithSat - use truncateVectorWithPACK down to 64-bit subvectors
Add support for chaining PACKSS/PACKUS down to 64-bit vectors by using only a single 128-bit input.

llvm-svn: 325494
This commit is contained in:
parent
c23c92625c
commit
c73a5be0c2
@ -16597,8 +16597,8 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
|
||||
assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
|
||||
"Unexpected PACK opcode");
|
||||
|
||||
// Requires SSE2 but AVX512 has fast truncate.
|
||||
if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
|
||||
// Requires SSE2 but AVX512 has fast vector truncate.
|
||||
if (!Subtarget.hasSSE2() || Subtarget.hasAVX512() || !DstVT.isVector())
|
||||
return SDValue();
|
||||
|
||||
EVT SrcVT = In.getValueType();
|
||||
@ -16607,25 +16607,23 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
|
||||
if (SrcVT == DstVT)
|
||||
return In;
|
||||
|
||||
// We only support vector truncation to 128bits or greater from a
|
||||
// 256bits or greater source.
|
||||
// We only support vector truncation to 64bits or greater from a
|
||||
// 128bits or greater source.
|
||||
unsigned DstSizeInBits = DstVT.getSizeInBits();
|
||||
unsigned SrcSizeInBits = SrcVT.getSizeInBits();
|
||||
if ((DstSizeInBits % 128) != 0 || (SrcSizeInBits % 256) != 0)
|
||||
if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
|
||||
return SDValue();
|
||||
|
||||
unsigned NumElems = SrcVT.getVectorNumElements();
|
||||
if (!isPowerOf2_32(NumElems))
|
||||
return SDValue();
|
||||
|
||||
LLVMContext &Ctx = *DAG.getContext();
|
||||
unsigned NumElems = SrcVT.getVectorNumElements();
|
||||
assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
|
||||
assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
|
||||
|
||||
EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
|
||||
|
||||
// Extract lower/upper subvectors.
|
||||
unsigned NumSubElts = NumElems / 2;
|
||||
SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
|
||||
SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
|
||||
|
||||
// Pack to the largest type possible:
|
||||
// vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
|
||||
EVT InVT = MVT::i16, OutVT = MVT::i8;
|
||||
@ -16635,12 +16633,27 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
|
||||
OutVT = MVT::i16;
|
||||
}
|
||||
|
||||
// 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
|
||||
if (SrcVT.is128BitVector()) {
|
||||
InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
|
||||
OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
|
||||
SDValue Res = DAG.getNode(Opcode, DL, OutVT,
|
||||
DAG.getBitcast(InVT, In), DAG.getUNDEF(InVT));
|
||||
Res = extractSubVector(Res, 0, DAG, DL, 64);
|
||||
return DAG.getBitcast(DstVT, Res);
|
||||
}
|
||||
|
||||
// Extract lower/upper subvectors.
|
||||
unsigned NumSubElts = NumElems / 2;
|
||||
SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
|
||||
SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
|
||||
|
||||
unsigned SubSizeInBits = SrcSizeInBits / 2;
|
||||
InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
|
||||
OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
|
||||
|
||||
// 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
|
||||
if (SrcVT.is256BitVector()) {
|
||||
if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
|
||||
Lo = DAG.getBitcast(InVT, Lo);
|
||||
Hi = DAG.getBitcast(InVT, Hi);
|
||||
SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
|
||||
@ -16669,7 +16682,7 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
|
||||
}
|
||||
|
||||
// Recursively pack lower/upper subvectors, concat result and pack again.
|
||||
assert(SrcSizeInBits >= 512 && "Expected 512-bit vector or greater");
|
||||
assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
|
||||
EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts);
|
||||
Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
|
||||
Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
|
||||
|
@ -3099,98 +3099,27 @@ define <8 x i8> @trunc_packus_v8i32_v8i8(<8 x i32> %a0) {
|
||||
}
|
||||
|
||||
define void @trunc_packus_v8i32_v8i8_store(<8 x i32> %a0, <8 x i8> *%p1) {
|
||||
; SSE2-LABEL: trunc_packus_v8i32_v8i8_store:
|
||||
; SSE2: # %bb.0:
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
|
||||
; SSE2-NEXT: movdqa %xmm2, %xmm3
|
||||
; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
|
||||
; SSE2-NEXT: pand %xmm3, %xmm1
|
||||
; SSE2-NEXT: pandn %xmm2, %xmm3
|
||||
; SSE2-NEXT: por %xmm1, %xmm3
|
||||
; SSE2-NEXT: movdqa %xmm2, %xmm1
|
||||
; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
|
||||
; SSE2-NEXT: pand %xmm1, %xmm0
|
||||
; SSE2-NEXT: pandn %xmm2, %xmm1
|
||||
; SSE2-NEXT: por %xmm0, %xmm1
|
||||
; SSE2-NEXT: pxor %xmm0, %xmm0
|
||||
; SSE2-NEXT: movdqa %xmm1, %xmm4
|
||||
; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
|
||||
; SSE2-NEXT: movdqa %xmm3, %xmm5
|
||||
; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
|
||||
; SSE2-NEXT: pand %xmm2, %xmm5
|
||||
; SSE2-NEXT: pand %xmm3, %xmm5
|
||||
; SSE2-NEXT: pand %xmm2, %xmm4
|
||||
; SSE2-NEXT: pand %xmm1, %xmm4
|
||||
; SSE2-NEXT: packuswb %xmm5, %xmm4
|
||||
; SSE2-NEXT: packuswb %xmm4, %xmm4
|
||||
; SSE2-NEXT: movq %xmm4, (%rdi)
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSSE3-LABEL: trunc_packus_v8i32_v8i8_store:
|
||||
; SSSE3: # %bb.0:
|
||||
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
|
||||
; SSSE3-NEXT: movdqa %xmm2, %xmm3
|
||||
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
|
||||
; SSSE3-NEXT: pand %xmm3, %xmm1
|
||||
; SSSE3-NEXT: pandn %xmm2, %xmm3
|
||||
; SSSE3-NEXT: por %xmm1, %xmm3
|
||||
; SSSE3-NEXT: movdqa %xmm2, %xmm1
|
||||
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
|
||||
; SSSE3-NEXT: pand %xmm1, %xmm0
|
||||
; SSSE3-NEXT: pandn %xmm2, %xmm1
|
||||
; SSSE3-NEXT: por %xmm0, %xmm1
|
||||
; SSSE3-NEXT: pxor %xmm0, %xmm0
|
||||
; SSSE3-NEXT: movdqa %xmm1, %xmm2
|
||||
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
|
||||
; SSSE3-NEXT: pand %xmm1, %xmm2
|
||||
; SSSE3-NEXT: movdqa %xmm3, %xmm1
|
||||
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
|
||||
; SSSE3-NEXT: pand %xmm3, %xmm1
|
||||
; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
|
||||
; SSSE3-NEXT: pshufb %xmm0, %xmm1
|
||||
; SSSE3-NEXT: pshufb %xmm0, %xmm2
|
||||
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
|
||||
; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
|
||||
; SSSE3-NEXT: movq %xmm2, (%rdi)
|
||||
; SSSE3-NEXT: retq
|
||||
;
|
||||
; SSE41-LABEL: trunc_packus_v8i32_v8i8_store:
|
||||
; SSE41: # %bb.0:
|
||||
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
|
||||
; SSE41-NEXT: pminsd %xmm2, %xmm0
|
||||
; SSE41-NEXT: pminsd %xmm2, %xmm1
|
||||
; SSE41-NEXT: pxor %xmm2, %xmm2
|
||||
; SSE41-NEXT: pmaxsd %xmm2, %xmm1
|
||||
; SSE41-NEXT: pmaxsd %xmm2, %xmm0
|
||||
; SSE41-NEXT: packssdw %xmm1, %xmm0
|
||||
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
|
||||
; SSE41-NEXT: movq %xmm0, (%rdi)
|
||||
; SSE41-NEXT: retq
|
||||
; SSE-LABEL: trunc_packus_v8i32_v8i8_store:
|
||||
; SSE: # %bb.0:
|
||||
; SSE-NEXT: packssdw %xmm1, %xmm0
|
||||
; SSE-NEXT: packuswb %xmm0, %xmm0
|
||||
; SSE-NEXT: movq %xmm0, (%rdi)
|
||||
; SSE-NEXT: retq
|
||||
;
|
||||
; AVX1-LABEL: trunc_packus_v8i32_v8i8_store:
|
||||
; AVX1: # %bb.0:
|
||||
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255]
|
||||
; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm2
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
|
||||
; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpmaxsd %xmm1, %xmm2, %xmm1
|
||||
; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm0
|
||||
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
|
||||
; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vmovq %xmm0, (%rdi)
|
||||
; AVX1-NEXT: vzeroupper
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: trunc_packus_v8i32_v8i8_store:
|
||||
; AVX2: # %bb.0:
|
||||
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
|
||||
; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||
; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
|
||||
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
|
||||
; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
|
||||
; AVX2-NEXT: vmovq %xmm0, (%rdi)
|
||||
; AVX2-NEXT: vzeroupper
|
||||
; AVX2-NEXT: retq
|
||||
|
@ -2959,93 +2959,17 @@ define <8 x i8> @trunc_ssat_v8i32_v8i8(<8 x i32> %a0) {
|
||||
}
|
||||
|
||||
define void @trunc_ssat_v8i32_v8i8_store(<8 x i32> %a0, <8 x i8> *%p1) {
|
||||
; SSE2-LABEL: trunc_ssat_v8i32_v8i8_store:
|
||||
; SSE2: # %bb.0:
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127]
|
||||
; SSE2-NEXT: movdqa %xmm2, %xmm3
|
||||
; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
|
||||
; SSE2-NEXT: pand %xmm3, %xmm1
|
||||
; SSE2-NEXT: pandn %xmm2, %xmm3
|
||||
; SSE2-NEXT: por %xmm1, %xmm3
|
||||
; SSE2-NEXT: movdqa %xmm2, %xmm1
|
||||
; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
|
||||
; SSE2-NEXT: pand %xmm1, %xmm0
|
||||
; SSE2-NEXT: pandn %xmm2, %xmm1
|
||||
; SSE2-NEXT: por %xmm0, %xmm1
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [4294967168,4294967168,4294967168,4294967168]
|
||||
; SSE2-NEXT: movdqa %xmm1, %xmm2
|
||||
; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
|
||||
; SSE2-NEXT: pand %xmm2, %xmm1
|
||||
; SSE2-NEXT: pandn %xmm0, %xmm2
|
||||
; SSE2-NEXT: por %xmm1, %xmm2
|
||||
; SSE2-NEXT: movdqa %xmm3, %xmm1
|
||||
; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
|
||||
; SSE2-NEXT: pand %xmm1, %xmm3
|
||||
; SSE2-NEXT: pandn %xmm0, %xmm1
|
||||
; SSE2-NEXT: por %xmm3, %xmm1
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
|
||||
; SSE2-NEXT: pand %xmm0, %xmm1
|
||||
; SSE2-NEXT: pand %xmm0, %xmm2
|
||||
; SSE2-NEXT: packuswb %xmm1, %xmm2
|
||||
; SSE2-NEXT: packuswb %xmm2, %xmm2
|
||||
; SSE2-NEXT: movq %xmm2, (%rdi)
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSSE3-LABEL: trunc_ssat_v8i32_v8i8_store:
|
||||
; SSSE3: # %bb.0:
|
||||
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127]
|
||||
; SSSE3-NEXT: movdqa %xmm2, %xmm3
|
||||
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
|
||||
; SSSE3-NEXT: pand %xmm3, %xmm1
|
||||
; SSSE3-NEXT: pandn %xmm2, %xmm3
|
||||
; SSSE3-NEXT: por %xmm1, %xmm3
|
||||
; SSSE3-NEXT: movdqa %xmm2, %xmm1
|
||||
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
|
||||
; SSSE3-NEXT: pand %xmm1, %xmm0
|
||||
; SSSE3-NEXT: pandn %xmm2, %xmm1
|
||||
; SSSE3-NEXT: por %xmm0, %xmm1
|
||||
; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [4294967168,4294967168,4294967168,4294967168]
|
||||
; SSSE3-NEXT: movdqa %xmm1, %xmm2
|
||||
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
|
||||
; SSSE3-NEXT: pand %xmm2, %xmm1
|
||||
; SSSE3-NEXT: pandn %xmm0, %xmm2
|
||||
; SSSE3-NEXT: por %xmm1, %xmm2
|
||||
; SSSE3-NEXT: movdqa %xmm3, %xmm1
|
||||
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
|
||||
; SSSE3-NEXT: pand %xmm1, %xmm3
|
||||
; SSSE3-NEXT: pandn %xmm0, %xmm1
|
||||
; SSSE3-NEXT: por %xmm3, %xmm1
|
||||
; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
|
||||
; SSSE3-NEXT: pshufb %xmm0, %xmm1
|
||||
; SSSE3-NEXT: pshufb %xmm0, %xmm2
|
||||
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
|
||||
; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
|
||||
; SSSE3-NEXT: movq %xmm2, (%rdi)
|
||||
; SSSE3-NEXT: retq
|
||||
;
|
||||
; SSE41-LABEL: trunc_ssat_v8i32_v8i8_store:
|
||||
; SSE41: # %bb.0:
|
||||
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127]
|
||||
; SSE41-NEXT: pminsd %xmm2, %xmm0
|
||||
; SSE41-NEXT: pminsd %xmm2, %xmm1
|
||||
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4294967168,4294967168,4294967168,4294967168]
|
||||
; SSE41-NEXT: pmaxsd %xmm2, %xmm1
|
||||
; SSE41-NEXT: pmaxsd %xmm2, %xmm0
|
||||
; SSE41-NEXT: packssdw %xmm1, %xmm0
|
||||
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
|
||||
; SSE41-NEXT: movq %xmm0, (%rdi)
|
||||
; SSE41-NEXT: retq
|
||||
; SSE-LABEL: trunc_ssat_v8i32_v8i8_store:
|
||||
; SSE: # %bb.0:
|
||||
; SSE-NEXT: packssdw %xmm1, %xmm0
|
||||
; SSE-NEXT: packsswb %xmm0, %xmm0
|
||||
; SSE-NEXT: movq %xmm0, (%rdi)
|
||||
; SSE-NEXT: retq
|
||||
;
|
||||
; AVX1-LABEL: trunc_ssat_v8i32_v8i8_store:
|
||||
; AVX1: # %bb.0:
|
||||
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127]
|
||||
; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm2
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
|
||||
; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
|
||||
; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpmaxsd %xmm1, %xmm2, %xmm1
|
||||
; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm0
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
|
||||
; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vmovq %xmm0, (%rdi)
|
||||
; AVX1-NEXT: vzeroupper
|
||||
@ -3053,10 +2977,6 @@ define void @trunc_ssat_v8i32_v8i8_store(<8 x i32> %a0, <8 x i8> *%p1) {
|
||||
;
|
||||
; AVX2-LABEL: trunc_ssat_v8i32_v8i8_store:
|
||||
; AVX2: # %bb.0:
|
||||
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127]
|
||||
; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168]
|
||||
; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
|
||||
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
|
||||
; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
|
||||
|
Loading…
x
Reference in New Issue
Block a user