[X86] combineBitcastvxi1 - don't prematurely create PACKSS nodes.

Similar to Issue #63710 - by truncating the v8i16 result with a PACKSS node before type legalization, we fail to make use of various folds that rely on TRUNCATE nodes.

This required tweaks to LowerTruncateVecPackWithSignBits so that it recognises when the truncation source has been widened (its upper half is undef) and more closely matches combineVectorSignBitsTruncation with respect to truncating with PACKSS/PACKUS on AVX512 targets.

One of the last stages before we can finally get rid of combineVectorSignBitsTruncation.
This commit is contained in:
Simon Pilgrim 2023-07-21 19:10:06 +01:00
parent c6c5aad6a2
commit 65c9153cf0
2 changed files with 26 additions and 7 deletions

View File

@ -22945,6 +22945,26 @@ static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In,
(DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
return SDValue();
// Don't lower with PACK nodes on AVX512 targets if we'd need more than one.
if (Subtarget.hasAVX512() &&
SrcSVT.getSizeInBits() > (DstSVT.getSizeInBits() * 2))
return SDValue();
// If the upper half of the source is undef, then attempt to split and
// only truncate the lower half.
if (DstVT.getSizeInBits() >= 128) {
SmallVector<SDValue> LowerOps;
if (isUpperSubvectorUndef(In, LowerOps, DAG)) {
MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
MVT SrcHalfVT = SrcVT.getHalfNumVectorElementsVT();
SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcHalfVT, LowerOps);
if (SDValue Res = LowerTruncateVecPackWithSignBits(DstHalfVT, Lo, DL,
Subtarget, DAG))
return widenSubVector(Res, false, Subtarget, DAG, DL,
DstVT.getSizeInBits());
}
}
unsigned NumSrcEltBits = SrcVT.getScalarSizeInBits();
unsigned NumPackedSignBits = std::min<unsigned>(DstSVT.getSizeInBits(), 16);
unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
@ -45059,9 +45079,10 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
V = getPMOVMSKB(DL, V, DAG, Subtarget);
} else {
if (SExtVT == MVT::v8i16)
V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
DAG.getUNDEF(MVT::v8i16));
if (SExtVT == MVT::v8i16) {
V = widenSubVector(V, false, Subtarget, DAG, DL, 256);
V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v16i8, V);
}
V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
}

View File

@ -1193,9 +1193,8 @@ define i8 @icmp0_v8i1(<8 x i8>) nounwind {
; SSE2: # %bb.0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psllw $15, %xmm0
; SSE2-NEXT: psraw $15, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %eax
; SSE2-NEXT: testl %eax, %eax
; SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA
; SSE2-NEXT: sete %al
; SSE2-NEXT: ret{{[l|q]}}
;
@ -1203,9 +1202,8 @@ define i8 @icmp0_v8i1(<8 x i8>) nounwind {
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: psllw $15, %xmm0
; SSE41-NEXT: psraw $15, %xmm0
; SSE41-NEXT: pmovmskb %xmm0, %eax
; SSE41-NEXT: testl %eax, %eax
; SSE41-NEXT: testl $43690, %eax # imm = 0xAAAA
; SSE41-NEXT: sete %al
; SSE41-NEXT: retq
;