[x86][SSE/AVX] optimize pcmp results better (PR28484)

We know that pcmp produces an all-ones/all-zeros bitmask per element, so we can use that behavior to avoid an
unnecessary constant load.
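
As a purely illustrative sketch (not part of this patch), the rewrite for the v8i16 case can be expressed with
SSE2 intrinsics; the function names below are hypothetical:

  #include <immintrin.h>

  // Before: zext(setcc) is lowered as a compare plus an 'and' with a
  // splat-of-1 constant that has to be loaded (or broadcast) from memory.
  __m128i zext_cmp_and(__m128i a, __m128i b) {
    __m128i mask = _mm_cmpeq_epi16(a, b);          // each i16 lane: 0xFFFF or 0x0000
    return _mm_and_si128(mask, _mm_set1_epi16(1)); // pcmpeqw + pand with a constant-pool load
  }

  // After: a logical shift right by (element width - 1) turns the all-ones/
  // all-zeros lanes into 1/0 without touching memory.
  __m128i zext_cmp_shift(__m128i a, __m128i b) {
    __m128i mask = _mm_cmpeq_epi16(a, b);
    return _mm_srli_epi16(mask, 15);               // pcmpeqw + psrlw $15
  }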

One could argue that load+and is actually a better solution for some CPUs (Intel big cores) because shifts don't have
the same throughput potential as load+and on those cores, but if that ever comes up it should be handled as a later,
CPU-specific transformation. Removing the load is the more general x86 optimization. Note that the inconsistent use of
vpbroadcast in the test cases is tracked as PR28505:
https://llvm.org/bugs/show_bug.cgi?id=28505

Differential Revision: http://reviews.llvm.org/D22225



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@275276 91177308-0d34-0410-b5e6-96231b3b80d8
Sanjay Patel 2016-07-13 16:04:07 +00:00
parent 549def0571
commit 2ca896a175
5 changed files with 66 additions and 41 deletions


@@ -28186,6 +28186,42 @@ static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
  return SDValue();
}

/// If this is a PCMPEQ or PCMPGT result that is bitwise-anded with 1 (this is
/// the x86 lowering of a SETCC + ZEXT), replace the 'and' with a shift-right to
/// eliminate loading the vector constant mask value. This relies on the fact
/// that a PCMP always creates an all-ones or all-zeros bitmask per element.
static SDValue combinePCMPAnd1(SDNode *N, SelectionDAG &DAG) {
  SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
  SDValue Op1 = peekThroughBitcasts(N->getOperand(1));

  // TODO: Use AssertSext to mark any nodes that have the property of producing
  // all-ones or all-zeros. Then check for that node rather than particular
  // opcodes.
  if (Op0.getOpcode() != X86ISD::PCMPEQ && Op0.getOpcode() != X86ISD::PCMPGT)
    return SDValue();

  // The existence of the PCMP node guarantees that we have the required SSE2 or
  // AVX2 for a shift of this vector type, but there is no vector shift by
  // immediate for a vector with byte elements (PSRLB). 512-bit vectors use the
  // masked compare nodes, so they should not make it here.
  EVT VT0 = Op0.getValueType();
  EVT VT1 = Op1.getValueType();
  unsigned EltBitWidth = VT0.getScalarType().getSizeInBits();
  if (VT0 != VT1 || EltBitWidth == 8)
    return SDValue();

  assert(VT0.getSizeInBits() == 128 || VT0.getSizeInBits() == 256);

  APInt SplatVal;
  if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) || SplatVal != 1)
    return SDValue();

  SDLoc DL(N);
  SDValue ShAmt = DAG.getConstant(EltBitWidth - 1, DL, MVT::i8);
  SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
  return DAG.getBitcast(N->getValueType(0), Shift);
}

static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI,
                          const X86Subtarget &Subtarget) {
@@ -28204,6 +28240,9 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
  if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
    return R;

  if (SDValue ShiftRight = combinePCMPAnd1(N, DAG))
    return ShiftRight;

  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
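
A scalar sanity check of the identity the combine relies on (illustrative only, not part of the patch):
'and 1' and 'logical shift right by width-1' agree exactly because the PCMP result is guaranteed to be
all-ones or all-zeros per element; for arbitrary inputs they would differ:

  #include <cassert>
  #include <cstdint>

  int main() {
    // Each 16-bit lane of a PCMPEQW/PCMPGTW result is 0xFFFF or 0x0000.
    const uint16_t lanes[] = {0xFFFF, 0x0000};
    for (uint16_t lane : lanes)
      assert((lane & 1) == (lane >> 15));
    // For a general value the two differ (e.g. 3 & 1 == 1 but 3 >> 15 == 0),
    // which is why the combine only fires on PCMPEQ/PCMPGT results.
    return 0;
  }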


@@ -1919,10 +1919,9 @@ define <32 x i16> @zext_32xi1_to_32xi16(<32 x i16> %x, <32 x i16> %y) #0 {
; KNL-LABEL: zext_32xi1_to_32xi16:
; KNL: ## BB#0:
; KNL-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
; KNL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; KNL-NEXT: vpand %ymm2, %ymm0, %ymm0
; KNL-NEXT: vpsrlw $15, %ymm0, %ymm0
; KNL-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1
; KNL-NEXT: vpand %ymm2, %ymm1, %ymm1
; KNL-NEXT: vpsrlw $15, %ymm1, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: zext_32xi1_to_32xi16:
@@ -1939,7 +1938,7 @@ define <16 x i16> @zext_16xi1_to_16xi16(<16 x i16> %x, <16 x i16> %y) #0 {
; KNL-LABEL: zext_16xi1_to_16xi16:
; KNL: ## BB#0:
; KNL-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; KNL-NEXT: vpsrlw $15, %ymm0, %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: zext_16xi1_to_16xi16:
@@ -1983,8 +1982,7 @@ define <4 x i32> @zext_4xi1_to_4x32(<4 x i8> %x, <4 x i8> %y) #0 {
; KNL-NEXT: vpand %xmm2, %xmm1, %xmm1
; KNL-NEXT: vpand %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0
; KNL-NEXT: vpsrld $31, %xmm0, %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: zext_4xi1_to_4x32:
@@ -2007,7 +2005,7 @@ define <2 x i64> @zext_2xi1_to_2xi64(<2 x i8> %x, <2 x i8> %y) #0 {
; KNL-NEXT: vpand %xmm2, %xmm1, %xmm1
; KNL-NEXT: vpand %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; KNL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; KNL-NEXT: vpsrlq $63, %xmm0, %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: zext_2xi1_to_2xi64:


@@ -1215,7 +1215,7 @@ define <2 x i64> @test45(<2 x i16> %x, <2 x i16> %y) #0 {
; KNL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; KNL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; KNL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; KNL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; KNL-NEXT: vpsrlq $63, %xmm0, %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test45:


@@ -26,14 +26,14 @@ define <8 x i16> @bar(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: bar:
; SSE: # BB#0:
; SSE-NEXT: pcmpeqw %xmm1, %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: psrlw $15, %xmm0
; SSE-NEXT: psllw $5, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: bar:
; AVX: # BB#0:
; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX-NEXT: vpsllw $5, %xmm0, %xmm0
; AVX-NEXT: retq
;


@@ -294,10 +294,9 @@ define <16 x i16> @cmpeq_zext_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE-LABEL: cmpeq_zext_v16i16:
; SSE: # BB#0:
; SSE-NEXT: pcmpeqw %xmm2, %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1]
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: psrlw $15, %xmm0
; SSE-NEXT: pcmpeqw %xmm3, %xmm1
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: psrlw $15, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: cmpeq_zext_v16i16:
@@ -313,7 +312,7 @@ define <16 x i16> @cmpeq_zext_v16i16(<16 x i16> %a, <16 x i16> %b) {
; AVX2-LABEL: cmpeq_zext_v16i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $15, %ymm0, %ymm0
; AVX2-NEXT: retq
;
%cmp = icmp eq <16 x i16> %a, %b
@@ -325,21 +324,14 @@ define <4 x i32> @cmpeq_zext_v4i32(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: cmpeq_zext_v4i32:
; SSE: # BB#0:
; SSE-NEXT: pcmpeqd %xmm1, %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: psrld $31, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: cmpeq_zext_v4i32:
; AVX1: # BB#0:
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: cmpeq_zext_v4i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
; AVX-LABEL: cmpeq_zext_v4i32:
; AVX: # BB#0:
; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX-NEXT: retq
;
%cmp = icmp eq <4 x i32> %a, %b
%zext = zext <4 x i1> %cmp to <4 x i32>
@@ -363,10 +355,9 @@ define <4 x i64> @cmpeq_zext_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE42-LABEL: cmpeq_zext_v4i64:
; SSE42: # BB#0:
; SSE42-NEXT: pcmpeqq %xmm2, %xmm0
; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [1,1]
; SSE42-NEXT: pand %xmm2, %xmm0
; SSE42-NEXT: psrlq $63, %xmm0
; SSE42-NEXT: pcmpeqq %xmm3, %xmm1
; SSE42-NEXT: pand %xmm2, %xmm1
; SSE42-NEXT: psrlq $63, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: cmpeq_zext_v4i64:
@@ -382,8 +373,7 @@ define <4 x i64> @cmpeq_zext_v4i64(<4 x i64> %a, <4 x i64> %b) {
; AVX2-LABEL: cmpeq_zext_v4i64:
; AVX2: # BB#0:
; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrlq $63, %ymm0, %ymm0
; AVX2-NEXT: retq
;
%cmp = icmp eq <4 x i64> %a, %b
@@ -426,13 +416,13 @@ define <8 x i16> @cmpgt_zext_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: cmpgt_zext_v8i16:
; SSE: # BB#0:
; SSE-NEXT: pcmpgtw %xmm1, %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: psrlw $15, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: cmpgt_zext_v8i16:
; AVX: # BB#0:
; AVX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX-NEXT: retq
;
%cmp = icmp sgt <8 x i16> %a, %b
@@ -444,10 +434,9 @@ define <8 x i32> @cmpgt_zext_v8i32(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: cmpgt_zext_v8i32:
; SSE: # BB#0:
; SSE-NEXT: pcmpgtd %xmm2, %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: psrld $31, %xmm0
; SSE-NEXT: pcmpgtd %xmm3, %xmm1
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: psrld $31, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: cmpgt_zext_v8i32:
@@ -463,8 +452,7 @@ define <8 x i32> @cmpgt_zext_v8i32(<8 x i32> %a, <8 x i32> %b) {
; AVX2-LABEL: cmpgt_zext_v8i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $31, %ymm0, %ymm0
; AVX2-NEXT: retq
;
%cmp = icmp sgt <8 x i32> %a, %b
@@ -492,13 +480,13 @@ define <2 x i64> @cmpgt_zext_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE42-LABEL: cmpgt_zext_v2i64:
; SSE42: # BB#0:
; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; SSE42-NEXT: pand {{.*}}(%rip), %xmm0
; SSE42-NEXT: psrlq $63, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: cmpgt_zext_v2i64:
; AVX: # BB#0:
; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpsrlq $63, %xmm0, %xmm0
; AVX-NEXT: retq
;
%cmp = icmp sgt <2 x i64> %a, %b