From c6fddca2944bd562c12f90575afc847d5e4783d6 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 19 Nov 2016 20:41:48 +0000 Subject: [PATCH] [X86][SSE] Improve PSHUFB lowering from either input Canonicalization may leave the zeroable vector in the first input. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@287461 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 15 ++++++++++----- test/CodeGen/X86/vector-shuffle-512-v64.ll | 11 ++--------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index cef9f1e6dad..cd6d9a402e0 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -7582,8 +7582,7 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef Mask, return Zeroable; } -/// Try to lower a shuffle with a single PSHUFB of V1. -/// This is only possible if V2 is unused (at all, or only for zero elements). +/// Try to lower a shuffle with a single PSHUFB of V1 or V2. static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, @@ -7603,6 +7602,7 @@ static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT, // Sign bit set in i8 mask means zero element. SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8); + SDValue V; for (int i = 0; i < NumBytes; ++i) { int M = Mask[i / NumEltBytes]; if (M < 0) { @@ -7613,9 +7613,13 @@ static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT, PSHUFBMask[i] = ZeroMask; continue; } - // Only allow V1. - if (M >= Size) + + // We can only use a single input of V1 or V2. + SDValue SrcV = (M >= Size ? V2 : V1); + if (V && V != SrcV) return SDValue(); + V = SrcV; + M %= Size; // PSHUFB can't cross lanes, ensure this doesn't happen. if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize)) @@ -7625,10 +7629,11 @@ static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT, M = M * NumEltBytes + (i % NumEltBytes); PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8); } + assert(V && "Failed to find a source input"); MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes); return DAG.getBitcast( - VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V1), + VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V), DAG.getBuildVector(I8VT, DL, PSHUFBMask))); } diff --git a/test/CodeGen/X86/vector-shuffle-512-v64.ll b/test/CodeGen/X86/vector-shuffle-512-v64.ll index f3f0a1438a4..5b6c2be1b34 100644 --- a/test/CodeGen/X86/vector-shuffle-512-v64.ll +++ b/test/CodeGen/X86/vector-shuffle-512-v64.ll @@ -120,11 +120,7 @@ define <64 x i8> @shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<64 x i8> %a) { ; ; AVX512BW-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: movl $255, %eax -; AVX512BW-NEXT: vmovd %eax, %xmm1 -; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: @@ -137,10 +133,7 @@ define <64 x i8> @shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<64 x i8> %a) { ; ; AVX512VBMI-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: ; AVX512VBMI: # BB#0: -; AVX512VBMI-NEXT: vmovdqu8 {{.*#+}} zmm2 = [64,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63] -; AVX512VBMI-NEXT: vpxord %zmm1, %zmm1, %zmm1 -; AVX512VBMI-NEXT: vpermt2b %zmm0, %zmm2, %zmm1 -; AVX512VBMI-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512VBMI-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512VBMI-NEXT: retq %shuffle = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> ret <64 x i8> %shuffle