diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 1de40c12562..53d56b950aa 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -12733,7 +12733,9 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   // efficient instructions that mirror the shuffles across the four 128-bit
   // lanes.
   SmallVector<int, 4> RepeatedMask;
-  if (is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask)) {
+  bool Is128BitLaneRepeatedShuffle =
+      is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
+  if (Is128BitLaneRepeatedShuffle) {
     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
     if (V2.isUndef())
       return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
@@ -12761,6 +12763,16 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
             DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
       return Rotate;
 
+  // Assume that a single SHUFPS is faster than using a permv shuffle.
+  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
+  if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
+    SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
+    SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
+    SDValue ShufPS =
+        DAG.getVectorShuffle(MVT::v16f32, DL, CastV1, CastV2, Mask);
+    return DAG.getBitcast(MVT::v16i32, ShufPS);
+  }
+
   return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
 }
 
diff --git a/test/CodeGen/X86/vector-shuffle-512-v16.ll b/test/CodeGen/X86/vector-shuffle-512-v16.ll
index 297fbc825c7..7f7c27af47b 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -360,8 +360,7 @@ define <16 x i32> @shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_0
 define <16 x i32> @shuffle_v16i32_00_03_16_19_04_07_20_23_08_11_24_27_12_15_28_31(<16 x i32> %a, <16 x i32> %b) {
 ; ALL-LABEL: shuffle_v16i32_00_03_16_19_04_07_20_23_08_11_24_27_12_15_28_31:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,3,16,19,4,7,20,23,8,11,24,27,12,15,28,31]
-; ALL-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
+; ALL-NEXT:    vshufps {{.*#+}} zmm0 = zmm0[0,3],zmm1[0,3],zmm0[4,7],zmm1[4,7],zmm0[8,11],zmm1[8,11],zmm0[12,15],zmm1[12,15]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 3, i32 16, i32 19, i32 4, i32 7, i32 20, i32 23, i32 8, i32 11, i32 24, i32 27, i32 12, i32 15, i32 28, i32 31>
   ret <16 x i32> %shuffle
@@ -370,9 +369,7 @@ define <16 x i32> @shuffle_v16i32_00_03_16_19_04_07_20_23_08_11_24_27_12_15_28_3
 define <16 x i32> @shuffle_v16i32_16_16_02_03_20_20_06_07_24_24_10_11_28_28_uu_uu(<16 x i32> %a, <16 x i32> %b) {
 ; ALL-LABEL: shuffle_v16i32_16_16_02_03_20_20_06_07_24_24_10_11_28_28_uu_uu:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovdqa32 {{.*#+}} zmm2 = <0,0,18,19,4,4,22,23,8,8,26,27,12,12,u,u>
-; ALL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; ALL-NEXT:    vmovdqa64 %zmm2, %zmm0
+; ALL-NEXT:    vshufps {{.*#+}} zmm0 = zmm1[0,0],zmm0[2,3],zmm1[4,4],zmm0[6,7],zmm1[8,8],zmm0[10,11],zmm1[12,12],zmm0[14,15]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 16, i32 16, i32 2, i32 3, i32 20, i32 20, i32 6, i32 7, i32 24, i32 24, i32 10, i32 11, i32 28, i32 28, i32 undef, i32 undef>
   ret <16 x i32> %shuffle
@@ -381,9 +378,7 @@ define <16 x i32> @shuffle_v16i32_16_16_02_03_20_20_06_07_24_24_10_11_28_28_uu_u
 define <16 x i32> @shuffle_v8i32_17_16_01_00_21_20_05_04_25_24_09_08_29_28_13_12(<16 x i32> %a, <16 x i32> %b) {
 ; ALL-LABEL: shuffle_v8i32_17_16_01_00_21_20_05_04_25_24_09_08_29_28_13_12:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [1,0,17,16,5,4,21,20,9,8,25,24,13,12,29,28]
-; ALL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; ALL-NEXT:    vmovdqa64 %zmm2, %zmm0
+; ALL-NEXT:    vshufps {{.*#+}} zmm0 = zmm1[1,0],zmm0[1,0],zmm1[5,4],zmm0[5,4],zmm1[9,8],zmm0[9,8],zmm1[13,12],zmm0[13,12]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 17, i32 16, i32 1, i32 0, i32 21, i32 20, i32 5, i32 4, i32 25, i32 24, i32 9, i32 8, i32 29, i32 28, i32 13, i32 12>
   ret <16 x i32> %shuffle