From 41f7baf181ef55fb6935ded8ced3797701a681ca Mon Sep 17 00:00:00 2001
From: Elena Demikhovsky
Date: Sun, 25 Aug 2013 12:54:30 +0000
Subject: [PATCH] AVX-512: added UNPACK instructions and tests for all-zero/all-ones vectors

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@189189 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp      | 129 ++++++++++++++----------
 lib/Target/X86/X86InstrAVX512.td        |  62 ++++++++++++
 lib/Target/X86/X86InstrInfo.cpp         |   6 +-
 test/CodeGen/X86/avx512-build-vector.ll |  18 ++++
 test/CodeGen/X86/avx512-shuffle.ll      |  55 ++++++++--
 5 files changed, 207 insertions(+), 63 deletions(-)
 create mode 100644 test/CodeGen/X86/avx512-build-vector.ll

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 6e9ecef74d1..a00f848d2af 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -3866,37 +3866,46 @@ SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
 static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
                          bool HasInt256, bool V2IsSplat = false) {
-  if (VT.is512BitVector())
-    return false;
-  assert((VT.is128BitVector() || VT.is256BitVector()) &&
-         "Unsupported vector type for unpckh");
+  assert(VT.getSizeInBits() >= 128 &&
+         "Unsupported vector type for unpckl");
 
+  // AVX defines UNPCK* to operate independently on 128-bit lanes.
+  unsigned NumLanes;
+  unsigned NumOf256BitLanes;
   unsigned NumElts = VT.getVectorNumElements();
-  if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
-      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
+  if (VT.is256BitVector()) {
+    if (NumElts != 4 && NumElts != 8 &&
+        (!HasInt256 || (NumElts != 16 && NumElts != 32)))
       return false;
+    NumLanes = 2;
+    NumOf256BitLanes = 1;
+  } else if (VT.is512BitVector()) {
+    assert(VT.getScalarType().getSizeInBits() >= 32 &&
+           "Unsupported vector type for unpckh");
+    NumLanes = 2;
+    NumOf256BitLanes = 2;
+  } else {
+    NumLanes = 1;
+    NumOf256BitLanes = 1;
+  }
 
-  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
-  // independently on 128-bit lanes.
-  unsigned NumLanes = VT.getSizeInBits()/128;
-  unsigned NumLaneElts = NumElts/NumLanes;
+  unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
+  unsigned NumLaneElts = NumEltsInStride/NumLanes;
 
-  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
-    for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
-      int BitI  = Mask[l+i];
-      int BitI1 = Mask[l+i+1];
-      if (!isUndefOrEqual(BitI, j))
-        return false;
-      if (V2IsSplat) {
-        if (!isUndefOrEqual(BitI1, NumElts))
+  for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
+    for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
+      for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
+        int BitI  = Mask[l256*NumEltsInStride+l+i];
+        int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
+        if (!isUndefOrEqual(BitI, j+l256*NumElts))
           return false;
-      } else {
-        if (!isUndefOrEqual(BitI1, j + NumElts))
+        if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
+          return false;
+        if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
           return false;
       }
     }
   }
-
   return true;
 }
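For reference, the index pattern the new 512-bit path accepts is easy to enumerate: each odd mask slot must point NumEltsInStride (not NumElts) past its even partner, offset per 256-bit half. A minimal standalone C++ sketch, not part of the patch, that prints the v16i32 UNPCKL pattern; the output is exactly the mask test19 in avx512-shuffle.ll feeds to shufflevector:

    // Enumerate the mask isUNPCKLMask now accepts for a 512-bit type, using
    // the same loop structure. For v16i32: NumElts=16, NumOf256BitLanes=2.
    #include <cstdio>

    int main() {
      const unsigned NumElts = 16;          // v16i32
      const unsigned NumOf256BitLanes = 2;  // two 256-bit halves
      const unsigned NumLanes = 2;          // two 128-bit lanes per half
      const unsigned NumEltsInStride = NumElts / NumOf256BitLanes; // 8
      const unsigned NumLaneElts = NumEltsInStride / NumLanes;     // 4

      // Mirror the checks above: slot 2k must hold j+l256*NumElts and
      // slot 2k+1 must hold j+l256*NumElts+NumEltsInStride.
      for (unsigned l256 = 0; l256 < NumOf256BitLanes; ++l256)
        for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts)
          for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j)
            printf("%u %u ", j + l256*NumElts,
                             j + l256*NumElts + NumEltsInStride);
      printf("\n"); // prints: 0 8 1 9 4 12 5 13 16 24 17 25 20 28 21 29
      return 0;
    }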
@@ -3904,33 +3913,42 @@ static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
 /// specifies a shuffle of elements that is suitable for input to UNPCKH.
 static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
                          bool HasInt256, bool V2IsSplat = false) {
-  unsigned NumElts = VT.getVectorNumElements();
-
-  if (VT.is512BitVector())
-    return false;
-  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+  assert(VT.getSizeInBits() >= 128 &&
          "Unsupported vector type for unpckh");
 
-  if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
-      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
+  // AVX defines UNPCK* to operate independently on 128-bit lanes.
+  unsigned NumLanes;
+  unsigned NumOf256BitLanes;
+  unsigned NumElts = VT.getVectorNumElements();
+  if (VT.is256BitVector()) {
+    if (NumElts != 4 && NumElts != 8 &&
+        (!HasInt256 || (NumElts != 16 && NumElts != 32)))
       return false;
+    NumLanes = 2;
+    NumOf256BitLanes = 1;
+  } else if (VT.is512BitVector()) {
+    assert(VT.getScalarType().getSizeInBits() >= 32 &&
+           "Unsupported vector type for unpckh");
+    NumLanes = 2;
+    NumOf256BitLanes = 2;
+  } else {
+    NumLanes = 1;
+    NumOf256BitLanes = 1;
+  }
 
-  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
-  // independently on 128-bit lanes.
-  unsigned NumLanes = VT.getSizeInBits()/128;
-  unsigned NumLaneElts = NumElts/NumLanes;
+  unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
+  unsigned NumLaneElts = NumEltsInStride/NumLanes;
 
-  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
-    for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
-      int BitI  = Mask[l+i];
-      int BitI1 = Mask[l+i+1];
-      if (!isUndefOrEqual(BitI, j))
-        return false;
-      if (V2IsSplat) {
-        if (isUndefOrEqual(BitI1, NumElts))
+  for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
+    for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
+      for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
+        int BitI  = Mask[l256*NumEltsInStride+l+i];
+        int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
+        if (!isUndefOrEqual(BitI, j+l256*NumElts))
          return false;
-      } else {
-        if (!isUndefOrEqual(BitI1, j+NumElts))
+        if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
+          return false;
+        if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
           return false;
       }
     }
   }
@@ -4336,7 +4354,7 @@ bool X86::isVEXTRACT256Index(SDNode *N) {
 static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
   MVT VT = N->getSimpleValueType(0);
 
-  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+  assert((VT.getSizeInBits() >= 128) &&
          "Unsupported vector type for PSHUF/SHUFP");
 
   // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
@@ -4345,10 +4363,10 @@ static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
   unsigned NumLanes = VT.getSizeInBits()/128;
   unsigned NumLaneElts = NumElts/NumLanes;
 
-  assert((NumLaneElts == 2 || NumLaneElts == 4) &&
-         "Only supports 2 or 4 elements per lane");
+  assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
+         "Only supports 2, 4 or 8 elements per lane");
 
-  unsigned Shift = (NumLaneElts == 4) ? 1 : 0;
+  unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
 
   unsigned Mask = 0;
   for (unsigned i = 0; i != NumElts; ++i) {
     int Elt = N->getMaskElt(i);
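The relaxed assert above feeds the same immediate-packing loop as before: each in-lane index is ORed into the 8-bit PSHUF/SHUFP immediate, two bits wide once a lane has four or more elements, one bit otherwise. A standalone C++ sketch of that packing, assuming the rest of the loop body is unchanged from the pre-patch function (the helper name shufImmediate and its driver are illustrative only):

    #include <cassert>
    #include <cstdio>
    #include <vector>

    unsigned shufImmediate(const std::vector<int> &Mask, unsigned NumLaneElts) {
      assert(NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8);
      unsigned Shift = (NumLaneElts >= 4) ? 1 : 0; // 2-bit vs. 1-bit fields
      unsigned Imm = 0;
      for (unsigned i = 0; i != Mask.size(); ++i) {
        if (Mask[i] < 0)
          continue;                                 // undef contributes nothing
        unsigned Elt = Mask[i] & (NumLaneElts - 1); // index within its lane
        Imm |= Elt << ((i << Shift) % 8);           // immediate repeats per lane
      }
      return Imm;
    }

    int main() {
      // pshufd $0x1b reverses a 4 x i32 vector: mask <3,2,1,0> -> 0b00011011.
      printf("0x%02X\n", shufImmediate({3, 2, 1, 0}, 4)); // prints 0x1B
      return 0;
    }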
@@ -4680,6 +4698,11 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops,
                         array_lengthof(Ops));
     }
+  } else if (VT.is512BitVector()) { // AVX-512
+    SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+    SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
+                      Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops, 16);
   } else
     llvm_unreachable("Unexpected vector type");
 
@@ -5674,10 +5697,13 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
                        DAG.getIntPtrConstant(0));
   }
 
-  if (!isSplatVector(Op.getNode()))
-    llvm_unreachable("Unsupported predicate operation");
-
+  // Splat vector (with undefs)
   SDValue In = Op.getOperand(0);
+  for (unsigned i = 1, e = Op.getNumOperands(); i != e; ++i) {
+    if (Op.getOperand(i) != In && Op.getOperand(i).getOpcode() != ISD::UNDEF)
+      llvm_unreachable("Unsupported predicate operation");
+  }
+
   SDValue EFLAGS, X86CC;
   if (In.getOpcode() == ISD::SETCC) {
     SDValue Op0 = In.getOperand(0);
@@ -5759,7 +5785,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
     if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
       return Op;
 
-    return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
+    if (!VT.is512BitVector())
+      return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
   }
 
   SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
@@ -5841,7 +5868,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
 
     if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
         (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
-      if (VT.is256BitVector()) {
+      if (VT.is256BitVector() || VT.is512BitVector()) {
         SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
         return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec, Item,
                            DAG.getIntPtrConstant(0));
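The loop that replaces the isSplatVector call implements "splat, allowing undef operands": every operand must either equal operand 0 or be undef. A standalone C++ sketch of the same check, with undef modeled as -1 (the helper name is hypothetical):

    #include <cassert>
    #include <vector>

    // -1 stands in for an undef operand.
    bool isSplatAllowingUndef(const std::vector<int> &Ops) {
      for (size_t i = 1, e = Ops.size(); i != e; ++i)
        if (Ops[i] != Ops[0] && Ops[i] != -1)
          return false; // the patched lowering hits llvm_unreachable here
      return true;
    }

    int main() {
      assert(isSplatAllowingUndef({7, 7, -1, 7}));  // splat with undef: accepted
      assert(!isSplatAllowingUndef({7, 3, 7, 7}));  // not a splat: rejected
      return 0;
    }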
$dst|$dst, $src1, $src2}", + SSEPackedDouble>, OpSize, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + +multiclass avx512_unpack_int opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop> { + def rr : AVX512BI, EVEX_4V; + def rm : AVX512BI, EVEX_4V; +} +defm VPUNPCKLDQZ : avx512_unpack_int<0x62, "vpunpckldq", X86Unpckl, v16i32, + VR512, memopv16i32, i512mem>, EVEX_V512, + EVEX_CD8<32, CD8VF>; +defm VPUNPCKLQDQZ : avx512_unpack_int<0x6C, "vpunpcklqdq", X86Unpckl, v8i64, + VR512, memopv8i64, i512mem>, EVEX_V512, + VEX_W, EVEX_CD8<64, CD8VF>; +defm VPUNPCKHDQZ : avx512_unpack_int<0x6A, "vpunpckhdq", X86Unpckh, v16i32, + VR512, memopv16i32, i512mem>, EVEX_V512, + EVEX_CD8<32, CD8VF>; +defm VPUNPCKHQDQZ : avx512_unpack_int<0x6D, "vpunpckhqdq", X86Unpckh, v8i64, + VR512, memopv8i64, i512mem>, EVEX_V512, + VEX_W, EVEX_CD8<64, CD8VF>; + //===----------------------------------------------------------------------===// // AVX-512 Logical Instructions //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 71df2bb6c8f..c4c090b5c95 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -2939,7 +2939,6 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, if (X86::FR32XRegClass.contains(DestReg) && X86::GR32RegClass.contains(SrcReg)) // Copy from a GR32 register to a FR32 register. return HasAVX512 ? X86::VMOVDI2SSZrr : (HasAVX ? X86::VMOVDI2SSrr : X86::MOVDI2SSrr); - return 0; } @@ -3781,6 +3780,8 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { case X86::AVX_SET0: assert(HasAVX && "AVX not supported"); return Expand2AddrUndef(MIB, get(X86::VXORPSYrr)); + case X86::AVX512_512_SET0: + return Expand2AddrUndef(MIB, get(X86::VPXORDZrr)); case X86::V_SETALLONES: return Expand2AddrUndef(MIB, get(HasAVX ? 
diff --git a/test/CodeGen/X86/avx512-build-vector.ll b/test/CodeGen/X86/avx512-build-vector.ll
new file mode 100644
index 00000000000..bc4560b3f3f
--- /dev/null
+++ b/test/CodeGen/X86/avx512-build-vector.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+; CHECK-LABEL: test1
+; CHECK: vpxord
+; CHECK: ret
+define <16 x i32> @test1(i32* %x) {
+   %y = load i32* %x, align 4
+   %res = insertelement <16 x i32>zeroinitializer, i32 %y, i32 4
+   ret <16 x i32>%res
+}
+
+; CHECK-LABEL: test2
+; CHECK: vpaddd LCP{{.*}}(%rip){1to16}
+; CHECK: ret
+define <16 x i32> @test2(<16 x i32> %x) {
+   %res = add <16 x i32><i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, %x
+   ret <16 x i32>%res
+}
\ No newline at end of file
diff --git a/test/CodeGen/X86/avx512-shuffle.ll b/test/CodeGen/X86/avx512-shuffle.ll
index 8f8bf42ad36..df9106eef3f 100644
--- a/test/CodeGen/X86/avx512-shuffle.ll
+++ b/test/CodeGen/X86/avx512-shuffle.ll
@@ -16,7 +16,7 @@
 ; CHECK: .long 0
 ; CHECK: .long 10
 ; CHECK: .long 1
-; CHECK: test1:
+; CHECK-LABEL: test1:
 ; CHECK: vpermps
 ; CHECK: ret
 define <16 x float> @test1(<16 x float> %a) nounwind {
@@ -24,7 +24,7 @@ define <16 x float> @test1(<16 x float> %a) nounwind {
   ret <16 x float> %c
 }
 
-; CHECK: test2:
+; CHECK-LABEL: test2:
 ; CHECK: vpermd
 ; CHECK: ret
 define <16 x i32> @test2(<16 x i32> %a) nounwind {
@@ -32,7 +32,7 @@ define <16 x i32> @test2(<16 x i32> %a) nounwind {
   ret <16 x i32> %c
 }
 
-; CHECK: test3:
+; CHECK-LABEL: test3:
 ; CHECK: vpermq
 ; CHECK: ret
 define <8 x i64> @test3(<8 x i64> %a) nounwind {
@@ -40,7 +40,7 @@ define <8 x i64> @test3(<8 x i64> %a) nounwind {
   ret <8 x i64> %c
 }
 
-; CHECK: test4:
+; CHECK-LABEL: test4:
 ; CHECK: vpermpd
 ; CHECK: ret
 define <8 x double> @test4(<8 x double> %a) nounwind {
@@ -48,7 +48,7 @@ define <8 x double> @test4(<8 x double> %a) nounwind {
   ret <8 x double> %c
 }
 
-; CHECK: test5:
+; CHECK-LABEL: test5:
 ; CHECK: vpermi2pd
 ; CHECK: ret
 define <8 x double> @test5(<8 x double> %a, <8 x double> %b) nounwind {
@@ -56,7 +56,7 @@ define <8 x double> @test5(<8 x double> %a, <8 x double> %b) nounwind {
   ret <8 x double> %c
 }
 
-; CHECK: test6:
+; CHECK-LABEL: test6:
 ; CHECK: vpermq $30
 ; CHECK: ret
 define <8 x i64> @test6(<8 x i64> %a) nounwind {
@@ -64,7 +64,7 @@ define <8 x i64> @test6(<8 x i64> %a) nounwind {
   ret <8 x i64> %c
 }
 
-; CHECK: test7:
+; CHECK-LABEL: test7:
 ; CHECK: vpermi2q
 ; CHECK: ret
 define <8 x i64> @test7(<8 x i64> %a, <8 x i64> %b) nounwind {
@@ -72,7 +72,7 @@ define <8 x i64> @test7(<8 x i64> %a, <8 x i64> %b) nounwind {
   ret <8 x i64> %c
 }
 
-; CHECK: test8:
+; CHECK-LABEL: test8:
 ; CHECK: vpermi2d
 ; CHECK: ret
 define <16 x i32> @test8(<16 x i32> %a, <16 x i32> %b) nounwind {
@@ -80,7 +80,7 @@ define <16 x i32> @test8(<16 x i32> %a, <16 x i32> %b) nounwind {
   ret <16 x i32> %c
 }
 
-; CHECK: test9:
+; CHECK-LABEL: test9:
 ; CHECK: vpermi2ps
 ; CHECK: ret
 define <16 x float> @test9(<16 x float> %a, <16 x float> %b) nounwind {
@@ -88,7 +88,7 @@ define <16 x float> @test9(<16 x float> %a, <16 x float> %b) nounwind {
   ret <16 x float> %c
 }
 
-; CHECK: test10:
+; CHECK-LABEL: test10:
 ; CHECK: vpermi2ps (
 ; CHECK: ret
 define <16 x float> @test10(<16 x float> %a, <16 x float>* %b) nounwind {
@@ -97,7 +97,7 @@ define <16 x float> @test10(<16 x float> %a, <16 x float>* %b) nounwind {
   ret <16 x float> %d
 }
 
-; CHECK: test11:
+; CHECK-LABEL: test11:
 ; CHECK: vpermi2d (
 ; CHECK: ret
 define <16 x i32> @test11(<16 x i32> %a, <16 x i32>* %b) nounwind {
@@ -105,3 +105,36 @@
 %d = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32>
 ret <16 x i32> %d
 }
+
+; CHECK-LABEL: test18
+; CHECK: vpunpckhdq %zmm
+; CHECK: ret
+define <16 x i32> @test18(<16 x i32> %a, <16 x i32> %c) {
+ %b = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32><i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15, i32 18, i32 26, i32 19, i32 27, i32 22, i32 30, i32 23, i32 31>
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: test19
+; CHECK: vpunpckldq %zmm
+; CHECK: ret
+define <16 x i32> @test19(<16 x i32> %a, <16 x i32> %c) {
+ %b = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32><i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13, i32 16, i32 24, i32 17, i32 25, i32 20, i32 28, i32 21, i32 29>
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: test20
+; CHECK: vpunpckhqdq %zmm
+; CHECK: ret
+define <8 x i64> @test20(<8 x i64> %a, <8 x i64> %c) {
+ %b = shufflevector <8 x i64> %a, <8 x i64> %c, <8 x i32><i32 1, i32 5, i32 3, i32 7, i32 9, i32 13, i32 11, i32 15>
+ ret <8 x i64> %b
+}
+
+; CHECK-LABEL: test21
+; CHECK: vunpcklps %zmm
+; CHECK: ret
+define <16 x float> @test21(<16 x float> %a, <16 x float> %c) {
+ %b = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32><i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13, i32 16, i32 24, i32 17, i32 25, i32 20, i32 28, i32 21, i32 29>
+ ret <16 x float> %b
+}
+
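The masks in tests 18 through 21 follow directly from the patched checks. A standalone C++ sketch, not part of the patch, that derives the v8i64 UNPCKH pattern expected by test20:

    // For v8i64: NumElts=8, NumOf256BitLanes=2, NumLanes=2, and UNPCKH
    // starts j at l+NumLaneElts/2 instead of l.
    #include <cstdio>

    int main() {
      const unsigned NumElts = 8, NumOf256BitLanes = 2, NumLanes = 2;
      const unsigned NumEltsInStride = NumElts / NumOf256BitLanes; // 4
      const unsigned NumLaneElts = NumEltsInStride / NumLanes;     // 2
      for (unsigned l256 = 0; l256 < NumOf256BitLanes; ++l256)
        for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts)
          for (unsigned i = 0, j = l + NumLaneElts/2; i != NumLaneElts;
               i += 2, ++j)
            printf("%u %u ", j + l256*NumElts,
                             j + l256*NumElts + NumEltsInStride);
      printf("\n"); // prints: 1 5 3 7 9 13 11 15
      return 0;
    }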