From a04f9019ef9b5003accd57348ceb9fbe7af9f3a2 Mon Sep 17 00:00:00 2001
From: Nemanja Ivanovic
Date: Fri, 23 Sep 2016 13:25:31 +0000
Subject: [PATCH] [Power9] Exploit move and splat instructions for build_vector improvement

This patch corresponds to review:
https://reviews.llvm.org/D21135

This patch exploits the following instructions:
mtvsrws
lxvwsx
mtvsrdd
mfvsrld

In order to improve some build_vector and extractelement patterns.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@282246 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../PowerPC/InstPrinter/PPCInstPrinter.cpp |   6 +-
 lib/Target/PowerPC/PPCISelLowering.cpp     |  40 ++++-
 lib/Target/PowerPC/PPCInstrFormats.td      |   7 +
 lib/Target/PowerPC/PPCInstrInfo.td         |   1 +
 lib/Target/PowerPC/PPCInstrVSX.td          |  47 ++++-
 .../PowerPC/power9-moves-and-splats.ll     | 167 ++++++++++++++++++
 test/CodeGen/PowerPC/ppc64-i128-abi.ll     |  18 +-
 7 files changed, 273 insertions(+), 13 deletions(-)
 create mode 100644 test/CodeGen/PowerPC/power9-moves-and-splats.ll

diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
index 8190f31004e..9513fd3cdaf 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
@@ -328,10 +328,12 @@ void PPCInstPrinter::printU7ImmOperand(const MCInst *MI, unsigned OpNo,
   O << (unsigned int)Value;
 }
 
+// Operands of BUILD_VECTOR are signed and we use this to print operands
+// of XXSPLTIB, which are unsigned. So we simply truncate to 8 bits and
+// print as unsigned.
 void PPCInstPrinter::printU8ImmOperand(const MCInst *MI, unsigned OpNo,
                                        raw_ostream &O) {
-  unsigned int Value = MI->getOperand(OpNo).getImm();
-  assert(Value <= 255 && "Invalid u8imm argument!");
+  unsigned char Value = MI->getOperand(OpNo).getImm();
   O << (unsigned int)Value;
 }
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index c414a1512e9..5bce3363363 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -672,6 +672,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
       setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
       setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
     }
+
+    if (Subtarget.isISA3_0() && Subtarget.hasDirectMove())
+      setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Legal);
   }
 
   if (Subtarget.hasQPX()) {
@@ -7079,6 +7082,16 @@ static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt, EVT VT,
   return DAG.getNode(ISD::BITCAST, dl, VT, T);
 }
 
+static bool isNonConstSplatBV(BuildVectorSDNode *BVN, EVT Type) {
+  if (BVN->getValueType(0) != Type)
+    return false;
+  auto OpZero = BVN->getOperand(0);
+  for (int i = 1, e = BVN->getNumOperands(); i < e; i++)
+    if (BVN->getOperand(i) != OpZero)
+      return false;
+  return true;
+}
+
 // If this is a case we can't handle, return null and let the default
 // expansion code take care of it. If we CAN select this case, and if it
 // selects to a single instruction, return Op. Otherwise, if we can codegen
@@ -7200,8 +7213,17 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
   bool HasAnyUndefs;
   if (! BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
                              HasAnyUndefs, 0, !Subtarget.isLittleEndian()) ||
-      SplatBitSize > 32)
+      SplatBitSize > 32) {
+    // We can splat a non-const value on CPUs that implement ISA 3.0
+    // in two ways: LXVWSX (load and splat) and MTVSRWS (move and splat).
+    auto OpZero = BVN->getOperand(0);
+    bool CanLoadAndSplat = OpZero.getOpcode() == ISD::LOAD &&
+      BVN->isOnlyUserOf(OpZero.getNode());
+    if (Subtarget.isISA3_0() &&
+        isNonConstSplatBV(BVN, MVT::v4i32) && !CanLoadAndSplat)
+      return Op;
     return SDValue();
+  }
 
   unsigned SplatBits = APSplatBits.getZExtValue();
   unsigned SplatUndef = APSplatUndef.getZExtValue();
@@ -7219,6 +7241,10 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
     return Op;
   }
 
+  // We have XXSPLTIB for constant splats that are one byte wide.
+  if (Subtarget.isISA3_0() && Op.getValueType() == MVT::v16i8)
+    return Op;
+
   // If the sign extended value is in the range [-16,15], use VSPLTI[bhw].
   int32_t SextVal= (int32_t(SplatBits << (32-SplatBitSize)) >> (32-SplatBitSize));
@@ -7462,6 +7488,18 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
   if (Subtarget.hasVSX()) {
     if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) {
       int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG);
+
+      // If the source for the shuffle is a scalar_to_vector that came from a
+      // 32-bit load, it will have used LXVWSX, so we don't need to splat again.
+      if (Subtarget.isISA3_0() &&
+          ((isLittleEndian && SplatIdx == 3) ||
+           (!isLittleEndian && SplatIdx == 0))) {
+        SDValue Src = V1.getOperand(0);
+        if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+            Src.getOperand(0).getOpcode() == ISD::LOAD &&
+            Src.getOperand(0).hasOneUse())
+          return V1;
+      }
       SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
       SDValue Splat = DAG.getNode(PPCISD::XXSPLT, dl, MVT::v4i32, Conv,
                                   DAG.getConstant(SplatIdx, dl, MVT::i32));
diff --git a/lib/Target/PowerPC/PPCInstrFormats.td b/lib/Target/PowerPC/PPCInstrFormats.td
index 163c5d0eb87..e67dfe24dc9 100644
--- a/lib/Target/PowerPC/PPCInstrFormats.td
+++ b/lib/Target/PowerPC/PPCInstrFormats.td
@@ -1059,6 +1059,13 @@ class XX3Form<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
   let Inst{31} = XT{5};
 }
 
+class XX3Form_Zero<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
+                   InstrItinClass itin, list<dag> pattern>
+  : XX3Form<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let XA = XT;
+  let XB = XT;
+}
+
 class XX3Form_1<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
                 InstrItinClass itin, list<dag> pattern>
   : I<opcode, OOL, IOL, asmstr, itin> {
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index 7939bc8482f..51e1a7e0754 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -312,6 +312,7 @@ def immZExt16 : PatLeaf<(imm), [{
   // field. Used by instructions like 'ori'.
   return (uint64_t)N->getZExtValue() == (unsigned short)N->getZExtValue();
 }], LO16>;
+def immSExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm); }]>;
 
 // imm16Shifted* - These match immediates where the low 16-bits are zero. There
 // are two forms: imm16ShiftedSExt and imm16ShiftedZExt. These two forms are
diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td
index 7d1cf8e491a..f461b1a6899 100644
--- a/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/lib/Target/PowerPC/PPCInstrVSX.td
@@ -767,6 +767,10 @@ let Uses = [RM] in {
                         "xxlxor $XT, $XA, $XB", IIC_VecGeneral,
                         [(set v4i32:$XT, (xor v4i32:$XA, v4i32:$XB))]>;
   } // isCommutable
+  let isCodeGenOnly = 1 in
+  def XXLXORz : XX3Form_Zero<60, 154, (outs vsrc:$XT), (ins),
+                             "xxlxor $XT, $XT, $XT", IIC_VecGeneral,
+                             [(set v4i32:$XT, (v4i32 immAllZerosV))]>;
 
   // Permutation Instructions
   def XXMRGHW : XX3Form<60, 18,
@@ -1315,8 +1319,7 @@ let Predicates = [HasDirectMove] in {
 
 let Predicates = [IsISA3_0, HasDirectMove] in {
   def MTVSRWS: XX1_RS6_RD5_XO<31, 403, (outs vsrc:$XT), (ins gprc:$rA),
-                              "mtvsrws $XT, $rA", IIC_VecGeneral,
-                              []>;
+                              "mtvsrws $XT, $rA", IIC_VecGeneral, []>;
 
   def MTVSRDD: XX1Form<31, 435, (outs vsrc:$XT), (ins g8rc:$rA, g8rc:$rB),
                        "mtvsrdd $XT, $rA, $rB", IIC_VecGeneral,
@@ -1880,6 +1883,10 @@ def AlignValues {
   dag I32_TO_BE_WORD1 = (COPY_TO_REGCLASS (MTVSRWZ $B), VSRC);
 }
 
+// Materialize a zero-vector of long long
+def : Pat<(v2i64 immAllZerosV),
+          (v2i64 (XXLXORz))>;
+
 // The following VSX instructions were introduced in Power ISA 3.0
 def HasP9Vector : Predicate<"PPCSubTarget->hasP9Vector()">;
 let AddedComplexity = 400, Predicates = [HasP9Vector] in {
@@ -2310,4 +2317,40 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
                     (STXVX $rS, xoaddr:$dst)>;
   def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst),
             (STXVX $rS, xoaddr:$dst)>;
+
+  def : Pat<(v4i32 (scalar_to_vector (i32 (load xoaddr:$src)))),
+            (v4i32 (LXVWSX xoaddr:$src))>;
+  def : Pat<(v4f32 (scalar_to_vector (f32 (load xoaddr:$src)))),
+            (v4f32 (LXVWSX xoaddr:$src))>;
+  def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)),
+            (v4i32 (MTVSRWS $A))>;
+  def : Pat<(v16i8 (build_vector immSExt8:$A, immSExt8:$A, immSExt8:$A,
+                                 immSExt8:$A, immSExt8:$A, immSExt8:$A,
+                                 immSExt8:$A, immSExt8:$A, immSExt8:$A,
+                                 immSExt8:$A, immSExt8:$A, immSExt8:$A,
+                                 immSExt8:$A, immSExt8:$A, immSExt8:$A,
+                                 immSExt8:$A)),
+            (v16i8 (COPY_TO_REGCLASS (XXSPLTIB imm:$A), VSRC))>;
+  def : Pat<(v16i8 immAllOnesV),
+            (v16i8 (COPY_TO_REGCLASS (XXSPLTIB 255), VSRC))>;
+  def : Pat<(v8i16 immAllOnesV),
+            (v8i16 (COPY_TO_REGCLASS (XXSPLTIB 255), VSRC))>;
+  def : Pat<(v4i32 immAllOnesV),
+            (v4i32 (XXSPLTIB 255))>;
+  def : Pat<(v2i64 immAllOnesV),
+            (v2i64 (XXSPLTIB 255))>;
 } // end HasP9Vector, AddedComplexity
+
+let Predicates = [IsISA3_0, HasDirectMove, IsLittleEndian] in {
+def : Pat<(v2i64 (build_vector i64:$rA, i64:$rB)),
+          (v2i64 (MTVSRDD $rB, $rA))>;
+def : Pat<(i64 (extractelt v2i64:$A, 0)),
+          (i64 (MFVSRLD $A))>;
+}
+
+let Predicates = [IsISA3_0, HasDirectMove, IsBigEndian] in {
+def : Pat<(v2i64 (build_vector i64:$rB, i64:$rA)),
+          (v2i64 (MTVSRDD $rB, $rA))>;
+def : Pat<(i64 (extractelt v2i64:$A, 1)),
+          (i64 (MFVSRLD $A))>;
+}
diff --git a/test/CodeGen/PowerPC/power9-moves-and-splats.ll b/test/CodeGen/PowerPC/power9-moves-and-splats.ll
new file mode 100644
index 00000000000..89e09625a14
--- /dev/null
+++ b/test/CodeGen/PowerPC/power9-moves-and-splats.ll
@@ -0,0 +1,167 @@
+; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s
+; RUN: llc -mcpu=pwr9 -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s \
+; RUN:   --check-prefix=CHECK-BE
+
+@Globi = external global i32, align 4
+@Globf = external global float, align 4
+
+define <2 x i64> @test1(i64 %a, i64 %b) {
+entry:
+; CHECK-LABEL: test1
+; CHECK: mtvsrdd 34, 4, 3
+; CHECK-BE-LABEL: test1
+; CHECK-BE: mtvsrdd 34, 3, 4
+  %vecins = insertelement <2 x i64> undef, i64 %a, i32 0
+  %vecins1 = insertelement <2 x i64> %vecins, i64 %b, i32 1
+  ret <2 x i64> %vecins1
+}
+
+define i64 @test2(<2 x i64> %a) {
+entry:
+; CHECK-LABEL: test2
+; CHECK: mfvsrld 3, 34
+  %0 = extractelement <2 x i64> %a, i32 0
+  ret i64 %0
+}
+
+define i64 @test3(<2 x i64> %a) {
+entry:
+; CHECK-BE-LABEL: test3
+; CHECK-BE: mfvsrld 3, 34
+  %0 = extractelement <2 x i64> %a, i32 1
+  ret i64 %0
+}
+
+define <4 x i32> @test4(i32* nocapture readonly %in) {
+entry:
+; CHECK-LABEL: test4
+; CHECK: lxvwsx 34, 0, 3
+; CHECK-NOT: xxspltw
+; CHECK-BE-LABEL: test4
+; CHECK-BE: lxvwsx 34, 0, 3
+; CHECK-BE-NOT: xxspltw
+  %0 = load i32, i32* %in, align 4
+  %splat.splatinsert = insertelement <4 x i32> undef, i32 %0, i32 0
+  %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  ret <4 x i32> %splat.splat
+}
+
+define <4 x float> @test5(float* nocapture readonly %in) {
+entry:
+; CHECK-LABEL: test5
+; CHECK: lxvwsx 34, 0, 3
+; CHECK-NOT: xxspltw
+; CHECK-BE-LABEL: test5
+; CHECK-BE: lxvwsx 34, 0, 3
+; CHECK-BE-NOT: xxspltw
+  %0 = load float, float* %in, align 4
+  %splat.splatinsert = insertelement <4 x float> undef, float %0, i32 0
+  %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
+  ret <4 x float> %splat.splat
+}
+
+define <4 x i32> @test6() {
+entry:
+; CHECK-LABEL: test6
+; CHECK: addis
+; CHECK: ld [[TOC:[0-9]+]], .LC0
+; CHECK: lxvwsx 34, 0, 3
+; CHECK-NOT: xxspltw
+; CHECK-BE-LABEL: test6
+; CHECK-BE: addis
+; CHECK-BE: ld [[TOC:[0-9]+]], .LC0
+; CHECK-BE: lxvwsx 34, 0, 3
+; CHECK-BE-NOT: xxspltw
+  %0 = load i32, i32* @Globi, align 4
+  %splat.splatinsert = insertelement <4 x i32> undef, i32 %0, i32 0
+  %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  ret <4 x i32> %splat.splat
+}
+
+define <4 x float> @test7() {
+entry:
+; CHECK-LABEL: test7
+; CHECK: addis
+; CHECK: ld [[TOC:[0-9]+]], .LC1
+; CHECK: lxvwsx 34, 0, 3
+; CHECK-NOT: xxspltw
+; CHECK-BE-LABEL: test7
+; CHECK-BE: addis
+; CHECK-BE: ld [[TOC:[0-9]+]], .LC1
+; CHECK-BE: lxvwsx 34, 0, 3
+; CHECK-BE-NOT: xxspltw
+  %0 = load float, float* @Globf, align 4
+  %splat.splatinsert = insertelement <4 x float> undef, float %0, i32 0
+  %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
+  ret <4 x float> %splat.splat
+}
+
+define <16 x i8> @test8() {
+entry:
+; CHECK-LABEL: test8
+; CHECK: xxlxor 34, 34, 34
+; CHECK-BE-LABEL: test8
+; CHECK-BE: xxlxor 34, 34, 34
+  ret <16 x i8> zeroinitializer
+}
+
+define <16 x i8> @test9() {
+entry:
+; CHECK-LABEL: test9
+; CHECK: xxspltib 34, 1
+; CHECK-BE-LABEL: test9
+; CHECK-BE: xxspltib 34, 1
+  ret <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+}
+
+define <16 x i8> @test10() {
+entry:
+; CHECK-LABEL: test10
+; CHECK: xxspltib 34, 127
+; CHECK-BE-LABEL: test10
+; CHECK-BE: xxspltib 34, 127
+  ret <16 x i8> <i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127>
+}
+
+define <16 x i8> @test11() {
+entry:
+; CHECK-LABEL: test11
+; CHECK: xxspltib 34, 128
+; CHECK-BE-LABEL: test11
+; CHECK-BE: xxspltib 34, 128
+  ret <16 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>
+}
+
+define <16 x i8> @test12() {
+entry:
+; CHECK-LABEL: test12
+; CHECK: xxspltib 34, 255
+; CHECK-BE-LABEL: test12
+; CHECK-BE: xxspltib 34, 255
+  ret <16 x i8> <i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>
+}
+
+define <16 x i8> @test13() {
+entry:
+; CHECK-LABEL: test13
+; CHECK: xxspltib 34, 129
+; CHECK-BE-LABEL: test13
+; CHECK-BE: xxspltib 34, 129
+  ret <16 x i8> <i8 129, i8 129, i8 129, i8 129, i8 129, i8 129, i8 129, i8 129, i8 129, i8 129, i8 129, i8 129, i8 129, i8 129, i8 129, i8 129>
+}
+
+define <4 x i32> @test14(<4 x i32>
%a, i32* nocapture readonly %b) { +entry: +; CHECK-LABEL: test14 +; CHECK: lwz [[LD:[0-9]+]], +; CHECK: mtvsrws 34, [[LD]] +; CHECK-BE-LABEL: test14 +; CHECK-BE: lwz [[LD:[0-9]+]], +; CHECK-BE: mtvsrws 34, [[LD]] + %0 = load i32, i32* %b, align 4 + %splat.splatinsert = insertelement <4 x i32> undef, i32 %0, i32 0 + %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %1 = add i32 %0, 5 + store i32 %1, i32* %b, align 4 + ret <4 x i32> %splat.splat +} diff --git a/test/CodeGen/PowerPC/ppc64-i128-abi.ll b/test/CodeGen/PowerPC/ppc64-i128-abi.ll index 8d5a8cdf3a3..d81a1104c53 100644 --- a/test/CodeGen/PowerPC/ppc64-i128-abi.ll +++ b/test/CodeGen/PowerPC/ppc64-i128-abi.ll @@ -17,16 +17,16 @@ ; RUN: -mcpu=pwr8 -mattr=-vsx < %s | FileCheck %s -check-prefix=CHECK-LE-NOVSX ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ -; RUN: -mcpu=pwr9 < %s | FileCheck %s -check-prefix=CHECK-P9 \ -; RUN: --implicit-check-not xxswapd +; RUN: -mcpu=pwr9 -ppc-vsr-nums-as-vr < %s | FileCheck %s \ +; RUN: -check-prefix=CHECK-P9 --implicit-check-not xxswapd ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ ; RUN: -mcpu=pwr9 -mattr=-vsx < %s | FileCheck %s -check-prefix=CHECK-NOVSX \ ; RUN: --implicit-check-not xxswapd ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ -; RUN: -mcpu=pwr9 -mattr=-power9-vector < %s | FileCheck %s \ -; RUN: -check-prefix=CHECK-LE +; RUN: -mcpu=pwr9 -mattr=-power9-vector -mattr=-direct-move < %s | \ +; RUN: FileCheck %s -check-prefix=CHECK-LE @x = common global <1 x i128> zeroinitializer, align 16 @y = common global <1 x i128> zeroinitializer, align 16 @@ -55,8 +55,10 @@ define <1 x i128> @v1i128_increment_by_one(<1 x i128> %a) nounwind { ; CHECK-LE: blr ; CHECK-P9-LABEL: @v1i128_increment_by_one -; CHECK-P9: lxvx -; CHECK-P9: vadduqm 2, 2, 3 +; CHECK-P9-DAG: li [[R1:r[0-9]+]], 1 +; CHECK-P9-DAG: li [[R2:r[0-9]+]], 0 +; CHECK-P9: mtvsrdd [[V1:v[0-9]+]], [[R2]], [[R1]] +; CHECK-P9: vadduqm v2, v2, [[V1]] ; CHECK-P9: blr ; CHECK-BE-LABEL: @v1i128_increment_by_one @@ -232,8 +234,8 @@ define <1 x i128> @call_v1i128_increment_by_val() nounwind { ; CHECK-LE: blr ; CHECK-P9-LABEL: @call_v1i128_increment_by_val -; CHECK-P9-DAG: lxvx 34 -; CHECK-P9-DAG: lxvx 35 +; CHECK-P9-DAG: lxvx v2 +; CHECK-P9-DAG: lxvx v3 ; CHECK-P9: bl v1i128_increment_by_val ; CHECK-P9: blr