Scalar to vector conversions using direct moves

This patch corresponds to review:
http://reviews.llvm.org/D11471

It improves the code generated for converting a scalar to a vector value. With
direct moves from GPRs to VSRs, we no longer require expensive stack operations
for this. Subsequent patches will handle the reverse case and more general
operations between vectors and their scalar elements.

llvm-svn: 244921
This commit is contained in:
Nemanja Ivanovic 2015-08-13 17:40:44 +00:00
parent 15b7b9c050
commit 285f278c18
9 changed files with 205 additions and 25 deletions

View File

@ -542,6 +542,14 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
if (Subtarget.hasVSX()) {
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
if (Subtarget.hasP8Vector())
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
if (Subtarget.hasDirectMove()) {
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Legal);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Legal);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Legal);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i64, Legal);
}
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);
setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
@ -11490,7 +11498,7 @@ bool
PPCTargetLowering::shouldExpandBuildVectorWithShuffles(
EVT VT , unsigned DefinedValues) const {
if (VT == MVT::v2i64)
return false;
return Subtarget.hasDirectMove(); // Don't need stack ops with direct moves
if (Subtarget.hasQPX()) {
if (VT == MVT::v4f32 || VT == MVT::v4f64 || VT == MVT::v4i1)

View File

@ -1181,6 +1181,23 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
}
// Single Precision Conversions (FP <-> INT)
def XSCVSXDSP : XX2Form<60, 312,
(outs vssrc:$XT), (ins vsfrc:$XB),
"xscvsxdsp $XT, $XB", IIC_VecFP,
[(set f32:$XT, (PPCfcfids f64:$XB))]>;
def XSCVUXDSP : XX2Form<60, 296,
(outs vssrc:$XT), (ins vsfrc:$XB),
"xscvuxdsp $XT, $XB", IIC_VecFP,
[(set f32:$XT, (PPCfcfidus f64:$XB))]>;
// Conversions between vector and scalar single precision
def XSCVDPSPN : XX2Form<60, 267, (outs vsrc:$XT), (ins vssrc:$XB),
"xscvdpspn $XT, $XB", IIC_VecFP, []>;
def XSCVSPDPN : XX2Form<60, 331, (outs vssrc:$XT), (ins vsrc:$XB),
"xscvspdpn $XT, $XB", IIC_VecFP, []>;
} // AddedComplexity = 400
} // HasP8Vector
@ -1204,3 +1221,60 @@ let Predicates = [HasDirectMove, HasVSX] in {
"mtvsrwz $XT, $rA", IIC_VecGeneral,
[(set f64:$XT, (PPCmtvsrz i32:$rA))]>;
} // HasDirectMove, HasVSX
/* Direct moves of various size entities from GPRs into VSRs. Each lines up
the value in element 0 (for both BE and LE). Namely, entities smaller than
a doubleword are shifted left and moved for BE. For LE, they are moved and
then swapped so the value lands in the least significant element of the VSR.
*/
// Named dag fragments used by the scalar_to_vector patterns below to move a
// GPR value into element 0 of a VSR without a store/reload through memory.
def Moves {
// BE: widen the 32-bit GPR to i64, rotate the byte into the most significant
// byte of the doubleword, then move it directly to the VSR.
dag BE_BYTE_0 = (MTVSRD
(RLDICR
(INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32), 56, 7));
// BE: same, with the halfword rotated into the top 16 bits.
dag BE_HALF_0 = (MTVSRD
(RLDICR
(INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32), 48, 15));
// BE: same, with the word rotated into the top 32 bits.
dag BE_WORD_0 = (MTVSRD
(RLDICR
(INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32), 32, 31));
// A full doubleword needs no shifting before the move.
dag BE_DWORD_0 = (MTVSRD $A);
// LE: widen the word to i64 and move it; the XXPERMDI (doubleword swap)
// below then places it in the least significant element.
dag LE_MTVSRW = (MTVSRD (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32));
dag LE_WORD_1 = (v2i64 (COPY_TO_REGCLASS LE_MTVSRW, VSRC));
dag LE_WORD_0 = (XXPERMDI LE_WORD_1, LE_WORD_1, 2);
// LE doubleword: reuse the BE move, then swap doublewords.
dag LE_DWORD_1 = (v2i64 (COPY_TO_REGCLASS BE_DWORD_0, VSRC));
dag LE_DWORD_0 = (XXPERMDI LE_DWORD_1, LE_DWORD_1, 2);
}
// BE: a single xscvdpspn leaves the single-precision value in element 0, so
// no shuffle is required.
let Predicates = [IsBigEndian, HasP8Vector] in {
def : Pat<(v4f32 (scalar_to_vector f32:$A)),
(v4f32 (XSCVDPSPN $A))>;
} // IsBigEndian, HasP8Vector
// BE integer scalar_to_vector: pick the Moves fragment that shifts the value
// into the high-order bits of the doubleword, then retag the result as the
// requested vector type via COPY_TO_REGCLASS.
let Predicates = [IsBigEndian, HasDirectMove] in {
def : Pat<(v16i8 (scalar_to_vector i32:$A)),
(v16i8 (COPY_TO_REGCLASS Moves.BE_BYTE_0, VSRC))>;
def : Pat<(v8i16 (scalar_to_vector i32:$A)),
(v8i16 (COPY_TO_REGCLASS Moves.BE_HALF_0, VSRC))>;
def : Pat<(v4i32 (scalar_to_vector i32:$A)),
(v4i32 (COPY_TO_REGCLASS Moves.BE_WORD_0, VSRC))>;
def : Pat<(v2i64 (scalar_to_vector i64:$A)),
(v2i64 (COPY_TO_REGCLASS Moves.BE_DWORD_0, VSRC))>;
} // IsBigEndian, HasDirectMove
// LE: after the conversion, rotate the word into the least significant
// element with xxsldwi.
let Predicates = [IsLittleEndian, HasP8Vector] in {
def : Pat<(v4f32 (scalar_to_vector f32:$A)),
(v4f32 (XXSLDWI (XSCVDPSPN $A), (XSCVDPSPN $A), 1))>;
} // IsLittleEndian, HasP8Vector
// LE integer scalar_to_vector: byte, halfword and word all arrive in the low
// 32 bits of the GPR, so they share the LE_WORD_0 fragment (move + swap); a
// doubleword uses LE_DWORD_0, which is already typed v2i64.
let Predicates = [IsLittleEndian, HasDirectMove] in {
def : Pat<(v16i8 (scalar_to_vector i32:$A)),
(v16i8 (COPY_TO_REGCLASS Moves.LE_WORD_0, VSRC))>;
def : Pat<(v8i16 (scalar_to_vector i32:$A)),
(v8i16 (COPY_TO_REGCLASS Moves.LE_WORD_0, VSRC))>;
def : Pat<(v4i32 (scalar_to_vector i32:$A)),
(v4i32 (COPY_TO_REGCLASS Moves.LE_WORD_0, VSRC))>;
def : Pat<(v2i64 (scalar_to_vector i64:$A)),
(v2i64 Moves.LE_DWORD_0)>;
} // IsLittleEndian, HasDirectMove

View File

@ -77,6 +77,10 @@ namespace {
return IsRegInClass(Reg, &PPC::F8RCRegClass, MRI);
}
bool IsVSFReg(unsigned Reg, MachineRegisterInfo &MRI) {
return IsRegInClass(Reg, &PPC::VSFRCRegClass, MRI);
}
protected:
bool processBlock(MachineBasicBlock &MBB) {
bool Changed = false;
@ -100,7 +104,8 @@ protected:
IsVRReg(SrcMO.getReg(), MRI) ? &PPC::VSHRCRegClass :
&PPC::VSLRCRegClass;
assert((IsF8Reg(SrcMO.getReg(), MRI) ||
IsVRReg(SrcMO.getReg(), MRI)) &&
IsVRReg(SrcMO.getReg(), MRI) ||
IsVSFReg(SrcMO.getReg(), MRI)) &&
"Unknown source for a VSX copy");
unsigned NewVReg = MRI.createVirtualRegister(SrcRC);

View File

@ -24,8 +24,7 @@ entry:
ret float %conv
; CHECK-LABEL: @_Z6testfcc
; CHECK: mtvsrwz [[MOVEREG01:[0-9]+]], 3
; FIXME: Once we have XSCVUXDSP implemented, this will change
; CHECK: fcfidus 1, [[MOVEREG01]]
; CHECK: xscvuxdsp 1, [[MOVEREG01]]
}
; Function Attrs: nounwind
@ -77,8 +76,7 @@ entry:
ret float %conv
; CHECK-LABEL: @_Z7testfuch
; CHECK: mtvsrwz [[MOVEREG03:[0-9]+]], 3
; FIXME: Once we have XSCVUXDSP implemented, this will change
; CHECK: fcfidus 1, [[MOVEREG03]]
; CHECK: xscvuxdsp 1, [[MOVEREG03]]
}
; Function Attrs: nounwind
@ -130,8 +128,7 @@ entry:
ret float %conv
; CHECK-LABEL: @_Z6testfss
; CHECK: mtvsrwa [[MOVEREG05:[0-9]+]], 3
; FIXME: Once we have XSCVSXDSP implemented, this will change
; CHECK: fcfids 1, [[MOVEREG05]]
; CHECK: xscvsxdsp 1, [[MOVEREG05]]
}
; Function Attrs: nounwind
@ -183,8 +180,7 @@ entry:
ret float %conv
; CHECK-LABEL: @_Z7testfust
; CHECK: mtvsrwz [[MOVEREG07:[0-9]+]], 3
; FIXME: Once we have XSCVUXDSP implemented, this will change
; CHECK: fcfidus 1, [[MOVEREG07]]
; CHECK: xscvuxdsp 1, [[MOVEREG07]]
}
; Function Attrs: nounwind
@ -236,8 +232,7 @@ entry:
ret float %conv
; CHECK-LABEL: @_Z6testfii
; CHECK: mtvsrwa [[MOVEREG09:[0-9]+]], 3
; FIXME: Once we have XSCVSXDSP implemented, this will change
; CHECK: fcfids 1, [[MOVEREG09]]
; CHECK: xscvsxdsp 1, [[MOVEREG09]]
}
; Function Attrs: nounwind
@ -289,8 +284,7 @@ entry:
ret float %conv
; CHECK-LABEL: @_Z7testfuij
; CHECK: mtvsrwz [[MOVEREG11:[0-9]+]], 3
; FIXME: Once we have XSCVUXDSP implemented, this will change
; CHECK: fcfidus 1, [[MOVEREG11]]
; CHECK: xscvuxdsp 1, [[MOVEREG11]]
}
; Function Attrs: nounwind
@ -342,8 +336,7 @@ entry:
ret float %conv
; CHECK-LABEL:@_Z7testfllx
; CHECK: mtvsrd [[MOVEREG13:[0-9]+]], 3
; FIXME: Once we have XSCVSXDSP implemented, this will change
; CHECK: fcfids 1, [[MOVEREG13]]
; CHECK: xscvsxdsp 1, [[MOVEREG13]]
}
; Function Attrs: nounwind
@ -395,8 +388,7 @@ entry:
ret float %conv
; CHECK-LABEL: @_Z8testfully
; CHECK: mtvsrd [[MOVEREG15:[0-9]+]], 3
; FIXME: Once we have XSCVUXDSP implemented, this will change
; CHECK: fcfidus 1, [[MOVEREG15]]
; CHECK: xscvuxdsp 1, [[MOVEREG15]]
}
; Function Attrs: nounwind

View File

@ -0,0 +1,79 @@
; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s
; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s -check-prefix=CHECK-LE
; The build[csilf] functions simply test the scalar_to_vector handling with
; direct moves. This corresponds to the "insertelement" instruction. Subsequent
; to this, there will be a splat corresponding to the shufflevector.
; Function Attrs: nounwind
; i8 splat: expect a direct move with no stack traffic — BE shifts the byte
; into the top of a doubleword before mtvsrd; LE moves then swaps doublewords.
define <16 x i8> @buildc(i8 zeroext %a) {
entry:
%a.addr = alloca i8, align 1
store i8 %a, i8* %a.addr, align 1
%0 = load i8, i8* %a.addr, align 1
%splat.splatinsert = insertelement <16 x i8> undef, i8 %0, i32 0
%splat.splat = shufflevector <16 x i8> %splat.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
ret <16 x i8> %splat.splat
; CHECK: sldi [[REG1:[0-9]+]], 3, 56
; CHECK: mtvsrd {{[0-9]+}}, [[REG1]]
; CHECK-LE: mtvsrd [[REG1:[0-9]+]], 3
; CHECK-LE: xxswapd {{[0-9]+}}, [[REG1]]
}
; Function Attrs: nounwind
; i16 splat: same shape as buildc, with a 48-bit shift on BE.
define <8 x i16> @builds(i16 zeroext %a) {
entry:
%a.addr = alloca i16, align 2
store i16 %a, i16* %a.addr, align 2
%0 = load i16, i16* %a.addr, align 2
%splat.splatinsert = insertelement <8 x i16> undef, i16 %0, i32 0
%splat.splat = shufflevector <8 x i16> %splat.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
ret <8 x i16> %splat.splat
; CHECK: sldi [[REG1:[0-9]+]], 3, 48
; CHECK: mtvsrd {{[0-9]+}}, [[REG1]]
; CHECK-LE: mtvsrd [[REG1:[0-9]+]], 3
; CHECK-LE: xxswapd {{[0-9]+}}, [[REG1]]
}
; Function Attrs: nounwind
; i32 splat: same shape as buildc, with a 32-bit shift on BE.
define <4 x i32> @buildi(i32 zeroext %a) {
entry:
%a.addr = alloca i32, align 4
store i32 %a, i32* %a.addr, align 4
%0 = load i32, i32* %a.addr, align 4
%splat.splatinsert = insertelement <4 x i32> undef, i32 %0, i32 0
%splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
ret <4 x i32> %splat.splat
; CHECK: sldi [[REG1:[0-9]+]], 3, 32
; CHECK: mtvsrd {{[0-9]+}}, [[REG1]]
; CHECK-LE: mtvsrd [[REG1:[0-9]+]], 3
; CHECK-LE: xxswapd {{[0-9]+}}, [[REG1]]
}
; Function Attrs: nounwind
; i64 splat: the full doubleword needs no shift on BE — just the mtvsrd;
; LE still needs the doubleword swap.
define <2 x i64> @buildl(i64 %a) {
entry:
%a.addr = alloca i64, align 8
store i64 %a, i64* %a.addr, align 8
%0 = load i64, i64* %a.addr, align 8
%splat.splatinsert = insertelement <2 x i64> undef, i64 %0, i32 0
%splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer
ret <2 x i64> %splat.splat
; CHECK: mtvsrd {{[0-9]+}}, 3
; CHECK-LE: mtvsrd [[REG1:[0-9]+]], 3
; CHECK-LE: xxswapd {{[0-9]+}}, [[REG1]]
}
; Function Attrs: nounwind
; float splat: xscvdpspn alone suffices on BE; LE additionally rotates the
; word into the low element with xxsldwi.
define <4 x float> @buildf(float %a) {
entry:
%a.addr = alloca float, align 4
store float %a, float* %a.addr, align 4
%0 = load float, float* %a.addr, align 4
%splat.splatinsert = insertelement <4 x float> undef, float %0, i32 0
%splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
ret <4 x float> %splat.splat
; CHECK: xscvdpspn {{[0-9]+}}, 1
; CHECK-LE: xscvdpspn [[REG1:[0-9]+]], 1
; CHECK-LE: xxsldwi {{[0-9]+}}, [[REG1]], [[REG1]], 1
}

View File

@ -1226,11 +1226,11 @@ define <2 x i32> @test80(i32 %v) {
; CHECK-FISL: blr
; CHECK-LE-LABEL: @test80
; CHECK-LE-DAG: addi [[R1:[0-9]+]], 1, -16
; CHECK-LE-DAG: mtvsrd [[R1:[0-9]+]], 3
; CHECK-LE-DAG: addi [[R2:[0-9]+]], {{[0-9]+}}, .LCPI
; CHECK-LE-DAG: lxvd2x [[V1:[0-9]+]], 0, [[R1]]
; CHECK-LE-DAG: xxswapd [[V1:[0-9]+]], [[R1]]
; CHECK-LE-DAG: lxvd2x [[V2:[0-9]+]], 0, [[R2]]
; CHECK-LE-DAG: xxswapd 34, [[V1]]
; CHECK-LE-DAG: xxspltd 34, [[V1]]
; CHECK-LE-DAG: xxswapd 35, [[V2]]
; CHECK-LE: vaddudm 2, 2, 3
; CHECK-LE: blr

View File

@ -55,8 +55,7 @@ entry:
ret void
; CHECK-LABEL: @intToFlt
; CHECK: lxsiwax [[REGLD2:[0-9]+]],
; FIXME: the below will change when the VSX form is implemented
; CHECK: fcfids {{[0-9]}}, [[REGLD2]]
; CHECK: xscvsxdsp {{[0-9]}}, [[REGLD2]]
}
; Function Attrs: nounwind
@ -108,8 +107,7 @@ entry:
ret void
; CHECK-LABEL: @uIntToFlt
; CHECK: lxsiwzx [[REGLD4:[0-9]+]],
; FIXME: the below will change when the VSX form is implemented
; CHECK: fcfidus {{[0-9]+}}, [[REGLD4]]
; CHECK: xscvuxdsp {{[0-9]+}}, [[REGLD4]]
}
; Function Attrs: nounwind

View File

@ -57,6 +57,9 @@
# CHECK: xscvdpsp 7, 27
0xf0 0xe0 0xdc 0x24
# CHECK: xscvdpspn 7, 27
0xf0 0xe0 0xdc 0x2c
# CHECK: xscvdpsxds 7, 27
0xf0 0xe0 0xdd 0x60
@ -72,9 +75,18 @@
# CHECK: xscvspdp 7, 27
0xf0 0xe0 0xdd 0x24
# CHECK: xscvspdpn 7, 27
0xf0 0xe0 0xdd 0x2c
# CHECK: xscvsxdsp 7, 27
0xf0 0xe0 0xdc 0xe0
# CHECK: xscvsxddp 7, 27
0xf0 0xe0 0xdd 0xe0
# CHECK: xscvuxdsp 7, 27
0xf0 0xe0 0xdc 0xa0
# CHECK: xscvuxddp 7, 27
0xf0 0xe0 0xdd 0xa0

View File

@ -62,6 +62,9 @@
# CHECK-BE: xscvdpsp 7, 27 # encoding: [0xf0,0xe0,0xdc,0x24]
# CHECK-LE: xscvdpsp 7, 27 # encoding: [0x24,0xdc,0xe0,0xf0]
xscvdpsp 7, 27
# CHECK-BE: xscvdpspn 7, 27 # encoding: [0xf0,0xe0,0xdc,0x2c]
# CHECK-LE: xscvdpspn 7, 27 # encoding: [0x2c,0xdc,0xe0,0xf0]
xscvdpspn 7, 27
# CHECK-BE: xscvdpsxds 7, 27 # encoding: [0xf0,0xe0,0xdd,0x60]
# CHECK-LE: xscvdpsxds 7, 27 # encoding: [0x60,0xdd,0xe0,0xf0]
xscvdpsxds 7, 27
@ -77,9 +80,18 @@
# CHECK-BE: xscvspdp 7, 27 # encoding: [0xf0,0xe0,0xdd,0x24]
# CHECK-LE: xscvspdp 7, 27 # encoding: [0x24,0xdd,0xe0,0xf0]
xscvspdp 7, 27
# CHECK-BE: xscvspdpn 7, 27 # encoding: [0xf0,0xe0,0xdd,0x2c]
# CHECK-LE: xscvspdpn 7, 27 # encoding: [0x2c,0xdd,0xe0,0xf0]
xscvspdpn 7, 27
# CHECK-BE: xscvsxdsp 7, 27 # encoding: [0xf0,0xe0,0xdc,0xe0]
# CHECK-LE: xscvsxdsp 7, 27 # encoding: [0xe0,0xdc,0xe0,0xf0]
xscvsxdsp 7, 27
# CHECK-BE: xscvsxddp 7, 27 # encoding: [0xf0,0xe0,0xdd,0xe0]
# CHECK-LE: xscvsxddp 7, 27 # encoding: [0xe0,0xdd,0xe0,0xf0]
xscvsxddp 7, 27
# CHECK-BE: xscvuxdsp 7, 27 # encoding: [0xf0,0xe0,0xdc,0xa0]
# CHECK-LE: xscvuxdsp 7, 27 # encoding: [0xa0,0xdc,0xe0,0xf0]
xscvuxdsp 7, 27
# CHECK-BE: xscvuxddp 7, 27 # encoding: [0xf0,0xe0,0xdd,0xa0]
# CHECK-LE: xscvuxddp 7, 27 # encoding: [0xa0,0xdd,0xe0,0xf0]
xscvuxddp 7, 27