[PowerPC] Improve float vector gather codegen

This patch improves code generation for gathering four floats into a v4f32
vector on POWER9. New patterns match a build_vector of four loaded floats to
integer word loads (lfiwzx) and VSX merges (xxmrghw/xxmrgld), replacing the
previous lfs/xxmrghd/xvcvdpsp/vmrgew sequence that round-tripped through
double precision.
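
For context, a minimal C sketch (not part of the patch; the function name is
illustrative and AltiVec/VSX vector extensions are assumed) of the kind of
source that produces this build_vector-of-loads pattern; the updated test
case below was generated from similar code:

  #include <altivec.h>

  // Gather four independently addressed floats into one v4f32.
  // With these patterns, POWER9 loads each element with lfiwzx and
  // merges the words with xxmrghw/xxmrgld instead of going through
  // the lfs/xxmrghd/xvcvdpsp/vmrgew sequence.
  vector float gather_floats(const float *a, const float *b,
                             const float *c, const float *d) {
    return (vector float){*a, *b, *c, *d};
  }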

Patch by: Kamau Bridgeman

Differential Revision: https://reviews.llvm.org/D62908
Committed by Stefan Pintilie, 2019-11-18 15:50:44 -06:00
parent 0213adde21
commit 6512473cee
2 changed files with 51 additions and 22 deletions


@@ -3924,8 +3924,20 @@ def DblToULongLoad {
dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (load xoaddr:$A)))));
}
// FP load dags (for f32 -> v4f32)
def LoadFP {
dag A = (f32 (load xoaddr:$A));
dag B = (f32 (load xoaddr:$B));
dag C = (f32 (load xoaddr:$C));
dag D = (f32 (load xoaddr:$D));
}
// FP merge dags (for f32 -> v4f32)
def MrgFP {
dag LD32A = (COPY_TO_REGCLASS (LIWZX xoaddr:$A), VSRC);
dag LD32B = (COPY_TO_REGCLASS (LIWZX xoaddr:$B), VSRC);
dag LD32C = (COPY_TO_REGCLASS (LIWZX xoaddr:$C), VSRC);
dag LD32D = (COPY_TO_REGCLASS (LIWZX xoaddr:$D), VSRC);
dag AC = (XVCVDPSP (XXPERMDI (COPY_TO_REGCLASS $A, VSRC),
(COPY_TO_REGCLASS $C, VSRC), 0));
dag BD = (XVCVDPSP (XXPERMDI (COPY_TO_REGCLASS $B, VSRC),
@@ -4081,7 +4093,18 @@ let AddedComplexity = 400 in {
(v2f64 (XXPERMDI
(COPY_TO_REGCLASS $A, VSRC),
(COPY_TO_REGCLASS $B, VSRC), 0))>;
// Using VMRGEW to assemble the final vector would be a lower latency
// solution. However, we choose to go with the slightly higher latency
// XXPERMDI for 2 reasons:
// 1. This is likely to occur in unrolled loops where regpressure is high,
// so we want to use the latter as it has access to all 64 VSX registers.
// 2. Using Altivec instructions in this sequence would likely cause the
// allocation of Altivec registers even for the loads which in turn would
// force the use of LXSIWZX for the loads, adding a cycle of latency to
// each of the loads which would otherwise be able to use LFIWZX.
def : Pat<(v4f32 (build_vector LoadFP.A, LoadFP.B, LoadFP.C, LoadFP.D)),
(v4f32 (XXPERMDI (XXMRGHW MrgFP.LD32A, MrgFP.LD32B),
(XXMRGHW MrgFP.LD32C, MrgFP.LD32D), 3))>;
def : Pat<(v4f32 (build_vector f32:$A, f32:$B, f32:$C, f32:$D)),
(VMRGEW MrgFP.AC, MrgFP.BD)>;
def : Pat<(v4f32 (build_vector DblToFlt.A0, DblToFlt.A1,
@@ -4148,7 +4171,18 @@ let AddedComplexity = 400 in {
(v2f64 (XXPERMDI
(COPY_TO_REGCLASS $B, VSRC),
(COPY_TO_REGCLASS $A, VSRC), 0))>;
// Using VMRGEW to assemble the final vector would be a lower latency
// solution. However, we choose to go with the slightly higher latency
// XXPERMDI for 2 reasons:
// 1. This is likely to occur in unrolled loops where regpressure is high,
// so we want to use the latter as it has access to all 64 VSX registers.
// 2. Using Altivec instructions in this sequence would likely cause the
// allocation of Altivec registers even for the loads which in turn would
// force the use of LXSIWZX for the loads, adding a cycle of latency to
// each of the loads which would otherwise be able to use LFIWZX.
def : Pat<(v4f32 (build_vector LoadFP.A, LoadFP.B, LoadFP.C, LoadFP.D)),
(v4f32 (XXPERMDI (XXMRGHW MrgFP.LD32D, MrgFP.LD32C),
(XXMRGHW MrgFP.LD32B, MrgFP.LD32A), 3))>;
def : Pat<(v4f32 (build_vector f32:$D, f32:$C, f32:$B, f32:$A)),
(VMRGEW MrgFP.AC, MrgFP.BD)>;
def : Pat<(v4f32 (build_vector DblToFlt.A0, DblToFlt.A1,
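
The comment blocks above prefer the XXMRGHW/XXPERMDI assembly over a
lower-latency VMRGEW sequence because this build_vector typically appears in
unrolled loops with high register pressure. A hedged C sketch of such a loop
(illustrative only, not taken from the patch):

  #include <altivec.h>

  // Once unrolled, several independent four-float gathers are live at the
  // same time, so patterns that can use all 64 VSX registers and plain
  // lfiwzx loads are preferred over ones that tie values to the 32 Altivec
  // registers and force lxsiwzx, which adds a cycle of latency per load.
  void gather_rows(vector float *out, const float *a, const float *b,
                   const float *c, const float *d, int n) {
    for (int i = 0; i < n; ++i)
      out[i] = (vector float){a[i], b[i], c[i], d[i]};
  }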


@@ -1,6 +1,5 @@
; NOTE: This test ensures that for both Big and Little Endian cases a set of
-; NOTE: 4 floats is gathered into a v4f32 register using xxmrghd, xvcvdpsp,
-; NOTE: and vmrgew.
+; NOTE: 4 floats is gathered into a v4f32 register using xxmrghw and xxmrgld
; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-vsr-nums-as-vr \
; RUN: -ppc-asm-full-reg-names -mtriple=powerpc64le-unknown-linux-gnu < %s \
; RUN: | FileCheck %s -check-prefix=CHECK-LE
@@ -16,28 +15,24 @@ float* nocapture readonly %d) {
; }
; CHECK-LE-LABEL: vector_gatherf:
; CHECK-LE: # %bb.0: # %entry
-; CHECK-LE-DAG: lfs f[[REG0:[0-9]+]], 0(r3)
-; CHECK-LE-DAG: lfs f[[REG1:[0-9]+]], 0(r4)
-; CHECK-LE-DAG: lfs f[[REG2:[0-9]+]], 0(r5)
-; CHECK-LE-DAG: lfs f[[REG3:[0-9]+]], 0(r6)
-; CHECK-LE-DAG: xxmrghd vs[[REG4:[0-9]+]], vs[[REG2]], vs[[REG0]]
-; CHECK-LE-NEXT: xvcvdpsp v[[VREG2:[0-9]+]], vs[[REG4]]
-; CHECK-LE-NEXT: xxmrghd vs[[REG5:[0-9]+]], vs[[REG3]], vs[[REG1]]
-; CHECK-LE-NEXT: xvcvdpsp v[[VREG3:[0-9]+]], vs[[REG5]]
-; CHECK-LE-NEXT: vmrgew v[[VREG:[0-9]+]], v[[VREG3]], v[[VREG2]]
+; CHECK-LE-DAG: lfiwzx f[[REG0:[0-9]+]], 0, r6
+; CHECK-LE-DAG: lfiwzx f[[REG1:[0-9]+]], 0, r5
+; CHECK-LE-DAG: lfiwzx f[[REG2:[0-9]+]], 0, r4
+; CHECK-LE-DAG: lfiwzx f[[REG3:[0-9]+]], 0, r3
+; CHECK-LE-DAG: xxmrghw vs[[REG0]], vs[[REG0]], vs[[REG1]]
+; CHECK-LE-DAG: xxmrghw vs[[REG4:[0-9]+]], vs[[REG2]], vs[[REG3]]
+; CHECK-LE-NEXT: xxmrgld v[[REG:[0-9]+]], vs[[REG0]], vs[[REG4]]
; CHECK-LE-NEXT: blr
; CHECK-BE-LABEL: vector_gatherf:
; CHECK-BE: # %bb.0: # %entry
-; CHECK-BE-DAG: lfs f[[REG0:[0-9]+]], 0(r3)
-; CHECK-BE-DAG: lfs f[[REG1:[0-9]+]], 0(r4)
-; CHECK-BE-DAG: lfs f[[REG2:[0-9]+]], 0(r5)
-; CHECK-BE-DAG: lfs f[[REG3:[0-9]+]], 0(r6)
-; CHECK-BE-DAG: xxmrghd vs[[REG4:[0-9]+]], vs[[REG0]], vs[[REG2]]
-; CHECK-BE-DAG: xxmrghd vs[[REG5:[0-9]+]], vs[[REG1]], vs[[REG3]]
-; CHECK-BE-NEXT: xvcvdpsp v[[VREG2:[0-9]+]], vs[[REG5]]
-; CHECK-BE-NEXT: xvcvdpsp v[[VREG3:[0-9]+]], vs[[REG4]]
-; CHECK-BE-NEXT: vmrgew v[[VREG:[0-9]+]], v[[VREG3]], v[[VREG2]]
+; CHECK-BE-DAG: lfiwzx f[[REG0:[0-9]+]], 0, r3
+; CHECK-BE-DAG: lfiwzx f[[REG1:[0-9]+]], 0, r4
+; CHECK-BE-DAG: lfiwzx f[[REG2:[0-9]+]], 0, r5
+; CHECK-BE-DAG: lfiwzx f[[REG3:[0-9]+]], 0, r6
+; CHECK-BE-DAG: xxmrghw vs[[REG0]], vs[[REG0]], vs[[REG1]]
+; CHECK-BE-DAG: xxmrghw vs[[REG4:[0-9]+]], vs[[REG2]], vs[[REG3]]
+; CHECK-BE-NEXT: xxmrgld v[[REG:[0-9]+]], vs[[REG0]], vs[[REG4]]
; CHECK-BE-NEXT: blr
entry:
%0 = load float, float* %a, align 4