From 6512473ceef277705a9d0ac7824d319186e802d9 Mon Sep 17 00:00:00 2001 From: Stefan Pintilie Date: Mon, 18 Nov 2019 15:50:44 -0600 Subject: [PATCH] [PowerPC] Improve float vector gather codegen This patch aims to improve the code generation for float vector gather on POWER9. Patterns have been implemented to utilize instructions that deliver improved performance. Patch by: Kamau Bridgeman Differential Revision: https://reviews.llvm.org/D62908 --- llvm/lib/Target/PowerPC/PPCInstrVSX.td | 38 ++++++++++++++++++- .../CodeGen/PowerPC/float-vector-gather.ll | 35 ++++++++--------- 2 files changed, 51 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td index a2b27cd921e9..9d5db66bc573 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -3924,8 +3924,20 @@ def DblToULongLoad { dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (load xoaddr:$A))))); } +// FP load dags (for f32 -> v4f32) +def LoadFP { + dag A = (f32 (load xoaddr:$A)); + dag B = (f32 (load xoaddr:$B)); + dag C = (f32 (load xoaddr:$C)); + dag D = (f32 (load xoaddr:$D)); +} + // FP merge dags (for f32 -> v4f32) def MrgFP { + dag LD32A = (COPY_TO_REGCLASS (LIWZX xoaddr:$A), VSRC); + dag LD32B = (COPY_TO_REGCLASS (LIWZX xoaddr:$B), VSRC); + dag LD32C = (COPY_TO_REGCLASS (LIWZX xoaddr:$C), VSRC); + dag LD32D = (COPY_TO_REGCLASS (LIWZX xoaddr:$D), VSRC); dag AC = (XVCVDPSP (XXPERMDI (COPY_TO_REGCLASS $A, VSRC), (COPY_TO_REGCLASS $C, VSRC), 0)); dag BD = (XVCVDPSP (XXPERMDI (COPY_TO_REGCLASS $B, VSRC), @@ -4081,7 +4093,18 @@ let AddedComplexity = 400 in { (v2f64 (XXPERMDI (COPY_TO_REGCLASS $A, VSRC), (COPY_TO_REGCLASS $B, VSRC), 0))>; - + // Using VMRGEW to assemble the final vector would be a lower latency + // solution. However, we choose to go with the slightly higher latency + // XXPERMDI for 2 reasons: + // 1. 
This is likely to occur in unrolled loops with high register pressure, + // so we want to use XXPERMDI, which has access to all 64 VSX registers. + // 2. Using Altivec instructions in this sequence would likely cause the + // allocation of Altivec registers even for the loads which in turn would + // force the use of LXSIWZX for the loads, adding a cycle of latency to + // each of the loads which would otherwise be able to use LFIWZX. + def : Pat<(v4f32 (build_vector LoadFP.A, LoadFP.B, LoadFP.C, LoadFP.D)), + (v4f32 (XXPERMDI (XXMRGHW MrgFP.LD32A, MrgFP.LD32B), + (XXMRGHW MrgFP.LD32C, MrgFP.LD32D), 3))>; def : Pat<(v4f32 (build_vector f32:$A, f32:$B, f32:$C, f32:$D)), (VMRGEW MrgFP.AC, MrgFP.BD)>; def : Pat<(v4f32 (build_vector DblToFlt.A0, DblToFlt.A1, @@ -4148,7 +4171,18 @@ let AddedComplexity = 400 in { (v2f64 (XXPERMDI (COPY_TO_REGCLASS $B, VSRC), (COPY_TO_REGCLASS $A, VSRC), 0))>; - + // Using VMRGEW to assemble the final vector would be a lower latency + // solution. However, we choose to go with the slightly higher latency + // XXPERMDI for 2 reasons: + // 1. This is likely to occur in unrolled loops with high register pressure, + // so we want to use XXPERMDI, which has access to all 64 VSX registers. + // 2. Using Altivec instructions in this sequence would likely cause the + // allocation of Altivec registers even for the loads which in turn would + // force the use of LXSIWZX for the loads, adding a cycle of latency to + // each of the loads which would otherwise be able to use LFIWZX. 
+ def : Pat<(v4f32 (build_vector LoadFP.A, LoadFP.B, LoadFP.C, LoadFP.D)), + (v4f32 (XXPERMDI (XXMRGHW MrgFP.LD32D, MrgFP.LD32C), + (XXMRGHW MrgFP.LD32B, MrgFP.LD32A), 3))>; def : Pat<(v4f32 (build_vector f32:$D, f32:$C, f32:$B, f32:$A)), (VMRGEW MrgFP.AC, MrgFP.BD)>; def : Pat<(v4f32 (build_vector DblToFlt.A0, DblToFlt.A1, diff --git a/llvm/test/CodeGen/PowerPC/float-vector-gather.ll b/llvm/test/CodeGen/PowerPC/float-vector-gather.ll index 02d4967aae51..b7bb622a1f90 100644 --- a/llvm/test/CodeGen/PowerPC/float-vector-gather.ll +++ b/llvm/test/CodeGen/PowerPC/float-vector-gather.ll @@ -1,6 +1,5 @@ ; NOTE: This test ensures that for both Big and Little Endian cases a set of -; NOTE: 4 floats is gathered into a v4f32 register using xxmrghd, xvcvdpsp, -; NOTE: and vmrgew. +; NOTE: 4 floats is gathered into a v4f32 register using xxmrghw and xxmrgld ; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-vsr-nums-as-vr \ ; RUN: -ppc-asm-full-reg-names -mtriple=powerpc64le-unknown-linux-gnu < %s \ ; RUN: | FileCheck %s -check-prefix=CHECK-LE @@ -16,28 +15,24 @@ float* nocapture readonly %d) { ; } ; CHECK-LE-LABEL: vector_gatherf: ; CHECK-LE: # %bb.0: # %entry -; CHECK-LE-DAG: lfs f[[REG0:[0-9]+]], 0(r3) -; CHECK-LE-DAG: lfs f[[REG1:[0-9]+]], 0(r4) -; CHECK-LE-DAG: lfs f[[REG2:[0-9]+]], 0(r5) -; CHECK-LE-DAG: lfs f[[REG3:[0-9]+]], 0(r6) -; CHECK-LE-DAG: xxmrghd vs[[REG4:[0-9]+]], vs[[REG2]], vs[[REG0]] -; CHECK-LE-NEXT: xvcvdpsp v[[VREG2:[0-9]+]], vs[[REG4]] -; CHECK-LE-NEXT: xxmrghd vs[[REG5:[0-9]+]], vs[[REG3]], vs[[REG1]] -; CHECK-LE-NEXT: xvcvdpsp v[[VREG3:[0-9]+]], vs[[REG5]] -; CHECK-LE-NEXT: vmrgew v[[VREG:[0-9]+]], v[[VREG3]], v[[VREG2]] +; CHECK-LE-DAG: lfiwzx f[[REG0:[0-9]+]], 0, r6 +; CHECK-LE-DAG: lfiwzx f[[REG1:[0-9]+]], 0, r5 +; CHECK-LE-DAG: lfiwzx f[[REG2:[0-9]+]], 0, r4 +; CHECK-LE-DAG: lfiwzx f[[REG3:[0-9]+]], 0, r3 +; CHECK-LE-DAG: xxmrghw vs[[REG0]], vs[[REG0]], vs[[REG1]] +; CHECK-LE-DAG: xxmrghw vs[[REG4:[0-9]+]], vs[[REG2]], vs[[REG3]] +; CHECK-LE-NEXT: 
xxmrgld v[[REG:[0-9]+]], vs[[REG0]], vs[[REG4]] ; CHECK-LE-NEXT: blr ; CHECK-BE-LABEL: vector_gatherf: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-DAG: lfs f[[REG0:[0-9]+]], 0(r3) -; CHECK-BE-DAG: lfs f[[REG1:[0-9]+]], 0(r4) -; CHECK-BE-DAG: lfs f[[REG2:[0-9]+]], 0(r5) -; CHECK-BE-DAG: lfs f[[REG3:[0-9]+]], 0(r6) -; CHECK-BE-DAG: xxmrghd vs[[REG4:[0-9]+]], vs[[REG0]], vs[[REG2]] -; CHECK-BE-DAG: xxmrghd vs[[REG5:[0-9]+]], vs[[REG1]], vs[[REG3]] -; CHECK-BE-NEXT: xvcvdpsp v[[VREG2:[0-9]+]], vs[[REG5]] -; CHECK-BE-NEXT: xvcvdpsp v[[VREG3:[0-9]+]], vs[[REG4]] -; CHECK-BE-NEXT: vmrgew v[[VREG:[0-9]+]], v[[VREG3]], v[[VREG2]] +; CHECK-BE-DAG: lfiwzx f[[REG0:[0-9]+]], 0, r3 +; CHECK-BE-DAG: lfiwzx f[[REG1:[0-9]+]], 0, r4 +; CHECK-BE-DAG: lfiwzx f[[REG2:[0-9]+]], 0, r5 +; CHECK-BE-DAG: lfiwzx f[[REG3:[0-9]+]], 0, r6 +; CHECK-BE-DAG: xxmrghw vs[[REG0]], vs[[REG0]], vs[[REG1]] +; CHECK-BE-DAG: xxmrghw vs[[REG4:[0-9]+]], vs[[REG2]], vs[[REG3]] +; CHECK-BE-NEXT: xxmrgld v[[REG:[0-9]+]], vs[[REG0]], vs[[REG4]] ; CHECK-BE-NEXT: blr entry: %0 = load float, float* %a, align 4