[PowerPC] Improve float vector gather codegen

This patch improves code generation for gathering four floats into a v4f32
vector on POWER9. New patterns match a build_vector of four loaded floats to
integer word loads (lfiwzx) and VSX merges (xxmrghw/xxmrgld), replacing the
previous lfs/xxmrghd/xvcvdpsp/vmrgew sequence that round-tripped through
double precision.
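
For context, a minimal C sketch (not part of the patch; the function name is
illustrative and AltiVec/VSX vector extensions are assumed) of the kind of
source that produces this build_vector-of-loads pattern; the updated test
case below was generated from similar code:

  #include <altivec.h>

  // Gather four independently addressed floats into one v4f32.
  // With these patterns, POWER9 loads each element with lfiwzx and
  // merges the words with xxmrghw/xxmrgld instead of going through
  // the lfs/xxmrghd/xvcvdpsp/vmrgew sequence.
  vector float gather_floats(const float *a, const float *b,
                             const float *c, const float *d) {
    return (vector float){*a, *b, *c, *d};
  }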

Patch by: Kamau Bridgeman

Differential Revision: https://reviews.llvm.org/D62908
Committed by Stefan Pintilie, 2019-11-18 15:50:44 -06:00
parent 0213adde21
commit 6512473cee
2 changed files with 51 additions and 22 deletions


@@ -3924,8 +3924,20 @@ def DblToULongLoad {
dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (load xoaddr:$A)))));
}
// FP load dags (for f32 -> v4f32)
def LoadFP {
dag A = (f32 (load xoaddr:$A));
dag B = (f32 (load xoaddr:$B));
dag C = (f32 (load xoaddr:$C));
dag D = (f32 (load xoaddr:$D));
}
// FP merge dags (for f32 -> v4f32)
def MrgFP {
dag LD32A = (COPY_TO_REGCLASS (LIWZX xoaddr:$A), VSRC);
dag LD32B = (COPY_TO_REGCLASS (LIWZX xoaddr:$B), VSRC);
dag LD32C = (COPY_TO_REGCLASS (LIWZX xoaddr:$C), VSRC);
dag LD32D = (COPY_TO_REGCLASS (LIWZX xoaddr:$D), VSRC);
dag AC = (XVCVDPSP (XXPERMDI (COPY_TO_REGCLASS $A, VSRC),
(COPY_TO_REGCLASS $C, VSRC), 0));
dag BD = (XVCVDPSP (XXPERMDI (COPY_TO_REGCLASS $B, VSRC),
@@ -4081,7 +4093,18 @@ let AddedComplexity = 400 in {
(v2f64 (XXPERMDI
(COPY_TO_REGCLASS $A, VSRC),
(COPY_TO_REGCLASS $B, VSRC), 0))>;
// Using VMRGEW to assemble the final vector would be a lower latency
// solution. However, we choose to go with the slightly higher latency
// XXPERMDI for 2 reasons:
// 1. This is likely to occur in unrolled loops where regpressure is high,
// so we want to use the latter as it has access to all 64 VSX registers.
// 2. Using Altivec instructions in this sequence would likely cause the
// allocation of Altivec registers even for the loads which in turn would
// force the use of LXSIWZX for the loads, adding a cycle of latency to
// each of the loads which would otherwise be able to use LFIWZX.
def : Pat<(v4f32 (build_vector LoadFP.A, LoadFP.B, LoadFP.C, LoadFP.D)),
(v4f32 (XXPERMDI (XXMRGHW MrgFP.LD32A, MrgFP.LD32B),
(XXMRGHW MrgFP.LD32C, MrgFP.LD32D), 3))>;
def : Pat<(v4f32 (build_vector f32:$A, f32:$B, f32:$C, f32:$D)),
(VMRGEW MrgFP.AC, MrgFP.BD)>;
def : Pat<(v4f32 (build_vector DblToFlt.A0, DblToFlt.A1,
@@ -4148,7 +4171,18 @@ let AddedComplexity = 400 in {
(v2f64 (XXPERMDI
(COPY_TO_REGCLASS $B, VSRC),
(COPY_TO_REGCLASS $A, VSRC), 0))>;
// Using VMRGEW to assemble the final vector would be a lower latency
// solution. However, we choose to go with the slightly higher latency
// XXPERMDI for 2 reasons:
// 1. This is likely to occur in unrolled loops where regpressure is high,
// so we want to use the latter as it has access to all 64 VSX registers.
// 2. Using Altivec instructions in this sequence would likely cause the
// allocation of Altivec registers even for the loads which in turn would
// force the use of LXSIWZX for the loads, adding a cycle of latency to
// each of the loads which would otherwise be able to use LFIWZX.
def : Pat<(v4f32 (build_vector LoadFP.A, LoadFP.B, LoadFP.C, LoadFP.D)),
(v4f32 (XXPERMDI (XXMRGHW MrgFP.LD32D, MrgFP.LD32C),
(XXMRGHW MrgFP.LD32B, MrgFP.LD32A), 3))>;
def : Pat<(v4f32 (build_vector f32:$D, f32:$C, f32:$B, f32:$A)),
(VMRGEW MrgFP.AC, MrgFP.BD)>;
def : Pat<(v4f32 (build_vector DblToFlt.A0, DblToFlt.A1,
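
The comment blocks above prefer the XXMRGHW/XXPERMDI assembly over a
lower-latency VMRGEW sequence because this build_vector typically appears in
unrolled loops with high register pressure. A hedged C sketch of such a loop
(illustrative only, not taken from the patch):

  #include <altivec.h>

  // Once unrolled, several independent four-float gathers are live at the
  // same time, so patterns that can use all 64 VSX registers and plain
  // lfiwzx loads are preferred over ones that tie values to the 32 Altivec
  // registers and force lxsiwzx, which adds a cycle of latency per load.
  void gather_rows(vector float *out, const float *a, const float *b,
                   const float *c, const float *d, int n) {
    for (int i = 0; i < n; ++i)
      out[i] = (vector float){a[i], b[i], c[i], d[i]};
  }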


@@ -1,6 +1,5 @@
; NOTE: This test ensures that for both Big and Little Endian cases a set of
-; NOTE: 4 floats is gathered into a v4f32 register using xxmrghd, xvcvdpsp,
-; NOTE: and vmrgew.
+; NOTE: 4 floats is gathered into a v4f32 register using xxmrghw and xxmrgld
; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-vsr-nums-as-vr \
; RUN: -ppc-asm-full-reg-names -mtriple=powerpc64le-unknown-linux-gnu < %s \
; RUN: | FileCheck %s -check-prefix=CHECK-LE
@@ -16,28 +15,24 @@ float* nocapture readonly %d) {
; }
; CHECK-LE-LABEL: vector_gatherf:
; CHECK-LE: # %bb.0: # %entry
-; CHECK-LE-DAG: lfs f[[REG0:[0-9]+]], 0(r3)
-; CHECK-LE-DAG: lfs f[[REG1:[0-9]+]], 0(r4)
-; CHECK-LE-DAG: lfs f[[REG2:[0-9]+]], 0(r5)
-; CHECK-LE-DAG: lfs f[[REG3:[0-9]+]], 0(r6)
-; CHECK-LE-DAG: xxmrghd vs[[REG4:[0-9]+]], vs[[REG2]], vs[[REG0]]
-; CHECK-LE-NEXT: xvcvdpsp v[[VREG2:[0-9]+]], vs[[REG4]]
-; CHECK-LE-NEXT: xxmrghd vs[[REG5:[0-9]+]], vs[[REG3]], vs[[REG1]]
-; CHECK-LE-NEXT: xvcvdpsp v[[VREG3:[0-9]+]], vs[[REG5]]
-; CHECK-LE-NEXT: vmrgew v[[VREG:[0-9]+]], v[[VREG3]], v[[VREG2]]
+; CHECK-LE-DAG: lfiwzx f[[REG0:[0-9]+]], 0, r6
+; CHECK-LE-DAG: lfiwzx f[[REG1:[0-9]+]], 0, r5
+; CHECK-LE-DAG: lfiwzx f[[REG2:[0-9]+]], 0, r4
+; CHECK-LE-DAG: lfiwzx f[[REG3:[0-9]+]], 0, r3
+; CHECK-LE-DAG: xxmrghw vs[[REG0]], vs[[REG0]], vs[[REG1]]
+; CHECK-LE-DAG: xxmrghw vs[[REG4:[0-9]+]], vs[[REG2]], vs[[REG3]]
+; CHECK-LE-NEXT: xxmrgld v[[REG:[0-9]+]], vs[[REG0]], vs[[REG4]]
; CHECK-LE-NEXT: blr
; CHECK-BE-LABEL: vector_gatherf:
; CHECK-BE: # %bb.0: # %entry
-; CHECK-BE-DAG: lfs f[[REG0:[0-9]+]], 0(r3)
-; CHECK-BE-DAG: lfs f[[REG1:[0-9]+]], 0(r4)
-; CHECK-BE-DAG: lfs f[[REG2:[0-9]+]], 0(r5)
-; CHECK-BE-DAG: lfs f[[REG3:[0-9]+]], 0(r6)
-; CHECK-BE-DAG: xxmrghd vs[[REG4:[0-9]+]], vs[[REG0]], vs[[REG2]]
-; CHECK-BE-DAG: xxmrghd vs[[REG5:[0-9]+]], vs[[REG1]], vs[[REG3]]
-; CHECK-BE-NEXT: xvcvdpsp v[[VREG2:[0-9]+]], vs[[REG5]]
-; CHECK-BE-NEXT: xvcvdpsp v[[VREG3:[0-9]+]], vs[[REG4]]
-; CHECK-BE-NEXT: vmrgew v[[VREG:[0-9]+]], v[[VREG3]], v[[VREG2]]
+; CHECK-BE-DAG: lfiwzx f[[REG0:[0-9]+]], 0, r3
+; CHECK-BE-DAG: lfiwzx f[[REG1:[0-9]+]], 0, r4
+; CHECK-BE-DAG: lfiwzx f[[REG2:[0-9]+]], 0, r5
+; CHECK-BE-DAG: lfiwzx f[[REG3:[0-9]+]], 0, r6
+; CHECK-BE-DAG: xxmrghw vs[[REG0]], vs[[REG0]], vs[[REG1]]
+; CHECK-BE-DAG: xxmrghw vs[[REG4:[0-9]+]], vs[[REG2]], vs[[REG3]]
+; CHECK-BE-NEXT: xxmrgld v[[REG:[0-9]+]], vs[[REG0]], vs[[REG4]]
; CHECK-BE-NEXT: blr
entry:
%0 = load float, float* %a, align 4