From dc6860fc8267e6046209498234b57adb8ce1bdac Mon Sep 17 00:00:00 2001
From: Hal Finkel
Date: Thu, 31 Mar 2016 02:56:05 +0000
Subject: [PATCH] [PowerPC] Load two floats directly instead of using one
 64-bit integer load

When dealing with complex<float>, and similar structures with two
single-precision floating-point numbers, especially when such things are being
passed around by value, we'll sometimes end up loading both float values by
extracting them from one 64-bit integer load. It looks like this:

  t13: i64,ch = load t0, t6, undef:i64
      t16: i64 = srl t13, Constant:i32<32>
    t17: i32 = truncate t16
  t18: f32 = bitcast t17
    t19: i32 = truncate t13
  t20: f32 = bitcast t19

The problem, especially before the P8 where those bitcasts aren't legal (and
get expanded via the stack), is that it would have been better to use two
floating-point loads directly. Here we add a target-specific DAGCombine to do
just that. In short, we turn:

	ld 3, 0(5)
	stw 3, -8(1)
	rldicl 3, 3, 32, 32
	stw 3, -4(1)
	lfs 3, -4(1)
	lfs 0, -8(1)

into:

	lfs 3, 4(5)
	lfs 0, 0(5)
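
For illustration, a minimal C++ reproducer might look like the sketch below.
The function and parameter names are hypothetical (the committed regression
test exercises the already-lowered IR in load-two-flts.ll rather than C++
source); any std::complex<float> that is loaded from memory and consumed
component-wise should produce the same i64-load/srl/truncate/bitcast pattern:

	// Hypothetical example: multiplying a by-value complex<float> by one
	// loaded from memory. Without this combine, *rhs is fetched with a
	// single 64-bit integer load and split into two floats via
	// srl/truncate/bitcast.
	#include <complex>

	void test(std::complex<float> lhs, std::complex<float> *rhs,
	          float *realp, float *imagp) {
	  std::complex<float> r = lhs * *rhs;
	  *realp = r.real();
	  *imagp = r.imag();
	}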

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@264988 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/PowerPC/PPCISelLowering.cpp | 105 +++++++++++++++++++++++++
 test/CodeGen/PowerPC/load-two-flts.ll  |  60 ++++++++++++++
 2 files changed, 165 insertions(+)
 create mode 100644 test/CodeGen/PowerPC/load-two-flts.ll

diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index f3251ba8db7..d0f43434c39 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -10268,6 +10268,111 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
       return expandVSXLoadForLE(N, DCI);
     }
 
+    // We sometimes end up with a 64-bit integer load, from which we extract
+    // two single-precision floating-point numbers. This happens with
+    // std::complex<float>, and other similar structures, because of the way we
+    // canonicalize structure copies. However, if we lack direct moves,
+    // then the final bitcasts from the extracted integer values to the
+    // floating-point numbers turn into store/load pairs. Even with direct moves,
+    // just loading the two floating-point numbers is likely better.
+    auto ReplaceTwoFloatLoad = [&]() {
+      if (VT != MVT::i64)
+        return false;
+
+      if (LD->getExtensionType() != ISD::NON_EXTLOAD ||
+          LD->isVolatile())
+        return false;
+
+      // We're looking for a sequence like this:
+      // t13: i64,ch = load t0, t6, undef:i64
+      //     t16: i64 = srl t13, Constant:i32<32>
+      //   t17: i32 = truncate t16
+      // t18: f32 = bitcast t17
+      //   t19: i32 = truncate t13
+      // t20: f32 = bitcast t19
+
+      if (!LD->hasNUsesOfValue(2, 0))
+        return false;
+
+      auto UI = LD->use_begin();
+      while (UI.getUse().getResNo() != 0) ++UI;
+      SDNode *Trunc = *UI++;
+      while (UI.getUse().getResNo() != 0) ++UI;
+      SDNode *RightShift = *UI;
+      if (Trunc->getOpcode() != ISD::TRUNCATE)
+        std::swap(Trunc, RightShift);
+
+      if (Trunc->getOpcode() != ISD::TRUNCATE ||
+          Trunc->getValueType(0) != MVT::i32 ||
+          !Trunc->hasOneUse())
+        return false;
+      if (RightShift->getOpcode() != ISD::SRL ||
+          !isa<ConstantSDNode>(RightShift->getOperand(1)) ||
+          RightShift->getConstantOperandVal(1) != 32 ||
+          !RightShift->hasOneUse())
+        return false;
+
+      SDNode *Trunc2 = *RightShift->use_begin();
+      if (Trunc2->getOpcode() != ISD::TRUNCATE ||
+          Trunc2->getValueType(0) != MVT::i32 ||
+          !Trunc2->hasOneUse())
+        return false;
+
+      SDNode *Bitcast = *Trunc->use_begin();
+      SDNode *Bitcast2 = *Trunc2->use_begin();
+
+      if (Bitcast->getOpcode() != ISD::BITCAST ||
+          Bitcast->getValueType(0) != MVT::f32)
+        return false;
+      if (Bitcast2->getOpcode() != ISD::BITCAST ||
+          Bitcast2->getValueType(0) != MVT::f32)
+        return false;
+
+      if (Subtarget.isLittleEndian())
+        std::swap(Bitcast, Bitcast2);
+
+      // Bitcast has the second float (in memory-layout order) and Bitcast2
+      // has the first one.
+
+      SDValue BasePtr = LD->getBasePtr();
+      if (LD->isIndexed()) {
+        assert(LD->getAddressingMode() == ISD::PRE_INC &&
+               "Non-pre-inc AM on PPC?");
+        BasePtr =
+          DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+                      LD->getOffset());
+      }
+
+      SDValue FloatLoad =
+        DAG.getLoad(MVT::f32, dl, LD->getChain(), BasePtr,
+                    LD->getPointerInfo(), false, LD->isNonTemporal(),
+                    LD->isInvariant(), LD->getAlignment(), LD->getAAInfo());
+      SDValue AddPtr =
+        DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
+                    BasePtr, DAG.getIntPtrConstant(4, dl));
+      SDValue FloatLoad2 =
+        DAG.getLoad(MVT::f32, dl, SDValue(FloatLoad.getNode(), 1), AddPtr,
+                    LD->getPointerInfo().getWithOffset(4), false,
+                    LD->isNonTemporal(), LD->isInvariant(),
+                    MinAlign(LD->getAlignment(), 4), LD->getAAInfo());
+
+      if (LD->isIndexed()) {
+        // Note that DAGCombine should re-form any pre-increment load(s) from
+        // what is produced here if that makes sense.
+        DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), BasePtr);
+      }
+
+      DCI.CombineTo(Bitcast2, FloatLoad);
+      DCI.CombineTo(Bitcast, FloatLoad2);
+
+      DAG.ReplaceAllUsesOfValueWith(SDValue(LD, LD->isIndexed() ? 2 : 1),
+                                    SDValue(FloatLoad2.getNode(), 1));
+      return true;
+    };
+
+    if (ReplaceTwoFloatLoad())
+      return SDValue(N, 0);
+
     EVT MemVT = LD->getMemoryVT();
     Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
     unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty);
diff --git a/test/CodeGen/PowerPC/load-two-flts.ll b/test/CodeGen/PowerPC/load-two-flts.ll
new file mode 100644
index 00000000000..270a852b1b0
--- /dev/null
+++ b/test/CodeGen/PowerPC/load-two-flts.ll
@@ -0,0 +1,60 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-bgq-linux"
+
+define void @_Z4testSt7complexIfE(float %v0, float %v1, i64* %ref.tmp, float* %_M_value.realp.i.i, float* %_M_value.imagp.i.i) {
+entry:
+  %v2 = load i64, i64* %ref.tmp, align 8
+  %v3 = lshr i64 %v2, 32
+  %v4 = trunc i64 %v3 to i32
+  %v5 = bitcast i32 %v4 to float
+  %v6 = trunc i64 %v2 to i32
+  %v7 = bitcast i32 %v6 to float
+  %mul_ad.i.i = fmul fast float %v5, %v1
+  %mul_bc.i.i = fmul fast float %v7, %v0
+  %mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
+  %mul_ac.i.i = fmul fast float %v5, %v0
+  %mul_bd.i.i = fmul fast float %v7, %v1
+  %mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
+  store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
+  store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
+  ret void
+
+; CHECK-LABEL: @_Z4testSt7complexIfE
+; CHECK-NOT: ld {{[0-9]+}}, 0(5)
+; CHECK-NOT: stw
+; CHECK-NOT: rldicl
+; CHECK-DAG: lfs {{[0-9]+}}, 4(5)
+; CHECK-DAG: lfs {{[0-9]+}}, 0(5)
+; CHECK: blr
+}
+
+define i64* @_Z4testSt7complexIfE_idx(float %v0, float %v1, i64* %ref.tmp, float* %_M_value.realp.i.i, float* %_M_value.imagp.i.i) {
+entry:
+  %r = getelementptr i64, i64* %ref.tmp, i64 1
+  %v2 = load i64, i64* %r, align 8
+  %v3 = lshr i64 %v2, 32
+  %v4 = trunc i64 %v3 to i32
+  %v5 = bitcast i32 %v4 to float
+  %v6 = trunc i64 %v2 to i32
+  %v7 = bitcast i32 %v6 to float
+  %mul_ad.i.i = fmul fast float %v5, %v1
+  %mul_bc.i.i = fmul fast float %v7, %v0
+  %mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
+  %mul_ac.i.i = fmul fast float %v5, %v0
+  %mul_bd.i.i = fmul fast float %v7, %v1
+  %mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
+  store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
+  store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
+  ret i64* %r
+
+; CHECK-LABEL: @_Z4testSt7complexIfE
+; CHECK-NOT: ld {{[0-9]+}}, 8(5)
+; CHECK-NOT: ldu {{[0-9]+}}, 8(5)
+; CHECK-NOT: stw
+; CHECK-NOT: rldicl
+; CHECK-DAG: lfsu {{[0-9]+}}, 8(5)
+; CHECK-DAG: lfs {{[0-9]+}}, 4(5)
+; CHECK: blr
+}
+