From dc6860fc8267e6046209498234b57adb8ce1bdac Mon Sep 17 00:00:00 2001
From: Hal Finkel
Date: Thu, 31 Mar 2016 02:56:05 +0000
Subject: [PATCH] [PowerPC] Load two floats directly instead of using one
 64-bit integer load

When dealing with complex<float>, and similar structures with two
single-precision floating-point numbers, especially when such things are being
passed around by value, we'll sometimes end up loading both float values by
extracting them from one 64-bit integer load. It looks like this:

  t13: i64,ch = load t0, t6, undef:i64
      t16: i64 = srl t13, Constant:i32<32>
    t17: i32 = truncate t16
  t18: f32 = bitcast t17
    t19: i32 = truncate t13
  t20: f32 = bitcast t19

The problem, especially before the P8 where those bitcasts aren't legal (and
get expanded via the stack), is that it would have been better to use two
floating-point loads directly. Here we add a target-specific DAGCombine to do
just that. In short, we turn:

	ld 3, 0(5)
	stw 3, -8(1)
	rldicl 3, 3, 32, 32
	stw 3, -4(1)
	lfs 3, -4(1)
	lfs 0, -8(1)

into:

	lfs 3, 4(5)
	lfs 0, 0(5)
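
For illustration, a minimal C++ reproducer might look like the sketch below.
The function and parameter names are hypothetical (the committed regression
test exercises the already-lowered IR in load-two-flts.ll rather than C++
source); any std::complex<float> that is loaded from memory and consumed
component-wise should produce the same i64-load/srl/truncate/bitcast pattern:

	// Hypothetical example: multiplying a by-value complex<float> by one
	// loaded from memory. Without this combine, *rhs is fetched with a
	// single 64-bit integer load and split into two floats via
	// srl/truncate/bitcast.
	#include <complex>

	void test(std::complex<float> lhs, std::complex<float> *rhs,
	          float *realp, float *imagp) {
	  std::complex<float> r = lhs * *rhs;
	  *realp = r.real();
	  *imagp = r.imag();
	}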

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@264988 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/PowerPC/PPCISelLowering.cpp | 105 +++++++++++++++++++++++++
 test/CodeGen/PowerPC/load-two-flts.ll  |  60 ++++++++++++++
 2 files changed, 165 insertions(+)
 create mode 100644 test/CodeGen/PowerPC/load-two-flts.ll

diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index f3251ba8db7..d0f43434c39 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -10268,6 +10268,111 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
       return expandVSXLoadForLE(N, DCI);
     }
 
+    // We sometimes end up with a 64-bit integer load, from which we extract
+    // two single-precision floating-point numbers. This happens with
+    // std::complex<float>, and other similar structures, because of the way we
+    // canonicalize structure copies. However, if we lack direct moves,
+    // then the final bitcasts from the extracted integer values to the
+    // floating-point numbers turn into store/load pairs. Even with direct moves,
+    // just loading the two floating-point numbers is likely better.
+    auto ReplaceTwoFloatLoad = [&]() {
+      if (VT != MVT::i64)
+        return false;
+
+      if (LD->getExtensionType() != ISD::NON_EXTLOAD ||
+          LD->isVolatile())
+        return false;
+
+      // We're looking for a sequence like this:
+      // t13: i64,ch = load t0, t6, undef:i64
+      //     t16: i64 = srl t13, Constant:i32<32>
+      //   t17: i32 = truncate t16
+      // t18: f32 = bitcast t17
+      //   t19: i32 = truncate t13
+      // t20: f32 = bitcast t19
+
+      if (!LD->hasNUsesOfValue(2, 0))
+        return false;
+
+      auto UI = LD->use_begin();
+      while (UI.getUse().getResNo() != 0) ++UI;
+      SDNode *Trunc = *UI++;
+      while (UI.getUse().getResNo() != 0) ++UI;
+      SDNode *RightShift = *UI;
+      if (Trunc->getOpcode() != ISD::TRUNCATE)
+        std::swap(Trunc, RightShift);
+
+      if (Trunc->getOpcode() != ISD::TRUNCATE ||
+          Trunc->getValueType(0) != MVT::i32 ||
+          !Trunc->hasOneUse())
+        return false;
+      if (RightShift->getOpcode() != ISD::SRL ||
+          !isa<ConstantSDNode>(RightShift->getOperand(1)) ||
+          RightShift->getConstantOperandVal(1) != 32 ||
+          !RightShift->hasOneUse())
+        return false;
+
+      SDNode *Trunc2 = *RightShift->use_begin();
+      if (Trunc2->getOpcode() != ISD::TRUNCATE ||
+          Trunc2->getValueType(0) != MVT::i32 ||
+          !Trunc2->hasOneUse())
+        return false;
+
+      SDNode *Bitcast = *Trunc->use_begin();
+      SDNode *Bitcast2 = *Trunc2->use_begin();
+
+      if (Bitcast->getOpcode() != ISD::BITCAST ||
+          Bitcast->getValueType(0) != MVT::f32)
+        return false;
+      if (Bitcast2->getOpcode() != ISD::BITCAST ||
+          Bitcast2->getValueType(0) != MVT::f32)
+        return false;
+
+      if (Subtarget.isLittleEndian())
+        std::swap(Bitcast, Bitcast2);
+
+      // Bitcast has the second float (in memory-layout order) and Bitcast2
+      // has the first one.
+
+      SDValue BasePtr = LD->getBasePtr();
+      if (LD->isIndexed()) {
+        assert(LD->getAddressingMode() == ISD::PRE_INC &&
+               "Non-pre-inc AM on PPC?");
+        BasePtr =
+          DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+                      LD->getOffset());
+      }
+
+      SDValue FloatLoad =
+        DAG.getLoad(MVT::f32, dl, LD->getChain(), BasePtr,
+                    LD->getPointerInfo(), false, LD->isNonTemporal(),
+                    LD->isInvariant(), LD->getAlignment(), LD->getAAInfo());
+      SDValue AddPtr =
+        DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
+                    BasePtr, DAG.getIntPtrConstant(4, dl));
+      SDValue FloatLoad2 =
+        DAG.getLoad(MVT::f32, dl, SDValue(FloatLoad.getNode(), 1), AddPtr,
+                    LD->getPointerInfo().getWithOffset(4), false,
+                    LD->isNonTemporal(), LD->isInvariant(),
+                    MinAlign(LD->getAlignment(), 4), LD->getAAInfo());
+
+      if (LD->isIndexed()) {
+        // Note that DAGCombine should re-form any pre-increment load(s) from
+        // what is produced here if that makes sense.
+        DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), BasePtr);
+      }
+
+      DCI.CombineTo(Bitcast2, FloatLoad);
+      DCI.CombineTo(Bitcast, FloatLoad2);
+
+      DAG.ReplaceAllUsesOfValueWith(SDValue(LD, LD->isIndexed() ? 2 : 1),
+                                    SDValue(FloatLoad2.getNode(), 1));
+      return true;
+    };
+
+    if (ReplaceTwoFloatLoad())
+      return SDValue(N, 0);
+
     EVT MemVT = LD->getMemoryVT();
     Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
     unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty);
diff --git a/test/CodeGen/PowerPC/load-two-flts.ll b/test/CodeGen/PowerPC/load-two-flts.ll
new file mode 100644
index 00000000000..270a852b1b0
--- /dev/null
+++ b/test/CodeGen/PowerPC/load-two-flts.ll
@@ -0,0 +1,60 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-bgq-linux"
+
+define void @_Z4testSt7complexIfE(float %v0, float %v1, i64* %ref.tmp, float* %_M_value.realp.i.i, float* %_M_value.imagp.i.i) {
+entry:
+  %v2 = load i64, i64* %ref.tmp, align 8
+  %v3 = lshr i64 %v2, 32
+  %v4 = trunc i64 %v3 to i32
+  %v5 = bitcast i32 %v4 to float
+  %v6 = trunc i64 %v2 to i32
+  %v7 = bitcast i32 %v6 to float
+  %mul_ad.i.i = fmul fast float %v5, %v1
+  %mul_bc.i.i = fmul fast float %v7, %v0
+  %mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
+  %mul_ac.i.i = fmul fast float %v5, %v0
+  %mul_bd.i.i = fmul fast float %v7, %v1
+  %mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
+  store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
+  store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
+  ret void
+
+; CHECK-LABEL: @_Z4testSt7complexIfE
+; CHECK-NOT: ld {{[0-9]+}}, 0(5)
+; CHECK-NOT: stw
+; CHECK-NOT: rldicl
+; CHECK-DAG: lfs {{[0-9]+}}, 4(5)
+; CHECK-DAG: lfs {{[0-9]+}}, 0(5)
+; CHECK: blr
+}
+
+define i64* @_Z4testSt7complexIfE_idx(float %v0, float %v1, i64* %ref.tmp, float* %_M_value.realp.i.i, float* %_M_value.imagp.i.i) {
+entry:
+  %r = getelementptr i64, i64* %ref.tmp, i64 1
+  %v2 = load i64, i64* %r, align 8
+  %v3 = lshr i64 %v2, 32
+  %v4 = trunc i64 %v3 to i32
+  %v5 = bitcast i32 %v4 to float
+  %v6 = trunc i64 %v2 to i32
+  %v7 = bitcast i32 %v6 to float
+  %mul_ad.i.i = fmul fast float %v5, %v1
+  %mul_bc.i.i = fmul fast float %v7, %v0
+  %mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
+  %mul_ac.i.i = fmul fast float %v5, %v0
+  %mul_bd.i.i = fmul fast float %v7, %v1
+  %mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
+  store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
+  store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
+  ret i64* %r
+
+; CHECK-LABEL: @_Z4testSt7complexIfE
+; CHECK-NOT: ld {{[0-9]+}}, 8(5)
+; CHECK-NOT: ldu {{[0-9]+}}, 8(5)
+; CHECK-NOT: stw
+; CHECK-NOT: rldicl
+; CHECK-DAG: lfsu {{[0-9]+}}, 8(5)
+; CHECK-DAG: lfs {{[0-9]+}}, 4(5)
+; CHECK: blr
+}
+