[PowerPC] Load two floats directly instead of using one 64-bit integer load

When dealing with complex<float>, and similar structures with two
single-precision floating-point numbers, especially when such things are being
passed around by value, we'll sometimes end up loading both float values by
extracting them from one 64-bit integer load. It looks like this:

  t13: i64,ch = load<LD8[%ref.tmp]> t0, t6, undef:i64
      t16: i64 = srl t13, Constant:i32<32>
    t17: i32 = truncate t16
  t18: f32 = bitcast t17
    t19: i32 = truncate t13
  t20: f32 = bitcast t19

The problem, especially before the P8 where those bitcasts aren't legal (and
get expanded via the stack), is that it would have been better to use two
floating-point loads directly. Here we add a target-specific DAGCombine to do
just that. In short, we turn:

	ld 3, 0(5)
	stw 3, -8(1)
	rldicl 3, 3, 32, 32
	stw 3, -4(1)
	lfs 3, -4(1)
	lfs 0, -8(1)

into:

        lfs 3, 4(5)
        lfs 0, 0(5)

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@264988 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Hal Finkel 2016-03-31 02:56:05 +00:00
parent 0fb6150931
commit dc6860fc82
2 changed files with 165 additions and 0 deletions

View File

@ -10268,6 +10268,111 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
return expandVSXLoadForLE(N, DCI);
}
// We sometimes end up with a 64-bit integer load, from which we extract
// two single-precision floating-point numbers. This happens with
// std::complex<float>, and other similar structures, because of the way we
// canonicalize structure copies. However, if we lack direct moves,
// then the final bitcasts from the extracted integer values to the
// floating-point numbers turn into store/load pairs. Even with direct moves,
// just loading the two floating-point numbers is likely better.
auto ReplaceTwoFloatLoad = [&]() {
if (VT != MVT::i64)
return false;
if (LD->getExtensionType() != ISD::NON_EXTLOAD ||
LD->isVolatile())
return false;
// We're looking for a sequence like this:
// t13: i64,ch = load<LD8[%ref.tmp]> t0, t6, undef:i64
// t16: i64 = srl t13, Constant:i32<32>
// t17: i32 = truncate t16
// t18: f32 = bitcast t17
// t19: i32 = truncate t13
// t20: f32 = bitcast t19
if (!LD->hasNUsesOfValue(2, 0))
return false;
auto UI = LD->use_begin();
while (UI.getUse().getResNo() != 0) ++UI;
SDNode *Trunc = *UI++;
while (UI.getUse().getResNo() != 0) ++UI;
SDNode *RightShift = *UI;
if (Trunc->getOpcode() != ISD::TRUNCATE)
std::swap(Trunc, RightShift);
if (Trunc->getOpcode() != ISD::TRUNCATE ||
Trunc->getValueType(0) != MVT::i32 ||
!Trunc->hasOneUse())
return false;
if (RightShift->getOpcode() != ISD::SRL ||
!isa<ConstantSDNode>(RightShift->getOperand(1)) ||
RightShift->getConstantOperandVal(1) != 32 ||
!RightShift->hasOneUse())
return false;
SDNode *Trunc2 = *RightShift->use_begin();
if (Trunc2->getOpcode() != ISD::TRUNCATE ||
Trunc2->getValueType(0) != MVT::i32 ||
!Trunc2->hasOneUse())
return false;
SDNode *Bitcast = *Trunc->use_begin();
SDNode *Bitcast2 = *Trunc2->use_begin();
if (Bitcast->getOpcode() != ISD::BITCAST ||
Bitcast->getValueType(0) != MVT::f32)
return false;
if (Bitcast2->getOpcode() != ISD::BITCAST ||
Bitcast2->getValueType(0) != MVT::f32)
return false;
if (Subtarget.isLittleEndian())
std::swap(Bitcast, Bitcast2);
// Bitcast has the second float (in memory-layout order) and Bitcast2
// has the first one.
SDValue BasePtr = LD->getBasePtr();
if (LD->isIndexed()) {
assert(LD->getAddressingMode() == ISD::PRE_INC &&
"Non-pre-inc AM on PPC?");
BasePtr =
DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
LD->getOffset());
}
SDValue FloatLoad =
DAG.getLoad(MVT::f32, dl, LD->getChain(), BasePtr,
LD->getPointerInfo(), false, LD->isNonTemporal(),
LD->isInvariant(), LD->getAlignment(), LD->getAAInfo());
SDValue AddPtr =
DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
BasePtr, DAG.getIntPtrConstant(4, dl));
SDValue FloatLoad2 =
DAG.getLoad(MVT::f32, dl, SDValue(FloatLoad.getNode(), 1), AddPtr,
LD->getPointerInfo().getWithOffset(4), false,
LD->isNonTemporal(), LD->isInvariant(),
MinAlign(LD->getAlignment(), 4), LD->getAAInfo());
if (LD->isIndexed()) {
// Note that DAGCombine should re-form any pre-increment load(s) from
// what is produced here if that makes sense.
DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), BasePtr);
}
DCI.CombineTo(Bitcast2, FloatLoad);
DCI.CombineTo(Bitcast, FloatLoad2);
DAG.ReplaceAllUsesOfValueWith(SDValue(LD, LD->isIndexed() ? 2 : 1),
SDValue(FloatLoad2.getNode(), 1));
return true;
};
if (ReplaceTwoFloatLoad())
return SDValue(N, 0);
EVT MemVT = LD->getMemoryVT();
Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty);

View File

@ -0,0 +1,60 @@
; RUN: llc < %s | FileCheck %s
target datalayout = "E-m:e-i64:64-n32:64"
target triple = "powerpc64-bgq-linux"
define void @_Z4testSt7complexIfE(float %v0, float %v1, i64* %ref.tmp, float* %_M_value.realp.i.i, float* %_M_value.imagp.i.i) {
entry:
%v2 = load i64, i64* %ref.tmp, align 8
%v3 = lshr i64 %v2, 32
%v4 = trunc i64 %v3 to i32
%v5 = bitcast i32 %v4 to float
%v6 = trunc i64 %v2 to i32
%v7 = bitcast i32 %v6 to float
%mul_ad.i.i = fmul fast float %v5, %v1
%mul_bc.i.i = fmul fast float %v7, %v0
%mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
%mul_ac.i.i = fmul fast float %v5, %v0
%mul_bd.i.i = fmul fast float %v7, %v1
%mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
ret void
; CHECK-LABEL: @_Z4testSt7complexIfE
; CHECK-NOT: ld {{[0-9]+}}, 0(5)
; CHECK-NOT: stw
; CHECK-NOT: rldicl
; CHECK-DAG: lfs {{[0-9]+}}, 4(5)
; CHECK-DAG: lfs {{[0-9]+}}, 0(5)
; CHECK: blr
}
define i64* @_Z4testSt7complexIfE_idx(float %v0, float %v1, i64* %ref.tmp, float* %_M_value.realp.i.i, float* %_M_value.imagp.i.i) {
entry:
%r = getelementptr i64, i64* %ref.tmp, i64 1
%v2 = load i64, i64* %r, align 8
%v3 = lshr i64 %v2, 32
%v4 = trunc i64 %v3 to i32
%v5 = bitcast i32 %v4 to float
%v6 = trunc i64 %v2 to i32
%v7 = bitcast i32 %v6 to float
%mul_ad.i.i = fmul fast float %v5, %v1
%mul_bc.i.i = fmul fast float %v7, %v0
%mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
%mul_ac.i.i = fmul fast float %v5, %v0
%mul_bd.i.i = fmul fast float %v7, %v1
%mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
ret i64* %r
; CHECK-LABEL: @_Z4testSt7complexIfE
; CHECK-NOT: ld {{[0-9]+}}, 8(5)
; CHECK-NOT: ldu {{[0-9]+}}, 8(5)
; CHECK-NOT: stw
; CHECK-NOT: rldicl
; CHECK-DAG: lfsu {{[0-9]+}}, 8(5)
; CHECK-DAG: lfs {{[0-9]+}}, 4(5)
; CHECK: blr
}