mirror of
https://github.com/RPCSX/llvm.git
synced 2024-11-24 12:19:53 +00:00
[PowerPC] Load two floats directly instead of using one 64-bit integer load
When dealing with complex<float>, and similar structures with two single-precision floating-point numbers, especially when such things are being passed around by value, we'll sometimes end up loading both float values by extracting them from one 64-bit integer load. It looks like this:

    t13: i64,ch = load<LD8[%ref.tmp]> t0, t6, undef:i64
      t16: i64 = srl t13, Constant:i32<32>
        t17: i32 = truncate t16
          t18: f32 = bitcast t17
      t19: i32 = truncate t13
        t20: f32 = bitcast t19

The problem, especially before the P8, where those bitcasts aren't legal (and get expanded via the stack), is that it would have been better to use two floating-point loads directly. Here we add a target-specific DAGCombine to do just that. In short, we turn:

    ld 3, 0(5)
    stw 3, -8(1)
    rldicl 3, 3, 32, 32
    stw 3, -4(1)
    lfs 3, -4(1)
    lfs 0, -8(1)

into:

    lfs 3, 4(5)
    lfs 0, 0(5)

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@264988 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
0fb6150931
commit
dc6860fc82
@ -10268,6 +10268,111 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
|
||||
return expandVSXLoadForLE(N, DCI);
|
||||
}
|
||||
|
||||
// We sometimes end up with a 64-bit integer load, from which we extract
|
||||
// two single-precision floating-point numbers. This happens with
|
||||
// std::complex<float>, and other similar structures, because of the way we
|
||||
// canonicalize structure copies. However, if we lack direct moves,
|
||||
// then the final bitcasts from the extracted integer values to the
|
||||
// floating-point numbers turn into store/load pairs. Even with direct moves,
|
||||
// just loading the two floating-point numbers is likely better.
|
||||
auto ReplaceTwoFloatLoad = [&]() {
|
||||
if (VT != MVT::i64)
|
||||
return false;
|
||||
|
||||
if (LD->getExtensionType() != ISD::NON_EXTLOAD ||
|
||||
LD->isVolatile())
|
||||
return false;
|
||||
|
||||
// We're looking for a sequence like this:
|
||||
// t13: i64,ch = load<LD8[%ref.tmp]> t0, t6, undef:i64
|
||||
// t16: i64 = srl t13, Constant:i32<32>
|
||||
// t17: i32 = truncate t16
|
||||
// t18: f32 = bitcast t17
|
||||
// t19: i32 = truncate t13
|
||||
// t20: f32 = bitcast t19
|
||||
|
||||
if (!LD->hasNUsesOfValue(2, 0))
|
||||
return false;
|
||||
|
||||
auto UI = LD->use_begin();
|
||||
while (UI.getUse().getResNo() != 0) ++UI;
|
||||
SDNode *Trunc = *UI++;
|
||||
while (UI.getUse().getResNo() != 0) ++UI;
|
||||
SDNode *RightShift = *UI;
|
||||
if (Trunc->getOpcode() != ISD::TRUNCATE)
|
||||
std::swap(Trunc, RightShift);
|
||||
|
||||
if (Trunc->getOpcode() != ISD::TRUNCATE ||
|
||||
Trunc->getValueType(0) != MVT::i32 ||
|
||||
!Trunc->hasOneUse())
|
||||
return false;
|
||||
if (RightShift->getOpcode() != ISD::SRL ||
|
||||
!isa<ConstantSDNode>(RightShift->getOperand(1)) ||
|
||||
RightShift->getConstantOperandVal(1) != 32 ||
|
||||
!RightShift->hasOneUse())
|
||||
return false;
|
||||
|
||||
SDNode *Trunc2 = *RightShift->use_begin();
|
||||
if (Trunc2->getOpcode() != ISD::TRUNCATE ||
|
||||
Trunc2->getValueType(0) != MVT::i32 ||
|
||||
!Trunc2->hasOneUse())
|
||||
return false;
|
||||
|
||||
SDNode *Bitcast = *Trunc->use_begin();
|
||||
SDNode *Bitcast2 = *Trunc2->use_begin();
|
||||
|
||||
if (Bitcast->getOpcode() != ISD::BITCAST ||
|
||||
Bitcast->getValueType(0) != MVT::f32)
|
||||
return false;
|
||||
if (Bitcast2->getOpcode() != ISD::BITCAST ||
|
||||
Bitcast2->getValueType(0) != MVT::f32)
|
||||
return false;
|
||||
|
||||
if (Subtarget.isLittleEndian())
|
||||
std::swap(Bitcast, Bitcast2);
|
||||
|
||||
// Bitcast has the second float (in memory-layout order) and Bitcast2
|
||||
// has the first one.
|
||||
|
||||
SDValue BasePtr = LD->getBasePtr();
|
||||
if (LD->isIndexed()) {
|
||||
assert(LD->getAddressingMode() == ISD::PRE_INC &&
|
||||
"Non-pre-inc AM on PPC?");
|
||||
BasePtr =
|
||||
DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
|
||||
LD->getOffset());
|
||||
}
|
||||
|
||||
SDValue FloatLoad =
|
||||
DAG.getLoad(MVT::f32, dl, LD->getChain(), BasePtr,
|
||||
LD->getPointerInfo(), false, LD->isNonTemporal(),
|
||||
LD->isInvariant(), LD->getAlignment(), LD->getAAInfo());
|
||||
SDValue AddPtr =
|
||||
DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
|
||||
BasePtr, DAG.getIntPtrConstant(4, dl));
|
||||
SDValue FloatLoad2 =
|
||||
DAG.getLoad(MVT::f32, dl, SDValue(FloatLoad.getNode(), 1), AddPtr,
|
||||
LD->getPointerInfo().getWithOffset(4), false,
|
||||
LD->isNonTemporal(), LD->isInvariant(),
|
||||
MinAlign(LD->getAlignment(), 4), LD->getAAInfo());
|
||||
|
||||
if (LD->isIndexed()) {
|
||||
// Note that DAGCombine should re-form any pre-increment load(s) from
|
||||
// what is produced here if that makes sense.
|
||||
DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), BasePtr);
|
||||
}
|
||||
|
||||
DCI.CombineTo(Bitcast2, FloatLoad);
|
||||
DCI.CombineTo(Bitcast, FloatLoad2);
|
||||
|
||||
DAG.ReplaceAllUsesOfValueWith(SDValue(LD, LD->isIndexed() ? 2 : 1),
|
||||
SDValue(FloatLoad2.getNode(), 1));
|
||||
return true;
|
||||
};
|
||||
|
||||
if (ReplaceTwoFloatLoad())
|
||||
return SDValue(N, 0);
|
||||
|
||||
EVT MemVT = LD->getMemoryVT();
|
||||
Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
|
||||
unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty);
|
||||
|
60
test/CodeGen/PowerPC/load-two-flts.ll
Normal file
60
test/CodeGen/PowerPC/load-two-flts.ll
Normal file
@ -0,0 +1,60 @@
|
||||
; RUN: llc < %s | FileCheck %s
|
||||
target datalayout = "E-m:e-i64:64-n32:64"
|
||||
target triple = "powerpc64-bgq-linux"
|
||||
|
||||
; Two floats are read as one i64 from a plain (non-indexed) address and then
; split with shift/truncate/bitcast. The checks below expect two lfs loads from
; offsets 0 and 4 of r5 and no ld / stw / rldicl sequence.
define void @_Z4testSt7complexIfE(float %v0, float %v1, i64* %ref.tmp, float* %_M_value.realp.i.i, float* %_M_value.imagp.i.i) {
entry:
  ; One 64-bit integer load covering both floats.
  %packed = load i64, i64* %ref.tmp, align 8

  ; Float taken from the upper 32 bits.
  %hi.shifted = lshr i64 %packed, 32
  %hi.bits = trunc i64 %hi.shifted to i32
  %hi = bitcast i32 %hi.bits to float

  ; Float taken from the lower 32 bits.
  %lo.bits = trunc i64 %packed to i32
  %lo = bitcast i32 %lo.bits to float

  ; Use both floats so that neither half of the load is dead.
  %hi.v1 = fmul fast float %hi, %v1
  %lo.v0 = fmul fast float %lo, %v0
  %sum = fadd fast float %hi.v1, %lo.v0
  %hi.v0 = fmul fast float %hi, %v0
  %lo.v1 = fmul fast float %lo, %v1
  %diff = fsub fast float %hi.v0, %lo.v1
  store float %diff, float* %_M_value.realp.i.i, align 4
  store float %sum, float* %_M_value.imagp.i.i, align 4
  ret void

; CHECK-LABEL: @_Z4testSt7complexIfE
; CHECK-NOT: ld {{[0-9]+}}, 0(5)
; CHECK-NOT: stw
; CHECK-NOT: rldicl
; CHECK-DAG: lfs {{[0-9]+}}, 4(5)
; CHECK-DAG: lfs {{[0-9]+}}, 0(5)
; CHECK: blr
}
|
||||
|
||||
; Same pattern as @_Z4testSt7complexIfE, but the i64 is loaded from %ref.tmp + 8
; and the incremented pointer is also returned. The checks below expect an
; update-form lfsu at 8(5) plus an lfs at 4(5), with no ld / ldu / stw / rldicl.
define i64* @_Z4testSt7complexIfE_idx(float %v0, float %v1, i64* %ref.tmp, float* %_M_value.realp.i.i, float* %_M_value.imagp.i.i) {
entry:
  ; Address one i64 element (8 bytes) past %ref.tmp; also the return value.
  %r = getelementptr i64, i64* %ref.tmp, i64 1
  %v2 = load i64, i64* %r, align 8
  ; Float taken from the upper 32 bits.
  %v3 = lshr i64 %v2, 32
  %v4 = trunc i64 %v3 to i32
  %v5 = bitcast i32 %v4 to float
  ; Float taken from the lower 32 bits.
  %v6 = trunc i64 %v2 to i32
  %v7 = bitcast i32 %v6 to float
  %mul_ad.i.i = fmul fast float %v5, %v1
  %mul_bc.i.i = fmul fast float %v7, %v0
  %mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
  %mul_ac.i.i = fmul fast float %v5, %v0
  %mul_bd.i.i = fmul fast float %v7, %v1
  %mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
  store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
  store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
  ret i64* %r

; The label names this function in full; the shorter @_Z4testSt7complexIfE
; would also match the previous function's output.
; CHECK-LABEL: @_Z4testSt7complexIfE_idx
; CHECK-NOT: ld {{[0-9]+}}, 8(5)
; CHECK-NOT: ldu {{[0-9]+}}, 8(5)
; CHECK-NOT: stw
; CHECK-NOT: rldicl
; CHECK-DAG: lfsu {{[0-9]+}}, 8(5)
; CHECK-DAG: lfs {{[0-9]+}}, 4(5)
; CHECK: blr
}
|
||||
|
Loading…
Reference in New Issue
Block a user