mirror of
https://github.com/RPCS3/llvm.git
synced 2025-01-06 03:38:34 +00:00
Distribute (A + B) * C to (A * C) + (B * C) to make use of NEON multiplier
accumulator forwarding:
  vadd d3, d0, d1
  vmul d3, d3, d2
=>
  vmul d3, d0, d2
  vmla d3, d1, d2

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@128665 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
a52d7da1d8
commit
463d358f1d
@ -51,6 +51,12 @@ def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true",
|
||||
// to just not use them.
|
||||
def FeatureHasSlowFPVMLx : SubtargetFeature<"slowfpvmlx", "SlowFPVMLx", "true",
|
||||
"Disable VFP / NEON MAC instructions">;
|
||||
|
||||
// Cortex-A8 / A9 Advanced SIMD has multiplier accumulator forwarding.
|
||||
def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding",
|
||||
"HasVMLxForwarding", "true",
|
||||
"Has multiplier accumulator forwarding">;
|
||||
|
||||
// Some processors benefit from using NEON instructions for scalar
|
||||
// single-precision FP operations.
|
||||
def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP",
|
||||
@ -100,11 +106,12 @@ def ProcOthers : SubtargetFeature<"others", "ARMProcFamily", "Others",
|
||||
def ProcA8 : SubtargetFeature<"a8", "ARMProcFamily", "CortexA8",
|
||||
"Cortex-A8 ARM processors",
|
||||
[FeatureSlowFPBrcc, FeatureNEONForFP,
|
||||
FeatureHasSlowFPVMLx, FeatureT2XtPk]>;
|
||||
FeatureHasSlowFPVMLx, FeatureVMLxForwarding,
|
||||
FeatureT2XtPk]>;
|
||||
def ProcA9 : SubtargetFeature<"a9", "ARMProcFamily", "CortexA9",
|
||||
"Cortex-A9 ARM processors",
|
||||
[FeatureHasSlowFPVMLx, FeatureT2XtPk,
|
||||
FeatureFP16]>;
|
||||
[FeatureHasSlowFPVMLx, FeatureVMLxForwarding,
|
||||
FeatureT2XtPk, FeatureFP16]>;
|
||||
|
||||
class ProcNoItin<string Name, list<SubtargetFeature> Features>
|
||||
: Processor<Name, GenericItineraries, Features>;
|
||||
|
@ -5224,6 +5224,42 @@ static SDValue PerformSUBCombine(SDNode *N,
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
/// PerformVMULCombine
|
||||
/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
|
||||
/// special multiplier accumulator forwarding.
|
||||
/// vmul d3, d0, d2
|
||||
/// vmla d3, d1, d2
|
||||
/// is faster than
|
||||
/// vadd d3, d0, d1
|
||||
/// vmul d3, d3, d2
|
||||
static SDValue PerformVMULCombine(SDNode *N,
|
||||
TargetLowering::DAGCombinerInfo &DCI,
|
||||
const ARMSubtarget *Subtarget) {
|
||||
if (!Subtarget->hasVMLxForwarding())
|
||||
return SDValue();
|
||||
|
||||
SelectionDAG &DAG = DCI.DAG;
|
||||
SDValue N0 = N->getOperand(0);
|
||||
SDValue N1 = N->getOperand(1);
|
||||
unsigned Opcode = N0.getOpcode();
|
||||
if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
|
||||
Opcode != ISD::FADD && Opcode != ISD::FSUB) {
|
||||
Opcode = N0.getOpcode();
|
||||
if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
|
||||
Opcode != ISD::FADD && Opcode != ISD::FSUB)
|
||||
return SDValue();
|
||||
std::swap(N0, N1);
|
||||
}
|
||||
|
||||
EVT VT = N->getValueType(0);
|
||||
DebugLoc DL = N->getDebugLoc();
|
||||
SDValue N00 = N0->getOperand(0);
|
||||
SDValue N01 = N0->getOperand(1);
|
||||
return DAG.getNode(Opcode, DL, VT,
|
||||
DAG.getNode(ISD::MUL, DL, VT, N00, N1),
|
||||
DAG.getNode(ISD::MUL, DL, VT, N01, N1));
|
||||
}
|
||||
|
||||
static SDValue PerformMULCombine(SDNode *N,
|
||||
TargetLowering::DAGCombinerInfo &DCI,
|
||||
const ARMSubtarget *Subtarget) {
|
||||
@ -5236,6 +5272,8 @@ static SDValue PerformMULCombine(SDNode *N,
|
||||
return SDValue();
|
||||
|
||||
EVT VT = N->getValueType(0);
|
||||
if (VT.is64BitVector() || VT.is128BitVector())
|
||||
return PerformVMULCombine(N, DCI, Subtarget);
|
||||
if (VT != MVT::i32)
|
||||
return SDValue();
|
||||
|
||||
|
@ -61,6 +61,10 @@ protected:
|
||||
/// whether the FP VML[AS] instructions are slow (if so, don't use them).
|
||||
bool SlowFPVMLx;
|
||||
|
||||
/// HasVMLxForwarding - If true, NEON has special multiplier accumulator
|
||||
/// forwarding to allow mul + mla being issued back to back.
|
||||
bool HasVMLxForwarding;
|
||||
|
||||
/// SlowFPBrcc - True if floating point compare + branch is slow.
|
||||
bool SlowFPBrcc;
|
||||
|
||||
@ -182,6 +186,7 @@ protected:
|
||||
bool hasT2ExtractPack() const { return HasT2ExtractPack; }
|
||||
bool hasDataBarrier() const { return HasDataBarrier; }
|
||||
bool useFPVMLx() const { return !SlowFPVMLx; }
|
||||
bool hasVMLxForwarding() const { return HasVMLxForwarding; }
|
||||
bool isFPBrccSlow() const { return SlowFPBrcc; }
|
||||
bool isFPOnlySP() const { return FPOnlySP; }
|
||||
bool prefers32BitThumb() const { return Pref32BitThumb; }
|
||||
|
@ -1,4 +1,4 @@
|
||||
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
|
||||
; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s
|
||||
|
||||
define <8 x i8> @vmuli8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
|
||||
;CHECK: vmuli8:
|
||||
@ -466,3 +466,29 @@ entry:
|
||||
declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8*, i32) nounwind readonly
|
||||
|
||||
declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind
|
||||
|
||||
; Take advantage of the Cortex-A8 multiplier accumulator forwarding.
|
||||
|
||||
%struct.uint8x8_t = type { <8 x i8> }
|
||||
|
||||
define void @distribue2(%struct.uint8x8_t* nocapture %dst, i8* %src, i32 %mul) nounwind {
|
||||
entry:
|
||||
; CHECK: distribue2
|
||||
; CHECK-NOT: vadd.i8
|
||||
; CHECK: vmul.i8
|
||||
; CHECK: vmla.i8
|
||||
%0 = trunc i32 %mul to i8
|
||||
%1 = insertelement <8 x i8> undef, i8 %0, i32 0
|
||||
%2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
|
||||
%3 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %src, i32 1)
|
||||
%4 = bitcast <16 x i8> %3 to <2 x double>
|
||||
%5 = extractelement <2 x double> %4, i32 1
|
||||
%6 = bitcast double %5 to <8 x i8>
|
||||
%7 = extractelement <2 x double> %4, i32 0
|
||||
%8 = bitcast double %7 to <8 x i8>
|
||||
%9 = add <8 x i8> %6, %8
|
||||
%10 = mul <8 x i8> %9, %2
|
||||
%11 = getelementptr inbounds %struct.uint8x8_t* %dst, i32 0, i32 0
|
||||
store <8 x i8> %10, <8 x i8>* %11, align 8
|
||||
ret void
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user