Add an intrinsic and codegen support for fused multiply-accumulate. The intent

is to use this for architectures that have a native FMA instruction. llvm-svn: 134742
2025-04-03 16:21:41 +00:00 · 2011-07-08 21:39:21 +00:00 · 2011-07-08 21:39:21 +00:00 · c23366d357
commit c23366d357
parent 69f14d6012
20 changed files with 146 additions and 1 deletions
--- a/docs/LangRef.html
+++ b/docs/LangRef.html
@ -241,6 +241,7 @@
          <li><a href="#int_pow">'<tt>llvm.pow.*</tt>' Intrinsic</a></li>
          <li><a href="#int_exp">'<tt>llvm.exp.*</tt>' Intrinsic</a></li>
          <li><a href="#int_log">'<tt>llvm.log.*</tt>' Intrinsic</a></li>
+          <li><a href="#int_fma">'<tt>llvm.fma.*</tt>' Intrinsic</a></li>
        </ol>
      </li>
      <li><a href="#int_manip">Bit Manipulation Intrinsics</a>
@ -6570,6 +6571,37 @@ LLVM</a>.</p>
 <p>This function returns the same values as the libm <tt>log</tt> functions
   would, and handles error conditions in the same way.</p>

+<h4>
+  <a name="int_fma">'<tt>llvm.fma.*</tt>' Intrinsic</a>
+</h4>
+
+<div>
+
+<h5>Syntax:</h5>
+<p>This is an overloaded intrinsic. You can use <tt>llvm.fma</tt> on any
+   floating point or vector of floating point type. Not all targets support all
+   types however.</p>
+
+<pre>
+  declare float     @llvm.fma.f32(float  %a, float  %b, float  %c)
+  declare double    @llvm.fma.f64(double %a, double %b, double %c)
+  declare x86_fp80  @llvm.fma.f80(x86_fp80 %a, x86_fp80 %b, x86_fp80 %c)
+  declare fp128     @llvm.fma.f128(fp128 %a, fp128 %b, fp128 %c)
+  declare ppc_fp128 @llvm.fma.ppcf128(ppc_fp128 %a, ppc_fp128 %b, ppc_fp128 %c)
+</pre>
+
+<h5>Overview:</h5>
+<p>The '<tt>llvm.fma.*</tt>' intrinsics perform the fused multiply-accumulate
+   operation.</p>
+
+<h5>Arguments:</h5>
+<p>The argument and return value are floating point numbers of the same
+   type.</p>
+
+<h5>Semantics:</h5>
+<p>This function returns the same values as the libm <tt>fma</tt> functions
+   would.</p>
+
 </div>

 <!-- ======================================================================= -->
--- a/include/llvm/CodeGen/ISDOpcodes.h
+++ b/include/llvm/CodeGen/ISDOpcodes.h
@ -232,7 +232,7 @@ namespace ISD {
    SMULO, UMULO,

    // Simple binary floating point operators.
-    FADD, FSUB, FMUL, FDIV, FREM,
+    FADD, FSUB, FMUL, FMA, FDIV, FREM,

    // FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.  NOTE: This
    // DAG node does not require that X and Y have the same type, just that they
--- a/include/llvm/CodeGen/RuntimeLibcalls.h
+++ b/include/llvm/CodeGen/RuntimeLibcalls.h
@ -103,6 +103,10 @@ namespace RTLIB {
    REM_F64,
    REM_F80,
    REM_PPCF128,
+    FMA_F32,
+    FMA_F64,
+    FMA_F80,
+    FMA_PPCF128,
    POWI_F32,
    POWI_F64,
    POWI_F80,
--- a/include/llvm/Intrinsics.td
+++ b/include/llvm/Intrinsics.td
@ -255,6 +255,12 @@ let Properties = [IntrReadMem] in {
  def int_exp2 : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
 }

+let Properties = [IntrNoMem] in {
+  def int_fma  : Intrinsic<[llvm_anyfloat_ty],
+                         [LLVMMatchType<0>, LLVMMatchType<0>,
+                          LLVMMatchType<0>]>;
+}
+
 // NOTE: these are internal interfaces.
 def int_setjmp     : Intrinsic<[llvm_i32_ty],  [llvm_ptr_ty]>;
 def int_longjmp    : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty]>;
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@ -3351,6 +3351,10 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node,
    Results.push_back(ExpandFPLibCall(Node, RTLIB::REM_F32, RTLIB::REM_F64,
                                      RTLIB::REM_F80, RTLIB::REM_PPCF128));
    break;
+  case ISD::FMA:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::FMA_F32, RTLIB::FMA_F64,
+                                      RTLIB::FMA_F80, RTLIB::FMA_PPCF128));
+    break;
  case ISD::FP16_TO_FP32:
    Results.push_back(ExpandLibCall(RTLIB::FPEXT_F16_F32, Node, false));
    break;
--- a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@ -74,6 +74,7 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
    case ISD::FLOG:        R = SoftenFloatRes_FLOG(N); break;
    case ISD::FLOG2:       R = SoftenFloatRes_FLOG2(N); break;
    case ISD::FLOG10:      R = SoftenFloatRes_FLOG10(N); break;
+    case ISD::FMA:         R = SoftenFloatRes_FMA(N); break;
    case ISD::FMUL:        R = SoftenFloatRes_FMUL(N); break;
    case ISD::FNEARBYINT:  R = SoftenFloatRes_FNEARBYINT(N); break;
    case ISD::FNEG:        R = SoftenFloatRes_FNEG(N); break;
@ -294,6 +295,19 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG10(SDNode *N) {
                     NVT, &Op, 1, false, N->getDebugLoc());
 }

+SDValue DAGTypeLegalizer::SoftenFloatRes_FMA(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Ops[3] = { GetSoftenedFloat(N->getOperand(0)),
+                     GetSoftenedFloat(N->getOperand(1)),
+                     GetSoftenedFloat(N->getOperand(2)) };
+  return MakeLibCall(GetFPLibCall(N->getValueType(0),
+                                  RTLIB::FMA_F32,
+                                  RTLIB::FMA_F64,
+                                  RTLIB::FMA_F80,
+                                  RTLIB::FMA_PPCF128),
+                     NVT, Ops, 3, false, N->getDebugLoc());
+}
+
 SDValue DAGTypeLegalizer::SoftenFloatRes_FMUL(SDNode *N) {
  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
  SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
@ -837,6 +851,7 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) {
  case ISD::FLOG:       ExpandFloatRes_FLOG(N, Lo, Hi); break;
  case ISD::FLOG2:      ExpandFloatRes_FLOG2(N, Lo, Hi); break;
  case ISD::FLOG10:     ExpandFloatRes_FLOG10(N, Lo, Hi); break;
+  case ISD::FMA:        ExpandFloatRes_FMA(N, Lo, Hi); break;
  case ISD::FMUL:       ExpandFloatRes_FMUL(N, Lo, Hi); break;
  case ISD::FNEARBYINT: ExpandFloatRes_FNEARBYINT(N, Lo, Hi); break;
  case ISD::FNEG:       ExpandFloatRes_FNEG(N, Lo, Hi); break;
@ -989,6 +1004,19 @@ void DAGTypeLegalizer::ExpandFloatRes_FLOG10(SDNode *N,
  GetPairElements(Call, Lo, Hi);
 }

+void DAGTypeLegalizer::ExpandFloatRes_FMA(SDNode *N, SDValue &Lo,
+                                          SDValue &Hi) {
+  SDValue Ops[3] = { N->getOperand(0), N->getOperand(1), N->getOperand(2) };
+  SDValue Call = MakeLibCall(GetFPLibCall(N->getValueType(0),
+                                          RTLIB::FMA_F32,
+                                          RTLIB::FMA_F64,
+                                          RTLIB::FMA_F80,
+                                          RTLIB::FMA_PPCF128),
+                             N->getValueType(0), Ops, 3, false,
+                             N->getDebugLoc());
+  GetPairElements(Call, Lo, Hi);
+}
+
 void DAGTypeLegalizer::ExpandFloatRes_FMUL(SDNode *N, SDValue &Lo,
                                           SDValue &Hi) {
  SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@ -378,6 +378,7 @@ private:
  SDValue SoftenFloatRes_FLOG(SDNode *N);
  SDValue SoftenFloatRes_FLOG2(SDNode *N);
  SDValue SoftenFloatRes_FLOG10(SDNode *N);
+  SDValue SoftenFloatRes_FMA(SDNode *N);
  SDValue SoftenFloatRes_FMUL(SDNode *N);
  SDValue SoftenFloatRes_FNEARBYINT(SDNode *N);
  SDValue SoftenFloatRes_FNEG(SDNode *N);
@ -442,6 +443,7 @@ private:
  void ExpandFloatRes_FLOG      (SDNode *N, SDValue &Lo, SDValue &Hi);
  void ExpandFloatRes_FLOG2     (SDNode *N, SDValue &Lo, SDValue &Hi);
  void ExpandFloatRes_FLOG10    (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FMA       (SDNode *N, SDValue &Lo, SDValue &Hi);
  void ExpandFloatRes_FMUL      (SDNode *N, SDValue &Lo, SDValue &Hi);
  void ExpandFloatRes_FNEARBYINT(SDNode *N, SDValue &Lo, SDValue &Hi);
  void ExpandFloatRes_FNEG      (SDNode *N, SDValue &Lo, SDValue &Hi);
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@ -5878,6 +5878,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
  case ISD::FSUB:   return "fsub";
  case ISD::FMUL:   return "fmul";
  case ISD::FDIV:   return "fdiv";
+  case ISD::FMA:    return "fma";
  case ISD::FREM:   return "frem";
  case ISD::FCOPYSIGN: return "fcopysign";
  case ISD::FGETSIGN:  return "fgetsign";
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@ -4651,6 +4651,13 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
  case Intrinsic::pow:
    visitPow(I);
    return 0;
+  case Intrinsic::fma:
+    setValue(&I, DAG.getNode(ISD::FMA, dl,
+                             getValue(I.getArgOperand(0)).getValueType(),
+                             getValue(I.getArgOperand(0)),
+                             getValue(I.getArgOperand(1)),
+                             getValue(I.getArgOperand(2))));
+    return 0;
  case Intrinsic::convert_to_fp16:
    setValue(&I, DAG.getNode(ISD::FP32_TO_FP16, dl,
                             MVT::i16, getValue(I.getArgOperand(0))));
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@ -139,6 +139,10 @@ static void InitLibcallNames(const char **Names) {
  Names[RTLIB::REM_F64] = "fmod";
  Names[RTLIB::REM_F80] = "fmodl";
  Names[RTLIB::REM_PPCF128] = "fmodl";
+  Names[RTLIB::FMA_F32] = "fmaf";
+  Names[RTLIB::FMA_F64] = "fma";
+  Names[RTLIB::FMA_F80] = "fmal";
+  Names[RTLIB::FMA_PPCF128] = "fmal";
  Names[RTLIB::POWI_F32] = "__powisf2";
  Names[RTLIB::POWI_F64] = "__powidf2";
  Names[RTLIB::POWI_F80] = "__powixf2";
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@ -708,6 +708,9 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
  setOperationAction(ISD::FPOW,      MVT::f64, Expand);
  setOperationAction(ISD::FPOW,      MVT::f32, Expand);

+  setOperationAction(ISD::FMA, MVT::f64, Expand);
+  setOperationAction(ISD::FMA, MVT::f32, Expand);
+
  // Various VFP goodness
  if (!UseSoftFloat && !Subtarget->isThumb1Only()) {
    // int <-> fp are custom expanded into bit_convert + ARMISD ops.
--- a/lib/Target/Alpha/AlphaISelLowering.cpp
+++ b/lib/Target/Alpha/AlphaISelLowering.cpp
@ -122,6 +122,9 @@ AlphaTargetLowering::AlphaTargetLowering(TargetMachine &TM)
  setOperationAction(ISD::FPOW , MVT::f32, Expand);
  setOperationAction(ISD::FPOW , MVT::f64, Expand);

+  setOperationAction(ISD::FMA, MVT::f64, Expand);
+  setOperationAction(ISD::FMA, MVT::f32, Expand);
+
  setOperationAction(ISD::SETCC, MVT::f32, Promote);

  setOperationAction(ISD::BITCAST, MVT::f32, Promote);
--- a/lib/Target/CellSPU/SPUISelLowering.cpp
+++ b/lib/Target/CellSPU/SPUISelLowering.cpp
@ -221,6 +221,9 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
  setOperationAction(ISD::FSQRT, MVT::f64, Expand);
  setOperationAction(ISD::FSQRT, MVT::f32, Expand);

+  setOperationAction(ISD::FMA, MVT::f64, Expand);
+  setOperationAction(ISD::FMA, MVT::f32, Expand);
+
  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

--- a/lib/Target/MBlaze/MBlazeISelLowering.cpp
+++ b/lib/Target/MBlaze/MBlazeISelLowering.cpp
@ -69,6 +69,7 @@ MBlazeTargetLowering::MBlazeTargetLowering(MBlazeTargetMachine &TM)

  // Floating point operations which are not supported
  setOperationAction(ISD::FREM,       MVT::f32, Expand);
+  setOperationAction(ISD::FMA,        MVT::f32, Expand);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8,  Expand);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Expand);
  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@ -146,6 +146,8 @@ MipsTargetLowering(MipsTargetMachine &TM)
  setOperationAction(ISD::FLOG2,             MVT::f32,   Expand);
  setOperationAction(ISD::FLOG10,            MVT::f32,   Expand);
  setOperationAction(ISD::FEXP,              MVT::f32,   Expand);
+  setOperationAction(ISD::FMA,               MVT::f32,   Expand);
+  setOperationAction(ISD::FMA,               MVT::f64,   Expand);

  setOperationAction(ISD::EXCEPTIONADDR,     MVT::i32, Expand);
  setOperationAction(ISD::EHSELECTION,       MVT::i32, Expand);
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@ -125,10 +125,12 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
  setOperationAction(ISD::FCOS , MVT::f64, Expand);
  setOperationAction(ISD::FREM , MVT::f64, Expand);
  setOperationAction(ISD::FPOW , MVT::f64, Expand);
+  setOperationAction(ISD::FMA  , MVT::f64, Expand);
  setOperationAction(ISD::FSIN , MVT::f32, Expand);
  setOperationAction(ISD::FCOS , MVT::f32, Expand);
  setOperationAction(ISD::FREM , MVT::f32, Expand);
  setOperationAction(ISD::FPOW , MVT::f32, Expand);
+  setOperationAction(ISD::FMA  , MVT::f32, Expand);

  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

--- a/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/lib/Target/Sparc/SparcISelLowering.cpp
@ -754,9 +754,11 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
  setOperationAction(ISD::FSIN , MVT::f64, Expand);
  setOperationAction(ISD::FCOS , MVT::f64, Expand);
  setOperationAction(ISD::FREM , MVT::f64, Expand);
+  setOperationAction(ISD::FMA  , MVT::f64, Expand);
  setOperationAction(ISD::FSIN , MVT::f32, Expand);
  setOperationAction(ISD::FCOS , MVT::f32, Expand);
  setOperationAction(ISD::FREM , MVT::f32, Expand);
+  setOperationAction(ISD::FMA  , MVT::f32, Expand);
  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ , MVT::i32, Expand);
  setOperationAction(ISD::CTLZ , MVT::i32, Expand);
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@ -142,6 +142,8 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm) :
  setOperationAction(ISD::FCOS,             MVT::f64, Expand);
  setOperationAction(ISD::FREM,             MVT::f32, Expand);
  setOperationAction(ISD::FREM,             MVT::f64, Expand);
+  setOperationAction(ISD::FMA,              MVT::f32, Expand);
+  setOperationAction(ISD::FMA,              MVT::f64, Expand);

  // We have only 64-bit bitconverts
  setOperationAction(ISD::BITCAST,          MVT::f32, Expand);
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@ -646,6 +646,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

+  // We don't support FMA.
+  setOperationAction(ISD::FMA, MVT::f64, Expand);
+  setOperationAction(ISD::FMA, MVT::f32, Expand);
+
  // Long double always uses X87.
  if (!UseSoftFloat) {
    addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
@ -670,6 +674,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      setOperationAction(ISD::FSIN           , MVT::f80  , Expand);
      setOperationAction(ISD::FCOS           , MVT::f80  , Expand);
    }
+
+    setOperationAction(ISD::FMA, MVT::f80, Expand);
  }

  // Always use a library call for pow.
--- a/test/CodeGen/X86/fma.ll
+++ b/test/CodeGen/X86/fma.ll
@ -0,0 +1,33 @@
+; RUN: llc < %s -mtriple=i386-apple-darwin10 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s
+
+; CHECK: test_f32
+; CHECK: _fmaf
+
+define float @test_f32(float %a, float %b, float %c) nounwind readnone ssp {
+entry:
+  %call = tail call float @llvm.fma.f32(float %a, float %b, float %c) nounwind readnone
+  ret float %call
+}
+
+; CHECK: test_f64
+; CHECK: _fma
+
+define double @test_f64(double %a, double %b, double %c) nounwind readnone ssp {
+entry:
+  %call = tail call double @llvm.fma.f64(double %a, double %b, double %c) nounwind readnone
+  ret double %call
+}
+
+; CHECK: test_f80
+; CHECK: _fmal
+
+define x86_fp80 @test_f80(x86_fp80 %a, x86_fp80 %b, x86_fp80 %c) nounwind readnone ssp {
+entry:
+  %call = tail call x86_fp80 @llvm.fma.f80(x86_fp80 %a, x86_fp80 %b, x86_fp80 %c) nounwind readnone
+  ret x86_fp80 %call
+}
+
+declare float @llvm.fma.f32(float, float, float) nounwind readnone
+declare double @llvm.fma.f64(double, double, double) nounwind readnone
+declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) nounwind readnone