Rein in overzealous InstCombine of fptrunc(OP(fpextend, fpextend)).
llvm-svn: 195934
parent 3f4a857bd5
commit d8aaca93a6
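The problem, in one example: the old rule narrowed fptrunc(op (fpext x), (fpext y)) whenever both sources were no wider than the destination, which silently double-rounds operations carried out in a wider intermediate format. Below is a minimal sketch of the pattern this commit stops narrowing; it mirrors test1 of the new test file, and the function name is illustrative only:

; The fadd is performed in x86_fp80 (64-bit significand) and the result
; truncated to double (53-bit). Narrowing the fadd to double can
; double-round, since 64 < 2*53+1 = 107; the old InstCombine rule fired
; here anyway, the new one refuses.
define double @overzealous(double %a, double %b) nounwind {
  %wa = fpext double %a to x86_fp80
  %wb = fpext double %b to x86_fp80
  %ws = fadd x86_fp80 %wa, %wb
  %s = fptrunc x86_fp80 %ws to double
  ret double %s
}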
lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1189,36 +1189,92 @@ static Value *LookThroughFPExtensions(Value *V) {
 Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
   if (Instruction *I = commonCastTransforms(CI))
     return I;

-  // If we have fptrunc(fadd (fpextend x), (fpextend y)), where x and y are
-  // smaller than the destination type, we can eliminate the truncate by doing
-  // the add as the smaller type.  This applies to fadd/fsub/fmul/fdiv as well
-  // as many builtins (sqrt, etc).
+  // If we have fptrunc(OpI (fpextend x), (fpextend y)), we would like to
+  // simplify this expression to avoid one or more of the trunc/extend
+  // operations if we can do so without changing the numerical results.
+  //
+  // The exact manner in which the widths of the operands interact to limit
+  // what we can and cannot do safely varies from operation to operation, and
+  // is explained below in the various case statements.
   BinaryOperator *OpI = dyn_cast<BinaryOperator>(CI.getOperand(0));
   if (OpI && OpI->hasOneUse()) {
+    Value *LHSOrig = LookThroughFPExtensions(OpI->getOperand(0));
+    Value *RHSOrig = LookThroughFPExtensions(OpI->getOperand(1));
+    unsigned OpWidth = OpI->getType()->getFPMantissaWidth();
+    unsigned LHSWidth = LHSOrig->getType()->getFPMantissaWidth();
+    unsigned RHSWidth = RHSOrig->getType()->getFPMantissaWidth();
+    unsigned SrcWidth = std::max(LHSWidth, RHSWidth);
+    unsigned DstWidth = CI.getType()->getFPMantissaWidth();
     switch (OpI->getOpcode()) {
-    default: break;
-    case Instruction::FAdd:
-    case Instruction::FSub:
-    case Instruction::FMul:
-    case Instruction::FDiv:
-    case Instruction::FRem:
-      Type *SrcTy = OpI->getType();
-      Value *LHSTrunc = LookThroughFPExtensions(OpI->getOperand(0));
-      Value *RHSTrunc = LookThroughFPExtensions(OpI->getOperand(1));
-      if (LHSTrunc->getType() != SrcTy &&
-          RHSTrunc->getType() != SrcTy) {
-        unsigned DstSize = CI.getType()->getScalarSizeInBits();
-        // If the source types were both smaller than the destination type of
-        // the cast, do this xform.
-        if (LHSTrunc->getType()->getScalarSizeInBits() <= DstSize &&
-            RHSTrunc->getType()->getScalarSizeInBits() <= DstSize) {
-          LHSTrunc = Builder->CreateFPExt(LHSTrunc, CI.getType());
-          RHSTrunc = Builder->CreateFPExt(RHSTrunc, CI.getType());
-          return BinaryOperator::Create(OpI->getOpcode(), LHSTrunc, RHSTrunc);
+      default: break;
+      case Instruction::FAdd:
+      case Instruction::FSub:
+        // For addition and subtraction, the infinitely precise result can
+        // essentially be arbitrarily wide; proving that double rounding
+        // will not occur because the result of OpI is exact (as we will for
+        // FMul, for example) is hopeless.  However, we *can* nonetheless
+        // frequently know that double rounding cannot occur (or that it is
+        // innocuous) by taking advantage of the specific structure of
+        // infinitely-precise results that admit double rounding.
+        //
+        // Specifically, if OpWidth >= 2*DstWidth+1 and DstWidth is
+        // sufficient to represent both sources, we can guarantee that the
+        // double rounding is innocuous (See p50 of Figueroa's 2000 PhD
+        // thesis, "A Rigorous Framework for Fully Supporting the IEEE
+        // Standard ..." for proof of this fact).
+        //
+        // Note: Figueroa does not consider the case where DstFormat !=
+        // SrcFormat.  It's possible (likely even!) that this analysis
+        // could be tightened for those cases, but they are rare (the main
+        // case of interest here is (float)((double)float + float)).
+        if (OpWidth >= 2*DstWidth+1 && DstWidth >= SrcWidth) {
+          if (LHSOrig->getType() != CI.getType())
+            LHSOrig = Builder->CreateFPExt(LHSOrig, CI.getType());
+          if (RHSOrig->getType() != CI.getType())
+            RHSOrig = Builder->CreateFPExt(RHSOrig, CI.getType());
+          return BinaryOperator::Create(OpI->getOpcode(), LHSOrig, RHSOrig);
         }
-      }
-      break;
+        break;
+      case Instruction::FMul:
+        // For multiplication, the infinitely precise result has at most
+        // LHSWidth + RHSWidth significant bits; if OpWidth is sufficient
+        // that such a value can be exactly represented, then no double
+        // rounding can possibly occur; we can safely perform the operation
+        // in the destination format if it can represent both sources.
+        if (OpWidth >= LHSWidth + RHSWidth && DstWidth >= SrcWidth) {
+          if (LHSOrig->getType() != CI.getType())
+            LHSOrig = Builder->CreateFPExt(LHSOrig, CI.getType());
+          if (RHSOrig->getType() != CI.getType())
+            RHSOrig = Builder->CreateFPExt(RHSOrig, CI.getType());
+          return BinaryOperator::CreateFMul(LHSOrig, RHSOrig);
+        }
+        break;
+      case Instruction::FDiv:
+        // For division, we again use the bound from Figueroa's
+        // dissertation.  I am entirely certain that this bound can be
+        // tightened in the unbalanced operand case by an analysis based on
+        // the diophantine rational approximation bound, but the well-known
+        // condition used here is a good conservative first pass.
+        // TODO: Tighten bound via rigorous analysis of the unbalanced case.
+        if (OpWidth >= 2*DstWidth && DstWidth >= SrcWidth) {
+          if (LHSOrig->getType() != CI.getType())
+            LHSOrig = Builder->CreateFPExt(LHSOrig, CI.getType());
+          if (RHSOrig->getType() != CI.getType())
+            RHSOrig = Builder->CreateFPExt(RHSOrig, CI.getType());
+          return BinaryOperator::CreateFDiv(LHSOrig, RHSOrig);
+        }
+        break;
+      case Instruction::FRem:
+        // Remainder is straightforward.  Remainder is always exact, so the
+        // type of OpI doesn't enter into things at all.  We simply evaluate
+        // in whichever source type is larger, then convert to the
+        // destination type.
+        if (LHSWidth < SrcWidth)
+          LHSOrig = Builder->CreateFPExt(LHSOrig, RHSOrig->getType());
+        else if (RHSWidth <= SrcWidth)
+          RHSOrig = Builder->CreateFPExt(RHSOrig, LHSOrig->getType());
+        Value *ExactResult = Builder->CreateFRem(LHSOrig, RHSOrig);
+        return CastInst::CreateFPCast(ExactResult, CI.getType());
     }
   }

   // (fptrunc (fneg x)) -> (fneg (fptrunc x))
test/Transforms/InstCombine/fpextend.ll
@@ -1,3 +1,4 @@
 ; RUN: opt < %s -instcombine -S | not grep fpext
 @X = external global float
 @Y = external global float
@@ -12,6 +13,18 @@ entry:
   ret void
 }

+define void @test2() nounwind {
+entry:
+  %tmp = load float* @X, align 4 ; <float> [#uses=1]
+  %tmp1 = fpext float %tmp to double ; <double> [#uses=1]
+  %tmp2 = load float* @Y, align 4 ; <float> [#uses=1]
+  %tmp23 = fpext float %tmp2 to double ; <double> [#uses=1]
+  %tmp5 = fmul double %tmp1, %tmp23 ; <double> [#uses=1]
+  %tmp56 = fptrunc double %tmp5 to float ; <float> [#uses=1]
+  store float %tmp56, float* @X, align 4
+  ret void
+}
+
 define void @test3() nounwind {
 entry:
   %tmp = load float* @X, align 4 ; <float> [#uses=1]
@@ -33,4 +46,3 @@ entry:
   store float %tmp34, float* @X, align 4
   ret void
 }
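For reference, under the new FMul bound (53 >= 24 + 24) the multiply in test2 collapses into single precision, which is why the RUN line can assert that no fpext survives. A sketch of the expected post-instcombine shape follows; it is an illustration rather than output copied from the commit, and %mul is a hypothetical name:

define void @test2() nounwind {
entry:
  %tmp = load float* @X, align 4
  %tmp2 = load float* @Y, align 4
  %mul = fmul float %tmp, %tmp2
  store float %mul, float* @X, align 4
  ret void
}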
57 test/Transforms/InstCombine/fpextend_x86.ll Normal file
@@ -0,0 +1,57 @@
+; RUN: opt < %s -instcombine -mtriple=x86_64-apple-macosx -S | FileCheck %s
+target triple = "x86_64-apple-macosx"
+
+define double @test1(double %a, double %b) nounwind {
+  %wa = fpext double %a to x86_fp80
+  %wb = fpext double %b to x86_fp80
+  %wr = fadd x86_fp80 %wa, %wb
+  %r = fptrunc x86_fp80 %wr to double
+  ret double %r
+; CHECK: test1
+; CHECK: fadd x86_fp80
+; CHECK: ret
+}
+
+define double @test2(double %a, double %b) nounwind {
+  %wa = fpext double %a to x86_fp80
+  %wb = fpext double %b to x86_fp80
+  %wr = fsub x86_fp80 %wa, %wb
+  %r = fptrunc x86_fp80 %wr to double
+  ret double %r
+; CHECK: test2
+; CHECK: fsub x86_fp80
+; CHECK: ret
+}
+
+define double @test3(double %a, double %b) nounwind {
+  %wa = fpext double %a to x86_fp80
+  %wb = fpext double %b to x86_fp80
+  %wr = fmul x86_fp80 %wa, %wb
+  %r = fptrunc x86_fp80 %wr to double
+  ret double %r
+; CHECK: test3
+; CHECK: fmul x86_fp80
+; CHECK: ret
+}
+
+define double @test4(double %a, half %b) nounwind {
+  %wa = fpext double %a to x86_fp80
+  %wb = fpext half %b to x86_fp80
+  %wr = fmul x86_fp80 %wa, %wb
+  %r = fptrunc x86_fp80 %wr to double
+  ret double %r
+; CHECK: test4
+; CHECK: fmul double
+; CHECK: ret
+}
+
+define double @test5(double %a, double %b) nounwind {
+  %wa = fpext double %a to x86_fp80
+  %wb = fpext double %b to x86_fp80
+  %wr = fdiv x86_fp80 %wa, %wb
+  %r = fptrunc x86_fp80 %wr to double
+  ret double %r
+; CHECK: test5
+; CHECK: fdiv x86_fp80
+; CHECK: ret
+}
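test4 is the interesting boundary case: OpWidth 64 >= LHSWidth + RHSWidth = 53 + 11 = 64 exactly, and DstWidth 53 >= SrcWidth 53, so the multiply narrows to double with the half operand extended only to the destination type (test5 stays in x86_fp80 because 64 < 2*53 = 106). A sketch of the expected shape of test4 after the transform, assumed rather than taken from the commit:

define double @test4_expected(double %a, half %b) nounwind {
  ; The x86_fp80 round trip is gone; only the half operand still needs
  ; an extension, now directly to the destination type.
  %wb = fpext half %b to double
  %r = fmul double %a, %wb
  ret double %r
}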