diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp index a6bf4a86a9a..bc1ecd2ea43 100644 --- a/lib/Analysis/InlineCost.cpp +++ b/lib/Analysis/InlineCost.cpp @@ -797,9 +797,33 @@ bool CallAnalyzer::analyzeCall(CallSite CS) { FiftyPercentVectorBonus = Threshold; TenPercentVectorBonus = Threshold / 2; - // Subtract off one instruction per call argument as those will be free after - // inlining. - Cost -= CS.arg_size() * InlineConstants::InstrCost; + // Give out bonuses per argument, as the instructions setting them up will + // be gone after inlining. + for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) { + if (TD && CS.isByValArgument(I)) { + // We approximate the number of loads and stores needed by dividing the + // size of the byval type by the target's pointer size. + PointerType *PTy = cast(CS.getArgument(I)->getType()); + unsigned TypeSize = TD->getTypeSizeInBits(PTy->getElementType()); + unsigned PointerSize = TD->getPointerSizeInBits(); + // Ceiling division. + unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize; + + // If it generates more than 8 stores it is likely to be expanded as an + // inline memcpy so we take that as an upper bound. Otherwise we assume + // one load and one store per word copied. + // FIXME: The maxStoresPerMemcpy setting from the target should be used + // here instead of a magic number of 8, but it's not available via + // TargetData. + NumStores = std::min(NumStores, 8U); + + Cost -= 2 * NumStores * InlineConstants::InstrCost; + } else { + // For non-byval arguments subtract off one instruction per call + // argument. + Cost -= InlineConstants::InstrCost; + } + } // If there is only one call of the function, and it has internal linkage, // the cost of inlining it drops dramatically. diff --git a/test/Transforms/Inline/inline-byval-bonus.ll b/test/Transforms/Inline/inline-byval-bonus.ll new file mode 100644 index 00000000000..f3ed819a7f3 --- /dev/null +++ b/test/Transforms/Inline/inline-byval-bonus.ll @@ -0,0 +1,193 @@ +; RUN: opt -S -inline -inline-threshold=275 < %s | FileCheck %s +; PR13095 + +; The performance of the c-ray benchmark largely depends on the inlining of a +; specific call to @ray_sphere. This test case is designed to verify that it's +; inlined at -O3. + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +%struct.sphere = type { %struct.vec3, double, %struct.material, %struct.sphere* } +%struct.vec3 = type { double, double, double } +%struct.material = type { %struct.vec3, double, double } +%struct.ray = type { %struct.vec3, %struct.vec3 } +%struct.spoint = type { %struct.vec3, %struct.vec3, %struct.vec3, double } + +define i32 @caller(%struct.sphere* %i) { + %shadow_ray = alloca %struct.ray, align 8 + call void @fix(%struct.ray* %shadow_ray) + + %call = call i32 @ray_sphere(%struct.sphere* %i, %struct.ray* byval align 8 %shadow_ray, %struct.spoint* null) + ret i32 %call + +; CHECK: @caller +; CHECK-NOT: call i32 @ray_sphere +; CHECK: ret i32 +} + +declare void @fix(%struct.ray*) + +define i32 @ray_sphere(%struct.sphere* nocapture %sph, %struct.ray* nocapture byval align 8 %ray, %struct.spoint* %sp) nounwind uwtable ssp { + %1 = getelementptr inbounds %struct.ray* %ray, i64 0, i32 1, i32 0 + %2 = load double* %1, align 8 + %3 = fmul double %2, %2 + %4 = getelementptr inbounds %struct.ray* %ray, i64 0, i32 1, i32 1 + %5 = load double* %4, align 8 + %6 = fmul double %5, %5 + %7 = fadd double %3, %6 + %8 = getelementptr inbounds %struct.ray* %ray, i64 0, i32 1, i32 2 + %9 = load double* %8, align 8 + %10 = fmul double %9, %9 + %11 = fadd double %7, %10 + %12 = fmul double %2, 2.000000e+00 + %13 = getelementptr inbounds %struct.ray* %ray, i64 0, i32 0, i32 0 + %14 = load double* %13, align 8 + %15 = getelementptr inbounds %struct.sphere* %sph, i64 0, i32 0, i32 0 + %16 = load double* %15, align 8 + %17 = fsub double %14, %16 + %18 = fmul double %12, %17 + %19 = fmul double %5, 2.000000e+00 + %20 = getelementptr inbounds %struct.ray* %ray, i64 0, i32 0, i32 1 + %21 = load double* %20, align 8 + %22 = getelementptr inbounds %struct.sphere* %sph, i64 0, i32 0, i32 1 + %23 = load double* %22, align 8 + %24 = fsub double %21, %23 + %25 = fmul double %19, %24 + %26 = fadd double %18, %25 + %27 = fmul double %9, 2.000000e+00 + %28 = getelementptr inbounds %struct.ray* %ray, i64 0, i32 0, i32 2 + %29 = load double* %28, align 8 + %30 = getelementptr inbounds %struct.sphere* %sph, i64 0, i32 0, i32 2 + %31 = load double* %30, align 8 + %32 = fsub double %29, %31 + %33 = fmul double %27, %32 + %34 = fadd double %26, %33 + %35 = fmul double %16, %16 + %36 = fmul double %23, %23 + %37 = fadd double %35, %36 + %38 = fmul double %31, %31 + %39 = fadd double %37, %38 + %40 = fmul double %14, %14 + %41 = fadd double %40, %39 + %42 = fmul double %21, %21 + %43 = fadd double %42, %41 + %44 = fmul double %29, %29 + %45 = fadd double %44, %43 + %46 = fsub double -0.000000e+00, %16 + %47 = fmul double %14, %46 + %48 = fmul double %21, %23 + %49 = fsub double %47, %48 + %50 = fmul double %29, %31 + %51 = fsub double %49, %50 + %52 = fmul double %51, 2.000000e+00 + %53 = fadd double %52, %45 + %54 = getelementptr inbounds %struct.sphere* %sph, i64 0, i32 1 + %55 = load double* %54, align 8 + %56 = fmul double %55, %55 + %57 = fsub double %53, %56 + %58 = fmul double %34, %34 + %59 = fmul double %11, 4.000000e+00 + %60 = fmul double %59, %57 + %61 = fsub double %58, %60 + %62 = fcmp olt double %61, 0.000000e+00 + br i1 %62, label %130, label %63 + +;