[REVERT][LV][X86] update the cost of interleaving mem. access of floats

reverted my changes will be committed later after fixing the failure This patch contains update of the costs of interleaved loads of v8f32 of stride 3 and 8. Differential Revision: https://reviews.llvm.org/D39403 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317433 91177308-0d34-0410-b5e6-96231b3b80d8
2025-03-02 15:45:44 +00:00 · 2017-11-05 09:36:54 +00:00 · 2017-11-05 09:36:54 +00:00 · 08eb02abdb
commit 08eb02abdb
parent 1511d2e7fe
2 changed files with 1 additions and 145 deletions
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@ -2644,15 +2644,12 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
    { 3, MVT::v8i8,  9 },  //(load 24i8 and) deinterleave into 3 x 8i8
    { 3, MVT::v16i8, 11},  //(load 48i8 and) deinterleave into 3 x 16i8
    { 3, MVT::v32i8, 13},  //(load 96i8 and) deinterleave into 3 x 32i8
-    { 3, MVT::v8f32, 17 }, //(load 24f32 and)deinterleave into 3 x 8f32

    { 4, MVT::v2i8,  12 }, //(load 8i8 and)   deinterleave into 4 x 2i8
    { 4, MVT::v4i8,  4 },  //(load 16i8 and)  deinterleave into 4 x 4i8
    { 4, MVT::v8i8,  20 }, //(load 32i8 and)  deinterleave into 4 x 8i8
    { 4, MVT::v16i8, 39 }, //(load 64i8 and)  deinterleave into 4 x 16i8
-    { 4, MVT::v32i8, 80 }, //(load 128i8 and) deinterleave into 4 x 32i8
-
-    { 8, MVT::v8f32, 40 }  //(load 64f32 and)deinterleave into 8 x 8f32
+    { 4, MVT::v32i8, 80 }  //(load 128i8 and) deinterleave into 4 x 32i8
  };

  static const CostTblEntry AVX2InterleavedStoreTbl[] = {
--- a/test/Analysis/CostModel/interleaved-load-float.ll
+++ b/test/Analysis/CostModel/interleaved-load-float.ll
@ -1,141 +0,0 @@
-; REQUIRES: asserts
-; RUN: opt -S -loop-vectorize -debug-only=loop-vectorize -mcpu=skylake %s 2>&1 | FileCheck %s
-target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
-target triple = "i386-unknown-linux-gnu"
-
-@src = common local_unnamed_addr global [120 x float] zeroinitializer, align 4
-@dst = common local_unnamed_addr global [120 x float] zeroinitializer, align 4
-
-; Function Attrs: norecurse nounwind
-define void @stride8(float %k, i32 %width_) {
-entry:
-
-; CHECK: Found an estimated cost of 48 for VF 8 For instruction:   %0 = load float
-
-  %cmp72 = icmp sgt i32 %width_, 0
-  br i1 %cmp72, label %for.body.lr.ph, label %for.cond.cleanup
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.cond.cleanup.loopexit:                        ; preds = %for.body
-  br label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  ret void
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.073 = phi i32 [ 0, %for.body.lr.ph ], [ %add46, %for.body ]
-  %arrayidx = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %i.073
-  %0 = load float, float* %arrayidx, align 4
-  %mul = fmul fast float %0, %k
-  %arrayidx2 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %i.073
-  %1 = load float, float* %arrayidx2, align 4
-  %add3 = fadd fast float %1, %mul
-  store float %add3, float* %arrayidx2, align 4
-  %add4 = or i32 %i.073, 1
-  %arrayidx5 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add4
-  %2 = load float, float* %arrayidx5, align 4
-  %mul6 = fmul fast float %2, %k
-  %arrayidx8 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add4
-  %3 = load float, float* %arrayidx8, align 4
-  %add9 = fadd fast float %3, %mul6
-  store float %add9, float* %arrayidx8, align 4
-  %add10 = or i32 %i.073, 2
-  %arrayidx11 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add10
-  %4 = load float, float* %arrayidx11, align 4
-  %mul12 = fmul fast float %4, %k
-  %arrayidx14 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add10
-  %5 = load float, float* %arrayidx14, align 4
-  %add15 = fadd fast float %5, %mul12
-  store float %add15, float* %arrayidx14, align 4
-  %add16 = or i32 %i.073, 3
-  %arrayidx17 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add16
-  %6 = load float, float* %arrayidx17, align 4
-  %mul18 = fmul fast float %6, %k
-  %arrayidx20 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add16
-  %7 = load float, float* %arrayidx20, align 4
-  %add21 = fadd fast float %7, %mul18
-  store float %add21, float* %arrayidx20, align 4
-  %add22 = or i32 %i.073, 4
-  %arrayidx23 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add22
-  %8 = load float, float* %arrayidx23, align 4
-  %mul24 = fmul fast float %8, %k
-  %arrayidx26 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add22
-  %9 = load float, float* %arrayidx26, align 4
-  %add27 = fadd fast float %9, %mul24
-  store float %add27, float* %arrayidx26, align 4
-  %add28 = or i32 %i.073, 5
-  %arrayidx29 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add28
-  %10 = load float, float* %arrayidx29, align 4
-  %mul30 = fmul fast float %10, %k
-  %arrayidx32 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add28
-  %11 = load float, float* %arrayidx32, align 4
-  %add33 = fadd fast float %11, %mul30
-  store float %add33, float* %arrayidx32, align 4
-  %add34 = or i32 %i.073, 6
-  %arrayidx35 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add34
-  %12 = load float, float* %arrayidx35, align 4
-  %mul36 = fmul fast float %12, %k
-  %arrayidx38 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add34
-  %13 = load float, float* %arrayidx38, align 4
-  %add39 = fadd fast float %13, %mul36
-  store float %add39, float* %arrayidx38, align 4
-  %add40 = or i32 %i.073, 7
-  %arrayidx41 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add40
-  %14 = load float, float* %arrayidx41, align 4
-  %mul42 = fmul fast float %14, %k
-  %arrayidx44 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add40
-  %15 = load float, float* %arrayidx44, align 4
-  %add45 = fadd fast float %15, %mul42
-  store float %add45, float* %arrayidx44, align 4
-  %add46 = add nuw nsw i32 %i.073, 8
-  %cmp = icmp slt i32 %add46, %width_
-  br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
-}
-
-; Function Attrs: norecurse nounwind
-define void @stride3(float %k, i32 %width_) {
-entry:
-
-; CHECK: Found an estimated cost of 20 for VF 8 For instruction:   %0 = load float
-
-  %cmp27 = icmp sgt i32 %width_, 0
-  br i1 %cmp27, label %for.body.lr.ph, label %for.cond.cleanup
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.cond.cleanup:                                 ; preds = %for.body, %entry
-  ret void
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.028 = phi i32 [ 0, %for.body.lr.ph ], [ %add16, %for.body ]
-  %arrayidx = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %i.028
-  %0 = load float, float* %arrayidx, align 4
-  %mul = fmul fast float %0, %k
-  %arrayidx2 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %i.028
-  %1 = load float, float* %arrayidx2, align 4
-  %add3 = fadd fast float %1, %mul
-  store float %add3, float* %arrayidx2, align 4
-  %add4 = add nuw nsw i32 %i.028, 1
-  %arrayidx5 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add4
-  %2 = load float, float* %arrayidx5, align 4
-  %mul6 = fmul fast float %2, %k
-  %arrayidx8 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add4
-  %3 = load float, float* %arrayidx8, align 4
-  %add9 = fadd fast float %3, %mul6
-  store float %add9, float* %arrayidx8, align 4
-  %add10 = add nuw nsw i32 %i.028, 2
-  %arrayidx11 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add10
-  %4 = load float, float* %arrayidx11, align 4
-  %mul12 = fmul fast float %4, %k
-  %arrayidx14 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add10
-  %5 = load float, float* %arrayidx14, align 4
-  %add15 = fadd fast float %5, %mul12
-  store float %add15, float* %arrayidx14, align 4
-  %add16 = add nuw nsw i32 %i.028, 3
-  %cmp = icmp slt i32 %add16, %width_
-  br i1 %cmp, label %for.body, label %for.cond.cleanup
-}
-