From 08eb02abdb5c396b0b4a69c3fe9c166ec79a2202 Mon Sep 17 00:00:00 2001
From: Mohammed Agabaria <mohammed.agabaria@intel.com>
Date: Sun, 5 Nov 2017 09:36:54 +0000
Subject: [PATCH] [REVERT][LV][X86] update the cost of interleaving mem. access
 of floats

Reverted my changes; they will be recommitted later after fixing the failure.
The reverted patch updated the costs of interleaved loads of v8f32 with stride 3 and 8.

Differential Revision: https://reviews.llvm.org/D39403


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317433 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86TargetTransformInfo.cpp     |   5 +-
 .../CostModel/interleaved-load-float.ll       | 141 ------------------
 2 files changed, 1 insertion(+), 145 deletions(-)
 delete mode 100644 test/Analysis/CostModel/interleaved-load-float.ll

diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 8bddf574554..effbd07fa31 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2644,15 +2644,12 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
     { 3, MVT::v8i8,  9 },  //(load 24i8 and) deinterleave into 3 x 8i8
     { 3, MVT::v16i8, 11},  //(load 48i8 and) deinterleave into 3 x 16i8
     { 3, MVT::v32i8, 13},  //(load 96i8 and) deinterleave into 3 x 32i8
-    { 3, MVT::v8f32, 17 }, //(load 24f32 and)deinterleave into 3 x 8f32
 
     { 4, MVT::v2i8,  12 }, //(load 8i8 and)   deinterleave into 4 x 2i8
     { 4, MVT::v4i8,  4 },  //(load 16i8 and)  deinterleave into 4 x 4i8
     { 4, MVT::v8i8,  20 }, //(load 32i8 and)  deinterleave into 4 x 8i8
     { 4, MVT::v16i8, 39 }, //(load 64i8 and)  deinterleave into 4 x 16i8
-    { 4, MVT::v32i8, 80 }, //(load 128i8 and) deinterleave into 4 x 32i8
-
-    { 8, MVT::v8f32, 40 }  //(load 64f32 and)deinterleave into 8 x 8f32
+    { 4, MVT::v32i8, 80 }  //(load 128i8 and) deinterleave into 4 x 32i8
   };
 
   static const CostTblEntry AVX2InterleavedStoreTbl[] = {
diff --git a/test/Analysis/CostModel/interleaved-load-float.ll b/test/Analysis/CostModel/interleaved-load-float.ll
deleted file mode 100644
index 373a55d7ad4..00000000000
--- a/test/Analysis/CostModel/interleaved-load-float.ll
+++ /dev/null
@@ -1,141 +0,0 @@
-; REQUIRES: asserts
-; RUN: opt -S -loop-vectorize -debug-only=loop-vectorize -mcpu=skylake %s 2>&1 | FileCheck %s
-target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
-target triple = "i386-unknown-linux-gnu"
-
-@src = common local_unnamed_addr global [120 x float] zeroinitializer, align 4
-@dst = common local_unnamed_addr global [120 x float] zeroinitializer, align 4
-
-; Function Attrs: norecurse nounwind
-define void @stride8(float %k, i32 %width_) {
-entry:
-
-; CHECK: Found an estimated cost of 48 for VF 8 For instruction:   %0 = load float
-
-  %cmp72 = icmp sgt i32 %width_, 0
-  br i1 %cmp72, label %for.body.lr.ph, label %for.cond.cleanup
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.cond.cleanup.loopexit:                        ; preds = %for.body
-  br label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  ret void
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.073 = phi i32 [ 0, %for.body.lr.ph ], [ %add46, %for.body ]
-  %arrayidx = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %i.073
-  %0 = load float, float* %arrayidx, align 4
-  %mul = fmul fast float %0, %k
-  %arrayidx2 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %i.073
-  %1 = load float, float* %arrayidx2, align 4
-  %add3 = fadd fast float %1, %mul
-  store float %add3, float* %arrayidx2, align 4
-  %add4 = or i32 %i.073, 1
-  %arrayidx5 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add4
-  %2 = load float, float* %arrayidx5, align 4
-  %mul6 = fmul fast float %2, %k
-  %arrayidx8 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add4
-  %3 = load float, float* %arrayidx8, align 4
-  %add9 = fadd fast float %3, %mul6
-  store float %add9, float* %arrayidx8, align 4
-  %add10 = or i32 %i.073, 2
-  %arrayidx11 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add10
-  %4 = load float, float* %arrayidx11, align 4
-  %mul12 = fmul fast float %4, %k
-  %arrayidx14 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add10
-  %5 = load float, float* %arrayidx14, align 4
-  %add15 = fadd fast float %5, %mul12
-  store float %add15, float* %arrayidx14, align 4
-  %add16 = or i32 %i.073, 3
-  %arrayidx17 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add16
-  %6 = load float, float* %arrayidx17, align 4
-  %mul18 = fmul fast float %6, %k
-  %arrayidx20 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add16
-  %7 = load float, float* %arrayidx20, align 4
-  %add21 = fadd fast float %7, %mul18
-  store float %add21, float* %arrayidx20, align 4
-  %add22 = or i32 %i.073, 4
-  %arrayidx23 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add22
-  %8 = load float, float* %arrayidx23, align 4
-  %mul24 = fmul fast float %8, %k
-  %arrayidx26 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add22
-  %9 = load float, float* %arrayidx26, align 4
-  %add27 = fadd fast float %9, %mul24
-  store float %add27, float* %arrayidx26, align 4
-  %add28 = or i32 %i.073, 5
-  %arrayidx29 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add28
-  %10 = load float, float* %arrayidx29, align 4
-  %mul30 = fmul fast float %10, %k
-  %arrayidx32 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add28
-  %11 = load float, float* %arrayidx32, align 4
-  %add33 = fadd fast float %11, %mul30
-  store float %add33, float* %arrayidx32, align 4
-  %add34 = or i32 %i.073, 6
-  %arrayidx35 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add34
-  %12 = load float, float* %arrayidx35, align 4
-  %mul36 = fmul fast float %12, %k
-  %arrayidx38 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add34
-  %13 = load float, float* %arrayidx38, align 4
-  %add39 = fadd fast float %13, %mul36
-  store float %add39, float* %arrayidx38, align 4
-  %add40 = or i32 %i.073, 7
-  %arrayidx41 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add40
-  %14 = load float, float* %arrayidx41, align 4
-  %mul42 = fmul fast float %14, %k
-  %arrayidx44 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add40
-  %15 = load float, float* %arrayidx44, align 4
-  %add45 = fadd fast float %15, %mul42
-  store float %add45, float* %arrayidx44, align 4
-  %add46 = add nuw nsw i32 %i.073, 8
-  %cmp = icmp slt i32 %add46, %width_
-  br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
-}
-
-; Function Attrs: norecurse nounwind
-define void @stride3(float %k, i32 %width_) {
-entry:
-
-; CHECK: Found an estimated cost of 20 for VF 8 For instruction:   %0 = load float
-
-  %cmp27 = icmp sgt i32 %width_, 0
-  br i1 %cmp27, label %for.body.lr.ph, label %for.cond.cleanup
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.cond.cleanup:                                 ; preds = %for.body, %entry
-  ret void
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.028 = phi i32 [ 0, %for.body.lr.ph ], [ %add16, %for.body ]
-  %arrayidx = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %i.028
-  %0 = load float, float* %arrayidx, align 4
-  %mul = fmul fast float %0, %k
-  %arrayidx2 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %i.028
-  %1 = load float, float* %arrayidx2, align 4
-  %add3 = fadd fast float %1, %mul
-  store float %add3, float* %arrayidx2, align 4
-  %add4 = add nuw nsw i32 %i.028, 1
-  %arrayidx5 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add4
-  %2 = load float, float* %arrayidx5, align 4
-  %mul6 = fmul fast float %2, %k
-  %arrayidx8 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add4
-  %3 = load float, float* %arrayidx8, align 4
-  %add9 = fadd fast float %3, %mul6
-  store float %add9, float* %arrayidx8, align 4
-  %add10 = add nuw nsw i32 %i.028, 2
-  %arrayidx11 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add10
-  %4 = load float, float* %arrayidx11, align 4
-  %mul12 = fmul fast float %4, %k
-  %arrayidx14 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add10
-  %5 = load float, float* %arrayidx14, align 4
-  %add15 = fadd fast float %5, %mul12
-  store float %add15, float* %arrayidx14, align 4
-  %add16 = add nuw nsw i32 %i.028, 3
-  %cmp = icmp slt i32 %add16, %width_
-  br i1 %cmp, label %for.body, label %for.cond.cleanup
-}
-