[X86][AVX] Fixed v16i16/v32i8 ADD/SUB costs on AVX1 subtargets

Add explicit v16i16/v32i8 ADD/SUB costs, matching the costs of v4i64/v8i32 - they were missing for some reason.

This has side effects on the LV max bandwidth tests (AVX1 now prefers 128-bit vectors vs AVX2 which still prefers 256-bit).

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@286832 91177308-0d34-0410-b5e6-96231b3b80d8
2024-11-27 21:50:40 +00:00 · 2016-11-14 14:45:16 +00:00 · 2016-11-14 14:45:16 +00:00 · 2e6f35ab88
commit 2e6f35ab88
parent 4bbcd0ad60
3 changed files with 16 additions and 10 deletions
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -526,6 +526,10 @@ int X86TTIImpl::getArithmeticInstrCost(
    // Two ops + 1 extract + 1 insert = 4.
    { ISD::MUL,     MVT::v16i16,   4 },
    { ISD::MUL,     MVT::v8i32,    4 },
+    { ISD::SUB,     MVT::v32i8,    4 },
+    { ISD::ADD,     MVT::v32i8,    4 },
+    { ISD::SUB,     MVT::v16i16,   4 },
+    { ISD::ADD,     MVT::v16i16,   4 },
    { ISD::SUB,     MVT::v8i32,    4 },
    { ISD::ADD,     MVT::v8i32,    4 },
    { ISD::SUB,     MVT::v4i64,    4 },
--- a/test/Analysis/CostModel/X86/arith.ll
+++ b/test/Analysis/CostModel/X86/arith.ll
@@ -57,13 +57,13 @@ define i32 @add(i32 %arg) {
  %G = add <8 x i16> undef, undef
  ; SSSE3: cost of 2 {{.*}} %H = add
  ; SSE42: cost of 2 {{.*}} %H = add
-  ; AVX: cost of 2 {{.*}} %H = add
+  ; AVX: cost of 4 {{.*}} %H = add
  ; AVX2: cost of 1 {{.*}} %H = add
  ; AVX512: cost of 1 {{.*}} %H = add
  %H = add <16 x i16> undef, undef
  ; SSSE3: cost of 4 {{.*}} %I = add
  ; SSE42: cost of 4 {{.*}} %I = add
-  ; AVX: cost of 4 {{.*}} %I = add
+  ; AVX: cost of 8 {{.*}} %I = add
  ; AVX2: cost of 2 {{.*}} %I = add
  ; AVX512F: cost of 2 {{.*}} %I = add
  ; AVX512BW: cost of 1 {{.*}} %I = add
@@ -77,13 +77,13 @@ define i32 @add(i32 %arg) {
  %J = add <16 x i8> undef, undef
  ; SSSE3: cost of 2 {{.*}} %K = add
  ; SSE42: cost of 2 {{.*}} %K = add
-  ; AVX: cost of 2 {{.*}} %K = add
+  ; AVX: cost of 4 {{.*}} %K = add
  ; AVX2: cost of 1 {{.*}} %K = add
  ; AVX512: cost of 1 {{.*}} %K = add
  %K = add <32 x i8> undef, undef
  ; SSSE3: cost of 4 {{.*}} %L = add
  ; SSE42: cost of 4 {{.*}} %L = add
-  ; AVX: cost of 4 {{.*}} %L = add
+  ; AVX: cost of 8 {{.*}} %L = add
  ; AVX2: cost of 2 {{.*}} %L = add
  ; AVX512F: cost of 2 {{.*}} %L = add
  ; AVX512BW: cost of 1 {{.*}} %L = add
@@ -140,13 +140,13 @@ define i32 @sub(i32 %arg) {
  %G = sub <8 x i16> undef, undef
  ; SSSE3: cost of 2 {{.*}} %H = sub
  ; SSE42: cost of 2 {{.*}} %H = sub
-  ; AVX: cost of 2 {{.*}} %H = sub
+  ; AVX: cost of 4 {{.*}} %H = sub
  ; AVX2: cost of 1 {{.*}} %H = sub
  ; AVX512: cost of 1 {{.*}} %H = sub
  %H = sub <16 x i16> undef, undef
  ; SSSE3: cost of 4 {{.*}} %I = sub
  ; SSE42: cost of 4 {{.*}} %I = sub
-  ; AVX: cost of 4 {{.*}} %I = sub
+  ; AVX: cost of 8 {{.*}} %I = sub
  ; AVX2: cost of 2 {{.*}} %I = sub
  ; AVX512F: cost of 2 {{.*}} %I = sub
  ; AVX512BW: cost of 1 {{.*}} %I = sub
@@ -160,13 +160,13 @@ define i32 @sub(i32 %arg) {
  %J = sub <16 x i8> undef, undef
  ; SSSE3: cost of 2 {{.*}} %K = sub
  ; SSE42: cost of 2 {{.*}} %K = sub
-  ; AVX: cost of 2 {{.*}} %K = sub
+  ; AVX: cost of 4 {{.*}} %K = sub
  ; AVX2: cost of 1 {{.*}} %K = sub
  ; AVX512: cost of 1 {{.*}} %K = sub
  %K = sub <32 x i8> undef, undef
  ; SSSE3: cost of 4 {{.*}} %L = sub
  ; SSE42: cost of 4 {{.*}} %L = sub
-  ; AVX: cost of 4 {{.*}} %L = sub
+  ; AVX: cost of 8 {{.*}} %L = sub
  ; AVX2: cost of 2 {{.*}} %L = sub
  ; AVX512F: cost of 2 {{.*}} %L = sub
  ; AVX512BW: cost of 1 {{.*}} %L = sub
--- a/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll
+++ b/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll
@@ -1,4 +1,5 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -mcpu=corei7-avx -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -mcpu=corei7-avx -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -mcpu=core-avx2 -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-AVX2
 ; REQUIRES: asserts

 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -16,7 +17,8 @@ target triple = "x86_64-unknown-linux-gnu"
 ; -vectorizer-maximize-bandwidth is indicated.
 ;
 ; CHECK-label: foo
-; CHECK: LV: Selecting VF: 32.
+; CHECK-AVX1: LV: Selecting VF: 16.
+; CHECK-AVX2: LV: Selecting VF: 32.
 define void @foo() {
 entry:
  br label %for.body