mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-12-14 07:09:08 +00:00
[AMDGPU] Fixed cost model for packed 16 bit ops
Differential Revision: https://reviews.llvm.org/D71622
This commit is contained in:
parent
a355453a29
commit
5d423671d1
@ -369,6 +369,9 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
|
||||
if (SLT == MVT::i64)
|
||||
return get64BitInstrCost() * LT.first * NElts;
|
||||
|
||||
if (ST->has16BitInsts() && SLT == MVT::i16)
|
||||
NElts = (NElts + 1) / 2;
|
||||
|
||||
// i32
|
||||
return getFullRateInstrCost() * LT.first * NElts;
|
||||
case ISD::ADD:
|
||||
@ -376,11 +379,14 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
|
||||
case ISD::AND:
|
||||
case ISD::OR:
|
||||
case ISD::XOR:
|
||||
if (SLT == MVT::i64){
|
||||
if (SLT == MVT::i64) {
|
||||
// and, or and xor are typically split into 2 VALU instructions.
|
||||
return 2 * getFullRateInstrCost() * LT.first * NElts;
|
||||
}
|
||||
|
||||
if (ST->has16BitInsts() && SLT == MVT::i16)
|
||||
NElts = (NElts + 1) / 2;
|
||||
|
||||
return LT.first * NElts * getFullRateInstrCost();
|
||||
case ISD::MUL: {
|
||||
const int QuarterRateCost = getQuarterRateInstrCost();
|
||||
@ -389,6 +395,9 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
|
||||
return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
|
||||
}
|
||||
|
||||
if (ST->has16BitInsts() && SLT == MVT::i16)
|
||||
NElts = (NElts + 1) / 2;
|
||||
|
||||
// i32
|
||||
return QuarterRateCost * NElts * LT.first;
|
||||
}
|
||||
@ -398,6 +407,9 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
|
||||
if (SLT == MVT::f64)
|
||||
return LT.first * NElts * get64BitInstrCost();
|
||||
|
||||
if (ST->has16BitInsts() && SLT == MVT::f16)
|
||||
NElts = (NElts + 1) / 2;
|
||||
|
||||
if (SLT == MVT::f32 || SLT == MVT::f16)
|
||||
return LT.first * NElts * getFullRateInstrCost();
|
||||
break;
|
||||
|
@ -1,11 +1,11 @@
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck %s
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck %s
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST16,ALL %s
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOW16,ALL %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST16,ALL %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOW16,ALL %s
|
||||
|
||||
|
||||
; CHECK: 'add_i32'
|
||||
; CHECK: estimated cost of 1 for {{.*}} add i32
|
||||
; ALL: 'add_i32'
|
||||
; ALL: estimated cost of 1 for {{.*}} add i32
|
||||
define amdgpu_kernel void @add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
|
||||
%vec = load i32, i32 addrspace(1)* %vaddr
|
||||
%add = add i32 %vec, %b
|
||||
@ -13,8 +13,8 @@ define amdgpu_kernel void @add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %va
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: 'add_v2i32'
|
||||
; CHECK: estimated cost of 2 for {{.*}} add <2 x i32>
|
||||
; ALL: 'add_v2i32'
|
||||
; ALL: estimated cost of 2 for {{.*}} add <2 x i32>
|
||||
define amdgpu_kernel void @add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 {
|
||||
%vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
|
||||
%add = add <2 x i32> %vec, %b
|
||||
@ -22,10 +22,10 @@ define amdgpu_kernel void @add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> add
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: 'add_v3i32'
|
||||
; ALL: 'add_v3i32'
|
||||
; Allow for 4 when v3i32 is illegal and TargetLowering thinks it needs widening,
|
||||
; and 3 when it is legal.
|
||||
; CHECK: estimated cost of {{[34]}} for {{.*}} add <3 x i32>
|
||||
; ALL: estimated cost of {{[34]}} for {{.*}} add <3 x i32>
|
||||
define amdgpu_kernel void @add_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
|
||||
%vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
|
||||
%add = add <3 x i32> %vec, %b
|
||||
@ -33,8 +33,8 @@ define amdgpu_kernel void @add_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> add
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: 'add_v4i32'
|
||||
; CHECK: estimated cost of 4 for {{.*}} add <4 x i32>
|
||||
; ALL: 'add_v4i32'
|
||||
; ALL: estimated cost of 4 for {{.*}} add <4 x i32>
|
||||
define amdgpu_kernel void @add_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 {
|
||||
%vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr
|
||||
%add = add <4 x i32> %vec, %b
|
||||
@ -42,10 +42,10 @@ define amdgpu_kernel void @add_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> add
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: 'add_v5i32'
|
||||
; ALL: 'add_v5i32'
|
||||
; Allow for 8 when v5i32 is illegal and TargetLowering thinks it needs widening,
|
||||
; and 5 when it is legal.
|
||||
; CHECK: estimated cost of {{[58]}} for {{.*}} add <5 x i32>
|
||||
; ALL: estimated cost of {{[58]}} for {{.*}} add <5 x i32>
|
||||
define amdgpu_kernel void @add_v5i32(<5 x i32> addrspace(1)* %out, <5 x i32> addrspace(1)* %vaddr, <5 x i32> %b) #0 {
|
||||
%vec = load <5 x i32>, <5 x i32> addrspace(1)* %vaddr
|
||||
%add = add <5 x i32> %vec, %b
|
||||
@ -53,8 +53,8 @@ define amdgpu_kernel void @add_v5i32(<5 x i32> addrspace(1)* %out, <5 x i32> add
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: 'add_i64'
|
||||
; CHECK: estimated cost of 2 for {{.*}} add i64
|
||||
; ALL: 'add_i64'
|
||||
; ALL: estimated cost of 2 for {{.*}} add i64
|
||||
define amdgpu_kernel void @add_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
|
||||
%vec = load i64, i64 addrspace(1)* %vaddr
|
||||
%add = add i64 %vec, %b
|
||||
@ -62,8 +62,8 @@ define amdgpu_kernel void @add_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %va
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: 'add_v2i64'
|
||||
; CHECK: estimated cost of 4 for {{.*}} add <2 x i64>
|
||||
; ALL: 'add_v2i64'
|
||||
; ALL: estimated cost of 4 for {{.*}} add <2 x i64>
|
||||
define amdgpu_kernel void @add_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr, <2 x i64> %b) #0 {
|
||||
%vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
|
||||
%add = add <2 x i64> %vec, %b
|
||||
@ -71,8 +71,8 @@ define amdgpu_kernel void @add_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> add
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: 'add_v3i64'
|
||||
; CHECK: estimated cost of 6 for {{.*}} add <3 x i64>
|
||||
; ALL: 'add_v3i64'
|
||||
; ALL: estimated cost of 6 for {{.*}} add <3 x i64>
|
||||
define amdgpu_kernel void @add_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr, <3 x i64> %b) #0 {
|
||||
%vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr
|
||||
%add = add <3 x i64> %vec, %b
|
||||
@ -80,8 +80,8 @@ define amdgpu_kernel void @add_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> add
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: 'add_v4i64'
|
||||
; CHECK: estimated cost of 8 for {{.*}} add <4 x i64>
|
||||
; ALL: 'add_v4i64'
|
||||
; ALL: estimated cost of 8 for {{.*}} add <4 x i64>
|
||||
define amdgpu_kernel void @add_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr, <4 x i64> %b) #0 {
|
||||
%vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr
|
||||
%add = add <4 x i64> %vec, %b
|
||||
@ -89,8 +89,8 @@ define amdgpu_kernel void @add_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> add
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: 'add_v16i64'
|
||||
; CHECK: estimated cost of 32 for {{.*}} add <16 x i64>
|
||||
; ALL: 'add_v16i64'
|
||||
; ALL: estimated cost of 32 for {{.*}} add <16 x i64>
|
||||
define amdgpu_kernel void @add_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(1)* %vaddr, <16 x i64> %b) #0 {
|
||||
%vec = load <16 x i64>, <16 x i64> addrspace(1)* %vaddr
|
||||
%add = add <16 x i64> %vec, %b
|
||||
@ -98,8 +98,8 @@ define amdgpu_kernel void @add_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64>
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: 'add_i16'
|
||||
; CHECK: estimated cost of 1 for {{.*}} add i16
|
||||
; ALL: 'add_i16'
|
||||
; ALL: estimated cost of 1 for {{.*}} add i16
|
||||
define amdgpu_kernel void @add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
|
||||
%vec = load i16, i16 addrspace(1)* %vaddr
|
||||
%add = add i16 %vec, %b
|
||||
@ -107,8 +107,9 @@ define amdgpu_kernel void @add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %va
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: 'add_v2i16'
|
||||
; CHECK: estimated cost of 2 for {{.*}} add <2 x i16>
|
||||
; ALL: 'add_v2i16'
|
||||
; SLOW16: estimated cost of 2 for {{.*}} add <2 x i16>
|
||||
; FAST16: estimated cost of 1 for {{.*}} add <2 x i16>
|
||||
define amdgpu_kernel void @add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
|
||||
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
|
||||
%add = add <2 x i16> %vec, %b
|
||||
@ -116,8 +117,8 @@ define amdgpu_kernel void @add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> add
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: 'sub_i32'
|
||||
; CHECK: estimated cost of 1 for {{.*}} sub i32
|
||||
; ALL: 'sub_i32'
|
||||
; ALL: estimated cost of 1 for {{.*}} sub i32
|
||||
define amdgpu_kernel void @sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
|
||||
%vec = load i32, i32 addrspace(1)* %vaddr
|
||||
%sub = sub i32 %vec, %b
|
||||
@ -125,16 +126,16 @@ define amdgpu_kernel void @sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %va
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: 'sub_i64'
|
||||
; CHECK: estimated cost of 2 for {{.*}} sub i64
|
||||
; ALL: 'sub_i64'
|
||||
; ALL: estimated cost of 2 for {{.*}} sub i64
|
||||
define amdgpu_kernel void @sub_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
|
||||
%vec = load i64, i64 addrspace(1)* %vaddr
|
||||
%sub = sub i64 %vec, %b
|
||||
store i64 %sub, i64 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
; CHECK: 'sub_i16'
|
||||
; CHECK: estimated cost of 1 for {{.*}} sub i16
|
||||
; ALL: 'sub_i16'
|
||||
; ALL: estimated cost of 1 for {{.*}} sub i16
|
||||
define amdgpu_kernel void @sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
|
||||
%vec = load i16, i16 addrspace(1)* %vaddr
|
||||
%sub = sub i16 %vec, %b
|
||||
@ -142,8 +143,9 @@ define amdgpu_kernel void @sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %va
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: 'sub_v2i16'
|
||||
; CHECK: estimated cost of 2 for {{.*}} sub <2 x i16>
|
||||
; ALL: 'sub_v2i16'
|
||||
; SLOW16: estimated cost of 2 for {{.*}} sub <2 x i16>
|
||||
; FAST16: estimated cost of 1 for {{.*}} sub <2 x i16>
|
||||
define amdgpu_kernel void @sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
|
||||
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
|
||||
%sub = sub <2 x i16> %vec, %b
|
||||
|
@ -1,8 +1,10 @@
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL,SLOW16 %s
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,FAST16 %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL,SLOW16 %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,FAST16 %s
|
||||
|
||||
; CHECK: 'or_i32'
|
||||
; CHECK: estimated cost of 1 for {{.*}} or i32
|
||||
; ALL: 'or_i32'
|
||||
; ALL: estimated cost of 1 for {{.*}} or i32
|
||||
define amdgpu_kernel void @or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
|
||||
%vec = load i32, i32 addrspace(1)* %vaddr
|
||||
%or = or i32 %vec, %b
|
||||
@ -10,8 +12,8 @@ define amdgpu_kernel void @or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vad
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: 'or_i64'
|
||||
; CHECK: estimated cost of 2 for {{.*}} or i64
|
||||
; ALL: 'or_i64'
|
||||
; ALL: estimated cost of 2 for {{.*}} or i64
|
||||
define amdgpu_kernel void @or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
|
||||
%vec = load i64, i64 addrspace(1)* %vaddr
|
||||
%or = or i64 %vec, %b
|
||||
@ -19,8 +21,18 @@ define amdgpu_kernel void @or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vad
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: 'xor_i32'
|
||||
; CHECK: estimated cost of 1 for {{.*}} xor i32
|
||||
; ALL: 'or_v2i16'
|
||||
; SLOW16: estimated cost of 2 for {{.*}} or <2 x i16>
|
||||
; FAST16: estimated cost of 1 for {{.*}} or <2 x i16>
|
||||
define amdgpu_kernel void @or_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
|
||||
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
|
||||
%or = or <2 x i16> %vec, %b
|
||||
store <2 x i16> %or, <2 x i16> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'xor_i32'
|
||||
; ALL: estimated cost of 1 for {{.*}} xor i32
|
||||
define amdgpu_kernel void @xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
|
||||
%vec = load i32, i32 addrspace(1)* %vaddr
|
||||
%or = xor i32 %vec, %b
|
||||
@ -28,8 +40,8 @@ define amdgpu_kernel void @xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %va
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: 'xor_i64'
|
||||
; CHECK: estimated cost of 2 for {{.*}} xor i64
|
||||
; ALL: 'xor_i64'
|
||||
; ALL: estimated cost of 2 for {{.*}} xor i64
|
||||
define amdgpu_kernel void @xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
|
||||
%vec = load i64, i64 addrspace(1)* %vaddr
|
||||
%or = xor i64 %vec, %b
|
||||
@ -37,9 +49,18 @@ define amdgpu_kernel void @xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %va
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'xor_v2i16'
|
||||
; SLOW16: estimated cost of 2 for {{.*}} xor <2 x i16>
|
||||
; FAST16: estimated cost of 1 for {{.*}} xor <2 x i16>
|
||||
define amdgpu_kernel void @xor_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
|
||||
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
|
||||
%xor = xor <2 x i16> %vec, %b
|
||||
store <2 x i16> %xor, <2 x i16> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: 'and_i32'
|
||||
; CHECK: estimated cost of 1 for {{.*}} and i32
|
||||
; ALL: 'and_i32'
|
||||
; ALL: estimated cost of 1 for {{.*}} and i32
|
||||
define amdgpu_kernel void @and_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
|
||||
%vec = load i32, i32 addrspace(1)* %vaddr
|
||||
%or = and i32 %vec, %b
|
||||
@ -47,8 +68,8 @@ define amdgpu_kernel void @and_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %va
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: 'and_i64'
|
||||
; CHECK: estimated cost of 2 for {{.*}} and i64
|
||||
; ALL: 'and_i64'
|
||||
; ALL: estimated cost of 2 for {{.*}} and i64
|
||||
define amdgpu_kernel void @and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
|
||||
%vec = load i64, i64 addrspace(1)* %vaddr
|
||||
%or = and i64 %vec, %b
|
||||
@ -56,5 +77,14 @@ define amdgpu_kernel void @and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %va
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'and_v2i16'
|
||||
; SLOW16: estimated cost of 2 for {{.*}} and <2 x i16>
|
||||
; FAST16: estimated cost of 1 for {{.*}} and <2 x i16>
|
||||
define amdgpu_kernel void @and_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
|
||||
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
|
||||
%and = and <2 x i16> %vec, %b
|
||||
store <2 x i16> %and, <2 x i16> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
|
@ -1,7 +1,7 @@
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FASTF64 -check-prefix=ALL %s
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefix=SLOWF64 -check-prefix=ALL %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FASTF64 -check-prefix=ALL %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefix=SLOWF64 -check-prefix=ALL %s
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s
|
||||
|
||||
; ALL: 'fadd_f32'
|
||||
; ALL: estimated cost of 1 for {{.*}} fadd float
|
||||
@ -73,8 +73,8 @@ define amdgpu_kernel void @fadd_v3f64(<3 x double> addrspace(1)* %out, <3 x doub
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL 'fadd_f16'
|
||||
; ALL estimated cost of 1 for {{.*}} fadd half
|
||||
; ALL: 'fadd_f16'
|
||||
; ALL: estimated cost of 1 for {{.*}} fadd half
|
||||
define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
|
||||
%vec = load half, half addrspace(1)* %vaddr
|
||||
%add = fadd half %vec, %b
|
||||
@ -82,8 +82,9 @@ define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half addrspace(1)*
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL 'fadd_v2f16'
|
||||
; ALL estimated cost of 2 for {{.*}} fadd <2 x half>
|
||||
; ALL: 'fadd_v2f16'
|
||||
; SLOWF16: estimated cost of 2 for {{.*}} fadd <2 x half>
|
||||
; FASTF16: estimated cost of 1 for {{.*}} fadd <2 x half>
|
||||
define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
|
||||
%vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
|
||||
%add = fadd <2 x half> %vec, %b
|
||||
@ -91,8 +92,19 @@ define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half>
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL 'fadd_v4f16'
|
||||
; ALL estimated cost of 4 for {{.*}} fadd <4 x half>
|
||||
; ALL: 'fadd_v3f16'
|
||||
; SLOWF16: estimated cost of 4 for {{.*}} fadd <3 x half>
|
||||
; FASTF16: estimated cost of 2 for {{.*}} fadd <3 x half>
|
||||
define amdgpu_kernel void @fadd_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr, <3 x half> %b) #0 {
|
||||
%vec = load <3 x half>, <3 x half> addrspace(1)* %vaddr
|
||||
%add = fadd <3 x half> %vec, %b
|
||||
store <3 x half> %add, <3 x half> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'fadd_v4f16'
|
||||
; SLOWF16: estimated cost of 4 for {{.*}} fadd <4 x half>
|
||||
; FASTF16: estimated cost of 2 for {{.*}} fadd <4 x half>
|
||||
define amdgpu_kernel void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
|
||||
%vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
|
||||
%add = fadd <4 x half> %vec, %b
|
||||
|
@ -1,7 +1,7 @@
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FASTF64 -check-prefix=ALL %s
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefix=SLOWF64 -check-prefix=ALL %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FASTF64 -check-prefix=ALL %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefix=SLOWF64 -check-prefix=ALL %s
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s
|
||||
|
||||
; ALL: 'fmul_f32'
|
||||
; ALL: estimated cost of 1 for {{.*}} fmul float
|
||||
@ -73,8 +73,8 @@ define amdgpu_kernel void @fmul_v3f64(<3 x double> addrspace(1)* %out, <3 x doub
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL 'fmul_f16'
|
||||
; ALL estimated cost of 1 for {{.*}} fmul half
|
||||
; ALL: 'fmul_f16'
|
||||
; ALL: estimated cost of 1 for {{.*}} fmul half
|
||||
define amdgpu_kernel void @fmul_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
|
||||
%vec = load half, half addrspace(1)* %vaddr
|
||||
%add = fmul half %vec, %b
|
||||
@ -82,8 +82,9 @@ define amdgpu_kernel void @fmul_f16(half addrspace(1)* %out, half addrspace(1)*
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL 'fmul_v2f16'
|
||||
; ALL estimated cost of 2 for {{.*}} fmul <2 x half>
|
||||
; ALL: 'fmul_v2f16'
|
||||
; SLOWF16: estimated cost of 2 for {{.*}} fmul <2 x half>
|
||||
; FASTF16: estimated cost of 1 for {{.*}} fmul <2 x half>
|
||||
define amdgpu_kernel void @fmul_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
|
||||
%vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
|
||||
%add = fmul <2 x half> %vec, %b
|
||||
@ -91,8 +92,19 @@ define amdgpu_kernel void @fmul_v2f16(<2 x half> addrspace(1)* %out, <2 x half>
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL 'fmul_v4f16'
|
||||
; ALL estimated cost of 4 for {{.*}} fmul <4 x half>
|
||||
; ALL: 'fmul_v3f16'
|
||||
; SLOWF16: estimated cost of 4 for {{.*}} fmul <3 x half>
|
||||
; FASTF16: estimated cost of 2 for {{.*}} fmul <3 x half>
|
||||
define amdgpu_kernel void @fmul_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr, <3 x half> %b) #0 {
|
||||
%vec = load <3 x half>, <3 x half> addrspace(1)* %vaddr
|
||||
%add = fmul <3 x half> %vec, %b
|
||||
store <3 x half> %add, <3 x half> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'fmul_v4f16'
|
||||
; SLOWF16: estimated cost of 4 for {{.*}} fmul <4 x half>
|
||||
; FASTF16: estimated cost of 2 for {{.*}} fmul <4 x half>
|
||||
define amdgpu_kernel void @fmul_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
|
||||
%vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
|
||||
%add = fmul <4 x half> %vec, %b
|
||||
|
@ -1,7 +1,7 @@
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FASTF64 -check-prefix=ALL %s
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefix=SLOWF64 -check-prefix=ALL %s
|
||||
; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FASTF64 -check-prefix=ALL %s
|
||||
; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefix=SLOWF64 -check-prefix=ALL %s
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s
|
||||
; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s
|
||||
; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s
|
||||
|
||||
; ALL: 'fsub_f32'
|
||||
; ALL: estimated cost of 1 for {{.*}} fsub float
|
||||
@ -83,7 +83,8 @@ define amdgpu_kernel void @fsub_f16(half addrspace(1)* %out, half addrspace(1)*
|
||||
}
|
||||
|
||||
; ALL: 'fsub_v2f16'
|
||||
; ALL: estimated cost of 2 for {{.*}} fsub <2 x half>
|
||||
; SLOWF16: estimated cost of 2 for {{.*}} fsub <2 x half>
|
||||
; FASTF16: estimated cost of 1 for {{.*}} fsub <2 x half>
|
||||
define amdgpu_kernel void @fsub_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
|
||||
%vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
|
||||
%add = fsub <2 x half> %vec, %b
|
||||
@ -91,8 +92,19 @@ define amdgpu_kernel void @fsub_v2f16(<2 x half> addrspace(1)* %out, <2 x half>
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'fsub_v3f16'
|
||||
; SLOWF16: estimated cost of 4 for {{.*}} fsub <3 x half>
|
||||
; FASTF16: estimated cost of 2 for {{.*}} fsub <3 x half>
|
||||
define amdgpu_kernel void @fsub_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr, <3 x half> %b) #0 {
|
||||
%vec = load <3 x half>, <3 x half> addrspace(1)* %vaddr
|
||||
%add = fsub <3 x half> %vec, %b
|
||||
store <3 x half> %add, <3 x half> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'fsub_v4f16'
|
||||
; ALL: estimated cost of 4 for {{.*}} fsub <4 x half>
|
||||
; SLOWF16: estimated cost of 4 for {{.*}} fsub <4 x half>
|
||||
; FASTF16: estimated cost of 2 for {{.*}} fsub <4 x half>
|
||||
define amdgpu_kernel void @fsub_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
|
||||
%vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
|
||||
%add = fsub <4 x half> %vec, %b
|
||||
|
@ -1,8 +1,10 @@
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SLOW16,ALL %s
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=FAST16,ALL %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SLOW16,ALL %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=FAST16,ALL %s
|
||||
|
||||
; CHECK: 'mul_i32'
|
||||
; CHECK: estimated cost of 3 for {{.*}} mul i32
|
||||
; ALL: 'mul_i32'
|
||||
; ALL: estimated cost of 3 for {{.*}} mul i32
|
||||
define amdgpu_kernel void @mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
|
||||
%vec = load i32, i32 addrspace(1)* %vaddr
|
||||
%mul = mul i32 %vec, %b
|
||||
@ -10,8 +12,8 @@ define amdgpu_kernel void @mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %va
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: 'mul_v2i32'
|
||||
; CHECK: estimated cost of 6 for {{.*}} mul <2 x i32>
|
||||
; ALL: 'mul_v2i32'
|
||||
; ALL: estimated cost of 6 for {{.*}} mul <2 x i32>
|
||||
define amdgpu_kernel void @mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 {
|
||||
%vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
|
||||
%mul = mul <2 x i32> %vec, %b
|
||||
@ -19,10 +21,10 @@ define amdgpu_kernel void @mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> add
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: 'mul_v3i32'
|
||||
; ALL: 'mul_v3i32'
|
||||
; Allow for 12 when v3i32 is illegal and TargetLowering thinks it needs widening,
|
||||
; and 9 when it is legal.
|
||||
; CHECK: estimated cost of {{9|12}} for {{.*}} mul <3 x i32>
|
||||
; ALL: estimated cost of {{9|12}} for {{.*}} mul <3 x i32>
|
||||
define amdgpu_kernel void @mul_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
|
||||
%vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
|
||||
%mul = mul <3 x i32> %vec, %b
|
||||
@ -30,10 +32,10 @@ define amdgpu_kernel void @mul_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> add
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: 'mul_v5i32'
|
||||
; ALL: 'mul_v5i32'
|
||||
; Allow for 24 when v5i32 is illegal and TargetLowering thinks it needs widening,
|
||||
; and 15 when it is legal.
|
||||
; CHECK: estimated cost of {{15|24}} for {{.*}} mul <5 x i32>
|
||||
; ALL: estimated cost of {{15|24}} for {{.*}} mul <5 x i32>
|
||||
define amdgpu_kernel void @mul_v5i32(<5 x i32> addrspace(1)* %out, <5 x i32> addrspace(1)* %vaddr, <5 x i32> %b) #0 {
|
||||
%vec = load <5 x i32>, <5 x i32> addrspace(1)* %vaddr
|
||||
%mul = mul <5 x i32> %vec, %b
|
||||
@ -41,8 +43,8 @@ define amdgpu_kernel void @mul_v5i32(<5 x i32> addrspace(1)* %out, <5 x i32> add
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: 'mul_v4i32'
|
||||
; CHECK: estimated cost of 12 for {{.*}} mul <4 x i32>
|
||||
; ALL: 'mul_v4i32'
|
||||
; ALL: estimated cost of 12 for {{.*}} mul <4 x i32>
|
||||
define amdgpu_kernel void @mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 {
|
||||
%vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr
|
||||
%mul = mul <4 x i32> %vec, %b
|
||||
@ -50,8 +52,8 @@ define amdgpu_kernel void @mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> add
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: 'mul_i64'
|
||||
; CHECK: estimated cost of 16 for {{.*}} mul i64
|
||||
; ALL: 'mul_i64'
|
||||
; ALL: estimated cost of 16 for {{.*}} mul i64
|
||||
define amdgpu_kernel void @mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
|
||||
%vec = load i64, i64 addrspace(1)* %vaddr
|
||||
%mul = mul i64 %vec, %b
|
||||
@ -59,8 +61,8 @@ define amdgpu_kernel void @mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %va
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: 'mul_v2i64'
|
||||
; CHECK: estimated cost of 32 for {{.*}} mul <2 x i64>
|
||||
; ALL: 'mul_v2i64'
|
||||
; ALL: estimated cost of 32 for {{.*}} mul <2 x i64>
|
||||
define amdgpu_kernel void @mul_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr, <2 x i64> %b) #0 {
|
||||
%vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
|
||||
%mul = mul <2 x i64> %vec, %b
|
||||
@ -68,8 +70,8 @@ define amdgpu_kernel void @mul_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> add
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: 'mul_v3i64'
|
||||
; CHECK: estimated cost of 48 for {{.*}} mul <3 x i64>
|
||||
; ALL: 'mul_v3i64'
|
||||
; ALL: estimated cost of 48 for {{.*}} mul <3 x i64>
|
||||
define amdgpu_kernel void @mul_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr, <3 x i64> %b) #0 {
|
||||
%vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr
|
||||
%mul = mul <3 x i64> %vec, %b
|
||||
@ -77,8 +79,8 @@ define amdgpu_kernel void @mul_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> add
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: 'mul_v4i64'
|
||||
; CHECK: estimated cost of 64 for {{.*}} mul <4 x i64>
|
||||
; ALL: 'mul_v4i64'
|
||||
; ALL: estimated cost of 64 for {{.*}} mul <4 x i64>
|
||||
define amdgpu_kernel void @mul_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr, <4 x i64> %b) #0 {
|
||||
%vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr
|
||||
%mul = mul <4 x i64> %vec, %b
|
||||
@ -87,8 +89,8 @@ define amdgpu_kernel void @mul_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> add
|
||||
}
|
||||
|
||||
|
||||
; CHECK: 'mul_v8i64'
|
||||
; CHECK: estimated cost of 128 for {{.*}} mul <8 x i64>
|
||||
; ALL: 'mul_v8i64'
|
||||
; ALL: estimated cost of 128 for {{.*}} mul <8 x i64>
|
||||
define amdgpu_kernel void @mul_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr, <8 x i64> %b) #0 {
|
||||
%vec = load <8 x i64>, <8 x i64> addrspace(1)* %vaddr
|
||||
%mul = mul <8 x i64> %vec, %b
|
||||
@ -96,4 +98,33 @@ define amdgpu_kernel void @mul_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> add
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'mul_i16'
|
||||
; ALL: estimated cost of 3 for {{.*}} mul i16
|
||||
; Cost-model test kernel for a scalar 16-bit multiply: loads one i16 through
; %vaddr, multiplies it by the kernel argument %b and stores the product
; through %out.
define amdgpu_kernel void @mul_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
|
||||
%vec = load i16, i16 addrspace(1)* %vaddr
|
||||
; Instruction under test: the cost expectation preceding this kernel matches
; on the "mul i16" text of this instruction.
%mul = mul i16 %vec, %b
|
||||
store i16 %mul, i16 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'mul_v2i16'
|
||||
; SLOW16: estimated cost of 6 for {{.*}} mul <2 x i16>
|
||||
; FAST16: estimated cost of 3 for {{.*}} mul <2 x i16>
|
||||
; Cost-model test kernel for a two-element 16-bit vector multiply: loads a
; <2 x i16> through %vaddr, multiplies it element-wise by %b and stores the
; result through %out. The expectations preceding this kernel use separate
; SLOW16 and FAST16 prefixes, so the expected cost differs per run line.
define amdgpu_kernel void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
|
||||
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
|
||||
; Instruction under test (matched on its "mul <2 x i16>" text).
%mul = mul <2 x i16> %vec, %b
|
||||
store <2 x i16> %mul, <2 x i16> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'mul_v3i16'
|
||||
; SLOW16: estimated cost of 12 for {{.*}} mul <3 x i16>
|
||||
; FAST16: estimated cost of 6 for {{.*}} mul <3 x i16>
|
||||
; Cost-model test kernel for a three-element 16-bit vector multiply: loads a
; <3 x i16> through %vaddr, multiplies it element-wise by %b and stores the
; result through %out.
; NOTE(review): presumably this odd element count is here to exercise the
; round-up in the packed 16-bit cost computation ((NElts + 1) / 2) -- confirm
; against the cost-model change this test accompanies.
define amdgpu_kernel void @mul_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %vaddr, <3 x i16> %b) #0 {
|
||||
%vec = load <3 x i16>, <3 x i16> addrspace(1)* %vaddr
|
||||
; Instruction under test (matched on its "mul <3 x i16>" text).
%mul = mul <3 x i16> %vec, %b
|
||||
store <3 x i16> %mul, <3 x i16> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Attribute group #0 (nounwind), referenced by the kernel definitions in this
; test via the trailing "#0" on each define.
attributes #0 = { nounwind }
|
||||
|
@ -1,7 +1,7 @@
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=ALL -check-prefix=FAST64 %s
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefix=ALL -check-prefix=SLOW64 %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=ALL -check-prefix=FAST64 %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefix=ALL -check-prefix=SLOW64 %s
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,FAST64,FAST16 %s
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SLOW64,SLOW16 %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,FAST64,FAST16 %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SLOW64,SLOW16 %s
|
||||
|
||||
; ALL: 'shl_i32'
|
||||
; ALL: estimated cost of 1 for {{.*}} shl i32
|
||||
@ -22,6 +22,25 @@ define amdgpu_kernel void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %va
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'shl_i16'
|
||||
; ALL: estimated cost of 1 for {{.*}} shl i16
|
||||
; Cost-model test kernel for a scalar 16-bit left shift: loads one i16 through
; %vaddr, shifts it left by the kernel argument %b and stores the result
; through %out.
define amdgpu_kernel void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
|
||||
%vec = load i16, i16 addrspace(1)* %vaddr
|
||||
; Instruction under test (matched on its "shl i16" text).
; NOTE(review): the result is named %or although the operation is shl; the
; name looks carried over from another test -- harmless, but confusing.
%or = shl i16 %vec, %b
|
||||
store i16 %or, i16 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'shl_v2i16'
|
||||
; SLOW16: estimated cost of 2 for {{.*}} shl <2 x i16>
|
||||
; FAST16: estimated cost of 1 for {{.*}} shl <2 x i16>
|
||||
; Cost-model test kernel for a two-element 16-bit vector left shift: loads a
; <2 x i16> through %vaddr, shifts each element left by the matching element
; of %b and stores the result through %out. The expectations preceding this
; kernel use separate SLOW16 and FAST16 prefixes.
define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
|
||||
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
|
||||
; Instruction under test (matched on its "shl <2 x i16>" text).
; NOTE(review): the result is named %or although the operation is shl.
%or = shl <2 x i16> %vec, %b
|
||||
store <2 x i16> %or, <2 x i16> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'lshr_i32'
|
||||
; ALL: estimated cost of 1 for {{.*}} lshr i32
|
||||
define amdgpu_kernel void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
|
||||
@ -41,6 +60,25 @@ define amdgpu_kernel void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %v
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'lshr_i16'
|
||||
; ALL: estimated cost of 1 for {{.*}} lshr i16
|
||||
; Cost-model test kernel for a scalar 16-bit logical right shift: loads one
; i16 through %vaddr, shifts it right by the kernel argument %b and stores the
; result through %out.
define amdgpu_kernel void @lshr_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
|
||||
%vec = load i16, i16 addrspace(1)* %vaddr
|
||||
; Instruction under test (matched on its "lshr i16" text).
; NOTE(review): the result is named %or although the operation is lshr.
%or = lshr i16 %vec, %b
|
||||
store i16 %or, i16 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'lshr_v2i16'
|
||||
; SLOW16: estimated cost of 2 for {{.*}} lshr <2 x i16>
|
||||
; FAST16: estimated cost of 1 for {{.*}} lshr <2 x i16>
|
||||
; Cost-model test kernel for a two-element 16-bit vector logical right shift:
; loads a <2 x i16> through %vaddr, shifts each element right by the matching
; element of %b and stores the result through %out. The expectations preceding
; this kernel use separate SLOW16 and FAST16 prefixes.
define amdgpu_kernel void @lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
|
||||
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
|
||||
; Instruction under test (matched on its "lshr <2 x i16>" text).
; NOTE(review): the result is named %or although the operation is lshr.
%or = lshr <2 x i16> %vec, %b
|
||||
store <2 x i16> %or, <2 x i16> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'ashr_i32'
|
||||
; ALL: estimated cost of 1 for {{.*}} ashr i32
|
||||
define amdgpu_kernel void @ashr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
|
||||
@ -60,4 +98,23 @@ define amdgpu_kernel void @ashr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %v
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'ashr_i16'
|
||||
; ALL: estimated cost of 1 for {{.*}} ashr i16
|
||||
; Cost-model test kernel for a scalar 16-bit arithmetic right shift: loads one
; i16 through %vaddr, shifts it right by the kernel argument %b and stores the
; result through %out.
define amdgpu_kernel void @ashr_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
|
||||
%vec = load i16, i16 addrspace(1)* %vaddr
|
||||
; Instruction under test (matched on its "ashr i16" text).
; NOTE(review): the result is named %or although the operation is ashr.
%or = ashr i16 %vec, %b
|
||||
store i16 %or, i16 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'ashr_v2i16'
|
||||
; SLOW16: estimated cost of 2 for {{.*}} ashr <2 x i16>
|
||||
; FAST16: estimated cost of 1 for {{.*}} ashr <2 x i16>
|
||||
; Cost-model test kernel for a two-element 16-bit vector arithmetic right
; shift: loads a <2 x i16> through %vaddr, shifts each element right by the
; matching element of %b and stores the result through %out. The expectations
; preceding this kernel use separate SLOW16 and FAST16 prefixes.
define amdgpu_kernel void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
|
||||
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
|
||||
; Instruction under test (matched on its "ashr <2 x i16>" text).
; NOTE(review): the result is named %or although the operation is ashr.
%or = ashr <2 x i16> %vec, %b
|
||||
store <2 x i16> %or, <2 x i16> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Attribute group #0 (nounwind), referenced by the kernel definitions in this
; test via the trailing "#0" on each define.
attributes #0 = { nounwind }
|
||||
|
Loading…
Reference in New Issue
Block a user