AMDGPU: Allow vectorization of packed types
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@305844 91177308-0d34-0410-b5e6-96231b3b80d8
commit 84b3660bac
parent d3be377704
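Before this change the AMDGPU TTI told the vectorizers the target had no vector registers at all: getNumberOfRegisters returned 0 for vectors and getRegisterBitWidth returned 0 for them. The patch instead reports ordinary 32-bit registers and adds a 32-bit minimum vector register width, which is enough for the loop and SLP vectorizers to form packed <2 x half> operations on subtargets with packed-math support such as gfx9. It also splits the raw VGPR count (getHardwareNumberOfRegisters) from the deliberately smaller budget exposed to the vectorizers (getNumberOfRegisters), and lowers the maximum interleave factor from 64 to 8.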
lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

@@ -184,9 +184,9 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L,
   }
 }
 
-unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) {
-  if (Vec)
-    return 0;
+unsigned AMDGPUTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
+  // The concept of vector registers doesn't really exist. Some packed vector
+  // operations operate on the normal 32-bit registers.
 
   // Number of VGPRs on SI.
   if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
@@ -195,8 +195,18 @@ unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) {
   return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
 }
 
+unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) const {
+  // This is really the number of registers to fill when vectorizing /
+  // interleaving loops, so we lie to avoid trying to use all registers.
+  return getHardwareNumberOfRegisters(Vec) >> 3;
+}
+
 unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) const {
-  return Vector ? 0 : 32;
+  return 32;
 }
 
+unsigned AMDGPUTTIImpl::getMinVectorRegisterBitWidth() const {
+  return 32;
+}
+
 unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
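To make the arithmetic concrete, here is a minimal standalone sketch (not part of the commit) of the values these hooks now report on a SOUTHERN_ISLANDS or newer subtarget; the variable names are illustrative only.

#include <cstdio>

int main() {
  // getHardwareNumberOfRegisters: the number of VGPRs on SI and newer.
  unsigned HardwareRegs = 256;
  // getNumberOfRegisters: the deliberately understated budget, 256 >> 3 == 32.
  unsigned VectorizerBudget = HardwareRegs >> 3;
  // getRegisterBitWidth now returns 32 whether or not Vec is set, and
  // getMinVectorRegisterBitWidth matches it, so a legal vector only has to
  // fill one 32-bit register -- e.g. a <2 x half>.
  unsigned RegBits = 32, MinVecBits = 32;
  printf("%u registers of %u bits (min vector width %u)\n",
         VectorizerBudget, RegBits, MinVecBits);
  return 0;
}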
@@ -247,11 +257,11 @@ bool AMDGPUTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
 
 unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
   // Disable unrolling if the loop is not vectorized.
   // TODO: Enable this again.
   if (VF == 1)
     return 1;
 
   // Semi-arbitrary large amount.
-  return 64;
+  return 8;
 }
 
 int AMDGPUTTIImpl::getArithmeticInstrCost(
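The interleave-factor cap falls from the semi-arbitrary 64 to 8, which lines up with the register model above: with only an eighth of the hardware registers advertised, interleaving by 64 would ask for far more live values than the reported budget of 32 registers allows.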
lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

@@ -75,8 +75,10 @@ public:
     return TTI::PSK_FastHardware;
   }
 
-  unsigned getNumberOfRegisters(bool Vector);
-  unsigned getRegisterBitWidth(bool Vector) const;
+  unsigned getHardwareNumberOfRegisters(bool Vector) const;
+  unsigned getNumberOfRegisters(bool Vector) const;
+  unsigned getRegisterBitWidth(bool Vector) const;
+  unsigned getMinVectorRegisterBitWidth() const;
   unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
 
   bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
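For context, a sketch of how a client pass observes these overrides through the TargetTransformInfo facade; the free function halfElementsPerRegister is hypothetical, not LLVM API, and assumes the TTI interface of this era.

#include "llvm/Analysis/TargetTransformInfo.h"

// Hypothetical helper: how many half (16-bit) elements fit in one register
// as reported by the hooks declared above. With this patch AMDGPU answers
// 32 bits, so the result is 2 -- exactly a <2 x half>.
static unsigned halfElementsPerRegister(const llvm::TargetTransformInfo &TTI) {
  unsigned RegBits = TTI.getRegisterBitWidth(/*Vector=*/true);
  return RegBits / 16;
}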
test/Transforms/LoopVectorize/AMDGPU/packed-math.ll (new file, 34 lines)

@@ -0,0 +1,34 @@
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s -loop-vectorize -dce -instcombine -S | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s -loop-vectorize -dce -instcombine -S | FileCheck -check-prefix=CIVI -check-prefix=GCN %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s -loop-vectorize -dce -instcombine -S | FileCheck -check-prefix=CIVI -check-prefix=GCN %s

; GCN-LABEL: @vectorize_v2f16_loop(
; GFX9: vector.body:
; GFX9: phi <2 x half>
; GFX9: load <2 x half>
; GFX9: fadd fast <2 x half>

; GFX9: middle.block:
; GFX9: fadd fast <2 x half>

; CIVI: phi half
; CIVI: load half
; CIVI: fadd fast half
define half @vectorize_v2f16_loop(half addrspace(1)* noalias %s) {
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %q.04 = phi half [ 0.0, %entry ], [ %add, %for.body ]
  %arrayidx = getelementptr inbounds half, half addrspace(1)* %s, i64 %indvars.iv
  %0 = load half, half addrspace(1)* %arrayidx, align 2
  %add = fadd fast half %q.04, %0
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 256
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  %add.lcssa = phi half [ %add, %for.body ]
  ret half %add.lcssa
}
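The expectation: on gfx900 the f16 reduction loop vectorizes, producing <2 x half> phi/load/fadd in the vector body and a <2 x half> reduction in the middle block, while the CI and VI targets (hawaii, fiji) keep the scalar half operations.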
test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll (new file, 195 lines)

@@ -0,0 +1,195 @@
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,VI %s

; FIXME: Would still like to vectorize the memory operations for VI

; Simple 3-pair chain with loads and stores
; GCN-LABEL: @test1_as_3_3_3_v2f16(
; GFX9: load <2 x half>, <2 x half> addrspace(3)*
; GFX9: load <2 x half>, <2 x half> addrspace(3)*
; GFX9: fmul <2 x half>
; GFX9: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
; GFX9: ret

; VI: load half
; VI: load half
define amdgpu_kernel void @test1_as_3_3_3_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c) {
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half addrspace(3)* %b, align 2
  %mul = fmul half %i0, %i1
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
  %mul5 = fmul half %i3, %i4
  store half %mul, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %mul5, half addrspace(3)* %arrayidx5, align 2
  ret void
}

; GCN-LABEL: @test1_as_3_0_0(
; GFX9: load <2 x half>, <2 x half> addrspace(3)*
; GFX9: load <2 x half>, <2 x half>*
; GFX9: fmul <2 x half>
; GFX9: store <2 x half> %{{.*}}, <2 x half>* %
; GFX9: ret

; VI: load half
; VI: load half
define amdgpu_kernel void @test1_as_3_0_0(half addrspace(3)* %a, half* %b, half* %c) {
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half* %b, align 2
  %mul = fmul half %i0, %i1
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half* %b, i64 1
  %i4 = load half, half* %arrayidx4, align 2
  %mul5 = fmul half %i3, %i4
  store half %mul, half* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half* %c, i64 1
  store half %mul5, half* %arrayidx5, align 2
  ret void
}
; GCN-LABEL: @test1_as_0_0_3_v2f16(
; GFX9: load <2 x half>, <2 x half>*
; GFX9: load <2 x half>, <2 x half>*
; GFX9: fmul <2 x half>
; GFX9: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
; GFX9: ret

; VI: load half
; VI: load half
define amdgpu_kernel void @test1_as_0_0_3_v2f16(half* %a, half* %b, half addrspace(3)* %c) {
  %i0 = load half, half* %a, align 2
  %i1 = load half, half* %b, align 2
  %mul = fmul half %i0, %i1
  %arrayidx3 = getelementptr inbounds half, half* %a, i64 1
  %i3 = load half, half* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half* %b, i64 1
  %i4 = load half, half* %arrayidx4, align 2
  %mul5 = fmul half %i3, %i4
  store half %mul, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %mul5, half addrspace(3)* %arrayidx5, align 2
  ret void
}

; GCN-LABEL: @test1_fma_v2f16(
; GFX9: load <2 x half>
; GFX9: load <2 x half>
; GFX9: load <2 x half>
; GFX9: call <2 x half> @llvm.fma.v2f16(
; GFX9: store <2 x half>
define amdgpu_kernel void @test1_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half addrspace(3)* %b, align 2
  %i2 = load half, half addrspace(3)* %c, align 2
  %fma0 = call half @llvm.fma.f16(half %i0, half %i1, half %i2)
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  %i5 = load half, half addrspace(3)* %arrayidx5, align 2
  %fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)
  store half %fma0, half addrspace(3)* %d, align 2
  %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
  store half %fma1, half addrspace(3)* %arrayidx6, align 2
  ret void
}

; GCN-LABEL: @mul_scalar_v2f16(
; GFX9: load <2 x half>
; GFX9: fmul <2 x half>
; GFX9: store <2 x half>
define amdgpu_kernel void @mul_scalar_v2f16(half addrspace(3)* %a, half %scalar, half addrspace(3)* %c) {
  %i0 = load half, half addrspace(3)* %a, align 2
  %mul = fmul half %i0, %scalar
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %mul5 = fmul half %i3, %scalar
  store half %mul, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %mul5, half addrspace(3)* %arrayidx5, align 2
  ret void
}
; GCN-LABEL: @fabs_v2f16
; GFX9: load <2 x half>
; GFX9: call <2 x half> @llvm.fabs.v2f16(
; GFX9: store <2 x half>
define amdgpu_kernel void @fabs_v2f16(half addrspace(3)* %a, half addrspace(3)* %c) {
  %i0 = load half, half addrspace(3)* %a, align 2
  %fabs0 = call half @llvm.fabs.f16(half %i0)
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %fabs1 = call half @llvm.fabs.f16(half %i3)
  store half %fabs0, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %fabs1, half addrspace(3)* %arrayidx5, align 2
  ret void
}

; GCN-LABEL: @test1_fabs_fma_v2f16(
; GFX9: load <2 x half>
; GFX9: call <2 x half> @llvm.fabs.v2f16(
; GFX9: call <2 x half> @llvm.fma.v2f16(
; GFX9: store <2 x half>
define amdgpu_kernel void @test1_fabs_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half addrspace(3)* %b, align 2
  %i2 = load half, half addrspace(3)* %c, align 2
  %i0.fabs = call half @llvm.fabs.f16(half %i0)

  %fma0 = call half @llvm.fma.f16(half %i0.fabs, half %i1, half %i2)
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  %i5 = load half, half addrspace(3)* %arrayidx5, align 2
  %i3.fabs = call half @llvm.fabs.f16(half %i3)

  %fma1 = call half @llvm.fma.f16(half %i3.fabs, half %i4, half %i5)
  store half %fma0, half addrspace(3)* %d, align 2
  %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
  store half %fma1, half addrspace(3)* %arrayidx6, align 2
  ret void
}

; FIXME: Should do vector load and extract component for fabs
; GCN-LABEL: @test1_fabs_scalar_fma_v2f16(
; GFX9: load half
; GFX9: call half @llvm.fabs.f16(
; GFX9: load <2 x half>
; GFX9: load half
; GFX9: load <2 x half>
; GFX9: call <2 x half> @llvm.fma.v2f16(
; GFX9: store <2 x half>
define amdgpu_kernel void @test1_fabs_scalar_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half addrspace(3)* %b, align 2
  %i2 = load half, half addrspace(3)* %c, align 2
  %i1.fabs = call half @llvm.fabs.f16(half %i1)

  %fma0 = call half @llvm.fma.f16(half %i0, half %i1.fabs, half %i2)
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  %i5 = load half, half addrspace(3)* %arrayidx5, align 2
  %fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)
  store half %fma0, half addrspace(3)* %d, align 2
  %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
  store half %fma1, half addrspace(3)* %arrayidx6, align 2
  ret void
}

declare half @llvm.fabs.f16(half) #1
declare half @llvm.fma.f16(half, half, half) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
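The SLP cases mirror the loop-vectorizer test: on gfx9 adjacent f16 loads, stores, fmul, llvm.fma, and llvm.fabs pairs are expected to merge into <2 x half> operations across LDS (addrspace 3) and generic pointers, while VI keeps scalar halves; the FIXME at the top notes that the memory operations alone could still usefully be vectorized on VI.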
Deleted file (70 lines):

@@ -1,70 +0,0 @@
; RUN: opt -S -march=r600 -mcpu=cayman -basicaa -slp-vectorizer -dce < %s | FileCheck %s
; XFAIL: *
;
; FIXME: If this test expects to be vectorized, the TTI must indicate that the target
; has vector registers of the expected width.
; Currently, it says there are 8 vector registers that are 32 bits wide.

target datalayout = "e-p:32:32:32-p3:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64"

; Simple 3-pair chain with loads and stores
define amdgpu_kernel void @test1_as_3_3_3(double addrspace(3)* %a, double addrspace(3)* %b, double addrspace(3)* %c) {
; CHECK-LABEL: @test1_as_3_3_3(
; CHECK: load <2 x double>, <2 x double> addrspace(3)*
; CHECK: load <2 x double>, <2 x double> addrspace(3)*
; CHECK: store <2 x double> %{{.*}}, <2 x double> addrspace(3)* %
; CHECK: ret
  %i0 = load double, double addrspace(3)* %a, align 8
  %i1 = load double, double addrspace(3)* %b, align 8
  %mul = fmul double %i0, %i1
  %arrayidx3 = getelementptr inbounds double, double addrspace(3)* %a, i64 1
  %i3 = load double, double addrspace(3)* %arrayidx3, align 8
  %arrayidx4 = getelementptr inbounds double, double addrspace(3)* %b, i64 1
  %i4 = load double, double addrspace(3)* %arrayidx4, align 8
  %mul5 = fmul double %i3, %i4
  store double %mul, double addrspace(3)* %c, align 8
  %arrayidx5 = getelementptr inbounds double, double addrspace(3)* %c, i64 1
  store double %mul5, double addrspace(3)* %arrayidx5, align 8
  ret void
}

define amdgpu_kernel void @test1_as_3_0_0(double addrspace(3)* %a, double* %b, double* %c) {
; CHECK-LABEL: @test1_as_3_0_0(
; CHECK: load <2 x double>, <2 x double> addrspace(3)*
; CHECK: load <2 x double>, <2 x double>*
; CHECK: store <2 x double> %{{.*}}, <2 x double>* %
; CHECK: ret
  %i0 = load double, double addrspace(3)* %a, align 8
  %i1 = load double, double* %b, align 8
  %mul = fmul double %i0, %i1
  %arrayidx3 = getelementptr inbounds double, double addrspace(3)* %a, i64 1
  %i3 = load double, double addrspace(3)* %arrayidx3, align 8
  %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
  %i4 = load double, double* %arrayidx4, align 8
  %mul5 = fmul double %i3, %i4
  store double %mul, double* %c, align 8
  %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
  store double %mul5, double* %arrayidx5, align 8
  ret void
}

define amdgpu_kernel void @test1_as_0_0_3(double* %a, double* %b, double addrspace(3)* %c) {
; CHECK-LABEL: @test1_as_0_0_3(
; CHECK: load <2 x double>, <2 x double>*
; CHECK: load <2 x double>, <2 x double>*
; CHECK: store <2 x double> %{{.*}}, <2 x double> addrspace(3)* %
; CHECK: ret
  %i0 = load double, double* %a, align 8
  %i1 = load double, double* %b, align 8
  %mul = fmul double %i0, %i1
  %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
  %i3 = load double, double* %arrayidx3, align 8
  %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
  %i4 = load double, double* %arrayidx4, align 8
  %mul5 = fmul double %i3, %i4
  store double %mul, double addrspace(3)* %c, align 8
  %arrayidx5 = getelementptr inbounds double, double addrspace(3)* %c, i64 1
  store double %mul5, double addrspace(3)* %arrayidx5, align 8
  ret void
}
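The deleted test's own FIXME explains why it could never pass: the register model it needed (vector registers wide enough for <2 x double>) was never reported, and even with this patch the TTI only advertises 32-bit registers. The packed-math tests added above cover the SLP cases the target actually supports.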