AMDGPU: Teach isLegalAddressingMode about global_* instructions

Also refine the flat check to respect flat-for-global feature,
and constant fallback should check global handling, not
specifically MUBUF.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@309471 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Matt Arsenault 2017-07-29 01:12:31 +00:00
parent d74d012b62
commit 27eee9a0e2
3 changed files with 100 additions and 26 deletions

View File

@ -586,6 +586,26 @@ bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
}
bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
if (Subtarget->hasFlatGlobalInsts())
return isInt<13>(AM.BaseOffs) && AM.Scale == 0;
if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
// Assume the we will use FLAT for all global memory accesses
// on VI.
// FIXME: This assumption is currently wrong. On VI we still use
// MUBUF instructions for the r + i addressing mode. As currently
// implemented, the MUBUF instructions only work on buffer < 4GB.
// It may be possible to support > 4GB buffers with MUBUF instructions,
// by setting the stride value in the resource descriptor which would
// increase the size limit to (stride * 4GB). However, this is risky,
// because it has never been validated.
return isLegalFlatAddressingMode(AM);
}
return isLegalMUBUFAddressingMode(AM);
}
bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
// MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
// additionally can do r + r + i with addr64. 32-bit has more addressing
@ -628,22 +648,10 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
if (AM.BaseGV)
return false;
if (AS == AMDGPUASI.GLOBAL_ADDRESS) {
if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
// Assume the we will use FLAT for all global memory accesses
// on VI.
// FIXME: This assumption is currently wrong. On VI we still use
// MUBUF instructions for the r + i addressing mode. As currently
// implemented, the MUBUF instructions only work on buffer < 4GB.
// It may be possible to support > 4GB buffers with MUBUF instructions,
// by setting the stride value in the resource descriptor which would
// increase the size limit to (stride * 4GB). However, this is risky,
// because it has never been validated.
return isLegalFlatAddressingMode(AM);
}
if (AS == AMDGPUASI.GLOBAL_ADDRESS)
return isLegalGlobalAddressingMode(AM);
return isLegalMUBUFAddressingMode(AM);
} else if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
// If the offset isn't a multiple of 4, it probably isn't going to be
// correctly aligned.
// FIXME: Can we get the real alignment here?
@ -655,7 +663,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
// FIXME?: We also need to do this if unaligned, but we don't know the
// alignment here.
if (DL.getTypeStoreSize(Ty) < 4)
return isLegalMUBUFAddressingMode(AM);
return isLegalGlobalAddressingMode(AM);
if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
// SMRD instructions have an 8-bit, dword offset on SI.

View File

@ -118,6 +118,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const;
bool isLegalFlatAddressingMode(const AddrMode &AM) const;
bool isLegalGlobalAddressingMode(const AddrMode &AM) const;
bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;
unsigned isCFIntrinsic(const SDNode *Intr) const;

View File

@ -1,9 +1,11 @@
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI %s
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI %s
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI %s
; RUN: llc -march=amdgcn -mcpu=tahiti -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=bonaire -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-scalarize-global-loads=false -mattr=-flat-for-global -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI -check-prefix=OPT-SICIVI %s
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI -check-prefix=OPT-SICIVI %s
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI -check-prefix=OPT-SICIVI %s
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=gfx900 < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-GFX9 %s
; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SICIVI %s
; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=SICIVI %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SICIVI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
@ -14,7 +16,6 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:
; OPT-CI: getelementptr i8,
; GCN-LABEL: {{^}}test_sink_global_small_offset_i32:
; GCN: {{^}}BB0_2:
define amdgpu_kernel void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
@ -42,7 +43,8 @@ done:
; GCN-LABEL: {{^}}test_sink_global_small_max_i32_ds_offset:
; GCN: s_and_saveexec_b64
; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
; GFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
; GCN: {{^}}BB1_2:
; GCN: s_or_b64 exec
define amdgpu_kernel void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
@ -69,7 +71,8 @@ done:
; GCN-LABEL: {{^}}test_sink_global_small_max_mubuf_offset:
; GCN: s_and_saveexec_b64
; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}}
; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}}
; GFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, off offset:4095{{$}}
; GCN: {{^}}BB2_2:
; GCN: s_or_b64 exec
define amdgpu_kernel void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
@ -96,7 +99,8 @@ done:
; GCN-LABEL: {{^}}test_sink_global_small_max_plus_1_mubuf_offset:
; GCN: s_and_saveexec_b64
; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
; GFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
; GCN: {{^}}BB3_2:
; GCN: s_or_b64 exec
define amdgpu_kernel void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
@ -673,6 +677,67 @@ done:
ret void
}
; OPT-LABEL: @test_sink_global_small_min_scratch_global_offset(
; OPT-SICIVI: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4096
; OPT-SICIV: br
; OPT-SICIVI: %tmp1 = load i8, i8 addrspace(1)* %in.gep
; OPT-GFX9: br
; OPT-GFX9: %sunkaddr = getelementptr i8, i8 addrspace(1)* %in, i64 -4096
; OPT-GFX9: load i8, i8 addrspace(1)* %sunkaddr
; GCN-LABEL: {{^}}test_sink_global_small_min_scratch_global_offset:
; GFX9: global_load_sbyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:-4096{{$}}
define amdgpu_kernel void @test_sink_global_small_min_scratch_global_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
%in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4096
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
%tmp0 = icmp eq i32 %tid, 0
br i1 %tmp0, label %endif, label %if
if:
%tmp1 = load i8, i8 addrspace(1)* %in.gep
%tmp2 = sext i8 %tmp1 to i32
br label %endif
endif:
%x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
store i32 %x, i32 addrspace(1)* %out.gep
br label %done
done:
ret void
}
; OPT-LABEL: @test_sink_global_small_min_scratch_global_neg1_offset(
; OPT: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4097
; OPT: br
; OPT: load i8, i8 addrspace(1)* %in.gep
; GCN-LABEL: {{^}}test_sink_global_small_min_scratch_global_neg1_offset:
define amdgpu_kernel void @test_sink_global_small_min_scratch_global_neg1_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
%in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4097
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
%tmp0 = icmp eq i32 %tid, 0
br i1 %tmp0, label %endif, label %if
if:
%tmp1 = load i8, i8 addrspace(1)* %in.gep
%tmp2 = sext i8 %tmp1 to i32
br label %endif
endif:
%x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
store i32 %x, i32 addrspace(1)* %out.gep
br label %done
done:
ret void
}
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2
declare i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2