mirror of
https://github.com/RPCS3/llvm.git
synced 2024-12-15 07:59:57 +00:00
AMDGPU: Teach isLegalAddressingMode about global_* instructions
Also refine the flat check to respect flat-for-global feature, and constant fallback should check global handling, not specifically MUBUF. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@309471 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
d74d012b62
commit
27eee9a0e2
@@ -586,6 +586,26 @@ bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
|
||||
return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
|
||||
}
|
||||
|
||||
bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
|
||||
if (Subtarget->hasFlatGlobalInsts())
|
||||
return isInt<13>(AM.BaseOffs) && AM.Scale == 0;
|
||||
|
||||
if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
|
||||
// Assume that we will use FLAT for all global memory accesses
|
||||
// on VI.
|
||||
// FIXME: This assumption is currently wrong. On VI we still use
|
||||
// MUBUF instructions for the r + i addressing mode. As currently
|
||||
// implemented, the MUBUF instructions only work on buffer < 4GB.
|
||||
// It may be possible to support > 4GB buffers with MUBUF instructions,
|
||||
// by setting the stride value in the resource descriptor which would
|
||||
// increase the size limit to (stride * 4GB). However, this is risky,
|
||||
// because it has never been validated.
|
||||
return isLegalFlatAddressingMode(AM);
|
||||
}
|
||||
|
||||
return isLegalMUBUFAddressingMode(AM);
|
||||
}
|
||||
|
||||
bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
|
||||
// MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
|
||||
// additionally can do r + r + i with addr64. 32-bit has more addressing
|
||||
@@ -628,22 +648,10 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
|
||||
if (AM.BaseGV)
|
||||
return false;
|
||||
|
||||
if (AS == AMDGPUASI.GLOBAL_ADDRESS) {
|
||||
if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
|
||||
// Assume that we will use FLAT for all global memory accesses
|
||||
// on VI.
|
||||
// FIXME: This assumption is currently wrong. On VI we still use
|
||||
// MUBUF instructions for the r + i addressing mode. As currently
|
||||
// implemented, the MUBUF instructions only work on buffer < 4GB.
|
||||
// It may be possible to support > 4GB buffers with MUBUF instructions,
|
||||
// by setting the stride value in the resource descriptor which would
|
||||
// increase the size limit to (stride * 4GB). However, this is risky,
|
||||
// because it has never been validated.
|
||||
return isLegalFlatAddressingMode(AM);
|
||||
}
|
||||
if (AS == AMDGPUASI.GLOBAL_ADDRESS)
|
||||
return isLegalGlobalAddressingMode(AM);
|
||||
|
||||
return isLegalMUBUFAddressingMode(AM);
|
||||
} else if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
|
||||
if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
|
||||
// If the offset isn't a multiple of 4, it probably isn't going to be
|
||||
// correctly aligned.
|
||||
// FIXME: Can we get the real alignment here?
|
||||
@@ -655,7 +663,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
|
||||
// FIXME?: We also need to do this if unaligned, but we don't know the
|
||||
// alignment here.
|
||||
if (DL.getTypeStoreSize(Ty) < 4)
|
||||
return isLegalMUBUFAddressingMode(AM);
|
||||
return isLegalGlobalAddressingMode(AM);
|
||||
|
||||
if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
|
||||
// SMRD instructions have an 8-bit, dword offset on SI.
|
||||
|
@@ -118,6 +118,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
|
||||
SDValue performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const;
|
||||
|
||||
bool isLegalFlatAddressingMode(const AddrMode &AM) const;
|
||||
bool isLegalGlobalAddressingMode(const AddrMode &AM) const;
|
||||
bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;
|
||||
|
||||
unsigned isCFIntrinsic(const SDNode *Intr) const;
|
||||
|
@@ -1,9 +1,11 @@
|
||||
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI %s
|
||||
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI %s
|
||||
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tahiti -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=bonaire -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-scalarize-global-loads=false -mattr=-flat-for-global -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
|
||||
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI -check-prefix=OPT-SICIVI %s
|
||||
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI -check-prefix=OPT-SICIVI %s
|
||||
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI -check-prefix=OPT-SICIVI %s
|
||||
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=gfx900 < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-GFX9 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SICIVI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=SICIVI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SICIVI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
|
||||
|
||||
target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
|
||||
|
||||
@@ -14,7 +16,6 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:
|
||||
; OPT-CI: getelementptr i8,
|
||||
|
||||
; GCN-LABEL: {{^}}test_sink_global_small_offset_i32:
|
||||
; GCN: {{^}}BB0_2:
|
||||
define amdgpu_kernel void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
|
||||
entry:
|
||||
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
|
||||
@@ -42,7 +43,8 @@ done:
|
||||
|
||||
; GCN-LABEL: {{^}}test_sink_global_small_max_i32_ds_offset:
|
||||
; GCN: s_and_saveexec_b64
|
||||
; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
|
||||
; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
|
||||
; GFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
|
||||
; GCN: {{^}}BB1_2:
|
||||
; GCN: s_or_b64 exec
|
||||
define amdgpu_kernel void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
|
||||
@@ -69,7 +71,8 @@ done:
|
||||
|
||||
; GCN-LABEL: {{^}}test_sink_global_small_max_mubuf_offset:
|
||||
; GCN: s_and_saveexec_b64
|
||||
; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}}
|
||||
; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}}
|
||||
; GFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, off offset:4095{{$}}
|
||||
; GCN: {{^}}BB2_2:
|
||||
; GCN: s_or_b64 exec
|
||||
define amdgpu_kernel void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
|
||||
@@ -96,7 +99,8 @@ done:
|
||||
|
||||
; GCN-LABEL: {{^}}test_sink_global_small_max_plus_1_mubuf_offset:
|
||||
; GCN: s_and_saveexec_b64
|
||||
; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
|
||||
; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
|
||||
; GFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
|
||||
; GCN: {{^}}BB3_2:
|
||||
; GCN: s_or_b64 exec
|
||||
define amdgpu_kernel void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
|
||||
@@ -673,6 +677,67 @@ done:
|
||||
ret void
|
||||
}
|
||||
|
||||
; OPT-LABEL: @test_sink_global_small_min_scratch_global_offset(
|
||||
; OPT-SICIVI: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4096
|
||||
; OPT-SICIVI: br
|
||||
; OPT-SICIVI: %tmp1 = load i8, i8 addrspace(1)* %in.gep
|
||||
|
||||
; OPT-GFX9: br
|
||||
; OPT-GFX9: %sunkaddr = getelementptr i8, i8 addrspace(1)* %in, i64 -4096
|
||||
; OPT-GFX9: load i8, i8 addrspace(1)* %sunkaddr
|
||||
|
||||
; GCN-LABEL: {{^}}test_sink_global_small_min_scratch_global_offset:
|
||||
; GFX9: global_load_sbyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:-4096{{$}}
|
||||
define amdgpu_kernel void @test_sink_global_small_min_scratch_global_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
|
||||
entry:
|
||||
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
|
||||
%in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4096
|
||||
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
|
||||
%tmp0 = icmp eq i32 %tid, 0
|
||||
br i1 %tmp0, label %endif, label %if
|
||||
|
||||
if:
|
||||
%tmp1 = load i8, i8 addrspace(1)* %in.gep
|
||||
%tmp2 = sext i8 %tmp1 to i32
|
||||
br label %endif
|
||||
|
||||
endif:
|
||||
%x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
|
||||
store i32 %x, i32 addrspace(1)* %out.gep
|
||||
br label %done
|
||||
|
||||
done:
|
||||
ret void
|
||||
}
|
||||
|
||||
; OPT-LABEL: @test_sink_global_small_min_scratch_global_neg1_offset(
|
||||
; OPT: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4097
|
||||
; OPT: br
|
||||
; OPT: load i8, i8 addrspace(1)* %in.gep
|
||||
|
||||
; GCN-LABEL: {{^}}test_sink_global_small_min_scratch_global_neg1_offset:
|
||||
define amdgpu_kernel void @test_sink_global_small_min_scratch_global_neg1_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
|
||||
entry:
|
||||
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
|
||||
%in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4097
|
||||
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
|
||||
%tmp0 = icmp eq i32 %tid, 0
|
||||
br i1 %tmp0, label %endif, label %if
|
||||
|
||||
if:
|
||||
%tmp1 = load i8, i8 addrspace(1)* %in.gep
|
||||
%tmp2 = sext i8 %tmp1 to i32
|
||||
br label %endif
|
||||
|
||||
endif:
|
||||
%x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
|
||||
store i32 %x, i32 addrspace(1)* %out.gep
|
||||
br label %done
|
||||
|
||||
done:
|
||||
ret void
|
||||
}
|
||||
|
||||
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
|
||||
declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2
|
||||
declare i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2
|
||||
|
Loading…
Reference in New Issue
Block a user