[AMDGPU] Do not widen scalar loads on GFX12 (#78724)

GFX12 has subword scalar loads so there is no need to do this.
This commit is contained in:
Jay Foad 2024-01-19 15:30:07 +00:00 committed by GitHub
parent aac1d9710b
commit 89226ecbb9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 149 additions and 68 deletions

View File

@@ -13,9 +13,11 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/InitializePasses.h"
@@ -58,6 +60,7 @@ public:
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetPassConfig>();
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<UniformityInfoWrapperPass>();
AU.setPreservesAll();
@@ -90,7 +93,11 @@ bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
const TargetMachine &TM = TPC.getTM<TargetMachine>();
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
if (ST.hasScalarSubwordLoads())
return false;
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
@@ -181,6 +188,7 @@ bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
"AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,

View File

@@ -1,15 +1,21 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-late-codegenprepare %s | FileCheck %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-late-codegenprepare %s | FileCheck %s -check-prefix=GFX9
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-late-codegenprepare %s | FileCheck %s -check-prefix=GFX12
; Make sure we don't crash when trying to create a bitcast between
; address spaces
define amdgpu_kernel void @constant_from_offset_cast_generic_null() {
; CHECK-LABEL: @constant_from_offset_cast_generic_null(
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr null to ptr addrspace(4)), i64 4), align 4
; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
; CHECK-NEXT: store i8 [[TMP3]], ptr addrspace(1) undef, align 1
; CHECK-NEXT: ret void
; GFX9-LABEL: @constant_from_offset_cast_generic_null(
; GFX9-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr null to ptr addrspace(4)), i64 4), align 4
; GFX9-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
; GFX9-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
; GFX9-NEXT: store i8 [[TMP3]], ptr addrspace(1) undef, align 1
; GFX9-NEXT: ret void
;
; GFX12-LABEL: @constant_from_offset_cast_generic_null(
; GFX12-NEXT: [[LOAD:%.*]] = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr null to ptr addrspace(4)), i64 6), align 1
; GFX12-NEXT: store i8 [[LOAD]], ptr addrspace(1) undef, align 1
; GFX12-NEXT: ret void
;
%load = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr null to ptr addrspace(4)), i64 6), align 1
store i8 %load, ptr addrspace(1) undef
@@ -17,12 +23,17 @@ define amdgpu_kernel void @constant_from_offset_cast_generic_null() {
}
define amdgpu_kernel void @constant_from_offset_cast_global_null() {
; CHECK-LABEL: @constant_from_offset_cast_global_null(
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) null to ptr addrspace(4)), i64 4), align 4
; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
; CHECK-NEXT: store i8 [[TMP3]], ptr addrspace(1) undef, align 1
; CHECK-NEXT: ret void
; GFX9-LABEL: @constant_from_offset_cast_global_null(
; GFX9-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) null to ptr addrspace(4)), i64 4), align 4
; GFX9-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
; GFX9-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
; GFX9-NEXT: store i8 [[TMP3]], ptr addrspace(1) undef, align 1
; GFX9-NEXT: ret void
;
; GFX12-LABEL: @constant_from_offset_cast_global_null(
; GFX12-NEXT: [[LOAD:%.*]] = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) null to ptr addrspace(4)), i64 6), align 1
; GFX12-NEXT: store i8 [[LOAD]], ptr addrspace(1) undef, align 1
; GFX12-NEXT: ret void
;
%load = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) null to ptr addrspace(4)), i64 6), align 1
store i8 %load, ptr addrspace(1) undef
@@ -32,12 +43,17 @@ define amdgpu_kernel void @constant_from_offset_cast_global_null() {
@gv = unnamed_addr addrspace(1) global [64 x i8] undef, align 4
define amdgpu_kernel void @constant_from_offset_cast_global_gv() {
; CHECK-LABEL: @constant_from_offset_cast_global_gv(
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) @gv to ptr addrspace(4)), i64 4), align 4
; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
; CHECK-NEXT: store i8 [[TMP3]], ptr addrspace(1) undef, align 1
; CHECK-NEXT: ret void
; GFX9-LABEL: @constant_from_offset_cast_global_gv(
; GFX9-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) @gv to ptr addrspace(4)), i64 4), align 4
; GFX9-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
; GFX9-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
; GFX9-NEXT: store i8 [[TMP3]], ptr addrspace(1) undef, align 1
; GFX9-NEXT: ret void
;
; GFX12-LABEL: @constant_from_offset_cast_global_gv(
; GFX12-NEXT: [[LOAD:%.*]] = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) @gv to ptr addrspace(4)), i64 6), align 1
; GFX12-NEXT: store i8 [[LOAD]], ptr addrspace(1) undef, align 1
; GFX12-NEXT: ret void
;
%load = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) @gv to ptr addrspace(4)), i64 6), align 1
store i8 %load, ptr addrspace(1) undef
@@ -45,12 +61,17 @@ define amdgpu_kernel void @constant_from_offset_cast_global_gv() {
}
define amdgpu_kernel void @constant_from_offset_cast_generic_inttoptr() {
; CHECK-LABEL: @constant_from_offset_cast_generic_inttoptr(
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr inttoptr (i64 128 to ptr) to ptr addrspace(4)), i64 4), align 4
; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
; CHECK-NEXT: store i8 [[TMP3]], ptr addrspace(1) undef, align 1
; CHECK-NEXT: ret void
; GFX9-LABEL: @constant_from_offset_cast_generic_inttoptr(
; GFX9-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr inttoptr (i64 128 to ptr) to ptr addrspace(4)), i64 4), align 4
; GFX9-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
; GFX9-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
; GFX9-NEXT: store i8 [[TMP3]], ptr addrspace(1) undef, align 1
; GFX9-NEXT: ret void
;
; GFX12-LABEL: @constant_from_offset_cast_generic_inttoptr(
; GFX12-NEXT: [[LOAD:%.*]] = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr inttoptr (i64 128 to ptr) to ptr addrspace(4)), i64 6), align 1
; GFX12-NEXT: store i8 [[LOAD]], ptr addrspace(1) undef, align 1
; GFX12-NEXT: ret void
;
%load = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr inttoptr (i64 128 to ptr) to ptr addrspace(4)), i64 6), align 1
store i8 %load, ptr addrspace(1) undef
@@ -58,10 +79,15 @@ define amdgpu_kernel void @constant_from_offset_cast_generic_inttoptr() {
}
define amdgpu_kernel void @constant_from_inttoptr() {
; CHECK-LABEL: @constant_from_inttoptr(
; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr addrspace(4) inttoptr (i64 128 to ptr addrspace(4)), align 4
; CHECK-NEXT: store i8 [[LOAD]], ptr addrspace(1) undef, align 1
; CHECK-NEXT: ret void
; GFX9-LABEL: @constant_from_inttoptr(
; GFX9-NEXT: [[LOAD:%.*]] = load i8, ptr addrspace(4) inttoptr (i64 128 to ptr addrspace(4)), align 4
; GFX9-NEXT: store i8 [[LOAD]], ptr addrspace(1) undef, align 1
; GFX9-NEXT: ret void
;
; GFX12-LABEL: @constant_from_inttoptr(
; GFX12-NEXT: [[LOAD:%.*]] = load i8, ptr addrspace(4) inttoptr (i64 128 to ptr addrspace(4)), align 1
; GFX12-NEXT: store i8 [[LOAD]], ptr addrspace(1) undef, align 1
; GFX12-NEXT: ret void
;
%load = load i8, ptr addrspace(4) inttoptr (i64 128 to ptr addrspace(4)), align 1
store i8 %load, ptr addrspace(1) undef

View File

@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s -check-prefix=GFX9
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12
; We have an indirect call with a known set of callees, which are
; known to not need any special inputs. The ABI still needs to use the
@@ -8,35 +9,63 @@
; FIXME: Passing real values for workitem ID, and 0s that can be undef
define amdgpu_kernel void @indirect_call_known_no_special_inputs() {
; CHECK-LABEL: indirect_call_known_no_special_inputs:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s4, s7
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
; CHECK-NEXT: s_add_u32 s0, s0, s7
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: s_load_dword s7, s[4:5], 0x0
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12
; CHECK-NEXT: s_getpc_b64 s[8:9]
; CHECK-NEXT: s_add_u32 s8, s8, snork@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s9, s9, snork@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
; CHECK-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0
; CHECK-NEXT: s_mov_b64 s[8:9], 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_and_b32 s4, 1, s7
; CHECK-NEXT: s_cmp_eq_u32 s4, 1
; CHECK-NEXT: v_mov_b32_e32 v31, v0
; CHECK-NEXT: s_cselect_b32 s5, s13, s11
; CHECK-NEXT: s_cselect_b32 s4, s12, s10
; CHECK-NEXT: s_mov_b32 s12, s6
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: v_mov_b32_e32 v4, 0
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CHECK-NEXT: s_endpgm
; GFX9-LABEL: indirect_call_known_no_special_inputs:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s4, s7
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
; GFX9-NEXT: s_add_u32 s0, s0, s7
; GFX9-NEXT: s_addc_u32 s1, s1, 0
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: s_load_dword s7, s[4:5], 0x0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12
; GFX9-NEXT: s_getpc_b64 s[8:9]
; GFX9-NEXT: s_add_u32 s8, s8, snork@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s9, s9, snork@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0
; GFX9-NEXT: s_mov_b64 s[8:9], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s4, 1, s7
; GFX9-NEXT: s_cmp_eq_u32 s4, 1
; GFX9-NEXT: v_mov_b32_e32 v31, v0
; GFX9-NEXT: s_cselect_b32 s5, s13, s11
; GFX9-NEXT: s_cselect_b32 s4, s12, s10
; GFX9-NEXT: s_mov_b32 s12, s6
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: indirect_call_known_no_special_inputs:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_getpc_b64 s[4:5]
; GFX12-NEXT: s_sext_i32_i16 s5, s5
; GFX12-NEXT: s_add_co_u32 s4, s4, snork@gotpcrel32@lo+8
; GFX12-NEXT: s_add_co_ci_u32 s5, s5, snork@gotpcrel32@hi+16
; GFX12-NEXT: s_mov_b64 s[2:3], 0
; GFX12-NEXT: s_getpc_b64 s[6:7]
; GFX12-NEXT: s_sext_i32_i16 s7, s7
; GFX12-NEXT: s_add_co_u32 s6, s6, wobble@gotpcrel32@lo+8
; GFX12-NEXT: s_add_co_ci_u32 s7, s7, wobble@gotpcrel32@hi+16
; GFX12-NEXT: s_load_u8 s1, s[2:3], 0x0
; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX12-NEXT: s_load_b64 s[4:5], s[6:7], 0x0
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 0
; GFX12-NEXT: v_mov_b32_e32 v31, v0
; GFX12-NEXT: s_mov_b64 s[8:9], 0
; GFX12-NEXT: s_mov_b32 s12, s0
; GFX12-NEXT: s_mov_b32 s32, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_and_b32 s1, 1, s1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_cmp_eq_u32 s1, 1
; GFX12-NEXT: s_cselect_b32 s3, s5, s3
; GFX12-NEXT: s_cselect_b32 s2, s4, s2
; GFX12-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX12-NEXT: s_endpgm
bb:
%cond = load i1, ptr addrspace(4) null
@@ -46,19 +75,37 @@ bb:
}
define void @wobble() {
; CHECK-LABEL: wobble:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
; GFX9-LABEL: wobble:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: wobble:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
ret void
}
define void @snork() {
; CHECK-LABEL: snork:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
; GFX9-LABEL: snork:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: snork:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
ret void
}