diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 1983e9f8d4af..69fdeaebe0a0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -13,9 +13,11 @@
 //===----------------------------------------------------------------------===//

 #include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/UniformityAnalysis.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstVisitor.h"
 #include "llvm/InitializePasses.h"
@@ -58,6 +60,7 @@ public:
   }

   void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetPassConfig>();
     AU.addRequired<AssumptionCacheTracker>();
     AU.addRequired<UniformityInfoWrapperPass>();
     AU.setPreservesAll();
@@ -90,7 +93,11 @@ bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
   if (skipFunction(F))
     return false;

-  // TODO: Skip this on GFX12 which does have scalar sub-dword loads.
+  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
+  const TargetMachine &TM = TPC.getTM<TargetMachine>();
+  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+  if (ST.hasScalarSubwordLoads())
+    return false;

   AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
@@ -181,6 +188,7 @@ bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {

 INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                       "AMDGPU IR late optimizations", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
 INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-late-codegenprepare.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-late-codegenprepare.ll
index 0de0ac7b77a7..83016f1d2d3c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-late-codegenprepare.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-late-codegenprepare.ll
@@ -1,15 +1,21 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-late-codegenprepare %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-late-codegenprepare %s | FileCheck %s -check-prefix=GFX9
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-late-codegenprepare %s | FileCheck %s -check-prefix=GFX12

 ; Make sure we don't crash when trying to create a bitcast between
 ; address spaces
 define amdgpu_kernel void @constant_from_offset_cast_generic_null() {
-; CHECK-LABEL: @constant_from_offset_cast_generic_null(
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr null to ptr addrspace(4)), i64 4), align 4
-; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
-; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
-; CHECK-NEXT: store i8 [[TMP3]], ptr addrspace(1) undef, align 1
-; CHECK-NEXT: ret void
+; GFX9-LABEL: @constant_from_offset_cast_generic_null(
+; GFX9-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr null to ptr addrspace(4)), i64 4), align 4
+; GFX9-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
+; GFX9-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
+; GFX9-NEXT: store i8 [[TMP3]], ptr addrspace(1) undef, align 1
+; GFX9-NEXT: ret void
+;
+; GFX12-LABEL: @constant_from_offset_cast_generic_null(
+; GFX12-NEXT: [[LOAD:%.*]] = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr null to ptr addrspace(4)), i64 6), align 1
+; GFX12-NEXT: store i8 [[LOAD]], ptr addrspace(1) undef, align 1
+; GFX12-NEXT: ret void
 ;
   %load = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr null to ptr addrspace(4)), i64 6), align 1
   store i8 %load, ptr addrspace(1) undef
@@ -17,12 +23,17 @@ define amdgpu_kernel void @constant_from_offset_cast_generic_null() {
 }

 define amdgpu_kernel void @constant_from_offset_cast_global_null() {
-; CHECK-LABEL: @constant_from_offset_cast_global_null(
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) null to ptr addrspace(4)), i64 4), align 4
-; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
-; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
-; CHECK-NEXT: store i8 [[TMP3]], ptr addrspace(1) undef, align 1
-; CHECK-NEXT: ret void
+; GFX9-LABEL: @constant_from_offset_cast_global_null(
+; GFX9-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) null to ptr addrspace(4)), i64 4), align 4
+; GFX9-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
+; GFX9-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
+; GFX9-NEXT: store i8 [[TMP3]], ptr addrspace(1) undef, align 1
+; GFX9-NEXT: ret void
+;
+; GFX12-LABEL: @constant_from_offset_cast_global_null(
+; GFX12-NEXT: [[LOAD:%.*]] = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) null to ptr addrspace(4)), i64 6), align 1
+; GFX12-NEXT: store i8 [[LOAD]], ptr addrspace(1) undef, align 1
+; GFX12-NEXT: ret void
 ;
   %load = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) null to ptr addrspace(4)), i64 6), align 1
   store i8 %load, ptr addrspace(1) undef
@@ -32,12 +43,17 @@ define amdgpu_kernel void @constant_from_offset_cast_global_null() {
 }

 @gv = unnamed_addr addrspace(1) global [64 x i8] undef, align 4

 define amdgpu_kernel void @constant_from_offset_cast_global_gv() {
-; CHECK-LABEL: @constant_from_offset_cast_global_gv(
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) @gv to ptr addrspace(4)), i64 4), align 4
-; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
-; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
-; CHECK-NEXT: store i8 [[TMP3]], ptr addrspace(1) undef, align 1
-; CHECK-NEXT: ret void
+; GFX9-LABEL: @constant_from_offset_cast_global_gv(
+; GFX9-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) @gv to ptr addrspace(4)), i64 4), align 4
+; GFX9-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
+; GFX9-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
+; GFX9-NEXT: store i8 [[TMP3]], ptr addrspace(1) undef, align 1
+; GFX9-NEXT: ret void
+;
+; GFX12-LABEL: @constant_from_offset_cast_global_gv(
+; GFX12-NEXT: [[LOAD:%.*]] = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) @gv to ptr addrspace(4)), i64 6), align 1
+; GFX12-NEXT: store i8 [[LOAD]], ptr addrspace(1) undef, align 1
+; GFX12-NEXT: ret void
 ;
   %load = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) @gv to ptr addrspace(4)), i64 6), align 1
   store i8 %load, ptr addrspace(1) undef
@@ -45,12 +61,17 @@ define amdgpu_kernel void @constant_from_offset_cast_global_gv() {
 }

 define amdgpu_kernel void @constant_from_offset_cast_generic_inttoptr() {
-; CHECK-LABEL: @constant_from_offset_cast_generic_inttoptr(
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr inttoptr (i64 128 to ptr) to ptr addrspace(4)), i64 4), align 4
-; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
-; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
-; CHECK-NEXT: store i8 [[TMP3]], ptr addrspace(1) undef, align 1
-; CHECK-NEXT: ret void
+; GFX9-LABEL: @constant_from_offset_cast_generic_inttoptr(
+; GFX9-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr inttoptr (i64 128 to ptr) to ptr addrspace(4)), i64 4), align 4
+; GFX9-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
+; GFX9-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
+; GFX9-NEXT: store i8 [[TMP3]], ptr addrspace(1) undef, align 1
+; GFX9-NEXT: ret void
+;
+; GFX12-LABEL: @constant_from_offset_cast_generic_inttoptr(
+; GFX12-NEXT: [[LOAD:%.*]] = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr inttoptr (i64 128 to ptr) to ptr addrspace(4)), i64 6), align 1
+; GFX12-NEXT: store i8 [[LOAD]], ptr addrspace(1) undef, align 1
+; GFX12-NEXT: ret void
 ;
   %load = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr inttoptr (i64 128 to ptr) to ptr addrspace(4)), i64 6), align 1
   store i8 %load, ptr addrspace(1) undef
@@ -58,10 +79,15 @@ define amdgpu_kernel void @constant_from_offset_cast_generic_inttoptr() {
 }

 define amdgpu_kernel void @constant_from_inttoptr() {
-; CHECK-LABEL: @constant_from_inttoptr(
-; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr addrspace(4) inttoptr (i64 128 to ptr addrspace(4)), align 4
-; CHECK-NEXT: store i8 [[LOAD]], ptr addrspace(1) undef, align 1
-; CHECK-NEXT: ret void
+; GFX9-LABEL: @constant_from_inttoptr(
+; GFX9-NEXT: [[LOAD:%.*]] = load i8, ptr addrspace(4) inttoptr (i64 128 to ptr addrspace(4)), align 4
+; GFX9-NEXT: store i8 [[LOAD]], ptr addrspace(1) undef, align 1
+; GFX9-NEXT: ret void
+;
+; GFX12-LABEL: @constant_from_inttoptr(
+; GFX12-NEXT: [[LOAD:%.*]] = load i8, ptr addrspace(4) inttoptr (i64 128 to ptr addrspace(4)), align 1
+; GFX12-NEXT: store i8 [[LOAD]], ptr addrspace(1) undef, align 1
+; GFX12-NEXT: ret void
 ;
   %load = load i8, ptr addrspace(4) inttoptr (i64 128 to ptr addrspace(4)), align 1
   store i8 %load, ptr addrspace(1) undef
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
index fe7323eeadf8..9965d214cc9b 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s -check-prefix=GFX9
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12

 ; We have an indirect call with a known set of callees, which are
 ; known to not need any special inputs. The ABI still needs to use the
@@ -8,35 +9,63 @@

 ; FIXME: Passing real values for workitem ID, and 0s that can be undef
 define amdgpu_kernel void @indirect_call_known_no_special_inputs() {
-; CHECK-LABEL: indirect_call_known_no_special_inputs:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_add_u32 flat_scratch_lo, s4, s7
-; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
-; CHECK-NEXT: s_add_u32 s0, s0, s7
-; CHECK-NEXT: s_addc_u32 s1, s1, 0
-; CHECK-NEXT: s_mov_b64 s[4:5], 0
-; CHECK-NEXT: s_load_dword s7, s[4:5], 0x0
-; CHECK-NEXT: s_getpc_b64 s[4:5]
-; CHECK-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4
-; CHECK-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12
-; CHECK-NEXT: s_getpc_b64 s[8:9]
-; CHECK-NEXT: s_add_u32 s8, s8, snork@gotpcrel32@lo+4
-; CHECK-NEXT: s_addc_u32 s9, s9, snork@gotpcrel32@hi+12
-; CHECK-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
-; CHECK-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0
-; CHECK-NEXT: s_mov_b64 s[8:9], 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_and_b32 s4, 1, s7
-; CHECK-NEXT: s_cmp_eq_u32 s4, 1
-; CHECK-NEXT: v_mov_b32_e32 v31, v0
-; CHECK-NEXT: s_cselect_b32 s5, s13, s11
-; CHECK-NEXT: s_cselect_b32 s4, s12, s10
-; CHECK-NEXT: s_mov_b32 s12, s6
-; CHECK-NEXT: v_mov_b32_e32 v1, 0
-; CHECK-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-NEXT: s_mov_b32 s32, 0
-; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; CHECK-NEXT: s_endpgm
+; GFX9-LABEL: indirect_call_known_no_special_inputs:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s4, s7
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
+; GFX9-NEXT: s_add_u32 s0, s0, s7
+; GFX9-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: s_load_dword s7, s[4:5], 0x0
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12
+; GFX9-NEXT: s_getpc_b64 s[8:9]
+; GFX9-NEXT: s_add_u32 s8, s8, snork@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s9, s9, snork@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0
+; GFX9-NEXT: s_mov_b64 s[8:9], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_and_b32 s4, 1, s7
+; GFX9-NEXT: s_cmp_eq_u32 s4, 1
+; GFX9-NEXT: v_mov_b32_e32 v31, v0
+; GFX9-NEXT: s_cselect_b32 s5, s13, s11
+; GFX9-NEXT: s_cselect_b32 s4, s12, s10
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_mov_b32 s32, 0
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT: s_endpgm
+;
+; GFX12-LABEL: indirect_call_known_no_special_inputs:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_getpc_b64 s[4:5]
+; GFX12-NEXT: s_sext_i32_i16 s5, s5
+; GFX12-NEXT: s_add_co_u32 s4, s4, snork@gotpcrel32@lo+8
+; GFX12-NEXT: s_add_co_ci_u32 s5, s5, snork@gotpcrel32@hi+16
+; GFX12-NEXT: s_mov_b64 s[2:3], 0
+; GFX12-NEXT: s_getpc_b64 s[6:7]
+; GFX12-NEXT: s_sext_i32_i16 s7, s7
+; GFX12-NEXT: s_add_co_u32 s6, s6, wobble@gotpcrel32@lo+8
+; GFX12-NEXT: s_add_co_ci_u32 s7, s7, wobble@gotpcrel32@hi+16
+; GFX12-NEXT: s_load_u8 s1, s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-NEXT: s_load_b64 s[4:5], s[6:7], 0x0
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 0
+; GFX12-NEXT: v_mov_b32_e32 v31, v0
+; GFX12-NEXT: s_mov_b64 s[8:9], 0
+; GFX12-NEXT: s_mov_b32 s12, s0
+; GFX12-NEXT: s_mov_b32 s32, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_and_b32 s1, 1, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_cmp_eq_u32 s1, 1
+; GFX12-NEXT: s_cselect_b32 s3, s5, s3
+; GFX12-NEXT: s_cselect_b32 s2, s4, s2
+; GFX12-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX12-NEXT: s_endpgm
 bb:
   %cond = load i1, ptr addrspace(4) null
@@ -46,19 +75,37 @@ bb:
   %tmp = select i1 %cond, ptr @wobble, ptr @snork
   call void %tmp()
   ret void
 }

 define void @wobble() {
-; CHECK-LABEL: wobble:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: wobble:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: wobble:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
 bb:
   ret void
 }

 define void @snork() {
-; CHECK-LABEL: snork:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: snork:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: snork:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
 bb:
   ret void
 }
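
Background on the mechanism used in runOnFunction above: a legacy-pass-manager IR pass cannot reach the subtarget directly, so it declares a dependency on TargetPassConfig, pulls the TargetMachine out of it, and queries the subtarget per function. The following is a minimal self-contained sketch of that idiom, not code from this patch; MyGCNPass is a hypothetical stand-in for AMDGPULateCodeGenPrepare.

    // Sketch of the legacy-PM subtarget query idiom. "MyGCNPass" is a
    // hypothetical pass name used only for illustration.
    #include "llvm/CodeGen/TargetPassConfig.h"
    #include "llvm/CodeGen/TargetSubtargetInfo.h"
    #include "llvm/IR/Function.h"
    #include "llvm/Pass.h"
    #include "llvm/Target/TargetMachine.h"

    using namespace llvm;

    namespace {
    class MyGCNPass : public FunctionPass {
    public:
      static char ID;
      MyGCNPass() : FunctionPass(ID) {}

      void getAnalysisUsage(AnalysisUsage &AU) const override {
        // Required so that getAnalysis<TargetPassConfig>() below is legal;
        // the pass registration must also list
        // INITIALIZE_PASS_DEPENDENCY(TargetPassConfig), as the patch does.
        AU.addRequired<TargetPassConfig>();
        AU.setPreservesAll();
      }

      bool runOnFunction(Function &F) override {
        // TargetPassConfig is the bridge from an IR pass to the TargetMachine.
        const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
        const TargetMachine &TM = TPC.getTM<TargetMachine>();
        // The subtarget is queried per function because target-features
        // attributes can differ between functions in one module.
        const TargetSubtargetInfo *ST = TM.getSubtargetImpl(F);
        (void)ST; // e.g. bail out early based on a subtarget feature
        return false;
      }
    };
    } // end anonymous namespace

    char MyGCNPass::ID = 0;

The patch itself uses the GCN-specific accessor TM.getSubtarget<GCNSubtarget>(F) and returns early when hasScalarSubwordLoads() is true: on GFX12 the scalar unit can load sub-dword values directly (note the s_load_u8 in the GFX12 check lines above), so widening uniform i8/i16 loads to i32 plus lshr/trunc, as the GFX9 check lines still show, is no longer profitable there.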