From e079e365062cf7a8430ab12a4354b0d02cc1c052 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Thu, 2 May 2019 04:26:35 +0000 Subject: [PATCH] [AMDGPU] gfx1010 lost VOP2 forms of some add/sub Add legalization of V_ADD_I32, V_SUB_I32, V_SUBREV_I32. Differential Revision: llvm-svn: 359757 --- lib/Target/AMDGPU/SIISelLowering.cpp | 27 ++++++++++++++++++ test/CodeGen/AMDGPU/mad.u16.ll | 38 +++++++++++++++++++++++++ test/CodeGen/AMDGPU/min.ll | 42 +++++++++++++++------------- 3 files changed, 88 insertions(+), 19 deletions(-) create mode 100644 test/CodeGen/AMDGPU/mad.u16.ll diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 2f1752d69bb..d876acd7eae 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -3540,6 +3540,33 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( MI.eraseFromParent(); return BB; } + case AMDGPU::V_ADD_I32_e32: + case AMDGPU::V_SUB_I32_e32: + case AMDGPU::V_SUBREV_I32_e32: { + // TODO: Define distinct V_*_I32_Pseudo instructions instead. 
+ const DebugLoc &DL = MI.getDebugLoc(); + unsigned Opc = MI.getOpcode(); + + bool NeedClampOperand = false; + if (TII->pseudoToMCOpcode(Opc) == -1) { + Opc = AMDGPU::getVOPe64(Opc); + NeedClampOperand = true; + } + + auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg()); + if (TII->isVOP3(*I)) { + I.addReg(AMDGPU::VCC, RegState::Define); + } + I.add(MI.getOperand(1)) + .add(MI.getOperand(2)); + if (NeedClampOperand) + I.addImm(0); // clamp bit for e64 encoding + + TII->legalizeOperands(*I); + + MI.eraseFromParent(); + return BB; + } default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); } diff --git a/test/CodeGen/AMDGPU/mad.u16.ll b/test/CodeGen/AMDGPU/mad.u16.ll new file mode 100644 index 00000000000..e93ee2ceee6 --- /dev/null +++ b/test/CodeGen/AMDGPU/mad.u16.ll @@ -0,0 +1,38 @@ +; RUN: llc -march=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX10 %s + +; FIXME: GFX9 should be producing v_mad_u16 instead of v_mad_legacy_u16. 
+ +; GCN-LABEL: {{^}}mad_u16 +; GCN: {{flat|global}}_load_ushort v[[A:[0-9]+]] +; GCN: {{flat|global}}_load_ushort v[[B:[0-9]+]] +; GCN: {{flat|global}}_load_ushort v[[C:[0-9]+]] +; GFX8: v_mad_u16 v[[R:[0-9]+]], v[[A]], v[[B]], v[[C]] +; GFX9: v_mad_legacy_u16 v[[R:[0-9]+]], v[[A]], v[[B]], v[[C]] +; GFX10: v_mad_u16 v[[R:[0-9]+]], v[[A]], v[[B]], v[[C]] +; GCN: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[R]] +; GCN: s_endpgm +define amdgpu_kernel void @mad_u16( + i16 addrspace(1)* %r, + i16 addrspace(1)* %a, + i16 addrspace(1)* %b, + i16 addrspace(1)* %c) { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a, i32 %tid + %b.gep = getelementptr inbounds i16, i16 addrspace(1)* %b, i32 %tid + %c.gep = getelementptr inbounds i16, i16 addrspace(1)* %c, i32 %tid + + %a.val = load volatile i16, i16 addrspace(1)* %a.gep + %b.val = load volatile i16, i16 addrspace(1)* %b.gep + %c.val = load volatile i16, i16 addrspace(1)* %c.gep + + %m.val = mul i16 %a.val, %b.val + %r.val = add i16 %m.val, %c.val + + store i16 %r.val, i16 addrspace(1)* %r + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/test/CodeGen/AMDGPU/min.ll b/test/CodeGen/AMDGPU/min.ll index cc772a6e913..2a16be4d6eb 100644 --- a/test/CodeGen/AMDGPU/min.ll +++ b/test/CodeGen/AMDGPU/min.ll @@ -1,6 +1,7 @@ ; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s +; RUN: llc -march=amdgcn 
-mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX8_9 -check-prefix=GFX8_9_10 -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX8_9 -check-prefix=GFX9_10 -check-prefix=GFX8_9_10 -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX10 -check-prefix=GFX9_10 -check-prefix=GFX8_9_10 -check-prefix=FUNC %s ; RUN: llc -march=r600 -mtriple=r600-- -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}v_test_imin_sle_i32: @@ -74,8 +75,9 @@ define amdgpu_kernel void @s_test_imin_sle_i8(i8 addrspace(1)* %out, [8 x i32], ; FIXME: Why vector and sdwa for last element? ; FUNC-LABEL: {{^}}s_test_imin_sle_v4i8: -; GCN: s_load_dword s -; GCN: s_load_dword s +; GCN-DAG: s_load_dwordx2 +; GCN-DAG: s_load_dword s +; GCN-DAG: s_load_dword s ; GCN-NOT: _load_ ; SI: s_min_i32 @@ -88,10 +90,10 @@ define amdgpu_kernel void @s_test_imin_sle_i8(i8 addrspace(1)* %out, [8 x i32], ; VI: s_min_i32 ; VI: v_min_i32_sdwa -; GFX9: v_min_i16 -; GFX9: v_min_i16 -; GFX9: v_min_i16 -; GFX9: v_min_i16 +; GFX9_10: v_min_i16 +; GFX9_10: v_min_i16 +; GFX9_10: v_min_i16 +; GFX9_10: v_min_i16 ; EG: MIN_INT ; EG: MIN_INT @@ -120,7 +122,7 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, [8 ; VI: s_min_i32 ; VI: s_min_i32 -; GFX9: v_pk_min_i16 +; GFX9_10: v_pk_min_i16 ; EG: MIN_INT ; EG: MIN_INT @@ -143,8 +145,8 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, < ; VI: s_min_i32 ; VI: s_min_i32 -; GFX9: v_pk_min_i16 -; GFX9: v_pk_min_i16 +; GFX9_10: v_pk_min_i16 +; GFX9_10: v_pk_min_i16 ; EG: MIN_INT ; EG: MIN_INT @@ -177,7 +179,8 @@ define 
amdgpu_kernel void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrs ; FUNC-LABEL: @v_test_imin_slt_i16 ; SI: v_min_i32_e32 -; GFX89: v_min_i16_e32 +; GFX8_9: v_min_i16_e32 +; GFX10: v_min_i16_e64 ; EG: MIN_INT define amdgpu_kernel void @v_test_imin_slt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) #0 { @@ -293,8 +296,8 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(<3 x i32> addrspace(1)* %out, < ; VI: v_min_u16_e32 ; VI-NOT: v_min_u16 -; GFX9: v_pk_min_u16 -; GFX9: v_pk_min_u16 +; GFX9_10: v_pk_min_u16 +; GFX9_10: v_pk_min_u16 ; GCN: s_endpgm @@ -348,9 +351,10 @@ define amdgpu_kernel void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrs ; SI: {{buffer|flat|global}}_load_ubyte ; SI: v_min_u32_e32 -; GFX89: {{flat|global}}_load_ubyte -; GFX89: {{flat|global}}_load_ubyte -; GFX89: v_min_u16_e32 +; GFX8_9_10: {{flat|global}}_load_ubyte +; GFX8_9_10: {{flat|global}}_load_ubyte +; GFX8_9: v_min_u16_e32 +; GFX10: v_min_u16_e64 ; EG: MIN_UINT define amdgpu_kernel void @v_test_umin_ult_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %a.ptr, i8 addrspace(1)* %b.ptr) #0 { @@ -597,7 +601,7 @@ define amdgpu_kernel void @test_imin_sle_i64(i64 addrspace(1)* %out, i64 %a, i64 ; VI: v_min_i16 ; VI: v_min_i16 -; GFX9: v_pk_min_i16 +; GFX9_10: v_pk_min_i16 ; EG: MIN_INT ; EG: MIN_INT @@ -622,7 +626,7 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, < ; VI: v_min_u16 ; VI: v_min_u16 -; GFX9: v_pk_min_u16 +; GFX9_10: v_pk_min_u16 ; EG: MIN_UINT ; EG: MIN_UINT