mirror of
https://github.com/capstone-engine/llvm-capstone.git
synced 2025-04-02 21:22:44 +00:00
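AMDGPU: Fix d16 loads/stores not being converted to the offset variant
Fixes a missed optimization with the new MUBUF instructions: d16 loads and stores using the offen variant were not being converted to the offset variant. llvm-svn: 318106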
This commit is contained in:
parent
c8434103d0
commit
4b7938c658
@ -435,6 +435,10 @@ static int getOffsetMUBUFStore(unsigned Opc) {
|
|||||||
return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
|
return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
|
||||||
case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
|
case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
|
||||||
return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
|
return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
|
||||||
|
case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
|
||||||
|
return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
|
||||||
|
case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
|
||||||
|
return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
|
||||||
default:
|
default:
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
@ -456,6 +460,18 @@ static int getOffsetMUBUFLoad(unsigned Opc) {
|
|||||||
return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
|
return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
|
||||||
case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
|
case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
|
||||||
return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
|
return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
|
||||||
|
case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
|
||||||
|
return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
|
||||||
|
case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
|
||||||
|
return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
|
||||||
|
case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
|
||||||
|
return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
|
||||||
|
case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
|
||||||
|
return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
|
||||||
|
case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
|
||||||
|
return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
|
||||||
|
case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
|
||||||
|
return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
|
||||||
default:
|
default:
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
@ -479,7 +495,7 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
|
|||||||
return false;
|
return false;
|
||||||
|
|
||||||
const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
|
const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
|
||||||
BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
|
MachineInstrBuilder NewMI = BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
|
||||||
.add(*Reg)
|
.add(*Reg)
|
||||||
.add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
|
.add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
|
||||||
.add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
|
.add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
|
||||||
@ -488,6 +504,11 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
|
|||||||
.addImm(0) // slc
|
.addImm(0) // slc
|
||||||
.addImm(0) // tfe
|
.addImm(0) // tfe
|
||||||
.setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
|
.setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
|
||||||
|
|
||||||
|
const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
|
||||||
|
AMDGPU::OpName::vdata_in);
|
||||||
|
if (VDataIn)
|
||||||
|
NewMI.add(*VDataIn);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
|
; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
|
||||||
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
|
; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}load_local_hi_v2i16_undeflo:
|
; GCN-LABEL: {{^}}load_local_hi_v2i16_undeflo:
|
||||||
; GCN: s_waitcnt
|
; GCN: s_waitcnt
|
||||||
@ -503,6 +503,62 @@ entry:
|
|||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
; A local object has a known offset, so the access must be converted
|
||||||
|
; from the offen variant to the offset variant.
|
||||||
|
|
||||||
|
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_to_offset:
|
||||||
|
; GFX9: buffer_store_dword
|
||||||
|
; GFX9-NEXT: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s5 offset:4094
|
||||||
|
define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg) #0 {
|
||||||
|
entry:
|
||||||
|
%obj0 = alloca [10 x i32], align 4
|
||||||
|
%obj1 = alloca [4096 x i16], align 2
|
||||||
|
%bc = bitcast [10 x i32]* %obj0 to i32*
|
||||||
|
store volatile i32 123, i32* %bc
|
||||||
|
%gep = getelementptr inbounds [4096 x i16], [4096 x i16]* %obj1, i32 0, i32 2025
|
||||||
|
%load = load i16, i16* %gep
|
||||||
|
%build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
|
||||||
|
%build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
|
||||||
|
store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8_to_offset:
|
||||||
|
; GFX9: buffer_store_dword
|
||||||
|
; GFX9-NEXT: buffer_load_sbyte_d16_hi v{{[0-9]+}}, off, s[0:3], s5 offset:4095
|
||||||
|
define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg) #0 {
|
||||||
|
entry:
|
||||||
|
%obj0 = alloca [10 x i32], align 4
|
||||||
|
%obj1 = alloca [4096 x i8], align 2
|
||||||
|
%bc = bitcast [10 x i32]* %obj0 to i32*
|
||||||
|
store volatile i32 123, i32* %bc
|
||||||
|
%gep = getelementptr inbounds [4096 x i8], [4096 x i8]* %obj1, i32 0, i32 4051
|
||||||
|
%load = load i8, i8* %gep
|
||||||
|
%ext = sext i8 %load to i16
|
||||||
|
%build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
|
||||||
|
%build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
|
||||||
|
store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8_to_offset:
|
||||||
|
; GFX9: buffer_store_dword
|
||||||
|
; GFX9-NEXT: buffer_load_ubyte_d16_hi v{{[0-9]+}}, off, s[0:3], s5 offset:4095
|
||||||
|
define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg) #0 {
|
||||||
|
entry:
|
||||||
|
%obj0 = alloca [10 x i32], align 4
|
||||||
|
%obj1 = alloca [4096 x i8], align 2
|
||||||
|
%bc = bitcast [10 x i32]* %obj0 to i32*
|
||||||
|
store volatile i32 123, i32* %bc
|
||||||
|
%gep = getelementptr inbounds [4096 x i8], [4096 x i8]* %obj1, i32 0, i32 4051
|
||||||
|
%load = load i8, i8* %gep
|
||||||
|
%ext = zext i8 %load to i16
|
||||||
|
%build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
|
||||||
|
%build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
|
||||||
|
store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
; FIXME: Remove m0 init and waitcnt between reads
|
; FIXME: Remove m0 init and waitcnt between reads
|
||||||
; FIXME: Is there a cost to using the extload over not?
|
; FIXME: Is there a cost to using the extload over not?
|
||||||
; GCN-LABEL: {{^}}load_local_v2i16_split:
|
; GCN-LABEL: {{^}}load_local_v2i16_split:
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
|
; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
|
||||||
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
|
; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}load_local_lo_v2i16_undeflo:
|
; GCN-LABEL: {{^}}load_local_lo_v2i16_undeflo:
|
||||||
; GCN: s_waitcnt
|
; GCN: s_waitcnt
|
||||||
@ -588,4 +588,63 @@ entry:
|
|||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_to_offset:
|
||||||
|
; GFX9: buffer_store_dword
|
||||||
|
; GFX9-NEXT: buffer_load_short_d16 v0, off, s[0:3], s5 offset:4094
|
||||||
|
|
||||||
|
; VI: buffer_load_ushort v
|
||||||
|
define void @load_private_lo_v2i16_reglo_vreg_to_offset(i32 %reg) #0 {
|
||||||
|
entry:
|
||||||
|
%obj0 = alloca [10 x i32], align 4
|
||||||
|
%obj1 = alloca [4096 x i16], align 2
|
||||||
|
%reg.bc = bitcast i32 %reg to <2 x i16>
|
||||||
|
%bc = bitcast [10 x i32]* %obj0 to i32*
|
||||||
|
store volatile i32 123, i32* %bc
|
||||||
|
%gep = getelementptr inbounds [4096 x i16], [4096 x i16]* %obj1, i32 0, i32 2025
|
||||||
|
%load = load volatile i16, i16* %gep
|
||||||
|
%build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
|
||||||
|
store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_sexti8_to_offset:
|
||||||
|
; GFX9: buffer_store_dword
|
||||||
|
; GFX9-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s5 offset:4095
|
||||||
|
|
||||||
|
; VI: buffer_load_sbyte v
|
||||||
|
define void @load_private_lo_v2i16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
|
||||||
|
entry:
|
||||||
|
%obj0 = alloca [10 x i32], align 4
|
||||||
|
%obj1 = alloca [4096 x i8], align 2
|
||||||
|
%reg.bc = bitcast i32 %reg to <2 x i16>
|
||||||
|
%bc = bitcast [10 x i32]* %obj0 to i32*
|
||||||
|
store volatile i32 123, i32* %bc
|
||||||
|
%gep = getelementptr inbounds [4096 x i8], [4096 x i8]* %obj1, i32 0, i32 4051
|
||||||
|
%load = load volatile i8, i8* %gep
|
||||||
|
%load.ext = sext i8 %load to i16
|
||||||
|
%build1 = insertelement <2 x i16> %reg.bc, i16 %load.ext, i32 0
|
||||||
|
store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_zexti8_to_offset:
|
||||||
|
; GFX9: buffer_store_dword
|
||||||
|
; GFX9-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s5 offset:4095
|
||||||
|
|
||||||
|
; VI: buffer_load_ubyte v
|
||||||
|
define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
|
||||||
|
entry:
|
||||||
|
%obj0 = alloca [10 x i32], align 4
|
||||||
|
%obj1 = alloca [4096 x i8], align 2
|
||||||
|
%reg.bc = bitcast i32 %reg to <2 x i16>
|
||||||
|
%bc = bitcast [10 x i32]* %obj0 to i32*
|
||||||
|
store volatile i32 123, i32* %bc
|
||||||
|
%gep = getelementptr inbounds [4096 x i8], [4096 x i8]* %obj1, i32 0, i32 4051
|
||||||
|
%load = load volatile i8, i8* %gep
|
||||||
|
%load.ext = zext i8 %load to i16
|
||||||
|
%build1 = insertelement <2 x i16> %reg.bc, i16 %load.ext, i32 0
|
||||||
|
store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
attributes #0 = { nounwind }
|
attributes #0 = { nounwind }
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
|
; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
|
||||||
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
|
; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}store_global_hi_v2i16:
|
; GCN-LABEL: {{^}}store_global_hi_v2i16:
|
||||||
; GCN: s_waitcnt
|
; GCN: s_waitcnt
|
||||||
@ -591,4 +591,39 @@ entry:
|
|||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
; GCN-LABEL: {{^}}store_private_hi_v2i16_to_offset:
|
||||||
|
; GCN: s_waitcnt
|
||||||
|
; GFX9: buffer_store_dword
|
||||||
|
; GFX9-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s5 offset:4094
|
||||||
|
define void @store_private_hi_v2i16_to_offset(i32 %arg) #0 {
|
||||||
|
entry:
|
||||||
|
%obj0 = alloca [10 x i32], align 4
|
||||||
|
%obj1 = alloca [4096 x i16], align 2
|
||||||
|
%bc = bitcast [10 x i32]* %obj0 to i32*
|
||||||
|
store volatile i32 123, i32* %bc
|
||||||
|
%value = bitcast i32 %arg to <2 x i16>
|
||||||
|
%hi = extractelement <2 x i16> %value, i32 1
|
||||||
|
%gep = getelementptr inbounds [4096 x i16], [4096 x i16]* %obj1, i32 0, i32 2025
|
||||||
|
store i16 %hi, i16* %gep
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; GCN-LABEL: {{^}}store_private_hi_v2i16_i8_to_offset:
|
||||||
|
; GCN: s_waitcnt
|
||||||
|
; GFX9: buffer_store_dword
|
||||||
|
; GFX9-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s5 offset:4095
|
||||||
|
define void @store_private_hi_v2i16_i8_to_offset(i32 %arg) #0 {
|
||||||
|
entry:
|
||||||
|
%obj0 = alloca [10 x i32], align 4
|
||||||
|
%obj1 = alloca [4096 x i8], align 2
|
||||||
|
%bc = bitcast [10 x i32]* %obj0 to i32*
|
||||||
|
store volatile i32 123, i32* %bc
|
||||||
|
%value = bitcast i32 %arg to <2 x i16>
|
||||||
|
%hi = extractelement <2 x i16> %value, i32 1
|
||||||
|
%gep = getelementptr inbounds [4096 x i8], [4096 x i8]* %obj1, i32 0, i32 4051
|
||||||
|
%trunc = trunc i16 %hi to i8
|
||||||
|
store i8 %trunc, i8* %gep
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
attributes #0 = { nounwind }
|
attributes #0 = { nounwind }
|
||||||
|
Loading…
x
Reference in New Issue
Block a user