mirror of
https://github.com/RPCSX/llvm.git
synced 2025-04-12 21:18:23 +00:00

An encoding does not allow to use SDWA in an instruction with scalar operands, either literals or SGPRs. That is however possible to copy these operands into a VGPR first. Several copies of the value are produced if multiple SDWA conversions were done. To cleanup MachineLICM (to hoist copies out of loops), MachineCSE (to remove duplicate copies) and SIFoldOperands (to replace SGPR to VGPR copy with immediate copy right to the VGPR) runs are added after the SDWA pass. Differential Revision: https://reviews.llvm.org/D33583 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@304219 91177308-0d34-0410-b5e6-96231b3b80d8
254 lines
10 KiB
LLVM
254 lines
10 KiB
LLVM
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
|
|
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
|
|
|
|
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
|
|
declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
|
|
|
|
; GCN-LABEL: {{^}}load_i8_to_f32:
|
|
; GCN: buffer_load_ubyte [[LOADREG:v[0-9]+]],
|
|
; GCN-NOT: bfe
|
|
; GCN-NOT: lshr
|
|
; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]]
|
|
; GCN: buffer_store_dword [[CONV]],
|
|
define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
|
|
%load = load i8, i8 addrspace(1)* %in, align 1
|
|
%cvt = uitofp i8 %load to float
|
|
store float %cvt, float addrspace(1)* %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}load_v2i8_to_v2f32:
|
|
; GCN: buffer_load_ushort [[LD:v[0-9]+]]
|
|
; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LD]]
|
|
; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LD]]
|
|
; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
|
|
define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
|
|
%load = load <2 x i8>, <2 x i8> addrspace(1)* %in, align 2
|
|
%cvt = uitofp <2 x i8> %load to <2 x float>
|
|
store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16
|
|
ret void
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}load_v3i8_to_v3f32:
|
|
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
|
|
; GCN-NOT: v_cvt_f32_ubyte3_e32
|
|
; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[VAL]]
|
|
; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[VAL]]
|
|
; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]]
|
|
; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
|
|
define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
|
|
%load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
|
|
%cvt = uitofp <3 x i8> %load to <3 x float>
|
|
store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16
|
|
ret void
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}load_v4i8_to_v4f32:
|
|
; GCN: buffer_load_dword [[LOADREG:v[0-9]+]]
|
|
; GCN-NOT: bfe
|
|
; GCN-NOT: lshr
|
|
; GCN-DAG: v_cvt_f32_ubyte3_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
|
|
; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[LOADREG]]
|
|
; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[LOADREG]]
|
|
; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
|
|
; GCN: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
|
|
define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
|
|
%load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
|
|
%cvt = uitofp <4 x i8> %load to <4 x float>
|
|
store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
|
|
ret void
|
|
}
|
|
|
|
; This should not be adding instructions to shift into the correct
|
|
; position in the word for the component.
|
|
|
|
; FIXME: Packing bytes
|
|
; GCN-LABEL: {{^}}load_v4i8_to_v4f32_unaligned:
|
|
; GCN: buffer_load_ubyte [[LOADREG3:v[0-9]+]]
|
|
; GCN: buffer_load_ubyte [[LOADREG2:v[0-9]+]]
|
|
; GCN: buffer_load_ubyte [[LOADREG1:v[0-9]+]]
|
|
; GCN: buffer_load_ubyte [[LOADREG0:v[0-9]+]]
|
|
; GCN-DAG: v_lshlrev_b32
|
|
; GCN-DAG: v_or_b32
|
|
; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]],
|
|
; GCN-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}},
|
|
; GCN-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}},
|
|
; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]]
|
|
|
|
; GCN: buffer_store_dwordx4
|
|
define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
|
|
%load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
|
|
%cvt = uitofp <4 x i8> %load to <4 x float>
|
|
store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
|
|
ret void
|
|
}
|
|
|
|
; FIXME: Need to handle non-uniform case for function below (load without gep).
|
|
; Instructions still emitted to repack bytes for add use.
|
|
|
|
; GCN-LABEL: {{^}}load_v4i8_to_v4f32_2_uses:
|
|
; GCN: {{buffer|flat}}_load_dword
|
|
; GCN-DAG: v_cvt_f32_ubyte0_e32
|
|
; GCN-DAG: v_cvt_f32_ubyte1_e32
|
|
; GCN-DAG: v_cvt_f32_ubyte2_e32
|
|
; GCN-DAG: v_cvt_f32_ubyte3_e32
|
|
|
|
; GCN-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 24
|
|
|
|
; SI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16
|
|
; SI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 8
|
|
; SI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff,
|
|
; SI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff00,
|
|
; SI-DAG: v_add_i32
|
|
|
|
; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffffff00,
|
|
; VI-DAG: v_add_u16_e32
|
|
; VI-DAG: v_add_u16_e32
|
|
|
|
; GCN: {{buffer|flat}}_store_dwordx4
|
|
; GCN: {{buffer|flat}}_store_dword
|
|
|
|
; GCN: s_endpgm
|
|
define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
|
|
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
|
|
%in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
|
|
%load = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
|
|
%cvt = uitofp <4 x i8> %load to <4 x float>
|
|
store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
|
|
%add = add <4 x i8> %load, <i8 9, i8 9, i8 9, i8 9> ; Second use of %load
|
|
store <4 x i8> %add, <4 x i8> addrspace(1)* %out2, align 4
|
|
ret void
|
|
}
|
|
|
|
; Make sure this doesn't crash.
|
|
; GCN-LABEL: {{^}}load_v7i8_to_v7f32:
|
|
; GCN: s_endpgm
|
|
define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
|
|
%load = load <7 x i8>, <7 x i8> addrspace(1)* %in, align 1
|
|
%cvt = uitofp <7 x i8> %load to <7 x float>
|
|
store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16
|
|
ret void
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}load_v8i8_to_v8f32:
|
|
; GCN: buffer_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}},
|
|
; GCN-NOT: bfe
|
|
; GCN-NOT: lshr
|
|
; GCN-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[LOLOAD]]
|
|
; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[LOLOAD]]
|
|
; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[LOLOAD]]
|
|
; GCN-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[LOLOAD]]
|
|
; GCN-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[HILOAD]]
|
|
; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[HILOAD]]
|
|
; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[HILOAD]]
|
|
; GCN-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[HILOAD]]
|
|
; GCN-NOT: bfe
|
|
; GCN-NOT: lshr
|
|
; GCN: buffer_store_dwordx4
|
|
; GCN: buffer_store_dwordx4
|
|
define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
|
|
%load = load <8 x i8>, <8 x i8> addrspace(1)* %in, align 8
|
|
%cvt = uitofp <8 x i8> %load to <8 x float>
|
|
store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16
|
|
ret void
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}i8_zext_inreg_i32_to_f32:
|
|
; GCN: buffer_load_dword [[LOADREG:v[0-9]+]],
|
|
; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 2, [[LOADREG]]
|
|
; GCN-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]]
|
|
; GCN: buffer_store_dword [[CONV]],
|
|
define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
|
|
%load = load i32, i32 addrspace(1)* %in, align 4
|
|
%add = add i32 %load, 2
|
|
%inreg = and i32 %add, 255
|
|
%cvt = uitofp i32 %inreg to float
|
|
store float %cvt, float addrspace(1)* %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}i8_zext_inreg_hi1_to_f32:
|
|
define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
|
|
%load = load i32, i32 addrspace(1)* %in, align 4
|
|
%inreg = and i32 %load, 65280
|
|
%shr = lshr i32 %inreg, 8
|
|
%cvt = uitofp i32 %shr to float
|
|
store float %cvt, float addrspace(1)* %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; We don't get these ones because of the zext, but instcombine removes
|
|
; them so it shouldn't really matter.
|
|
; GCN-LABEL: {{^}}i8_zext_i32_to_f32:
|
|
define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
|
|
%load = load i8, i8 addrspace(1)* %in, align 1
|
|
%ext = zext i8 %load to i32
|
|
%cvt = uitofp i32 %ext to float
|
|
store float %cvt, float addrspace(1)* %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}v4i8_zext_v4i32_to_v4f32:
|
|
define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
|
|
%load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
|
|
%ext = zext <4 x i8> %load to <4 x i32>
|
|
%cvt = uitofp <4 x i32> %ext to <4 x float>
|
|
store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
|
|
ret void
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}extract_byte0_to_f32:
|
|
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
|
|
; GCN-NOT: [[VAL]]
|
|
; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[VAL]]
|
|
; GCN: buffer_store_dword [[CONV]]
|
|
define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
|
|
%val = load i32, i32 addrspace(1)* %in
|
|
%and = and i32 %val, 255
|
|
%cvt = uitofp i32 %and to float
|
|
store float %cvt, float addrspace(1)* %out
|
|
ret void
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}extract_byte1_to_f32:
|
|
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
|
|
; GCN-NOT: [[VAL]]
|
|
; GCN: v_cvt_f32_ubyte1_e32 [[CONV:v[0-9]+]], [[VAL]]
|
|
; GCN: buffer_store_dword [[CONV]]
|
|
define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
|
|
%val = load i32, i32 addrspace(1)* %in
|
|
%srl = lshr i32 %val, 8
|
|
%and = and i32 %srl, 255
|
|
%cvt = uitofp i32 %and to float
|
|
store float %cvt, float addrspace(1)* %out
|
|
ret void
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}extract_byte2_to_f32:
|
|
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
|
|
; GCN-NOT: [[VAL]]
|
|
; GCN: v_cvt_f32_ubyte2_e32 [[CONV:v[0-9]+]], [[VAL]]
|
|
; GCN: buffer_store_dword [[CONV]]
|
|
define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
|
|
%val = load i32, i32 addrspace(1)* %in
|
|
%srl = lshr i32 %val, 16
|
|
%and = and i32 %srl, 255
|
|
%cvt = uitofp i32 %and to float
|
|
store float %cvt, float addrspace(1)* %out
|
|
ret void
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}extract_byte3_to_f32:
|
|
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
|
|
; GCN-NOT: [[VAL]]
|
|
; GCN: v_cvt_f32_ubyte3_e32 [[CONV:v[0-9]+]], [[VAL]]
|
|
; GCN: buffer_store_dword [[CONV]]
|
|
define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
|
|
%val = load i32, i32 addrspace(1)* %in
|
|
%srl = lshr i32 %val, 24
|
|
%and = and i32 %srl, 255
|
|
%cvt = uitofp i32 %and to float
|
|
store float %cvt, float addrspace(1)* %out
|
|
ret void
|
|
}
|