llvm/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir
Sam Kolton e88fc4046f [AMDGPU] SDWA: add support for GFX9 in peephole pass
Summary:
Added support based on merged SDWA pseudo instructions. The peephole pass now allows one scalar operand, as well as the omod and clamp modifiers.
Added several subtarget features for GFX9 SDWA.
This diff also contains changes from D34026.
Depends D34026

Reviewers: vpykhtin, rampitec, arsenm

Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye

Differential Revision: https://reviews.llvm.org/D34241

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@305986 91177308-0d34-0410-b5e6-96231b3b80d8
2017-06-22 06:26:41 +00:00

416 lines
15 KiB
YAML

# Test driver: llc is started just before the si-peephole-sdwa pass, once for
# fiji and once for gfx900. Checks common to both targets use the GCN prefix,
# target-specific checks use the VI (fiji) or GFX9 (gfx900) prefix.
# RUN: llc -march=amdgcn -mcpu=fiji -start-before si-peephole-sdwa -o - %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
# RUN: llc -march=amdgcn -mcpu=gfx900 -start-before si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
# sdwa_imm_operand, both targets: the constant 2 is moved into a VGPR once (no
# second such move before the BB0_1 label), and the loop body contains two
# v_lshlrev_b32_sdwa that use that VGPR as src0 and select BYTE_1 of src1.
# GCN-LABEL: {{^}}sdwa_imm_operand:
# GCN: v_mov_b32_e32 v[[SHIFT:[0-9]+]], 2
# GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 2
# GCN: BB0_1:
# GCN: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
# GCN: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
# sdwa_sgpr_operand, fiji: same shape as above, the shift amount is expected in
# a VGPR written by v_mov_b32.
# GCN-LABEL: {{^}}sdwa_sgpr_operand:
# VI: v_mov_b32_e32 v[[SHIFT:[0-9]+]], 2
# VI-NOT: v_mov_b32_e32 v{{[0-9]+}}, 2
# VI: BB1_1:
# VI: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
# VI: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
# sdwa_sgpr_operand, gfx900: the shift amount is expected in an SGPR written by
# s_mov_b32, no v_mov_b32 of 2 may appear before BB1_1, and both SDWA shifts
# take that SGPR directly as src0.
# GFX9: s_mov_b32 s[[SHIFT:[0-9]+]], 2
# GFX9-NOT: v_mov_b32_e32 v{{[0-9]+}}, 2
# GFX9: BB1_1:
# GFX9: v_lshlrev_b32_sdwa v{{[0-9]+}}, s[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
# GFX9: v_lshlrev_b32_sdwa v{{[0-9]+}}, s[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
--- |
; ModuleID = 'sdwa-scalar-ops.opt.ll'
source_filename = "sdwa-scalar-ops.opt.ll"
target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
; IR for the sdwa_imm_operand machine function below (its memory operands refer
; to the %ir.* values defined here).
; Each loop iteration reads two consecutive i32 values from %arg at byte offset
; %lsr.iv, takes bits 15:8 of each value as an i32 element index into %arg, and
; stores 1 at that index. The byte offset advances by 8 per iteration and the
; loop exits once its low 32 bits equal 4096.
define amdgpu_kernel void @sdwa_imm_operand(i32 addrspace(1)* nocapture %arg) {
bb:
br label %bb2
bb1: ; preds = %bb2
ret void
bb2: ; preds = %bb2, %bb
%lsr.iv = phi i64 [ %lsr.iv.next, %bb2 ], [ 0, %bb ]
; First element: address = (i8*)%arg + %lsr.iv.
%bc = bitcast i32 addrspace(1)* %arg to i8 addrspace(1)*
%uglygep4 = getelementptr i8, i8 addrspace(1)* %bc, i64 %lsr.iv
%uglygep45 = bitcast i8 addrspace(1)* %uglygep4 to i32 addrspace(1)*
%tmp5 = load i32, i32 addrspace(1)* %uglygep45, align 4
; Index = (value >> 8) & 255, i.e. the second byte of the loaded dword.
%tmp6 = lshr i32 %tmp5, 8
%tmp7 = and i32 %tmp6, 255
%tmp8 = zext i32 %tmp7 to i64
%tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp8
store i32 1, i32 addrspace(1)* %tmp9, align 4
; Second element: the i32 that follows the first one, handled the same way.
%scevgep = getelementptr i32, i32 addrspace(1)* %uglygep45, i64 1
%tmp13 = load i32, i32 addrspace(1)* %scevgep, align 4
%tmp14 = lshr i32 %tmp13, 8
%tmp15 = and i32 %tmp14, 255
%tmp16 = zext i32 %tmp15 to i64
%tmp17 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp16
store i32 1, i32 addrspace(1)* %tmp17, align 4
; Loop control: advance by 8 bytes, compare the truncated offset with 4096.
%lsr.iv.next = add nuw nsw i64 %lsr.iv, 8
%tmp1 = trunc i64 %lsr.iv.next to i32
%tmp19 = icmp eq i32 %tmp1, 4096
br i1 %tmp19, label %bb1, label %bb2
}
; IR for the sdwa_sgpr_operand machine function below. The instructions are the
; same as in @sdwa_imm_operand; the two tests differ only in their MIR bodies,
; where this one takes the shift amount from a scalar register (S_MOV_B32 2)
; instead of an immediate.
; Each loop iteration reads two consecutive i32 values from %arg at byte offset
; %lsr.iv, takes bits 15:8 of each value as an i32 element index into %arg, and
; stores 1 at that index. The loop exits once the low 32 bits of the offset,
; advanced by 8 per iteration, equal 4096.
define amdgpu_kernel void @sdwa_sgpr_operand(i32 addrspace(1)* nocapture %arg) {
bb:
br label %bb2
bb1: ; preds = %bb2
ret void
bb2: ; preds = %bb2, %bb
%lsr.iv = phi i64 [ %lsr.iv.next, %bb2 ], [ 0, %bb ]
; First element: address = (i8*)%arg + %lsr.iv.
%bc = bitcast i32 addrspace(1)* %arg to i8 addrspace(1)*
%uglygep4 = getelementptr i8, i8 addrspace(1)* %bc, i64 %lsr.iv
%uglygep45 = bitcast i8 addrspace(1)* %uglygep4 to i32 addrspace(1)*
%tmp5 = load i32, i32 addrspace(1)* %uglygep45, align 4
; Index = (value >> 8) & 255, i.e. the second byte of the loaded dword.
%tmp6 = lshr i32 %tmp5, 8
%tmp7 = and i32 %tmp6, 255
%tmp8 = zext i32 %tmp7 to i64
%tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp8
store i32 1, i32 addrspace(1)* %tmp9, align 4
; Second element: the i32 that follows the first one, handled the same way.
%scevgep = getelementptr i32, i32 addrspace(1)* %uglygep45, i64 1
%tmp13 = load i32, i32 addrspace(1)* %scevgep, align 4
%tmp14 = lshr i32 %tmp13, 8
%tmp15 = and i32 %tmp14, 255
%tmp16 = zext i32 %tmp15 to i64
%tmp17 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp16
store i32 1, i32 addrspace(1)* %tmp17, align 4
; Loop control: advance by 8 bytes, compare the truncated offset with 4096.
%lsr.iv.next = add nuw nsw i64 %lsr.iv, 8
%tmp1 = trunc i64 %lsr.iv.next to i32
%tmp19 = icmp eq i32 %tmp1, 4096
br i1 %tmp19, label %bb1, label %bb2
}
...
---
# Machine function for @sdwa_imm_operand. Both V_LSHLREV_B32_e32 instructions in
# the loop take their shift amount as the immediate 2.
name: sdwa_imm_operand
alignment: 0
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
# Virtual register table, ids 0 through 83.
registers:
- { id: 0, class: sreg_64 }
- { id: 1, class: sreg_64 }
- { id: 2, class: vgpr_32 }
- { id: 3, class: sgpr_128 }
- { id: 4, class: sgpr_64 }
- { id: 5, class: sreg_32_xm0 }
- { id: 6, class: sgpr_32 }
- { id: 7, class: sreg_64 }
- { id: 8, class: sreg_64 }
- { id: 9, class: sreg_64_xexec }
- { id: 10, class: sreg_32_xm0 }
- { id: 11, class: sreg_32_xm0 }
- { id: 12, class: sreg_32_xm0 }
- { id: 13, class: sreg_32_xm0 }
- { id: 14, class: sreg_32_xm0 }
- { id: 15, class: sreg_32_xm0 }
- { id: 16, class: sreg_64 }
- { id: 17, class: vgpr_32 }
- { id: 18, class: vreg_64 }
- { id: 19, class: sreg_32_xm0 }
- { id: 20, class: sreg_32 }
- { id: 21, class: sreg_32_xm0 }
- { id: 22, class: sreg_32_xm0 }
- { id: 23, class: sreg_32_xm0 }
- { id: 24, class: sreg_64 }
- { id: 25, class: sreg_32_xm0 }
- { id: 26, class: sreg_32_xm0 }
- { id: 27, class: sreg_32_xm0 }
- { id: 28, class: sreg_32_xm0 }
- { id: 29, class: sreg_64 }
- { id: 30, class: vgpr_32 }
- { id: 31, class: vreg_64 }
- { id: 32, class: sreg_32_xm0 }
- { id: 33, class: sreg_32_xm0 }
- { id: 34, class: sreg_64 }
- { id: 35, class: sreg_32_xm0 }
- { id: 36, class: sreg_32_xm0 }
- { id: 37, class: sreg_32_xm0 }
- { id: 38, class: sreg_32_xm0 }
- { id: 39, class: vreg_64 }
- { id: 40, class: vgpr_32 }
- { id: 41, class: vreg_64 }
- { id: 42, class: sreg_32_xm0 }
- { id: 43, class: sreg_32 }
- { id: 44, class: sreg_32_xm0 }
- { id: 45, class: sreg_64 }
- { id: 46, class: sreg_32_xm0 }
- { id: 47, class: sreg_32_xm0 }
- { id: 48, class: sreg_32_xm0 }
- { id: 49, class: sreg_32_xm0 }
- { id: 50, class: sreg_64 }
- { id: 51, class: vreg_64 }
- { id: 52, class: sreg_64 }
- { id: 53, class: sreg_32_xm0 }
- { id: 54, class: sreg_32_xm0 }
- { id: 55, class: sreg_32_xm0 }
- { id: 56, class: sreg_32_xm0 }
- { id: 57, class: sreg_64 }
- { id: 58, class: sreg_32_xm0 }
- { id: 59, class: sreg_32_xm0 }
- { id: 60, class: vgpr_32 }
- { id: 61, class: vgpr_32 }
- { id: 62, class: vreg_64 }
- { id: 63, class: vgpr_32 }
- { id: 64, class: vgpr_32 }
- { id: 65, class: vgpr_32 }
- { id: 66, class: vgpr_32 }
- { id: 67, class: vreg_64 }
- { id: 68, class: vgpr_32 }
- { id: 69, class: vgpr_32 }
- { id: 70, class: vgpr_32 }
- { id: 71, class: vgpr_32 }
- { id: 72, class: vgpr_32 }
- { id: 73, class: vgpr_32 }
- { id: 74, class: vgpr_32 }
- { id: 75, class: vreg_64 }
- { id: 76, class: vgpr_32 }
- { id: 77, class: vgpr_32 }
- { id: 78, class: vgpr_32 }
- { id: 79, class: vgpr_32 }
- { id: 80, class: vreg_64 }
- { id: 81, class: vgpr_32 }
- { id: 82, class: vgpr_32 }
- { id: 83, class: vgpr_32 }
# The physical pair %sgpr4_sgpr5 is live on entry and bound to virtual register %4.
liveins:
- { reg: '%sgpr4_sgpr5', virtual-reg: '%4' }
# Frame description: stack size 0, no calls recorded.
frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
hasPatchPoint: false
stackSize: 0
offsetAdjustment: 0
maxAlignment: 0
adjustsStack: false
hasCalls: false
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
bb.0.bb:
successors: %bb.2.bb2(0x80000000)
liveins: %sgpr4_sgpr5
; Entry: load a 64-bit value through %4 (kept in %7 and used as the base
; address in the loop), zero the 64-bit loop offset %8, and put the constant
; that the loop stores (1) into the VGPR %30.
%4 = COPY %sgpr4_sgpr5
%9 = S_LOAD_DWORDX2_IMM %4, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
%8 = S_MOV_B64 0
%7 = COPY %9
%30 = V_MOV_B32_e32 1, implicit %exec
S_BRANCH %bb.2.bb2
bb.1.bb1:
S_ENDPGM
bb.2.bb2:
successors: %bb.1.bb1(0x04000000), %bb.2.bb2(0x7c000000)
; %0 is the loop-carried 64-bit offset: %8 when entered from bb.0, %1 on the
; back edge.
%0 = PHI %8, %bb.0.bb, %1, %bb.2.bb2
; First access: form the 64-bit address %7 + %0 in (%14, %15) and load a dword
; from it. %13 keeps the high half of the base for the store addresses below.
%13 = COPY %7.sub1
%14 = S_ADD_U32 %7.sub0, %0.sub0, implicit-def %scc
%15 = S_ADDC_U32 %7.sub1, %0.sub1, implicit-def dead %scc, implicit %scc
%16 = REG_SEQUENCE %14, 1, %15, 2
%18 = COPY %16
%17 = FLAT_LOAD_DWORD %18, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.uglygep45)
; Bit-field extract (offset 8, width 8) of the loaded value, then shift left by
; the immediate 2. The checks at the top of the file expect a
; v_lshlrev_b32_sdwa with src1_sel:BYTE_1 for each such extract/shift pair.
%60 = V_BFE_U32 %17, 8, 8, implicit %exec
%61 = V_LSHLREV_B32_e32 2, killed %60, implicit %exec
; Store address = base + shifted index: 32-bit add on the low half, then
; add-with-carry of 0 into the high half (%66). Store %30 there.
%70 = V_ADD_I32_e32 %7.sub0, %61, implicit-def %vcc, implicit %exec
%66 = COPY %13
%65 = V_ADDC_U32_e32 0, %66, implicit-def %vcc, implicit %vcc, implicit %exec
%67 = REG_SEQUENCE %70, 1, killed %65, 2
FLAT_STORE_DWORD %67, %30, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4 into %ir.tmp9)
; Second access: the same load / extract / shift / store sequence on the dword
; at (%14, %15) + 4.
%37 = S_ADD_U32 %14, 4, implicit-def %scc
%38 = S_ADDC_U32 %15, 0, implicit-def dead %scc, implicit %scc
%71 = COPY killed %37
%72 = COPY killed %38
%41 = REG_SEQUENCE killed %71, 1, killed %72, 2
%40 = FLAT_LOAD_DWORD killed %41, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.scevgep)
%73 = V_BFE_U32 %40, 8, 8, implicit %exec
%74 = V_LSHLREV_B32_e32 2, killed %73, implicit %exec
%83 = V_ADD_I32_e32 %7.sub0, %74, implicit-def %vcc, implicit %exec
%78 = V_ADDC_U32_e32 0, %66, implicit-def %vcc, implicit %vcc, implicit %exec
%80 = REG_SEQUENCE %83, 1, killed %78, 2
FLAT_STORE_DWORD %80, %30, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4 into %ir.tmp17)
; Loop control: advance the 64-bit offset by 8 into %1; branch to bb.1 when the
; new low half equals 4096, otherwise take the back edge.
%55 = S_ADD_U32 %0.sub0, 8, implicit-def %scc
%56 = S_ADDC_U32 %0.sub1, 0, implicit-def dead %scc, implicit %scc
%57 = REG_SEQUENCE %55, 1, killed %56, 2
%1 = COPY %57
S_CMPK_EQ_I32 %55, 4096, implicit-def %scc
S_CBRANCH_SCC1 %bb.1.bb1, implicit %scc
S_BRANCH %bb.2.bb2
...
---
# Machine function for @sdwa_sgpr_operand. Both V_LSHLREV_B32_e32 instructions in
# the loop take their shift amount from the scalar virtual register %84, which
# is set to 2 by an S_MOV_B32 in the entry block.
name: sdwa_sgpr_operand
alignment: 0
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
# Virtual register table, ids 0 through 84; id 84 holds the shift amount.
registers:
- { id: 0, class: sreg_64 }
- { id: 1, class: sreg_64 }
- { id: 2, class: vgpr_32 }
- { id: 3, class: sgpr_128 }
- { id: 4, class: sgpr_64 }
- { id: 5, class: sreg_32_xm0 }
- { id: 6, class: sgpr_32 }
- { id: 7, class: sreg_64 }
- { id: 8, class: sreg_64 }
- { id: 9, class: sreg_64_xexec }
- { id: 10, class: sreg_32_xm0 }
- { id: 11, class: sreg_32_xm0 }
- { id: 12, class: sreg_32_xm0 }
- { id: 13, class: sreg_32_xm0 }
- { id: 14, class: sreg_32_xm0 }
- { id: 15, class: sreg_32_xm0 }
- { id: 16, class: sreg_64 }
- { id: 17, class: vgpr_32 }
- { id: 18, class: vreg_64 }
- { id: 19, class: sreg_32_xm0 }
- { id: 20, class: sreg_32 }
- { id: 21, class: sreg_32_xm0 }
- { id: 22, class: sreg_32_xm0 }
- { id: 23, class: sreg_32_xm0 }
- { id: 24, class: sreg_64 }
- { id: 25, class: sreg_32_xm0 }
- { id: 26, class: sreg_32_xm0 }
- { id: 27, class: sreg_32_xm0 }
- { id: 28, class: sreg_32_xm0 }
- { id: 29, class: sreg_64 }
- { id: 30, class: vgpr_32 }
- { id: 31, class: vreg_64 }
- { id: 32, class: sreg_32_xm0 }
- { id: 33, class: sreg_32_xm0 }
- { id: 34, class: sreg_64 }
- { id: 35, class: sreg_32_xm0 }
- { id: 36, class: sreg_32_xm0 }
- { id: 37, class: sreg_32_xm0 }
- { id: 38, class: sreg_32_xm0 }
- { id: 39, class: vreg_64 }
- { id: 40, class: vgpr_32 }
- { id: 41, class: vreg_64 }
- { id: 42, class: sreg_32_xm0 }
- { id: 43, class: sreg_32 }
- { id: 44, class: sreg_32_xm0 }
- { id: 45, class: sreg_64 }
- { id: 46, class: sreg_32_xm0 }
- { id: 47, class: sreg_32_xm0 }
- { id: 48, class: sreg_32_xm0 }
- { id: 49, class: sreg_32_xm0 }
- { id: 50, class: sreg_64 }
- { id: 51, class: vreg_64 }
- { id: 52, class: sreg_64 }
- { id: 53, class: sreg_32_xm0 }
- { id: 54, class: sreg_32_xm0 }
- { id: 55, class: sreg_32_xm0 }
- { id: 56, class: sreg_32_xm0 }
- { id: 57, class: sreg_64 }
- { id: 58, class: sreg_32_xm0 }
- { id: 59, class: sreg_32_xm0 }
- { id: 60, class: vgpr_32 }
- { id: 61, class: vgpr_32 }
- { id: 62, class: vreg_64 }
- { id: 63, class: vgpr_32 }
- { id: 64, class: vgpr_32 }
- { id: 65, class: vgpr_32 }
- { id: 66, class: vgpr_32 }
- { id: 67, class: vreg_64 }
- { id: 68, class: vgpr_32 }
- { id: 69, class: vgpr_32 }
- { id: 70, class: vgpr_32 }
- { id: 71, class: vgpr_32 }
- { id: 72, class: vgpr_32 }
- { id: 73, class: vgpr_32 }
- { id: 74, class: vgpr_32 }
- { id: 75, class: vreg_64 }
- { id: 76, class: vgpr_32 }
- { id: 77, class: vgpr_32 }
- { id: 78, class: vgpr_32 }
- { id: 79, class: vgpr_32 }
- { id: 80, class: vreg_64 }
- { id: 81, class: vgpr_32 }
- { id: 82, class: vgpr_32 }
- { id: 83, class: vgpr_32 }
- { id: 84, class: sreg_32_xm0 }
# The physical pair %sgpr4_sgpr5 is live on entry and bound to virtual register %4.
liveins:
- { reg: '%sgpr4_sgpr5', virtual-reg: '%4' }
# Frame description: stack size 0, no calls recorded.
frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
hasPatchPoint: false
stackSize: 0
offsetAdjustment: 0
maxAlignment: 0
adjustsStack: false
hasCalls: false
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
bb.0.bb:
successors: %bb.2.bb2(0x80000000)
liveins: %sgpr4_sgpr5
; Entry: load a 64-bit value through %4 (kept in %7 and used as the base
; address in the loop), zero the 64-bit loop offset %8, put the constant that
; the loop stores (1) into the VGPR %30, and put the shift amount 2 into the
; scalar register %84.
%4 = COPY %sgpr4_sgpr5
%9 = S_LOAD_DWORDX2_IMM %4, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
%8 = S_MOV_B64 0
%7 = COPY %9
%30 = V_MOV_B32_e32 1, implicit %exec
%84 = S_MOV_B32 2
S_BRANCH %bb.2.bb2
bb.1.bb1:
S_ENDPGM
bb.2.bb2:
successors: %bb.1.bb1(0x04000000), %bb.2.bb2(0x7c000000)
; %0 is the loop-carried 64-bit offset: %8 when entered from bb.0, %1 on the
; back edge.
%0 = PHI %8, %bb.0.bb, %1, %bb.2.bb2
; First access: form the 64-bit address %7 + %0 in (%14, %15) and load a dword
; from it. %13 keeps the high half of the base for the store addresses below.
%13 = COPY %7.sub1
%14 = S_ADD_U32 %7.sub0, %0.sub0, implicit-def %scc
%15 = S_ADDC_U32 %7.sub1, %0.sub1, implicit-def dead %scc, implicit %scc
%16 = REG_SEQUENCE %14, 1, %15, 2
%18 = COPY %16
%17 = FLAT_LOAD_DWORD %18, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.uglygep45)
; Bit-field extract (offset 8, width 8) of the loaded value, then shift left by
; the scalar register %84. The checks at the top of the file expect a
; v_lshlrev_b32_sdwa with src1_sel:BYTE_1 for each such pair, with the shift
; amount in a VGPR on fiji and in an SGPR on gfx900.
%60 = V_BFE_U32 %17, 8, 8, implicit %exec
%61 = V_LSHLREV_B32_e32 %84, killed %60, implicit %exec
; Store address = base + shifted index: 32-bit add on the low half, then
; add-with-carry of 0 into the high half (%66). Store %30 there.
%70 = V_ADD_I32_e32 %7.sub0, %61, implicit-def %vcc, implicit %exec
%66 = COPY %13
%65 = V_ADDC_U32_e32 0, %66, implicit-def %vcc, implicit %vcc, implicit %exec
%67 = REG_SEQUENCE %70, 1, killed %65, 2
FLAT_STORE_DWORD %67, %30, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4 into %ir.tmp9)
; Second access: the same load / extract / shift / store sequence on the dword
; at (%14, %15) + 4.
%37 = S_ADD_U32 %14, 4, implicit-def %scc
%38 = S_ADDC_U32 %15, 0, implicit-def dead %scc, implicit %scc
%71 = COPY killed %37
%72 = COPY killed %38
%41 = REG_SEQUENCE killed %71, 1, killed %72, 2
%40 = FLAT_LOAD_DWORD killed %41, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.scevgep)
%73 = V_BFE_U32 %40, 8, 8, implicit %exec
%74 = V_LSHLREV_B32_e32 %84, killed %73, implicit %exec
%83 = V_ADD_I32_e32 %7.sub0, %74, implicit-def %vcc, implicit %exec
%78 = V_ADDC_U32_e32 0, %66, implicit-def %vcc, implicit %vcc, implicit %exec
%80 = REG_SEQUENCE %83, 1, killed %78, 2
FLAT_STORE_DWORD %80, %30, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4 into %ir.tmp17)
; Loop control: advance the 64-bit offset by 8 into %1; branch to bb.1 when the
; new low half equals 4096, otherwise take the back edge.
%55 = S_ADD_U32 %0.sub0, 8, implicit-def %scc
%56 = S_ADDC_U32 %0.sub1, 0, implicit-def dead %scc, implicit %scc
%57 = REG_SEQUENCE %55, 1, killed %56, 2
%1 = COPY %57
S_CMPK_EQ_I32 %55, 4096, implicit-def %scc
S_CBRANCH_SCC1 %bb.1.bb1, implicit %scc
S_BRANCH %bb.2.bb2
...