[AMDGPU] Add 2 gfx940 mfma tests. NFC.

This commit is contained in:
Stanislav Mekhanoshin 2022-03-17 15:46:01 -07:00
parent 4308fdf83b
commit 275b0c5a5a
2 changed files with 34 additions and 15 deletions

View File

@ -1,5 +1,6 @@
; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,GFX908_A %s
; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A,GFX908_A %s
; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX940 %s
; GCN-LABEL: {{^}}test_mfma_loop_zeroinit:
@ -12,6 +13,7 @@
; GCN: [[LOOP:.LBB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GFX940: v_mfma_f32_32x32x1_2b_f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]
@ -46,6 +48,7 @@ exit:
; Check that we do not use 32 temp sgprs as well.
; GFX908_A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000
; GFX940: s_mov_b32 [[TMP:s[0-9]+]], 0x42f60000
; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX90A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], [[TMP]]
; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]
@ -53,6 +56,7 @@ exit:
; GCN: [[LOOP:.LBB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GFX940: v_mfma_f32_32x32x1_2b_f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]
@ -88,6 +92,7 @@ exit:
; GCN: [[LOOP:.LBB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GFX940: v_mfma_f32_32x32x1_2b_f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]
@ -248,10 +253,13 @@ exit:
; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX940-COUNT-32: s_mov_b32 s{{[0-9]+}}, 0x4{{[0-9a-f]+}}
; GFX940-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
; GCN: [[LOOP:.LBB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GFX940: v_mfma_f32_32x32x1_2b_f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]
@ -284,6 +292,7 @@ exit:
; GCN: [[LOOP:.LBB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GFX940: v_mfma_f32_32x32x1_2b_f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]
@ -349,11 +358,13 @@ exit:
; GFX908_A: v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}}
; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX90A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], [[TMP]]
; GFX940: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], s{{[0-9]+}}
; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]
; GCN: [[LOOP:.LBB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GFX940: v_mfma_f32_32x32x1_2b_f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]
@ -416,6 +427,7 @@ exit:
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v0
; GFX908_A-DAG: v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}}
; GFX940-DAG: s_load_dword [[TMP:s[0-9]+]],
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
@ -454,6 +466,7 @@ exit:
; GCN: [[LOOP:.LBB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GFX940: v_mfma_f32_32x32x1_2b_f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]
@ -491,11 +504,13 @@ exit:
; GFX90A-NOT: v_accvgpr
; GFX90A: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
; GFX90A-NOT: v_accvgpr
; GFX940: v_mfma_f32_32x32x1_2b_f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
; GCN-NOT: v_accvgpr
; GCN: [[LOOP:.LBB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GFX940: v_mfma_f32_32x32x1_2b_f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]
@ -530,6 +545,7 @@ exit:
; GFX90A-NOT: v_accvgpr
; GFX90A: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
; GFX90A-NOT: v_accvgpr
; GFX940: v_mfma_f32_32x32x1_2b_f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
; Check that we are using only one tmp VGPR.
@ -541,6 +557,7 @@ exit:
; GCN: [[LOOP:.LBB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GFX940: v_mfma_f32_32x32x1_2b_f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]
@ -614,6 +631,7 @@ exit:
; GCN: [[INNER_LOOP:.LBB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GFX940: v_mfma_f32_32x32x1_2b_f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[INNER_LOOP]]
; GCN-NOT: v_accvgpr

View File

@ -1,5 +1,6 @@
; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY %s
; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY %s
; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY %s
; RUN: llc -march=amdgcn -mcpu=gfx90a -sgpr-regalloc=fast -vgpr-regalloc=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,FAST %s
@ -11,11 +12,11 @@ declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>
declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32)
; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32:
; GREEDY: v_mfma_f32_32x32x1f32 a[0:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:31]
; GREEDY: v_mfma_f32_32x32x1f32 a[32:63], v{{[0-9]+}}, v{{[0-9]+}}, a[0:31]
; FAST: v_mfma_f32_32x32x1f32 a[64:95], v{{[0-9]+}}, v{{[0-9]+}}, a[64:95]
; FAST: v_mfma_f32_32x32x1f32 a[32:63], v{{[0-9]+}}, v{{[0-9]+}}, a[64:95]
; GCN: v_mfma_f32_32x32x1f32 a[0:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:31]
; GREEDY: v_mfma_f32_32x32x1{{.*}} a[0:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:31]
; GREEDY: v_mfma_f32_32x32x1{{.*}} a[32:63], v{{[0-9]+}}, v{{[0-9]+}}, a[0:31]
; FAST: v_mfma_f32_32x32x1{{.*}} a[64:95], v{{[0-9]+}}, v{{[0-9]+}}, a[64:95]
; FAST: v_mfma_f32_32x32x1{{.*}} a[32:63], v{{[0-9]+}}, v{{[0-9]+}}, a[64:95]
; GCN: v_mfma_f32_32x32x1{{.*}} a[0:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:31]
define amdgpu_kernel void @test_mfma_f32_32x32x1f32(<32 x float> addrspace(1)* %arg) #0 {
bb:
%in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
@ -28,11 +29,11 @@ bb:
}
; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32:
; GREEDY: v_mfma_f32_16x16x1f32 a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15]
; GREEDY: v_mfma_f32_16x16x1f32 a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15]
; FAST: v_mfma_f32_16x16x1f32 a[32:47], v{{[0-9]+}}, v{{[0-9]+}}, a[32:47]
; FAST: v_mfma_f32_16x16x1f32 a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[32:47]
; GCN: v_mfma_f32_16x16x1f32 a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15]
; GREEDY: v_mfma_f32_16x16x1{{.*}} a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15]
; GREEDY: v_mfma_f32_16x16x1{{.*}} a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15]
; FAST: v_mfma_f32_16x16x1{{.*}} a[32:47], v{{[0-9]+}}, v{{[0-9]+}}, a[32:47]
; FAST: v_mfma_f32_16x16x1{{.*}} a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[32:47]
; GCN: v_mfma_f32_16x16x1{{.*}} a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15]
define amdgpu_kernel void @test_mfma_f32_16x16x1f32(<16 x float> addrspace(1)* %arg) #0 {
bb:
%in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
@ -47,11 +48,11 @@ bb:
; This instruction allows the overlap since it only reads 4 registers.
; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32:
; GREEDY: v_mfma_f32_4x4x1f32 a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3]
; GREEDY: v_mfma_f32_4x4x1f32 a[2:5], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3]
; FAST: v_mfma_f32_4x4x1f32 a[8:11], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3]
; FAST: v_mfma_f32_4x4x1f32 a[4:7], v{{[0-9]+}}, v{{[0-9]+}}, a[8:11]
; GCN: v_mfma_f32_4x4x1f32 a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3]
; GREEDY: v_mfma_f32_4x4x1{{.*}} a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3]
; GREEDY: v_mfma_f32_4x4x1{{.*}} a[2:5], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3]
; FAST: v_mfma_f32_4x4x1{{.*}} a[8:11], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3]
; FAST: v_mfma_f32_4x4x1{{.*}} a[4:7], v{{[0-9]+}}, v{{[0-9]+}}, a[8:11]
; GCN: v_mfma_f32_4x4x1{{.*}} a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3]
define amdgpu_kernel void @test_mfma_f32_4x4x1f32(<4 x float> addrspace(1)* %arg) #0 {
bb:
%in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg