AMDGPU: Use V_MAC_F32 for fmad.ftz

This avoids regressions in a future patch. I'm confused by the use of
the gfx9 usage legacy_mad. Was this a pointless instruction rename, or
or does it use fmul_legacy handling? Why is regular mac available in that case?
This commit is contained in:
Matt Arsenault 2020-03-09 16:53:00 -04:00
parent 75af694a6d
commit 200b20639a
5 changed files with 50 additions and 38 deletions

View File

@ -848,20 +848,29 @@ def : GCNPat <
// VOP2 Patterns
//===----------------------------------------------------------------------===//
multiclass FMADPat <ValueType vt, Instruction inst> {
def : GCNPat <
(vt (fmad (VOP3NoMods vt:$src0),
(VOP3NoMods vt:$src1),
(VOP3NoMods vt:$src2))),
// TODO: Check only no src2 mods?
class FMADPat <ValueType vt, Instruction inst, SDPatternOperator node>
: GCNPat <(vt (node (vt (VOP3NoMods vt:$src0)),
(vt (VOP3NoMods vt:$src1)),
(vt (VOP3NoMods vt:$src2)))),
(inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
>;
// Prefer mac form when there are no modifiers.
let AddedComplexity = 9 in {
def : FMADPat <f32, V_MAC_F32_e64, fmad>;
def : FMADPat <f32, V_MAC_F32_e64, AMDGPUfmad_ftz>;
let SubtargetPredicate = Has16BitInsts in {
def : FMADPat <f16, V_MAC_F16_e64, fmad>;
def : FMADPat <f16, V_MAC_F16_e64, AMDGPUfmad_ftz>;
}
defm : FMADPat <f16, V_MAC_F16_e64>;
defm : FMADPat <f32, V_MAC_F32_e64>;
}
class FMADModsPat<Instruction inst, SDPatternOperator mad_opr, ValueType Ty>
class FMADModsPat<ValueType Ty, Instruction inst, SDPatternOperator mad_opr>
: GCNPat<
(Ty (mad_opr (Ty (VOP3Mods Ty:$src0, i32:$src0_mod)),
(Ty (VOP3Mods Ty:$src1, i32:$src1_mod)),
@ -870,9 +879,8 @@ class FMADModsPat<Instruction inst, SDPatternOperator mad_opr, ValueType Ty>
$src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
// FIXME: This should select to V_MAC_F32
def : FMADModsPat<V_MAD_F32, AMDGPUfmad_ftz, f32>;
def : FMADModsPat<V_MAD_F16, AMDGPUfmad_ftz, f16> {
def : FMADModsPat<f32, V_MAD_F32, AMDGPUfmad_ftz>;
def : FMADModsPat<f16, V_MAD_F16, AMDGPUfmad_ftz> {
let SubtargetPredicate = Has16BitInsts;
}

View File

@ -19,8 +19,8 @@ body: |
; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s32) = COPY $vgpr2
@ -43,8 +43,8 @@ body: |
; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
%0:sgpr(s32) = COPY $sgpr0
%1:vgpr(s32) = COPY $vgpr0
%2:vgpr(s32) = COPY $vgpr1
@ -67,8 +67,8 @@ body: |
; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:sgpr(s32) = COPY $sgpr0
%2:vgpr(s32) = COPY $vgpr1
@ -91,8 +91,9 @@ body: |
; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
; GCN: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY2]]
; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY3]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr0
%2:sgpr(s32) = COPY $sgpr0
@ -116,8 +117,8 @@ body: |
; GCN: liveins: $sgpr0, $vgpr0
; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
%0:sgpr(s32) = COPY $sgpr0
%1:vgpr(s32) = COPY $vgpr0
%2:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmad.ftz), %0, %0, %1
@ -138,8 +139,9 @@ body: |
; GCN: liveins: $sgpr0, $vgpr0
; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
%0:sgpr(s32) = COPY $sgpr0
%1:vgpr(s32) = COPY $vgpr0
%2:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmad.ftz), %0, %1, %0
@ -160,8 +162,9 @@ body: |
; GCN: liveins: $sgpr0, $vgpr0
; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY1]], 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, [[COPY2]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
%0:sgpr(s32) = COPY $sgpr0
%1:vgpr(s32) = COPY $vgpr0
%2:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmad.ftz), %1, %0, %0
@ -181,8 +184,9 @@ body: |
; GCN-LABEL: name: fmad_ftz_s32_vsss
; GCN: liveins: $sgpr0, $vgpr0
; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
%0:sgpr(s32) = COPY $sgpr0
%1:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmad.ftz), %0, %0, %0
S_ENDPGM 0, implicit %1

View File

@ -137,8 +137,8 @@ define amdgpu_kernel void @test_fold_canonicalize_fma_value_f32(float addrspace(
}
; GCN-LABEL: test_fold_canonicalize_fmad_ftz_value_f32:
; GCN: s_mov_b32 [[SGPR:s[0-9]+]], 0x41700000
; GCN: v_mad_f32 [[V:v[0-9]+]], v{{[0-9]+}}, [[SGPR]], [[SGPR]]
; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0x41700000
; GCN: v_mac_f32_e32 [[V]], v{{[0-9]+}}, v{{[0-9]+$}}
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]

View File

@ -5,8 +5,7 @@
declare half @llvm.amdgcn.fmad.ftz.f16(half %a, half %b, half %c)
; GCN-LABEL: {{^}}mad_f16:
; GFX8: v_ma{{[dc]}}_f16
; GFX9: v_mad_legacy_f16
; GCN: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+$}}
define amdgpu_kernel void @mad_f16(
half addrspace(1)* %r,
half addrspace(1)* %a,
@ -34,9 +33,7 @@ define amdgpu_kernel void @mad_f16_imm_a(
}
; GCN-LABEL: {{^}}mad_f16_imm_b:
; GCN: s_movk_i32 [[KB:s[0-9]+]], 0x4800
; GFX8: v_mad_f16 {{v[0-9]+}}, {{v[0-9]+}}, [[KB]],
; GFX9: v_mad_legacy_f16 {{v[0-9]+}}, {{v[0-9]+}}, [[KB]],
; GCN: v_mac_f16_e32 {{v[0-9]+}}, 0x4800, {{v[0-9]+$}}
define amdgpu_kernel void @mad_f16_imm_b(
half addrspace(1)* %r,
half addrspace(1)* %a,

View File

@ -35,7 +35,7 @@ define amdgpu_kernel void @mad_f32_imm_a(
; GCN-LABEL: {{^}}mad_f32_imm_b:
; GCN: v_mov_b32_e32 [[KB:v[0-9]+]], 0x41000000
; GCN: v_ma{{[dc]}}_f32 {{v[0-9]+}}, {{[vs][0-9]+}}, [[KB]],
; GCN: v_mac_f32_e32 {{v[0-9]+}}, {{[s][0-9]+}}, [[KB]]
define amdgpu_kernel void @mad_f32_imm_b(
float addrspace(1)* %r,
float addrspace(1)* %a,
@ -48,8 +48,11 @@ define amdgpu_kernel void @mad_f32_imm_b(
}
; GCN-LABEL: {{^}}mad_f32_imm_c:
; GCN: v_mov_b32_e32 [[KC:v[0-9]+]], 0x41000000
; GCN: v_ma{{[dc]}}_f32 {{v[0-9]+}}, {{[vs][0-9]+}}, {{v[0-9]+}}, [[KC]]{{$}}
; GCN: v_mov_b32_e32 [[C:v[0-9]+]], 0x41000000
; GCN: s_load_dword [[A:s[0-9]+]]
; GCN: s_load_dword [[B:s[0-9]+]]
; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]]
; GCN: v_mac_f32_e32 [[C]], {{s[0-9]+}}, [[VB]]{{$}}
define amdgpu_kernel void @mad_f32_imm_c(
float addrspace(1)* %r,
float addrspace(1)* %a,