AMDGPU: Use V_MAC_F32 for fmad.ftz
This avoids regressions in a future patch. I'm confused by the gfx9 use of legacy_mad. Was this a pointless instruction rename, or does it use fmul_legacy handling? Why is regular mac available in that case?
This commit is contained in:
parent 75af694a6d
commit 200b20639a
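For reference, a minimal sketch (not part of this commit) of the kind of input affected, modeled on the intrinsic and CHECK lines in the updated tests below; the kernel name @fmad_ftz_example and its arguments are placeholders. With no source modifiers, fmad.ftz should now select to v_mac_f32 rather than v_mad_f32:

declare float @llvm.amdgcn.fmad.ftz.f32(float, float, float)

define amdgpu_kernel void @fmad_ftz_example(float addrspace(1)* %r, float %a, float %b, float %c) {
  ; No source modifiers on any operand, so the new mac pattern should apply.
  %mad = call float @llvm.amdgcn.fmad.ftz.f32(float %a, float %b, float %c)
  store float %mad, float addrspace(1)* %r
  ret void
}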
@@ -848,20 +848,29 @@ def : GCNPat <
// VOP2 Patterns
//===----------------------------------------------------------------------===//

multiclass FMADPat <ValueType vt, Instruction inst> {
def : GCNPat <
(vt (fmad (VOP3NoMods vt:$src0),
(VOP3NoMods vt:$src1),
(VOP3NoMods vt:$src2))),
// TODO: Check only no src2 mods?
class FMADPat <ValueType vt, Instruction inst, SDPatternOperator node>
: GCNPat <(vt (node (vt (VOP3NoMods vt:$src0)),
(vt (VOP3NoMods vt:$src1)),
(vt (VOP3NoMods vt:$src2)))),
(inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
>;

// Prefer mac form when there are no modifiers.
let AddedComplexity = 9 in {
def : FMADPat <f32, V_MAC_F32_e64, fmad>;
def : FMADPat <f32, V_MAC_F32_e64, AMDGPUfmad_ftz>;

let SubtargetPredicate = Has16BitInsts in {
def : FMADPat <f16, V_MAC_F16_e64, fmad>;
def : FMADPat <f16, V_MAC_F16_e64, AMDGPUfmad_ftz>;
}

defm : FMADPat <f16, V_MAC_F16_e64>;
defm : FMADPat <f32, V_MAC_F32_e64>;
}

class FMADModsPat<Instruction inst, SDPatternOperator mad_opr, ValueType Ty>
class FMADModsPat<ValueType Ty, Instruction inst, SDPatternOperator mad_opr>
: GCNPat<
(Ty (mad_opr (Ty (VOP3Mods Ty:$src0, i32:$src0_mod)),
(Ty (VOP3Mods Ty:$src1, i32:$src1_mod)),
@@ -870,9 +879,8 @@ class FMADModsPat<Instruction inst, SDPatternOperator mad_opr, ValueType Ty>
$src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;

// FIXME: This should select to V_MAC_F32
def : FMADModsPat<V_MAD_F32, AMDGPUfmad_ftz, f32>;
def : FMADModsPat<V_MAD_F16, AMDGPUfmad_ftz, f16> {
def : FMADModsPat<f32, V_MAD_F32, AMDGPUfmad_ftz>;
def : FMADModsPat<f16, V_MAD_F16, AMDGPUfmad_ftz> {
let SubtargetPredicate = Has16BitInsts;
}
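For illustration only (not part of the diff): as the FIXME above notes, the FMADModsPat patterns still select V_MAD_F32 when a source modifier is present. A sketch of such a case, reusing the hypothetical kernel style and the intrinsic declaration from the example above, and assuming the fabs call folds into a source modifier as usual:

declare float @llvm.fabs.f32(float)

define amdgpu_kernel void @fmad_ftz_abs_example(float addrspace(1)* %r, float %a, float %b, float %c) {
  ; The |%a| operand becomes an abs source modifier, so the modifier pattern
  ; (V_MAD_F32) applies instead of the new no-modifier mac pattern.
  %fabs.a = call float @llvm.fabs.f32(float %a)
  %mad = call float @llvm.amdgcn.fmad.ftz.f32(float %fabs.a, float %b, float %c)
  store float %mad, float addrspace(1)* %r
  ret void
}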
@@ -19,8 +19,8 @@ body: |
; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s32) = COPY $vgpr2
@@ -43,8 +43,8 @@ body: |
; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
%0:sgpr(s32) = COPY $sgpr0
%1:vgpr(s32) = COPY $vgpr0
%2:vgpr(s32) = COPY $vgpr1
@@ -67,8 +67,8 @@ body: |
; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:sgpr(s32) = COPY $sgpr0
%2:vgpr(s32) = COPY $vgpr1
@@ -91,8 +91,9 @@ body: |
; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
; GCN: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY2]]
; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY3]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr0
%2:sgpr(s32) = COPY $sgpr0
@@ -116,8 +117,8 @@ body: |
; GCN: liveins: $sgpr0, $vgpr0
; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
%0:sgpr(s32) = COPY $sgpr0
%1:vgpr(s32) = COPY $vgpr0
%2:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmad.ftz), %0, %0, %1
@@ -138,8 +139,9 @@ body: |
; GCN: liveins: $sgpr0, $vgpr0
; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
%0:sgpr(s32) = COPY $sgpr0
%1:vgpr(s32) = COPY $vgpr0
%2:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmad.ftz), %0, %1, %0
@@ -160,8 +162,9 @@ body: |
; GCN: liveins: $sgpr0, $vgpr0
; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY1]], 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, [[COPY2]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
%0:sgpr(s32) = COPY $sgpr0
%1:vgpr(s32) = COPY $vgpr0
%2:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmad.ftz), %1, %0, %0
@@ -181,8 +184,9 @@ body: |
; GCN-LABEL: name: fmad_ftz_s32_vsss
; GCN: liveins: $sgpr0, $vgpr0
; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
%0:sgpr(s32) = COPY $sgpr0
%1:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmad.ftz), %0, %0, %0
S_ENDPGM 0, implicit %1
@@ -137,8 +137,8 @@ define amdgpu_kernel void @test_fold_canonicalize_fma_value_f32(float addrspace(
}

; GCN-LABEL: test_fold_canonicalize_fmad_ftz_value_f32:
; GCN: s_mov_b32 [[SGPR:s[0-9]+]], 0x41700000
; GCN: v_mad_f32 [[V:v[0-9]+]], v{{[0-9]+}}, [[SGPR]], [[SGPR]]
; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0x41700000
; GCN: v_mac_f32_e32 [[V]], v{{[0-9]+}}, v{{[0-9]+$}}
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
@@ -5,8 +5,7 @@
declare half @llvm.amdgcn.fmad.ftz.f16(half %a, half %b, half %c)

; GCN-LABEL: {{^}}mad_f16:
; GFX8: v_ma{{[dc]}}_f16
; GFX9: v_mad_legacy_f16
; GCN: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+$}}
define amdgpu_kernel void @mad_f16(
half addrspace(1)* %r,
half addrspace(1)* %a,
@@ -34,9 +33,7 @@ define amdgpu_kernel void @mad_f16_imm_a(
}

; GCN-LABEL: {{^}}mad_f16_imm_b:
; GCN: s_movk_i32 [[KB:s[0-9]+]], 0x4800
; GFX8: v_mad_f16 {{v[0-9]+}}, {{v[0-9]+}}, [[KB]],
; GFX9: v_mad_legacy_f16 {{v[0-9]+}}, {{v[0-9]+}}, [[KB]],
; GCN: v_mac_f16_e32 {{v[0-9]+}}, 0x4800, {{v[0-9]+$}}
define amdgpu_kernel void @mad_f16_imm_b(
half addrspace(1)* %r,
half addrspace(1)* %a,
@@ -35,7 +35,7 @@ define amdgpu_kernel void @mad_f32_imm_a(

; GCN-LABEL: {{^}}mad_f32_imm_b:
; GCN: v_mov_b32_e32 [[KB:v[0-9]+]], 0x41000000
; GCN: v_ma{{[dc]}}_f32 {{v[0-9]+}}, {{[vs][0-9]+}}, [[KB]],
; GCN: v_mac_f32_e32 {{v[0-9]+}}, {{[s][0-9]+}}, [[KB]]
define amdgpu_kernel void @mad_f32_imm_b(
float addrspace(1)* %r,
float addrspace(1)* %a,
@@ -48,8 +48,11 @@ define amdgpu_kernel void @mad_f32_imm_b(
}

; GCN-LABEL: {{^}}mad_f32_imm_c:
; GCN: v_mov_b32_e32 [[KC:v[0-9]+]], 0x41000000
; GCN: v_ma{{[dc]}}_f32 {{v[0-9]+}}, {{[vs][0-9]+}}, {{v[0-9]+}}, [[KC]]{{$}}
; GCN: v_mov_b32_e32 [[C:v[0-9]+]], 0x41000000
; GCN: s_load_dword [[A:s[0-9]+]]
; GCN: s_load_dword [[B:s[0-9]+]]
; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]]
; GCN: v_mac_f32_e32 [[C]], {{s[0-9]+}}, [[VB]]{{$}}
define amdgpu_kernel void @mad_f32_imm_c(
float addrspace(1)* %r,
float addrspace(1)* %a,