AMDGPU: Use V_MAC_F32 for fmad.ftz
This avoids regressions in a future patch. I'm confused by the gfx9 use of legacy_mad. Was this a pointless instruction rename, or does it use fmul_legacy handling? Why is regular mac available in that case?
This commit is contained in:
parent 75af694a6d
commit 200b20639a
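For reference, a minimal sketch (not part of this commit) of the kind of input affected, modeled on the intrinsic and CHECK lines in the updated tests below; the kernel name @fmad_ftz_example and its arguments are placeholders. With no source modifiers, fmad.ftz should now select to v_mac_f32 rather than v_mad_f32:

declare float @llvm.amdgcn.fmad.ftz.f32(float, float, float)

define amdgpu_kernel void @fmad_ftz_example(float addrspace(1)* %r, float %a, float %b, float %c) {
  ; No source modifiers on any operand, so the new mac pattern should apply.
  %mad = call float @llvm.amdgcn.fmad.ftz.f32(float %a, float %b, float %c)
  store float %mad, float addrspace(1)* %r
  ret void
}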
@@ -848,20 +848,29 @@ def : GCNPat <
// VOP2 Patterns
//===----------------------------------------------------------------------===//

multiclass FMADPat <ValueType vt, Instruction inst> {
def : GCNPat <
(vt (fmad (VOP3NoMods vt:$src0),
(VOP3NoMods vt:$src1),
(VOP3NoMods vt:$src2))),
// TODO: Check only no src2 mods?
class FMADPat <ValueType vt, Instruction inst, SDPatternOperator node>
: GCNPat <(vt (node (vt (VOP3NoMods vt:$src0)),
(vt (VOP3NoMods vt:$src1)),
(vt (VOP3NoMods vt:$src2)))),
(inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
>;

// Prefer mac form when there are no modifiers.
let AddedComplexity = 9 in {
def : FMADPat <f32, V_MAC_F32_e64, fmad>;
def : FMADPat <f32, V_MAC_F32_e64, AMDGPUfmad_ftz>;

let SubtargetPredicate = Has16BitInsts in {
def : FMADPat <f16, V_MAC_F16_e64, fmad>;
def : FMADPat <f16, V_MAC_F16_e64, AMDGPUfmad_ftz>;
}

defm : FMADPat <f16, V_MAC_F16_e64>;
defm : FMADPat <f32, V_MAC_F32_e64>;
}

class FMADModsPat<Instruction inst, SDPatternOperator mad_opr, ValueType Ty>
class FMADModsPat<ValueType Ty, Instruction inst, SDPatternOperator mad_opr>
: GCNPat<
(Ty (mad_opr (Ty (VOP3Mods Ty:$src0, i32:$src0_mod)),
(Ty (VOP3Mods Ty:$src1, i32:$src1_mod)),
@@ -870,9 +879,8 @@ class FMADModsPat<Instruction inst, SDPatternOperator mad_opr, ValueType Ty>
$src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;

// FIXME: This should select to V_MAC_F32
def : FMADModsPat<V_MAD_F32, AMDGPUfmad_ftz, f32>;
def : FMADModsPat<V_MAD_F16, AMDGPUfmad_ftz, f16> {
def : FMADModsPat<f32, V_MAD_F32, AMDGPUfmad_ftz>;
def : FMADModsPat<f16, V_MAD_F16, AMDGPUfmad_ftz> {
let SubtargetPredicate = Has16BitInsts;
}
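For illustration only (not part of the diff): as the FIXME above notes, the FMADModsPat patterns still select V_MAD_F32 when a source modifier is present. A sketch of such a case, reusing the hypothetical kernel style and the intrinsic declaration from the example above, and assuming the fabs call folds into a source modifier as usual:

declare float @llvm.fabs.f32(float)

define amdgpu_kernel void @fmad_ftz_abs_example(float addrspace(1)* %r, float %a, float %b, float %c) {
  ; The |%a| operand becomes an abs source modifier, so the modifier pattern
  ; (V_MAD_F32) applies instead of the new no-modifier mac pattern.
  %fabs.a = call float @llvm.fabs.f32(float %a)
  %mad = call float @llvm.amdgcn.fmad.ftz.f32(float %fabs.a, float %b, float %c)
  store float %mad, float addrspace(1)* %r
  ret void
}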
@@ -19,8 +19,8 @@ body: |
; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s32) = COPY $vgpr2
@@ -43,8 +43,8 @@ body: |
; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
%0:sgpr(s32) = COPY $sgpr0
%1:vgpr(s32) = COPY $vgpr0
%2:vgpr(s32) = COPY $vgpr1
@@ -67,8 +67,8 @@ body: |
; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:sgpr(s32) = COPY $sgpr0
%2:vgpr(s32) = COPY $vgpr1
@@ -91,8 +91,9 @@ body: |
; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
; GCN: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY2]]
; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY3]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr0
%2:sgpr(s32) = COPY $sgpr0
@@ -116,8 +117,8 @@ body: |
; GCN: liveins: $sgpr0, $vgpr0
; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
%0:sgpr(s32) = COPY $sgpr0
%1:vgpr(s32) = COPY $vgpr0
%2:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmad.ftz), %0, %0, %1
@@ -138,8 +139,9 @@ body: |
; GCN: liveins: $sgpr0, $vgpr0
; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
%0:sgpr(s32) = COPY $sgpr0
%1:vgpr(s32) = COPY $vgpr0
%2:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmad.ftz), %0, %1, %0
@@ -160,8 +162,9 @@ body: |
; GCN: liveins: $sgpr0, $vgpr0
; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY1]], 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, [[COPY2]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
%0:sgpr(s32) = COPY $sgpr0
%1:vgpr(s32) = COPY $vgpr0
%2:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmad.ftz), %1, %0, %0
@@ -181,8 +184,9 @@ body: |
; GCN-LABEL: name: fmad_ftz_s32_vsss
; GCN: liveins: $sgpr0, $vgpr0
; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
%0:sgpr(s32) = COPY $sgpr0
%1:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmad.ftz), %0, %0, %0
S_ENDPGM 0, implicit %1
@@ -137,8 +137,8 @@ define amdgpu_kernel void @test_fold_canonicalize_fma_value_f32(float addrspace(
}

; GCN-LABEL: test_fold_canonicalize_fmad_ftz_value_f32:
; GCN: s_mov_b32 [[SGPR:s[0-9]+]], 0x41700000
; GCN: v_mad_f32 [[V:v[0-9]+]], v{{[0-9]+}}, [[SGPR]], [[SGPR]]
; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0x41700000
; GCN: v_mac_f32_e32 [[V]], v{{[0-9]+}}, v{{[0-9]+$}}
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
@@ -5,8 +5,7 @@
declare half @llvm.amdgcn.fmad.ftz.f16(half %a, half %b, half %c)

; GCN-LABEL: {{^}}mad_f16:
; GFX8: v_ma{{[dc]}}_f16
; GFX9: v_mad_legacy_f16
; GCN: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+$}}
define amdgpu_kernel void @mad_f16(
half addrspace(1)* %r,
half addrspace(1)* %a,
@@ -34,9 +33,7 @@ define amdgpu_kernel void @mad_f16_imm_a(
}

; GCN-LABEL: {{^}}mad_f16_imm_b:
; GCN: s_movk_i32 [[KB:s[0-9]+]], 0x4800
; GFX8: v_mad_f16 {{v[0-9]+}}, {{v[0-9]+}}, [[KB]],
; GFX9: v_mad_legacy_f16 {{v[0-9]+}}, {{v[0-9]+}}, [[KB]],
; GCN: v_mac_f16_e32 {{v[0-9]+}}, 0x4800, {{v[0-9]+$}}
define amdgpu_kernel void @mad_f16_imm_b(
half addrspace(1)* %r,
half addrspace(1)* %a,
@@ -35,7 +35,7 @@ define amdgpu_kernel void @mad_f32_imm_a(

; GCN-LABEL: {{^}}mad_f32_imm_b:
; GCN: v_mov_b32_e32 [[KB:v[0-9]+]], 0x41000000
; GCN: v_ma{{[dc]}}_f32 {{v[0-9]+}}, {{[vs][0-9]+}}, [[KB]],
; GCN: v_mac_f32_e32 {{v[0-9]+}}, {{[s][0-9]+}}, [[KB]]
define amdgpu_kernel void @mad_f32_imm_b(
float addrspace(1)* %r,
float addrspace(1)* %a,
@@ -48,8 +48,11 @@ define amdgpu_kernel void @mad_f32_imm_b(
}

; GCN-LABEL: {{^}}mad_f32_imm_c:
; GCN: v_mov_b32_e32 [[KC:v[0-9]+]], 0x41000000
; GCN: v_ma{{[dc]}}_f32 {{v[0-9]+}}, {{[vs][0-9]+}}, {{v[0-9]+}}, [[KC]]{{$}}
; GCN: v_mov_b32_e32 [[C:v[0-9]+]], 0x41000000
; GCN: s_load_dword [[A:s[0-9]+]]
; GCN: s_load_dword [[B:s[0-9]+]]
; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]]
; GCN: v_mac_f32_e32 [[C]], {{s[0-9]+}}, [[VB]]{{$}}
define amdgpu_kernel void @mad_f32_imm_c(
float addrspace(1)* %r,
float addrspace(1)* %a,