From 7377cbeef9fd2f80f128503b2c22d4134cd98b1a Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 25 Sep 2015 16:58:25 +0000 Subject: [PATCH] AMDGPU: Improve accuracy of instruction rates for VOPC These were all using the default 32-bit VALU write class, but the i64/f64 compares are half rate. I'm not sure this is really correct, because they still use a VALU write class, even though they really write to the SALU. llvm-svn: 248582 --- lib/Target/AMDGPU/SIInstrInfo.td | 107 +++++++++++++---------- lib/Target/AMDGPU/SISchedule.td | 16 +++- test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll | 6 +- test/CodeGen/AMDGPU/valu-i1.ll | 12 +-- 4 files changed, 84 insertions(+), 57 deletions(-) diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index 674f5b70836..7eb6ab319bb 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -1490,19 +1490,24 @@ multiclass VOP3b_2_3_m pattern, string opName, - bit HasMods, bit defExec, string revOp> { + bit HasMods, bit defExec, + string revOp, list sched> { def "" : VOP3_Pseudo , - VOP2_REV; + VOP2_REV { + let SchedRW = sched; + } def _si : VOP3_Real_si , VOP3DisableFields<1, 0, HasMods> { let Defs = !if(defExec, [EXEC], []); + let SchedRW = sched; } def _vi : VOP3_Real_vi , VOP3DisableFields<1, 0, HasMods> { let Defs = !if(defExec, [EXEC], []); + let SchedRW = sched; } } @@ -1690,39 +1695,40 @@ class VOPC_Pseudo pattern, string opName> : multiclass VOPC_m pattern, string opName, bit DefExec, VOPProfile p, + list sched, string revOpName = "", string asm = opName#"_e32 "#op_asm, string alias_asm = opName#" "#op_asm> { - def "" : VOPC_Pseudo ; - - let AssemblerPredicates = [isSICI] in { - - def _si : VOPC, - SIMCInstr { - let Defs = !if(DefExec, [VCC, EXEC], [VCC]); - let hasSideEffects = DefExec; + def "" : VOPC_Pseudo { + let SchedRW = sched; } - def : SIInstAlias < - alias_asm, - (!cast(NAME#"_e32_si") p.Src0RC32:$src0, p.Src1RC32:$src1) - >; + let 
AssemblerPredicates = [isSICI] in { + def _si : VOPC, + SIMCInstr { + let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + let hasSideEffects = DefExec; + let SchedRW = sched; + } + + def : SIInstAlias < + alias_asm, + (!cast(NAME#"_e32_si") p.Src0RC32:$src0, p.Src1RC32:$src1) + >; } // End AssemblerPredicates = [isSICI] - let AssemblerPredicates = [isVI] in { + def _vi : VOPC, + SIMCInstr { + let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + let hasSideEffects = DefExec; + let SchedRW = sched; + } - def _vi : VOPC, - SIMCInstr { - let Defs = !if(DefExec, [VCC, EXEC], [VCC]); - let hasSideEffects = DefExec; - } - - def : SIInstAlias < - alias_asm, - (!cast(NAME#"_e32_vi") p.Src0RC32:$src0, p.Src1RC32:$src1) - >; - + def : SIInstAlias < + alias_asm, + (!cast(NAME#"_e32_vi") p.Src0RC32:$src0, p.Src1RC32:$src1) + >; } // End AssemblerPredicates = [isVI] } @@ -1730,11 +1736,13 @@ multiclass VOPC_Helper pat32, dag out64, dag ins64, string asm64, list pat64, bit HasMods, bit DefExec, string revOp, - VOPProfile p> { - defm _e32 : VOPC_m ; + VOPProfile p, + list sched> { + defm _e32 : VOPC_m ; defm _e64 : VOP3_C_m ; + opName, HasMods, DefExec, revOp, + sched>; } // Special case for class instructions which only have modifiers on @@ -1743,18 +1751,21 @@ multiclass VOPC_Class_Helper pat32, dag out64, dag ins64, string asm64, list pat64, bit HasMods, bit DefExec, string revOp, - VOPProfile p> { - defm _e32 : VOPC_m ; + VOPProfile p, + list sched> { + defm _e32 : VOPC_m ; defm _e64 : VOP3_C_m , + opName, HasMods, DefExec, revOp, sched>, VOP3DisableModFields<1, 0, 0>; } multiclass VOPCInst : VOPC_Helper < + bit DefExec = 0, + list sched = [Write32Bit]> : + VOPC_Helper < op, opName, P.Ins32, P.Asm32, [], (outs VOPDstS64:$dst), P.Ins64, P.Asm64, @@ -1765,11 +1776,12 @@ multiclass VOPCInst ; multiclass VOPCClassInst : VOPC_Class_Helper < + bit DefExec = 0, + list sched> : VOPC_Class_Helper < op, opName, P.Ins32, P.Asm32, [], (outs VOPDstS64:$dst), P.Ins64, P.Asm64, @@ -1777,7 +1789,7 @@ 
multiclass VOPCClassInst ; @@ -1785,31 +1797,32 @@ multiclass VOPC_F32 ; multiclass VOPC_F64 : - VOPCInst ; + VOPCInst ; multiclass VOPC_I32 : VOPCInst ; multiclass VOPC_I64 : - VOPCInst ; + VOPCInst ; multiclass VOPCX sched, string revOp = ""> - : VOPCInst ; + : VOPCInst ; multiclass VOPCX_F32 : - VOPCX ; + VOPCX ; multiclass VOPCX_F64 : - VOPCX ; + VOPCX ; multiclass VOPCX_I32 : - VOPCX ; + VOPCX ; multiclass VOPCX_I64 : - VOPCX ; + VOPCX ; multiclass VOP3_Helper pat, int NumSrcArgs, bit HasMods> : VOP3_m < @@ -1817,16 +1830,16 @@ multiclass VOP3_Helper ; multiclass VOPC_CLASS_F32 : - VOPCClassInst ; + VOPCClassInst ; multiclass VOPCX_CLASS_F32 : - VOPCClassInst ; + VOPCClassInst ; multiclass VOPC_CLASS_F64 : - VOPCClassInst ; + VOPCClassInst ; multiclass VOPCX_CLASS_F64 : - VOPCClassInst ; + VOPCClassInst ; multiclass VOP3Inst : VOP3_Helper < diff --git a/lib/Target/AMDGPU/SISchedule.td b/lib/Target/AMDGPU/SISchedule.td index da7601492f9..cd77e519abb 100644 --- a/lib/Target/AMDGPU/SISchedule.td +++ b/lib/Target/AMDGPU/SISchedule.td @@ -22,12 +22,23 @@ def WriteBarrier : SchedWrite; // Vector ALU instructions def Write32Bit : SchedWrite; def WriteQuarterRate32 : SchedWrite; +def WriteFullOrQuarterRate32 : SchedWrite; def WriteFloatFMA : SchedWrite; -def WriteDouble : SchedWrite; +// Slow quarter rate f64 instruction. +def WriteDouble : SchedWrite; + +// half rate f64 instruction (same as v_add_f64) def WriteDoubleAdd : SchedWrite; +// Half rate 64-bit instructions. +def Write64Bit : SchedWrite; + +// FIXME: Should there be a class for instructions which are VALU +// instructions and have VALU rates, but write to the SALU (i.e. VOPC +// instructions) + def SIFullSpeedModel : SchedMachineModel; def SIQuarterSpeedModel : SchedMachineModel; @@ -54,7 +65,7 @@ class HWVALUWriteRes : // The latency numbers are taken from AMD Accelerated Parallel Processing -// guide. They may not be acurate. +// guide. They may not be accurate. 
// The latency values are 1 / (operations / cycle) / 4. multiclass SICommonWriteRes { @@ -68,6 +79,7 @@ multiclass SICommonWriteRes { def : HWWriteRes; // XXX: Guessed ??? def : HWVALUWriteRes; + def : HWVALUWriteRes; def : HWVALUWriteRes; } diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll index 805a88b59c7..80eb3b93f8e 100644 --- a/test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll @@ -271,7 +271,8 @@ define void @test_class_64_f64(i32 addrspace(1)* %out, double %a) #0 { ; SI: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb ; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} ; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[MASK]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI-NOT: vcc +; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm define void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 { @@ -285,7 +286,8 @@ define void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 { ; SI-DAG: buffer_load_dwordx2 [[VA:v\[[0-9]+:[0-9]+\]]] ; SI-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} ; SI: v_cmp_class_f64_e32 vcc, [[VA]], [[MASK]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI-NOT: vcc +; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #0 { diff --git a/test/CodeGen/AMDGPU/valu-i1.ll b/test/CodeGen/AMDGPU/valu-i1.ll index 7d0ebd139f5..c27702813a8 100644 --- a/test/CodeGen/AMDGPU/valu-i1.ll +++ b/test/CodeGen/AMDGPU/valu-i1.ll @@ -128,18 +128,18 @@ exit: ; SI-DAG: v_cmp_ne_i32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], -1, [[A]] ; SI-DAG: v_cmp_ne_i32_e32 [[NEG1_CHECK_1:vcc]], -1, [[B]] ; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]] -; SI: s_and_saveexec_b64 
[[ORNEG1]], [[ORNEG1]] -; SI: s_xor_b64 [[ORNEG1]], exec, [[ORNEG1]] +; SI: s_and_saveexec_b64 [[ORNEG2:s\[[0-9]+:[0-9]+\]]], [[ORNEG1]] +; SI: s_xor_b64 [[ORNEG2]], exec, [[ORNEG2]] ; SI: s_cbranch_execz BB3_5 ; SI: BB#4: ; SI: buffer_store_dword -; SI: v_cmp_ge_i64_e32 vcc -; SI: s_or_b64 [[COND_STATE]], vcc, [[COND_STATE]] +; SI: v_cmp_ge_i64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]] +; SI: s_or_b64 [[COND_STATE]], [[CMP]], [[COND_STATE]] ; SI: BB3_5: -; SI: s_or_b64 exec, exec, [[ORNEG1]] -; SI: s_or_b64 [[COND_STATE]], [[ORNEG1]], [[COND_STATE]] +; SI: s_or_b64 exec, exec, [[ORNEG2]] +; SI: s_or_b64 [[COND_STATE]], [[ORNEG2]], [[COND_STATE]] ; SI: s_andn2_b64 exec, exec, [[COND_STATE]] ; SI: s_cbranch_execnz BB3_3