[LLVM][NVPTX]: Add intrinsic for setmaxnreg (#77289)

This patch adds an intrinsic for setmaxnreg PTX instruction.
* PTX Doc link for this instruction:
https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-setmaxnreg

* The i32 argument, an immediate value, specifies the actual
  absolute register count for the instruction.
* The `setmaxnreg` instruction is available in SM90a.
  So, this patch adds 'hasSM90a' predicate to use in
  the NVPTX backend.
* lit tests are added to verify the lowering of the intrinsic.
* Verifier logic (and tests) are added to test the register
  count range and divisibility-by-8 requirements.

Signed-off-by: Durgadoss R <durgadossr@nvidia.com>
This commit is contained in:
Durgadoss R 2024-01-10 01:34:13 +05:30 committed by GitHub
parent c7c68f1764
commit 340cc1702e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 69 additions and 0 deletions

View File

@ -4710,4 +4710,14 @@ def int_nvvm_is_explicit_cluster
[IntrNoMem, IntrSpeculatable, NoUndef<RetIndex>],
"llvm.nvvm.is_explicit_cluster">;
// Setmaxnreg inc/dec intrinsics
// Both variants return nothing and take one i32 operand, reg_count, which
// must be an immediate (ImmArg on argument 0). They are declared convergent
// and side-effecting while not accessing memory (IntrNoMem).
// The IR verifier additionally rejects a reg_count that is not a multiple
// of 8 or that lies outside [24, 256].
def int_nvvm_setmaxnreg_inc_sync_aligned_u32
: DefaultAttrsIntrinsic<[], [llvm_i32_ty],
[IntrConvergent, IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>],
"llvm.nvvm.setmaxnreg.inc.sync.aligned.u32">;
// Same signature and attributes as the "inc" form; only the action differs.
def int_nvvm_setmaxnreg_dec_sync_aligned_u32
: DefaultAttrsIntrinsic<[], [llvm_i32_ty],
[IntrConvergent, IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>],
"llvm.nvvm.setmaxnreg.dec.sync.aligned.u32">;
} // let TargetPrefix = "nvvm"

View File

@ -96,6 +96,7 @@
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/IntrinsicsWebAssembly.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
@ -6031,6 +6032,16 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
"Value for inactive lanes must be a VGPR function argument", &Call);
break;
}
case Intrinsic::nvvm_setmaxnreg_inc_sync_aligned_u32:
case Intrinsic::nvvm_setmaxnreg_dec_sync_aligned_u32: {
  // Validate the reg_count operand of llvm.nvvm.setmaxnreg.{inc,dec}:
  // it must be a multiple of 8 and lie within [24, 256].
  // The cast<> relies on the operand being a ConstantInt; the intrinsic
  // declares it ImmArg -- NOTE(review): presumably the generic immarg check
  // has already run before this point; confirm.
  Value *V = Call.getArgOperand(0);
  unsigned RegCount = cast<ConstantInt>(V)->getZExtValue();
  // Pass &Call so the diagnostic identifies the offending call, matching the
  // other intrinsic checks in this switch.
  Check(RegCount % 8 == 0,
        "reg_count argument to nvvm.setmaxnreg must be in multiples of 8",
        &Call);
  Check((RegCount >= 24 && RegCount <= 256),
        "reg_count argument to nvvm.setmaxnreg must be within [24, 256]",
        &Call);
  break;
}
case Intrinsic::experimental_convergence_entry:
LLVM_FALLTHROUGH;
case Intrinsic::experimental_convergence_anchor:

View File

@ -164,6 +164,9 @@ def True : Predicate<"true">;
class hasPTX<int version>: Predicate<"Subtarget->getPTXVersion() >= " # version>;
class hasSM<int version>: Predicate<"Subtarget->getSmVersion() >= " # version>;
// Explicit records for arch-accelerated SM versions
// Unlike hasSM<N>, which compares getSmVersion() with ">=", this predicate
// is an exact-equality test on getFullSmVersion().
// NOTE(review): 901 is presumably the full-version encoding of sm_90a --
// confirm against the subtarget's getFullSmVersion() definition.
def hasSM90a : Predicate<"Subtarget->getFullSmVersion() == 901">;
// non-sync shfl instructions are not available on sm_70+ in PTX6.4+
def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70"
"&& Subtarget->getPTXVersion() >= 64)">;

View File

@ -6727,3 +6727,16 @@ def is_explicit_cluster: NVPTXInst<(outs Int1Regs:$d), (ins),
"mov.pred\t$d, %is_explicit_cluster;",
[(set Int1Regs:$d, (int_nvvm_is_explicit_cluster))]>,
Requires<[hasSM<90>, hasPTX<78>]>;
// setmaxnreg inc/dec intrinsics
// Every record created inside this 'let' is marked isConvergent.
let isConvergent = true in {
// Emits one anonymous instruction that selects the intrinsic `Intr` applied
// to an immediate operand (timm) and prints
//   setmaxnreg.<Action>.sync.aligned.u32 <reg_count>;
// The instruction has no outputs and takes reg_count as an i32 immediate.
// Selection is gated on the hasSM90a predicate and on PTX version >= 80.
multiclass SET_MAXNREG<string Action, Intrinsic Intr> {
def : NVPTXInst<(outs), (ins i32imm:$reg_count),
"setmaxnreg." # Action # ".sync.aligned.u32 $reg_count;",
[(Intr timm:$reg_count)]>,
Requires<[hasSM90a, hasPTX<80>]>;
}
// One instantiation per action: "inc" and "dec".
defm INT_SET_MAXNREG_INC : SET_MAXNREG<"inc", int_nvvm_setmaxnreg_inc_sync_aligned_u32>;
defm INT_SET_MAXNREG_DEC : SET_MAXNREG<"dec", int_nvvm_setmaxnreg_dec_sync_aligned_u32>;
} // isConvergent

View File

@ -0,0 +1,16 @@
; RUN: llc < %s -march=nvptx64 -mcpu=sm_90a -mattr=+ptx80| FileCheck --check-prefixes=CHECK %s
; RUN: %if ptxas-12.0 %{ llc < %s -march=nvptx64 -mcpu=sm_90a -mattr=+ptx80| %ptxas-verify -arch=sm_90a %}
; Lowering test for the llvm.nvvm.setmaxnreg.{inc,dec} intrinsics: each call
; with an immediate register count is expected to become the corresponding
; setmaxnreg PTX instruction carrying the same immediate. The second command
; above, when ptxas 12.0 is available, also feeds the generated PTX to ptxas.
declare void @llvm.nvvm.setmaxnreg.inc.sync.aligned.u32(i32 %reg_count)
declare void @llvm.nvvm.setmaxnreg.dec.sync.aligned.u32(i32 %reg_count)
; CHECK-LABEL: test_set_maxn_reg
define void @test_set_maxn_reg() {
; "inc" form with reg_count = 96.
; CHECK: setmaxnreg.inc.sync.aligned.u32 96;
call void @llvm.nvvm.setmaxnreg.inc.sync.aligned.u32(i32 96)
; "dec" form with reg_count = 64.
; CHECK: setmaxnreg.dec.sync.aligned.u32 64;
call void @llvm.nvvm.setmaxnreg.dec.sync.aligned.u32(i32 64)
ret void
}

View File

@ -0,0 +1,2 @@
# Mark every test in this directory as unsupported unless "NVPTX" is among
# the targets listed in config.root.targets. `config` is provided by lit
# when it loads this file.
if "NVPTX" not in config.root.targets:
    config.unsupported = True

View File

@ -0,0 +1,14 @@
; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s
; Negative verifier test for llvm.nvvm.setmaxnreg.{inc,dec}: each call below
; passes a reg_count that violates one constraint, and the matching
; diagnostic is expected in the same order as the calls.
declare void @llvm.nvvm.setmaxnreg.inc.sync.aligned.u32(i32 %reg_count)
declare void @llvm.nvvm.setmaxnreg.dec.sync.aligned.u32(i32 %reg_count)
define void @test_set_maxn_reg() {
; 95 is not a multiple of 8.
; CHECK: reg_count argument to nvvm.setmaxnreg must be in multiples of 8
call void @llvm.nvvm.setmaxnreg.inc.sync.aligned.u32(i32 95)
; 16 is a multiple of 8 but below the lower bound of 24.
; CHECK: reg_count argument to nvvm.setmaxnreg must be within [24, 256]
call void @llvm.nvvm.setmaxnreg.dec.sync.aligned.u32(i32 16)
; 264 is a multiple of 8 but above the upper bound of 256.
; CHECK: reg_count argument to nvvm.setmaxnreg must be within [24, 256]
call void @llvm.nvvm.setmaxnreg.inc.sync.aligned.u32(i32 264)
ret void
}