[NVPTX] Re-enable support for virtual registers in the final output

Now that 3.3 is branched, we are re-enabling virtual registers to help
iron out bugs before the next release. Some of the post-RA passes do
not play well with virtual registers, so we disable them for now. The
needed functionality of the PrologEpilogInserter pass is copied to a
new backend-specific NVPTXPrologEpilog pass.

No new test is added; the test for this commit is that the existing tests keep passing.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182998 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Justin Holewinski 2013-05-31 12:14:49 +00:00
parent 7ae921dbec
commit 5443e7d790
10 changed files with 341 additions and 100 deletions

View File

@ -24,6 +24,7 @@ set(NVPTXCodeGen_sources
NVPTXUtilities.cpp
NVVMReflect.cpp
NVPTXGenericToNVVM.cpp
NVPTXPrologEpilogPass.cpp
)
add_llvm_target(NVPTXCodeGen ${NVPTXCodeGen_sources})

View File

@ -27,6 +27,7 @@
namespace llvm {
class NVPTXTargetMachine;
class FunctionPass;
class MachineFunctionPass;
class formatted_raw_ostream;
namespace NVPTXCC {
@ -66,6 +67,7 @@ FunctionPass *createNVPTXReMatBlockPass(NVPTXTargetMachine &);
ModulePass *createGenericToNVVMPass();
ModulePass *createNVVMReflectPass();
ModulePass *createNVVMReflectPass(const StringMap<int>& Mapping);
MachineFunctionPass *createNVPTXPrologEpilogPass();
bool isImageOrSamplerVal(const Value *, const Module *);

View File

@ -436,9 +436,7 @@ void NVPTXAsmPrinter::EmitFunctionEntryLabel() {
}
void NVPTXAsmPrinter::EmitFunctionBodyStart() {
const TargetRegisterInfo &TRI = *TM.getRegisterInfo();
unsigned numRegClasses = TRI.getNumRegClasses();
VRidGlobal2LocalMap = new std::map<unsigned, unsigned>[numRegClasses + 1];
VRegMapping.clear();
OutStreamer.EmitRawText(StringRef("{\n"));
setAndEmitFunctionVirtualRegisters(*MF);
@ -450,7 +448,7 @@ void NVPTXAsmPrinter::EmitFunctionBodyStart() {
void NVPTXAsmPrinter::EmitFunctionBodyEnd() {
OutStreamer.EmitRawText(StringRef("}\n"));
delete[] VRidGlobal2LocalMap;
VRegMapping.clear();
}
void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
@ -507,9 +505,8 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
void NVPTXAsmPrinter::getVirtualRegisterName(unsigned vr, bool isVec,
raw_ostream &O) {
const TargetRegisterClass *RC = MRI->getRegClass(vr);
unsigned id = RC->getID();
std::map<unsigned, unsigned> &regmap = VRidGlobal2LocalMap[id];
DenseMap<unsigned, unsigned> &regmap = VRegMapping[RC];
unsigned mapped_vr = regmap[vr];
if (!isVec) {
@ -1709,48 +1706,36 @@ void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters(
for (unsigned i = 0; i < numVRs; i++) {
unsigned int vr = TRI->index2VirtReg(i);
const TargetRegisterClass *RC = MRI->getRegClass(vr);
std::map<unsigned, unsigned> &regmap = VRidGlobal2LocalMap[RC->getID()];
DenseMap<unsigned, unsigned> &regmap = VRegMapping[RC];
int n = regmap.size();
regmap.insert(std::make_pair(vr, n + 1));
}
// Emit register declarations
// @TODO: Extract out the real register usage
O << "\t.reg .pred %p<" << NVPTXNumRegisters << ">;\n";
O << "\t.reg .s16 %rc<" << NVPTXNumRegisters << ">;\n";
O << "\t.reg .s16 %rs<" << NVPTXNumRegisters << ">;\n";
O << "\t.reg .s32 %r<" << NVPTXNumRegisters << ">;\n";
O << "\t.reg .s64 %rl<" << NVPTXNumRegisters << ">;\n";
O << "\t.reg .f32 %f<" << NVPTXNumRegisters << ">;\n";
O << "\t.reg .f64 %fl<" << NVPTXNumRegisters << ">;\n";
// O << "\t.reg .pred %p<" << NVPTXNumRegisters << ">;\n";
// O << "\t.reg .s16 %rc<" << NVPTXNumRegisters << ">;\n";
// O << "\t.reg .s16 %rs<" << NVPTXNumRegisters << ">;\n";
// O << "\t.reg .s32 %r<" << NVPTXNumRegisters << ">;\n";
// O << "\t.reg .s64 %rl<" << NVPTXNumRegisters << ">;\n";
// O << "\t.reg .f32 %f<" << NVPTXNumRegisters << ">;\n";
// O << "\t.reg .f64 %fl<" << NVPTXNumRegisters << ">;\n";
// Emit declaration of the virtual registers or 'physical' registers for
// each register class
//for (unsigned i=0; i< numRegClasses; i++) {
// std::map<unsigned, unsigned> &regmap = VRidGlobal2LocalMap[i];
// const TargetRegisterClass *RC = TRI->getRegClass(i);
// std::string rcname = getNVPTXRegClassName(RC);
// std::string rcStr = getNVPTXRegClassStr(RC);
// //int n = regmap.size();
// if (!isNVPTXVectorRegClass(RC)) {
// O << "\t.reg " << rcname << " \t" << rcStr << "<"
// << NVPTXNumRegisters << ">;\n";
// }
for (unsigned i=0; i< TRI->getNumRegClasses(); i++) {
const TargetRegisterClass *RC = TRI->getRegClass(i);
DenseMap<unsigned, unsigned> &regmap = VRegMapping[RC];
std::string rcname = getNVPTXRegClassName(RC);
std::string rcStr = getNVPTXRegClassStr(RC);
int n = regmap.size();
// Only declare those registers that may be used. And do not emit vector
// registers as
// they are all elementized to scalar registers.
//if (n && !isNVPTXVectorRegClass(RC)) {
// if (RegAllocNilUsed) {
// O << "\t.reg " << rcname << " \t" << rcStr << "<" << (n+1)
// << ">;\n";
// }
// else {
// O << "\t.reg " << rcname << " \t" << StrToUpper(rcStr)
// << "<" << 32 << ">;\n";
// }
//}
//}
// Only declare those registers that may be used.
if (n) {
O << "\t.reg " << rcname << " \t" << rcStr << "<" << (n+1)
<< ">;\n";
}
}
OutStreamer.EmitRawText(O.str());
}

View File

@ -243,7 +243,9 @@ private:
// The contents are specific for each
// MachineFunction. But the size of the
// array is not.
std::map<unsigned, unsigned> *VRidGlobal2LocalMap;
typedef DenseMap<unsigned, unsigned> VRegMap;
typedef DenseMap<const TargetRegisterClass *, VRegMap> VRegRCMap;
VRegRCMap VRegMapping;
// cache the subtarget here.
const NVPTXSubtarget &nvptxSubtarget;
// Build the map between type name and ID based on module's type
@ -281,7 +283,6 @@ public:
: AsmPrinter(TM, Streamer),
nvptxSubtarget(TM.getSubtarget<NVPTXSubtarget>()) {
CurrentBankselLabelInBasicBlock = "";
VRidGlobal2LocalMap = NULL;
reader = NULL;
}

View File

@ -32,36 +32,36 @@ NVPTXInstrInfo::NVPTXInstrInfo(NVPTXTargetMachine &tm)
void NVPTXInstrInfo::copyPhysReg(
MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL,
unsigned DestReg, unsigned SrcReg, bool KillSrc) const {
if (NVPTX::Int32RegsRegClass.contains(DestReg) &&
NVPTX::Int32RegsRegClass.contains(SrcReg))
const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
const TargetRegisterClass *DestRC = MRI.getRegClass(DestReg);
const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
if (DestRC != SrcRC)
report_fatal_error("Attempted to created cross-class register copy");
if (DestRC == &NVPTX::Int32RegsRegClass)
BuildMI(MBB, I, DL, get(NVPTX::IMOV32rr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
else if (NVPTX::Int8RegsRegClass.contains(DestReg) &&
NVPTX::Int8RegsRegClass.contains(SrcReg))
BuildMI(MBB, I, DL, get(NVPTX::IMOV8rr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
else if (NVPTX::Int1RegsRegClass.contains(DestReg) &&
NVPTX::Int1RegsRegClass.contains(SrcReg))
else if (DestRC == &NVPTX::Int1RegsRegClass)
BuildMI(MBB, I, DL, get(NVPTX::IMOV1rr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
else if (NVPTX::Float32RegsRegClass.contains(DestReg) &&
NVPTX::Float32RegsRegClass.contains(SrcReg))
else if (DestRC == &NVPTX::Float32RegsRegClass)
BuildMI(MBB, I, DL, get(NVPTX::FMOV32rr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
else if (NVPTX::Int16RegsRegClass.contains(DestReg) &&
NVPTX::Int16RegsRegClass.contains(SrcReg))
else if (DestRC == &NVPTX::Int16RegsRegClass)
BuildMI(MBB, I, DL, get(NVPTX::IMOV16rr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
else if (NVPTX::Int64RegsRegClass.contains(DestReg) &&
NVPTX::Int64RegsRegClass.contains(SrcReg))
else if (DestRC == &NVPTX::Int8RegsRegClass)
BuildMI(MBB, I, DL, get(NVPTX::IMOV8rr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
else if (DestRC == &NVPTX::Int64RegsRegClass)
BuildMI(MBB, I, DL, get(NVPTX::IMOV64rr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
else if (NVPTX::Float64RegsRegClass.contains(DestReg) &&
NVPTX::Float64RegsRegClass.contains(SrcReg))
else if (DestRC == &NVPTX::Float64RegsRegClass)
BuildMI(MBB, I, DL, get(NVPTX::FMOV64rr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
else {
llvm_unreachable("Don't know how to copy a register");
llvm_unreachable("Bad register copy");
}
}

View File

@ -0,0 +1,225 @@
//===-- NVPTXPrologEpilogPass.cpp - NVPTX prolog/epilog inserter ----------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file is a copy of the generic LLVM PrologEpilogInserter pass, modified
// to remove unneeded functionality and to handle virtual registers. Most code
// here is a copy of PrologEpilogInserter.cpp.
//
//===----------------------------------------------------------------------===//
#include "NVPTX.h"
#include "llvm/Pass.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
namespace {
// NVPTX-specific stand-in for the generic prolog/epilog inserter: it lays out
// the stack frame, rewrites frame-index operands, and emits the function
// prologue and epilogues.
class NVPTXPrologEpilogPass : public MachineFunctionPass {
public:
// Pass identifier; handed to the MachineFunctionPass base constructor below.
static char ID;
NVPTXPrologEpilogPass() : MachineFunctionPass(ID) {}
// Runs the pass on MF. Returns true when at least one frame-index operand was
// eliminated.
virtual bool runOnMachineFunction(MachineFunction &MF);
private:
// Assigns an offset to each stack object of Fn and records the resulting
// stack size in the function's frame info.
void calculateFrameObjectOffsets(MachineFunction &Fn);
};
}
/// Factory for the NVPTX prolog/epilog pass. Returns a newly heap-allocated
/// pass instance.
MachineFunctionPass *llvm::createNVPTXPrologEpilogPass() {
  MachineFunctionPass *const Pass = new NVPTXPrologEpilogPass();
  return Pass;
}
// Storage for the pass identifier that the constructor passes to
// MachineFunctionPass.
char NVPTXPrologEpilogPass::ID = 0;
/// Lay out the stack frame of \p MF, rewrite every frame-index operand, and
/// emit the prologue plus an epilogue for each block that ends in a return.
///
/// \returns true if at least one frame-index operand was eliminated.
bool NVPTXPrologEpilogPass::runOnMachineFunction(MachineFunction &MF) {
  const TargetMachine &Target = MF.getTarget();
  const TargetFrameLowering &FrameLowering = *Target.getFrameLowering();
  const TargetRegisterInfo &RegInfo = *Target.getRegisterInfo();
  bool Changed = false;

  // Assign frame offsets first, then rewrite the operands that refer to them.
  calculateFrameObjectOffsets(MF);

  for (MachineFunction::iterator MBB = MF.begin(), MBBEnd = MF.end();
       MBB != MBBEnd; ++MBB) {
    // The block's end iterator is re-read on every iteration, not cached.
    for (MachineBasicBlock::iterator It = MBB->begin(); It != MBB->end();
         ++It) {
      MachineInstr *Instr = It;
      const unsigned NumOps = Instr->getNumOperands();
      for (unsigned OpIdx = 0; OpIdx != NumOps; ++OpIdx) {
        if (Instr->getOperand(OpIdx).isFI()) {
          RegInfo.eliminateFrameIndex(Instr, 0, OpIdx, NULL);
          Changed = true;
        }
      }
    }
  }

  // Add function prolog/epilog
  FrameLowering.emitPrologue(MF);
  for (MachineFunction::iterator MBB = MF.begin(), MBBEnd = MF.end();
       MBB != MBBEnd; ++MBB) {
    if (MBB->empty())
      continue;
    // Only blocks whose last instruction is a return receive an epilogue.
    if (MBB->back().isReturn())
      FrameLowering.emitEpilogue(MF, *MBB);
  }

  return Changed;
}
/// AdjustStackOffset - Place stack object \p FrameIdx at the next aligned
/// slot.
///
/// \p Offset is the running distance from the start of the frame and is
/// advanced past the object. \p MaxAlign is raised to the object's alignment
/// when that alignment is larger.
static inline void
AdjustStackOffset(MachineFrameInfo *MFI, int FrameIdx,
                  bool StackGrowsDown, int64_t &Offset,
                  unsigned &MaxAlign) {
  // For a downward-growing stack the object's lowest address lies one object
  // size further along, so add the size before aligning.
  if (StackGrowsDown)
    Offset += MFI->getObjectSize(FrameIdx);

  const unsigned ObjAlign = MFI->getObjectAlignment(FrameIdx);

  // Track the largest alignment requested by any object seen so far.
  if (MaxAlign < ObjAlign)
    MaxAlign = ObjAlign;

  // Adjust to alignment boundary.
  Offset = (Offset + ObjAlign - 1) / ObjAlign * ObjAlign;

  // The stored offset is negated when the stack grows down.
  const int64_t ObjOffset = StackGrowsDown ? -Offset : Offset;
  DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << ObjOffset << "]\n");
  MFI->setObjectOffset(FrameIdx, ObjOffset);

  // For an upward-growing stack the size is consumed after placement.
  if (!StackGrowsDown)
    Offset += MFI->getObjectSize(FrameIdx);
}
/// Compute the final offset of every stack object in \p Fn and store the
/// resulting frame size in the function's MachineFrameInfo.
///
/// This follows the generic prolog/epilog inserter's layout routine, with the
/// call-stack, register-scavenger and stack-protector handling left out.
void
NVPTXPrologEpilogPass::calculateFrameObjectOffsets(MachineFunction &Fn) {
  const TargetFrameLowering &FrameLowering =
      *Fn.getTarget().getFrameLowering();
  const TargetRegisterInfo *RegInfo = Fn.getTarget().getRegisterInfo();
  const bool GrowsDown = FrameLowering.getStackGrowthDirection() ==
                         TargetFrameLowering::StackGrowsDown;
  MachineFrameInfo *MFI = Fn.getFrameInfo();

  // Offsets are tracked as a distance from the stack top, measured in the
  // direction of stack growth, so the starting value is never negative.
  const int RawLocalArea = FrameLowering.getOffsetOfLocalArea();
  const int LocalAreaOffset = GrowsDown ? -RawLocalArea : RawLocalArea;
  assert(LocalAreaOffset >= 0
         && "Local area offset should be in direction of stack growth");
  int64_t Offset = LocalAreaOffset;

  // Fixed objects (negative indices) are already placed. Holes between them
  // are not reused, so start allocating after the one furthest from the
  // stack top.
  for (int FixedIdx = MFI->getObjectIndexBegin(); FixedIdx != 0; ++FixedIdx) {
    int64_t FixedEnd = MFI->getObjectOffset(FixedIdx);
    if (GrowsDown) {
      // Downward stack: the stored offset is negative and names the lowest
      // address, so its negation is the distance.
      FixedEnd = -FixedEnd;
    } else {
      // Upward stack: the furthest point is the object's upper address.
      FixedEnd += MFI->getObjectSize(FixedIdx);
    }
    Offset = std::max(Offset, FixedEnd);
  }

  // NOTE: We do not have a call stack

  unsigned MaxAlign = MFI->getMaxAlignment();

  // No scavenger

  // FIXME: Once this is working, the enable flag will become a target check
  // for whether the frame is large enough to want virtual frame index
  // registers. Functions that do not need this keep using the existing path.
  if (MFI->getUseLocalStackAllocationBlock()) {
    const unsigned LocalAlign = MFI->getLocalFrameMaxAlign();

    // Adjust to alignment boundary.
    Offset = (Offset + LocalAlign - 1) / LocalAlign * LocalAlign;

    DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n");

    // Every object in the local block sits at a fixed distance from the
    // block base, which does not change inside the loop.
    const int64_t LocalBase = GrowsDown ? -Offset : Offset;
    const unsigned NumLocal = MFI->getLocalFrameObjectCount();
    for (unsigned Slot = 0; Slot != NumLocal; ++Slot) {
      const std::pair<int, int64_t> Mapped = MFI->getLocalFrameObjectMap(Slot);
      const int64_t FIOffset = LocalBase + Mapped.second;
      DEBUG(dbgs() << "alloc FI(" << Mapped.first << ") at SP[" <<
            FIOffset << "]\n");
      MFI->setObjectOffset(Mapped.first, FIOffset);
    }

    // Allocate the local block
    Offset += MFI->getLocalFrameSize();

    MaxAlign = std::max(LocalAlign, MaxAlign);
  }

  // No stack protector

  // Place the remaining objects, skipping dead ones and those already
  // pre-allocated into the local block.
  for (unsigned ObjIdx = 0, ObjEnd = MFI->getObjectIndexEnd();
       ObjIdx != ObjEnd; ++ObjIdx) {
    const bool InLocalBlock = MFI->isObjectPreAllocated(ObjIdx) &&
                              MFI->getUseLocalStackAllocationBlock();
    if (InLocalBlock || MFI->isDeadObjectIndex(ObjIdx))
      continue;

    AdjustStackOffset(MFI, ObjIdx, GrowsDown, Offset, MaxAlign);
  }

  // No scavenger

  if (!FrameLowering.targetHandlesStackFrameRounding()) {
    // Reserved argument space for call sites counts toward the overall
    // stack size.
    if (MFI->adjustsStack() && FrameLowering.hasReservedCallFrame(Fn))
      Offset += MFI->getMaxCallFrameSize();

    // Functions with calls, variable-sized objects, or a realignment
    // requirement use the target's StackAlignment; the rest (leaf functions)
    // use the TransientStackAlignment.
    const bool NeedsFullAlign =
        MFI->adjustsStack() || MFI->hasVarSizedObjects() ||
        (RegInfo->needsStackRealignment(Fn) && MFI->getObjectIndexEnd() != 0);
    unsigned StackAlign = NeedsFullAlign
                              ? FrameLowering.getStackAlignment()
                              : FrameLowering.getTransientStackAlignment();

    // If the frame pointer is eliminated, all frame offsets are relative to
    // SP rather than FP, so align to MaxAlign as well.
    StackAlign = std::max(StackAlign, MaxAlign);
    const unsigned AlignMask = StackAlign - 1;
    Offset = (Offset + AlignMask) & ~uint64_t(AlignMask);
  }

  // Update frame info to pretend that this is part of the stack...
  MFI->setStackSize(Offset - LocalAreaOffset);
}

View File

@ -57,9 +57,9 @@ std::string getNVPTXRegClassStr(TargetRegisterClass const *RC) {
return "%f";
}
if (RC == &NVPTX::Float64RegsRegClass) {
return "%fd";
return "%fl";
} else if (RC == &NVPTX::Int64RegsRegClass) {
return "%rd";
return "%rl";
} else if (RC == &NVPTX::Int32RegsRegClass) {
return "%r";
} else if (RC == &NVPTX::Int16RegsRegClass) {

View File

@ -107,6 +107,10 @@ public:
virtual void addIRPasses();
virtual bool addInstSelector();
virtual bool addPreRegAlloc();
virtual bool addPostRegAlloc();
virtual void addFastRegAlloc(FunctionPass *RegAllocPass);
virtual void addOptimizedRegAlloc(FunctionPass *RegAllocPass);
};
} // end anonymous namespace
@ -116,6 +120,15 @@ TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
}
// Configure the IR-level pipeline for NVPTX: disable three generic passes,
// run the base-class IR pass setup, then append the GenericToNVVM pass.
void NVPTXPassConfig::addIRPasses() {
// The following passes are known to not play well with virtual regs hanging
// around after register allocation (which in our case, is *all* registers).
// We explicitly disable them here. We do, however, need some functionality
// of the PrologEpilogCodeInserter pass, so we emulate that behavior in the
// NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
disablePass(&PrologEpilogCodeInserterID);
disablePass(&MachineCopyPropagationID);
disablePass(&BranchFolderPassID);
TargetPassConfig::addIRPasses();
addPass(createGenericToNVVMPass());
}
@ -129,3 +142,17 @@ bool NVPTXPassConfig::addInstSelector() {
}
// No NVPTX-specific passes are added before register allocation.
bool NVPTXPassConfig::addPreRegAlloc() { return false; }
// Post-register-allocation hook: adds the NVPTX-specific prolog/epilog pass.
// NOTE(review): the meaning of the false return value is defined by the
// TargetPassConfig base class — confirm there.
bool NVPTXPassConfig::addPostRegAlloc() {
addPass(createNVPTXPrologEpilogPass());
return false;
}
// Fast register-allocation hook. RegAllocPass is ignored; only
// StrongPHIElimination is added.
void NVPTXPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
// No reg alloc
addPass(&StrongPHIEliminationID);
}
// Optimized register-allocation hook. RegAllocPass is ignored; only
// StrongPHIElimination is added, the same as in addFastRegAlloc.
void NVPTXPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
// No reg alloc
addPass(&StrongPHIEliminationID);
}

View File

@ -2,231 +2,231 @@
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
define ptx_device i32 @test_tid_x() {
; CHECK: mov.u32 %r0, %tid.x;
; CHECK: mov.u32 %r{{[0-9]+}}, %tid.x;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.tid.x()
ret i32 %x
}
define ptx_device i32 @test_tid_y() {
; CHECK: mov.u32 %r0, %tid.y;
; CHECK: mov.u32 %r{{[0-9]+}}, %tid.y;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.tid.y()
ret i32 %x
}
define ptx_device i32 @test_tid_z() {
; CHECK: mov.u32 %r0, %tid.z;
; CHECK: mov.u32 %r{{[0-9]+}}, %tid.z;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.tid.z()
ret i32 %x
}
define ptx_device i32 @test_tid_w() {
; CHECK: mov.u32 %r0, %tid.w;
; CHECK: mov.u32 %r{{[0-9]+}}, %tid.w;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.tid.w()
ret i32 %x
}
define ptx_device i32 @test_ntid_x() {
; CHECK: mov.u32 %r0, %ntid.x;
; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.x;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.ntid.x()
ret i32 %x
}
define ptx_device i32 @test_ntid_y() {
; CHECK: mov.u32 %r0, %ntid.y;
; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.y;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.ntid.y()
ret i32 %x
}
define ptx_device i32 @test_ntid_z() {
; CHECK: mov.u32 %r0, %ntid.z;
; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.z;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.ntid.z()
ret i32 %x
}
define ptx_device i32 @test_ntid_w() {
; CHECK: mov.u32 %r0, %ntid.w;
; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.w;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.ntid.w()
ret i32 %x
}
define ptx_device i32 @test_laneid() {
; CHECK: mov.u32 %r0, %laneid;
; CHECK: mov.u32 %r{{[0-9]+}}, %laneid;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.laneid()
ret i32 %x
}
define ptx_device i32 @test_warpid() {
; CHECK: mov.u32 %r0, %warpid;
; CHECK: mov.u32 %r{{[0-9]+}}, %warpid;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.warpid()
ret i32 %x
}
define ptx_device i32 @test_nwarpid() {
; CHECK: mov.u32 %r0, %nwarpid;
; CHECK: mov.u32 %r{{[0-9]+}}, %nwarpid;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.nwarpid()
ret i32 %x
}
define ptx_device i32 @test_ctaid_x() {
; CHECK: mov.u32 %r0, %ctaid.x;
; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.x;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.ctaid.x()
ret i32 %x
}
define ptx_device i32 @test_ctaid_y() {
; CHECK: mov.u32 %r0, %ctaid.y;
; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.y;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.ctaid.y()
ret i32 %x
}
define ptx_device i32 @test_ctaid_z() {
; CHECK: mov.u32 %r0, %ctaid.z;
; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.z;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.ctaid.z()
ret i32 %x
}
define ptx_device i32 @test_ctaid_w() {
; CHECK: mov.u32 %r0, %ctaid.w;
; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.w;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.ctaid.w()
ret i32 %x
}
define ptx_device i32 @test_nctaid_x() {
; CHECK: mov.u32 %r0, %nctaid.x;
; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.x;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.nctaid.x()
ret i32 %x
}
define ptx_device i32 @test_nctaid_y() {
; CHECK: mov.u32 %r0, %nctaid.y;
; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.y;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.nctaid.y()
ret i32 %x
}
define ptx_device i32 @test_nctaid_z() {
; CHECK: mov.u32 %r0, %nctaid.z;
; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.z;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.nctaid.z()
ret i32 %x
}
define ptx_device i32 @test_nctaid_w() {
; CHECK: mov.u32 %r0, %nctaid.w;
; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.w;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.nctaid.w()
ret i32 %x
}
define ptx_device i32 @test_smid() {
; CHECK: mov.u32 %r0, %smid;
; CHECK: mov.u32 %r{{[0-9]+}}, %smid;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.smid()
ret i32 %x
}
define ptx_device i32 @test_nsmid() {
; CHECK: mov.u32 %r0, %nsmid;
; CHECK: mov.u32 %r{{[0-9]+}}, %nsmid;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.nsmid()
ret i32 %x
}
define ptx_device i32 @test_gridid() {
; CHECK: mov.u32 %r0, %gridid;
; CHECK: mov.u32 %r{{[0-9]+}}, %gridid;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.gridid()
ret i32 %x
}
define ptx_device i32 @test_lanemask_eq() {
; CHECK: mov.u32 %r0, %lanemask_eq;
; CHECK: mov.u32 %r{{[0-9]+}}, %lanemask_eq;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.lanemask.eq()
ret i32 %x
}
define ptx_device i32 @test_lanemask_le() {
; CHECK: mov.u32 %r0, %lanemask_le;
; CHECK: mov.u32 %r{{[0-9]+}}, %lanemask_le;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.lanemask.le()
ret i32 %x
}
define ptx_device i32 @test_lanemask_lt() {
; CHECK: mov.u32 %r0, %lanemask_lt;
; CHECK: mov.u32 %r{{[0-9]+}}, %lanemask_lt;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.lanemask.lt()
ret i32 %x
}
define ptx_device i32 @test_lanemask_ge() {
; CHECK: mov.u32 %r0, %lanemask_ge;
; CHECK: mov.u32 %r{{[0-9]+}}, %lanemask_ge;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.lanemask.ge()
ret i32 %x
}
define ptx_device i32 @test_lanemask_gt() {
; CHECK: mov.u32 %r0, %lanemask_gt;
; CHECK: mov.u32 %r{{[0-9]+}}, %lanemask_gt;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.lanemask.gt()
ret i32 %x
}
define ptx_device i32 @test_clock() {
; CHECK: mov.u32 %r0, %clock;
; CHECK: mov.u32 %r{{[0-9]+}}, %clock;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.clock()
ret i32 %x
}
define ptx_device i64 @test_clock64() {
; CHECK: mov.u64 %rl0, %clock64;
; CHECK: mov.u64 %rl{{[0-9]+}}, %clock64;
; CHECK: ret;
%x = call i64 @llvm.ptx.read.clock64()
ret i64 %x
}
define ptx_device i32 @test_pm0() {
; CHECK: mov.u32 %r0, %pm0;
; CHECK: mov.u32 %r{{[0-9]+}}, %pm0;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.pm0()
ret i32 %x
}
define ptx_device i32 @test_pm1() {
; CHECK: mov.u32 %r0, %pm1;
; CHECK: mov.u32 %r{{[0-9]+}}, %pm1;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.pm1()
ret i32 %x
}
define ptx_device i32 @test_pm2() {
; CHECK: mov.u32 %r0, %pm2;
; CHECK: mov.u32 %r{{[0-9]+}}, %pm2;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.pm2()
ret i32 %x
}
define ptx_device i32 @test_pm3() {
; CHECK: mov.u32 %r0, %pm3;
; CHECK: mov.u32 %r{{[0-9]+}}, %pm3;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.pm3()
ret i32 %x

View File

@ -2,14 +2,14 @@
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
define ptx_device float @test_fabsf(float %f) {
; CHECK: abs.f32 %f0, %f0;
; CHECK: abs.f32 %f{{[0-9]+}}, %f{{[0-9]+}};
; CHECK: ret;
%x = call float @llvm.fabs.f32(float %f)
ret float %x
}
define ptx_device double @test_fabs(double %d) {
; CHECK: abs.f64 %fl0, %fl0;
; CHECK: abs.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}};
; CHECK: ret;
%x = call double @llvm.fabs.f64(double %d)
ret double %x