From b1a24afa7a2c317de4da7e25ca003a918e45ac74 Mon Sep 17 00:00:00 2001 From: Artem Belevich Date: Tue, 21 Feb 2017 22:56:05 +0000 Subject: [PATCH] [NVPTX] Unify vectorization of load/stores of aggregate arguments and return values. Original code only used vector loads/stores for explicit vector arguments. It could also do more loads/stores than necessary (e.g v5f32 would touch 8 f32 values). Aggregate types were loaded one element at a time, even the vectors contained within. This change attempts to generalize (and simplify) parameter space loads/stores so that vector loads/stores can be used more broadly. Functionality of the patch has been verified by compiling thrust test suite and manually checking the differences between PTX generated by llvm with and without the patch. General algorithm: * ComputePTXValueVTs() flattens input/output argument into a flat list of scalars to load/store and returns their types and offsets. * VectorizePTXValueVTs() uses that data to create vectorization plan which returns an array of flags marking boundaries of vectorized load/stores. Scalars are represented as 1-element vectors. * Code that generates loads/stores implements a simple state machine that constructs a vector according to the plan. Differential Revision: https://reviews.llvm.org/D30011 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@295784 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/NVPTX/NVPTXISelLowering.cpp | 1148 +++++++++-------------- test/CodeGen/NVPTX/aggregate-return.ll | 35 +- test/CodeGen/NVPTX/f16-instructions.ll | 2 +- test/CodeGen/NVPTX/ldparam-v4.ll | 5 +- test/CodeGen/NVPTX/lower-aggr-copies.ll | 27 +- test/CodeGen/NVPTX/param-load-store.ll | 813 ++++++++++++++++ test/CodeGen/NVPTX/vec-param-load.ll | 83 +- test/CodeGen/NVPTX/vec8.ll | 13 +- test/CodeGen/NVPTX/vector-call.ll | 22 +- 9 files changed, 1393 insertions(+), 755 deletions(-) create mode 100644 test/CodeGen/NVPTX/param-load-store.ll diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp index 9584776e185..d18c7debba5 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -184,6 +184,125 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, } } +// Check whether we can merge loads/stores of some of the pieces of a +// flattened function parameter or return value into a single vector +// load/store. +// +// The flattened parameter is represented as a list of EVTs and +// offsets, and the whole structure is aligned to ParamAlignment. This +// function determines whether we can load/store pieces of the +// parameter starting at index Idx using a single vectorized op of +// size AccessSize. If so, it returns the number of param pieces +// covered by the vector op. Otherwise, it returns 1. +static unsigned CanMergeParamLoadStoresStartingAt( + unsigned Idx, uint32_t AccessSize, const SmallVectorImpl &ValueVTs, + const SmallVectorImpl &Offsets, unsigned ParamAlignment) { + assert(isPowerOf2_32(AccessSize) && "must be a power of 2!"); + + // Can't vectorize if param alignment is not sufficient. + if (AccessSize > ParamAlignment) + return 1; + // Can't vectorize if offset is not aligned. + if (Offsets[Idx] & (AccessSize - 1)) + return 1; + + EVT EltVT = ValueVTs[Idx]; + unsigned EltSize = EltVT.getStoreSize(); + + // Element is too large to vectorize. + if (EltSize >= AccessSize) + return 1; + + unsigned NumElts = AccessSize / EltSize; + // Can't vectorize if AccessBytes if not a multiple of EltSize. 
+ if (AccessSize != EltSize * NumElts) + return 1; + + // We don't have enough elements to vectorize. + if (Idx + NumElts > ValueVTs.size()) + return 1; + + // PTX ISA can only deal with 2- and 4-element vector ops. + if (NumElts != 4 && NumElts != 2) + return 1; + + for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) { + // Types do not match. + if (ValueVTs[j] != EltVT) + return 1; + + // Elements are not contiguous. + if (Offsets[j] - Offsets[j - 1] != EltSize) + return 1; + } + // OK. We can vectorize ValueVTs[i..i+NumElts) + return NumElts; +} + +// Flags for tracking per-element vectorization state of loads/stores +// of a flattened function parameter or return value. +enum ParamVectorizationFlags { + PVF_INNER = 0x0, // Middle elements of a vector. + PVF_FIRST = 0x1, // First element of the vector. + PVF_LAST = 0x2, // Last element of the vector. + // Scalar is effectively a 1-element vector. + PVF_SCALAR = PVF_FIRST | PVF_LAST +}; + +// Computes whether and how we can vectorize the loads/stores of a +// flattened function parameter or return value. +// +// The flattened parameter is represented as the list of ValueVTs and +// Offsets, and is aligned to ParamAlignment bytes. We return a vector +// of the same size as ValueVTs indicating how each piece should be +// loaded/stored (i.e. as a scalar, or as part of a vector +// load/store). +static SmallVector +VectorizePTXValueVTs(const SmallVectorImpl &ValueVTs, + const SmallVectorImpl &Offsets, + unsigned ParamAlignment) { + // Set vector size to match ValueVTs and mark all elements as + // scalars by default. + SmallVector VectorInfo; + VectorInfo.assign(ValueVTs.size(), PVF_SCALAR); + + // Check what we can vectorize using 128/64/32-bit accesses. + for (int I = 0, E = ValueVTs.size(); I != E; ++I) { + // Skip elements we've already processed. + assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state."); + for (unsigned AccessSize : {16, 8, 4, 2}) { + unsigned NumElts = CanMergeParamLoadStoresStartingAt( + I, AccessSize, ValueVTs, Offsets, ParamAlignment); + // Mark vectorized elements. + switch (NumElts) { + default: + llvm_unreachable("Unexpected return value"); + case 1: + // Can't vectorize using this size, try next smaller size. + continue; + case 2: + assert(I + 1 < E && "Not enough elements."); + VectorInfo[I] = PVF_FIRST; + VectorInfo[I + 1] = PVF_LAST; + I += 1; + break; + case 4: + assert(I + 3 < E && "Not enough elements."); + VectorInfo[I] = PVF_FIRST; + VectorInfo[I + 1] = PVF_INNER; + VectorInfo[I + 2] = PVF_INNER; + VectorInfo[I + 3] = PVF_LAST; + I += 3; + break; + } + // Break out of the inner loop because we've already succeeded + // using largest possible AccessSize. + break; + } + } + return VectorInfo; +} + // NVPTXTargetLowering Constructor. 
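For illustration only (this sketch is not part of the patch; Piece, canMergeAt, plan, and the plain byte-size representation are invented for the example, while the PVF_* flag values and the 16/8/4/2-byte access sizes mirror the code above), the planning step can be modeled without EVTs. Under an assumed 16-byte parameter alignment, a flattened <5 x float> is expected to get one 4-element access plus one scalar access:

    // Standalone sketch of the vectorization planning (illustrative names, not LLVM API).
    #include <cstdint>
    #include <cstdio>
    #include <initializer_list>
    #include <vector>

    enum Flag { PVF_INNER = 0x0, PVF_FIRST = 0x1, PVF_LAST = 0x2,
                PVF_SCALAR = PVF_FIRST | PVF_LAST };

    // One flattened scalar piece of a parameter: its store size and byte offset.
    struct Piece { unsigned Size; uint64_t Offset; };

    // How many pieces starting at Idx fit into one access of AccessSize bytes
    // (1 means "no vector access"), mirroring CanMergeParamLoadStoresStartingAt.
    static unsigned canMergeAt(unsigned Idx, unsigned AccessSize,
                               const std::vector<Piece> &P, unsigned Align) {
      if (AccessSize > Align || (P[Idx].Offset & (AccessSize - 1)))
        return 1;                                 // not sufficiently aligned
      unsigned EltSize = P[Idx].Size;
      if (EltSize >= AccessSize)
        return 1;                                 // element already that large
      unsigned NumElts = AccessSize / EltSize;
      if (AccessSize != EltSize * NumElts || Idx + NumElts > P.size())
        return 1;                                 // ragged size or not enough pieces
      if (NumElts != 2 && NumElts != 4)
        return 1;                                 // PTX only has v2/v4 accesses
      for (unsigned J = Idx + 1; J < Idx + NumElts; ++J)
        if (P[J].Size != EltSize || P[J].Offset - P[J - 1].Offset != EltSize)
          return 1;                               // type mismatch or a gap
      return NumElts;
    }

    // Mirror of VectorizePTXValueVTs: every piece starts as a scalar and is
    // promoted to FIRST/INNER/LAST when a 16/8/4/2-byte access covers it.
    static std::vector<Flag> plan(const std::vector<Piece> &P, unsigned Align) {
      std::vector<Flag> Info(P.size(), PVF_SCALAR);
      for (unsigned I = 0; I < P.size(); ++I)
        for (unsigned AccessSize : {16u, 8u, 4u, 2u}) {
          unsigned N = canMergeAt(I, AccessSize, P, Align);
          if (N == 1)
            continue;                             // try the next smaller size
          Info[I] = PVF_FIRST;
          Info[I + N - 1] = PVF_LAST;
          for (unsigned J = I + 1; J < I + N - 1; ++J)
            Info[J] = PVF_INNER;
          I += N - 1;
          break;
        }
      return Info;
    }

    int main() {
      // <5 x float> flattened to five f32 pieces, assuming 16-byte alignment.
      std::vector<Piece> V5F32 = {{4, 0}, {4, 4}, {4, 8}, {4, 12}, {4, 16}};
      for (Flag F : plan(V5F32, 16))
        std::printf("%d ", F);
      std::printf("\n");                          // prints: 1 0 0 2 3
      return 0;
    }

Compiled with any C++11 compiler, the sketch prints "1 0 0 2 3", i.e. {PVF_FIRST, PVF_INNER, PVF_INNER, PVF_LAST, PVF_SCALAR}: one v4 load/store for the first four pieces plus one scalar access for the fifth, which is the plan the real VectorizePTXValueVTs is expected to produce for this shape.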
NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, const NVPTXSubtarget &STI) @@ -1276,21 +1395,18 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SDValue Callee = CLI.Callee; bool &isTailCall = CLI.IsTailCall; ArgListTy &Args = CLI.getArgs(); - Type *retTy = CLI.RetTy; + Type *RetTy = CLI.RetTy; ImmutableCallSite *CS = CLI.CS; + const DataLayout &DL = DAG.getDataLayout(); bool isABI = (STI.getSmVersion() >= 20); assert(isABI && "Non-ABI compilation is not supported"); if (!isABI) return Chain; - MachineFunction &MF = DAG.getMachineFunction(); - const Function *F = MF.getFunction(); - auto &DL = MF.getDataLayout(); SDValue tempChain = Chain; - Chain = DAG.getCALLSEQ_START(Chain, - DAG.getIntPtrConstant(uniqueCallSite, dl, true), - dl); + Chain = DAG.getCALLSEQ_START( + Chain, DAG.getIntPtrConstant(uniqueCallSite, dl, true), dl); SDValue InFlag = Chain.getValue(1); unsigned paramCount = 0; @@ -1311,244 +1427,124 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Type *Ty = Args[i].Ty; if (!Outs[OIdx].Flags.isByVal()) { - if (Ty->isAggregateType()) { - // aggregate - SmallVector vtparts; - SmallVector Offsets; - ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts, &Offsets, - 0); - - unsigned align = - getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL); - // declare .param .align .b8 .param[]; - unsigned sz = DL.getTypeAllocSize(Ty); - SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue DeclareParamOps[] = { Chain, DAG.getConstant(align, dl, - MVT::i32), - DAG.getConstant(paramCount, dl, MVT::i32), - DAG.getConstant(sz, dl, MVT::i32), - InFlag }; - Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, - DeclareParamOps); - InFlag = Chain.getValue(1); - for (unsigned j = 0, je = vtparts.size(); j != je; ++j) { - EVT elemtype = vtparts[j]; - unsigned ArgAlign = GreatestCommonDivisor64(align, Offsets[j]); - if (elemtype.isInteger() && (sz < 8)) - sz = 8; - SDValue StVal = OutVals[OIdx]; - if (elemtype.getSizeInBits() < 16) { - StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); - } - SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue CopyParamOps[] = { Chain, - DAG.getConstant(paramCount, dl, MVT::i32), - DAG.getConstant(Offsets[j], dl, MVT::i32), - StVal, InFlag }; - Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, - CopyParamVTs, CopyParamOps, - elemtype, MachinePointerInfo(), - ArgAlign); - InFlag = Chain.getValue(1); - ++OIdx; - } - if (vtparts.size() > 0) - --OIdx; - ++paramCount; - continue; - } - if (Ty->isVectorTy()) { - EVT ObjectVT = getValueType(DL, Ty); - unsigned align = - getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL); - // declare .param .align .b8 .param[]; - unsigned sz = DL.getTypeAllocSize(Ty); - SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue DeclareParamOps[] = { Chain, - DAG.getConstant(align, dl, MVT::i32), - DAG.getConstant(paramCount, dl, MVT::i32), - DAG.getConstant(sz, dl, MVT::i32), - InFlag }; - Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, - DeclareParamOps); - InFlag = Chain.getValue(1); - unsigned NumElts = ObjectVT.getVectorNumElements(); - EVT EltVT = ObjectVT.getVectorElementType(); - EVT MemVT = EltVT; - bool NeedExtend = false; - if (EltVT.getSizeInBits() < 16) { - NeedExtend = true; - EltVT = MVT::i16; - } - - // V1 store - if (NumElts == 1) { - SDValue Elt = OutVals[OIdx++]; - if (NeedExtend) - Elt = DAG.getNode(ISD::ZERO_EXTEND, dl, 
MVT::i16, Elt); - - SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue CopyParamOps[] = { Chain, - DAG.getConstant(paramCount, dl, MVT::i32), - DAG.getConstant(0, dl, MVT::i32), Elt, - InFlag }; - Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, - CopyParamVTs, CopyParamOps, - MemVT, MachinePointerInfo()); - InFlag = Chain.getValue(1); - } else if (NumElts == 2) { - SDValue Elt0 = OutVals[OIdx++]; - SDValue Elt1 = OutVals[OIdx++]; - if (NeedExtend) { - Elt0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt0); - Elt1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt1); - } - - SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue CopyParamOps[] = { Chain, - DAG.getConstant(paramCount, dl, MVT::i32), - DAG.getConstant(0, dl, MVT::i32), Elt0, - Elt1, InFlag }; - Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParamV2, dl, - CopyParamVTs, CopyParamOps, - MemVT, MachinePointerInfo()); - InFlag = Chain.getValue(1); - } else { - unsigned curOffset = 0; - // V4 stores - // We have at least 4 elements (<3 x Ty> expands to 4 elements) and - // the - // vector will be expanded to a power of 2 elements, so we know we can - // always round up to the next multiple of 4 when creating the vector - // stores. - // e.g. 4 elem => 1 st.v4 - // 6 elem => 2 st.v4 - // 8 elem => 2 st.v4 - // 11 elem => 3 st.v4 - unsigned VecSize = 4; - if (EltVT.getSizeInBits() == 64) - VecSize = 2; - - // This is potentially only part of a vector, so assume all elements - // are packed together. - unsigned PerStoreOffset = MemVT.getStoreSizeInBits() / 8 * VecSize; - - for (unsigned i = 0; i < NumElts; i += VecSize) { - // Get values - SDValue StoreVal; - SmallVector Ops; - Ops.push_back(Chain); - Ops.push_back(DAG.getConstant(paramCount, dl, MVT::i32)); - Ops.push_back(DAG.getConstant(curOffset, dl, MVT::i32)); - - unsigned Opc = NVPTXISD::StoreParamV2; - - StoreVal = OutVals[OIdx++]; - if (NeedExtend) - StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal); - Ops.push_back(StoreVal); - - if (i + 1 < NumElts) { - StoreVal = OutVals[OIdx++]; - if (NeedExtend) - StoreVal = - DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal); - } else { - StoreVal = DAG.getUNDEF(EltVT); - } - Ops.push_back(StoreVal); - - if (VecSize == 4) { - Opc = NVPTXISD::StoreParamV4; - if (i + 2 < NumElts) { - StoreVal = OutVals[OIdx++]; - if (NeedExtend) - StoreVal = - DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal); - } else { - StoreVal = DAG.getUNDEF(EltVT); - } - Ops.push_back(StoreVal); - - if (i + 3 < NumElts) { - StoreVal = OutVals[OIdx++]; - if (NeedExtend) - StoreVal = - DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal); - } else { - StoreVal = DAG.getUNDEF(EltVT); - } - Ops.push_back(StoreVal); - } - - Ops.push_back(InFlag); - - SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - Chain = DAG.getMemIntrinsicNode(Opc, dl, CopyParamVTs, Ops, - MemVT, MachinePointerInfo()); - InFlag = Chain.getValue(1); - curOffset += PerStoreOffset; - } - } - ++paramCount; - --OIdx; - continue; - } - // Plain scalar - // for ABI, declare .param .b .param; - unsigned sz = VT.getSizeInBits(); - bool needExtend = false; - if (VT.isInteger()) { - if (sz < 16) - needExtend = true; - if (sz < 32) - sz = 32; - } else if (VT.isFloatingPoint() && sz < 32) - // PTX ABI requires all scalar parameters to be at least 32 - // bits in size. fp16 normally uses .b16 as its storage type - // in PTX, so its size must be adjusted here, too. 
- sz = 32; + SmallVector VTs; + SmallVector Offsets; + ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets); + unsigned ArgAlign = + getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL); + unsigned AllocSize = DL.getTypeAllocSize(Ty); SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue DeclareParamOps[] = { Chain, - DAG.getConstant(paramCount, dl, MVT::i32), - DAG.getConstant(sz, dl, MVT::i32), - DAG.getConstant(0, dl, MVT::i32), InFlag }; - Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs, - DeclareParamOps); - InFlag = Chain.getValue(1); - SDValue OutV = OutVals[OIdx]; - if (needExtend) { - // zext/sext i1 to i16 - unsigned opc = ISD::ZERO_EXTEND; - if (Outs[OIdx].Flags.isSExt()) - opc = ISD::SIGN_EXTEND; - OutV = DAG.getNode(opc, dl, MVT::i16, OutV); + bool NeedAlign; // Does argument declaration specify alignment? + if (Ty->isAggregateType() || Ty->isVectorTy()) { + // declare .param .align .b8 .param[]; + SDValue DeclareParamOps[] = { + Chain, DAG.getConstant(ArgAlign, dl, MVT::i32), + DAG.getConstant(paramCount, dl, MVT::i32), + DAG.getConstant(AllocSize, dl, MVT::i32), InFlag}; + Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, + DeclareParamOps); + NeedAlign = true; + } else { + // declare .param .b .param; + if ((VT.isInteger() || VT.isFloatingPoint()) && AllocSize < 4) { + // PTX ABI requires integral types to be at least 32 bits in + // size. FP16 is loaded/stored using i16, so it's handled + // here as well. + AllocSize = 4; + } + SDValue DeclareScalarParamOps[] = { + Chain, DAG.getConstant(paramCount, dl, MVT::i32), + DAG.getConstant(AllocSize * 8, dl, MVT::i32), + DAG.getConstant(0, dl, MVT::i32), InFlag}; + Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs, + DeclareScalarParamOps); + NeedAlign = false; } - SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue CopyParamOps[] = { Chain, - DAG.getConstant(paramCount, dl, MVT::i32), - DAG.getConstant(0, dl, MVT::i32), OutV, - InFlag }; - - unsigned opcode = NVPTXISD::StoreParam; - if (Outs[OIdx].Flags.isZExt() && VT.getSizeInBits() < 32) - opcode = NVPTXISD::StoreParamU32; - else if (Outs[OIdx].Flags.isSExt() && VT.getSizeInBits() < 32) - opcode = NVPTXISD::StoreParamS32; - Chain = DAG.getMemIntrinsicNode(opcode, dl, CopyParamVTs, CopyParamOps, - VT, MachinePointerInfo()); - InFlag = Chain.getValue(1); + + // PTX Interoperability Guide 3.3(A): [Integer] Values shorter + // than 32-bits are sign extended or zero extended, depending on + // whether they are signed or unsigned types. This case applies + // only to scalar parameters and not to aggregate values. + bool ExtendIntegerParam = + Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32; + + auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign); + SmallVector StoreOperands; + for (unsigned j = 0, je = VTs.size(); j != je; ++j) { + // New store. + if (VectorInfo[j] & PVF_FIRST) { + assert(StoreOperands.empty() && "Unfinished preceeding store."); + StoreOperands.push_back(Chain); + StoreOperands.push_back(DAG.getConstant(paramCount, dl, MVT::i32)); + StoreOperands.push_back(DAG.getConstant(Offsets[j], dl, MVT::i32)); + } + + EVT EltVT = VTs[j]; + SDValue StVal = OutVals[OIdx]; + if (ExtendIntegerParam) { + assert(VTs.size() == 1 && "Scalar can't have multiple parts."); + // zext/sext to i32 + StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? 
ISD::SIGN_EXTEND + : ISD::ZERO_EXTEND, + dl, MVT::i32, StVal); + } else if (EltVT.getSizeInBits() < 16) { + // Use 16-bit registers for small stores as it's the + // smallest general purpose register size supported by NVPTX. + StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); + } + + // Record the value to store. + StoreOperands.push_back(StVal); + + if (VectorInfo[j] & PVF_LAST) { + unsigned NumElts = StoreOperands.size() - 3; + NVPTXISD::NodeType Op; + switch (NumElts) { + case 1: + Op = NVPTXISD::StoreParam; + break; + case 2: + Op = NVPTXISD::StoreParamV2; + break; + case 4: + Op = NVPTXISD::StoreParamV4; + break; + default: + llvm_unreachable("Invalid vector info."); + } + + StoreOperands.push_back(InFlag); + + // Adjust type of the store op if we've extended the scalar + // return value. + EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : VTs[j]; + unsigned EltAlign = + NeedAlign ? GreatestCommonDivisor64(ArgAlign, Offsets[j]) : 0; + + Chain = DAG.getMemIntrinsicNode( + Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands, + TheStoreType, MachinePointerInfo(), EltAlign); + InFlag = Chain.getValue(1); + + // Cleanup. + StoreOperands.clear(); + } + ++OIdx; + } + assert(StoreOperands.empty() && "Unfinished paramter store."); + if (VTs.size() > 0) + --OIdx; ++paramCount; continue; } - // struct or vector - SmallVector vtparts; + + // ByVal arguments + SmallVector VTs; SmallVector Offsets; auto *PTy = dyn_cast(Args[i].Ty); assert(PTy && "Type of a byval parameter should be pointer"); - ComputePTXValueVTs(*this, DAG.getDataLayout(), PTy->getElementType(), - vtparts, &Offsets, 0); + ComputePTXValueVTs(*this, DL, PTy->getElementType(), VTs, &Offsets, 0); // declare .param .align .b8 .param[]; unsigned sz = Outs[OIdx].Flags.getByValSize(); @@ -1569,11 +1565,11 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, DeclareParamOps); InFlag = Chain.getValue(1); - for (unsigned j = 0, je = vtparts.size(); j != je; ++j) { - EVT elemtype = vtparts[j]; + for (unsigned j = 0, je = VTs.size(); j != je; ++j) { + EVT elemtype = VTs[j]; int curOffset = Offsets[j]; unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset); - auto PtrVT = getPointerTy(DAG.getDataLayout()); + auto PtrVT = getPointerTy(DL); SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx], DAG.getConstant(curOffset, dl, PtrVT)); SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr, @@ -1601,18 +1597,18 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Handle Result if (Ins.size() > 0) { SmallVector resvtparts; - ComputeValueVTs(*this, DL, retTy, resvtparts); + ComputeValueVTs(*this, DL, RetTy, resvtparts); // Declare // .param .align 16 .b8 retval0[], or // .param .b retval0 - unsigned resultsz = DL.getTypeAllocSizeInBits(retTy); + unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy); // Emit ".param .b retval0" instead of byte arrays only for // these three types to match the logic in // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype. // Plus, this behavior is consistent with nvcc's. 
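As an editorial illustration (not part of the patch), here is how the StoreOperands state machine above is expected to walk the plan for an argument flattened to four f32 pieces at offsets 0/4/8/12; the operand layout (Chain, param index, offset, values, InFlag) and the size()-3 opcode selection come from the code above, while the concrete shape is assumed:

    // j = 0: PVF_FIRST -> StoreOperands = {Chain, paramCount, Offsets[0]}, then push f32 #0
    // j = 1: PVF_INNER -> push f32 #1
    // j = 2: PVF_INNER -> push f32 #2
    // j = 3: PVF_LAST  -> push f32 #3, NumElts = StoreOperands.size() - 3 = 4,
    //                     push InFlag, emit NVPTXISD::StoreParamV4 (st.param.v4.f32),
    //                     then clear StoreOperands before the next group starts.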
- if (retTy->isFloatingPointTy() || retTy->isIntegerTy() || - retTy->isPointerTy()) { + if (RetTy->isFloatingPointTy() || RetTy->isIntegerTy() || + RetTy->isPointerTy()) { // Scalar needs to be at least 32bit wide if (resultsz < 32) resultsz = 32; @@ -1624,7 +1620,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DeclareRetOps); InFlag = Chain.getValue(1); } else { - retAlignment = getArgumentAlignment(Callee, CS, retTy, 0, DL); + retAlignment = getArgumentAlignment(Callee, CS, RetTy, 0, DL); SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue DeclareRetOps[] = { Chain, DAG.getConstant(retAlignment, dl, MVT::i32), @@ -1645,8 +1641,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // The prototype is embedded in a string and put as the operand for a // CallPrototype SDNode which will print out to the value of the string. SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue); - std::string Proto = - getPrototype(DAG.getDataLayout(), retTy, Args, Outs, retAlignment, CS); + std::string Proto = getPrototype(DL, RetTy, Args, Outs, retAlignment, CS); const char *ProtoStr = nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str(); SDValue ProtoOps[] = { @@ -1711,175 +1706,84 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Generate loads from param memory/moves from registers for result if (Ins.size() > 0) { - if (retTy && retTy->isVectorTy()) { - EVT ObjectVT = getValueType(DL, retTy); - unsigned NumElts = ObjectVT.getVectorNumElements(); - EVT EltVT = ObjectVT.getVectorElementType(); - assert(STI.getTargetLowering()->getNumRegisters(F->getContext(), - ObjectVT) == NumElts && - "Vector was not scalarized"); - unsigned sz = EltVT.getSizeInBits(); - bool needTruncate = sz < 8; + SmallVector VTs; + SmallVector Offsets; + ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0); + assert(VTs.size() == Ins.size() && "Bad value decomposition"); - if (NumElts == 1) { - // Just a simple load - SmallVector LoadRetVTs; - if (EltVT == MVT::i1 || EltVT == MVT::i8) { - // If loading i1/i8 result, generate - // load.b8 i16 - // if i1 - // trunc i16 to i1 - LoadRetVTs.push_back(MVT::i16); - } else - LoadRetVTs.push_back(EltVT); - LoadRetVTs.push_back(MVT::Other); - LoadRetVTs.push_back(MVT::Glue); - SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32), - DAG.getConstant(0, dl, MVT::i32), InFlag}; - SDValue retval = DAG.getMemIntrinsicNode( - NVPTXISD::LoadParam, dl, - DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo()); - Chain = retval.getValue(1); - InFlag = retval.getValue(2); - SDValue Ret0 = retval; - if (needTruncate) - Ret0 = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Ret0); - InVals.push_back(Ret0); - } else if (NumElts == 2) { - // LoadV2 - SmallVector LoadRetVTs; - if (EltVT == MVT::i1 || EltVT == MVT::i8) { - // If loading i1/i8 result, generate - // load.b8 i16 - // if i1 - // trunc i16 to i1 - LoadRetVTs.push_back(MVT::i16); - LoadRetVTs.push_back(MVT::i16); - } else { - LoadRetVTs.push_back(EltVT); - LoadRetVTs.push_back(EltVT); - } - LoadRetVTs.push_back(MVT::Other); - LoadRetVTs.push_back(MVT::Glue); - SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32), - DAG.getConstant(0, dl, MVT::i32), InFlag}; - SDValue retval = DAG.getMemIntrinsicNode( - NVPTXISD::LoadParamV2, dl, - DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo()); - Chain = retval.getValue(2); - InFlag = retval.getValue(3); - SDValue Ret0 = 
retval.getValue(0); - SDValue Ret1 = retval.getValue(1); - if (needTruncate) { - Ret0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ret0); - InVals.push_back(Ret0); - Ret1 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ret1); - InVals.push_back(Ret1); - } else { - InVals.push_back(Ret0); - InVals.push_back(Ret1); - } - } else { - // Split into N LoadV4 - unsigned Ofst = 0; - unsigned VecSize = 4; - unsigned Opc = NVPTXISD::LoadParamV4; - if (EltVT.getSizeInBits() == 64) { - VecSize = 2; - Opc = NVPTXISD::LoadParamV2; - } - EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize); - for (unsigned i = 0; i < NumElts; i += VecSize) { - SmallVector LoadRetVTs; - if (EltVT == MVT::i1 || EltVT == MVT::i8) { - // If loading i1/i8 result, generate - // load.b8 i16 - // if i1 - // trunc i16 to i1 - for (unsigned j = 0; j < VecSize; ++j) - LoadRetVTs.push_back(MVT::i16); - } else { - for (unsigned j = 0; j < VecSize; ++j) - LoadRetVTs.push_back(EltVT); - } - LoadRetVTs.push_back(MVT::Other); - LoadRetVTs.push_back(MVT::Glue); - SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32), - DAG.getConstant(Ofst, dl, MVT::i32), InFlag}; - SDValue retval = DAG.getMemIntrinsicNode( - Opc, dl, DAG.getVTList(LoadRetVTs), - LoadRetOps, EltVT, MachinePointerInfo()); - if (VecSize == 2) { - Chain = retval.getValue(2); - InFlag = retval.getValue(3); - } else { - Chain = retval.getValue(4); - InFlag = retval.getValue(5); - } + unsigned RetAlign = getArgumentAlignment(Callee, CS, RetTy, 0, DL); + auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign); - for (unsigned j = 0; j < VecSize; ++j) { - if (i + j >= NumElts) - break; - SDValue Elt = retval.getValue(j); - if (needTruncate) - Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); - InVals.push_back(Elt); - } - Ofst += DL.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext())); - } + SmallVector LoadVTs; + int VecIdx = -1; // Index of the first element of the vector. + + // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than + // 32-bits are sign extended or zero extended, depending on whether + // they are signed or unsigned types. + bool ExtendIntegerRetVal = + RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; + + for (unsigned i = 0, e = Ins.size(); i != e; ++i) { + bool needTruncate = false; + EVT TheLoadType = VTs[i]; + EVT EltType = Ins[i].VT; + unsigned EltAlign = GreatestCommonDivisor64(RetAlign, Offsets[i]); + if (ExtendIntegerRetVal) { + TheLoadType = MVT::i32; + EltType = MVT::i32; + needTruncate = true; + } else if (TheLoadType.getSizeInBits() < 16) { + if (VTs[i].isInteger()) + needTruncate = true; + EltType = MVT::i16; } - } else { - SmallVector VTs; - SmallVector Offsets; - auto &DL = DAG.getDataLayout(); - ComputePTXValueVTs(*this, DL, retTy, VTs, &Offsets, 0); - assert(VTs.size() == Ins.size() && "Bad value decomposition"); - unsigned RetAlign = getArgumentAlignment(Callee, CS, retTy, 0, DL); - for (unsigned i = 0, e = Ins.size(); i != e; ++i) { - unsigned sz = VTs[i].getSizeInBits(); - unsigned AlignI = GreatestCommonDivisor64(RetAlign, Offsets[i]); - bool needTruncate = false; - if (VTs[i].isInteger() && sz < 8) { - sz = 8; - needTruncate = true; + + // Record index of the very first element of the vector. 
+ if (VectorInfo[i] & PVF_FIRST) { + assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list."); + VecIdx = i; + } + + LoadVTs.push_back(EltType); + + if (VectorInfo[i] & PVF_LAST) { + unsigned NumElts = LoadVTs.size(); + LoadVTs.push_back(MVT::Other); + LoadVTs.push_back(MVT::Glue); + NVPTXISD::NodeType Op; + switch (NumElts) { + case 1: + Op = NVPTXISD::LoadParam; + break; + case 2: + Op = NVPTXISD::LoadParamV2; + break; + case 4: + Op = NVPTXISD::LoadParamV4; + break; + default: + llvm_unreachable("Invalid vector info."); } - SmallVector LoadRetVTs; - EVT TheLoadType = VTs[i]; - if (retTy->isIntegerTy() && DL.getTypeAllocSizeInBits(retTy) < 32) { - // This is for integer types only, and specifically not for - // aggregates. - LoadRetVTs.push_back(MVT::i32); - TheLoadType = MVT::i32; - needTruncate = true; - } else if (sz < 16) { - // If loading i1/i8 result, generate - // load i8 (-> i16) - // trunc i16 to i1/i8 + SDValue VectorOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32), + DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), + InFlag}; + SDValue RetVal = DAG.getMemIntrinsicNode( + Op, dl, DAG.getVTList(LoadVTs), VectorOps, TheLoadType, + MachinePointerInfo(), EltAlign); - // FIXME: Do we need to set needTruncate to true here, too? We could - // not figure out what this branch is for in D17872, so we left it - // alone. The comment above about loading i1/i8 may be wrong, as the - // branch above seems to cover integers of size < 32. - LoadRetVTs.push_back(MVT::i16); - } else - LoadRetVTs.push_back(Ins[i].VT); - LoadRetVTs.push_back(MVT::Other); - LoadRetVTs.push_back(MVT::Glue); + for (unsigned j = 0; j < NumElts; ++j) { + SDValue Ret = RetVal.getValue(j); + if (needTruncate) + Ret = DAG.getNode(ISD::TRUNCATE, dl, Ins[VecIdx + j].VT, Ret); + InVals.push_back(Ret); + } + Chain = RetVal.getValue(NumElts); + InFlag = RetVal.getValue(NumElts + 1); - SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32), - DAG.getConstant(Offsets[i], dl, MVT::i32), - InFlag}; - SDValue retval = DAG.getMemIntrinsicNode( - NVPTXISD::LoadParam, dl, - DAG.getVTList(LoadRetVTs), LoadRetOps, - TheLoadType, MachinePointerInfo(), AlignI); - Chain = retval.getValue(1); - InFlag = retval.getValue(2); - SDValue Ret0 = retval.getValue(0); - if (needTruncate) - Ret0 = DAG.getNode(ISD::TRUNCATE, dl, Ins[i].VT, Ret0); - InVals.push_back(Ret0); + // Cleanup + VecIdx = -1; + LoadVTs.clear(); } } } @@ -2371,176 +2275,66 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( // appear in the same order as their order of appearance // in the original function. "idx+1" holds that order. if (!PAL.hasAttribute(i + 1, Attribute::ByVal)) { - if (Ty->isAggregateType()) { - SmallVector vtparts; - SmallVector offsets; + bool aggregateIsPacked = false; + if (StructType *STy = dyn_cast(Ty)) + aggregateIsPacked = STy->isPacked(); - // NOTE: Here, we lose the ability to issue vector loads for vectors - // that are a part of a struct. This should be investigated in the - // future. 
- ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts, &offsets, - 0); - assert(vtparts.size() > 0 && "empty aggregate type not expected"); - bool aggregateIsPacked = false; - if (StructType *STy = dyn_cast(Ty)) - aggregateIsPacked = STy->isPacked(); + SmallVector VTs; + SmallVector Offsets; + ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0); + assert(VTs.size() > 0 && "empty aggregate type not expected"); + auto VectorInfo = + VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlignment(Ty)); - SDValue Arg = getParamSymbol(DAG, idx, PtrVT); - for (unsigned parti = 0, parte = vtparts.size(); parti != parte; - ++parti) { - EVT partVT = vtparts[parti]; - Value *srcValue = Constant::getNullValue( - PointerType::get(partVT.getTypeForEVT(F->getContext()), - ADDRESS_SPACE_PARAM)); - SDValue srcAddr = - DAG.getNode(ISD::ADD, dl, PtrVT, Arg, - DAG.getConstant(offsets[parti], dl, PtrVT)); - unsigned partAlign = aggregateIsPacked - ? 1 - : DL.getABITypeAlignment( - partVT.getTypeForEVT(F->getContext())); - SDValue p; - if (Ins[InsIdx].VT.getSizeInBits() > partVT.getSizeInBits()) { - ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ? - ISD::SEXTLOAD : ISD::ZEXTLOAD; - p = DAG.getExtLoad(ExtOp, dl, Ins[InsIdx].VT, Root, srcAddr, - MachinePointerInfo(srcValue), partVT, partAlign); - } else { - p = DAG.getLoad(partVT, dl, Root, srcAddr, - MachinePointerInfo(srcValue), partAlign); - } - if (p.getNode()) - p.getNode()->setIROrder(idx + 1); - InVals.push_back(p); - ++InsIdx; - } - if (vtparts.size() > 0) - --InsIdx; - continue; - } - if (Ty->isVectorTy()) { - EVT ObjectVT = getValueType(DL, Ty); - SDValue Arg = getParamSymbol(DAG, idx, PtrVT); - unsigned NumElts = ObjectVT.getVectorNumElements(); - assert(TLI->getNumRegisters(F->getContext(), ObjectVT) == NumElts && - "Vector was not scalarized"); - EVT EltVT = ObjectVT.getVectorElementType(); - - // V1 load - // f32 = load ... - if (NumElts == 1) { - // We only have one element, so just directly load it - Value *SrcValue = Constant::getNullValue(PointerType::get( - EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); - SDValue P = DAG.getLoad( - EltVT, dl, Root, Arg, MachinePointerInfo(SrcValue), - DL.getABITypeAlignment(EltVT.getTypeForEVT(F->getContext())), - MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant); - if (P.getNode()) - P.getNode()->setIROrder(idx + 1); - - if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits()) - P = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, P); - InVals.push_back(P); - ++InsIdx; - } else if (NumElts == 2) { - // V2 load - // f32,f32 = load ... 
- EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, 2); - Value *SrcValue = Constant::getNullValue(PointerType::get( - VecVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); - SDValue P = DAG.getLoad( - VecVT, dl, Root, Arg, MachinePointerInfo(SrcValue), - DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())), - MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant); - if (P.getNode()) - P.getNode()->setIROrder(idx + 1); - - SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P, - DAG.getIntPtrConstant(0, dl)); - SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P, - DAG.getIntPtrConstant(1, dl)); - - if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits()) { - Elt0 = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt0); - Elt1 = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt1); - } - - InVals.push_back(Elt0); - InVals.push_back(Elt1); - InsIdx += 2; - } else { - // V4 loads - // We have at least 4 elements (<3 x Ty> expands to 4 elements) and - // the vector will be expanded to a power of 2 elements, so we know we - // can always round up to the next multiple of 4 when creating the - // vector loads. - // e.g. 4 elem => 1 ld.v4 - // 6 elem => 2 ld.v4 - // 8 elem => 2 ld.v4 - // 11 elem => 3 ld.v4 - unsigned VecSize = 4; - if (EltVT.getSizeInBits() == 64) { - VecSize = 2; - } - EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize); - unsigned Ofst = 0; - for (unsigned i = 0; i < NumElts; i += VecSize) { - Value *SrcValue = Constant::getNullValue( - PointerType::get(VecVT.getTypeForEVT(F->getContext()), - ADDRESS_SPACE_PARAM)); - SDValue SrcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, - DAG.getConstant(Ofst, dl, PtrVT)); - SDValue P = DAG.getLoad( - VecVT, dl, Root, SrcAddr, MachinePointerInfo(SrcValue), - DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())), - MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant); - if (P.getNode()) - P.getNode()->setIROrder(idx + 1); - - for (unsigned j = 0; j < VecSize; ++j) { - if (i + j >= NumElts) - break; - SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P, - DAG.getIntPtrConstant(j, dl)); - if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits()) - Elt = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt); - InVals.push_back(Elt); - } - Ofst += DL.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext())); - } - InsIdx += NumElts; - } - - if (NumElts > 0) - --InsIdx; - continue; - } - // A plain scalar. - EVT ObjectVT = getValueType(DL, Ty); - // If ABI, load from the param symbol SDValue Arg = getParamSymbol(DAG, idx, PtrVT); - Value *srcValue = Constant::getNullValue(PointerType::get( - ObjectVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); - SDValue p; - if (ObjectVT.getSizeInBits() < Ins[InsIdx].VT.getSizeInBits()) { - ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ? - ISD::SEXTLOAD : ISD::ZEXTLOAD; - p = DAG.getExtLoad( - ExtOp, dl, Ins[InsIdx].VT, Root, Arg, MachinePointerInfo(srcValue), - ObjectVT, - DL.getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext()))); - } else { - p = DAG.getLoad( - Ins[InsIdx].VT, dl, Root, Arg, MachinePointerInfo(srcValue), - DL.getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext()))); + int VecIdx = -1; // Index of the first element of the current vector. 
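      // Illustration (editorial note, not part of the patch): for a parameter
      // flattened to {f32, f32, f32, f32, f32} whose plan came back as
      // {PVF_FIRST, PVF_INNER, PVF_INNER, PVF_LAST, PVF_SCALAR}, the loop below
      // is expected to issue two loads:
      //   parti 0..3: VecIdx = 0, a 4-element load (ld.param.v4.f32) at Offsets[0],
      //               with EXTRACT_VECTOR_ELT peeling the four scalars back out;
      //   parti 4   : VecIdx = 4, a scalar load (ld.param.f32) at Offsets[4].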
+ for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) { + if (VectorInfo[parti] & PVF_FIRST) { + assert(VecIdx == -1 && "Orphaned vector."); + VecIdx = parti; + } + + // That's the last element of this store op. + if (VectorInfo[parti] & PVF_LAST) { + unsigned NumElts = parti - VecIdx + 1; + EVT EltVT = VTs[parti]; + // i1 is loaded/stored as i8. + EVT LoadVT = EltVT == MVT::i1 ? MVT::i8 : EltVT; + EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts); + SDValue VecAddr = + DAG.getNode(ISD::ADD, dl, PtrVT, Arg, + DAG.getConstant(Offsets[VecIdx], dl, PtrVT)); + Value *srcValue = Constant::getNullValue(PointerType::get( + EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); + SDValue P = + DAG.getLoad(VecVT, dl, Root, VecAddr, + MachinePointerInfo(srcValue), aggregateIsPacked, + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant); + if (P.getNode()) + P.getNode()->setIROrder(idx + 1); + for (unsigned j = 0; j < NumElts; ++j) { + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P, + DAG.getIntPtrConstant(j, dl)); + // We've loaded i1 as an i8 and now must truncate it back to i1 + if (EltVT == MVT::i1) + Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt); + // Extend the element if necesary (e.g an i8 is loaded + // into an i16 register) + if (Ins[InsIdx].VT.getSizeInBits() > LoadVT.getSizeInBits()) { + unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND + : ISD::ZERO_EXTEND; + Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt); + } + InVals.push_back(Elt); + } + // Reset vector tracking state. + VecIdx = -1; + } + ++InsIdx; } - if (p.getNode()) - p.getNode()->setIROrder(idx + 1); - InVals.push_back(p); + if (VTs.size() > 0) + --InsIdx; continue; } @@ -2582,165 +2376,81 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, const SmallVectorImpl &OutVals, const SDLoc &dl, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); - const Function *F = MF.getFunction(); - Type *RetTy = F->getReturnType(); - const DataLayout &TD = DAG.getDataLayout(); + Type *RetTy = MF.getFunction()->getReturnType(); bool isABI = (STI.getSmVersion() >= 20); assert(isABI && "Non-ABI compilation is not supported"); if (!isABI) return Chain; - if (VectorType *VTy = dyn_cast(RetTy)) { - // If we have a vector type, the OutVals array will be the scalarized - // components and we have combine them into 1 or more vector stores. - unsigned NumElts = VTy->getNumElements(); - assert(NumElts == Outs.size() && "Bad scalarization of return value"); + const DataLayout DL = DAG.getDataLayout(); + SmallVector VTs; + SmallVector Offsets; + ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets); + assert(VTs.size() == OutVals.size() && "Bad return value decomposition"); - // const_cast can be removed in later LLVM versions - EVT EltVT = getValueType(TD, RetTy).getVectorElementType(); - bool NeedExtend = false; - if (EltVT.getSizeInBits() < 16) - NeedExtend = true; + auto VectorInfo = VectorizePTXValueVTs( + VTs, Offsets, RetTy->isSized() ? 
DL.getABITypeAlignment(RetTy) : 1); - // V1 store - if (NumElts == 1) { - SDValue StoreVal = OutVals[0]; - // We only have one element, so just directly store it - if (NeedExtend) - StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal); - SDValue Ops[] = { Chain, DAG.getConstant(0, dl, MVT::i32), StoreVal }; - Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl, - DAG.getVTList(MVT::Other), Ops, - EltVT, MachinePointerInfo()); - } else if (NumElts == 2) { - // V2 store - SDValue StoreVal0 = OutVals[0]; - SDValue StoreVal1 = OutVals[1]; + // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than + // 32-bits are sign extended or zero extended, depending on whether + // they are signed or unsigned types. + bool ExtendIntegerRetVal = + RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; + bool aggregateIsPacked = false; - if (NeedExtend) { - StoreVal0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal0); - StoreVal1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal1); - } + if (StructType *STy = dyn_cast(RetTy)) + aggregateIsPacked = STy->isPacked(); - SDValue Ops[] = { Chain, DAG.getConstant(0, dl, MVT::i32), StoreVal0, - StoreVal1 }; - Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetvalV2, dl, - DAG.getVTList(MVT::Other), Ops, - EltVT, MachinePointerInfo()); - } else { - // V4 stores - // We have at least 4 elements (<3 x Ty> expands to 4 elements) and the - // vector will be expanded to a power of 2 elements, so we know we can - // always round up to the next multiple of 4 when creating the vector - // stores. - // e.g. 4 elem => 1 st.v4 - // 6 elem => 2 st.v4 - // 8 elem => 2 st.v4 - // 11 elem => 3 st.v4 - - unsigned VecSize = 4; - if (OutVals[0].getValueSizeInBits() == 64) - VecSize = 2; - - unsigned Offset = 0; - - EVT VecVT = - EVT::getVectorVT(F->getContext(), EltVT, VecSize); - unsigned PerStoreOffset = - TD.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext())); - - for (unsigned i = 0; i < NumElts; i += VecSize) { - // Get values - SDValue StoreVal; - SmallVector Ops; - Ops.push_back(Chain); - Ops.push_back(DAG.getConstant(Offset, dl, MVT::i32)); - unsigned Opc = NVPTXISD::StoreRetvalV2; - EVT ExtendedVT = (NeedExtend) ? MVT::i16 : OutVals[0].getValueType(); - - StoreVal = OutVals[i]; - if (NeedExtend) - StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal); - Ops.push_back(StoreVal); - - if (i + 1 < NumElts) { - StoreVal = OutVals[i + 1]; - if (NeedExtend) - StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal); - } else { - StoreVal = DAG.getUNDEF(ExtendedVT); - } - Ops.push_back(StoreVal); - - if (VecSize == 4) { - Opc = NVPTXISD::StoreRetvalV4; - if (i + 2 < NumElts) { - StoreVal = OutVals[i + 2]; - if (NeedExtend) - StoreVal = - DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal); - } else { - StoreVal = DAG.getUNDEF(ExtendedVT); - } - Ops.push_back(StoreVal); - - if (i + 3 < NumElts) { - StoreVal = OutVals[i + 3]; - if (NeedExtend) - StoreVal = - DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal); - } else { - StoreVal = DAG.getUNDEF(ExtendedVT); - } - Ops.push_back(StoreVal); - } - - // Chain = DAG.getNode(Opc, dl, MVT::Other, &Ops[0], Ops.size()); - Chain = - DAG.getMemIntrinsicNode(Opc, dl, DAG.getVTList(MVT::Other), Ops, - EltVT, MachinePointerInfo()); - Offset += PerStoreOffset; - } + SmallVector StoreOperands; + for (unsigned i = 0, e = Outs.size(); i != e; ++i) { + // New load/store. Record chain and offset operands. 
+ if (VectorInfo[i] & PVF_FIRST) { + assert(StoreOperands.empty() && "Orphaned operand list."); + StoreOperands.push_back(Chain); + StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32)); } - } else { - SmallVector ValVTs; - SmallVector Offsets; - ComputePTXValueVTs(*this, DAG.getDataLayout(), RetTy, ValVTs, &Offsets, 0); - assert(ValVTs.size() == OutVals.size() && "Bad return value decomposition"); - for (unsigned i = 0, e = Outs.size(); i != e; ++i) { - SDValue theVal = OutVals[i]; - EVT TheValType = theVal.getValueType(); - unsigned numElems = 1; - if (TheValType.isVector()) - numElems = TheValType.getVectorNumElements(); - for (unsigned j = 0, je = numElems; j != je; ++j) { - SDValue TmpVal = theVal; - if (TheValType.isVector()) - TmpVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, - TheValType.getVectorElementType(), TmpVal, - DAG.getIntPtrConstant(j, dl)); - EVT TheStoreType = ValVTs[i]; - if (RetTy->isIntegerTy() && TD.getTypeAllocSizeInBits(RetTy) < 32) { - // The following zero-extension is for integer types only, and - // specifically not for aggregates. - TmpVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, TmpVal); - TheStoreType = MVT::i32; - } else if (RetTy->isHalfTy()) { - TheStoreType = MVT::f16; - } else if (TmpVal.getValueSizeInBits() < 16) - TmpVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, TmpVal); + SDValue RetVal = OutVals[i]; + if (ExtendIntegerRetVal) { + RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND + : ISD::ZERO_EXTEND, + dl, MVT::i32, RetVal); + } else if (RetVal.getValueSizeInBits() < 16) { + // Use 16-bit registers for small load-stores as it's the + // smallest general purpose register size supported by NVPTX. + RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal); + } - SDValue Ops[] = { - Chain, - DAG.getConstant(Offsets[i], dl, MVT::i32), - TmpVal }; - Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl, - DAG.getVTList(MVT::Other), Ops, - TheStoreType, - MachinePointerInfo()); + // Record the value to return. + StoreOperands.push_back(RetVal); + + // That's the last element of this store op. + if (VectorInfo[i] & PVF_LAST) { + NVPTXISD::NodeType Op; + unsigned NumElts = StoreOperands.size() - 2; + switch (NumElts) { + case 1: + Op = NVPTXISD::StoreRetval; + break; + case 2: + Op = NVPTXISD::StoreRetvalV2; + break; + case 4: + Op = NVPTXISD::StoreRetvalV4; + break; + default: + llvm_unreachable("Invalid vector info."); } + + // Adjust type of load/store op if we've extended the scalar + // return value. + EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i]; + Chain = DAG.getMemIntrinsicNode(Op, dl, DAG.getVTList(MVT::Other), + StoreOperands, TheStoreType, + MachinePointerInfo(), 1); + // Cleanup vector state. 
+ StoreOperands.clear(); } } diff --git a/test/CodeGen/NVPTX/aggregate-return.ll b/test/CodeGen/NVPTX/aggregate-return.ll index 527c5c9aa85..785b4d6d90d 100644 --- a/test/CodeGen/NVPTX/aggregate-return.ll +++ b/test/CodeGen/NVPTX/aggregate-return.ll @@ -1,21 +1,40 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s declare <2 x float> @barv(<2 x float> %input) +declare <3 x float> @barv3(<3 x float> %input) declare [2 x float] @bara([2 x float] %input) declare {float, float} @bars({float, float} %input) -define void @foov(<2 x float> %input, <2 x float>* %output) { -; CHECK-LABEL: @foov +define void @test_v2f32(<2 x float> %input, <2 x float>* %output) { +; CHECK-LABEL: @test_v2f32 %call = tail call <2 x float> @barv(<2 x float> %input) ; CHECK: .param .align 8 .b8 retval0[8]; -; CHECK: ld.param.v2.f32 {[[ELEMV1:%f[0-9]+]], [[ELEMV2:%f[0-9]+]]}, [retval0+0]; +; CHECK: ld.param.v2.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]]}, [retval0+0]; store <2 x float> %call, <2 x float>* %output, align 8 -; CHECK: st.v2.f32 [{{%rd[0-9]+}}], {[[ELEMV1]], [[ELEMV2]]} +; CHECK: st.v2.f32 [{{%rd[0-9]+}}], {[[E0]], [[E1]]} ret void } -define void @fooa([2 x float] %input, [2 x float]* %output) { -; CHECK-LABEL: @fooa +define void @test_v3f32(<3 x float> %input, <3 x float>* %output) { +; CHECK-LABEL: @test_v3f32 +; + %call = tail call <3 x float> @barv3(<3 x float> %input) +; CHECK: .param .align 16 .b8 retval0[16]; +; CHECK-DAG: ld.param.v2.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]]}, [retval0+0]; +; CHECK-DAG: ld.param.f32 [[E2:%f[0-9]+]], [retval0+8]; +; Make sure we don't load more values than than we need to. +; CHECK-NOT: ld.param.f32 [[E3:%f[0-9]+]], [retval0+12]; + store <3 x float> %call, <3 x float>* %output, align 8 +; CHECK-DAG: st.f32 [{{%rd[0-9]}}+8], +; -- This is suboptimal. We should do st.v2.f32 instead +; of combining 2xf32 info i64. 
+; CHECK-DAG: st.u64 [{{%rd[0-9]}}], +; CHECK: ret; + ret void +} + +define void @test_a2f32([2 x float] %input, [2 x float]* %output) { +; CHECK-LABEL: @test_a2f32 %call = tail call [2 x float] @bara([2 x float] %input) ; CHECK: .param .align 4 .b8 retval0[8]; ; CHECK-DAG: ld.param.f32 [[ELEMA1:%f[0-9]+]], [retval0+0]; @@ -28,8 +47,8 @@ define void @fooa([2 x float] %input, [2 x float]* %output) { ; CHECK: ret } -define void @foos({float, float} %input, {float, float}* %output) { -; CHECK-LABEL: @foos +define void @test_s2f32({float, float} %input, {float, float}* %output) { +; CHECK-LABEL: @test_s2f32 %call = tail call {float, float} @bars({float, float} %input) ; CHECK: .param .align 4 .b8 retval0[8]; ; CHECK-DAG: ld.param.f32 [[ELEMS1:%f[0-9]+]], [retval0+0]; diff --git a/test/CodeGen/NVPTX/f16-instructions.ll b/test/CodeGen/NVPTX/f16-instructions.ll index b94fd17e91f..5fce2961127 100644 --- a/test/CodeGen/NVPTX/f16-instructions.ll +++ b/test/CodeGen/NVPTX/f16-instructions.ll @@ -229,7 +229,7 @@ define half @test_tailcall_flipped(half %a, half %b) #0 { ; CHECK-LABEL: test_select( ; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_select_param_0]; ; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_select_param_1]; -; CHECK: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1; +; CHECK-DAG: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1; ; CHECK-NEXT: selp.b16 [[R:%h[0-9]+]], [[A]], [[B]], [[PRED]]; ; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; ; CHECK-NEXT: ret; diff --git a/test/CodeGen/NVPTX/ldparam-v4.ll b/test/CodeGen/NVPTX/ldparam-v4.ll index ec306aafe85..4d082f6e9a5 100644 --- a/test/CodeGen/NVPTX/ldparam-v4.ll +++ b/test/CodeGen/NVPTX/ldparam-v4.ll @@ -2,8 +2,11 @@ declare <4 x float> @bar() +; CHECK-LABEL: .func foo( define void @foo(<4 x float>* %ptr) { -; CHECK: ld.param.v4.f32 +; CHECK: ld.param.u32 %[[PTR:r[0-9]+]], [foo_param_0]; +; CHECK: ld.param.v4.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]], [[E2:%f[0-9]+]], [[E3:%f[0-9]+]]}, [retval0+0]; +; CHECK: st.v4.f32 [%[[PTR]]], {[[E0]], [[E1]], [[E2]], [[E3]]} %val = tail call <4 x float> @bar() store <4 x float> %val, <4 x float>* %ptr ret void diff --git a/test/CodeGen/NVPTX/lower-aggr-copies.ll b/test/CodeGen/NVPTX/lower-aggr-copies.ll index c487c9bab4b..f593969f2a4 100644 --- a/test/CodeGen/NVPTX/lower-aggr-copies.ll +++ b/test/CodeGen/NVPTX/lower-aggr-copies.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s --check-prefix PTX +; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 | FileCheck %s --check-prefix PTX ; RUN: opt < %s -S -nvptx-lower-aggr-copies | FileCheck %s --check-prefix IR ; Verify that the NVPTXLowerAggrCopies pass works as expected - calls to @@ -27,9 +27,9 @@ entry: ; PTX: LBB[[LABEL:[_0-9]+]]: ; PTX: ld.u8 %rs[[REG:[0-9]+]] ; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[REG]] -; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd[[COUNTER]], 1 -; PTX-NEXT: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd -; PTX-NEXT: @%p[[PRED]] bra LBB[[LABEL]] +; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1 +; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd +; PTX: @%p[[PRED]] bra LBB[[LABEL]] } define i8* @memcpy_volatile_caller(i8* %dst, i8* %src, i64 %n) #0 { @@ -45,9 +45,9 @@ entry: ; PTX: LBB[[LABEL:[_0-9]+]]: ; PTX: ld.volatile.u8 %rs[[REG:[0-9]+]] ; PTX: st.volatile.u8 [%rd{{[0-9]+}}], %rs[[REG]] -; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd[[COUNTER]], 1 -; PTX-NEXT: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd -; PTX-NEXT: @%p[[PRED]] bra LBB[[LABEL]] +; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1 
+; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd +; PTX: @%p[[PRED]] bra LBB[[LABEL]] } define i8* @memcpy_casting_caller(i32* %dst, i32* %src, i64 %n) #0 { @@ -78,12 +78,13 @@ entry: ; IR-NEXT: store i8 [[VAL]], i8* [[STOREPTR]] ; PTX-LABEL: .visible .func (.param .b64 func_retval0) memset_caller( -; PTX: ld.param.u8 %rs[[REG:[0-9]+]] +; PTX: ld.param.u32 %r[[C:[0-9]+]] +; PTX: cvt.u16.u32 %rs[[REG:[0-9]+]], %r[[C]]; ; PTX: LBB[[LABEL:[_0-9]+]]: ; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[REG]] -; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd[[COUNTER]], 1 -; PTX-NEXT: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd -; PTX-NEXT: @%p[[PRED]] bra LBB[[LABEL]] +; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1 +; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd +; PTX: @%p[[PRED]] bra LBB[[LABEL]] } define i8* @volatile_memset_caller(i8* %dst, i32 %c, i64 %n) #0 { @@ -118,7 +119,7 @@ entry: ; PTX-NEXT: @%p[[SRC_GT_THAN_DST]] bra LBB[[FORWARD_BB:[0-9_]+]] ; -- this is the backwards copying BB ; PTX: @%p[[NEQ0]] bra LBB[[EXIT:[0-9_]+]] -; PTX: add.s64 %rd[[N]], %rd[[N]], -1 +; PTX: add.s64 %rd{{[0-9]}}, %rd{{[0-9]}}, -1 ; PTX: ld.u8 %rs[[ELEMENT:[0-9]+]] ; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[ELEMENT]] ; -- this is the forwards copying BB @@ -126,7 +127,7 @@ entry: ; PTX: @%p[[NEQ0]] bra LBB[[EXIT]] ; PTX: ld.u8 %rs[[ELEMENT2:[0-9]+]] ; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[ELEMENT2]] -; PTX: add.s64 %rd[[INDEX:[0-9]+]], %rd[[INDEX]], 1 +; PTX: add.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, 1 ; -- exit block ; PTX: LBB[[EXIT]]: ; PTX-NEXT: st.param.b64 [func_retval0 diff --git a/test/CodeGen/NVPTX/param-load-store.ll b/test/CodeGen/NVPTX/param-load-store.ll new file mode 100644 index 00000000000..a6defdcb478 --- /dev/null +++ b/test/CodeGen/NVPTX/param-load-store.ll @@ -0,0 +1,813 @@ +; Verifies correctness of load/store of parameters and return values. +; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 | FileCheck %s + +%s_i1 = type { i1 } +%s_i8 = type { i8 } +%s_i16 = type { i16 } +%s_half = type { half } +%s_i32 = type { i32 } +%s_float = type { float } +%s_i64 = type { i64 } +%s_f64 = type { double } + +; More complicated types. i64 is used to increase natural alignment +; requirement for the type. +%s_i32x4 = type { i32, i32, i32, i32, i64} +%s_i32f32 = type { i32, float, i32, float, i64} +%s_i8i32x4 = type { i32, i32, i8, i32, i32, i64} +%s_i8i32x4p = type <{ i32, i32, i8, i32, i32, i64}> +%s_crossfield = type { i32, [2 x i32], <4 x i32>, [3 x {i32, i32, i32}]} +; All scalar parameters must be at least 32 bits in size. +; i1 is loaded/stored as i8. + +; CHECK: .func (.param .b32 func_retval0) +; CHECK-LABEL: test_i1( +; CHECK-NEXT: .param .b32 test_i1_param_0 +; CHECK: ld.param.u8 [[A8:%r[0-9]+]], [test_i1_param_0]; +; CHECK: and.b32 [[A:%r[0-9]+]], [[A8]], 1; +; CHECK: .param .b32 param0; +; CHECK: st.param.b32 [param0+0], [[A]] +; CHECK: .param .b32 retval0; +; CHECK: call.uni +; CHECK-NEXT: test_i1, +; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0+0]; +; CHECK: and.b32 [[R:%r[0-9]+]], [[R8]], 1; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define i1 @test_i1(i1 %a) { + %r = tail call i1 @test_i1(i1 %a); + ret i1 %r; +} + +; Signed i1 is a somewhat special case. We only care about one bit and +; then us neg.s32 to convert it to 32-bit -1 if it's set. 
+; CHECK: .func (.param .b32 func_retval0) +; CHECK-LABEL: test_i1s( +; CHECK-NEXT: .param .b32 test_i1s_param_0 +; CHECK: ld.param.u8 [[A8:%rs[0-9]+]], [test_i1s_param_0]; +; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]]; +; CHECK: and.b32 [[A1:%r[0-9]+]], [[A32]], 1; +; CHECK: neg.s32 [[A:%r[0-9]+]], [[A1]]; +; CHECK: .param .b32 param0; +; CHECK: st.param.b32 [param0+0], [[A]]; +; CHECK: .param .b32 retval0; +; CHECK: call.uni +; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0+0]; +; CHECK: and.b32 [[R1:%r[0-9]+]], [[R8]], 1; +; CHECK: neg.s32 [[R:%r[0-9]+]], [[R1]]; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define signext i1 @test_i1s(i1 signext %a) { + %r = tail call signext i1 @test_i1s(i1 signext %a); + ret i1 %r; +} + +; Make sure that i1 loads are vectorized as i8 loads, respecting each element alignment. +; CHECK: .func (.param .align 4 .b8 func_retval0[4]) +; CHECK-LABEL: test_v3i1( +; CHECK-NEXT: .param .align 4 .b8 test_v3i1_param_0[4] +; CHECK-DAG: ld.param.u8 [[E2:%rs[0-9]+]], [test_v3i1_param_0+2]; +; CHECK-DAG: ld.param.v2.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i1_param_0] +; CHECK: .param .align 4 .b8 param0[4]; +; CHECK-DAG: st.param.v2.b8 [param0+0], {[[E0]], [[E1]]}; +; CHECK-DAG: st.param.b8 [param0+2], [[E2]]; +; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_v3i1, +; CHECK-DAG: ld.param.v2.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0]; +; CHECK-DAG: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2]; +; CHECK-DAG: st.param.v2.b8 [func_retval0+0], {[[RE0]], [[RE1]]} +; CHECK-DAG: st.param.b8 [func_retval0+2], [[RE2]]; +; CHECK-NEXT: ret; +define <3 x i1> @test_v3i1(<3 x i1> %a) { + %r = tail call <3 x i1> @test_v3i1(<3 x i1> %a); + ret <3 x i1> %r; +} + +; CHECK: .func (.param .align 4 .b8 func_retval0[4]) +; CHECK-LABEL: test_v4i1( +; CHECK-NEXT: .param .align 4 .b8 test_v4i1_param_0[4] +; CHECK: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i1_param_0] +; CHECK: .param .align 4 .b8 param0[4]; +; CHECK: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK: test_v4i1, +; CHECK: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0]; +; CHECK: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}; +; CHECK-NEXT: ret; +define <4 x i1> @test_v4i1(<4 x i1> %a) { + %r = tail call <4 x i1> @test_v4i1(<4 x i1> %a); + ret <4 x i1> %r; +} + +; CHECK: .func (.param .align 8 .b8 func_retval0[8]) +; CHECK-LABEL: test_v5i1( +; CHECK-NEXT: .param .align 8 .b8 test_v5i1_param_0[8] +; CHECK-DAG: ld.param.u8 [[E4:%rs[0-9]+]], [test_v5i1_param_0+4]; +; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i1_param_0] +; CHECK: .param .align 8 .b8 param0[8]; +; CHECK-DAG: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK-DAG: st.param.b8 [param0+4], [[E4]]; +; CHECK: .param .align 8 .b8 retval0[8]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_v5i1, +; CHECK-DAG: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0]; +; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4]; +; CHECK-DAG: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} +; CHECK-DAG: st.param.b8 [func_retval0+4], [[RE4]]; +; CHECK-NEXT: ret; +define <5 x i1> @test_v5i1(<5 x i1> %a) { + %r = tail 
call <5 x i1> @test_v5i1(<5 x i1> %a); + ret <5 x i1> %r; +} + +; Unsigned i8 is loaded directly into 32-bit register. +; CHECK: .func (.param .b32 func_retval0) +; CHECK-LABEL: test_i8( +; CHECK-NEXT: .param .b32 test_i8_param_0 +; CHECK: ld.param.u8 [[A8:%rs[0-9]+]], [test_i8_param_0]; +; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]]; +; CHECK: and.b32 [[A:%r[0-9]+]], [[A32]], 255; +; CHECK: .param .b32 param0; +; CHECK: st.param.b32 [param0+0], [[A]]; +; CHECK: .param .b32 retval0; +; CHECK: call.uni (retval0), +; CHECK: test_i8, +; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0+0]; +; CHECK: and.b32 [[R:%r[0-9]+]], [[R32]], 255; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define i8 @test_i8(i8 %a) { + %r = tail call i8 @test_i8(i8 %a); + ret i8 %r; +} + +; signed i8 is loaded into 16-bit register which is then sign-extended to i32. +; CHECK: .func (.param .b32 func_retval0) +; CHECK-LABEL: test_i8s( +; CHECK-NEXT: .param .b32 test_i8s_param_0 +; CHECK: ld.param.s8 [[A8:%rs[0-9]+]], [test_i8s_param_0]; +; CHECK: cvt.s32.s16 [[A:%r[0-9]+]], [[A8]]; +; CHECK: .param .b32 param0; +; CHECK: st.param.b32 [param0+0], [[A]]; +; CHECK: .param .b32 retval0; +; CHECK: call.uni (retval0), +; CHECK: test_i8s, +; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0+0]; +; -- This is suspicious (though correct) -- why not cvt.u8.u32, cvt.s8.s32 ? +; CHECK: cvt.u16.u32 [[R16:%rs[0-9]+]], [[R32]]; +; CHECK: cvt.s32.s16 [[R:%r[0-9]+]], [[R16]]; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define signext i8 @test_i8s(i8 signext %a) { + %r = tail call signext i8 @test_i8s(i8 signext %a); + ret i8 %r; +} + +; CHECK: .func (.param .align 4 .b8 func_retval0[4]) +; CHECK-LABEL: test_v3i8( +; CHECK-NEXT: .param .align 4 .b8 test_v3i8_param_0[4] +; CHECK-DAG: ld.param.u8 [[E2:%rs[0-9]+]], [test_v3i8_param_0+2]; +; CHECK-DAG: ld.param.v2.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i8_param_0]; +; CHECK: .param .align 4 .b8 param0[4]; +; CHECK: st.param.v2.b8 [param0+0], {[[E0]], [[E1]]}; +; CHECK: st.param.b8 [param0+2], [[E2]]; +; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_v3i8, +; CHECK-DAG: ld.param.v2.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0]; +; CHECK-DAG: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2]; +; CHECK-DAG: st.param.v2.b8 [func_retval0+0], {[[RE0]], [[RE1]]}; +; CHECK-DAG: st.param.b8 [func_retval0+2], [[RE2]]; +; CHECK-NEXT: ret; +define <3 x i8> @test_v3i8(<3 x i8> %a) { + %r = tail call <3 x i8> @test_v3i8(<3 x i8> %a); + ret <3 x i8> %r; +} + +; CHECK: .func (.param .align 4 .b8 func_retval0[4]) +; CHECK-LABEL: test_v4i8( +; CHECK-NEXT: .param .align 4 .b8 test_v4i8_param_0[4] +; CHECK: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i8_param_0] +; CHECK: .param .align 4 .b8 param0[4]; +; CHECK: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_v4i8, +; CHECK: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0]; +; CHECK: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} +; CHECK-NEXT: ret; +define <4 x i8> @test_v4i8(<4 x i8> %a) { + %r = tail call <4 x i8> @test_v4i8(<4 x i8> %a); + ret <4 x i8> %r; +} + +; CHECK: .func (.param .align 8 .b8 func_retval0[8]) +; CHECK-LABEL: test_v5i8( +; CHECK-NEXT: .param .align 8 .b8 test_v5i8_param_0[8] +; CHECK-DAG: ld.param.u8 
[[E4:%rs[0-9]+]], [test_v5i8_param_0+4];
+; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i8_param_0]
+; CHECK: .param .align 8 .b8 param0[8];
+; CHECK-DAG: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK-DAG: st.param.b8 [param0+4], [[E4]];
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v5i8,
+; CHECK-DAG: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4];
+; CHECK-DAG: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-DAG: st.param.b8 [func_retval0+4], [[RE4]];
+; CHECK-NEXT: ret;
+define <5 x i8> @test_v5i8(<5 x i8> %a) {
+  %r = tail call <5 x i8> @test_v5i8(<5 x i8> %a);
+  ret <5 x i8> %r;
+}
+
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_i16(
+; CHECK-NEXT: .param .b32 test_i16_param_0
+; CHECK: ld.param.u16 [[E16:%rs[0-9]+]], [test_i16_param_0];
+; CHECK: cvt.u32.u16 [[E32:%r[0-9]+]], [[E16]];
+; CHECK: .param .b32 param0;
+; CHECK: st.param.b32 [param0+0], [[E32]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_i16,
+; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0+0];
+; CHECK: and.b32 [[R:%r[0-9]+]], [[RE32]], 65535;
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i16 @test_i16(i16 %a) {
+  %r = tail call i16 @test_i16(i16 %a);
+  ret i16 %r;
+}
+
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_i16s(
+; CHECK-NEXT: .param .b32 test_i16s_param_0
+; CHECK: ld.param.u16 [[E16:%rs[0-9]+]], [test_i16s_param_0];
+; CHECK: cvt.s32.s16 [[E32:%r[0-9]+]], [[E16]];
+; CHECK: .param .b32 param0;
+; CHECK: st.param.b32 [param0+0], [[E32]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_i16s,
+; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0+0];
+; CHECK: cvt.s32.s16 [[R:%r[0-9]+]], [[RE32]];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define signext i16 @test_i16s(i16 signext %a) {
+  %r = tail call signext i16 @test_i16s(i16 signext %a);
+  ret i16 %r;
+}
+
+; CHECK: .func (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_v3i16(
+; CHECK-NEXT: .param .align 8 .b8 test_v3i16_param_0[8]
+; CHECK-DAG: ld.param.u16 [[E2:%rs[0-9]+]], [test_v3i16_param_0+4];
+; CHECK-DAG: ld.param.v2.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i16_param_0];
+; CHECK: .param .align 8 .b8 param0[8];
+; CHECK: st.param.v2.b16 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.b16 [param0+4], [[E2]];
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v3i16,
+; CHECK: ld.param.v2.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.b16 [[RE2:%rs[0-9]+]], [retval0+4];
+; CHECK-DAG: st.param.v2.b16 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-DAG: st.param.b16 [func_retval0+4], [[RE2]];
+; CHECK-NEXT: ret;
+define <3 x i16> @test_v3i16(<3 x i16> %a) {
+  %r = tail call <3 x i16> @test_v3i16(<3 x i16> %a);
+  ret <3 x i16> %r;
+}
+
+; CHECK: .func (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_v4i16(
+; CHECK-NEXT: .param .align 8 .b8 test_v4i16_param_0[8]
+; CHECK: ld.param.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i16_param_0]
+; CHECK: .param .align 8 .b8 param0[8];
+; CHECK: st.param.v4.b16 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v4i16,
+; CHECK: ld.param.v4.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
+; CHECK: st.param.v4.b16 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-NEXT: ret;
+define <4 x i16> @test_v4i16(<4 x i16> %a) {
+  %r = tail call <4 x i16> @test_v4i16(<4 x i16> %a);
+  ret <4 x i16> %r;
+}
+
+; CHECK: .func (.param .align 16 .b8 func_retval0[16])
+; CHECK-LABEL: test_v5i16(
+; CHECK-NEXT: .param .align 16 .b8 test_v5i16_param_0[16]
+; CHECK-DAG: ld.param.u16 [[E4:%rs[0-9]+]], [test_v5i16_param_0+8];
+; CHECK-DAG: ld.param.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i16_param_0]
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK-DAG: st.param.v4.b16 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK-DAG: st.param.b16 [param0+8], [[E4]];
+; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v5i16,
+; CHECK-DAG: ld.param.v4.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.b16 [[RE4:%rs[0-9]+]], [retval0+8];
+; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-DAG: st.param.b16 [func_retval0+8], [[RE4]];
+; CHECK-NEXT: ret;
+define <5 x i16> @test_v5i16(<5 x i16> %a) {
+  %r = tail call <5 x i16> @test_v5i16(<5 x i16> %a);
+  ret <5 x i16> %r;
+}
+
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_half(
+; CHECK-NEXT: .param .b32 test_half_param_0
+; CHECK: ld.param.b16 [[E:%h[0-9]+]], [test_half_param_0];
+; CHECK: .param .b32 param0;
+; CHECK: st.param.b16 [param0+0], [[E]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_half,
+; CHECK: ld.param.b16 [[R:%h[0-9]+]], [retval0+0];
+; CHECK: st.param.b16 [func_retval0+0], [[R]]
+; CHECK-NEXT: ret;
+define half @test_half(half %a) {
+  %r = tail call half @test_half(half %a);
+  ret half %r;
+}
+
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_i32(
+; CHECK-NEXT: .param .b32 test_i32_param_0
+; CHECK: ld.param.u32 [[E:%r[0-9]+]], [test_i32_param_0];
+; CHECK: .param .b32 param0;
+; CHECK: st.param.b32 [param0+0], [[E]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_i32,
+; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0+0];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i32 @test_i32(i32 %a) {
+  %r = tail call i32 @test_i32(i32 %a);
+  ret i32 %r;
+}
+
+; CHECK: .func (.param .align 16 .b8 func_retval0[16])
+; CHECK-LABEL: test_v3i32(
+; CHECK-NEXT: .param .align 16 .b8 test_v3i32_param_0[16]
+; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [test_v3i32_param_0+8];
+; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v3i32_param_0];
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.b32 [param0+8], [[E2]];
+; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v3i32,
+; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8];
+; CHECK-DAG: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-DAG: st.param.b32 [func_retval0+8], [[RE2]];
+; CHECK-NEXT: ret;
+define <3 x i32> @test_v3i32(<3 x i32> %a) {
+  %r = tail call <3 x i32> @test_v3i32(<3 x i32> %a);
+  ret <3 x i32> %r;
+}
+
+; CHECK: .func (.param .align 16 .b8 func_retval0[16])
+; CHECK-LABEL: test_v4i32(
+; CHECK-NEXT: .param .align 16 .b8 test_v4i32_param_0[16]
+; CHECK: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v4i32_param_0]
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v4i32,
+; CHECK: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+0];
+; CHECK: st.param.v4.b32 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-NEXT: ret;
+define <4 x i32> @test_v4i32(<4 x i32> %a) {
+  %r = tail call <4 x i32> @test_v4i32(<4 x i32> %a);
+  ret <4 x i32> %r;
+}
+
+; CHECK: .func (.param .align 32 .b8 func_retval0[32])
+; CHECK-LABEL: test_v5i32(
+; CHECK-NEXT: .param .align 32 .b8 test_v5i32_param_0[32]
+; CHECK-DAG: ld.param.u32 [[E4:%r[0-9]+]], [test_v5i32_param_0+16];
+; CHECK-DAG: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v5i32_param_0]
+; CHECK: .param .align 32 .b8 param0[32];
+; CHECK-DAG: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK-DAG: st.param.b32 [param0+16], [[E4]];
+; CHECK: .param .align 32 .b8 retval0[32];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v5i32,
+; CHECK-DAG: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.b32 [[RE4:%r[0-9]+]], [retval0+16];
+; CHECK-DAG: st.param.v4.b32 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-DAG: st.param.b32 [func_retval0+16], [[RE4]];
+; CHECK-NEXT: ret;
+define <5 x i32> @test_v5i32(<5 x i32> %a) {
+  %r = tail call <5 x i32> @test_v5i32(<5 x i32> %a);
+  ret <5 x i32> %r;
+}
+
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_float(
+; CHECK-NEXT: .param .b32 test_float_param_0
+; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_float_param_0];
+; CHECK: .param .b32 param0;
+; CHECK: st.param.f32 [param0+0], [[E]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_float,
+; CHECK: ld.param.f32 [[R:%f[0-9]+]], [retval0+0];
+; CHECK: st.param.f32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define float @test_float(float %a) {
+  %r = tail call float @test_float(float %a);
+  ret float %r;
+}
+
+; CHECK: .func (.param .b64 func_retval0)
+; CHECK-LABEL: test_i64(
+; CHECK-NEXT: .param .b64 test_i64_param_0
+; CHECK: ld.param.u64 [[E:%rd[0-9]+]], [test_i64_param_0];
+; CHECK: .param .b64 param0;
+; CHECK: st.param.b64 [param0+0], [[E]];
+; CHECK: .param .b64 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_i64,
+; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0+0];
+; CHECK: st.param.b64 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i64 @test_i64(i64 %a) {
+  %r = tail call i64 @test_i64(i64 %a);
+  ret i64 %r;
+}
+
+; CHECK: .func (.param .align 32 .b8 func_retval0[32])
+; CHECK-LABEL: test_v3i64(
+; CHECK-NEXT: .param .align 32 .b8 test_v3i64_param_0[32]
+; CHECK-DAG: ld.param.u64 [[E2:%rd[0-9]+]], [test_v3i64_param_0+16];
+; CHECK-DAG: ld.param.v2.u64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v3i64_param_0];
+; CHECK: .param .align 32 .b8 param0[32];
+; CHECK: st.param.v2.b64 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.b64 [param0+16], [[E2]];
+; CHECK: .param .align 32 .b8 retval0[32];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v3i64,
+; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.b64 [[RE2:%rd[0-9]+]], [retval0+16];
+; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE2]];
+; CHECK-NEXT: ret;
+define <3 x i64> @test_v3i64(<3 x i64> %a) {
+  %r = tail call <3 x i64> @test_v3i64(<3 x i64> %a);
+  ret <3 x i64> %r;
+}
+
+; For i64, vector loads are limited by PTX to 2 elements.
+; CHECK: .func (.param .align 32 .b8 func_retval0[32])
+; CHECK-LABEL: test_v4i64(
+; CHECK-NEXT: .param .align 32 .b8 test_v4i64_param_0[32]
+; CHECK-DAG: ld.param.v2.u64 {[[E2:%rd[0-9]+]], [[E3:%rd[0-9]+]]}, [test_v4i64_param_0+16];
+; CHECK-DAG: ld.param.v2.u64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v4i64_param_0];
+; CHECK: .param .align 32 .b8 param0[32];
+; CHECK: st.param.v2.b64 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.v2.b64 [param0+16], {[[E2]], [[E3]]};
+; CHECK: .param .align 32 .b8 retval0[32];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v4i64,
+; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.v2.b64 {[[RE2:%rd[0-9]+]], [[RE3:%rd[0-9]+]]}, [retval0+16];
+; CHECK-DAG: st.param.v2.b64 [func_retval0+16], {[[RE2]], [[RE3]]};
+; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-NEXT: ret;
+define <4 x i64> @test_v4i64(<4 x i64> %a) {
+  %r = tail call <4 x i64> @test_v4i64(<4 x i64> %a);
+  ret <4 x i64> %r;
+}
+
+; Aggregates, on the other hand, do not get extended.
+
+; CHECK: .func (.param .align 1 .b8 func_retval0[1])
+; CHECK-LABEL: test_s_i1(
+; CHECK-NEXT: .param .align 1 .b8 test_s_i1_param_0[1]
+; CHECK: ld.param.u8 [[A:%rs[0-9]+]], [test_s_i1_param_0];
+; CHECK: .param .align 1 .b8 param0[1];
+; CHECK: st.param.b8 [param0+0], [[A]]
+; CHECK: .param .align 1 .b8 retval0[1];
+; CHECK: call.uni
+; CHECK-NEXT: test_s_i1,
+; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0+0];
+; CHECK: st.param.b8 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_i1 @test_s_i1(%s_i1 %a) {
+  %r = tail call %s_i1 @test_s_i1(%s_i1 %a);
+  ret %s_i1 %r;
+}
+
+; CHECK: .func (.param .align 1 .b8 func_retval0[1])
+; CHECK-LABEL: test_s_i8(
+; CHECK-NEXT: .param .align 1 .b8 test_s_i8_param_0[1]
+; CHECK: ld.param.u8 [[A:%rs[0-9]+]], [test_s_i8_param_0];
+; CHECK: .param .align 1 .b8 param0[1];
+; CHECK: st.param.b8 [param0+0], [[A]]
+; CHECK: .param .align 1 .b8 retval0[1];
+; CHECK: call.uni
+; CHECK-NEXT: test_s_i8,
+; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0+0];
+; CHECK: st.param.b8 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_i8 @test_s_i8(%s_i8 %a) {
+  %r = tail call %s_i8 @test_s_i8(%s_i8 %a);
+  ret %s_i8 %r;
+}
+
+; CHECK: .func (.param .align 2 .b8 func_retval0[2])
+; CHECK-LABEL: test_s_i16(
+; CHECK-NEXT: .param .align 2 .b8 test_s_i16_param_0[2]
+; CHECK: ld.param.u16 [[A:%rs[0-9]+]], [test_s_i16_param_0];
+; CHECK: .param .align 2 .b8 param0[2];
+; CHECK: st.param.b16 [param0+0], [[A]]
+; CHECK: .param .align 2 .b8 retval0[2];
+; CHECK: call.uni
+; CHECK-NEXT: test_s_i16,
+; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0+0];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_i16 @test_s_i16(%s_i16 %a) {
+  %r = tail call %s_i16 @test_s_i16(%s_i16 %a);
+  ret %s_i16 %r;
+}
+
+; CHECK: .func (.param .align 2 .b8 func_retval0[2])
+; CHECK-LABEL: test_s_half(
+; CHECK-NEXT: .param .align 2 .b8 test_s_half_param_0[2]
+; CHECK: ld.param.b16 
[[A:%h[0-9]+]], [test_s_half_param_0]; +; CHECK: .param .align 2 .b8 param0[2]; +; CHECK: st.param.b16 [param0+0], [[A]] +; CHECK: .param .align 2 .b8 retval0[2]; +; CHECK: call.uni +; CHECK-NEXT: test_s_half, +; CHECK: ld.param.b16 [[R:%h[0-9]+]], [retval0+0]; +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define %s_half @test_s_half(%s_half %a) { + %r = tail call %s_half @test_s_half(%s_half %a); + ret %s_half %r; +} + +; CHECK: .func (.param .align 4 .b8 func_retval0[4]) +; CHECK-LABEL: test_s_i32( +; CHECK-NEXT: .param .align 4 .b8 test_s_i32_param_0[4] +; CHECK: ld.param.u32 [[E:%r[0-9]+]], [test_s_i32_param_0]; +; CHECK: .param .align 4 .b8 param0[4] +; CHECK: st.param.b32 [param0+0], [[E]]; +; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_s_i32, +; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0+0]; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define %s_i32 @test_s_i32(%s_i32 %a) { + %r = tail call %s_i32 @test_s_i32(%s_i32 %a); + ret %s_i32 %r; +} + +; CHECK: .func (.param .align 4 .b8 func_retval0[4]) +; CHECK-LABEL: test_s_float( +; CHECK-NEXT: .param .align 4 .b8 test_s_float_param_0[4] +; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_s_float_param_0]; +; CHECK: .param .align 4 .b8 param0[4] +; CHECK: st.param.f32 [param0+0], [[E]]; +; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_s_float, +; CHECK: ld.param.f32 [[R:%f[0-9]+]], [retval0+0]; +; CHECK: st.param.f32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define %s_float @test_s_float(%s_float %a) { + %r = tail call %s_float @test_s_float(%s_float %a); + ret %s_float %r; +} + +; CHECK: .func (.param .align 8 .b8 func_retval0[8]) +; CHECK-LABEL: test_s_i64( +; CHECK-NEXT: .param .align 8 .b8 test_s_i64_param_0[8] +; CHECK: ld.param.u64 [[E:%rd[0-9]+]], [test_s_i64_param_0]; +; CHECK: .param .align 8 .b8 param0[8]; +; CHECK: st.param.b64 [param0+0], [[E]]; +; CHECK: .param .align 8 .b8 retval0[8]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_s_i64, +; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0+0]; +; CHECK: st.param.b64 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define %s_i64 @test_s_i64(%s_i64 %a) { + %r = tail call %s_i64 @test_s_i64(%s_i64 %a); + ret %s_i64 %r; +} + +; Fields that have different types, but identical sizes are not vectorized. 
+; CHECK: .func (.param .align 8 .b8 func_retval0[24])
+; CHECK-LABEL: test_s_i32f32(
+; CHECK: .param .align 8 .b8 test_s_i32f32_param_0[24]
+; CHECK-DAG: ld.param.u64 [[E4:%rd[0-9]+]], [test_s_i32f32_param_0+16];
+; CHECK-DAG: ld.param.f32 [[E3:%f[0-9]+]], [test_s_i32f32_param_0+12];
+; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [test_s_i32f32_param_0+8];
+; CHECK-DAG: ld.param.f32 [[E1:%f[0-9]+]], [test_s_i32f32_param_0+4];
+; CHECK-DAG: ld.param.u32 [[E0:%r[0-9]+]], [test_s_i32f32_param_0];
+; CHECK: .param .align 8 .b8 param0[24];
+; CHECK-DAG: st.param.b32 [param0+0], [[E0]];
+; CHECK-DAG: st.param.f32 [param0+4], [[E1]];
+; CHECK-DAG: st.param.b32 [param0+8], [[E2]];
+; CHECK-DAG: st.param.f32 [param0+12], [[E3]];
+; CHECK-DAG: st.param.b64 [param0+16], [[E4]];
+; CHECK: .param .align 8 .b8 retval0[24];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_s_i32f32,
+; CHECK-DAG: ld.param.b32 [[RE0:%r[0-9]+]], [retval0+0];
+; CHECK-DAG: ld.param.f32 [[RE1:%f[0-9]+]], [retval0+4];
+; CHECK-DAG: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8];
+; CHECK-DAG: ld.param.f32 [[RE3:%f[0-9]+]], [retval0+12];
+; CHECK-DAG: ld.param.b64 [[RE4:%rd[0-9]+]], [retval0+16];
+; CHECK-DAG: st.param.b32 [func_retval0+0], [[RE0]];
+; CHECK-DAG: st.param.f32 [func_retval0+4], [[RE1]];
+; CHECK-DAG: st.param.b32 [func_retval0+8], [[RE2]];
+; CHECK-DAG: st.param.f32 [func_retval0+12], [[RE3]];
+; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE4]];
+; CHECK: ret;
+define %s_i32f32 @test_s_i32f32(%s_i32f32 %a) {
+  %r = tail call %s_i32f32 @test_s_i32f32(%s_i32f32 %a);
+  ret %s_i32f32 %r;
+}
+
+; We do vectorize consecutive fields with matching types.
+; CHECK:.visible .func (.param .align 8 .b8 func_retval0[24])
+; CHECK-LABEL: test_s_i32x4(
+; CHECK: .param .align 8 .b8 test_s_i32x4_param_0[24]
+; CHECK-DAG: ld.param.u64 [[E4:%rd[0-9]+]], [test_s_i32x4_param_0+16];
+; CHECK-DAG: ld.param.v2.u32 {[[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_s_i32x4_param_0+8];
+; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i32x4_param_0];
+; CHECK: .param .align 8 .b8 param0[24];
+; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.v2.b32 [param0+8], {[[E2]], [[E3]]};
+; CHECK: st.param.b64 [param0+16], [[E4]];
+; CHECK: .param .align 8 .b8 retval0[24];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_s_i32x4,
+; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.v2.b32 {[[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+8];
+; CHECK: ld.param.b64 [[RE4:%rd[0-9]+]], [retval0+16];
+; CHECK-DAG: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-DAG: st.param.v2.b32 [func_retval0+8], {[[RE2]], [[RE3]]};
+; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE4]];
+; CHECK: ret;
+
+define %s_i32x4 @test_s_i32x4(%s_i32x4 %a) {
+  %r = tail call %s_i32x4 @test_s_i32x4(%s_i32x4 %a);
+  ret %s_i32x4 %r;
+}
+
+; CHECK:.visible .func (.param .align 8 .b8 func_retval0[32])
+; CHECK-LABEL: test_s_i1i32x4(
+; CHECK: .param .align 8 .b8 test_s_i1i32x4_param_0[32]
+; CHECK: ld.param.u64 [[E5:%rd[0-9]+]], [test_s_i1i32x4_param_0+24];
+; CHECK: ld.param.u32 [[E4:%r[0-9]+]], [test_s_i1i32x4_param_0+16];
+; CHECK: ld.param.u32 [[E3:%r[0-9]+]], [test_s_i1i32x4_param_0+12];
+; CHECK: ld.param.u8 [[E2:%rs[0-9]+]], [test_s_i1i32x4_param_0+8];
+; CHECK: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i1i32x4_param_0];
+; CHECK: .param .align 8 .b8 param0[32];
+; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.b8 [param0+8], [[E2]];
+; 
CHECK: st.param.b32 [param0+12], [[E3]]; +; CHECK: st.param.b32 [param0+16], [[E4]]; +; CHECK: st.param.b64 [param0+24], [[E5]]; +; CHECK: .param .align 8 .b8 retval0[32]; +; CHECK: call.uni (retval0), +; CHECK: test_s_i1i32x4, +; CHECK: ( +; CHECK: param0 +; CHECK: ); +; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0]; +; CHECK: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+8]; +; CHECK: ld.param.b32 [[RE3:%r[0-9]+]], [retval0+12]; +; CHECK: ld.param.b32 [[RE4:%r[0-9]+]], [retval0+16]; +; CHECK: ld.param.b64 [[RE5:%rd[0-9]+]], [retval0+24]; +; CHECK: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]}; +; CHECK: st.param.b8 [func_retval0+8], [[RE2]]; +; CHECK: st.param.b32 [func_retval0+12], [[RE3]]; +; CHECK: st.param.b32 [func_retval0+16], [[RE4]]; +; CHECK: st.param.b64 [func_retval0+24], [[RE5]]; +; CHECK: ret; + +define %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a) { + %r = tail call %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a); + ret %s_i8i32x4 %r; +} + +; -- All loads/stores from parameters aligned by one must be done one +; -- byte at a time. +; CHECK:.visible .func (.param .align 1 .b8 func_retval0[25]) +; CHECK-LABEL: test_s_i1i32x4p( +; CHECK-DAG: .param .align 1 .b8 test_s_i1i32x4p_param_0[25] +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+24]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+23]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+22]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+21]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+20]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+19]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+18]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+17]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+16]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+15]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+14]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+13]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+12]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+11]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+10]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+9]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+8]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+7]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+6]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+5]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+4]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+3]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+2]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+1]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0]; +; --- TODO +; --- Unaligned parameter store/ return value load is broken in both nvcc +; --- and llvm and needs to be fixed. 
+; CHECK: .param .align 1 .b8 param0[25]; +; CHECK-DAG: st.param.b32 [param0+0], +; CHECK-DAG: st.param.b32 [param0+4], +; CHECK-DAG: st.param.b8 [param0+8], +; CHECK-DAG: st.param.b32 [param0+9], +; CHECK-DAG: st.param.b32 [param0+13], +; CHECK-DAG: st.param.b64 [param0+17], +; CHECK: .param .align 1 .b8 retval0[25]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_s_i1i32x4p, +; CHECK-DAG: ld.param.b32 %r41, [retval0+0]; +; CHECK-DAG: ld.param.b32 %r42, [retval0+4]; +; CHECK-DAG: ld.param.b8 %rs2, [retval0+8]; +; CHECK-DAG: ld.param.b32 %r43, [retval0+9]; +; CHECK-DAG: ld.param.b32 %r44, [retval0+13]; +; CHECK-DAG: ld.param.b64 %rd23, [retval0+17]; +; CHECK-DAG: st.param.b32 [func_retval0+0], +; CHECK-DAG: st.param.b32 [func_retval0+4], +; CHECK-DAG: st.param.b8 [func_retval0+8], +; CHECK-DAG: st.param.b32 [func_retval0+9], +; CHECK-DAG: st.param.b32 [func_retval0+13], +; CHECK-DAG: st.param.b64 [func_retval0+17], + +define %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a) { + %r = tail call %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a); + ret %s_i8i32x4p %r; +} + +; Check that we can vectorize loads that span multiple aggregate fields. +; CHECK:.visible .func (.param .align 16 .b8 func_retval0[80]) +; CHECK-LABEL: test_s_crossfield( +; CHECK: .param .align 16 .b8 test_s_crossfield_param_0[80] +; CHECK: ld.param.u32 [[E15:%r[0-9]+]], [test_s_crossfield_param_0+64]; +; CHECK: ld.param.v4.u32 {[[E11:%r[0-9]+]], [[E12:%r[0-9]+]], [[E13:%r[0-9]+]], [[E14:%r[0-9]+]]}, [test_s_crossfield_param_0+48]; +; CHECK: ld.param.v4.u32 {[[E7:%r[0-9]+]], [[E8:%r[0-9]+]], [[E9:%r[0-9]+]], [[E10:%r[0-9]+]]}, [test_s_crossfield_param_0+32]; +; CHECK: ld.param.v4.u32 {[[E3:%r[0-9]+]], [[E4:%r[0-9]+]], [[E5:%r[0-9]+]], [[E6:%r[0-9]+]]}, [test_s_crossfield_param_0+16]; +; CHECK: ld.param.u32 [[E2:%r[0-9]+]], [test_s_crossfield_param_0+8]; +; CHECK: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_crossfield_param_0]; +; CHECK: .param .align 16 .b8 param0[80]; +; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]}; +; CHECK: st.param.b32 [param0+8], [[E2]]; +; CHECK: st.param.v4.b32 [param0+16], {[[E3]], [[E4]], [[E5]], [[E6]]}; +; CHECK: st.param.v4.b32 [param0+32], {[[E7]], [[E8]], [[E9]], [[E10]]}; +; CHECK: st.param.v4.b32 [param0+48], {[[E11]], [[E12]], [[E13]], [[E14]]}; +; CHECK: st.param.b32 [param0+64], [[E15]]; +; CHECK: .param .align 16 .b8 retval0[80]; +; CHECK: call.uni (retval0), +; CHECK: test_s_crossfield, +; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0]; +; CHECK: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8]; +; CHECK: ld.param.v4.b32 {[[RE3:%r[0-9]+]], [[RE4:%r[0-9]+]], [[RE5:%r[0-9]+]], [[RE6:%r[0-9]+]]}, [retval0+16]; +; CHECK: ld.param.v4.b32 {[[RE7:%r[0-9]+]], [[RE8:%r[0-9]+]], [[RE9:%r[0-9]+]], [[RE10:%r[0-9]+]]}, [retval0+32]; +; CHECK: ld.param.v4.b32 {[[RE11:%r[0-9]+]], [[RE12:%r[0-9]+]], [[RE13:%r[0-9]+]], [[RE14:%r[0-9]+]]}, [retval0+48]; +; CHECK: ld.param.b32 [[RE15:%r[0-9]+]], [retval0+64]; +; CHECK: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]}; +; CHECK: st.param.b32 [func_retval0+8], [[RE2]]; +; CHECK: st.param.v4.b32 [func_retval0+16], {[[RE3]], [[RE4]], [[RE5]], [[RE6]]}; +; CHECK: st.param.v4.b32 [func_retval0+32], {[[RE7]], [[RE8]], [[RE9]], [[RE10]]}; +; CHECK: st.param.v4.b32 [func_retval0+48], {[[RE11]], [[RE12]], [[RE13]], [[RE14]]}; +; CHECK: st.param.b32 [func_retval0+64], [[RE15]]; +; CHECK: ret; + +define %s_crossfield @test_s_crossfield(%s_crossfield %a) { + %r = tail call %s_crossfield @test_s_crossfield(%s_crossfield %a); + 
ret %s_crossfield %r; +} diff --git a/test/CodeGen/NVPTX/vec-param-load.ll b/test/CodeGen/NVPTX/vec-param-load.ll index 4193ac4085c..bf26e5ff1bd 100644 --- a/test/CodeGen/NVPTX/vec-param-load.ll +++ b/test/CodeGen/NVPTX/vec-param-load.ll @@ -2,12 +2,81 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" - -define <16 x float> @foo(<16 x float> %a) { -; Make sure we index into vectors properly -; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [foo_param_0+48]; -; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [foo_param_0+32]; -; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [foo_param_0+16]; -; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [foo_param_0]; +define <16 x float> @test_v16f32(<16 x float> %a) { +; CHECK-LABEL: test_v16f32( +; CHECK-DAG: ld.param.v4.f32 {[[V_12_15:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+48]; +; CHECK-DAG: ld.param.v4.f32 {[[V_8_11:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+32]; +; CHECK-DAG: ld.param.v4.f32 {[[V_4_7:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+16]; +; CHECK-DAG: ld.param.v4.f32 {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0]; +; CHECK-DAG: st.param.v4.f32 [func_retval0+0], {[[V_0_3]]} +; CHECK-DAG: st.param.v4.f32 [func_retval0+16], {[[V_4_7]]} +; CHECK-DAG: st.param.v4.f32 [func_retval0+32], {[[V_8_11]]} +; CHECK-DAG: st.param.v4.f32 [func_retval0+48], {[[V_12_15]]} +; CHECK: ret; ret <16 x float> %a } + +define <8 x float> @test_v8f32(<8 x float> %a) { +; CHECK-LABEL: test_v8f32( +; CHECK-DAG: ld.param.v4.f32 {[[V_4_7:(%f[0-9]+[, ]*){4}]]}, [test_v8f32_param_0+16]; +; CHECK-DAG: ld.param.v4.f32 {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v8f32_param_0]; +; CHECK-DAG: st.param.v4.f32 [func_retval0+0], {[[V_0_3]]} +; CHECK-DAG: st.param.v4.f32 [func_retval0+16], {[[V_4_7]]} +; CHECK: ret; + ret <8 x float> %a +} + +define <4 x float> @test_v4f32(<4 x float> %a) { +; CHECK-LABEL: test_v4f32( +; CHECK-DAG: ld.param.v4.f32 {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v4f32_param_0]; +; CHECK-DAG: st.param.v4.f32 [func_retval0+0], {[[V_0_3]]} +; CHECK: ret; + ret <4 x float> %a +} + +define <2 x float> @test_v2f32(<2 x float> %a) { +; CHECK-LABEL: test_v2f32( +; CHECK-DAG: ld.param.v2.f32 {[[V_0_3:(%f[0-9]+[, ]*){2}]]}, [test_v2f32_param_0]; +; CHECK-DAG: st.param.v2.f32 [func_retval0+0], {[[V_0_3]]} +; CHECK: ret; + ret <2 x float> %a +} + +; Oddly shaped vectors should not load any extra elements. 
+define <3 x float> @test_v3f32(<3 x float> %a) {
+; CHECK-LABEL: test_v3f32(
+; CHECK-DAG: ld.param.f32 [[V_2:%f[0-9]+]], [test_v3f32_param_0+8];
+; CHECK-DAG: ld.param.v2.f32 {[[V_0_1:(%f[0-9]+[, ]*){2}]]}, [test_v3f32_param_0];
+; CHECK-DAG: st.param.v2.f32 [func_retval0+0], {[[V_0_1]]}
+; CHECK-DAG: st.param.f32 [func_retval0+8], [[V_2]]
+; CHECK: ret;
+  ret <3 x float> %a
+}
+
+define <8 x i64> @test_v8i64(<8 x i64> %a) {
+; CHECK-LABEL: test_v8i64(
+; CHECK-DAG: ld.param.v2.u64 {[[V_6_7:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+48];
+; CHECK-DAG: ld.param.v2.u64 {[[V_4_5:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+32];
+; CHECK-DAG: ld.param.v2.u64 {[[V_2_3:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+16];
+; CHECK-DAG: ld.param.v2.u64 {[[V_0_1:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0];
+; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[V_0_1]]}
+; CHECK-DAG: st.param.v2.b64 [func_retval0+16], {[[V_2_3]]}
+; CHECK-DAG: st.param.v2.b64 [func_retval0+32], {[[V_4_5]]}
+; CHECK-DAG: st.param.v2.b64 [func_retval0+48], {[[V_6_7]]}
+; CHECK: ret;
+  ret <8 x i64> %a
+}
+
+define <16 x i16> @test_v16i16(<16 x i16> %a) {
+; CHECK-LABEL: test_v16i16(
+; CHECK-DAG: ld.param.v4.u16 {[[V_12_15:(%rs[0-9]+[, ]*){4}]]}, [test_v16i16_param_0+24];
+; CHECK-DAG: ld.param.v4.u16 {[[V_8_11:(%rs[0-9]+[, ]*){4}]]}, [test_v16i16_param_0+16];
+; CHECK-DAG: ld.param.v4.u16 {[[V_4_7:(%rs[0-9]+[, ]*){4}]]}, [test_v16i16_param_0+8];
+; CHECK-DAG: ld.param.v4.u16 {[[V_0_3:(%rs[0-9]+[, ]*){4}]]}, [test_v16i16_param_0];
+; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[V_0_3]]}
+; CHECK-DAG: st.param.v4.b16 [func_retval0+8], {[[V_4_7]]}
+; CHECK-DAG: st.param.v4.b16 [func_retval0+16], {[[V_8_11]]}
+; CHECK-DAG: st.param.v4.b16 [func_retval0+24], {[[V_12_15]]}
+; CHECK: ret;
+  ret <16 x i16> %a
+}
diff --git a/test/CodeGen/NVPTX/vec8.ll b/test/CodeGen/NVPTX/vec8.ll
index 03f5cfc6cb0..a86ba1e29d5 100644
--- a/test/CodeGen/NVPTX/vec8.ll
+++ b/test/CodeGen/NVPTX/vec8.ll
@@ -4,10 +4,15 @@ target triple = "nvptx-unknown-cuda"
 ; CHECK: .visible .func foo
 define void @foo(<8 x i8> %a, i8* %b) {
-  %t0 = extractelement <8 x i8> %a, i32 0
-; CHECK-DAG: ld.param.v4.u8
-; CHECK-DAG: ld.param.u32
-  store i8 %t0, i8* %b
+; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [foo_param_0]
+; CHECK-DAG: ld.param.v4.u8 {[[E4:%rs[0-9]+]], [[E5:%rs[0-9]+]], [[E6:%rs[0-9]+]], [[E7:%rs[0-9]+]]}, [foo_param_0+4]
+; CHECK-DAG: ld.param.u32 %[[B:r[0-9]+]], [foo_param_1]
+; CHECK: add.s16 [[T:%rs[0-9]+]], [[E1]], [[E6]];
+; CHECK: st.u8 [%[[B]]], [[T]];
+  %t0 = extractelement <8 x i8> %a, i32 1
+  %t1 = extractelement <8 x i8> %a, i32 6
+  %t = add i8 %t0, %t1
+  store i8 %t, i8* %b
   ret void
 }
diff --git a/test/CodeGen/NVPTX/vector-call.ll b/test/CodeGen/NVPTX/vector-call.ll
index 968d1d4a5f5..bf7b931a575 100644
--- a/test/CodeGen/NVPTX/vector-call.ll
+++ b/test/CodeGen/NVPTX/vector-call.ll
@@ -4,9 +4,27 @@ target triple = "nvptx-unknown-cuda"
 declare void @bar(<4 x i32>)
-; CHECK-LABEL: @foo
+; CHECK-LABEL: .func foo(
+; CHECK-DAG: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [foo_param_0];
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK-DAG: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK: call.uni
+; CHECK: ret;
 define void @foo(<4 x i32> %a) {
-; CHECK: st.param.v4.b32
   tail call void @bar(<4 x i32> %a)
   ret void
 }
+
+; CHECK-LABEL: .func foo3(
+; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, 
[foo3_param_0]; +; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [foo3_param_0+8]; +; CHECK: .param .align 16 .b8 param0[16]; +; CHECK-DAG: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]}; +; CHECK-DAG: st.param.b32 [param0+8], [[E2]]; +; CHECK: call.uni +; CHECK: ret; +declare void @bar3(<3 x i32>) +define void @foo3(<3 x i32> %a) { + tail call void @bar3(<3 x i32> %a) + ret void +}