From 9f7818d9bdfce2e9c7a2cbe31490a135aa6d1211 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Wed, 23 Jan 2013 02:09:06 +0000 Subject: [PATCH] R600: rework handling of the constants Remove Cxxx registers, add new special register - "ALU_CONST" and new operand for each alu src - "sel". ALU_CONST is used to designate that the new operand contains the value to override src.sel, src.kc_bank, src.chan for constants in the driver. Patch by: Vadim Girlin Vincent Lejeune: - Use pointers for constants - Fold CONST_ADDRESS when possible Tom Stellard: - Give CONSTANT_BUFFER_0 its own address space - Use integer types for constant loads Reviewed-by: Tom Stellard git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@173222 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/R600/AMDGPU.h | 1 + lib/Target/R600/AMDGPUTargetMachine.cpp | 1 + lib/Target/R600/AMDIL.h | 20 ++- lib/Target/R600/AMDILISelDAGToDAG.cpp | 84 +++++++++++- lib/Target/R600/CMakeLists.txt | 1 + .../R600/InstPrinter/AMDGPUInstPrinter.cpp | 24 ++++ .../R600/InstPrinter/AMDGPUInstPrinter.h | 1 + .../R600/MCTargetDesc/R600MCCodeEmitter.cpp | 81 ++++++------ lib/Target/R600/R600Defines.h | 15 +++ lib/Target/R600/R600ISelLowering.cpp | 122 ++++++++++++++++-- lib/Target/R600/R600ISelLowering.h | 1 + lib/Target/R600/R600InstrInfo.cpp | 18 +-- lib/Target/R600/R600Instructions.td | 112 +++++++++++++--- lib/Target/R600/R600LowerConstCopy.cpp | 74 +++++++++++ lib/Target/R600/R600RegisterInfo.cpp | 6 +- lib/Target/R600/R600RegisterInfo.td | 26 ++-- 16 files changed, 483 insertions(+), 104 deletions(-) create mode 100644 lib/Target/R600/R600LowerConstCopy.cpp diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h index c75ec245e0c..1aa607f57ea 100644 --- a/lib/Target/R600/AMDGPU.h +++ b/lib/Target/R600/AMDGPU.h @@ -23,6 +23,7 @@ class AMDGPUTargetMachine; // R600 Passes FunctionPass* createR600KernelParametersPass(const DataLayout *TD); FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm); 
+FunctionPass *createR600LowerConstCopy(TargetMachine &tm); // SI Passes FunctionPass *createSIAnnotateControlFlowPass(); diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp index 26ac928347d..7b069e77604 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.cpp +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -136,6 +136,7 @@ bool AMDGPUPassConfig::addPreEmitPass() { addPass(createAMDGPUCFGPreparationPass(*TM)); addPass(createAMDGPUCFGStructurizerPass(*TM)); addPass(createR600ExpandSpecialInstrsPass(*TM)); + addPass(createR600LowerConstCopy(*TM)); addPass(&FinalizeMachineBundlesID); } else { addPass(createSILowerLiteralConstantsPass(*TM)); diff --git a/lib/Target/R600/AMDIL.h b/lib/Target/R600/AMDIL.h index 4e577dc2340..b39fbdbeed9 100644 --- a/lib/Target/R600/AMDIL.h +++ b/lib/Target/R600/AMDIL.h @@ -90,14 +90,30 @@ namespace AMDGPUAS { enum AddressSpaces { PRIVATE_ADDRESS = 0, ///< Address space for private memory. GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0). - CONSTANT_ADDRESS = 2, ///< Address space for constant memory. + CONSTANT_ADDRESS = 2, ///< Address space for constant memory LOCAL_ADDRESS = 3, ///< Address space for local memory. REGION_ADDRESS = 4, ///< Address space for region memory. ADDRESS_NONE = 5, ///< Address space for unknown memory. 
PARAM_D_ADDRESS = 6, ///< Address space for direct addressible parameter memory (CONST0) PARAM_I_ADDRESS = 7, ///< Address space for indirect addressible parameter memory (VTX1) USER_SGPR_ADDRESS = 8, ///< Address space for USER_SGPRS on SI - LAST_ADDRESS = 9 + CONSTANT_BUFFER_0 = 9, + CONSTANT_BUFFER_1 = 10, + CONSTANT_BUFFER_2 = 11, + CONSTANT_BUFFER_3 = 12, + CONSTANT_BUFFER_4 = 13, + CONSTANT_BUFFER_5 = 14, + CONSTANT_BUFFER_6 = 15, + CONSTANT_BUFFER_7 = 16, + CONSTANT_BUFFER_8 = 17, + CONSTANT_BUFFER_9 = 18, + CONSTANT_BUFFER_10 = 19, + CONSTANT_BUFFER_11 = 20, + CONSTANT_BUFFER_12 = 21, + CONSTANT_BUFFER_13 = 22, + CONSTANT_BUFFER_14 = 23, + CONSTANT_BUFFER_15 = 24, + LAST_ADDRESS = 25 }; } // namespace AMDGPUAS diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp b/lib/Target/R600/AMDILISelDAGToDAG.cpp index d15ed393c13..567b3e26cf1 100644 --- a/lib/Target/R600/AMDILISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp @@ -20,6 +20,7 @@ #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/Support/Compiler.h" +#include "llvm/CodeGen/SelectionDAG.h" #include #include @@ -45,6 +46,7 @@ public: private: inline SDValue getSmallIPtrImm(unsigned Imm); + bool FoldOperands(unsigned, const R600InstrInfo *, std::vector &); // Complex pattern selectors bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2); @@ -67,6 +69,9 @@ private: static bool isLocalLoad(const LoadSDNode *N); static bool isRegionLoad(const LoadSDNode *N); + bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr); + bool SelectGlobalValueVariableOffset(SDValue Addr, + SDValue &BaseReg, SDValue& Offset); bool SelectADDR8BitOffset(SDValue Addr, SDValue& Base, SDValue& Offset); bool SelectADDRReg(SDValue Addr, SDValue& Base, SDValue& Offset); bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); @@ -259,7 +264,65 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { break; } } - return SelectCode(N); + SDNode *Result = 
SelectCode(N); + + // Fold operands of selected node + + const AMDGPUSubtarget &ST = TM.getSubtarget(); + if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) { + const R600InstrInfo *TII = + static_cast(TM.getInstrInfo()); + if (Result && TII->isALUInstr(Result->getMachineOpcode())) { + bool IsModified = false; + do { + std::vector Ops; + for(SDNode::op_iterator I = Result->op_begin(), E = Result->op_end(); + I != E; ++I) + Ops.push_back(*I); + IsModified = FoldOperands(Result->getMachineOpcode(), TII, Ops); + if (IsModified) { + Result = CurDAG->MorphNodeTo(Result, Result->getOpcode(), + Result->getVTList(), Ops.data(), Ops.size()); + } + } while (IsModified); + } + } + + return Result; +} + +bool AMDGPUDAGToDAGISel::FoldOperands(unsigned Opcode, + const R600InstrInfo *TII, std::vector &Ops) { + int OperandIdx[] = { + TII->getOperandIdx(Opcode, R600Operands::SRC0), + TII->getOperandIdx(Opcode, R600Operands::SRC1), + TII->getOperandIdx(Opcode, R600Operands::SRC2) + }; + int SelIdx[] = { + TII->getOperandIdx(Opcode, R600Operands::SRC0_SEL), + TII->getOperandIdx(Opcode, R600Operands::SRC1_SEL), + TII->getOperandIdx(Opcode, R600Operands::SRC2_SEL) + }; + for (unsigned i = 0; i < 3; i++) { + if (OperandIdx[i] < 0) + return false; + SDValue Operand = Ops[OperandIdx[i] - 1]; + switch (Operand.getOpcode()) { + case AMDGPUISD::CONST_ADDRESS: { + SDValue CstOffset; + if (!Operand.getValueType().isVector() && + SelectGlobalValueConstantOffset(Operand.getOperand(0), CstOffset)) { + Ops[OperandIdx[i] - 1] = CurDAG->getRegister(AMDGPU::ALU_CONST, MVT::f32); + Ops[SelIdx[i] - 1] = CstOffset; + return true; + } + } + break; + default: + break; + } + } + return false; } bool AMDGPUDAGToDAGISel::checkType(const Value *ptr, unsigned int addrspace) { @@ -406,6 +469,25 @@ const char *AMDGPUDAGToDAGISel::getPassName() const { ///==== AMDGPU Functions ====/// +bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr, + SDValue& IntPtr) { + if (ConstantSDNode *Cst 
= dyn_cast(Addr)) { + IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, true); + return true; + } + return false; +} + +bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr, + SDValue& BaseReg, SDValue &Offset) { + if (!dyn_cast(Addr)) { + BaseReg = Addr; + Offset = CurDAG->getIntPtrConstant(0, true); + return true; + } + return false; +} + bool AMDGPUDAGToDAGISel::SelectADDR8BitOffset(SDValue Addr, SDValue& Base, SDValue& Offset) { if (Addr.getOpcode() == ISD::TargetExternalSymbol || diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt index 790a4aa4db6..a8be7ed975c 100644 --- a/lib/Target/R600/CMakeLists.txt +++ b/lib/Target/R600/CMakeLists.txt @@ -36,6 +36,7 @@ add_llvm_target(R600CodeGen R600ExpandSpecialInstrs.cpp R600InstrInfo.cpp R600ISelLowering.cpp + R600LowerConstCopy.cpp R600MachineFunctionInfo.cpp R600RegisterInfo.cpp SIAnnotateControlFlow.cpp diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp index e6c550b5ac4..e76c6c86757 100644 --- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp @@ -129,4 +129,28 @@ void AMDGPUInstPrinter::printWrite(const MCInst *MI, unsigned OpNo, } } +void AMDGPUInstPrinter::printSel(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const char * chans = "XYZW"; + int sel = MI->getOperand(OpNo).getImm(); + + int chan = sel & 3; + sel >>= 2; + + if (sel >= 512) { + sel -= 512; + int cb = sel >> 12; + sel &= 4095; + O << cb << "[" << sel << "]"; + } else if (sel >= 448) { + sel -= 448; + O << sel; + } else if (sel >= 0){ + O << sel; + } + + if (sel >= 0) + O << "." 
<< chans[chan]; +} + #include "AMDGPUGenAsmWriter.inc" diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h index 96e0e46f8a6..e775c4c82e7 100644 --- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h @@ -45,6 +45,7 @@ private: void printUpdateExecMask(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printUpdatePred(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printWrite(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printSel(const MCInst *MI, unsigned OpNo, raw_ostream &O); }; } // End namespace llvm diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp index 36deae9c0ab..01df8087d5f 100644 --- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp @@ -63,8 +63,8 @@ private: void EmitALUInstr(const MCInst &MI, SmallVectorImpl &Fixups, raw_ostream &OS) const; void EmitSrc(const MCInst &MI, unsigned OpIdx, raw_ostream &OS) const; - void EmitSrcISA(const MCInst &MI, unsigned OpIdx, uint64_t &Value, - raw_ostream &OS) const; + void EmitSrcISA(const MCInst &MI, unsigned RegOpIdx, unsigned SelOpIdx, + raw_ostream &OS) const; void EmitDst(const MCInst &MI, raw_ostream &OS) const; void EmitTexInstr(const MCInst &MI, SmallVectorImpl &Fixups, raw_ostream &OS) const; @@ -163,7 +163,8 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, case AMDGPU::VTX_READ_PARAM_32_eg: case AMDGPU::VTX_READ_GLOBAL_8_eg: case AMDGPU::VTX_READ_GLOBAL_32_eg: - case AMDGPU::VTX_READ_GLOBAL_128_eg: { + case AMDGPU::VTX_READ_GLOBAL_128_eg: + case AMDGPU::TEX_VTX_CONSTBUF: { uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups); uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset @@ -193,7 +194,6 @@ void R600MCCodeEmitter::EmitALUInstr(const MCInst &MI, SmallVectorImpl &Fixups, raw_ostream &OS) const { const 
MCInstrDesc &MCDesc = MCII.get(MI.getOpcode()); - unsigned NumOperands = MI.getNumOperands(); // Emit instruction type EmitByte(INSTR_ALU, OS); @@ -209,19 +209,21 @@ void R600MCCodeEmitter::EmitALUInstr(const MCInst &MI, InstWord01 |= ISAOpCode << 1; } - unsigned SrcIdx = 0; - for (unsigned int OpIdx = 1; OpIdx < NumOperands; ++OpIdx) { - if (MI.getOperand(OpIdx).isImm() || MI.getOperand(OpIdx).isFPImm() || - OpIdx == (unsigned)MCDesc.findFirstPredOperandIdx()) { - continue; - } - EmitSrcISA(MI, OpIdx, InstWord01, OS); - SrcIdx++; - } + unsigned SrcNum = MCDesc.TSFlags & R600_InstFlag::OP3 ? 3 : + MCDesc.TSFlags & R600_InstFlag::OP2 ? 2 : 1; - // Emit zeros for unused sources - for ( ; SrcIdx < 3; SrcIdx++) { - EmitNullBytes(SRC_BYTE_COUNT - 6, OS); + EmitByte(SrcNum, OS); + + const unsigned SrcOps[3][2] = { + {R600Operands::SRC0, R600Operands::SRC0_SEL}, + {R600Operands::SRC1, R600Operands::SRC1_SEL}, + {R600Operands::SRC2, R600Operands::SRC2_SEL} + }; + + for (unsigned SrcIdx = 0; SrcIdx < SrcNum; ++SrcIdx) { + unsigned RegOpIdx = R600Operands::ALUOpTable[SrcNum-1][SrcOps[SrcIdx][0]]; + unsigned SelOpIdx = R600Operands::ALUOpTable[SrcNum-1][SrcOps[SrcIdx][1]]; + EmitSrcISA(MI, RegOpIdx, SelOpIdx, OS); } Emit(InstWord01, OS); @@ -292,34 +294,37 @@ void R600MCCodeEmitter::EmitSrc(const MCInst &MI, unsigned OpIdx, } -void R600MCCodeEmitter::EmitSrcISA(const MCInst &MI, unsigned OpIdx, - uint64_t &Value, raw_ostream &OS) const { - const MCOperand &MO = MI.getOperand(OpIdx); +void R600MCCodeEmitter::EmitSrcISA(const MCInst &MI, unsigned RegOpIdx, + unsigned SelOpIdx, raw_ostream &OS) const { + const MCOperand &RegMO = MI.getOperand(RegOpIdx); + const MCOperand &SelMO = MI.getOperand(SelOpIdx); + union { float f; uint32_t i; } InlineConstant; InlineConstant.i = 0; - // Emit the source select (2 bytes). For GPRs, this is the register index. - // For other potential instruction operands, (e.g. 
constant registers) the - // value of the source select is defined in the r600isa docs. - if (MO.isReg()) { - unsigned Reg = MO.getReg(); - if (AMDGPUMCRegisterClasses[AMDGPU::R600_CReg32RegClassID].contains(Reg)) { - EmitByte(1, OS); - } else { - EmitByte(0, OS); - } + // Emit source type (1 byte) and source select (4 bytes). For GPRs type is 0 + // and select is 0 (GPR index is encoded in the instr encoding. For constants + // type is 1 and select is the original const select passed from the driver. + unsigned Reg = RegMO.getReg(); + if (Reg == AMDGPU::ALU_CONST) { + EmitByte(1, OS); + uint32_t Sel = SelMO.getImm(); + Emit(Sel, OS); + } else { + EmitByte(0, OS); + Emit((uint32_t)0, OS); + } - if (Reg == AMDGPU::ALU_LITERAL_X) { - unsigned ImmOpIndex = MI.getNumOperands() - 1; - MCOperand ImmOp = MI.getOperand(ImmOpIndex); - if (ImmOp.isFPImm()) { - InlineConstant.f = ImmOp.getFPImm(); - } else { - assert(ImmOp.isImm()); - InlineConstant.i = ImmOp.getImm(); - } + if (Reg == AMDGPU::ALU_LITERAL_X) { + unsigned ImmOpIndex = MI.getNumOperands() - 1; + MCOperand ImmOp = MI.getOperand(ImmOpIndex); + if (ImmOp.isFPImm()) { + InlineConstant.f = ImmOp.getFPImm(); + } else { + assert(ImmOp.isImm()); + InlineConstant.i = ImmOp.getImm(); } } diff --git a/lib/Target/R600/R600Defines.h b/lib/Target/R600/R600Defines.h index 7dea8e44ea2..e19eea38e49 100644 --- a/lib/Target/R600/R600Defines.h +++ b/lib/Target/R600/R600Defines.h @@ -62,18 +62,33 @@ namespace R600Operands { SRC0_NEG, SRC0_REL, SRC0_ABS, + SRC0_SEL, SRC1, SRC1_NEG, SRC1_REL, SRC1_ABS, + SRC1_SEL, SRC2, SRC2_NEG, SRC2_REL, + SRC2_SEL, LAST, PRED_SEL, IMM, COUNT }; + + const static int ALUOpTable[3][R600Operands::COUNT] = { +// W C S S S S S S S S S S S +// R O D L S R R R R S R R R R S R R R L P +// D U I M R A R C C C C R C C C C R C C C A R I +// S E U T O E M C 0 0 0 0 C 1 1 1 1 C 2 2 2 S E M +// T M P E D L P 0 N R A S 1 N R A S 2 N R S T D M + {0,-1,-1, 1, 2, 3, 4, 5, 6, 7, 8, 
9,-1,-1,-1,-1,-1,-1,-1,-1,-1,10,11,12}, + {0, 1, 2, 3, 4 ,5 ,6 ,7, 8, 9,10,11,12,13,14,15,16,-1,-1,-1,-1,17,18,19}, + {0,-1,-1,-1,-1, 1, 2, 3, 4, 5,-1, 6, 7, 8, 9,-1,10,11,12,13,14,15,16,17} + }; + } #endif // R600DEFINES_H_ diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index f0eece39ea5..69ca3f58304 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -74,7 +74,10 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setOperationAction(ISD::STORE, MVT::i32, Custom); setOperationAction(ISD::STORE, MVT::v4i32, Custom); + setOperationAction(ISD::LOAD, MVT::i32, Custom); + setOperationAction(ISD::LOAD, MVT::v4i32, Custom); setTargetDAGCombine(ISD::FP_ROUND); + setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); setSchedulingPreference(Sched::VLIW); } @@ -115,15 +118,6 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( break; } - case AMDGPU::R600_LOAD_CONST: { - int64_t RegIndex = MI->getOperand(1).getImm(); - unsigned ConstantReg = AMDGPU::R600_CReg32RegClass.getRegister(RegIndex); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::COPY)) - .addOperand(MI->getOperand(0)) - .addReg(ConstantReg); - break; - } - case AMDGPU::MASK_WRITE: { unsigned maskedRegister = MI->getOperand(0).getReg(); assert(TargetRegisterInfo::isVirtualRegister(maskedRegister)); @@ -364,6 +358,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); + case ISD::LOAD: return LowerLOAD(Op, DAG); case ISD::FPOW: return LowerFPOW(Op, DAG); case ISD::INTRINSIC_VOID: { SDValue Chain = Op.getOperand(0); @@ -527,6 +522,16 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N, switch (N->getOpcode()) { default: return; case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG)); + return; + case ISD::LOAD: { 
+ SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode(); + Results.push_back(SDValue(Node, 0)); + Results.push_back(SDValue(Node, 1)); + // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode + // function + DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1)); + return; + } } } @@ -832,6 +837,94 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } +// return (512 + (kc_bank << 12) +static int +ConstantAddressBlock(unsigned AddressSpace) { + switch (AddressSpace) { + case AMDGPUAS::CONSTANT_BUFFER_0: + return 512; + case AMDGPUAS::CONSTANT_BUFFER_1: + return 512 + 4096; + case AMDGPUAS::CONSTANT_BUFFER_2: + return 512 + 4096 * 2; + case AMDGPUAS::CONSTANT_BUFFER_3: + return 512 + 4096 * 3; + case AMDGPUAS::CONSTANT_BUFFER_4: + return 512 + 4096 * 4; + case AMDGPUAS::CONSTANT_BUFFER_5: + return 512 + 4096 * 5; + case AMDGPUAS::CONSTANT_BUFFER_6: + return 512 + 4096 * 6; + case AMDGPUAS::CONSTANT_BUFFER_7: + return 512 + 4096 * 7; + case AMDGPUAS::CONSTANT_BUFFER_8: + return 512 + 4096 * 8; + case AMDGPUAS::CONSTANT_BUFFER_9: + return 512 + 4096 * 9; + case AMDGPUAS::CONSTANT_BUFFER_10: + return 512 + 4096 * 10; + case AMDGPUAS::CONSTANT_BUFFER_11: + return 512 + 4096 * 11; + case AMDGPUAS::CONSTANT_BUFFER_12: + return 512 + 4096 * 12; + case AMDGPUAS::CONSTANT_BUFFER_13: + return 512 + 4096 * 13; + case AMDGPUAS::CONSTANT_BUFFER_14: + return 512 + 4096 * 14; + case AMDGPUAS::CONSTANT_BUFFER_15: + return 512 + 4096 * 15; + default: + return -1; + } +} + +SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const +{ + EVT VT = Op.getValueType(); + DebugLoc DL = Op.getDebugLoc(); + LoadSDNode *LoadNode = cast(Op); + SDValue Chain = Op.getOperand(0); + SDValue Ptr = Op.getOperand(1); + SDValue LoweredLoad; + + int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace()); + if (ConstantBlock > -1) { + SDValue Result; + if (dyn_cast(LoadNode->getSrcValue()) || + 
dyn_cast(LoadNode->getSrcValue())) { + SDValue Slots[4]; + for (unsigned i = 0; i < 4; i++) { + // We want Const position encoded with the following formula : + // (((512 + (kc_bank << 12) + const_index) << 2) + chan) + // const_index is Ptr computed by llvm using an alignment of 16. + // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and + // then div by 4 at the ISel step + SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, + DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32)); + Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr); + } + Result = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Slots, 4); + } else { + // non constant ptr cant be folded, keeps it as a v4f32 load + Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32, + DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)) + ); + } + + if (!VT.isVector()) { + Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result, + DAG.getConstant(0, MVT::i32)); + } + + SDValue MergedValues[2] = { + Result, + Chain + }; + return DAG.getMergeValues(MergedValues, 2, DL); + } + + return SDValue(); +} SDValue R600TargetLowering::LowerFPOW(SDValue Op, SelectionDAG &DAG) const { @@ -904,6 +997,17 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, } break; } + // Extract_vec (Build_vector) generated by custom lowering + // also needs to be customly combined + case ISD::EXTRACT_VECTOR_ELT: { + SDValue Arg = N->getOperand(0); + if (Arg.getOpcode() == ISD::BUILD_VECTOR) { + if (ConstantSDNode *Const = dyn_cast(N->getOperand(1))) { + unsigned Element = Const->getZExtValue(); + return Arg->getOperand(Element); + } + } + } } return SDValue(); } diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h index 2b954dab558..c141d50210e 100644 --- a/lib/Target/R600/R600ISelLowering.h +++ b/lib/Target/R600/R600ISelLowering.h @@ -63,6 +63,7 @@ private: SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue 
LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFPOW(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; bool isZero(SDValue Op) const; }; diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp index 06b78d09cc7..1adb1422dcb 100644 --- a/lib/Target/R600/R600InstrInfo.cpp +++ b/lib/Target/R600/R600InstrInfo.cpp @@ -486,13 +486,15 @@ MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MB .addReg(Src0Reg) // $src0 .addImm(0) // $src0_neg .addImm(0) // $src0_rel - .addImm(0); // $src0_abs + .addImm(0) // $src0_abs + .addImm(-1); // $src0_sel if (Src1Reg) { MIB.addReg(Src1Reg) // $src1 .addImm(0) // $src1_neg .addImm(0) // $src1_rel - .addImm(0); // $src1_abs + .addImm(0) // $src1_abs + .addImm(-1); // $src1_sel } //XXX: The r600g finalizer expects this to be 1, once we've moved the @@ -521,16 +523,6 @@ int R600InstrInfo::getOperandIdx(const MachineInstr &MI, int R600InstrInfo::getOperandIdx(unsigned Opcode, R600Operands::Ops Op) const { - const static int OpTable[3][R600Operands::COUNT] = { -// W C S S S S S S S S -// R O D L S R R R S R R R S R R L P -// D U I M R A R C C C C C C C R C C A R I -// S E U T O E M C 0 0 0 C 1 1 1 C 2 2 S E M -// T M P E D L P 0 N R A 1 N R A 2 N R T D M - {0,-1,-1, 1, 2, 3, 4, 5, 6, 7, 8,-1,-1,-1,-1,-1,-1,-1, 9,10,11}, - {0, 1, 2, 3, 4 ,5 ,6 ,7, 8, 9,10,11,12,-1,-1,-1,13,14,15,16,17}, - {0,-1,-1,-1,-1, 1, 2, 3, 4, 5,-1, 6, 7, 8,-1, 9,10,11,12,13,14} - }; unsigned TargetFlags = get(Opcode).TSFlags; unsigned OpTableIdx; @@ -556,7 +548,7 @@ int R600InstrInfo::getOperandIdx(unsigned Opcode, OpTableIdx = 2; } - return OpTable[OpTableIdx][Op]; + return R600Operands::ALUOpTable[OpTableIdx][Op]; } void R600InstrInfo::setImmOperand(MachineInstr *MI, R600Operands::Ops Op, diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index d4fa3d68262..a6c39108c72 100644 --- 
a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -70,6 +70,11 @@ class InstFlag let PrintMethod = PM; } +// src_sel for ALU src operands, see also ALU_CONST, ALU_PARAM registers +def SEL : OperandWithDefaultOps { + let PrintMethod = "printSel"; +} + def LITERAL : InstFlag<"printLiteral">; def WRITE : InstFlag <"printWrite", 1>; @@ -89,6 +94,8 @@ def LAST : InstFlag<"printLast", 1>; def ADDRParam : ComplexPattern; def ADDRDWord : ComplexPattern; def ADDRVTX_READ : ComplexPattern; +def ADDRGA_CONST_OFFSET : ComplexPattern; +def ADDRGA_VAR_OFFSET : ComplexPattern; class R600ALU_Word0 { field bits<32> Word0; @@ -263,11 +270,11 @@ class R600_1OP inst, string opName, list pattern, InstR600 <0, (outs R600_Reg32:$dst), (ins WRITE:$write, OMOD:$omod, REL:$dst_rel, CLAMP:$clamp, - R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, + R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel, LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal), !strconcat(opName, "$clamp $dst$write$dst_rel$omod, " - "$src0_neg$src0_abs$src0$src0_abs$src0_rel, " + "$src0_neg$src0_abs$src0$src0_sel$src0_abs$src0_rel, " "$literal $pred_sel$last"), pattern, itin>, @@ -303,13 +310,13 @@ class R600_2OP inst, string opName, list pattern, (outs R600_Reg32:$dst), (ins UEM:$update_exec_mask, UP:$update_pred, WRITE:$write, OMOD:$omod, REL:$dst_rel, CLAMP:$clamp, - R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, - R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, ABS:$src1_abs, + R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel, + R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, ABS:$src1_abs, SEL:$src1_sel, LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal), !strconcat(opName, "$clamp $update_exec_mask$update_pred$dst$write$dst_rel$omod, " - "$src0_neg$src0_abs$src0$src0_abs$src0_rel, " - "$src1_neg$src1_abs$src1$src1_abs$src1_rel, " + "$src0_neg$src0_abs$src0$src0_sel$src0_abs$src0_rel, " + 
"$src1_neg$src1_abs$src1$src1_sel$src1_abs$src1_rel, " "$literal $pred_sel$last"), pattern, itin>, @@ -340,14 +347,14 @@ class R600_3OP inst, string opName, list pattern, InstR600 <0, (outs R600_Reg32:$dst), (ins REL:$dst_rel, CLAMP:$clamp, - R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, - R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, - R600_Reg32:$src2, NEG:$src2_neg, REL:$src2_rel, + R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, SEL:$src0_sel, + R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, SEL:$src1_sel, + R600_Reg32:$src2, NEG:$src2_neg, REL:$src2_rel, SEL:$src2_sel, LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal), !strconcat(opName, "$clamp $dst$dst_rel, " - "$src0_neg$src0$src0_rel, " - "$src1_neg$src1$src1_rel, " - "$src2_neg$src2$src2_rel, " + "$src0_neg$src0$src0_sel$src0_rel, " + "$src1_neg$src1$src1_sel$src1_rel, " + "$src2_neg$src2$src2_sel$src2_rel, " "$literal $pred_sel$last"), pattern, itin>, @@ -482,7 +489,7 @@ def INTERP_P0: SDNode<"AMDGPUISD::INTERP_P0", >; def CONST_ADDRESS: SDNode<"AMDGPUISD::CONST_ADDRESS", - SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisPtrTy<1>]>, + SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisPtrTy<1>]>, [SDNPMayLoad] >; @@ -1538,12 +1545,6 @@ def MASK_WRITE : AMDGPUShaderInst < } // End mayLoad = 0, mayStore = 0, hasSideEffects = 1 -def R600_LOAD_CONST : AMDGPUShaderInst < - (outs R600_Reg32:$dst), - (ins i32imm:$src0), - "R600_LOAD_CONST $dst, $src0", - [(set R600_Reg32:$dst, (int_AMDGPU_load_const imm:$src0))] ->; def RESERVE_REG : AMDGPUShaderInst < (outs), @@ -1551,7 +1552,6 @@ def RESERVE_REG : AMDGPUShaderInst < "RESERVE_REG $src", [(int_AMDGPU_reserve_reg imm:$src)] >; - def TXD: AMDGPUShaderInst < (outs R600_Reg128:$dst), (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget), @@ -1581,6 +1581,78 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1 in { "RETURN", [(IL_retflag)]>; } + 
+//===----------------------------------------------------------------------===// +// Constant Buffer Addressing Support +//===----------------------------------------------------------------------===// + +let isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in { +def CONST_COPY : Instruction { + let OutOperandList = (outs R600_Reg32:$dst); + let InOperandList = (ins i32imm:$src); + let Pattern = [(set R600_Reg32:$dst, (CONST_ADDRESS ADDRGA_CONST_OFFSET:$src))]; + let AsmString = "CONST_COPY"; + let neverHasSideEffects = 1; + let isAsCheapAsAMove = 1; + let Itinerary = NullALU; +} +} // end isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" + +def TEX_VTX_CONSTBUF : + InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr), "VTX_READ_eg $dst, $ptr", + [(set R600_Reg128:$dst, (CONST_ADDRESS ADDRGA_VAR_OFFSET:$ptr))]>, + VTX_WORD1_GPR, VTX_WORD0 { + + let VC_INST = 0; + let FETCH_TYPE = 2; + let FETCH_WHOLE_QUAD = 0; + let BUFFER_ID = 0; + let SRC_REL = 0; + let SRC_SEL_X = 0; + let DST_REL = 0; + let USE_CONST_FIELDS = 0; + let NUM_FORMAT_ALL = 2; + let FORMAT_COMP_ALL = 1; + let SRF_MODE_ALL = 1; + let MEGA_FETCH_COUNT = 16; + let DST_SEL_X = 0; + let DST_SEL_Y = 1; + let DST_SEL_Z = 2; + let DST_SEL_W = 3; + let DATA_FORMAT = 35; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; + +// LLVM can only encode 64-bit instructions, so these fields are manually +// encoded in R600CodeEmitter +// +// bits<16> OFFSET; +// bits<2> ENDIAN_SWAP = 0; +// bits<1> CONST_BUF_NO_STRIDE = 0; +// bits<1> MEGA_FETCH = 0; +// bits<1> ALT_CONST = 0; +// bits<2> BUFFER_INDEX_MODE = 0; + + + +// VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding +// is done in R600CodeEmitter +// +// Inst{79-64} = OFFSET; +// Inst{81-80} = ENDIAN_SWAP; +// Inst{82} = CONST_BUF_NO_STRIDE; +// Inst{83} = MEGA_FETCH; +// Inst{84} = ALT_CONST; +// Inst{86-85} = BUFFER_INDEX_MODE; +// Inst{95-86} = 0; Reserved + +// VTX_WORD3 (Padding) +// +// Inst{127-96} = 0; +} + + 
//===--------------------------------------------------------------------===// // Instructions support //===--------------------------------------------------------------------===// diff --git a/lib/Target/R600/R600LowerConstCopy.cpp b/lib/Target/R600/R600LowerConstCopy.cpp new file mode 100644 index 00000000000..70a2b138f94 --- /dev/null +++ b/lib/Target/R600/R600LowerConstCopy.cpp @@ -0,0 +1,74 @@ +//===-- R600LowerConstCopy.cpp - Propagate ConstCopy / lower them to MOV---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass handles the CONST_COPY pseudo MachineInstrs that remain after +/// instruction selection. ISel folds each constant buffer read into scalar ALU +/// instructions, but it cannot fold them into vector instructions such as DOT4 +/// or CUBE; for those it emits CONST_COPY instead. This pass (run after +/// R600ExpandSpecialInstrs) is meant to fold them where possible and replace +/// them with MOV otherwise. +/// TODO: Implement the folding part, using a copy propagation algorithm.
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "R600InstrInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/IR/GlobalValue.h" + +namespace llvm { + +class R600LowerConstCopy : public MachineFunctionPass { +private: + static char ID; + const R600InstrInfo *TII; +public: + R600LowerConstCopy(TargetMachine &tm); + virtual bool runOnMachineFunction(MachineFunction &MF); + + const char *getPassName() const { return "R600 Eliminate Symbolic Operand"; } +}; + +char R600LowerConstCopy::ID = 0; + + +R600LowerConstCopy::R600LowerConstCopy(TargetMachine &tm) : + MachineFunctionPass(ID), + TII (static_cast(tm.getInstrInfo())) +{ +} + +bool R600LowerConstCopy::runOnMachineFunction(MachineFunction &MF) { + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); + BB != BB_E; ++BB) { + MachineBasicBlock &MBB = *BB; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E;) { + MachineInstr &MI = *I; + I = llvm::next(I); + if (MI.getOpcode() != AMDGPU::CONST_COPY) + continue; + MachineInstr *NewMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::MOV, + MI.getOperand(0).getReg(), AMDGPU::ALU_CONST); + NewMI->getOperand(9).setImm(MI.getOperand(1).getImm()); + MI.eraseFromParent(); + } + } + return false; +} + +FunctionPass *createR600LowerConstCopy(TargetMachine &tm) { + return new R600LowerConstCopy(tm); +} + +} + + diff --git a/lib/Target/R600/R600RegisterInfo.cpp b/lib/Target/R600/R600RegisterInfo.cpp index a39f83dbaca..0441e4a306e 100644 --- a/lib/Target/R600/R600RegisterInfo.cpp +++ b/lib/Target/R600/R600RegisterInfo.cpp @@ -38,16 +38,12 @@ BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(AMDGPU::NEG_ONE); Reserved.set(AMDGPU::PV_X); Reserved.set(AMDGPU::ALU_LITERAL_X); + Reserved.set(AMDGPU::ALU_CONST); 
Reserved.set(AMDGPU::PREDICATE_BIT); Reserved.set(AMDGPU::PRED_SEL_OFF); Reserved.set(AMDGPU::PRED_SEL_ZERO); Reserved.set(AMDGPU::PRED_SEL_ONE); - for (TargetRegisterClass::iterator I = AMDGPU::R600_CReg32RegClass.begin(), - E = AMDGPU::R600_CReg32RegClass.end(); I != E; ++I) { - Reserved.set(*I); - } - for (std::vector::const_iterator I = MFI->ReservedRegs.begin(), E = MFI->ReservedRegs.end(); I != E; ++I) { Reserved.set(*I); diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td index d3d6d25d292..993fefc2ab3 100644 --- a/lib/Target/R600/R600RegisterInfo.td +++ b/lib/Target/R600/R600RegisterInfo.td @@ -27,10 +27,6 @@ foreach Index = 0-127 in { foreach Chan = [ "X", "Y", "Z", "W" ] in { // 32-bit Temporary Registers def T#Index#_#Chan : R600RegWithChan <"T"#Index#"."#Chan, Index, Chan>; - - // 32-bit Constant Registers (There are more than 128, this the number - // that is currently supported. - def C#Index#_#Chan : R600RegWithChan <"C"#Index#"."#Chan, Index, Chan>; } // 128-bit Temporary Registers def T#Index#_XYZW : R600Reg_128 <"T"#Index#".XYZW", @@ -64,13 +60,11 @@ def PRED_SEL_ONE : R600Reg<"Pred_sel_one", 3>; def R600_ArrayBase : RegisterClass <"AMDGPU", [f32, i32], 32, (add (sequence "ArrayBase%u", 448, 464))>; - -def R600_CReg32 : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (interleave - (interleave (sequence "C%u_X", 0, 127), - (sequence "C%u_Z", 0, 127)), - (interleave (sequence "C%u_Y", 0, 127), - (sequence "C%u_W", 0, 127))))>; +// special registers for ALU src operands +// const buffer reference, SRCx_SEL contains index +def ALU_CONST : R600Reg<"CBuf", 0>; +// interpolation param reference, SRCx_SEL contains index +def ALU_PARAM : R600Reg<"Param", 0>; def R600_TReg32_X : RegisterClass <"AMDGPU", [f32, i32], 32, (add (sequence "T%u_X", 0, 127))>; @@ -85,15 +79,15 @@ def R600_TReg32_W : RegisterClass <"AMDGPU", [f32, i32], 32, (add (sequence "T%u_W", 0, 127))>; def R600_TReg32 : RegisterClass <"AMDGPU", [f32, i32], 
32, - (add (interleave - (interleave R600_TReg32_X, R600_TReg32_Z), - (interleave R600_TReg32_Y, R600_TReg32_W)))>; + (interleave R600_TReg32_X, R600_TReg32_Y, + R600_TReg32_Z, R600_TReg32_W)>; def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add R600_TReg32, - R600_CReg32, R600_ArrayBase, - ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF)>; + ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF, + ALU_CONST, ALU_PARAM + )>; def R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add PRED_SEL_OFF, PRED_SEL_ZERO, PRED_SEL_ONE)>;