R600: rework handling of the constants

Remove Cxxx registers, add new special register - "ALU_CONST" and new operand for each alu src - "sel". ALU_CONST is used to designate that the new operand contains the value to override src.sel, src.kc_bank, src.chan for constants in the driver. Patch by: Vadim Girlin Vincent Lejeune: - Use pointers for constants - Fold CONST_ADDRESS when possible Tom Stellard: - Give CONSTANT_BUFFER_0 its own address space - Use integer types for constant loads Reviewed-by: Tom Stellard <thomas.stellard@amd.com> git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@173222 91177308-0d34-0410-b5e6-96231b3b80d8
2025-01-21 09:25:07 +00:00 · 2013-01-23 02:09:06 +00:00 · 2013-01-23 02:09:06 +00:00 · 9f7818d9bd
commit 9f7818d9bd
parent c7e1888d93
16 changed files with 483 additions and 104 deletions
--- a/lib/Target/R600/AMDGPU.h
+++ b/lib/Target/R600/AMDGPU.h
@ -23,6 +23,7 @@ class AMDGPUTargetMachine;
 // R600 Passes
 FunctionPass* createR600KernelParametersPass(const DataLayout *TD);
 FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm);
+FunctionPass *createR600LowerConstCopy(TargetMachine &tm);

 // SI Passes
 FunctionPass *createSIAnnotateControlFlowPass();
--- a/lib/Target/R600/AMDGPUTargetMachine.cpp
+++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
@ -136,6 +136,7 @@ bool AMDGPUPassConfig::addPreEmitPass() {
    addPass(createAMDGPUCFGPreparationPass(*TM));
    addPass(createAMDGPUCFGStructurizerPass(*TM));
    addPass(createR600ExpandSpecialInstrsPass(*TM));
+    addPass(createR600LowerConstCopy(*TM));
    addPass(&FinalizeMachineBundlesID);
  } else {
    addPass(createSILowerLiteralConstantsPass(*TM));
--- a/lib/Target/R600/AMDIL.h
+++ b/lib/Target/R600/AMDIL.h
@ -90,14 +90,30 @@ namespace AMDGPUAS {
 enum AddressSpaces {
  PRIVATE_ADDRESS  = 0, ///< Address space for private memory.
  GLOBAL_ADDRESS   = 1, ///< Address space for global memory (RAT0, VTX0).
-  CONSTANT_ADDRESS = 2, ///< Address space for constant memory.
+  CONSTANT_ADDRESS = 2, ///< Address space for constant memory
  LOCAL_ADDRESS    = 3, ///< Address space for local memory.
  REGION_ADDRESS   = 4, ///< Address space for region memory.
  ADDRESS_NONE     = 5, ///< Address space for unknown memory.
  PARAM_D_ADDRESS  = 6, ///< Address space for direct addressible parameter memory (CONST0)
  PARAM_I_ADDRESS  = 7, ///< Address space for indirect addressible parameter memory (VTX1)
  USER_SGPR_ADDRESS = 8, ///< Address space for USER_SGPRS on SI
-  LAST_ADDRESS     = 9
+  CONSTANT_BUFFER_0 = 9,
+  CONSTANT_BUFFER_1 = 10,
+  CONSTANT_BUFFER_2 = 11,
+  CONSTANT_BUFFER_3 = 12,
+  CONSTANT_BUFFER_4 = 13,
+  CONSTANT_BUFFER_5 = 14,
+  CONSTANT_BUFFER_6 = 15,
+  CONSTANT_BUFFER_7 = 16,
+  CONSTANT_BUFFER_8 = 17,
+  CONSTANT_BUFFER_9 = 18,
+  CONSTANT_BUFFER_10 = 19,
+  CONSTANT_BUFFER_11 = 20,
+  CONSTANT_BUFFER_12 = 21,
+  CONSTANT_BUFFER_13 = 22,
+  CONSTANT_BUFFER_14 = 23,
+  CONSTANT_BUFFER_15 = 24,
+  LAST_ADDRESS     = 25
 };

 } // namespace AMDGPUAS
--- a/lib/Target/R600/AMDILISelDAGToDAG.cpp
+++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp
@ -20,6 +20,7 @@
 #include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/Support/Compiler.h"
+#include "llvm/CodeGen/SelectionDAG.h"
 #include <list>
 #include <queue>

@ -45,6 +46,7 @@ public:

 private:
  inline SDValue getSmallIPtrImm(unsigned Imm);
+  bool FoldOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);

  // Complex pattern selectors
  bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2);
@ -67,6 +69,9 @@ private:
  static bool isLocalLoad(const LoadSDNode *N);
  static bool isRegionLoad(const LoadSDNode *N);

+  bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
+  bool SelectGlobalValueVariableOffset(SDValue Addr,
+      SDValue &BaseReg, SDValue& Offset);
  bool SelectADDR8BitOffset(SDValue Addr, SDValue& Base, SDValue& Offset);
  bool SelectADDRReg(SDValue Addr, SDValue& Base, SDValue& Offset);
  bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
@ -259,7 +264,65 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
    break;
  }
  }
-  return SelectCode(N);
+  SDNode *Result = SelectCode(N);
+
+  // Fold operands of selected node
+
+  const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
+  if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
+    const R600InstrInfo *TII =
+        static_cast<const R600InstrInfo*>(TM.getInstrInfo());
+    if (Result && TII->isALUInstr(Result->getMachineOpcode())) {
+      bool IsModified = false;
+      do {
+        std::vector<SDValue> Ops;
+        for(SDNode::op_iterator I = Result->op_begin(), E = Result->op_end();
+            I != E; ++I)
+          Ops.push_back(*I);
+        IsModified = FoldOperands(Result->getMachineOpcode(), TII, Ops);
+        if (IsModified) {
+          Result = CurDAG->MorphNodeTo(Result, Result->getOpcode(),
+              Result->getVTList(), Ops.data(), Ops.size());
+        }
+      } while (IsModified);
+    }
+  }
+
+  return Result;
+}
+
+bool AMDGPUDAGToDAGISel::FoldOperands(unsigned Opcode,
+    const R600InstrInfo *TII, std::vector<SDValue> &Ops) {
+  int OperandIdx[] = {
+    TII->getOperandIdx(Opcode, R600Operands::SRC0),
+    TII->getOperandIdx(Opcode, R600Operands::SRC1),
+    TII->getOperandIdx(Opcode, R600Operands::SRC2)
+  };
+  int SelIdx[] = {
+    TII->getOperandIdx(Opcode, R600Operands::SRC0_SEL),
+    TII->getOperandIdx(Opcode, R600Operands::SRC1_SEL),
+    TII->getOperandIdx(Opcode, R600Operands::SRC2_SEL)
+  };
+  for (unsigned i = 0; i < 3; i++) {
+    if (OperandIdx[i] < 0)
+      return false;
+    SDValue Operand = Ops[OperandIdx[i] - 1];
+    switch (Operand.getOpcode()) {
+    case AMDGPUISD::CONST_ADDRESS: {
+      SDValue CstOffset;
+      if (!Operand.getValueType().isVector() &&
+          SelectGlobalValueConstantOffset(Operand.getOperand(0), CstOffset)) {
+        Ops[OperandIdx[i] - 1] = CurDAG->getRegister(AMDGPU::ALU_CONST, MVT::f32);
+        Ops[SelIdx[i] - 1] = CstOffset;
+        return true;
+      }
+      }
+      break;
+    default:
+      break;
+    }
+  }
+  return false;
 }

 bool AMDGPUDAGToDAGISel::checkType(const Value *ptr, unsigned int addrspace) {
@ -406,6 +469,25 @@ const char *AMDGPUDAGToDAGISel::getPassName() const {

 ///==== AMDGPU Functions ====///

+bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr,
+    SDValue& IntPtr) {
+  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) {
+    IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, true);
+    return true;
+  }
+  return false;
+}
+
+bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr,
+    SDValue& BaseReg, SDValue &Offset) {
+  if (!dyn_cast<ConstantSDNode>(Addr)) {
+    BaseReg = Addr;
+    Offset = CurDAG->getIntPtrConstant(0, true);
+    return true;
+  }
+  return false;
+}
+
 bool AMDGPUDAGToDAGISel::SelectADDR8BitOffset(SDValue Addr, SDValue& Base,
                                             SDValue& Offset) {
  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
--- a/lib/Target/R600/CMakeLists.txt
+++ b/lib/Target/R600/CMakeLists.txt
@ -36,6 +36,7 @@ add_llvm_target(R600CodeGen
  R600ExpandSpecialInstrs.cpp
  R600InstrInfo.cpp
  R600ISelLowering.cpp
+  R600LowerConstCopy.cpp
  R600MachineFunctionInfo.cpp
  R600RegisterInfo.cpp
  SIAnnotateControlFlow.cpp
--- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
@ -129,4 +129,28 @@ void AMDGPUInstPrinter::printWrite(const MCInst *MI, unsigned OpNo,
  }
 }

+void AMDGPUInstPrinter::printSel(const MCInst *MI, unsigned OpNo,
+                                  raw_ostream &O) {
+  const char * chans = "XYZW";
+  int sel = MI->getOperand(OpNo).getImm();
+
+  int chan = sel & 3;
+  sel >>= 2;
+
+  if (sel >= 512) {
+    sel -= 512;
+    int cb = sel >> 12;
+    sel &= 4095;
+    O << cb << "[" << sel << "]";
+  } else if (sel >= 448) {
+    sel -= 448;
+    O << sel;
+  } else if (sel >= 0){
+    O << sel;
+  }
+
+  if (sel >= 0)
+    O << "." << chans[chan];
+}
+
 #include "AMDGPUGenAsmWriter.inc"
--- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
+++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
@ -45,6 +45,7 @@ private:
  void printUpdateExecMask(const MCInst *MI, unsigned OpNo, raw_ostream &O);
  void printUpdatePred(const MCInst *MI, unsigned OpNo, raw_ostream &O);
  void printWrite(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printSel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
 };

 } // End namespace llvm
--- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
@ -63,8 +63,8 @@ private:
  void EmitALUInstr(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups,
                    raw_ostream &OS) const;
  void EmitSrc(const MCInst &MI, unsigned OpIdx, raw_ostream &OS) const;
-  void EmitSrcISA(const MCInst &MI, unsigned OpIdx, uint64_t &Value,
-                  raw_ostream &OS) const;
+  void EmitSrcISA(const MCInst &MI, unsigned RegOpIdx, unsigned SelOpIdx,
+                    raw_ostream &OS) const;
  void EmitDst(const MCInst &MI, raw_ostream &OS) const;
  void EmitTexInstr(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups,
                    raw_ostream &OS) const;
@ -163,7 +163,8 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
    case AMDGPU::VTX_READ_PARAM_32_eg:
    case AMDGPU::VTX_READ_GLOBAL_8_eg:
    case AMDGPU::VTX_READ_GLOBAL_32_eg:
-    case AMDGPU::VTX_READ_GLOBAL_128_eg: {
+    case AMDGPU::VTX_READ_GLOBAL_128_eg:
+    case AMDGPU::TEX_VTX_CONSTBUF: {
      uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups);
      uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset

@ -193,7 +194,6 @@ void R600MCCodeEmitter::EmitALUInstr(const MCInst &MI,
                                     SmallVectorImpl<MCFixup> &Fixups,
                                     raw_ostream &OS) const {
  const MCInstrDesc &MCDesc = MCII.get(MI.getOpcode());
-  unsigned NumOperands = MI.getNumOperands();

  // Emit instruction type
  EmitByte(INSTR_ALU, OS);
@ -209,19 +209,21 @@ void R600MCCodeEmitter::EmitALUInstr(const MCInst &MI,
    InstWord01 |= ISAOpCode << 1;
  }

-  unsigned SrcIdx = 0;
-  for (unsigned int OpIdx = 1; OpIdx < NumOperands; ++OpIdx) {
-    if (MI.getOperand(OpIdx).isImm() || MI.getOperand(OpIdx).isFPImm() ||
-        OpIdx == (unsigned)MCDesc.findFirstPredOperandIdx()) {
-      continue;
-    }
-    EmitSrcISA(MI, OpIdx, InstWord01, OS);
-    SrcIdx++;
-  }
+  unsigned SrcNum = MCDesc.TSFlags & R600_InstFlag::OP3 ? 3 :
+      MCDesc.TSFlags & R600_InstFlag::OP2 ? 2 : 1;

-  // Emit zeros for unused sources
-  for ( ; SrcIdx < 3; SrcIdx++) {
-    EmitNullBytes(SRC_BYTE_COUNT - 6, OS);
+  EmitByte(SrcNum, OS);
+
+  const unsigned SrcOps[3][2] = {
+      {R600Operands::SRC0, R600Operands::SRC0_SEL},
+      {R600Operands::SRC1, R600Operands::SRC1_SEL},
+      {R600Operands::SRC2, R600Operands::SRC2_SEL}
+  };
+
+  for (unsigned SrcIdx = 0; SrcIdx < SrcNum; ++SrcIdx) {
+    unsigned RegOpIdx = R600Operands::ALUOpTable[SrcNum-1][SrcOps[SrcIdx][0]];
+    unsigned SelOpIdx = R600Operands::ALUOpTable[SrcNum-1][SrcOps[SrcIdx][1]];
+    EmitSrcISA(MI, RegOpIdx, SelOpIdx, OS);
  }

  Emit(InstWord01, OS);
@ -292,34 +294,37 @@ void R600MCCodeEmitter::EmitSrc(const MCInst &MI, unsigned OpIdx,

 }

-void R600MCCodeEmitter::EmitSrcISA(const MCInst &MI, unsigned OpIdx,
-                                   uint64_t &Value, raw_ostream &OS) const {
-  const MCOperand &MO = MI.getOperand(OpIdx);
+void R600MCCodeEmitter::EmitSrcISA(const MCInst &MI, unsigned RegOpIdx,
+                                   unsigned SelOpIdx, raw_ostream &OS) const {
+  const MCOperand &RegMO = MI.getOperand(RegOpIdx);
+  const MCOperand &SelMO = MI.getOperand(SelOpIdx);
+
  union {
    float f;
    uint32_t i;
  } InlineConstant;
  InlineConstant.i = 0;
-  // Emit the source select (2 bytes).  For GPRs, this is the register index.
-  // For other potential instruction operands, (e.g. constant registers) the
-  // value of the source select is defined in the r600isa docs.
-  if (MO.isReg()) {
-    unsigned Reg = MO.getReg();
-    if (AMDGPUMCRegisterClasses[AMDGPU::R600_CReg32RegClassID].contains(Reg)) {
-      EmitByte(1, OS);
-    } else {
-      EmitByte(0, OS);
-    }
+  // Emit source type (1 byte) and source select (4 bytes). For GPRs type is 0
+  // and select is 0 (GPR index is encoded in the instr encoding. For constants
+  // type is 1 and select is the original const select passed from the driver.
+  unsigned Reg = RegMO.getReg();
+  if (Reg == AMDGPU::ALU_CONST) {
+    EmitByte(1, OS);
+    uint32_t Sel = SelMO.getImm();
+    Emit(Sel, OS);
+  } else {
+    EmitByte(0, OS);
+    Emit((uint32_t)0, OS);
+  }

-    if (Reg == AMDGPU::ALU_LITERAL_X) {
-      unsigned ImmOpIndex = MI.getNumOperands() - 1;
-      MCOperand ImmOp = MI.getOperand(ImmOpIndex);
-      if (ImmOp.isFPImm()) {
-        InlineConstant.f = ImmOp.getFPImm();
-      } else {
-        assert(ImmOp.isImm());
-        InlineConstant.i = ImmOp.getImm();
-      }
+  if (Reg == AMDGPU::ALU_LITERAL_X) {
+    unsigned ImmOpIndex = MI.getNumOperands() - 1;
+    MCOperand ImmOp = MI.getOperand(ImmOpIndex);
+    if (ImmOp.isFPImm()) {
+      InlineConstant.f = ImmOp.getFPImm();
+    } else {
+      assert(ImmOp.isImm());
+      InlineConstant.i = ImmOp.getImm();
    }
  }

--- a/lib/Target/R600/R600Defines.h
+++ b/lib/Target/R600/R600Defines.h
@ -62,18 +62,33 @@ namespace R600Operands {
    SRC0_NEG,
    SRC0_REL,
    SRC0_ABS,
+    SRC0_SEL,
    SRC1,
    SRC1_NEG,
    SRC1_REL,
    SRC1_ABS,
+    SRC1_SEL,
    SRC2,
    SRC2_NEG,
    SRC2_REL,
+    SRC2_SEL,
    LAST,
    PRED_SEL,
    IMM,
    COUNT
 };
+
+  const static int ALUOpTable[3][R600Operands::COUNT] = {
+//            W        C     S  S  S  S     S  S  S  S     S  S  S
+//            R  O  D  L  S  R  R  R  R  S  R  R  R  R  S  R  R  R  L  P
+//   D  U     I  M  R  A  R  C  C  C  C  R  C  C  C  C  R  C  C  C  A  R  I
+//   S  E  U  T  O  E  M  C  0  0  0  0  C  1  1  1  1  C  2  2  2  S  E  M
+//   T  M  P  E  D  L  P  0  N  R  A  S  1  N  R  A  S  2  N  R  S  T  D  M
+    {0,-1,-1, 1, 2, 3, 4, 5, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1,-1,-1,-1,10,11,12},
+    {0, 1, 2, 3, 4 ,5 ,6 ,7, 8, 9,10,11,12,13,14,15,16,-1,-1,-1,-1,17,18,19},
+    {0,-1,-1,-1,-1, 1, 2, 3, 4, 5,-1, 6, 7, 8, 9,-1,10,11,12,13,14,15,16,17}
+  };
+
 }

 #endif // R600DEFINES_H_
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@ -74,7 +74,10 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
  setOperationAction(ISD::STORE, MVT::i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);

+  setOperationAction(ISD::LOAD, MVT::i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setTargetDAGCombine(ISD::FP_ROUND);
+  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);

  setSchedulingPreference(Sched::VLIW);
 }
@ -115,15 +118,6 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
    break;
  }

-  case AMDGPU::R600_LOAD_CONST: {
-    int64_t RegIndex = MI->getOperand(1).getImm();
-    unsigned ConstantReg = AMDGPU::R600_CReg32RegClass.getRegister(RegIndex);
-    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::COPY))
-                .addOperand(MI->getOperand(0))
-                .addReg(ConstantReg);
-    break;
-  }
-
  case AMDGPU::MASK_WRITE: {
    unsigned maskedRegister = MI->getOperand(0).getReg();
    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
@ -364,6 +358,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
  case ISD::SELECT: return LowerSELECT(Op, DAG);
  case ISD::SETCC: return LowerSETCC(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
+  case ISD::LOAD: return LowerLOAD(Op, DAG);
  case ISD::FPOW: return LowerFPOW(Op, DAG);
  case ISD::INTRINSIC_VOID: {
    SDValue Chain = Op.getOperand(0);
@ -527,6 +522,16 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N,
  switch (N->getOpcode()) {
  default: return;
  case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
+    return;
+  case ISD::LOAD: {
+    SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
+    Results.push_back(SDValue(Node, 0));
+    Results.push_back(SDValue(Node, 1));
+    // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
+    // function
+    DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
+    return;
+  }
  }
 }

@ -832,6 +837,94 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  return SDValue();
 }

+// return (512 + (kc_bank << 12)
+static int
+ConstantAddressBlock(unsigned AddressSpace) {
+  switch (AddressSpace) {
+  case AMDGPUAS::CONSTANT_BUFFER_0:
+    return 512;
+  case AMDGPUAS::CONSTANT_BUFFER_1:
+    return 512 + 4096;
+  case AMDGPUAS::CONSTANT_BUFFER_2:
+    return 512 + 4096 * 2;
+  case AMDGPUAS::CONSTANT_BUFFER_3:
+    return 512 + 4096 * 3;
+  case AMDGPUAS::CONSTANT_BUFFER_4:
+    return 512 + 4096 * 4;
+  case AMDGPUAS::CONSTANT_BUFFER_5:
+    return 512 + 4096 * 5;
+  case AMDGPUAS::CONSTANT_BUFFER_6:
+    return 512 + 4096 * 6;
+  case AMDGPUAS::CONSTANT_BUFFER_7:
+    return 512 + 4096 * 7;
+  case AMDGPUAS::CONSTANT_BUFFER_8:
+    return 512 + 4096 * 8;
+  case AMDGPUAS::CONSTANT_BUFFER_9:
+    return 512 + 4096 * 9;
+  case AMDGPUAS::CONSTANT_BUFFER_10:
+    return 512 + 4096 * 10;
+  case AMDGPUAS::CONSTANT_BUFFER_11:
+    return 512 + 4096 * 11;
+  case AMDGPUAS::CONSTANT_BUFFER_12:
+    return 512 + 4096 * 12;
+  case AMDGPUAS::CONSTANT_BUFFER_13:
+    return 512 + 4096 * 13;
+  case AMDGPUAS::CONSTANT_BUFFER_14:
+    return 512 + 4096 * 14;
+  case AMDGPUAS::CONSTANT_BUFFER_15:
+    return 512 + 4096 * 15;
+  default:
+    return -1;
+  }
+}
+
+SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
+{
+  EVT VT = Op.getValueType();
+  DebugLoc DL = Op.getDebugLoc();
+  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
+  SDValue Chain = Op.getOperand(0);
+  SDValue Ptr = Op.getOperand(1);
+  SDValue LoweredLoad;
+
+  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
+  if (ConstantBlock > -1) {
+    SDValue Result;
+    if (dyn_cast<ConstantExpr>(LoadNode->getSrcValue()) ||
+        dyn_cast<Constant>(LoadNode->getSrcValue())) {
+      SDValue Slots[4];
+      for (unsigned i = 0; i < 4; i++) {
+        // We want Const position encoded with the following formula :
+        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
+        // const_index is Ptr computed by llvm using an alignment of 16.
+        // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
+        // then div by 4 at the ISel step
+        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+            DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
+        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
+      }
+      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Slots, 4);
+    } else {
+      // non constant ptr cant be folded, keeps it as a v4f32 load
+      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
+          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32))
+          );
+    }
+
+    if (!VT.isVector()) {
+      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
+          DAG.getConstant(0, MVT::i32));
+    }
+
+    SDValue MergedValues[2] = {
+        Result,
+        Chain
+    };
+    return DAG.getMergeValues(MergedValues, 2, DL);
+  }
+
+  return SDValue();
+}

 SDValue R600TargetLowering::LowerFPOW(SDValue Op,
    SelectionDAG &DAG) const {
@ -904,6 +997,17 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
      }
      break;
    }
+  // Extract_vec (Build_vector) generated by custom lowering
+  // also needs to be customly combined
+  case ISD::EXTRACT_VECTOR_ELT: {
+    SDValue Arg = N->getOperand(0);
+    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
+      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+        unsigned Element = Const->getZExtValue();
+        return Arg->getOperand(Element);
+      }
+    }
+  }
  }
  return SDValue();
 }
--- a/lib/Target/R600/R600ISelLowering.h
+++ b/lib/Target/R600/R600ISelLowering.h
@ -63,6 +63,7 @@ private:
  SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFPOW(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
  
  bool isZero(SDValue Op) const;
 };
--- a/lib/Target/R600/R600InstrInfo.cpp
+++ b/lib/Target/R600/R600InstrInfo.cpp
@ -486,13 +486,15 @@ MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MB
     .addReg(Src0Reg)  // $src0
     .addImm(0)        // $src0_neg
     .addImm(0)        // $src0_rel
-     .addImm(0);       // $src0_abs
+     .addImm(0)        // $src0_abs
+     .addImm(-1);       // $src0_sel

  if (Src1Reg) {
    MIB.addReg(Src1Reg) // $src1
       .addImm(0)       // $src1_neg
       .addImm(0)       // $src1_rel
-       .addImm(0);       // $src1_abs
+       .addImm(0)       // $src1_abs
+       .addImm(-1);      // $src1_sel
  }

  //XXX: The r600g finalizer expects this to be 1, once we've moved the
@ -521,16 +523,6 @@ int R600InstrInfo::getOperandIdx(const MachineInstr &MI,

 int R600InstrInfo::getOperandIdx(unsigned Opcode,
                                 R600Operands::Ops Op) const {
-  const static int OpTable[3][R600Operands::COUNT] = {
-//            W        C     S  S  S     S  S  S     S  S
-//            R  O  D  L  S  R  R  R  S  R  R  R  S  R  R  L  P
-//   D  U     I  M  R  A  R  C  C  C  C  C  C  C  R  C  C  A  R  I
-//   S  E  U  T  O  E  M  C  0  0  0  C  1  1  1  C  2  2  S  E  M
-//   T  M  P  E  D  L  P  0  N  R  A  1  N  R  A  2  N  R  T  D  M
-    {0,-1,-1, 1, 2, 3, 4, 5, 6, 7, 8,-1,-1,-1,-1,-1,-1,-1, 9,10,11},
-    {0, 1, 2, 3, 4 ,5 ,6 ,7, 8, 9,10,11,12,-1,-1,-1,13,14,15,16,17},
-    {0,-1,-1,-1,-1, 1, 2, 3, 4, 5,-1, 6, 7, 8,-1, 9,10,11,12,13,14}
-  };
  unsigned TargetFlags = get(Opcode).TSFlags;
  unsigned OpTableIdx;

@ -556,7 +548,7 @@ int R600InstrInfo::getOperandIdx(unsigned Opcode,
    OpTableIdx = 2;
  }

-  return OpTable[OpTableIdx][Op];
+  return R600Operands::ALUOpTable[OpTableIdx][Op];
 }

 void R600InstrInfo::setImmOperand(MachineInstr *MI, R600Operands::Ops Op,
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@ -70,6 +70,11 @@ class InstFlag<string PM = "printOperand", int Default = 0>
  let PrintMethod = PM;
 }

+// src_sel for ALU src operands, see also ALU_CONST, ALU_PARAM registers 
+def SEL : OperandWithDefaultOps <i32, (ops (i32 -1))> {
+  let PrintMethod = "printSel";
+}
+
 def LITERAL : InstFlag<"printLiteral">;

 def WRITE : InstFlag <"printWrite", 1>;
@ -89,6 +94,8 @@ def LAST : InstFlag<"printLast", 1>;
 def ADDRParam : ComplexPattern<i32, 2, "SelectADDRParam", [], []>;
 def ADDRDWord : ComplexPattern<i32, 1, "SelectADDRDWord", [], []>;
 def ADDRVTX_READ : ComplexPattern<i32, 2, "SelectADDRVTX_READ", [], []>;
+def ADDRGA_CONST_OFFSET : ComplexPattern<i32, 1, "SelectGlobalValueConstantOffset", [], []>;
+def ADDRGA_VAR_OFFSET : ComplexPattern<i32, 2, "SelectGlobalValueVariableOffset", [], []>;

 class R600ALU_Word0 {
  field bits<32> Word0;
@ -263,11 +270,11 @@ class R600_1OP <bits<11> inst, string opName, list<dag> pattern,
    InstR600 <0,
              (outs R600_Reg32:$dst),
              (ins WRITE:$write, OMOD:$omod, REL:$dst_rel, CLAMP:$clamp,
-                   R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs,
+                   R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel,
                   LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal),
              !strconcat(opName,
                   "$clamp $dst$write$dst_rel$omod, "
-                   "$src0_neg$src0_abs$src0$src0_abs$src0_rel, "
+                   "$src0_neg$src0_abs$src0$src0_sel$src0_abs$src0_rel, "
                   "$literal $pred_sel$last"),
              pattern,
              itin>,
@ -303,13 +310,13 @@ class R600_2OP <bits<11> inst, string opName, list<dag> pattern,
          (outs R600_Reg32:$dst),
          (ins UEM:$update_exec_mask, UP:$update_pred, WRITE:$write,
               OMOD:$omod, REL:$dst_rel, CLAMP:$clamp,
-               R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs,
-               R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, ABS:$src1_abs,
+               R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel,
+               R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, ABS:$src1_abs, SEL:$src1_sel,
               LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal),
          !strconcat(opName,
                "$clamp $update_exec_mask$update_pred$dst$write$dst_rel$omod, "
-                "$src0_neg$src0_abs$src0$src0_abs$src0_rel, "
-                "$src1_neg$src1_abs$src1$src1_abs$src1_rel, "
+                "$src0_neg$src0_abs$src0$src0_sel$src0_abs$src0_rel, "
+                "$src1_neg$src1_abs$src1$src1_sel$src1_abs$src1_rel, "
                "$literal $pred_sel$last"),
          pattern,
          itin>,
@ -340,14 +347,14 @@ class R600_3OP <bits<5> inst, string opName, list<dag> pattern,
  InstR600 <0,
          (outs R600_Reg32:$dst),
          (ins REL:$dst_rel, CLAMP:$clamp,
-               R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel,
-               R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel,
-               R600_Reg32:$src2, NEG:$src2_neg, REL:$src2_rel,
+               R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, SEL:$src0_sel,
+               R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, SEL:$src1_sel,
+               R600_Reg32:$src2, NEG:$src2_neg, REL:$src2_rel, SEL:$src2_sel,
               LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal),
          !strconcat(opName, "$clamp $dst$dst_rel, "
-                             "$src0_neg$src0$src0_rel, "
-                             "$src1_neg$src1$src1_rel, "
-                             "$src2_neg$src2$src2_rel, "
+                             "$src0_neg$src0$src0_sel$src0_rel, "
+                             "$src1_neg$src1$src1_sel$src1_rel, "
+                             "$src2_neg$src2$src2_sel$src2_rel, "
                             "$literal $pred_sel$last"),
          pattern,
          itin>,
@ -482,7 +489,7 @@ def INTERP_P0: SDNode<"AMDGPUISD::INTERP_P0",
  >;

 def CONST_ADDRESS: SDNode<"AMDGPUISD::CONST_ADDRESS",
-  SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisPtrTy<1>]>,
+  SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisPtrTy<1>]>,
  [SDNPMayLoad]
 >;

@ -1538,12 +1545,6 @@ def MASK_WRITE : AMDGPUShaderInst <

 } // End mayLoad = 0, mayStore = 0, hasSideEffects = 1

-def R600_LOAD_CONST : AMDGPUShaderInst <
-  (outs R600_Reg32:$dst),
-  (ins i32imm:$src0),
-  "R600_LOAD_CONST $dst, $src0",
-  [(set R600_Reg32:$dst, (int_AMDGPU_load_const imm:$src0))]
->;

 def RESERVE_REG : AMDGPUShaderInst <
  (outs),
@ -1551,7 +1552,6 @@ def RESERVE_REG : AMDGPUShaderInst <
  "RESERVE_REG $src",
  [(int_AMDGPU_reserve_reg imm:$src)]
 >;
-
 def TXD: AMDGPUShaderInst <
  (outs R600_Reg128:$dst),
  (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
@ -1581,6 +1581,78 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1 in {
      "RETURN", [(IL_retflag)]>;
 }

+
+//===----------------------------------------------------------------------===//
+// Constant Buffer Addressing Support
+//===----------------------------------------------------------------------===//
+
+let isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU"  in {
+def CONST_COPY : Instruction {
+  let OutOperandList = (outs R600_Reg32:$dst);
+  let InOperandList = (ins i32imm:$src);
+  let Pattern = [(set R600_Reg32:$dst, (CONST_ADDRESS ADDRGA_CONST_OFFSET:$src))];
+  let AsmString = "CONST_COPY";
+  let neverHasSideEffects = 1;
+  let isAsCheapAsAMove = 1;
+  let Itinerary = NullALU;
+}
+} // end isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU"
+
+def TEX_VTX_CONSTBUF :
+  InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr), "VTX_READ_eg $dst, $ptr",
+      [(set R600_Reg128:$dst, (CONST_ADDRESS ADDRGA_VAR_OFFSET:$ptr))]>,
+  VTX_WORD1_GPR, VTX_WORD0 {
+
+  let VC_INST = 0;
+  let FETCH_TYPE = 2;
+  let FETCH_WHOLE_QUAD = 0;
+  let BUFFER_ID = 0;
+  let SRC_REL = 0;
+  let SRC_SEL_X = 0;
+  let DST_REL = 0;
+  let USE_CONST_FIELDS = 0;
+  let NUM_FORMAT_ALL = 2;
+  let FORMAT_COMP_ALL = 1;
+  let SRF_MODE_ALL = 1;
+  let MEGA_FETCH_COUNT = 16;
+  let DST_SEL_X        = 0;
+  let DST_SEL_Y        = 1;
+  let DST_SEL_Z        = 2;
+  let DST_SEL_W        = 3;
+  let DATA_FORMAT      = 35;
+
+  let Inst{31-0} = Word0;
+  let Inst{63-32} = Word1;
+
+// LLVM can only encode 64-bit instructions, so these fields are manually
+// encoded in R600CodeEmitter
+//
+// bits<16> OFFSET;
+// bits<2>  ENDIAN_SWAP = 0;
+// bits<1>  CONST_BUF_NO_STRIDE = 0;
+// bits<1>  MEGA_FETCH = 0;
+// bits<1>  ALT_CONST = 0;
+// bits<2>  BUFFER_INDEX_MODE = 0;
+
+
+
+// VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding
+// is done in R600CodeEmitter
+//
+// Inst{79-64} = OFFSET;
+// Inst{81-80} = ENDIAN_SWAP;
+// Inst{82}    = CONST_BUF_NO_STRIDE;
+// Inst{83}    = MEGA_FETCH;
+// Inst{84}    = ALT_CONST;
+// Inst{86-85} = BUFFER_INDEX_MODE;
+// Inst{95-86} = 0; Reserved
+
+// VTX_WORD3 (Padding)
+//
+// Inst{127-96} = 0;
+}
+
+
 //===--------------------------------------------------------------------===//
 // Instructions support
 //===--------------------------------------------------------------------===//
--- a/lib/Target/R600/R600LowerConstCopy.cpp
+++ b/lib/Target/R600/R600LowerConstCopy.cpp
@ -0,0 +1,74 @@
+//===-- R600LowerConstCopy.cpp - Propagate ConstCopy / lower them to MOV---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass is intended to handle remaining ConstCopy pseudo MachineInstr.
+/// ISel will fold each Const Buffer read inside scalar ALU. However it cannot
+/// fold them inside vector instruction, like DOT4 or Cube ; ISel emits
+/// ConstCopy instead. This pass (executed after ExpandingSpecialInstr) will try
+/// to fold them if possible or replace them by MOV otherwise.
+/// TODO : Implement the folding part, using Copy Propagation algorithm.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "R600InstrInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/GlobalValue.h"
+
+namespace llvm {
+
+class R600LowerConstCopy : public MachineFunctionPass {
+private:
+  static char ID;
+  const R600InstrInfo *TII;
+public:
+  R600LowerConstCopy(TargetMachine &tm);
+  virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  const char *getPassName() const { return "R600 Eliminate Symbolic Operand"; }
+};
+
+char R600LowerConstCopy::ID = 0;
+
+
+R600LowerConstCopy::R600LowerConstCopy(TargetMachine &tm) :
+    MachineFunctionPass(ID),
+    TII (static_cast<const R600InstrInfo *>(tm.getInstrInfo()))
+{
+}
+
+bool R600LowerConstCopy::runOnMachineFunction(MachineFunction &MF) {
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                  BB != BB_E; ++BB) {
+    MachineBasicBlock &MBB = *BB;
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+                                                      I != E;) {
+      MachineInstr &MI = *I;
+      I = llvm::next(I);
+      if (MI.getOpcode() != AMDGPU::CONST_COPY)
+        continue;
+      MachineInstr *NewMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::MOV,
+          MI.getOperand(0).getReg(), AMDGPU::ALU_CONST);
+      NewMI->getOperand(9).setImm(MI.getOperand(1).getImm());
+      MI.eraseFromParent();
+    }
+  }
+  return false;
+}
+
+FunctionPass *createR600LowerConstCopy(TargetMachine &tm) {
+  return new R600LowerConstCopy(tm);
+}
+
+}
+
+
--- a/lib/Target/R600/R600RegisterInfo.cpp
+++ b/lib/Target/R600/R600RegisterInfo.cpp
@ -38,16 +38,12 @@ BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
  Reserved.set(AMDGPU::NEG_ONE);
  Reserved.set(AMDGPU::PV_X);
  Reserved.set(AMDGPU::ALU_LITERAL_X);
+  Reserved.set(AMDGPU::ALU_CONST);
  Reserved.set(AMDGPU::PREDICATE_BIT);
  Reserved.set(AMDGPU::PRED_SEL_OFF);
  Reserved.set(AMDGPU::PRED_SEL_ZERO);
  Reserved.set(AMDGPU::PRED_SEL_ONE);

-  for (TargetRegisterClass::iterator I = AMDGPU::R600_CReg32RegClass.begin(),
-                        E = AMDGPU::R600_CReg32RegClass.end(); I != E; ++I) {
-    Reserved.set(*I);
-  }
-
  for (std::vector<unsigned>::const_iterator I = MFI->ReservedRegs.begin(),
                                    E = MFI->ReservedRegs.end(); I != E; ++I) {
    Reserved.set(*I);
--- a/lib/Target/R600/R600RegisterInfo.td
+++ b/lib/Target/R600/R600RegisterInfo.td
@ -27,10 +27,6 @@ foreach Index = 0-127 in {
  foreach Chan = [ "X", "Y", "Z", "W" ] in {
    // 32-bit Temporary Registers
    def T#Index#_#Chan : R600RegWithChan <"T"#Index#"."#Chan, Index, Chan>;
-
-    // 32-bit Constant Registers (There are more than 128, this the number
-    // that is currently supported.
-    def C#Index#_#Chan : R600RegWithChan <"C"#Index#"."#Chan, Index, Chan>;
  }
  // 128-bit Temporary Registers
  def T#Index#_XYZW : R600Reg_128 <"T"#Index#".XYZW",
@ -64,13 +60,11 @@ def PRED_SEL_ONE : R600Reg<"Pred_sel_one", 3>;

 def R600_ArrayBase : RegisterClass <"AMDGPU", [f32, i32], 32,
                          (add (sequence "ArrayBase%u", 448, 464))>;
-
-def R600_CReg32 : RegisterClass <"AMDGPU", [f32, i32], 32,
-                          (add (interleave
-                                  (interleave (sequence "C%u_X", 0, 127),
-                                              (sequence "C%u_Z", 0, 127)),
-                                  (interleave (sequence "C%u_Y", 0, 127),
-                                              (sequence "C%u_W", 0, 127))))>;
+// special registers for ALU src operands
+// const buffer reference, SRCx_SEL contains index
+def ALU_CONST : R600Reg<"CBuf", 0>;
+// interpolation param reference, SRCx_SEL contains index
+def ALU_PARAM : R600Reg<"Param", 0>;

 def R600_TReg32_X : RegisterClass <"AMDGPU", [f32, i32], 32,
                                   (add (sequence "T%u_X", 0, 127))>;
@ -85,15 +79,15 @@ def R600_TReg32_W : RegisterClass <"AMDGPU", [f32, i32], 32,
                                   (add (sequence "T%u_W", 0, 127))>;

 def R600_TReg32 : RegisterClass <"AMDGPU", [f32, i32], 32,
-                          (add (interleave
-                                 (interleave R600_TReg32_X, R600_TReg32_Z),
-                                 (interleave R600_TReg32_Y, R600_TReg32_W)))>;
+                                   (interleave R600_TReg32_X, R600_TReg32_Y,
+                                               R600_TReg32_Z, R600_TReg32_W)>;

 def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add
    R600_TReg32,
-    R600_CReg32,
    R600_ArrayBase,
-    ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF)>;
+    ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF,
+    ALU_CONST, ALU_PARAM
+    )>;

 def R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add
    PRED_SEL_OFF, PRED_SEL_ZERO, PRED_SEL_ONE)>;