llvm/lib/Target/CellSPU/SPUISelLowering.cpp

//
//===-- SPUISelLowering.cpp - Cell SPU DAG Lowering Implementation --------===//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the SPUTargetLowering class.
//
//===----------------------------------------------------------------------===//

#include "SPURegisterNames.h"
#include "SPUISelLowering.h"
#include "SPUTargetMachine.h"
#include "SPUFrameInfo.h"
#include "SPUMachineFunction.h"
#include "llvm/Constants.h"
#include "llvm/Function.h"
#include "llvm/Intrinsics.h"
#include "llvm/CallingConv.h"
#include "llvm/Type.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <map>

using namespace llvm;

// Used in getTargetNodeName() below
namespace {
  std::map<unsigned, const char *> node_names;

  // Byte offset of the preferred slot (counted from the MSB)
  int prefslotOffset(EVT VT) {
    int retval=0;
    if (VT==MVT::i1) retval=3;
    if (VT==MVT::i8) retval=3;
    if (VT==MVT::i16) retval=2;

    return retval;
  }

  //! Expand a library call into an actual call DAG node
  /*!
   \note
   This code is taken from SelectionDAGLegalize, since it is not exposed as
   part of the LLVM SelectionDAG API.
   */

  SDValue
  ExpandLibCall(RTLIB::Libcall LC, SDValue Op, SelectionDAG &DAG,
                bool isSigned, SDValue &Hi, const SPUTargetLowering &TLI) {
    // The input chain to this libcall is the entry node of the function.
    // Legalizing the call will automatically add the previous call to the
    // dependence.
    SDValue InChain = DAG.getEntryNode();

    TargetLowering::ArgListTy Args;
    TargetLowering::ArgListEntry Entry;
    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
      EVT ArgVT = Op.getOperand(i).getValueType();
      const Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
      Entry.Node = Op.getOperand(i);
      Entry.Ty = ArgTy;
      Entry.isSExt = isSigned;
      Entry.isZExt = !isSigned;
      Args.push_back(Entry);
    }
    SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
                                           TLI.getPointerTy());

    // Splice the libcall in wherever FindInputOutputChains tells us to.
    const Type *RetTy =
                Op.getNode()->getValueType(0).getTypeForEVT(*DAG.getContext());
    std::pair<SDValue, SDValue> CallInfo =
            TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false,
                            0, TLI.getLibcallCallingConv(LC), false,
                            /*isReturnValueUsed=*/true,
                            Callee, Args, DAG, Op.getDebugLoc());

    return CallInfo.first;
  }
}

SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
  : TargetLowering(TM, new TargetLoweringObjectFileELF()),
    SPUTM(TM) {
  // Fold away setcc operations if possible.
  setPow2DivIsCheap();

  // Use _setjmp/_longjmp instead of setjmp/longjmp.
  setUseUnderscoreSetJmp(true);
  setUseUnderscoreLongJmp(true);

  // Set RTLIB libcall names as used by SPU:
  setLibcallName(RTLIB::DIV_F64, "__fast_divdf3");

  // Set up the SPU's register classes:
  addRegisterClass(MVT::i8,   SPU::R8CRegisterClass);
  addRegisterClass(MVT::i16,  SPU::R16CRegisterClass);
  addRegisterClass(MVT::i32,  SPU::R32CRegisterClass);
  addRegisterClass(MVT::i64,  SPU::R64CRegisterClass);
  addRegisterClass(MVT::f32,  SPU::R32FPRegisterClass);
  addRegisterClass(MVT::f64,  SPU::R64FPRegisterClass);
  addRegisterClass(MVT::i128, SPU::GPRCRegisterClass);

  // SPU has no sign or zero extended loads for i1, i8, i16:
  setLoadExtAction(ISD::EXTLOAD,  MVT::i1, Promote);
  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);

  setLoadExtAction(ISD::EXTLOAD,  MVT::f32, Expand);
  setLoadExtAction(ISD::EXTLOAD,  MVT::f64, Expand);

  setTruncStoreAction(MVT::i128, MVT::i64, Expand);
  setTruncStoreAction(MVT::i128, MVT::i32, Expand);
  setTruncStoreAction(MVT::i128, MVT::i16, Expand);
  setTruncStoreAction(MVT::i128, MVT::i8, Expand);

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // SPU constant load actions are custom lowered:
  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f64, Custom);

  // SPU's loads and stores have to be custom lowered:
  for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::i128;
       ++sctype) {
    MVT::SimpleValueType VT = (MVT::SimpleValueType)sctype;

    setOperationAction(ISD::LOAD,   VT, Custom);
    setOperationAction(ISD::STORE,  VT, Custom);
    setLoadExtAction(ISD::EXTLOAD,  VT, Custom);
    setLoadExtAction(ISD::ZEXTLOAD, VT, Custom);
    setLoadExtAction(ISD::SEXTLOAD, VT, Custom);

    for (unsigned stype = sctype - 1; stype >= (unsigned) MVT::i8; --stype) {
      MVT::SimpleValueType StoreVT = (MVT::SimpleValueType) stype;
      setTruncStoreAction(VT, StoreVT, Expand);
    }
  }

  for (unsigned sctype = (unsigned) MVT::f32; sctype < (unsigned) MVT::f64;
       ++sctype) {
    MVT::SimpleValueType VT = (MVT::SimpleValueType) sctype;

    setOperationAction(ISD::LOAD,   VT, Custom);
    setOperationAction(ISD::STORE,  VT, Custom);

    for (unsigned stype = sctype - 1; stype >= (unsigned) MVT::f32; --stype) {
      MVT::SimpleValueType StoreVT = (MVT::SimpleValueType) stype;
      setTruncStoreAction(VT, StoreVT, Expand);
    }
  }

  // Expand the jumptable branches
  setOperationAction(ISD::BR_JT,        MVT::Other, Expand);
  setOperationAction(ISD::BR_CC,        MVT::Other, Expand);

  // Custom lower SELECT_CC for most cases, but expand by default
  setOperationAction(ISD::SELECT_CC,    MVT::Other, Expand);
  setOperationAction(ISD::SELECT_CC,    MVT::i8,    Custom);
  setOperationAction(ISD::SELECT_CC,    MVT::i16,   Custom);
  setOperationAction(ISD::SELECT_CC,    MVT::i32,   Custom);
  setOperationAction(ISD::SELECT_CC,    MVT::i64,   Custom);

  // SPU has no intrinsics for these particular operations:
  setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);

  // SPU has no division/remainder instructions
  setOperationAction(ISD::SREM,    MVT::i8,   Expand);
  setOperationAction(ISD::UREM,    MVT::i8,   Expand);
  setOperationAction(ISD::SDIV,    MVT::i8,   Expand);
  setOperationAction(ISD::UDIV,    MVT::i8,   Expand);
  setOperationAction(ISD::SDIVREM, MVT::i8,   Expand);
  setOperationAction(ISD::UDIVREM, MVT::i8,   Expand);
  setOperationAction(ISD::SREM,    MVT::i16,  Expand);
  setOperationAction(ISD::UREM,    MVT::i16,  Expand);
  setOperationAction(ISD::SDIV,    MVT::i16,  Expand);
  setOperationAction(ISD::UDIV,    MVT::i16,  Expand);
  setOperationAction(ISD::SDIVREM, MVT::i16,  Expand);
  setOperationAction(ISD::UDIVREM, MVT::i16,  Expand);
  setOperationAction(ISD::SREM,    MVT::i32,  Expand);
  setOperationAction(ISD::UREM,    MVT::i32,  Expand);
  setOperationAction(ISD::SDIV,    MVT::i32,  Expand);
  setOperationAction(ISD::UDIV,    MVT::i32,  Expand);
  setOperationAction(ISD::SDIVREM, MVT::i32,  Expand);
  setOperationAction(ISD::UDIVREM, MVT::i32,  Expand);
  setOperationAction(ISD::SREM,    MVT::i64,  Expand);
  setOperationAction(ISD::UREM,    MVT::i64,  Expand);
  setOperationAction(ISD::SDIV,    MVT::i64,  Expand);
  setOperationAction(ISD::UDIV,    MVT::i64,  Expand);
  setOperationAction(ISD::SDIVREM, MVT::i64,  Expand);
  setOperationAction(ISD::UDIVREM, MVT::i64,  Expand);
  setOperationAction(ISD::SREM,    MVT::i128, Expand);
  setOperationAction(ISD::UREM,    MVT::i128, Expand);
  setOperationAction(ISD::SDIV,    MVT::i128, Expand);
  setOperationAction(ISD::UDIV,    MVT::i128, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i128, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i128, Expand);

  // We don't support sin/cos/sqrt/fmod
  setOperationAction(ISD::FSIN , MVT::f64, Expand);
  setOperationAction(ISD::FCOS , MVT::f64, Expand);
  setOperationAction(ISD::FREM , MVT::f64, Expand);
  setOperationAction(ISD::FSIN , MVT::f32, Expand);
  setOperationAction(ISD::FCOS , MVT::f32, Expand);
  setOperationAction(ISD::FREM , MVT::f32, Expand);

  // Expand fsqrt to the appropriate libcall (NOTE: should use h/w fsqrt
  // for f32!)
  setOperationAction(ISD::FSQRT, MVT::f64, Expand);
  setOperationAction(ISD::FSQRT, MVT::f32, Expand);

  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

  // SPU can do rotate right and left, so legalize it... but customize for i8
  // because instructions don't exist.

  // FIXME: Change from "expand" to appropriate type once ROTR is supported in
  //        .td files.
  setOperationAction(ISD::ROTR, MVT::i32,    Expand /*Legal*/);
  setOperationAction(ISD::ROTR, MVT::i16,    Expand /*Legal*/);
  setOperationAction(ISD::ROTR, MVT::i8,     Expand /*Custom*/);

  setOperationAction(ISD::ROTL, MVT::i32,    Legal);
  setOperationAction(ISD::ROTL, MVT::i16,    Legal);
  setOperationAction(ISD::ROTL, MVT::i8,     Custom);

  // SPU has no native version of shift left/right for i8
  setOperationAction(ISD::SHL,  MVT::i8,     Custom);
  setOperationAction(ISD::SRL,  MVT::i8,     Custom);
  setOperationAction(ISD::SRA,  MVT::i8,     Custom);

  // Make these operations legal and handle them during instruction selection:
  setOperationAction(ISD::SHL,  MVT::i64,    Legal);
  setOperationAction(ISD::SRL,  MVT::i64,    Legal);
  setOperationAction(ISD::SRA,  MVT::i64,    Legal);

  // Custom lower i8, i32 and i64 multiplications
  setOperationAction(ISD::MUL,  MVT::i8,     Custom);
  setOperationAction(ISD::MUL,  MVT::i32,    Legal);
  setOperationAction(ISD::MUL,  MVT::i64,    Legal);

  // Expand double-width multiplication
  // FIXME: It would probably be reasonable to support some of these operations
  setOperationAction(ISD::UMUL_LOHI, MVT::i8,  Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i8,  Expand);
  setOperationAction(ISD::MULHU,     MVT::i8,  Expand);
  setOperationAction(ISD::MULHS,     MVT::i8,  Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand);
  setOperationAction(ISD::MULHU,     MVT::i16, Expand);
  setOperationAction(ISD::MULHS,     MVT::i16, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
  setOperationAction(ISD::MULHU,     MVT::i32, Expand);
  setOperationAction(ISD::MULHS,     MVT::i32, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::MULHU,     MVT::i64, Expand);
  setOperationAction(ISD::MULHS,     MVT::i64, Expand);

  // Need to custom handle (some) common i8, i64 math ops
  setOperationAction(ISD::ADD,  MVT::i8,     Custom);
  setOperationAction(ISD::ADD,  MVT::i64,    Legal);
  setOperationAction(ISD::SUB,  MVT::i8,     Custom);
  setOperationAction(ISD::SUB,  MVT::i64,    Legal);

  // SPU does not have BSWAP. It does have i32 support CTLZ.
  // CTPOP has to be custom lowered.
  setOperationAction(ISD::BSWAP, MVT::i32,   Expand);
  setOperationAction(ISD::BSWAP, MVT::i64,   Expand);

  setOperationAction(ISD::CTPOP, MVT::i8,    Custom);
  setOperationAction(ISD::CTPOP, MVT::i16,   Custom);
  setOperationAction(ISD::CTPOP, MVT::i32,   Custom);
  setOperationAction(ISD::CTPOP, MVT::i64,   Custom);
  setOperationAction(ISD::CTPOP, MVT::i128,  Expand);

  setOperationAction(ISD::CTTZ , MVT::i8,    Expand);
  setOperationAction(ISD::CTTZ , MVT::i16,   Expand);
  setOperationAction(ISD::CTTZ , MVT::i32,   Expand);
  setOperationAction(ISD::CTTZ , MVT::i64,   Expand);
  setOperationAction(ISD::CTTZ , MVT::i128,  Expand);

  setOperationAction(ISD::CTLZ , MVT::i8,    Promote);
  setOperationAction(ISD::CTLZ , MVT::i16,   Promote);
  setOperationAction(ISD::CTLZ , MVT::i32,   Legal);
  setOperationAction(ISD::CTLZ , MVT::i64,   Expand);
  setOperationAction(ISD::CTLZ , MVT::i128,  Expand);

  // SPU has a version of select that implements (a&~c)|(b&c), just like
  // select ought to work:
  setOperationAction(ISD::SELECT, MVT::i8,   Legal);
  setOperationAction(ISD::SELECT, MVT::i16,  Legal);
  setOperationAction(ISD::SELECT, MVT::i32,  Legal);
  setOperationAction(ISD::SELECT, MVT::i64,  Legal);

  setOperationAction(ISD::SETCC, MVT::i8,    Legal);
  setOperationAction(ISD::SETCC, MVT::i16,   Legal);
  setOperationAction(ISD::SETCC, MVT::i32,   Legal);
  setOperationAction(ISD::SETCC, MVT::i64,   Legal);
  setOperationAction(ISD::SETCC, MVT::f64,   Custom);

  // Custom lower i128 -> i64 truncates
  setOperationAction(ISD::TRUNCATE, MVT::i64, Custom);

  // Custom lower i32/i64 -> i128 sign extend
  setOperationAction(ISD::SIGN_EXTEND, MVT::i128, Custom);

  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
  // SPU has a legal FP -> signed INT instruction for f32, but for f64, need
  // to expand to a libcall, hence the custom lowering:
  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
  setOperationAction(ISD::FP_TO_SINT, MVT::i128, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::i128, Expand);

  // FDIV on SPU requires custom lowering
  setOperationAction(ISD::FDIV, MVT::f64, Expand);      // to libcall

  // SPU has [U|S]INT_TO_FP for f32->i32, but not for f64->i32, f64->i64:
  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8,  Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8,  Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);

  setOperationAction(ISD::BIT_CONVERT, MVT::i32, Legal);
  setOperationAction(ISD::BIT_CONVERT, MVT::f32, Legal);
  setOperationAction(ISD::BIT_CONVERT, MVT::i64, Legal);
  setOperationAction(ISD::BIT_CONVERT, MVT::f64, Legal);

  // We cannot sextinreg(i1).  Expand to shifts.
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  // We want to legalize GlobalAddress and ConstantPool nodes into the
  // appropriate instructions to materialize the address.
  for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::f128;
       ++sctype) {
    MVT::SimpleValueType VT = (MVT::SimpleValueType)sctype;

    setOperationAction(ISD::GlobalAddress,  VT, Custom);
    setOperationAction(ISD::ConstantPool,   VT, Custom);
    setOperationAction(ISD::JumpTable,      VT, Custom);
  }

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART           , MVT::Other, Custom);

  // Use the default implementation.
  setOperationAction(ISD::VAARG             , MVT::Other, Expand);
  setOperationAction(ISD::VACOPY            , MVT::Other, Expand);
  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
  setOperationAction(ISD::STACKSAVE         , MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE      , MVT::Other, Expand);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32  , Expand);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64  , Expand);

  // Cell SPU has instructions for converting between i64 and fp.
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

  // To take advantage of the above i64 FP_TO_SINT, promote i32 FP_TO_UINT
  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);

  // BUILD_PAIR can't be handled natively, and should be expanded to shl/or
  setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);

  // First set operation action for all vector types to expand. Then we
  // will selectively turn on ones that can be effectively codegen'd.
  addRegisterClass(MVT::v16i8, SPU::VECREGRegisterClass);
  addRegisterClass(MVT::v8i16, SPU::VECREGRegisterClass);
  addRegisterClass(MVT::v4i32, SPU::VECREGRegisterClass);
  addRegisterClass(MVT::v2i64, SPU::VECREGRegisterClass);
  addRegisterClass(MVT::v4f32, SPU::VECREGRegisterClass);
  addRegisterClass(MVT::v2f64, SPU::VECREGRegisterClass);

  for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
       i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
    MVT::SimpleValueType VT = (MVT::SimpleValueType)i;

    // add/sub are legal for all supported vector VT's.
    setOperationAction(ISD::ADD,     VT, Legal);
    setOperationAction(ISD::SUB,     VT, Legal);
    // mul has to be custom lowered.
    setOperationAction(ISD::MUL,     VT, Legal);

    setOperationAction(ISD::AND,     VT, Legal);
    setOperationAction(ISD::OR,      VT, Legal);
    setOperationAction(ISD::XOR,     VT, Legal);
    setOperationAction(ISD::LOAD,    VT, Custom);
    setOperationAction(ISD::SELECT,  VT, Legal);
    setOperationAction(ISD::STORE,   VT, Custom);

    // These operations need to be expanded:
    setOperationAction(ISD::SDIV,    VT, Expand);
    setOperationAction(ISD::SREM,    VT, Expand);
    setOperationAction(ISD::UDIV,    VT, Expand);
    setOperationAction(ISD::UREM,    VT, Expand);

    // Custom lower build_vector, constant pool spills, insert and
    // extract vector elements:
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::ConstantPool, VT, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
  }

  setOperationAction(ISD::AND, MVT::v16i8, Custom);
  setOperationAction(ISD::OR,  MVT::v16i8, Custom);
  setOperationAction(ISD::XOR, MVT::v16i8, Custom);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);

  setOperationAction(ISD::FDIV, MVT::v4f32, Legal);

  setShiftAmountType(MVT::i32);
  setBooleanContents(ZeroOrNegativeOneBooleanContent);

  setStackPointerRegisterToSaveRestore(SPU::R1);

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::SIGN_EXTEND);
  setTargetDAGCombine(ISD::ANY_EXTEND);

  computeRegisterProperties();

  // Set pre-RA register scheduler default to BURR, which produces slightly
  // better code than the default (could also be TDRR, but TargetLowering.h
  // needs a mod to support that model):
  setSchedulingPreference(Sched::RegPressure);
}

const char *
SPUTargetLowering::getTargetNodeName(unsigned Opcode) const
{
  if (node_names.empty()) {
    node_names[(unsigned) SPUISD::RET_FLAG] = "SPUISD::RET_FLAG";
    node_names[(unsigned) SPUISD::Hi] = "SPUISD::Hi";
    node_names[(unsigned) SPUISD::Lo] = "SPUISD::Lo";
    node_names[(unsigned) SPUISD::PCRelAddr] = "SPUISD::PCRelAddr";
    node_names[(unsigned) SPUISD::AFormAddr] = "SPUISD::AFormAddr";
    node_names[(unsigned) SPUISD::IndirectAddr] = "SPUISD::IndirectAddr";
    node_names[(unsigned) SPUISD::LDRESULT] = "SPUISD::LDRESULT";
    node_names[(unsigned) SPUISD::CALL] = "SPUISD::CALL";
    node_names[(unsigned) SPUISD::SHUFB] = "SPUISD::SHUFB";
    node_names[(unsigned) SPUISD::SHUFFLE_MASK] = "SPUISD::SHUFFLE_MASK";
    node_names[(unsigned) SPUISD::CNTB] = "SPUISD::CNTB";
    node_names[(unsigned) SPUISD::PREFSLOT2VEC] = "SPUISD::PREFSLOT2VEC";
    node_names[(unsigned) SPUISD::VEC2PREFSLOT] = "SPUISD::VEC2PREFSLOT";
    node_names[(unsigned) SPUISD::SHL_BITS] = "SPUISD::SHL_BITS";
    node_names[(unsigned) SPUISD::SHL_BYTES] = "SPUISD::SHL_BYTES";
    node_names[(unsigned) SPUISD::VEC_ROTL] = "SPUISD::VEC_ROTL";
    node_names[(unsigned) SPUISD::VEC_ROTR] = "SPUISD::VEC_ROTR";
    node_names[(unsigned) SPUISD::ROTBYTES_LEFT] = "SPUISD::ROTBYTES_LEFT";
    node_names[(unsigned) SPUISD::ROTBYTES_LEFT_BITS] =
            "SPUISD::ROTBYTES_LEFT_BITS";
    node_names[(unsigned) SPUISD::SELECT_MASK] = "SPUISD::SELECT_MASK";
    node_names[(unsigned) SPUISD::SELB] = "SPUISD::SELB";
    node_names[(unsigned) SPUISD::ADD64_MARKER] = "SPUISD::ADD64_MARKER";
    node_names[(unsigned) SPUISD::SUB64_MARKER] = "SPUISD::SUB64_MARKER";
    node_names[(unsigned) SPUISD::MUL64_MARKER] = "SPUISD::MUL64_MARKER";
  }

  std::map<unsigned, const char *>::iterator i = node_names.find(Opcode);

  return ((i != node_names.end()) ? i->second : 0);
}

/// getFunctionAlignment - Return the Log2 alignment of this function.
unsigned SPUTargetLowering::getFunctionAlignment(const Function *) const {
  return 3;
}

//===----------------------------------------------------------------------===//
// Return the Cell SPU's SETCC result type
//===----------------------------------------------------------------------===//

MVT::SimpleValueType SPUTargetLowering::getSetCCResultType(EVT VT) const {
  // i16 and i32 are valid SETCC result types
  return ((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) ?
    VT.getSimpleVT().SimpleTy :
    MVT::i32);
}

//===----------------------------------------------------------------------===//
// Calling convention code:
//===----------------------------------------------------------------------===//

#include "SPUGenCallingConv.inc"

//===----------------------------------------------------------------------===//
//  LowerOperation implementation
//===----------------------------------------------------------------------===//

/// Custom lower loads for CellSPU
/*!
 All CellSPU loads and stores are aligned to 16-byte boundaries, so for elements
 within a 16-byte block, we have to rotate to extract the requested element.

 For extending loads, we also want to ensure that the following sequence is
 emitted, e.g. for MVT::f32 extending load to MVT::f64:

\verbatim
%1  v16i8,ch = load
%2  v16i8,ch = rotate %1
%3  v4f8, ch = bitconvert %2
%4  f32      = vec2perfslot %3
%5  f64      = fp_extend %4
\endverbatim
*/
static SDValue
LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
  LoadSDNode *LN = cast<LoadSDNode>(Op);
  SDValue the_chain = LN->getChain();
  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
  EVT InVT = LN->getMemoryVT();
  EVT OutVT = Op.getValueType();
  ISD::LoadExtType ExtType = LN->getExtensionType();
  unsigned alignment = LN->getAlignment();
  int pso = prefslotOffset(InVT);
  DebugLoc dl = Op.getDebugLoc();
  EVT vecVT = InVT.isVector()? InVT: EVT::getVectorVT(*DAG.getContext(), InVT,
                                                  (128 / InVT.getSizeInBits()));

  // two sanity checks
  assert( LN->getAddressingMode() == ISD::UNINDEXED
          && "we should get only UNINDEXED adresses");
  // clean aligned loads can be selected as-is
  if (InVT.getSizeInBits() == 128 && alignment == 16)
    return SDValue();

  // Get pointerinfos to the memory chunk(s) that contain the data to load
  uint64_t mpi_offset = LN->getPointerInfo().Offset;
  mpi_offset -= mpi_offset%16;
  MachinePointerInfo lowMemPtr( LN->getPointerInfo().V, mpi_offset);
  MachinePointerInfo highMemPtr( LN->getPointerInfo().V, mpi_offset+16);


    SDValue result;
    SDValue basePtr = LN->getBasePtr();
    SDValue rotate;

    if (alignment == 16) {
      ConstantSDNode *CN;

      // Special cases for a known aligned load to simplify the base pointer
      // and the rotation amount:
      if (basePtr.getOpcode() == ISD::ADD
          && (CN = dyn_cast<ConstantSDNode > (basePtr.getOperand(1))) != 0) {
        // Known offset into basePtr
        int64_t offset = CN->getSExtValue();
        int64_t rotamt = int64_t((offset & 0xf) - pso);

        if (rotamt < 0)
          rotamt += 16;

        rotate = DAG.getConstant(rotamt, MVT::i16);

        // Simplify the base pointer for this case:
        basePtr = basePtr.getOperand(0);
        if ((offset & ~0xf) > 0) {
          basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
                                basePtr,
                                DAG.getConstant((offset & ~0xf), PtrVT));
        }
      } else if ((basePtr.getOpcode() == SPUISD::AFormAddr)
                 || (basePtr.getOpcode() == SPUISD::IndirectAddr
                     && basePtr.getOperand(0).getOpcode() == SPUISD::Hi
                     && basePtr.getOperand(1).getOpcode() == SPUISD::Lo)) {
        // Plain aligned a-form address: rotate into preferred slot
        // Same for (SPUindirect (SPUhi ...), (SPUlo ...))
        int64_t rotamt = -pso;
        if (rotamt < 0)
          rotamt += 16;
        rotate = DAG.getConstant(rotamt, MVT::i16);
      } else {
        // Offset the rotate amount by the basePtr and the preferred slot
        // byte offset
        int64_t rotamt = -pso;
        if (rotamt < 0)
          rotamt += 16;
        rotate = DAG.getNode(ISD::ADD, dl, PtrVT,
                             basePtr,
                             DAG.getConstant(rotamt, PtrVT));
      }
    } else {
      // Unaligned load: must be more pessimistic about addressing modes:
      if (basePtr.getOpcode() == ISD::ADD) {
        MachineFunction &MF = DAG.getMachineFunction();
        MachineRegisterInfo &RegInfo = MF.getRegInfo();
        unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
        SDValue Flag;

        SDValue Op0 = basePtr.getOperand(0);
        SDValue Op1 = basePtr.getOperand(1);

        if (isa<ConstantSDNode>(Op1)) {
          // Convert the (add <ptr>, <const>) to an indirect address contained
          // in a register. Note that this is done because we need to avoid
          // creating a 0(reg) d-form address due to the SPU's block loads.
          basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
          the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag);
          basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT);
        } else {
          // Convert the (add <arg1>, <arg2>) to an indirect address, which
          // will likely be lowered as a reg(reg) x-form address.
          basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
        }
      } else {
        basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
                              basePtr,
                              DAG.getConstant(0, PtrVT));
      }

      // Offset the rotate amount by the basePtr and the preferred slot
      // byte offset
      rotate = DAG.getNode(ISD::ADD, dl, PtrVT,
                           basePtr,
                           DAG.getConstant(-pso, PtrVT));
    }

    // Do the load as a i128 to allow possible shifting
    SDValue low = DAG.getLoad(MVT::i128, dl, the_chain, basePtr,
                         lowMemPtr,
                         LN->isVolatile(), LN->isNonTemporal(), 16);

  // When the size is not greater than alignment we get all data with just
  // one load
  if (alignment >= InVT.getSizeInBits()/8) {
    // Update the chain
    the_chain = low.getValue(1);

    // Rotate into the preferred slot:
    result = DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, MVT::i128,
                         low.getValue(0), rotate);

    // Convert the loaded v16i8 vector to the appropriate vector type
    // specified by the operand:
    EVT vecVT = EVT::getVectorVT(*DAG.getContext(),
                                 InVT, (128 / InVT.getSizeInBits()));
    result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT,
                         DAG.getNode(ISD::BIT_CONVERT, dl, vecVT, result));
  }
  // When alignment is less than the size, we might need (known only at
  // run-time) two loads
  // TODO: if the memory address is composed only from constants, we have
  // extra kowledge, and might avoid the second load
  else {
    // storage position offset from lower 16 byte aligned memory chunk
    SDValue offset = DAG.getNode( ISD::AND, dl, MVT::i32,
                                  basePtr, DAG.getConstant( 0xf, MVT::i32 ) );
    // 16 - offset
    SDValue offset_compl = DAG.getNode( ISD::SUB, dl, MVT::i32,
                                        DAG.getConstant( 16, MVT::i32),
                                        offset );
    // get a registerfull of ones. (this implementation is a workaround: LLVM
    // cannot handle 128 bit signed int constants)
    SDValue ones = DAG.getConstant( -1, MVT::v4i32 );
    ones = DAG.getNode( ISD::BIT_CONVERT, dl, MVT::i128, ones);

    SDValue high = DAG.getLoad(MVT::i128, dl, the_chain,
                               DAG.getNode(ISD::ADD, dl, PtrVT,
                                           basePtr,
                                           DAG.getConstant(16, PtrVT)),
                               highMemPtr,
                               LN->isVolatile(), LN->isNonTemporal(), 16);

    the_chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(1),
                                                              high.getValue(1));

    // Shift the (possible) high part right to compensate the misalignemnt.
    // if there is no highpart (i.e. value is i64 and offset is 4), this
    // will zero out the high value.
    high = DAG.getNode( SPUISD::SRL_BYTES, dl, MVT::i128, high,
                                     DAG.getNode( ISD::SUB, dl, MVT::i32,
                                                 DAG.getConstant( 16, MVT::i32),
                                                 offset
                                                ));

    // Shift the low similarily
    // TODO: add SPUISD::SHL_BYTES
    low = DAG.getNode( SPUISD::SHL_BYTES, dl, MVT::i128, low, offset );

    // Merge the two parts
    result = DAG.getNode( ISD::BIT_CONVERT, dl, vecVT,
                          DAG.getNode(ISD::OR, dl, MVT::i128, low, high));

    if (!InVT.isVector()) {
      result = DAG.getNode( SPUISD::VEC2PREFSLOT, dl, InVT, result );
     }

  }
    // Handle extending loads by extending the scalar result:
    if (ExtType == ISD::SEXTLOAD) {
      result = DAG.getNode(ISD::SIGN_EXTEND, dl, OutVT, result);
    } else if (ExtType == ISD::ZEXTLOAD) {
      result = DAG.getNode(ISD::ZERO_EXTEND, dl, OutVT, result);
    } else if (ExtType == ISD::EXTLOAD) {
      unsigned NewOpc = ISD::ANY_EXTEND;

      if (OutVT.isFloatingPoint())
        NewOpc = ISD::FP_EXTEND;

      result = DAG.getNode(NewOpc, dl, OutVT, result);
    }

    SDVTList retvts = DAG.getVTList(OutVT, MVT::Other);
    SDValue retops[2] = {
      result,
      the_chain
    };

    result = DAG.getNode(SPUISD::LDRESULT, dl, retvts,
                         retops, sizeof(retops) / sizeof(retops[0]));
    return result;
}

/// Custom lower stores for CellSPU
/*!
 All CellSPU stores are aligned to 16-byte boundaries, so for elements
 within a 16-byte block, we have to generate a shuffle to insert the
 requested element into its place, then store the resulting block.
 */
static SDValue
LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
  StoreSDNode *SN = cast<StoreSDNode>(Op);
  SDValue Value = SN->getValue();
  EVT VT = Value.getValueType();
  EVT StVT = (!SN->isTruncatingStore() ? VT : SN->getMemoryVT());
  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
  DebugLoc dl = Op.getDebugLoc();
  unsigned alignment = SN->getAlignment();
  SDValue result;
  EVT vecVT = StVT.isVector()? StVT: EVT::getVectorVT(*DAG.getContext(), StVT,
                                                 (128 / StVT.getSizeInBits()));
  // Get pointerinfos to the memory chunk(s) that contain the data to load
  uint64_t mpi_offset = SN->getPointerInfo().Offset;
  mpi_offset -= mpi_offset%16;
  MachinePointerInfo lowMemPtr( SN->getPointerInfo().V, mpi_offset);
  MachinePointerInfo highMemPtr( SN->getPointerInfo().V, mpi_offset+16);


  // two sanity checks
  assert( SN->getAddressingMode() == ISD::UNINDEXED
          && "we should get only UNINDEXED adresses");
  // clean aligned loads can be selected as-is
  if (StVT.getSizeInBits() == 128 && alignment == 16)
    return SDValue();


    SDValue alignLoadVec;
    SDValue basePtr = SN->getBasePtr();
    SDValue the_chain = SN->getChain();
    SDValue insertEltOffs;

    if (alignment == 16) {
      ConstantSDNode *CN;
      // Special cases for a known aligned load to simplify the base pointer
      // and insertion byte:
      if (basePtr.getOpcode() == ISD::ADD
          && (CN = dyn_cast<ConstantSDNode>(basePtr.getOperand(1))) != 0) {
        // Known offset into basePtr
        int64_t offset = CN->getSExtValue();

        // Simplify the base pointer for this case:
        basePtr = basePtr.getOperand(0);
        insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
                                    basePtr,
                                    DAG.getConstant((offset & 0xf), PtrVT));

        if ((offset & ~0xf) > 0) {
          basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
                                basePtr,
                                DAG.getConstant((offset & ~0xf), PtrVT));
        }
      } else {
        // Otherwise, assume it's at byte 0 of basePtr
        insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
                                    basePtr,
                                    DAG.getConstant(0, PtrVT));
        basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
                                    basePtr,
                                    DAG.getConstant(0, PtrVT));
      }
    } else {
      // Unaligned load: must be more pessimistic about addressing modes:
      if (basePtr.getOpcode() == ISD::ADD) {
        MachineFunction &MF = DAG.getMachineFunction();
        MachineRegisterInfo &RegInfo = MF.getRegInfo();
        unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
        SDValue Flag;

        SDValue Op0 = basePtr.getOperand(0);
        SDValue Op1 = basePtr.getOperand(1);

        if (isa<ConstantSDNode>(Op1)) {
          // Convert the (add <ptr>, <const>) to an indirect address contained
          // in a register. Note that this is done because we need to avoid
          // creating a 0(reg) d-form address due to the SPU's block loads.
          basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
          the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag);
          basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT);
        } else {
          // Convert the (add <arg1>, <arg2>) to an indirect address, which
          // will likely be lowered as a reg(reg) x-form address.
          basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
        }
      } else {
        basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
                              basePtr,
                              DAG.getConstant(0, PtrVT));
      }

      // Insertion point is solely determined by basePtr's contents
      insertEltOffs = DAG.getNode(ISD::ADD, dl, PtrVT,
                                  basePtr,
                                  DAG.getConstant(0, PtrVT));
    }

    // Load the lower part of the memory to which to store.
    SDValue low = DAG.getLoad(vecVT, dl, the_chain, basePtr,
                              lowMemPtr, SN->isVolatile(), SN->isNonTemporal(), 16);

  // if we don't need to store over the 16 byte boundary, one store suffices
  if (alignment >= StVT.getSizeInBits()/8) {
    // Update the chain
    the_chain = low.getValue(1);

    LoadSDNode *LN = cast<LoadSDNode>(low);
    SDValue theValue = SN->getValue();

    if (StVT != VT
        && (theValue.getOpcode() == ISD::AssertZext
            || theValue.getOpcode() == ISD::AssertSext)) {
      // Drill down and get the value for zero- and sign-extended
      // quantities
      theValue = theValue.getOperand(0);
    }

    // If the base pointer is already a D-form address, then just create
    // a new D-form address with a slot offset and the orignal base pointer.
    // Otherwise generate a D-form address with the slot offset relative
    // to the stack pointer, which is always aligned.
#if !defined(NDEBUG)
      if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
        errs() << "CellSPU LowerSTORE: basePtr = ";
        basePtr.getNode()->dump(&DAG);
        errs() << "\n";
      }
#endif

    SDValue insertEltOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, vecVT,
                                      insertEltOffs);
    SDValue vectorizeOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, vecVT,
                                      theValue);

    result = DAG.getNode(SPUISD::SHUFB, dl, vecVT,
                         vectorizeOp, low,
                         DAG.getNode(ISD::BIT_CONVERT, dl,
                                     MVT::v4i32, insertEltOp));

    result = DAG.getStore(the_chain, dl, result, basePtr,
                          lowMemPtr,
                          LN->isVolatile(), LN->isNonTemporal(),
                          16);

#if 0 && !defined(NDEBUG)
    if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
      const SDValue &currentRoot = DAG.getRoot();

      DAG.setRoot(result);
      errs() << "------- CellSPU:LowerStore result:\n";
      DAG.dump();
      errs() << "-------\n";
      DAG.setRoot(currentRoot);
    }
#endif
  }
  // do the store when it might cross the 16 byte memory access boundary.
  else {
    // TODO issue a warning if SN->isVolatile()== true? This is likely not
    // what the user wanted.

    // address offset from nearest lower 16byte alinged address
    SDValue offset = DAG.getNode(ISD::AND, dl, MVT::i32,
                                    SN->getBasePtr(),
                                    DAG.getConstant(0xf, MVT::i32));
    // 16 - offset
    SDValue offset_compl = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                           DAG.getConstant( 16, MVT::i32),
                                           offset);
    SDValue hi_shift = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                      DAG.getConstant( VT.getSizeInBits()/8,
                                                       MVT::i32),
                                      offset_compl);
    // 16 - sizeof(Value)
    SDValue surplus = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                     DAG.getConstant( 16, MVT::i32),
                                     DAG.getConstant( VT.getSizeInBits()/8,
                                                      MVT::i32));
    // get a registerfull of ones
    SDValue ones = DAG.getConstant(-1, MVT::v4i32);
    ones = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i128, ones);

    // Create the 128 bit masks that have ones where the data to store is
    // located.
    SDValue lowmask, himask;
    // if the value to store don't fill up the an entire 128 bits, zero
    // out the last bits of the mask so that only the value we want to store
    // is masked.
    // this is e.g. in the case of store i32, align 2
    if (!VT.isVector()){
      Value = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, Value);
      lowmask = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, ones, surplus);
      lowmask = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, lowmask,
                                                               surplus);
      Value = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i128, Value);
      Value = DAG.getNode(ISD::AND, dl, MVT::i128, Value, lowmask);

    }
    else {
      lowmask = ones;
      Value = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i128, Value);
    }
    // this will zero, if there are no data that goes to the high quad
    himask = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, lowmask,
                                                            offset_compl);
    lowmask = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, lowmask,
                                                             offset);

    // Load in the old data and zero out the parts that will be overwritten with
    // the new data to store.
    SDValue hi = DAG.getLoad(MVT::i128, dl, the_chain,
                               DAG.getNode(ISD::ADD, dl, PtrVT, basePtr,
                                           DAG.getConstant( 16, PtrVT)),
                               highMemPtr,
                               SN->isVolatile(), SN->isNonTemporal(), 16);
    the_chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(1),
                                                              hi.getValue(1));

    low = DAG.getNode(ISD::AND, dl, MVT::i128,
                        DAG.getNode( ISD::BIT_CONVERT, dl, MVT::i128, low),
                        DAG.getNode( ISD::XOR, dl, MVT::i128, lowmask, ones));
    hi = DAG.getNode(ISD::AND, dl, MVT::i128,
                        DAG.getNode( ISD::BIT_CONVERT, dl, MVT::i128, hi),
                        DAG.getNode( ISD::XOR, dl, MVT::i128, himask, ones));

    // Shift the Value to store into place. rlow contains the parts that go to
    // the lower memory chunk, rhi has the parts that go to the upper one.
    SDValue rlow = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, Value, offset);
    rlow = DAG.getNode(ISD::AND, dl, MVT::i128, rlow, lowmask);
    SDValue rhi = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, Value,
                                                            offset_compl);

    // Merge the old data and the new data and store the results
    // Need to convert vectors here to integer as 'OR'ing floats assert
    rlow = DAG.getNode(ISD::OR, dl, MVT::i128,
                          DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i128, low),
                          DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i128, rlow));
    rhi = DAG.getNode(ISD::OR, dl, MVT::i128,
                         DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i128, hi),
                         DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i128, rhi));

    low = DAG.getStore(the_chain, dl, rlow, basePtr,
                          lowMemPtr,
                          SN->isVolatile(), SN->isNonTemporal(), 16);
    hi  = DAG.getStore(the_chain, dl, rhi,
                            DAG.getNode(ISD::ADD, dl, PtrVT, basePtr,
                                        DAG.getConstant( 16, PtrVT)),
                            highMemPtr,
                            SN->isVolatile(), SN->isNonTemporal(), 16);
    result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(0),
                                                           hi.getValue(0));
  }

  return result;

}

//! Generate the address of a constant pool entry.
static SDValue
LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
  EVT PtrVT = Op.getValueType();
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
  const Constant *C = CP->getConstVal();
  SDValue CPI = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment());
  SDValue Zero = DAG.getConstant(0, PtrVT);
  const TargetMachine &TM = DAG.getTarget();
  // FIXME there is no actual debug info here
  DebugLoc dl = Op.getDebugLoc();

  if (TM.getRelocationModel() == Reloc::Static) {
    if (!ST->usingLargeMem()) {
      // Just return the SDValue with the constant pool address in it.
      return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, CPI, Zero);
    } else {
      SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, CPI, Zero);
      SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, CPI, Zero);
      return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo);
    }
  }

  llvm_unreachable("LowerConstantPool: Relocation model other than static"
                   " not supported.");
  return SDValue();
}

//! Alternate entry point for generating the address of a constant pool entry
SDValue
SPU::LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUTargetMachine &TM) {
  return ::LowerConstantPool(Op, DAG, TM.getSubtargetImpl());
}

static SDValue
LowerJumpTable(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
  EVT PtrVT = Op.getValueType();
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
  SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
  SDValue Zero = DAG.getConstant(0, PtrVT);
  const TargetMachine &TM = DAG.getTarget();
  // FIXME there is no actual debug info here
  DebugLoc dl = Op.getDebugLoc();

  if (TM.getRelocationModel() == Reloc::Static) {
    if (!ST->usingLargeMem()) {
      return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, JTI, Zero);
    } else {
      SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, JTI, Zero);
      SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, JTI, Zero);
      return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo);
    }
  }

  llvm_unreachable("LowerJumpTable: Relocation model other than static"
                   " not supported.");
  return SDValue();
}

static SDValue
LowerGlobalAddress(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
  EVT PtrVT = Op.getValueType();
  GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = GSDN->getGlobal();
  SDValue GA = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
                                          PtrVT, GSDN->getOffset());
  const TargetMachine &TM = DAG.getTarget();
  SDValue Zero = DAG.getConstant(0, PtrVT);
  // FIXME there is no actual debug info here
  DebugLoc dl = Op.getDebugLoc();

  if (TM.getRelocationModel() == Reloc::Static) {
    if (!ST->usingLargeMem()) {
      return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, GA, Zero);
    } else {
      SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, GA, Zero);
      SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, GA, Zero);
      return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo);
    }
  } else {
    report_fatal_error("LowerGlobalAddress: Relocation model other than static"
                      "not supported.");
    /*NOTREACHED*/
  }

  return SDValue();
}

//! Custom lower double precision floating point constants
static SDValue
LowerConstantFP(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  // FIXME there is no actual debug info here
  DebugLoc dl = Op.getDebugLoc();

  if (VT == MVT::f64) {
    ConstantFPSDNode *FP = cast<ConstantFPSDNode>(Op.getNode());

    assert((FP != 0) &&
           "LowerConstantFP: Node is not ConstantFPSDNode");

    uint64_t dbits = DoubleToBits(FP->getValueAPF().convertToDouble());
    SDValue T = DAG.getConstant(dbits, MVT::i64);
    SDValue Tvec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, T, T);
    return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT,
                       DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Tvec));
  }

  return SDValue();
}

SDValue
SPUTargetLowering::LowerFormalArguments(SDValue Chain,
                                        CallingConv::ID CallConv, bool isVarArg,
                                        const SmallVectorImpl<ISD::InputArg>
                                          &Ins,
                                        DebugLoc dl, SelectionDAG &DAG,
                                        SmallVectorImpl<SDValue> &InVals)
                                          const {

  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo *MFI = MF.getFrameInfo();
  MachineRegisterInfo &RegInfo = MF.getRegInfo();
  SPUFunctionInfo *FuncInfo = MF.getInfo<SPUFunctionInfo>();

  unsigned ArgOffset = SPUFrameInfo::minStackSize();
  unsigned ArgRegIdx = 0;
  unsigned StackSlotSize = SPUFrameInfo::stackSlotSize();

  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs,
                 *DAG.getContext());
  // FIXME: allow for other calling conventions
  CCInfo.AnalyzeFormalArguments(Ins, CCC_SPU);

  // Add DAG nodes to load the arguments or copy them out of registers.
  for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
    EVT ObjectVT = Ins[ArgNo].VT;
    unsigned ObjSize = ObjectVT.getSizeInBits()/8;
    SDValue ArgVal;
    CCValAssign &VA = ArgLocs[ArgNo];

    if (VA.isRegLoc()) {
      const TargetRegisterClass *ArgRegClass;

      switch (ObjectVT.getSimpleVT().SimpleTy) {
      default:
        report_fatal_error("LowerFormalArguments Unhandled argument type: " +
                           Twine(ObjectVT.getEVTString()));
      case MVT::i8:
        ArgRegClass = &SPU::R8CRegClass;
        break;
      case MVT::i16:
        ArgRegClass = &SPU::R16CRegClass;
        break;
      case MVT::i32:
        ArgRegClass = &SPU::R32CRegClass;
        break;
      case MVT::i64:
        ArgRegClass = &SPU::R64CRegClass;
        break;
      case MVT::i128:
        ArgRegClass = &SPU::GPRCRegClass;
        break;
      case MVT::f32:
        ArgRegClass = &SPU::R32FPRegClass;
        break;
      case MVT::f64:
        ArgRegClass = &SPU::R64FPRegClass;
        break;
      case MVT::v2f64:
      case MVT::v4f32:
      case MVT::v2i64:
      case MVT::v4i32:
      case MVT::v8i16:
      case MVT::v16i8:
        ArgRegClass = &SPU::VECREGRegClass;
        break;
      }

      unsigned VReg = RegInfo.createVirtualRegister(ArgRegClass);
      RegInfo.addLiveIn(VA.getLocReg(), VReg);
      ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
      ++ArgRegIdx;
    } else {
      // We need to load the argument to a virtual register if we determined
      // above that we ran out of physical registers of the appropriate type
      // or we're forced to do vararg
      int FI = MFI->CreateFixedObject(ObjSize, ArgOffset, true);
      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
      ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(),
                           false, false, 0);
      ArgOffset += StackSlotSize;
    }

    InVals.push_back(ArgVal);
    // Update the chain
    Chain = ArgVal.getOperand(0);
  }

  // vararg handling:
  if (isVarArg) {
    // FIXME: we should be able to query the argument registers from
    //        tablegen generated code.
    static const unsigned ArgRegs[] = {
      SPU::R3,  SPU::R4,  SPU::R5,  SPU::R6,  SPU::R7,  SPU::R8,  SPU::R9,
      SPU::R10, SPU::R11, SPU::R12, SPU::R13, SPU::R14, SPU::R15, SPU::R16,
      SPU::R17, SPU::R18, SPU::R19, SPU::R20, SPU::R21, SPU::R22, SPU::R23,
      SPU::R24, SPU::R25, SPU::R26, SPU::R27, SPU::R28, SPU::R29, SPU::R30,
      SPU::R31, SPU::R32, SPU::R33, SPU::R34, SPU::R35, SPU::R36, SPU::R37,
      SPU::R38, SPU::R39, SPU::R40, SPU::R41, SPU::R42, SPU::R43, SPU::R44,
      SPU::R45, SPU::R46, SPU::R47, SPU::R48, SPU::R49, SPU::R50, SPU::R51,
      SPU::R52, SPU::R53, SPU::R54, SPU::R55, SPU::R56, SPU::R57, SPU::R58,
      SPU::R59, SPU::R60, SPU::R61, SPU::R62, SPU::R63, SPU::R64, SPU::R65,
      SPU::R66, SPU::R67, SPU::R68, SPU::R69, SPU::R70, SPU::R71, SPU::R72,
      SPU::R73, SPU::R74, SPU::R75, SPU::R76, SPU::R77, SPU::R78, SPU::R79
    };
    // size of ArgRegs array
    unsigned NumArgRegs = 77;

    // We will spill (79-3)+1 registers to the stack
    SmallVector<SDValue, 79-3+1> MemOps;

    // Create the frame slot
    for (; ArgRegIdx != NumArgRegs; ++ArgRegIdx) {
      FuncInfo->setVarArgsFrameIndex(
        MFI->CreateFixedObject(StackSlotSize, ArgOffset, true));
      SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
      unsigned VReg = MF.addLiveIn(ArgRegs[ArgRegIdx], &SPU::R32CRegClass);
      SDValue ArgVal = DAG.getRegister(VReg, MVT::v16i8);
      SDValue Store = DAG.getStore(Chain, dl, ArgVal, FIN, MachinePointerInfo(),
                                   false, false, 0);
      Chain = Store.getOperand(0);
      MemOps.push_back(Store);

      // Increment address by stack slot size for the next stored argument
      ArgOffset += StackSlotSize;
    }
    if (!MemOps.empty())
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                          &MemOps[0], MemOps.size());
  }

  return Chain;
}

/// isLSAAddress - Return the immediate to use if the specified
/// value is representable as a LSA address.
static SDNode *isLSAAddress(SDValue Op, SelectionDAG &DAG) {
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
  if (!C) return 0;

  int Addr = C->getZExtValue();
  if ((Addr & 3) != 0 ||  // Low 2 bits are implicitly zero.
      (Addr << 14 >> 14) != Addr)
    return 0;  // Top 14 bits have to be sext of immediate.

  return DAG.getConstant((int)C->getZExtValue() >> 2, MVT::i32).getNode();
}

SDValue
SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
                             CallingConv::ID CallConv, bool isVarArg,
                             bool &isTailCall,
                             const SmallVectorImpl<ISD::OutputArg> &Outs,
                             const SmallVectorImpl<SDValue> &OutVals,
                             const SmallVectorImpl<ISD::InputArg> &Ins,
                             DebugLoc dl, SelectionDAG &DAG,
                             SmallVectorImpl<SDValue> &InVals) const {
  // CellSPU target does not yet support tail call optimization.
  isTailCall = false;

  const SPUSubtarget *ST = SPUTM.getSubtargetImpl();
  unsigned NumOps     = Outs.size();
  unsigned StackSlotSize = SPUFrameInfo::stackSlotSize();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs,
                 *DAG.getContext());
  // FIXME: allow for other calling conventions
  CCInfo.AnalyzeCallOperands(Outs, CCC_SPU);

  const unsigned NumArgRegs = ArgLocs.size();


  // Handy pointer type
  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();

  // Set up a copy of the stack pointer for use loading and storing any
  // arguments that may not fit in the registers available for argument
  // passing.
  SDValue StackPtr = DAG.getRegister(SPU::R1, MVT::i32);

  // Figure out which arguments are going to go in registers, and which in
  // memory.
  unsigned ArgOffset = SPUFrameInfo::minStackSize(); // Just below [LR]
  unsigned ArgRegIdx = 0;

  // Keep track of registers passing arguments
  std::vector<std::pair<unsigned, SDValue> > RegsToPass;
  // And the arguments passed on the stack
  SmallVector<SDValue, 8> MemOpChains;

  for (; ArgRegIdx != NumOps; ++ArgRegIdx) {
    SDValue Arg = OutVals[ArgRegIdx];
    CCValAssign &VA = ArgLocs[ArgRegIdx];

    // PtrOff will be used to store the current argument to the stack if a
    // register cannot be found for it.
    SDValue PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType());
    PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);

    switch (Arg.getValueType().getSimpleVT().SimpleTy) {
    default: llvm_unreachable("Unexpected ValueType for argument!");
    case MVT::i8:
    case MVT::i16:
    case MVT::i32:
    case MVT::i64:
    case MVT::i128:
    case MVT::f32:
    case MVT::f64:
    case MVT::v2i64:
    case MVT::v2f64:
    case MVT::v4f32:
    case MVT::v4i32:
    case MVT::v8i16:
    case MVT::v16i8:
      if (ArgRegIdx != NumArgRegs) {
        RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
      } else {
        MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
                                           MachinePointerInfo(),
                                           false, false, 0));
        ArgOffset += StackSlotSize;
      }
      break;
    }
  }

  // Accumulate how many bytes are to be pushed on the stack, including the
  // linkage area, and parameter passing area.  According to the SPU ABI,
  // we minimally need space for [LR] and [SP].
  unsigned NumStackBytes = ArgOffset - SPUFrameInfo::minStackSize();

  // Insert a call sequence start
  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumStackBytes,
                                                            true));

  if (!MemOpChains.empty()) {
    // Adjust the stack pointer for the stack arguments.
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                        &MemOpChains[0], MemOpChains.size());
  }

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDValue InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                             RegsToPass[i].second, InFlag);
    InFlag = Chain.getValue(1);
  }

  SmallVector<SDValue, 8> Ops;
  unsigned CallOpc = SPUISD::CALL;

  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
  // node so that legalize doesn't hack it.
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    const GlobalValue *GV = G->getGlobal();
    EVT CalleeVT = Callee.getValueType();
    SDValue Zero = DAG.getConstant(0, PtrVT);
    SDValue GA = DAG.getTargetGlobalAddress(GV, dl, CalleeVT);

    if (!ST->usingLargeMem()) {
      // Turn calls to targets that are defined (i.e., have bodies) into BRSL
      // style calls, otherwise, external symbols are BRASL calls. This assumes
      // that declared/defined symbols are in the same compilation unit and can
      // be reached through PC-relative jumps.
      //
      // NOTE:
      // This may be an unsafe assumption for JIT and really large compilation
      // units.
      if (GV->isDeclaration()) {
        Callee = DAG.getNode(SPUISD::AFormAddr, dl, CalleeVT, GA, Zero);
      } else {
        Callee = DAG.getNode(SPUISD::PCRelAddr, dl, CalleeVT, GA, Zero);
      }
    } else {
      // "Large memory" mode: Turn all calls into indirect calls with a X-form
      // address pairs:
      Callee = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, GA, Zero);
    }
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
    EVT CalleeVT = Callee.getValueType();
    SDValue Zero = DAG.getConstant(0, PtrVT);
    SDValue ExtSym = DAG.getTargetExternalSymbol(S->getSymbol(),
        Callee.getValueType());

    if (!ST->usingLargeMem()) {
      Callee = DAG.getNode(SPUISD::AFormAddr, dl, CalleeVT, ExtSym, Zero);
    } else {
      Callee = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, ExtSym, Zero);
    }
  } else if (SDNode *Dest = isLSAAddress(Callee, DAG)) {
    // If this is an absolute destination address that appears to be a legal
    // local store address, use the munged value.
    Callee = SDValue(Dest, 0);
  }

  Ops.push_back(Chain);
  Ops.push_back(Callee);

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));

  if (InFlag.getNode())
    Ops.push_back(InFlag);
  // Returns a chain and a flag for retval copy to use.
  Chain = DAG.getNode(CallOpc, dl, DAG.getVTList(MVT::Other, MVT::Flag),
                      &Ops[0], Ops.size());
  InFlag = Chain.getValue(1);

  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumStackBytes, true),
                             DAG.getIntPtrConstant(0, true), InFlag);
  if (!Ins.empty())
    InFlag = Chain.getValue(1);

  // If the function returns void, just return the chain.
  if (Ins.empty())
    return Chain;

  // Now handle the return value(s)
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCRetInfo(CallConv, isVarArg, getTargetMachine(),
                    RVLocs, *DAG.getContext());
  CCRetInfo.AnalyzeCallResult(Ins, CCC_SPU);


  // If the call has results, copy the values out of the ret val registers.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign VA = RVLocs[i];

    SDValue Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
                                     InFlag);
    Chain = Val.getValue(1);
    InFlag = Val.getValue(2);
    InVals.push_back(Val);
   }

  return Chain;
}

SDValue
SPUTargetLowering::LowerReturn(SDValue Chain,
                               CallingConv::ID CallConv, bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               DebugLoc dl, SelectionDAG &DAG) const {

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC_SPU);

  // If this is the first return lowered for this function, add the regs to the
  // liveout set for the function.
  if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
    for (unsigned i = 0; i != RVLocs.size(); ++i)
      DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
  }

  SDValue Flag;

  // Copy the result values into the output registers.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");
    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                             OutVals[i], Flag);
    Flag = Chain.getValue(1);
  }

  if (Flag.getNode())
    return DAG.getNode(SPUISD::RET_FLAG, dl, MVT::Other, Chain, Flag);
  else
    return DAG.getNode(SPUISD::RET_FLAG, dl, MVT::Other, Chain);
}


//===----------------------------------------------------------------------===//
// Vector related lowering:
//===----------------------------------------------------------------------===//

static ConstantSDNode *
getVecImm(SDNode *N) {
  SDValue OpVal(0, 0);

  // Check to see if this buildvec has a single non-undef value in its elements.
  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
    if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
    if (OpVal.getNode() == 0)
      OpVal = N->getOperand(i);
    else if (OpVal != N->getOperand(i))
      return 0;
  }

  if (OpVal.getNode() != 0) {
    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) {
      return CN;
    }
  }

  return 0;
}

/// get_vec_i18imm - Test if this vector is a vector filled with the same value
/// and the value fits into an unsigned 18-bit constant, and if so, return the
/// constant
SDValue SPU::get_vec_u18imm(SDNode *N, SelectionDAG &DAG,
                              EVT ValueType) {
  if (ConstantSDNode *CN = getVecImm(N)) {
    uint64_t Value = CN->getZExtValue();
    if (ValueType == MVT::i64) {
      uint64_t UValue = CN->getZExtValue();
      uint32_t upper = uint32_t(UValue >> 32);
      uint32_t lower = uint32_t(UValue);
      if (upper != lower)
        return SDValue();
      Value = Value >> 32;
    }
    if (Value <= 0x3ffff)
      return DAG.getTargetConstant(Value, ValueType);
  }

  return SDValue();
}

/// get_vec_i16imm - Test if this vector is a vector filled with the same value
/// and the value fits into a signed 16-bit constant, and if so, return the
/// constant
SDValue SPU::get_vec_i16imm(SDNode *N, SelectionDAG &DAG,
                              EVT ValueType) {
  if (ConstantSDNode *CN = getVecImm(N)) {
    int64_t Value = CN->getSExtValue();
    if (ValueType == MVT::i64) {
      uint64_t UValue = CN->getZExtValue();
      uint32_t upper = uint32_t(UValue >> 32);
      uint32_t lower = uint32_t(UValue);
      if (upper != lower)
        return SDValue();
      Value = Value >> 32;
    }
    if (Value >= -(1 << 15) && Value <= ((1 << 15) - 1)) {
      return DAG.getTargetConstant(Value, ValueType);
    }
  }

  return SDValue();
}

/// get_vec_i10imm - Test if this vector is a vector filled with the same value
/// and the value fits into a signed 10-bit constant, and if so, return the
/// constant
SDValue SPU::get_vec_i10imm(SDNode *N, SelectionDAG &DAG,
                              EVT ValueType) {
  if (ConstantSDNode *CN = getVecImm(N)) {
    int64_t Value = CN->getSExtValue();
    if (ValueType == MVT::i64) {
      uint64_t UValue = CN->getZExtValue();
      uint32_t upper = uint32_t(UValue >> 32);
      uint32_t lower = uint32_t(UValue);
      if (upper != lower)
        return SDValue();
      Value = Value >> 32;
    }
    if (isInt<10>(Value))
      return DAG.getTargetConstant(Value, ValueType);
  }

  return SDValue();
}

/// get_vec_i8imm - Test if this vector is a vector filled with the same value
/// and the value fits into a signed 8-bit constant, and if so, return the
/// constant.
///
/// @note: The incoming vector is v16i8 because that's the only way we can load
/// constant vectors. Thus, we test to see if the upper and lower bytes are the
/// same value.
SDValue SPU::get_vec_i8imm(SDNode *N, SelectionDAG &DAG,
                             EVT ValueType) {
  if (ConstantSDNode *CN = getVecImm(N)) {
    int Value = (int) CN->getZExtValue();
    if (ValueType == MVT::i16
        && Value <= 0xffff                 /* truncated from uint64_t */
        && ((short) Value >> 8) == ((short) Value & 0xff))
      return DAG.getTargetConstant(Value & 0xff, ValueType);
    else if (ValueType == MVT::i8
             && (Value & 0xff) == Value)
      return DAG.getTargetConstant(Value, ValueType);
  }

  return SDValue();
}

/// get_ILHUvec_imm - Test if this vector is a vector filled with the same value
/// and the value fits into a signed 16-bit constant, and if so, return the
/// constant
SDValue SPU::get_ILHUvec_imm(SDNode *N, SelectionDAG &DAG,
                               EVT ValueType) {
  if (ConstantSDNode *CN = getVecImm(N)) {
    uint64_t Value = CN->getZExtValue();
    if ((ValueType == MVT::i32
          && ((unsigned) Value & 0xffff0000) == (unsigned) Value)
        || (ValueType == MVT::i64 && (Value & 0xffff0000) == Value))
      return DAG.getTargetConstant(Value >> 16, ValueType);
  }

  return SDValue();
}

/// get_v4i32_imm - Catch-all for general 32-bit constant vectors
SDValue SPU::get_v4i32_imm(SDNode *N, SelectionDAG &DAG) {
  if (ConstantSDNode *CN = getVecImm(N)) {
    return DAG.getTargetConstant((unsigned) CN->getZExtValue(), MVT::i32);
  }

  return SDValue();
}

/// get_v4i32_imm - Catch-all for general 64-bit constant vectors
SDValue SPU::get_v2i64_imm(SDNode *N, SelectionDAG &DAG) {
  if (ConstantSDNode *CN = getVecImm(N)) {
    return DAG.getTargetConstant((unsigned) CN->getZExtValue(), MVT::i64);
  }

  return SDValue();
}

//! Lower a BUILD_VECTOR instruction creatively:
static SDValue
LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  EVT EltVT = VT.getVectorElementType();
  DebugLoc dl = Op.getDebugLoc();
  BuildVectorSDNode *BCN = dyn_cast<BuildVectorSDNode>(Op.getNode());
  assert(BCN != 0 && "Expected BuildVectorSDNode in SPU LowerBUILD_VECTOR");
  unsigned minSplatBits = EltVT.getSizeInBits();

  if (minSplatBits < 16)
    minSplatBits = 16;

  APInt APSplatBits, APSplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;

  if (!BCN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
                            HasAnyUndefs, minSplatBits)
      || minSplatBits < SplatBitSize)
    return SDValue();   // Wasn't a constant vector or splat exceeded min

  uint64_t SplatBits = APSplatBits.getZExtValue();

  switch (VT.getSimpleVT().SimpleTy) {
  default:
    report_fatal_error("CellSPU: Unhandled VT in LowerBUILD_VECTOR, VT = " +
                       Twine(VT.getEVTString()));
    /*NOTREACHED*/
  case MVT::v4f32: {
    uint32_t Value32 = uint32_t(SplatBits);
    assert(SplatBitSize == 32
           && "LowerBUILD_VECTOR: Unexpected floating point vector element.");
    // NOTE: pretend the constant is an integer. LLVM won't load FP constants
    SDValue T = DAG.getConstant(Value32, MVT::i32);
    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32,
                       DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, T,T,T,T));
    break;
  }
  case MVT::v2f64: {
    uint64_t f64val = uint64_t(SplatBits);
    assert(SplatBitSize == 64
           && "LowerBUILD_VECTOR: 64-bit float vector size > 8 bytes.");
    // NOTE: pretend the constant is an integer. LLVM won't load FP constants
    SDValue T = DAG.getConstant(f64val, MVT::i64);
    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64,
                       DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, T, T));
    break;
  }
  case MVT::v16i8: {
   // 8-bit constants have to be expanded to 16-bits
   unsigned short Value16 = SplatBits /* | (SplatBits << 8) */;
   SmallVector<SDValue, 8> Ops;

   Ops.assign(8, DAG.getConstant(Value16, MVT::i16));
   return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
                      DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i16, &Ops[0], Ops.size()));
  }
  case MVT::v8i16: {
    unsigned short Value16 = SplatBits;
    SDValue T = DAG.getConstant(Value16, EltVT);
    SmallVector<SDValue, 8> Ops;

    Ops.assign(8, T);
    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], Ops.size());
  }
  case MVT::v4i32: {
    SDValue T = DAG.getConstant(unsigned(SplatBits), VT.getVectorElementType());
    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, T, T, T, T);
  }
  case MVT::v2i64: {
    return SPU::LowerV2I64Splat(VT, DAG, SplatBits, dl);
  }
  }

  return SDValue();
}

/*!
 */
SDValue
SPU::LowerV2I64Splat(EVT OpVT, SelectionDAG& DAG, uint64_t SplatVal,
                     DebugLoc dl) {
  uint32_t upper = uint32_t(SplatVal >> 32);
  uint32_t lower = uint32_t(SplatVal);

  if (upper == lower) {
    // Magic constant that can be matched by IL, ILA, et. al.
    SDValue Val = DAG.getTargetConstant(upper, MVT::i32);
    return DAG.getNode(ISD::BIT_CONVERT, dl, OpVT,
                       DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                                   Val, Val, Val, Val));
  } else {
    bool upper_special, lower_special;

    // NOTE: This code creates common-case shuffle masks that can be easily
    // detected as common expressions. It is not attempting to create highly
    // specialized masks to replace any and all 0's, 0xff's and 0x80's.

    // Detect if the upper or lower half is a special shuffle mask pattern:
    upper_special = (upper == 0 || upper == 0xffffffff || upper == 0x80000000);
    lower_special = (lower == 0 || lower == 0xffffffff || lower == 0x80000000);

    // Both upper and lower are special, lower to a constant pool load:
    if (lower_special && upper_special) {
      SDValue SplatValCN = DAG.getConstant(SplatVal, MVT::i64);
      return DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64,
                         SplatValCN, SplatValCN);
    }

    SDValue LO32;
    SDValue HI32;
    SmallVector<SDValue, 16> ShufBytes;
    SDValue Result;

    // Create lower vector if not a special pattern
    if (!lower_special) {
      SDValue LO32C = DAG.getConstant(lower, MVT::i32);
      LO32 = DAG.getNode(ISD::BIT_CONVERT, dl, OpVT,
                         DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                                     LO32C, LO32C, LO32C, LO32C));
    }

    // Create upper vector if not a special pattern
    if (!upper_special) {
      SDValue HI32C = DAG.getConstant(upper, MVT::i32);
      HI32 = DAG.getNode(ISD::BIT_CONVERT, dl, OpVT,
                         DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                                     HI32C, HI32C, HI32C, HI32C));
    }

    // If either upper or lower are special, then the two input operands are
    // the same (basically, one of them is a "don't care")
    if (lower_special)
      LO32 = HI32;
    if (upper_special)
      HI32 = LO32;

    for (int i = 0; i < 4; ++i) {
      uint64_t val = 0;
      for (int j = 0; j < 4; ++j) {
        SDValue V;
        bool process_upper, process_lower;
        val <<= 8;
        process_upper = (upper_special && (i & 1) == 0);
        process_lower = (lower_special && (i & 1) == 1);

        if (process_upper || process_lower) {
          if ((process_upper && upper == 0)
                  || (process_lower && lower == 0))
            val |= 0x80;
          else if ((process_upper && upper == 0xffffffff)
                  || (process_lower && lower == 0xffffffff))
            val |= 0xc0;
          else if ((process_upper && upper == 0x80000000)
                  || (process_lower && lower == 0x80000000))
            val |= (j == 0 ? 0xe0 : 0x80);
        } else
          val |= i * 4 + j + ((i & 1) * 16);
      }

      ShufBytes.push_back(DAG.getConstant(val, MVT::i32));
    }

    return DAG.getNode(SPUISD::SHUFB, dl, OpVT, HI32, LO32,
                       DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                                   &ShufBytes[0], ShufBytes.size()));
  }
}

/// LowerVECTOR_SHUFFLE - Lower a vector shuffle (V1, V2, V3) to something on
/// which the Cell can operate. The code inspects V3 to ascertain whether the
/// permutation vector, V3, is monotonically increasing with one "exception"
/// element, e.g., (0, 1, _, 3). If this is the case, then generate a
/// SHUFFLE_MASK synthetic instruction. Otherwise, spill V3 to the constant pool.
/// In either case, the net result is going to eventually invoke SHUFB to
/// permute/shuffle the bytes from V1 and V2.
/// \note
/// SHUFFLE_MASK is eventually selected as one of the C*D instructions, generate
/// control word for byte/halfword/word insertion. This takes care of a single
/// element move from V2 into V1.
/// \note
/// SPUISD::SHUFB is eventually selected as Cell's <i>shufb</i> instructions.
static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
  const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  DebugLoc dl = Op.getDebugLoc();

  if (V2.getOpcode() == ISD::UNDEF) V2 = V1;

  // If we have a single element being moved from V1 to V2, this can be handled
  // using the C*[DX] compute mask instructions, but the vector elements have
  // to be monotonically increasing with one exception element, and the source
  // slot of the element to move must be the same as the destination.
  EVT VecVT = V1.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  unsigned EltsFromV2 = 0;
  unsigned V2EltOffset = 0;
  unsigned V2EltIdx0 = 0;
  unsigned CurrElt = 0;
  unsigned MaxElts = VecVT.getVectorNumElements();
  unsigned PrevElt = 0;
  bool monotonic = true;
  bool rotate = true;
  int rotamt=0;
  EVT maskVT;             // which of the c?d instructions to use

  if (EltVT == MVT::i8) {
    V2EltIdx0 = 16;
    maskVT = MVT::v16i8;
  } else if (EltVT == MVT::i16) {
    V2EltIdx0 = 8;
    maskVT = MVT::v8i16;
  } else if (EltVT == MVT::i32 || EltVT == MVT::f32) {
    V2EltIdx0 = 4;
    maskVT = MVT::v4i32;
  } else if (EltVT == MVT::i64 || EltVT == MVT::f64) {
    V2EltIdx0 = 2;
    maskVT = MVT::v2i64;
  } else
    llvm_unreachable("Unhandled vector type in LowerVECTOR_SHUFFLE");

  for (unsigned i = 0; i != MaxElts; ++i) {
    if (SVN->getMaskElt(i) < 0)
      continue;

    unsigned SrcElt = SVN->getMaskElt(i);

    if (monotonic) {
      if (SrcElt >= V2EltIdx0) {
        // TODO: optimize for the monotonic case when several consecutive
        // elements are taken form V2. Do we ever get such a case?
        if (EltsFromV2 == 0 && CurrElt == (SrcElt - V2EltIdx0))
          V2EltOffset = (SrcElt - V2EltIdx0) * (EltVT.getSizeInBits()/8);
        else
          monotonic = false;
        ++EltsFromV2;
      } else if (CurrElt != SrcElt) {
        monotonic = false;
      }

      ++CurrElt;
    }

    if (rotate) {
      if (PrevElt > 0 && SrcElt < MaxElts) {
        if ((PrevElt == SrcElt - 1)
            || (PrevElt == MaxElts - 1 && SrcElt == 0)) {
          rotamt = SrcElt-i;
          PrevElt = SrcElt;
        } else {
          rotate = false;
        }
      } else if (i == 0 || (PrevElt==0 && SrcElt==1)) {
        // First time or after a "wrap around"
        PrevElt = SrcElt;
      } else {
        // This isn't a rotation, takes elements from vector 2
        rotate = false;
      }
    }
  }

  if (EltsFromV2 == 1 && monotonic) {
    // Compute mask and shuffle
    EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();

    // As SHUFFLE_MASK becomes a c?d instruction, feed it an address
    // R1 ($sp) is used here only as it is guaranteed to have last bits zero
    SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
                                DAG.getRegister(SPU::R1, PtrVT),
                                DAG.getConstant(V2EltOffset, MVT::i32));
    SDValue ShufMaskOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl,
                                     maskVT, Pointer);

    // Use shuffle mask in SHUFB synthetic instruction:
    return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V2, V1,
                       ShufMaskOp);
  } else if (rotate) {
    if (rotamt < 0)
      rotamt +=MaxElts;
    rotamt *= EltVT.getSizeInBits()/8;
    return DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, V1.getValueType(),
                       V1, DAG.getConstant(rotamt, MVT::i16));
  } else {
   // Convert the SHUFFLE_VECTOR mask's input element units to the
   // actual bytes.
    unsigned BytesPerElement = EltVT.getSizeInBits()/8;

    SmallVector<SDValue, 16> ResultMask;
    for (unsigned i = 0, e = MaxElts; i != e; ++i) {
      unsigned SrcElt = SVN->getMaskElt(i) < 0 ? 0 : SVN->getMaskElt(i);

      for (unsigned j = 0; j < BytesPerElement; ++j)
        ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j,MVT::i8));
    }
    SDValue VPermMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8,
                                    &ResultMask[0], ResultMask.size());
    return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V1, V2, VPermMask);
  }
}

static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
  SDValue Op0 = Op.getOperand(0);                     // Op0 = the scalar
  DebugLoc dl = Op.getDebugLoc();

  if (Op0.getNode()->getOpcode() == ISD::Constant) {
    // For a constant, build the appropriate constant vector, which will
    // eventually simplify to a vector register load.

    ConstantSDNode *CN = cast<ConstantSDNode>(Op0.getNode());
    SmallVector<SDValue, 16> ConstVecValues;
    EVT VT;
    size_t n_copies;

    // Create a constant vector:
    switch (Op.getValueType().getSimpleVT().SimpleTy) {
    default: llvm_unreachable("Unexpected constant value type in "
                              "LowerSCALAR_TO_VECTOR");
    case MVT::v16i8: n_copies = 16; VT = MVT::i8; break;
    case MVT::v8i16: n_copies = 8; VT = MVT::i16; break;
    case MVT::v4i32: n_copies = 4; VT = MVT::i32; break;
    case MVT::v4f32: n_copies = 4; VT = MVT::f32; break;
    case MVT::v2i64: n_copies = 2; VT = MVT::i64; break;
    case MVT::v2f64: n_copies = 2; VT = MVT::f64; break;
    }

    SDValue CValue = DAG.getConstant(CN->getZExtValue(), VT);
    for (size_t j = 0; j < n_copies; ++j)
      ConstVecValues.push_back(CValue);

    return DAG.getNode(ISD::BUILD_VECTOR, dl, Op.getValueType(),
                       &ConstVecValues[0], ConstVecValues.size());
  } else {
    // Otherwise, copy the value from one register to another:
    switch (Op0.getValueType().getSimpleVT().SimpleTy) {
    default: llvm_unreachable("Unexpected value type in LowerSCALAR_TO_VECTOR");
    case MVT::i8:
    case MVT::i16:
    case MVT::i32:
    case MVT::i64:
    case MVT::f32:
    case MVT::f64:
      return DAG.getNode(SPUISD::PREFSLOT2VEC, dl, Op.getValueType(), Op0, Op0);
    }
  }

  return SDValue();
}

static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  SDValue N = Op.getOperand(0);
  SDValue Elt = Op.getOperand(1);
  DebugLoc dl = Op.getDebugLoc();
  SDValue retval;

  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
    // Constant argument:
    int EltNo = (int) C->getZExtValue();

    // sanity checks:
    if (VT == MVT::i8 && EltNo >= 16)
      llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i8 extraction slot > 15");
    else if (VT == MVT::i16 && EltNo >= 8)
      llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i16 extraction slot > 7");
    else if (VT == MVT::i32 && EltNo >= 4)
      llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i32 extraction slot > 4");
    else if (VT == MVT::i64 && EltNo >= 2)
      llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i64 extraction slot > 2");

    if (EltNo == 0 && (VT == MVT::i32 || VT == MVT::i64)) {
      // i32 and i64: Element 0 is the preferred slot
      return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, N);
    }

    // Need to generate shuffle mask and extract:
    int prefslot_begin = -1, prefslot_end = -1;
    int elt_byte = EltNo * VT.getSizeInBits() / 8;

    switch (VT.getSimpleVT().SimpleTy) {
    default:
      assert(false && "Invalid value type!");
    case MVT::i8: {
      prefslot_begin = prefslot_end = 3;
      break;
    }
    case MVT::i16: {
      prefslot_begin = 2; prefslot_end = 3;
      break;
    }
    case MVT::i32:
    case MVT::f32: {
      prefslot_begin = 0; prefslot_end = 3;
      break;
    }
    case MVT::i64:
    case MVT::f64: {
      prefslot_begin = 0; prefslot_end = 7;
      break;
    }
    }

    assert(prefslot_begin != -1 && prefslot_end != -1 &&
           "LowerEXTRACT_VECTOR_ELT: preferred slots uninitialized");

    unsigned int ShufBytes[16] = {
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    };
    for (int i = 0; i < 16; ++i) {
      // zero fill uppper part of preferred slot, don't care about the
      // other slots:
      unsigned int mask_val;
      if (i <= prefslot_end) {
        mask_val =
          ((i < prefslot_begin)
           ? 0x80
           : elt_byte + (i - prefslot_begin));

        ShufBytes[i] = mask_val;
      } else
        ShufBytes[i] = ShufBytes[i % (prefslot_end + 1)];
    }

    SDValue ShufMask[4];
    for (unsigned i = 0; i < sizeof(ShufMask)/sizeof(ShufMask[0]); ++i) {
      unsigned bidx = i * 4;
      unsigned int bits = ((ShufBytes[bidx] << 24) |
                           (ShufBytes[bidx+1] << 16) |
                           (ShufBytes[bidx+2] << 8) |
                           ShufBytes[bidx+3]);
      ShufMask[i] = DAG.getConstant(bits, MVT::i32);
    }

    SDValue ShufMaskVec =
      DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                  &ShufMask[0], sizeof(ShufMask)/sizeof(ShufMask[0]));

    retval = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT,
                         DAG.getNode(SPUISD::SHUFB, dl, N.getValueType(),
                                     N, N, ShufMaskVec));
  } else {
    // Variable index: Rotate the requested element into slot 0, then replicate
    // slot 0 across the vector
    EVT VecVT = N.getValueType();
    if (!VecVT.isSimple() || !VecVT.isVector()) {
      report_fatal_error("LowerEXTRACT_VECTOR_ELT: Must have a simple, 128-bit"
                        "vector type!");
    }

    // Make life easier by making sure the index is zero-extended to i32
    if (Elt.getValueType() != MVT::i32)
      Elt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Elt);

    // Scale the index to a bit/byte shift quantity
    APInt scaleFactor =
            APInt(32, uint64_t(16 / N.getValueType().getVectorNumElements()), false);
    unsigned scaleShift = scaleFactor.logBase2();
    SDValue vecShift;

    if (scaleShift > 0) {
      // Scale the shift factor:
      Elt = DAG.getNode(ISD::SHL, dl, MVT::i32, Elt,
                        DAG.getConstant(scaleShift, MVT::i32));
    }

    vecShift = DAG.getNode(SPUISD::SHL_BYTES, dl, VecVT, N, Elt);

    // Replicate the bytes starting at byte 0 across the entire vector (for
    // consistency with the notion of a unified register set)
    SDValue replicate;

    switch (VT.getSimpleVT().SimpleTy) {
    default:
      report_fatal_error("LowerEXTRACT_VECTOR_ELT(varable): Unhandled vector"
                        "type");
      /*NOTREACHED*/
    case MVT::i8: {
      SDValue factor = DAG.getConstant(0x00000000, MVT::i32);
      replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                              factor, factor, factor, factor);
      break;
    }
    case MVT::i16: {
      SDValue factor = DAG.getConstant(0x00010001, MVT::i32);
      replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                              factor, factor, factor, factor);
      break;
    }
    case MVT::i32:
    case MVT::f32: {
      SDValue factor = DAG.getConstant(0x00010203, MVT::i32);
      replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                              factor, factor, factor, factor);
      break;
    }
    case MVT::i64:
    case MVT::f64: {
      SDValue loFactor = DAG.getConstant(0x00010203, MVT::i32);
      SDValue hiFactor = DAG.getConstant(0x04050607, MVT::i32);
      replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                              loFactor, hiFactor, loFactor, hiFactor);
      break;
    }
    }

    retval = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT,
                         DAG.getNode(SPUISD::SHUFB, dl, VecVT,
                                     vecShift, vecShift, replicate));
  }

  return retval;
}

static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
  SDValue VecOp = Op.getOperand(0);
  SDValue ValOp = Op.getOperand(1);
  SDValue IdxOp = Op.getOperand(2);
  DebugLoc dl = Op.getDebugLoc();
  EVT VT = Op.getValueType();
  EVT eltVT = ValOp.getValueType();

  // use 0 when the lane to insert to is 'undef'
  int64_t Offset=0;
  if (IdxOp.getOpcode() != ISD::UNDEF) {
    ConstantSDNode *CN = cast<ConstantSDNode>(IdxOp);
    assert(CN != 0 && "LowerINSERT_VECTOR_ELT: Index is not constant!");
    Offset = (CN->getSExtValue()) * eltVT.getSizeInBits()/8;
  }

  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
  // Use $sp ($1) because it's always 16-byte aligned and it's available:
  SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
                                DAG.getRegister(SPU::R1, PtrVT),
                                DAG.getConstant(Offset, PtrVT));
  // widen the mask when dealing with half vectors
  EVT maskVT = EVT::getVectorVT(*(DAG.getContext()), VT.getVectorElementType(),
                                128/ VT.getVectorElementType().getSizeInBits());
  SDValue ShufMask = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, maskVT, Pointer);

  SDValue result =
    DAG.getNode(SPUISD::SHUFB, dl, VT,
                DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, ValOp),
                VecOp,
                DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, ShufMask));

  return result;
}

static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc,
                           const TargetLowering &TLI)
{
  SDValue N0 = Op.getOperand(0);      // Everything has at least one operand
  DebugLoc dl = Op.getDebugLoc();
  EVT ShiftVT = TLI.getShiftAmountTy();

  assert(Op.getValueType() == MVT::i8);
  switch (Opc) {
  default:
    llvm_unreachable("Unhandled i8 math operator");
    /*NOTREACHED*/
    break;
  case ISD::ADD: {
    // 8-bit addition: Promote the arguments up to 16-bits and truncate
    // the result:
    SDValue N1 = Op.getOperand(1);
    N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
    N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1);
    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
                       DAG.getNode(Opc, dl, MVT::i16, N0, N1));

  }

  case ISD::SUB: {
    // 8-bit subtraction: Promote the arguments up to 16-bits and truncate
    // the result:
    SDValue N1 = Op.getOperand(1);
    N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
    N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1);
    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
                       DAG.getNode(Opc, dl, MVT::i16, N0, N1));
  }
  case ISD::ROTR:
  case ISD::ROTL: {
    SDValue N1 = Op.getOperand(1);
    EVT N1VT = N1.getValueType();

    N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, N0);
    if (!N1VT.bitsEq(ShiftVT)) {
      unsigned N1Opc = N1.getValueType().bitsLT(ShiftVT)
                       ? ISD::ZERO_EXTEND
                       : ISD::TRUNCATE;
      N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
    }

    // Replicate lower 8-bits into upper 8:
    SDValue ExpandArg =
      DAG.getNode(ISD::OR, dl, MVT::i16, N0,
                  DAG.getNode(ISD::SHL, dl, MVT::i16,
                              N0, DAG.getConstant(8, MVT::i32)));

    // Truncate back down to i8
    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
                       DAG.getNode(Opc, dl, MVT::i16, ExpandArg, N1));
  }
  case ISD::SRL:
  case ISD::SHL: {
    SDValue N1 = Op.getOperand(1);
    EVT N1VT = N1.getValueType();

    N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, N0);
    if (!N1VT.bitsEq(ShiftVT)) {
      unsigned N1Opc = ISD::ZERO_EXTEND;

      if (N1.getValueType().bitsGT(ShiftVT))
        N1Opc = ISD::TRUNCATE;

      N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
    }

    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
                       DAG.getNode(Opc, dl, MVT::i16, N0, N1));
  }
  case ISD::SRA: {
    SDValue N1 = Op.getOperand(1);
    EVT N1VT = N1.getValueType();

    N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
    if (!N1VT.bitsEq(ShiftVT)) {
      unsigned N1Opc = ISD::SIGN_EXTEND;

      if (N1VT.bitsGT(ShiftVT))
        N1Opc = ISD::TRUNCATE;
      N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
    }

    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
                       DAG.getNode(Opc, dl, MVT::i16, N0, N1));
  }
  case ISD::MUL: {
    SDValue N1 = Op.getOperand(1);

    N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
    N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1);
    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
                       DAG.getNode(Opc, dl, MVT::i16, N0, N1));
    break;
  }
  }

  return SDValue();
}

//! Lower byte immediate operations for v16i8 vectors:
static SDValue
LowerByteImmed(SDValue Op, SelectionDAG &DAG) {
  SDValue ConstVec;
  SDValue Arg;
  EVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();

  ConstVec = Op.getOperand(0);
  Arg = Op.getOperand(1);
  if (ConstVec.getNode()->getOpcode() != ISD::BUILD_VECTOR) {
    if (ConstVec.getNode()->getOpcode() == ISD::BIT_CONVERT) {
      ConstVec = ConstVec.getOperand(0);
    } else {
      ConstVec = Op.getOperand(1);
      Arg = Op.getOperand(0);
      if (ConstVec.getNode()->getOpcode() == ISD::BIT_CONVERT) {
        ConstVec = ConstVec.getOperand(0);
      }
    }
  }

  if (ConstVec.getNode()->getOpcode() == ISD::BUILD_VECTOR) {
    BuildVectorSDNode *BCN = dyn_cast<BuildVectorSDNode>(ConstVec.getNode());
    assert(BCN != 0 && "Expected BuildVectorSDNode in SPU LowerByteImmed");

    APInt APSplatBits, APSplatUndef;
    unsigned SplatBitSize;
    bool HasAnyUndefs;
    unsigned minSplatBits = VT.getVectorElementType().getSizeInBits();

    if (BCN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
                              HasAnyUndefs, minSplatBits)
        && minSplatBits <= SplatBitSize) {
      uint64_t SplatBits = APSplatBits.getZExtValue();
      SDValue tc = DAG.getTargetConstant(SplatBits & 0xff, MVT::i8);

      SmallVector<SDValue, 16> tcVec;
      tcVec.assign(16, tc);
      return DAG.getNode(Op.getNode()->getOpcode(), dl, VT, Arg,
                         DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &tcVec[0], tcVec.size()));
    }
  }

  // These operations (AND, OR, XOR) are legal, they just couldn't be custom
  // lowered.  Return the operation, rather than a null SDValue.
  return Op;
}

//! Custom lowering for CTPOP (count population)
/*!
  Custom lowering code that counts the number ones in the input
  operand. SPU has such an instruction, but it counts the number of
  ones per byte, which then have to be accumulated.
*/
static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  EVT vecVT = EVT::getVectorVT(*DAG.getContext(),
                               VT, (128 / VT.getSizeInBits()));
  DebugLoc dl = Op.getDebugLoc();

  switch (VT.getSimpleVT().SimpleTy) {
  default:
    assert(false && "Invalid value type!");
  case MVT::i8: {
    SDValue N = Op.getOperand(0);
    SDValue Elt0 = DAG.getConstant(0, MVT::i32);

    SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N);
    SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote);

    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, CNTB, Elt0);
  }

  case MVT::i16: {
    MachineFunction &MF = DAG.getMachineFunction();
    MachineRegisterInfo &RegInfo = MF.getRegInfo();

    unsigned CNTB_reg = RegInfo.createVirtualRegister(&SPU::R16CRegClass);

    SDValue N = Op.getOperand(0);
    SDValue Elt0 = DAG.getConstant(0, MVT::i16);
    SDValue Mask0 = DAG.getConstant(0x0f, MVT::i16);
    SDValue Shift1 = DAG.getConstant(8, MVT::i32);

    SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N);
    SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote);

    // CNTB_result becomes the chain to which all of the virtual registers
    // CNTB_reg, SUM1_reg become associated:
    SDValue CNTB_result =
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, CNTB, Elt0);

    SDValue CNTB_rescopy =
      DAG.getCopyToReg(CNTB_result, dl, CNTB_reg, CNTB_result);

    SDValue Tmp1 = DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i16);

    return DAG.getNode(ISD::AND, dl, MVT::i16,
                       DAG.getNode(ISD::ADD, dl, MVT::i16,
                                   DAG.getNode(ISD::SRL, dl, MVT::i16,
                                               Tmp1, Shift1),
                                   Tmp1),
                       Mask0);
  }

  case MVT::i32: {
    MachineFunction &MF = DAG.getMachineFunction();
    MachineRegisterInfo &RegInfo = MF.getRegInfo();

    unsigned CNTB_reg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
    unsigned SUM1_reg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);

    SDValue N = Op.getOperand(0);
    SDValue Elt0 = DAG.getConstant(0, MVT::i32);
    SDValue Mask0 = DAG.getConstant(0xff, MVT::i32);
    SDValue Shift1 = DAG.getConstant(16, MVT::i32);
    SDValue Shift2 = DAG.getConstant(8, MVT::i32);

    SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N);
    SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote);

    // CNTB_result becomes the chain to which all of the virtual registers
    // CNTB_reg, SUM1_reg become associated:
    SDValue CNTB_result =
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, CNTB, Elt0);

    SDValue CNTB_rescopy =
      DAG.getCopyToReg(CNTB_result, dl, CNTB_reg, CNTB_result);

    SDValue Comp1 =
      DAG.getNode(ISD::SRL, dl, MVT::i32,
                  DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i32),
                  Shift1);

    SDValue Sum1 =
      DAG.getNode(ISD::ADD, dl, MVT::i32, Comp1,
                  DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i32));

    SDValue Sum1_rescopy =
      DAG.getCopyToReg(CNTB_result, dl, SUM1_reg, Sum1);

    SDValue Comp2 =
      DAG.getNode(ISD::SRL, dl, MVT::i32,
                  DAG.getCopyFromReg(Sum1_rescopy, dl, SUM1_reg, MVT::i32),
                  Shift2);
    SDValue Sum2 =
      DAG.getNode(ISD::ADD, dl, MVT::i32, Comp2,
                  DAG.getCopyFromReg(Sum1_rescopy, dl, SUM1_reg, MVT::i32));

    return DAG.getNode(ISD::AND, dl, MVT::i32, Sum2, Mask0);
  }

  case MVT::i64:
    break;
  }

  return SDValue();
}

//! Lower ISD::FP_TO_SINT, ISD::FP_TO_UINT for i32
/*!
 f32->i32 passes through unchanged, whereas f64->i32 expands to a libcall.
 All conversions to i64 are expanded to a libcall.
 */
static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
                              const SPUTargetLowering &TLI) {
  EVT OpVT = Op.getValueType();
  SDValue Op0 = Op.getOperand(0);
  EVT Op0VT = Op0.getValueType();

  if ((OpVT == MVT::i32 && Op0VT == MVT::f64)
      || OpVT == MVT::i64) {
    // Convert f32 / f64 to i32 / i64 via libcall.
    RTLIB::Libcall LC =
            (Op.getOpcode() == ISD::FP_TO_SINT)
             ? RTLIB::getFPTOSINT(Op0VT, OpVT)
             : RTLIB::getFPTOUINT(Op0VT, OpVT);
    assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpectd fp-to-int conversion!");
    SDValue Dummy;
    return ExpandLibCall(LC, Op, DAG, false, Dummy, TLI);
  }

  return Op;
}

//! Lower ISD::SINT_TO_FP, ISD::UINT_TO_FP for i32
/*!
 i32->f32 passes through unchanged, whereas i32->f64 is expanded to a libcall.
 All conversions from i64 are expanded to a libcall.
 */
static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG,
                              const SPUTargetLowering &TLI) {
  EVT OpVT = Op.getValueType();
  SDValue Op0 = Op.getOperand(0);
  EVT Op0VT = Op0.getValueType();

  if ((OpVT == MVT::f64 && Op0VT == MVT::i32)
      || Op0VT == MVT::i64) {
    // Convert i32, i64 to f64 via libcall:
    RTLIB::Libcall LC =
            (Op.getOpcode() == ISD::SINT_TO_FP)
             ? RTLIB::getSINTTOFP(Op0VT, OpVT)
             : RTLIB::getUINTTOFP(Op0VT, OpVT);
    assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpectd int-to-fp conversion!");
    SDValue Dummy;
    return ExpandLibCall(LC, Op, DAG, false, Dummy, TLI);
  }

  return Op;
}

//! Lower ISD::SETCC
/*!
 This handles MVT::f64 (double floating point) condition lowering
 */
static SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG,
                          const TargetLowering &TLI) {
  CondCodeSDNode *CC = dyn_cast<CondCodeSDNode>(Op.getOperand(2));
  DebugLoc dl = Op.getDebugLoc();
  assert(CC != 0 && "LowerSETCC: CondCodeSDNode should not be null here!\n");

  SDValue lhs = Op.getOperand(0);
  SDValue rhs = Op.getOperand(1);
  EVT lhsVT = lhs.getValueType();
  assert(lhsVT == MVT::f64 && "LowerSETCC: type other than MVT::64\n");

  EVT ccResultVT = TLI.getSetCCResultType(lhs.getValueType());
  APInt ccResultOnes = APInt::getAllOnesValue(ccResultVT.getSizeInBits());
  EVT IntVT(MVT::i64);

  // Take advantage of the fact that (truncate (sra arg, 32)) is efficiently
  // selected to a NOP:
  SDValue i64lhs = DAG.getNode(ISD::BIT_CONVERT, dl, IntVT, lhs);
  SDValue lhsHi32 =
          DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
                      DAG.getNode(ISD::SRL, dl, IntVT,
                                  i64lhs, DAG.getConstant(32, MVT::i32)));
  SDValue lhsHi32abs =
          DAG.getNode(ISD::AND, dl, MVT::i32,
                      lhsHi32, DAG.getConstant(0x7fffffff, MVT::i32));
  SDValue lhsLo32 =
          DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, i64lhs);

  // SETO and SETUO only use the lhs operand:
  if (CC->get() == ISD::SETO) {
    // Evaluates to true if Op0 is not [SQ]NaN - lowers to the inverse of
    // SETUO
    APInt ccResultAllOnes = APInt::getAllOnesValue(ccResultVT.getSizeInBits());
    return DAG.getNode(ISD::XOR, dl, ccResultVT,
                       DAG.getSetCC(dl, ccResultVT,
                                    lhs, DAG.getConstantFP(0.0, lhsVT),
                                    ISD::SETUO),
                       DAG.getConstant(ccResultAllOnes, ccResultVT));
  } else if (CC->get() == ISD::SETUO) {
    // Evaluates to true if Op0 is [SQ]NaN
    return DAG.getNode(ISD::AND, dl, ccResultVT,
                       DAG.getSetCC(dl, ccResultVT,
                                    lhsHi32abs,
                                    DAG.getConstant(0x7ff00000, MVT::i32),
                                    ISD::SETGE),
                       DAG.getSetCC(dl, ccResultVT,
                                    lhsLo32,
                                    DAG.getConstant(0, MVT::i32),
                                    ISD::SETGT));
  }

  SDValue i64rhs = DAG.getNode(ISD::BIT_CONVERT, dl, IntVT, rhs);
  SDValue rhsHi32 =
          DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
                      DAG.getNode(ISD::SRL, dl, IntVT,
                                  i64rhs, DAG.getConstant(32, MVT::i32)));

  // If a value is negative, subtract from the sign magnitude constant:
  SDValue signMag2TC = DAG.getConstant(0x8000000000000000ULL, IntVT);

  // Convert the sign-magnitude representation into 2's complement:
  SDValue lhsSelectMask = DAG.getNode(ISD::SRA, dl, ccResultVT,
                                      lhsHi32, DAG.getConstant(31, MVT::i32));
  SDValue lhsSignMag2TC = DAG.getNode(ISD::SUB, dl, IntVT, signMag2TC, i64lhs);
  SDValue lhsSelect =
          DAG.getNode(ISD::SELECT, dl, IntVT,
                      lhsSelectMask, lhsSignMag2TC, i64lhs);

  SDValue rhsSelectMask = DAG.getNode(ISD::SRA, dl, ccResultVT,
                                      rhsHi32, DAG.getConstant(31, MVT::i32));
  SDValue rhsSignMag2TC = DAG.getNode(ISD::SUB, dl, IntVT, signMag2TC, i64rhs);
  SDValue rhsSelect =
          DAG.getNode(ISD::SELECT, dl, IntVT,
                      rhsSelectMask, rhsSignMag2TC, i64rhs);

  unsigned compareOp;

  switch (CC->get()) {
  case ISD::SETOEQ:
  case ISD::SETUEQ:
    compareOp = ISD::SETEQ; break;
  case ISD::SETOGT:
  case ISD::SETUGT:
    compareOp = ISD::SETGT; break;
  case ISD::SETOGE:
  case ISD::SETUGE:
    compareOp = ISD::SETGE; break;
  case ISD::SETOLT:
  case ISD::SETULT:
    compareOp = ISD::SETLT; break;
  case ISD::SETOLE:
  case ISD::SETULE:
    compareOp = ISD::SETLE; break;
  case ISD::SETUNE:
  case ISD::SETONE:
    compareOp = ISD::SETNE; break;
  default:
    report_fatal_error("CellSPU ISel Select: unimplemented f64 condition");
  }

  SDValue result =
          DAG.getSetCC(dl, ccResultVT, lhsSelect, rhsSelect,
                       (ISD::CondCode) compareOp);

  if ((CC->get() & 0x8) == 0) {
    // Ordered comparison:
    SDValue lhsNaN = DAG.getSetCC(dl, ccResultVT,
                                  lhs, DAG.getConstantFP(0.0, MVT::f64),
                                  ISD::SETO);
    SDValue rhsNaN = DAG.getSetCC(dl, ccResultVT,
                                  rhs, DAG.getConstantFP(0.0, MVT::f64),
                                  ISD::SETO);
    SDValue ordered = DAG.getNode(ISD::AND, dl, ccResultVT, lhsNaN, rhsNaN);

    result = DAG.getNode(ISD::AND, dl, ccResultVT, ordered, result);
  }

  return result;
}

//! Lower ISD::SELECT_CC
/*!
  ISD::SELECT_CC can (generally) be implemented directly on the SPU using the
  SELB instruction.

  \note Need to revisit this in the future: if the code path through the true
  and false value computations is longer than the latency of a branch (6
  cycles), then it would be more advantageous to branch and insert a new basic
  block and branch on the condition. However, this code does not make that
  assumption, given the simplisitc uses so far.
 */

static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG,
                              const TargetLowering &TLI) {
  EVT VT = Op.getValueType();
  SDValue lhs = Op.getOperand(0);
  SDValue rhs = Op.getOperand(1);
  SDValue trueval = Op.getOperand(2);
  SDValue falseval = Op.getOperand(3);
  SDValue condition = Op.getOperand(4);
  DebugLoc dl = Op.getDebugLoc();

  // NOTE: SELB's arguments: $rA, $rB, $mask
  //
  // SELB selects bits from $rA where bits in $mask are 0, bits from $rB
  // where bits in $mask are 1. CCond will be inverted, having 1s where the
  // condition was true and 0s where the condition was false. Hence, the
  // arguments to SELB get reversed.

  // Note: Really should be ISD::SELECT instead of SPUISD::SELB, but LLVM's
  // legalizer insists on combining SETCC/SELECT into SELECT_CC, so we end up
  // with another "cannot select select_cc" assert:

  SDValue compare = DAG.getNode(ISD::SETCC, dl,
                                TLI.getSetCCResultType(Op.getValueType()),
                                lhs, rhs, condition);
  return DAG.getNode(SPUISD::SELB, dl, VT, falseval, trueval, compare);
}

//! Custom lower ISD::TRUNCATE
static SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG)
{
  // Type to truncate to
  EVT VT = Op.getValueType();
  MVT simpleVT = VT.getSimpleVT();
  EVT VecVT = EVT::getVectorVT(*DAG.getContext(),
                               VT, (128 / VT.getSizeInBits()));
  DebugLoc dl = Op.getDebugLoc();

  // Type to truncate from
  SDValue Op0 = Op.getOperand(0);
  EVT Op0VT = Op0.getValueType();

  if (Op0VT == MVT::i128 && simpleVT == MVT::i64) {
    // Create shuffle mask, least significant doubleword of quadword
    unsigned maskHigh = 0x08090a0b;
    unsigned maskLow = 0x0c0d0e0f;
    // Use a shuffle to perform the truncation
    SDValue shufMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                                   DAG.getConstant(maskHigh, MVT::i32),
                                   DAG.getConstant(maskLow, MVT::i32),
                                   DAG.getConstant(maskHigh, MVT::i32),
                                   DAG.getConstant(maskLow, MVT::i32));

    SDValue truncShuffle = DAG.getNode(SPUISD::SHUFB, dl, VecVT,
                                       Op0, Op0, shufMask);

    return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, truncShuffle);
  }

  return SDValue();             // Leave the truncate unmolested
}

/*!
 * Emit the instruction sequence for i64/i32 -> i128 sign extend. The basic
 * algorithm is to duplicate the sign bit using rotmai to generate at
 * least one byte full of sign bits. Then propagate the "sign-byte" into
 * the leftmost words and the i64/i32 into the rightmost words using shufb.
 *
 * @param Op The sext operand
 * @param DAG The current DAG
 * @return The SDValue with the entire instruction sequence
 */
static SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG)
{
  DebugLoc dl = Op.getDebugLoc();

  // Type to extend to
  MVT OpVT = Op.getValueType().getSimpleVT();

  // Type to extend from
  SDValue Op0 = Op.getOperand(0);
  MVT Op0VT = Op0.getValueType().getSimpleVT();

  // The type to extend to needs to be a i128 and
  // the type to extend from needs to be i64 or i32.
  assert((OpVT == MVT::i128 && (Op0VT == MVT::i64 || Op0VT == MVT::i32)) &&
          "LowerSIGN_EXTEND: input and/or output operand have wrong size");

  // Create shuffle mask
  unsigned mask1 = 0x10101010; // byte 0 - 3 and 4 - 7
  unsigned mask2 = Op0VT == MVT::i64 ? 0x00010203 : 0x10101010; // byte  8 - 11
  unsigned mask3 = Op0VT == MVT::i64 ? 0x04050607 : 0x00010203; // byte 12 - 15
  SDValue shufMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                                 DAG.getConstant(mask1, MVT::i32),
                                 DAG.getConstant(mask1, MVT::i32),
                                 DAG.getConstant(mask2, MVT::i32),
                                 DAG.getConstant(mask3, MVT::i32));

  // Word wise arithmetic right shift to generate at least one byte
  // that contains sign bits.
  MVT mvt = Op0VT == MVT::i64 ? MVT::v2i64 : MVT::v4i32;
  SDValue sraVal = DAG.getNode(ISD::SRA,
                 dl,
                 mvt,
                 DAG.getNode(SPUISD::PREFSLOT2VEC, dl, mvt, Op0, Op0),
                 DAG.getConstant(31, MVT::i32));

  // reinterpret as a i128 (SHUFB requires it). This gets lowered away.
  SDValue extended = SDValue(DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
                                        dl, Op0VT, Op0,
                                        DAG.getTargetConstant(
                                                  SPU::GPRCRegClass.getID(),
                                                  MVT::i32)), 0);
  // Shuffle bytes - Copy the sign bits into the upper 64 bits
  // and the input value into the lower 64 bits.
  SDValue extShuffle = DAG.getNode(SPUISD::SHUFB, dl, mvt,
        extended, sraVal, shufMask);
  return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i128, extShuffle);
}

//! Custom (target-specific) lowering entry point
/*!
  This is where LLVM's DAG selection process calls to do target-specific
  lowering of nodes.
 */
SDValue
SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
{
  unsigned Opc = (unsigned) Op.getOpcode();
  EVT VT = Op.getValueType();

  switch (Opc) {
  default: {
#ifndef NDEBUG
    errs() << "SPUTargetLowering::LowerOperation(): need to lower this!\n";
    errs() << "Op.getOpcode() = " << Opc << "\n";
    errs() << "*Op.getNode():\n";
    Op.getNode()->dump();
#endif
    llvm_unreachable(0);
  }
  case ISD::LOAD:
  case ISD::EXTLOAD:
  case ISD::SEXTLOAD:
  case ISD::ZEXTLOAD:
    return LowerLOAD(Op, DAG, SPUTM.getSubtargetImpl());
  case ISD::STORE:
    return LowerSTORE(Op, DAG, SPUTM.getSubtargetImpl());
  case ISD::ConstantPool:
    return LowerConstantPool(Op, DAG, SPUTM.getSubtargetImpl());
  case ISD::GlobalAddress:
    return LowerGlobalAddress(Op, DAG, SPUTM.getSubtargetImpl());
  case ISD::JumpTable:
    return LowerJumpTable(Op, DAG, SPUTM.getSubtargetImpl());
  case ISD::ConstantFP:
    return LowerConstantFP(Op, DAG);

  // i8, i64 math ops:
  case ISD::ADD:
  case ISD::SUB:
  case ISD::ROTR:
  case ISD::ROTL:
  case ISD::SRL:
  case ISD::SHL:
  case ISD::SRA: {
    if (VT == MVT::i8)
      return LowerI8Math(Op, DAG, Opc, *this);
    break;
  }

  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return LowerFP_TO_INT(Op, DAG, *this);

  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:
    return LowerINT_TO_FP(Op, DAG, *this);

  // Vector-related lowering.
  case ISD::BUILD_VECTOR:
    return LowerBUILD_VECTOR(Op, DAG);
  case ISD::SCALAR_TO_VECTOR:
    return LowerSCALAR_TO_VECTOR(Op, DAG);
  case ISD::VECTOR_SHUFFLE:
    return LowerVECTOR_SHUFFLE(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT:
    return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_VECTOR_ELT:
    return LowerINSERT_VECTOR_ELT(Op, DAG);

  // Look for ANDBI, ORBI and XORBI opportunities and lower appropriately:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    return LowerByteImmed(Op, DAG);

  // Vector and i8 multiply:
  case ISD::MUL:
    if (VT == MVT::i8)
      return LowerI8Math(Op, DAG, Opc, *this);

  case ISD::CTPOP:
    return LowerCTPOP(Op, DAG);

  case ISD::SELECT_CC:
    return LowerSELECT_CC(Op, DAG, *this);

  case ISD::SETCC:
    return LowerSETCC(Op, DAG, *this);

  case ISD::TRUNCATE:
    return LowerTRUNCATE(Op, DAG);

  case ISD::SIGN_EXTEND:
    return LowerSIGN_EXTEND(Op, DAG);
  }

  return SDValue();
}

void SPUTargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue>&Results,
                                           SelectionDAG &DAG) const
{
#if 0
  unsigned Opc = (unsigned) N->getOpcode();
  EVT OpVT = N->getValueType(0);

  switch (Opc) {
  default: {
    errs() << "SPUTargetLowering::ReplaceNodeResults(): need to fix this!\n";
    errs() << "Op.getOpcode() = " << Opc << "\n";
    errs() << "*Op.getNode():\n";
    N->dump();
    abort();
    /*NOTREACHED*/
  }
  }
#endif

  /* Otherwise, return unchanged */
}

//===----------------------------------------------------------------------===//
// Target Optimization Hooks
//===----------------------------------------------------------------------===//

SDValue
SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
{
#if 0
  TargetMachine &TM = getTargetMachine();
#endif
  const SPUSubtarget *ST = SPUTM.getSubtargetImpl();
  SelectionDAG &DAG = DCI.DAG;
  SDValue Op0 = N->getOperand(0);       // everything has at least one operand
  EVT NodeVT = N->getValueType(0);      // The node's value type
  EVT Op0VT = Op0.getValueType();       // The first operand's result
  SDValue Result;                       // Initially, empty result
  DebugLoc dl = N->getDebugLoc();

  switch (N->getOpcode()) {
  default: break;
  case ISD::ADD: {
    SDValue Op1 = N->getOperand(1);

    if (Op0.getOpcode() == SPUISD::IndirectAddr
        || Op1.getOpcode() == SPUISD::IndirectAddr) {
      // Normalize the operands to reduce repeated code
      SDValue IndirectArg = Op0, AddArg = Op1;

      if (Op1.getOpcode() == SPUISD::IndirectAddr) {
        IndirectArg = Op1;
        AddArg = Op0;
      }

      if (isa<ConstantSDNode>(AddArg)) {
        ConstantSDNode *CN0 = cast<ConstantSDNode > (AddArg);
        SDValue IndOp1 = IndirectArg.getOperand(1);

        if (CN0->isNullValue()) {
          // (add (SPUindirect <arg>, <arg>), 0) ->
          // (SPUindirect <arg>, <arg>)

#if !defined(NDEBUG)
          if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
            errs() << "\n"
                 << "Replace: (add (SPUindirect <arg>, <arg>), 0)\n"
                 << "With:    (SPUindirect <arg>, <arg>)\n";
          }
#endif

          return IndirectArg;
        } else if (isa<ConstantSDNode>(IndOp1)) {
          // (add (SPUindirect <arg>, <const>), <const>) ->
          // (SPUindirect <arg>, <const + const>)
          ConstantSDNode *CN1 = cast<ConstantSDNode > (IndOp1);
          int64_t combinedConst = CN0->getSExtValue() + CN1->getSExtValue();
          SDValue combinedValue = DAG.getConstant(combinedConst, Op0VT);

#if !defined(NDEBUG)
          if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
            errs() << "\n"
                 << "Replace: (add (SPUindirect <arg>, " << CN1->getSExtValue()
                 << "), " << CN0->getSExtValue() << ")\n"
                 << "With:    (SPUindirect <arg>, "
                 << combinedConst << ")\n";
          }
#endif

          return DAG.getNode(SPUISD::IndirectAddr, dl, Op0VT,
                             IndirectArg, combinedValue);
        }
      }
    }
    break;
  }
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND: {
    if (Op0.getOpcode() == SPUISD::VEC2PREFSLOT && NodeVT == Op0VT) {
      // (any_extend (SPUextract_elt0 <arg>)) ->
      // (SPUextract_elt0 <arg>)
      // Types must match, however...
#if !defined(NDEBUG)
      if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
        errs() << "\nReplace: ";
        N->dump(&DAG);
        errs() << "\nWith:    ";
        Op0.getNode()->dump(&DAG);
        errs() << "\n";
      }
#endif

      return Op0;
    }
    break;
  }
  case SPUISD::IndirectAddr: {
    if (!ST->usingLargeMem() && Op0.getOpcode() == SPUISD::AFormAddr) {
      ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1));
      if (CN != 0 && CN->isNullValue()) {
        // (SPUindirect (SPUaform <addr>, 0), 0) ->
        // (SPUaform <addr>, 0)

        DEBUG(errs() << "Replace: ");
        DEBUG(N->dump(&DAG));
        DEBUG(errs() << "\nWith:    ");
        DEBUG(Op0.getNode()->dump(&DAG));
        DEBUG(errs() << "\n");

        return Op0;
      }
    } else if (Op0.getOpcode() == ISD::ADD) {
      SDValue Op1 = N->getOperand(1);
      if (ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(Op1)) {
        // (SPUindirect (add <arg>, <arg>), 0) ->
        // (SPUindirect <arg>, <arg>)
        if (CN1->isNullValue()) {

#if !defined(NDEBUG)
          if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
            errs() << "\n"
                 << "Replace: (SPUindirect (add <arg>, <arg>), 0)\n"
                 << "With:    (SPUindirect <arg>, <arg>)\n";
          }
#endif

          return DAG.getNode(SPUISD::IndirectAddr, dl, Op0VT,
                             Op0.getOperand(0), Op0.getOperand(1));
        }
      }
    }
    break;
  }
  case SPUISD::SHL_BITS:
  case SPUISD::SHL_BYTES:
  case SPUISD::ROTBYTES_LEFT: {
    SDValue Op1 = N->getOperand(1);

    // Kill degenerate vector shifts:
    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op1)) {
      if (CN->isNullValue()) {
        Result = Op0;
      }
    }
    break;
  }
  case SPUISD::PREFSLOT2VEC: {
    switch (Op0.getOpcode()) {
    default:
      break;
    case ISD::ANY_EXTEND:
    case ISD::ZERO_EXTEND:
    case ISD::SIGN_EXTEND: {
      // (SPUprefslot2vec (any|zero|sign_extend (SPUvec2prefslot <arg>))) ->
      // <arg>
      // but only if the SPUprefslot2vec and <arg> types match.
      SDValue Op00 = Op0.getOperand(0);
      if (Op00.getOpcode() == SPUISD::VEC2PREFSLOT) {
        SDValue Op000 = Op00.getOperand(0);
        if (Op000.getValueType() == NodeVT) {
          Result = Op000;
        }
      }
      break;
    }
    case SPUISD::VEC2PREFSLOT: {
      // (SPUprefslot2vec (SPUvec2prefslot <arg>)) ->
      // <arg>
      Result = Op0.getOperand(0);
      break;
    }
    }
    break;
  }
  }

  // Otherwise, return unchanged.
#ifndef NDEBUG
  if (Result.getNode()) {
    DEBUG(errs() << "\nReplace.SPU: ");
    DEBUG(N->dump(&DAG));
    DEBUG(errs() << "\nWith:        ");
    DEBUG(Result.getNode()->dump(&DAG));
    DEBUG(errs() << "\n");
  }
#endif

  return Result;
}

//===----------------------------------------------------------------------===//
// Inline Assembly Support
//===----------------------------------------------------------------------===//

/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
SPUTargetLowering::ConstraintType
SPUTargetLowering::getConstraintType(const std::string &ConstraintLetter) const {
  if (ConstraintLetter.size() == 1) {
    switch (ConstraintLetter[0]) {
    default: break;
    case 'b':
    case 'r':
    case 'f':
    case 'v':
    case 'y':
      return C_RegisterClass;
    }
  }
  return TargetLowering::getConstraintType(ConstraintLetter);
}

/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
SPUTargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  ConstraintWeight weight = CW_Invalid;
  Value *CallOperandVal = info.CallOperandVal;
    // If we don't have a value, we can't do a match,
    // but allow it at the lowest weight.
  if (CallOperandVal == NULL)
    return CW_Default;
  // Look at the constraint type.
  switch (*constraint) {
  default:
    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
    break;
    //FIXME: Seems like the supported constraint letters were just copied
    // from PPC, as the following doesn't correspond to the GCC docs.
    // I'm leaving it so until someone adds the corresponding lowering support.
  case 'b':
  case 'r':
  case 'f':
  case 'd':
  case 'v':
  case 'y':
    weight = CW_Register;
    break;
  }
  return weight;
}

std::pair<unsigned, const TargetRegisterClass*>
SPUTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
                                                EVT VT) const
{
  if (Constraint.size() == 1) {
    // GCC RS6000 Constraint Letters
    switch (Constraint[0]) {
    case 'b':   // R1-R31
    case 'r':   // R0-R31
      if (VT == MVT::i64)
        return std::make_pair(0U, SPU::R64CRegisterClass);
      return std::make_pair(0U, SPU::R32CRegisterClass);
    case 'f':
      if (VT == MVT::f32)
        return std::make_pair(0U, SPU::R32FPRegisterClass);
      else if (VT == MVT::f64)
        return std::make_pair(0U, SPU::R64FPRegisterClass);
      break;
    case 'v':
      return std::make_pair(0U, SPU::GPRCRegisterClass);
    }
  }

  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
}

//! Compute used/known bits for a SPU operand
void
SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
                                                  const APInt &Mask,
                                                  APInt &KnownZero,
                                                  APInt &KnownOne,
                                                  const SelectionDAG &DAG,
                                                  unsigned Depth ) const {
#if 0
  const uint64_t uint64_sizebits = sizeof(uint64_t) * CHAR_BIT;

  switch (Op.getOpcode()) {
  default:
    // KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);
    break;
  case CALL:
  case SHUFB:
  case SHUFFLE_MASK:
  case CNTB:
  case SPUISD::PREFSLOT2VEC:
  case SPUISD::LDRESULT:
  case SPUISD::VEC2PREFSLOT:
  case SPUISD::SHLQUAD_L_BITS:
  case SPUISD::SHLQUAD_L_BYTES:
  case SPUISD::VEC_ROTL:
  case SPUISD::VEC_ROTR:
  case SPUISD::ROTBYTES_LEFT:
  case SPUISD::SELECT_MASK:
  case SPUISD::SELB:
  }
#endif
}

unsigned
SPUTargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
                                                   unsigned Depth) const {
  switch (Op.getOpcode()) {
  default:
    return 1;

  case ISD::SETCC: {
    EVT VT = Op.getValueType();

    if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32) {
      VT = MVT::i32;
    }
    return VT.getSizeInBits();
  }
  }
}

// LowerAsmOperandForConstraint
void
SPUTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                char ConstraintLetter,
                                                std::vector<SDValue> &Ops,
                                                SelectionDAG &DAG) const {
  // Default, for the time being, to the base class handler
  TargetLowering::LowerAsmOperandForConstraint(Op, ConstraintLetter, Ops, DAG);
}

/// isLegalAddressImmediate - Return true if the integer value can be used
/// as the offset of the target addressing mode.
bool SPUTargetLowering::isLegalAddressImmediate(int64_t V,
                                                const Type *Ty) const {
  // SPU's addresses are 256K:
  return (V > -(1 << 18) && V < (1 << 18) - 1);
}

bool SPUTargetLowering::isLegalAddressImmediate(llvm::GlobalValue* GV) const {
  return false;
}

bool
SPUTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
  // The SPU target isn't yet aware of offsets.
  return false;
}

// can we compare to Imm without writing it into a register?
bool SPUTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
  //ceqi, cgti, etc. all take s10 operand
  return isInt<10>(Imm);
}

bool
SPUTargetLowering::isLegalAddressingMode(const AddrMode &AM,
                                         const Type * ) const{

  // A-form: 18bit absolute address.
  if (AM.BaseGV && !AM.HasBaseReg && AM.Scale == 0 && AM.BaseOffs == 0)
    return true;

  // D-form: reg + 14bit offset
  if (AM.BaseGV ==0 && AM.HasBaseReg && AM.Scale == 0 && isInt<14>(AM.BaseOffs))
    return true;

  // X-form: reg+reg
  if (AM.BaseGV == 0 && AM.HasBaseReg && AM.Scale == 1 && AM.BaseOffs ==0)
    return true;

  return false;
}