AMDGPU/R600: Delete/rename intrinsics no longer used by mesa

Use the replacement pass to update the tests, and delete old names. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@275375 91177308-0d34-0410-b5e6-96231b3b80d8
2025-02-04 19:38:22 +00:00 · 2016-07-14 05:47:17 +00:00 · 2016-07-14 05:47:17 +00:00 · 4d120e9b24
commit 4d120e9b24
parent 04e6d2604d
24 changed files with 1450 additions and 1721 deletions
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@ -29,7 +29,6 @@ class TargetMachine;

 // R600 Passes
 FunctionPass *createR600VectorRegMerger(TargetMachine &tm);
-FunctionPass *createR600TextureIntrinsicsReplacer();
 FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm);
 FunctionPass *createR600EmitClauseMarkers();
 FunctionPass *createR600ClauseMergePass(TargetMachine &tm);
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@ -432,7 +432,6 @@ bool R600PassConfig::addPreISel() {

  if (EnableR600StructurizeCFG)
    addPass(createStructurizeCFGPass());
-  addPass(createR600TextureIntrinsicsReplacer());
  return false;
 }

--- a/lib/Target/AMDGPU/CMakeLists.txt
+++ b/lib/Target/AMDGPU/CMakeLists.txt
@ -61,7 +61,6 @@ add_llvm_target(AMDGPUCodeGen
  R600OptimizeVectorRegisters.cpp
  R600Packetizer.cpp
  R600RegisterInfo.cpp
-  R600TextureIntrinsicsReplacer.cpp
  SIAnnotateControlFlow.cpp
  SIDebuggerInsertNops.cpp
  SIFixControlFlowLiveIntervals.cpp
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@ -738,7 +738,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
      };
      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
    }
-    case AMDGPUIntrinsic::AMDGPU_dp4: {
+    case AMDGPUIntrinsic::r600_dot4: {
      SDValue Args[8] = {
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
          DAG.getConstant(0, DL, MVT::i32)),
--- a/lib/Target/AMDGPU/R600Instructions.td
+++ b/lib/Target/AMDGPU/R600Instructions.td
@ -1332,9 +1332,7 @@ def TXD: InstR600 <
  (outs R600_Reg128:$dst),
  (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2,
       i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
-  "TXD $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget",
-  [(set v4f32:$dst, (int_AMDGPU_txd v4f32:$src0, v4f32:$src1, v4f32:$src2,
-                     imm:$resourceId, imm:$samplerId, imm:$textureTarget))],
+  "TXD $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget", [],
  NullALU > {
  let TEXInst = 1;
 }
@ -1344,10 +1342,7 @@ def TXD_SHADOW: InstR600 <
  (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2,
       i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
  "TXD_SHADOW $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget",
-  [(set v4f32:$dst, (int_AMDGPU_txd v4f32:$src0, v4f32:$src1, v4f32:$src2,
-        imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))],
-   NullALU
-> {
+  [], NullALU> {
  let TEXInst = 1;
 }
 } // End isPseudo = 1
--- a/lib/Target/AMDGPU/R600Intrinsics.td
+++ b/lib/Target/AMDGPU/R600Intrinsics.td
@ -12,18 +12,6 @@
 //===----------------------------------------------------------------------===//

 // FIXME: Should migrate to using TargetPrefix that matches triple arch name.
-let TargetPrefix = "AMDGPU", isTarget = 1 in {
-  def int_AMDGPU_dp4 : Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
-  def int_AMDGPU_tex : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_AMDGPU_txb : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_AMDGPU_txf : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_AMDGPU_txq : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_AMDGPU_txd : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_AMDGPU_txl : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_AMDGPU_ddx : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_AMDGPU_ddy : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
-}
-
 let TargetPrefix = "R600", isTarget = 1 in {
  def int_R600_store_swizzle :
    Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>;
@ -70,4 +58,8 @@ let TargetPrefix = "r600", isTarget = 1 in {
  def int_r600_txq : TextureIntrinsicInt32Input;
  def int_r600_ddx : TextureIntrinsicFloatInput;
  def int_r600_ddy : TextureIntrinsicFloatInput;
+
+  def int_r600_dot4 : Intrinsic<[llvm_float_ty],
+    [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]
+  >;
 } // End TargetPrefix = "r600", isTarget = 1
--- a/lib/Target/AMDGPU/R600TextureIntrinsicsReplacer.cpp
+++ b/lib/Target/AMDGPU/R600TextureIntrinsicsReplacer.cpp
@ -1,303 +0,0 @@
-//===-- R600TextureIntrinsicsReplacer.cpp ---------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This pass translates tgsi-like texture intrinsics into R600 texture
-/// closer to hardware intrinsics.
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstVisitor.h"
-
-using namespace llvm;
-
-namespace {
-class R600TextureIntrinsicsReplacer :
-    public FunctionPass, public InstVisitor<R600TextureIntrinsicsReplacer> {
-  static char ID;
-
-  Module *Mod;
-  Type *FloatType;
-  Type *Int32Type;
-  Type *V4f32Type;
-  Type *V4i32Type;
-  FunctionType *TexSign;
-  FunctionType *TexQSign;
-
-  void getAdjustmentFromTextureTarget(unsigned TextureType, bool hasLOD,
-                                      unsigned SrcSelect[4], unsigned CT[4],
-                                      bool &useShadowVariant) {
-    enum TextureTypes {
-      TEXTURE_1D = 1,
-      TEXTURE_2D,
-      TEXTURE_3D,
-      TEXTURE_CUBE,
-      TEXTURE_RECT,
-      TEXTURE_SHADOW1D,
-      TEXTURE_SHADOW2D,
-      TEXTURE_SHADOWRECT,
-      TEXTURE_1D_ARRAY,
-      TEXTURE_2D_ARRAY,
-      TEXTURE_SHADOW1D_ARRAY,
-      TEXTURE_SHADOW2D_ARRAY,
-      TEXTURE_SHADOWCUBE,
-      TEXTURE_2D_MSAA,
-      TEXTURE_2D_ARRAY_MSAA,
-      TEXTURE_CUBE_ARRAY,
-      TEXTURE_SHADOWCUBE_ARRAY
-    };
-
-    switch (TextureType) {
-    case 0:
-      useShadowVariant = false;
-      return;
-    case TEXTURE_RECT:
-    case TEXTURE_1D:
-    case TEXTURE_2D:
-    case TEXTURE_3D:
-    case TEXTURE_CUBE:
-    case TEXTURE_1D_ARRAY:
-    case TEXTURE_2D_ARRAY:
-    case TEXTURE_CUBE_ARRAY:
-    case TEXTURE_2D_MSAA:
-    case TEXTURE_2D_ARRAY_MSAA:
-      useShadowVariant = false;
-      break;
-    case TEXTURE_SHADOW1D:
-    case TEXTURE_SHADOW2D:
-    case TEXTURE_SHADOWRECT:
-    case TEXTURE_SHADOW1D_ARRAY:
-    case TEXTURE_SHADOW2D_ARRAY:
-    case TEXTURE_SHADOWCUBE:
-    case TEXTURE_SHADOWCUBE_ARRAY:
-      useShadowVariant = true;
-      break;
-    default:
-      llvm_unreachable("Unknow Texture Type");
-    }
-
-    if (TextureType == TEXTURE_RECT ||
-        TextureType == TEXTURE_SHADOWRECT) {
-      CT[0] = 0;
-      CT[1] = 0;
-    }
-
-    if (TextureType == TEXTURE_CUBE_ARRAY ||
-        TextureType == TEXTURE_SHADOWCUBE_ARRAY)
-      CT[2] = 0;
-
-    if (TextureType == TEXTURE_1D_ARRAY ||
-        TextureType == TEXTURE_SHADOW1D_ARRAY) {
-      if (hasLOD && useShadowVariant) {
-        CT[1] = 0;
-      } else {
-        CT[2] = 0;
-        SrcSelect[2] = 1;
-      }
-    } else if (TextureType == TEXTURE_2D_ARRAY ||
-        TextureType == TEXTURE_SHADOW2D_ARRAY) {
-      CT[2] = 0;
-    }
-
-    if ((TextureType == TEXTURE_SHADOW1D ||
-        TextureType == TEXTURE_SHADOW2D ||
-        TextureType == TEXTURE_SHADOWRECT ||
-        TextureType == TEXTURE_SHADOW1D_ARRAY) &&
-        !(hasLOD && useShadowVariant))
-      SrcSelect[3] = 2;
-  }
-
-  void ReplaceCallInst(CallInst &I, FunctionType *FT, const char *Name,
-                       unsigned SrcSelect[4], Value *Offset[3], Value *Resource,
-                       Value *Sampler, unsigned CT[4], Value *Coord) {
-    IRBuilder<> Builder(&I);
-    Constant *Mask[] = {
-      ConstantInt::get(Int32Type, SrcSelect[0]),
-      ConstantInt::get(Int32Type, SrcSelect[1]),
-      ConstantInt::get(Int32Type, SrcSelect[2]),
-      ConstantInt::get(Int32Type, SrcSelect[3])
-    };
-    Value *SwizzleMask = ConstantVector::get(Mask);
-    Value *SwizzledCoord =
-        Builder.CreateShuffleVector(Coord, Coord, SwizzleMask);
-
-    Value *Args[] = {
-      SwizzledCoord,
-      Offset[0],
-      Offset[1],
-      Offset[2],
-      Resource,
-      Sampler,
-      ConstantInt::get(Int32Type, CT[0]),
-      ConstantInt::get(Int32Type, CT[1]),
-      ConstantInt::get(Int32Type, CT[2]),
-      ConstantInt::get(Int32Type, CT[3])
-    };
-
-    Function *F = Mod->getFunction(Name);
-    if (!F) {
-      F = Function::Create(FT, GlobalValue::ExternalLinkage, Name, Mod);
-      F->addFnAttr(Attribute::ReadNone);
-    }
-    I.replaceAllUsesWith(Builder.CreateCall(F, Args));
-    I.eraseFromParent();
-  }
-
-  void ReplaceTexIntrinsic(CallInst &I, bool hasLOD, FunctionType *FT,
-                           const char *VanillaInt,
-                           const char *ShadowInt) {
-    Value *Coord = I.getArgOperand(0);
-    Value *ResourceId = I.getArgOperand(1);
-    Value *SamplerId = I.getArgOperand(2);
-
-    unsigned TextureType =
-        cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
-
-    unsigned SrcSelect[4] = { 0, 1, 2, 3 };
-    unsigned CT[4] = {1, 1, 1, 1};
-    Value *Offset[3] = {
-      ConstantInt::get(Int32Type, 0),
-      ConstantInt::get(Int32Type, 0),
-      ConstantInt::get(Int32Type, 0)
-    };
-    bool useShadowVariant;
-
-    getAdjustmentFromTextureTarget(TextureType, hasLOD, SrcSelect, CT,
-                                   useShadowVariant);
-
-    ReplaceCallInst(I, FT, useShadowVariant?ShadowInt:VanillaInt, SrcSelect,
-                    Offset, ResourceId, SamplerId, CT, Coord);
-  }
-
-  void ReplaceTXF(CallInst &I) {
-    Value *Coord = I.getArgOperand(0);
-    Value *ResourceId = I.getArgOperand(4);
-    Value *SamplerId = I.getArgOperand(5);
-
-    unsigned TextureType =
-        cast<ConstantInt>(I.getArgOperand(6))->getZExtValue();
-
-    unsigned SrcSelect[4] = { 0, 1, 2, 3 };
-    unsigned CT[4] = {1, 1, 1, 1};
-    Value *Offset[3] = {
-      I.getArgOperand(1),
-      I.getArgOperand(2),
-      I.getArgOperand(3),
-    };
-    bool useShadowVariant;
-
-    getAdjustmentFromTextureTarget(TextureType, false, SrcSelect, CT,
-                                   useShadowVariant);
-
-    ReplaceCallInst(I, TexQSign, "llvm.R600.txf", SrcSelect,
-                    Offset, ResourceId, SamplerId, CT, Coord);
-  }
-
-public:
-  R600TextureIntrinsicsReplacer():
-    FunctionPass(ID) {
-  }
-
-  bool doInitialization(Module &M) override {
-    LLVMContext &Ctx = M.getContext();
-    Mod = &M;
-    FloatType = Type::getFloatTy(Ctx);
-    Int32Type = Type::getInt32Ty(Ctx);
-    V4f32Type = VectorType::get(FloatType, 4);
-    V4i32Type = VectorType::get(Int32Type, 4);
-    Type *ArgsType[] = {
-      V4f32Type,
-      Int32Type,
-      Int32Type,
-      Int32Type,
-      Int32Type,
-      Int32Type,
-      Int32Type,
-      Int32Type,
-      Int32Type,
-      Int32Type,
-    };
-    TexSign = FunctionType::get(V4f32Type, ArgsType, /*isVarArg=*/false);
-    Type *ArgsQType[] = {
-      V4i32Type,
-      Int32Type,
-      Int32Type,
-      Int32Type,
-      Int32Type,
-      Int32Type,
-      Int32Type,
-      Int32Type,
-      Int32Type,
-      Int32Type,
-    };
-    TexQSign = FunctionType::get(V4f32Type, ArgsQType, /*isVarArg=*/false);
-    return false;
-  }
-
-  bool runOnFunction(Function &F) override {
-    visit(F);
-    return false;
-  }
-
-  const char *getPassName() const override {
-    return "R600 Texture Intrinsics Replacer";
-  }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-  }
-
-  void visitCallInst(CallInst &I) {
-    if (!I.getCalledFunction())
-      return;
-
-    StringRef Name = I.getCalledFunction()->getName();
-    if (Name == "llvm.AMDGPU.tex") {
-      ReplaceTexIntrinsic(I, false, TexSign, "llvm.r600.tex", "llvm.r600.texc");
-      return;
-    }
-    if (Name == "llvm.AMDGPU.txl") {
-      ReplaceTexIntrinsic(I, true, TexSign, "llvm.r600.txl", "llvm.r600.txlc");
-      return;
-    }
-    if (Name == "llvm.AMDGPU.txb") {
-      ReplaceTexIntrinsic(I, true, TexSign, "llvm.r600.txb", "llvm.r600.txbc");
-      return;
-    }
-    if (Name == "llvm.AMDGPU.txf") {
-      ReplaceTXF(I);
-      return;
-    }
-    if (Name == "llvm.AMDGPU.txq") {
-      ReplaceTexIntrinsic(I, false, TexQSign, "llvm.r600.txq", "llvm.r600.txq");
-      return;
-    }
-    if (Name == "llvm.AMDGPU.ddx") {
-      ReplaceTexIntrinsic(I, false, TexSign, "llvm.r600.ddx", "llvm.r600.ddx");
-      return;
-    }
-    if (Name == "llvm.AMDGPU.ddy") {
-      ReplaceTexIntrinsic(I, false, TexSign, "llvm.r600.ddy", "llvm.r600.ddy");
-      return;
-    }
-  }
-
-};
-
-char R600TextureIntrinsicsReplacer::ID = 0;
-
-}
-
-FunctionPass *llvm::createR600TextureIntrinsicsReplacer() {
-  return new R600TextureIntrinsicsReplacer();
-}
--- a/test/CodeGen/AMDGPU/big_alu.ll
+++ b/test/CodeGen/AMDGPU/big_alu.ll
--- a/test/CodeGen/AMDGPU/dot4-folding.ll
+++ b/test/CodeGen/AMDGPU/dot4-folding.ll
@ -1,27 +0,0 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-
-; Exactly one constant vector can be folded into dot4, which means exactly
-; 4 MOV instructions
-; CHECK: {{^}}main:
-; CHECK: MOV
-; CHECK: MOV
-; CHECK: MOV
-; CHECK: MOV
-; CHECK-NOT: MOV
-; CHECK-NOT: MOV
-; CHECK-NOT: MOV
-; CHECK-NOT: MOV
-
-define void @main(float addrspace(1)* %out) {
-main_body:
-  %0 = load <4 x float>, <4 x float> addrspace(8)* null
-  %1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
-  %2 = call float @llvm.AMDGPU.dp4(<4 x float> %0,<4 x float> %1)
-  %3 = insertelement <4 x float> undef, float %2, i32 0
-  call void @llvm.R600.store.swizzle(<4 x float> %3, i32 0, i32 0)
-  ret void
-}
-
-declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-attributes #1 = { readnone }
--- a/test/CodeGen/AMDGPU/fetch-limits.r600.ll
+++ b/test/CodeGen/AMDGPU/fetch-limits.r600.ll
@ -9,38 +9,48 @@

 define amdgpu_ps void @fetch_limits_r600() {
 entry:
-  %0 = load <4 x float>, <4 x float> addrspace(8)* null
-  %1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
-  %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
-  %3 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
-  %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
-  %5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
-  %6 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
-  %7 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
-  %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
-  %res0 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %0, i32 0, i32 0, i32 1)
-  %res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %1, i32 0, i32 0, i32 1)
-  %res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %2, i32 0, i32 0, i32 1)
-  %res3 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %3, i32 0, i32 0, i32 1)
-  %res4 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %4, i32 0, i32 0, i32 1)
-  %res5 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %5, i32 0, i32 0, i32 1)
-  %res6 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %6, i32 0, i32 0, i32 1)
-  %res7 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %7, i32 0, i32 0, i32 1)
-  %res8 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 0, i32 0, i32 1)
-  %a = fadd <4 x float> %res0, %res1
-  %b = fadd <4 x float> %res2, %res3
-  %c = fadd <4 x float> %res4, %res5
-  %d = fadd <4 x float> %res6, %res7
-  %e = fadd <4 x float> %res8, %a
-
+  %tmp = load <4 x float>, <4 x float> addrspace(8)* null
+  %tmp1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %tmp2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %tmp3 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+  %tmp4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+  %tmp5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+  %tmp6 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
+  %tmp7 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+  %tmp8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
+  %tmp9 = shufflevector <4 x float> %tmp, <4 x float> %tmp, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp10 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp9, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %tmp11 = shufflevector <4 x float> %tmp1, <4 x float> %tmp1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp12 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %tmp13 = shufflevector <4 x float> %tmp2, <4 x float> %tmp2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp14 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %tmp15 = shufflevector <4 x float> %tmp3, <4 x float> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp16 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %tmp17 = shufflevector <4 x float> %tmp4, <4 x float> %tmp4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp18 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp17, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %tmp19 = shufflevector <4 x float> %tmp5, <4 x float> %tmp5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp20 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp19, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %tmp21 = shufflevector <4 x float> %tmp6, <4 x float> %tmp6, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp22 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp21, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %tmp23 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp24 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp23, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %tmp25 = shufflevector <4 x float> %tmp8, <4 x float> %tmp8, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp26 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp25, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %a = fadd <4 x float> %tmp10, %tmp12
+  %b = fadd <4 x float> %tmp14, %tmp16
+  %c = fadd <4 x float> %tmp18, %tmp20
+  %d = fadd <4 x float> %tmp22, %tmp24
+  %e = fadd <4 x float> %tmp26, %a
  %bc = fadd <4 x float> %b, %c
  %de = fadd <4 x float> %d, %e
-
  %bcde = fadd <4 x float> %bc, %de
-
  call void @llvm.R600.store.swizzle(<4 x float> %bcde, i32 0, i32 1)
  ret void
 }

-declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone
 declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+; Function Attrs: readnone
+declare <4 x float> @llvm.r600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
+
+attributes #0 = { nounwind readnone }
--- a/test/CodeGen/AMDGPU/fetch-limits.r700+.ll
+++ b/test/CodeGen/AMDGPU/fetch-limits.r700+.ll
@ -35,45 +35,63 @@ entry:
  %14 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
  %15 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 15)
  %16 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
-  %res0 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %0, i32 0, i32 0, i32 1)
-  %res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %1, i32 0, i32 0, i32 1)
-  %res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %2, i32 0, i32 0, i32 1)
-  %res3 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %3, i32 0, i32 0, i32 1)
-  %res4 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %4, i32 0, i32 0, i32 1)
-  %res5 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %5, i32 0, i32 0, i32 1)
-  %res6 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %6, i32 0, i32 0, i32 1)
-  %res7 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %7, i32 0, i32 0, i32 1)
-  %res8 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 0, i32 0, i32 1)
-  %res9 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %9, i32 0, i32 0, i32 1)
-  %res10 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %10, i32 0, i32 0, i32 1)
-  %res11 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %11, i32 0, i32 0, i32 1)
-  %res12 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %12, i32 0, i32 0, i32 1)
-  %res13 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %13, i32 0, i32 0, i32 1)
-  %res14 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %14, i32 0, i32 0, i32 1)
-  %res15 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %15, i32 0, i32 0, i32 1)
-  %res16 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %16, i32 0, i32 0, i32 1)
-  %a = fadd <4 x float> %res0, %res1
-  %b = fadd <4 x float> %res2, %res3
-  %c = fadd <4 x float> %res4, %res5
-  %d = fadd <4 x float> %res6, %res7
-  %e = fadd <4 x float> %res8, %res9
-  %f = fadd <4 x float> %res10, %res11
-  %g = fadd <4 x float> %res12, %res13
-  %h = fadd <4 x float> %res14, %res15
-  %i = fadd <4 x float> %res16, %a
-
+  %17 = shufflevector <4 x float> %0, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %18 = call <4 x float> @llvm.r600.tex(<4 x float> %17, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %19 = shufflevector <4 x float> %1, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %20 = call <4 x float> @llvm.r600.tex(<4 x float> %19, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %21 = shufflevector <4 x float> %2, <4 x float> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %22 = call <4 x float> @llvm.r600.tex(<4 x float> %21, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %23 = shufflevector <4 x float> %3, <4 x float> %3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %24 = call <4 x float> @llvm.r600.tex(<4 x float> %23, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %25 = shufflevector <4 x float> %4, <4 x float> %4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %26 = call <4 x float> @llvm.r600.tex(<4 x float> %25, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %27 = shufflevector <4 x float> %5, <4 x float> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %28 = call <4 x float> @llvm.r600.tex(<4 x float> %27, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %29 = shufflevector <4 x float> %6, <4 x float> %6, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %30 = call <4 x float> @llvm.r600.tex(<4 x float> %29, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %31 = shufflevector <4 x float> %7, <4 x float> %7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %32 = call <4 x float> @llvm.r600.tex(<4 x float> %31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %33 = shufflevector <4 x float> %8, <4 x float> %8, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %34 = call <4 x float> @llvm.r600.tex(<4 x float> %33, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %35 = shufflevector <4 x float> %9, <4 x float> %9, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %36 = call <4 x float> @llvm.r600.tex(<4 x float> %35, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %37 = shufflevector <4 x float> %10, <4 x float> %10, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %38 = call <4 x float> @llvm.r600.tex(<4 x float> %37, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %39 = shufflevector <4 x float> %11, <4 x float> %11, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %40 = call <4 x float> @llvm.r600.tex(<4 x float> %39, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %41 = shufflevector <4 x float> %12, <4 x float> %12, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %42 = call <4 x float> @llvm.r600.tex(<4 x float> %41, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %43 = shufflevector <4 x float> %13, <4 x float> %13, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %44 = call <4 x float> @llvm.r600.tex(<4 x float> %43, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %45 = shufflevector <4 x float> %14, <4 x float> %14, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %46 = call <4 x float> @llvm.r600.tex(<4 x float> %45, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %47 = shufflevector <4 x float> %15, <4 x float> %15, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %48 = call <4 x float> @llvm.r600.tex(<4 x float> %47, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %49 = shufflevector <4 x float> %16, <4 x float> %16, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %50 = call <4 x float> @llvm.r600.tex(<4 x float> %49, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %a = fadd <4 x float> %18, %20
+  %b = fadd <4 x float> %22, %24
+  %c = fadd <4 x float> %26, %28
+  %d = fadd <4 x float> %30, %32
+  %e = fadd <4 x float> %34, %36
+  %f = fadd <4 x float> %38, %40
+  %g = fadd <4 x float> %42, %44
+  %h = fadd <4 x float> %46, %48
+  %i = fadd <4 x float> %50, %a
  %bc = fadd <4 x float> %b, %c
  %de = fadd <4 x float> %d, %e
  %fg = fadd <4 x float> %f, %g
  %hi = fadd <4 x float> %h, %i
-
  %bcde = fadd <4 x float> %bc, %de
  %fghi = fadd <4 x float> %fg, %hi
-
  %bcdefghi = fadd <4 x float> %bcde, %fghi
  call void @llvm.R600.store.swizzle(<4 x float> %bcdefghi, i32 0, i32 1)
  ret void
 }

-declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone
 declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+; Function Attrs: readnone
+declare <4 x float> @llvm.r600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
+
+attributes #0 = { readnone }
--- a/test/CodeGen/AMDGPU/literals.ll
+++ b/test/CodeGen/AMDGPU/literals.ll
@ -54,11 +54,11 @@ entry:
 ; CHECK-NEXT: DOT4 * T[[GPR]].W (MASKED), 1.0
 define void @inline_literal_dot4(float addrspace(1)* %out) {
 entry:
-  %0 = call float @llvm.AMDGPU.dp4(<4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+  %0 = call float @llvm.r600.dot4(<4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
  store float %0, float addrspace(1)* %out
  ret void
 }

-declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1

 attributes #1 = { readnone }
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll
@ -1,4 +1,4 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s

 ; CHECK-LABEL: {{^}}cube:
 ; CHECK: CUBE T{{[0-9]}}.X
@ -7,51 +7,51 @@
 ; CHECK: CUBE * T{{[0-9]}}.W
 define amdgpu_ps void @cube() {
 main_body:
-  %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
-  %1 = extractelement <4 x float> %0, i32 3
-  %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
-  %3 = extractelement <4 x float> %2, i32 0
-  %4 = fdiv float %3, %1
-  %5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
-  %6 = extractelement <4 x float> %5, i32 1
-  %7 = fdiv float %6, %1
-  %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
-  %9 = extractelement <4 x float> %8, i32 2
-  %10 = fdiv float %9, %1
-  %11 = insertelement <4 x float> undef, float %4, i32 0
-  %12 = insertelement <4 x float> %11, float %7, i32 1
-  %13 = insertelement <4 x float> %12, float %10, i32 2
-  %14 = insertelement <4 x float> %13, float 1.000000e+00, i32 3
-  %15 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %14)
-  %16 = extractelement <4 x float> %15, i32 0
-  %17 = extractelement <4 x float> %15, i32 1
-  %18 = extractelement <4 x float> %15, i32 2
-  %19 = extractelement <4 x float> %15, i32 3
-  %20 = call float @fabs(float %18)
-  %21 = fdiv float 1.000000e+00, %20
-  %22 = fmul float %16, %21
-  %23 = fadd float %22, 1.500000e+00
-  %24 = fmul float %17, %21
-  %25 = fadd float %24, 1.500000e+00
-  %26 = insertelement <4 x float> undef, float %25, i32 0
-  %27 = insertelement <4 x float> %26, float %23, i32 1
-  %28 = insertelement <4 x float> %27, float %19, i32 2
-  %29 = insertelement <4 x float> %28, float %25, i32 3
-  %30 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %29, i32 16, i32 0, i32 4)
-  call void @llvm.R600.store.swizzle(<4 x float> %30, i32 0, i32 0)
+  %tmp = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+  %tmp1 = extractelement <4 x float> %tmp, i32 3
+  %tmp2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+  %tmp3 = extractelement <4 x float> %tmp2, i32 0
+  %tmp4 = fdiv float %tmp3, %tmp1
+  %tmp5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+  %tmp6 = extractelement <4 x float> %tmp5, i32 1
+  %tmp7 = fdiv float %tmp6, %tmp1
+  %tmp8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+  %tmp9 = extractelement <4 x float> %tmp8, i32 2
+  %tmp10 = fdiv float %tmp9, %tmp1
+  %tmp11 = insertelement <4 x float> undef, float %tmp4, i32 0
+  %tmp12 = insertelement <4 x float> %tmp11, float %tmp7, i32 1
+  %tmp13 = insertelement <4 x float> %tmp12, float %tmp10, i32 2
+  %tmp14 = insertelement <4 x float> %tmp13, float 1.000000e+00, i32 3
+  %tmp15 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %tmp14)
+  %tmp16 = extractelement <4 x float> %tmp15, i32 0
+  %tmp17 = extractelement <4 x float> %tmp15, i32 1
+  %tmp18 = extractelement <4 x float> %tmp15, i32 2
+  %tmp19 = extractelement <4 x float> %tmp15, i32 3
+  %tmp20 = call float @llvm.fabs.f32(float %tmp18)
+  %tmp21 = fdiv float 1.000000e+00, %tmp20
+  %tmp22 = fmul float %tmp16, %tmp21
+  %tmp23 = fadd float %tmp22, 1.500000e+00
+  %tmp24 = fmul float %tmp17, %tmp21
+  %tmp25 = fadd float %tmp24, 1.500000e+00
+  %tmp26 = insertelement <4 x float> undef, float %tmp25, i32 0
+  %tmp27 = insertelement <4 x float> %tmp26, float %tmp23, i32 1
+  %tmp28 = insertelement <4 x float> %tmp27, float %tmp19, i32 2
+  %tmp29 = insertelement <4 x float> %tmp28, float %tmp25, i32 3
+  %tmp30 = shufflevector <4 x float> %tmp29, <4 x float> %tmp29, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp31 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp30, i32 0, i32 0, i32 0, i32 16, i32 0, i32 1, i32 1, i32 1, i32 1)
+  call void @llvm.R600.store.swizzle(<4 x float> %tmp31, i32 0, i32 0)
  ret void
 }

 ; Function Attrs: readnone
 declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #0

-; Function Attrs: readnone
-declare float @fabs(float) #0
-
-; Function Attrs: readnone
-declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #0
+; Function Attrs: nounwind readnone
+declare float @llvm.fabs.f32(float) #0

 declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)

-attributes #0 = { readnone }
+; Function Attrs: readnone
+declare <4 x float> @llvm.r600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0

+attributes #0 = { nounwind readnone }
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.tex.ll
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.tex.ll
@ -1,42 +0,0 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-
-;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
-;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
-;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
-;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
-;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:UUNN
-;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:NNNN
-;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:NNNN
-;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:UUNN
-;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYYW}} RID:0 SID:0 CT:NNUN
-;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN
-;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYYZ}} RID:0 SID:0 CT:NNUN
-;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN
-;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
-;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
-;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
-;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN
-
-define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
-   %addr = load <4 x float>, <4 x float> addrspace(1)* %in
-   %res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %addr, i32 0, i32 0, i32 1)
-   %res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res1, i32 0, i32 0, i32 2)
-   %res3 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res2, i32 0, i32 0, i32 3)
-   %res4 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res3, i32 0, i32 0, i32 4)
-   %res5 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res4, i32 0, i32 0, i32 5)
-   %res6 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res5, i32 0, i32 0, i32 6)
-   %res7 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res6, i32 0, i32 0, i32 7)
-   %res8 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res7, i32 0, i32 0, i32 8)
-   %res9 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res8, i32 0, i32 0, i32 9)
-   %res10 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res9, i32 0, i32 0, i32 10)
-   %res11 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res10, i32 0, i32 0, i32 11)
-   %res12 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res11, i32 0, i32 0, i32 12)
-   %res13 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res12, i32 0, i32 0, i32 13)
-   %res14 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res13, i32 0, i32 0, i32 14)
-   %res15 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res14, i32 0, i32 0, i32 15)
-   %res16 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res15, i32 0, i32 0, i32 16)
-   store <4 x float> %res16, <4 x float> addrspace(1)* %out
-   ret void
-}
-
-declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone
--- a/test/CodeGen/AMDGPU/llvm.amdgpu.dp4.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgpu.dp4.ll
@ -1,11 +1,11 @@
 ; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s

-declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) nounwind readnone
+declare float @llvm.r600.dot4(<4 x float>, <4 x float>) nounwind readnone

 define void @test_dp4(float addrspace(1)* %out, <4 x float> addrspace(1)* %a, <4 x float> addrspace(1)* %b) nounwind {
  %src0 = load <4 x float>, <4 x float> addrspace(1)* %a, align 16
  %src1 = load <4 x float>, <4 x float> addrspace(1)* %b, align 16
-  %dp4 = call float @llvm.AMDGPU.dp4(<4 x float> %src0, <4 x float> %src1) nounwind readnone
+  %dp4 = call float @llvm.r600.dot4(<4 x float> %src0, <4 x float> %src1) nounwind readnone
  store float %dp4, float addrspace(1)* %out, align 4
  ret void
 }
--- a/test/CodeGen/AMDGPU/llvm.r600.tex.ll
+++ b/test/CodeGen/AMDGPU/llvm.r600.tex.ll
@ -0,0 +1,65 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:UUNN
+;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:NNNN
+;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:NNNN
+;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:UUNN
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYYW}} RID:0 SID:0 CT:NNUN
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN
+;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYYZ}} RID:0 SID:0 CT:NNUN
+;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN
+;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN
+
+define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+bb:
+  %addr = load <4 x float>, <4 x float> addrspace(1)* %in
+  %tmp = shufflevector <4 x float> %addr, <4 x float> %addr, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp1 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> %tmp1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp3 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %tmp4 = shufflevector <4 x float> %tmp3, <4 x float> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp5 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp4, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %tmp6 = shufflevector <4 x float> %tmp5, <4 x float> %tmp5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp7 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %tmp8 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp9 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1)
+  %tmp10 = shufflevector <4 x float> %tmp9, <4 x float> %tmp9, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+  %tmp11 = call <4 x float> @llvm.r600.texc(<4 x float> %tmp10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %tmp12 = shufflevector <4 x float> %tmp11, <4 x float> %tmp11, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+  %tmp13 = call <4 x float> @llvm.r600.texc(<4 x float> %tmp12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %tmp14 = shufflevector <4 x float> %tmp13, <4 x float> %tmp13, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+  %tmp15 = call <4 x float> @llvm.r600.texc(<4 x float> %tmp14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1)
+  %tmp16 = shufflevector <4 x float> %tmp15, <4 x float> %tmp15, <4 x i32> <i32 0, i32 1, i32 1, i32 3>
+  %tmp17 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0, i32 1)
+  %tmp18 = shufflevector <4 x float> %tmp17, <4 x float> %tmp17, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp19 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp18, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0, i32 1)
+  %tmp20 = shufflevector <4 x float> %tmp19, <4 x float> %tmp19, <4 x i32> <i32 0, i32 1, i32 1, i32 2>
+  %tmp21 = call <4 x float> @llvm.r600.texc(<4 x float> %tmp20, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0, i32 1)
+  %tmp22 = shufflevector <4 x float> %tmp21, <4 x float> %tmp21, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp23 = call <4 x float> @llvm.r600.texc(<4 x float> %tmp22, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0, i32 1)
+  %tmp24 = shufflevector <4 x float> %tmp23, <4 x float> %tmp23, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp25 = call <4 x float> @llvm.r600.texc(<4 x float> %tmp24, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %tmp26 = shufflevector <4 x float> %tmp25, <4 x float> %tmp25, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp27 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp26, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %tmp28 = shufflevector <4 x float> %tmp27, <4 x float> %tmp27, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp29 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp28, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %tmp30 = shufflevector <4 x float> %tmp29, <4 x float> %tmp29, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp31 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp30, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0, i32 1)
+  store <4 x float> %tmp31, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+; Function Attrs: readnone
+declare <4 x float> @llvm.r600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
+
+; Function Attrs: readnone
+declare <4 x float> @llvm.r600.texc(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
+
+attributes #0 = { nounwind readnone }
--- a/test/CodeGen/AMDGPU/load-input-fold.ll
+++ b/test/CodeGen/AMDGPU/load-input-fold.ll
@ -88,14 +88,14 @@ main_body:
  %83 = insertelement <4 x float> %82, float %75, i32 1
  %84 = insertelement <4 x float> %83, float %77, i32 2
  %85 = insertelement <4 x float> %84, float 0.000000e+00, i32 3
-  %86 = call float @llvm.AMDGPU.dp4(<4 x float> %81, <4 x float> %85)
+  %86 = call float @llvm.r600.dot4(<4 x float> %81, <4 x float> %85)
  %87 = insertelement <4 x float> undef, float %86, i32 0
  call void @llvm.R600.store.swizzle(<4 x float> %87, i32 2, i32 2)
  ret void
 }

 ; Function Attrs: readnone
-declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1

 ; Function Attrs: readonly
 declare float @fabs(float) #2
--- a/test/CodeGen/AMDGPU/max-literals.ll
+++ b/test/CodeGen/AMDGPU/max-literals.ll
@ -23,7 +23,7 @@ main_body:
  %15 = insertelement <4 x float> %14, float %8, i32 3
  %16 = insertelement <4 x float> %15, float %11, i32 3

-  %17 = call float @llvm.AMDGPU.dp4(<4 x float> %15,<4 x float> %16)
+  %17 = call float @llvm.r600.dot4(<4 x float> %15,<4 x float> %16)
  %18 = insertelement <4 x float> undef, float %17, i32 0
  call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 2)
  ret void
@ -52,14 +52,14 @@ main_body:
  %15 = insertelement <4 x float> %14, float %8, i32 3
  %16 = insertelement <4 x float> %15, float %11, i32 3

-  %17 = call float @llvm.AMDGPU.dp4(<4 x float> %15,<4 x float> %16)
+  %17 = call float @llvm.r600.dot4(<4 x float> %15,<4 x float> %16)
  %18 = insertelement <4 x float> undef, float %17, i32 0
  call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 2)
  ret void
 }

 ; Function Attrs: readnone
-declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1

 declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)

--- a/test/CodeGen/AMDGPU/predicate-dp4.ll
+++ b/test/CodeGen/AMDGPU/predicate-dp4.ll
@ -11,7 +11,7 @@ main_body:
  br i1 %3, label %IF, label %ENDIF

 IF:                                             ; preds = %main_body
-  %4 = call float @llvm.AMDGPU.dp4(<4 x float> %0, <4 x float> %0)
+  %4 = call float @llvm.r600.dot4(<4 x float> %0, <4 x float> %0)
  br label %ENDIF

 ENDIF:                                            ; preds = %IF, %main_body
@ -21,6 +21,6 @@ ENDIF:                                            ; preds = %IF, %main_body
  ret void
 }

-declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1
 declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
 attributes #1 = { readnone }
--- a/test/CodeGen/AMDGPU/pv-packing.ll
+++ b/test/CodeGen/AMDGPU/pv-packing.ll
@ -16,7 +16,7 @@ main_body:
  %8 = extractelement <4 x float> %reg3, i32 2
  %9 = load <4 x float>, <4 x float> addrspace(8)* null
  %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
-  %11 = call float @llvm.AMDGPU.dp4(<4 x float> %9, <4 x float> %9)
+  %11 = call float @llvm.r600.dot4(<4 x float> %9, <4 x float> %9)
  %12 = fmul float %0, %3
  %13 = fadd float %12, %6
  %14 = fmul float %1, %4
@ -29,14 +29,14 @@ main_body:
  %21 = insertelement <4 x float> %20, float %15, i32 1
  %22 = insertelement <4 x float> %21, float %17, i32 2
  %23 = insertelement <4 x float> %22, float %19, i32 3
-  %24 = call float @llvm.AMDGPU.dp4(<4 x float> %23, <4 x float> %10)
+  %24 = call float @llvm.r600.dot4(<4 x float> %23, <4 x float> %10)
  %25 = insertelement <4 x float> undef, float %24, i32 0
  call void @llvm.R600.store.swizzle(<4 x float> %25, i32 0, i32 2)
  ret void
 }

 ; Function Attrs: readnone
-declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1


 declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
--- a/test/CodeGen/AMDGPU/pv.ll
+++ b/test/CodeGen/AMDGPU/pv.ll
@ -101,7 +101,7 @@ main_body:
  %93 = insertelement <4 x float> %92, float %5, i32 1
  %94 = insertelement <4 x float> %93, float %6, i32 2
  %95 = insertelement <4 x float> %94, float 0.000000e+00, i32 3
-  %96 = call float @llvm.AMDGPU.dp4(<4 x float> %91, <4 x float> %95)
+  %96 = call float @llvm.r600.dot4(<4 x float> %91, <4 x float> %95)
  %97 = call float @fabs(float %96)
  %98 = call float @llvm.AMDGPU.rsq.clamped.f32(float %97)
  %99 = fmul float %4, %98
@ -137,7 +137,7 @@ main_body:
  %129 = insertelement <4 x float> %128, float %121, i32 1
  %130 = insertelement <4 x float> %129, float %123, i32 2
  %131 = insertelement <4 x float> %130, float 0.000000e+00, i32 3
-  %132 = call float @llvm.AMDGPU.dp4(<4 x float> %127, <4 x float> %131)
+  %132 = call float @llvm.r600.dot4(<4 x float> %127, <4 x float> %131)
  %133 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
  %134 = extractelement <4 x float> %133, i32 0
  %135 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
@ -152,7 +152,7 @@ main_body:
  %144 = insertelement <4 x float> %143, float %136, i32 1
  %145 = insertelement <4 x float> %144, float %138, i32 2
  %146 = insertelement <4 x float> %145, float 0.000000e+00, i32 3
-  %147 = call float @llvm.AMDGPU.dp4(<4 x float> %142, <4 x float> %146)
+  %147 = call float @llvm.r600.dot4(<4 x float> %142, <4 x float> %146)
  %148 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
  %149 = extractelement <4 x float> %148, i32 0
  %150 = fmul float %149, %8
@ -219,7 +219,7 @@ main_body:
 }

 ; Function Attrs: readnone
-declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1

 ; Function Attrs: readonly
 declare float @fabs(float) #2
--- a/test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll
+++ b/test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll
@ -1,57 +1,58 @@
-;RUN: llc < %s -march=r600 -mcpu=cayman
+; RUN: llc -march=r600 -mcpu=cayman < %s

-define amdgpu_ps void @main(<4 x float> inreg, <4 x float> inreg) {
+define amdgpu_ps void @main(<4 x float> inreg %arg, <4 x float> inreg %arg1) {
 main_body:
-  %2 = extractelement <4 x float> %0, i32 0
-  %3 = extractelement <4 x float> %0, i32 1
-  %4 = extractelement <4 x float> %0, i32 2
-  %5 = extractelement <4 x float> %0, i32 3
-  %6 = insertelement <4 x float> undef, float %2, i32 0
-  %7 = insertelement <4 x float> %6, float %3, i32 1
-  %8 = insertelement <4 x float> %7, float %4, i32 2
-  %9 = insertelement <4 x float> %8, float %5, i32 3
-  %10 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %9)
-  %11 = extractelement <4 x float> %10, i32 0
-  %12 = extractelement <4 x float> %10, i32 1
-  %13 = extractelement <4 x float> %10, i32 2
-  %14 = extractelement <4 x float> %10, i32 3
-  %15 = call float @fabs(float %13)
-  %16 = fdiv float 1.000000e+00, %15
-  %17 = fmul float %11, %16
-  %18 = fadd float %17, 1.500000e+00
-  %19 = fmul float %12, %16
-  %20 = fadd float %19, 1.500000e+00
-  %21 = insertelement <4 x float> undef, float %20, i32 0
-  %22 = insertelement <4 x float> %21, float %18, i32 1
-  %23 = insertelement <4 x float> %22, float %14, i32 2
-  %24 = insertelement <4 x float> %23, float %5, i32 3
-  %25 = extractelement <4 x float> %24, i32 0
-  %26 = extractelement <4 x float> %24, i32 1
-  %27 = extractelement <4 x float> %24, i32 2
-  %28 = extractelement <4 x float> %24, i32 3
-  %29 = insertelement <4 x float> undef, float %25, i32 0
-  %30 = insertelement <4 x float> %29, float %26, i32 1
-  %31 = insertelement <4 x float> %30, float %27, i32 2
-  %32 = insertelement <4 x float> %31, float %28, i32 3
-  %33 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %32, i32 16, i32 0, i32 13)
-  %34 = extractelement <4 x float> %33, i32 0
-  %35 = insertelement <4 x float> undef, float %34, i32 0
-  %36 = insertelement <4 x float> %35, float %34, i32 1
-  %37 = insertelement <4 x float> %36, float %34, i32 2
-  %38 = insertelement <4 x float> %37, float 1.000000e+00, i32 3
-  call void @llvm.R600.store.swizzle(<4 x float> %38, i32 0, i32 0)
+  %tmp = extractelement <4 x float> %arg, i32 0
+  %tmp2 = extractelement <4 x float> %arg, i32 1
+  %tmp3 = extractelement <4 x float> %arg, i32 2
+  %tmp4 = extractelement <4 x float> %arg, i32 3
+  %tmp5 = insertelement <4 x float> undef, float %tmp, i32 0
+  %tmp6 = insertelement <4 x float> %tmp5, float %tmp2, i32 1
+  %tmp7 = insertelement <4 x float> %tmp6, float %tmp3, i32 2
+  %tmp8 = insertelement <4 x float> %tmp7, float %tmp4, i32 3
+  %tmp9 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %tmp8)
+  %tmp10 = extractelement <4 x float> %tmp9, i32 0
+  %tmp11 = extractelement <4 x float> %tmp9, i32 1
+  %tmp12 = extractelement <4 x float> %tmp9, i32 2
+  %tmp13 = extractelement <4 x float> %tmp9, i32 3
+  %tmp14 = call float @fabs(float %tmp12)
+  %tmp15 = fdiv float 1.000000e+00, %tmp14
+  %tmp16 = fmul float %tmp10, %tmp15
+  %tmp17 = fadd float %tmp16, 1.500000e+00
+  %tmp18 = fmul float %tmp11, %tmp15
+  %tmp19 = fadd float %tmp18, 1.500000e+00
+  %tmp20 = insertelement <4 x float> undef, float %tmp19, i32 0
+  %tmp21 = insertelement <4 x float> %tmp20, float %tmp17, i32 1
+  %tmp22 = insertelement <4 x float> %tmp21, float %tmp13, i32 2
+  %tmp23 = insertelement <4 x float> %tmp22, float %tmp4, i32 3
+  %tmp24 = extractelement <4 x float> %tmp23, i32 0
+  %tmp25 = extractelement <4 x float> %tmp23, i32 1
+  %tmp26 = extractelement <4 x float> %tmp23, i32 2
+  %tmp27 = extractelement <4 x float> %tmp23, i32 3
+  %tmp28 = insertelement <4 x float> undef, float %tmp24, i32 0
+  %tmp29 = insertelement <4 x float> %tmp28, float %tmp25, i32 1
+  %tmp30 = insertelement <4 x float> %tmp29, float %tmp26, i32 2
+  %tmp31 = insertelement <4 x float> %tmp30, float %tmp27, i32 3
+  %tmp32 = shufflevector <4 x float> %tmp31, <4 x float> %tmp31, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp33 = call <4 x float> @llvm.r600.texc(<4 x float> %tmp32, i32 0, i32 0, i32 0, i32 16, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %tmp34 = extractelement <4 x float> %tmp33, i32 0
+  %tmp35 = insertelement <4 x float> undef, float %tmp34, i32 0
+  %tmp36 = insertelement <4 x float> %tmp35, float %tmp34, i32 1
+  %tmp37 = insertelement <4 x float> %tmp36, float %tmp34, i32 2
+  %tmp38 = insertelement <4 x float> %tmp37, float 1.000000e+00, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %tmp38, i32 0, i32 0)
  ret void
 }

 ; Function Attrs: readnone
-declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #1
+declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #0

 ; Function Attrs: readnone
-declare float @fabs(float) #1
-
-; Function Attrs: readnone
-declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1
+declare float @fabs(float) #0

 declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)

-attributes #1 = { readnone }
+; Function Attrs: readnone
+declare <4 x float> @llvm.r600.texc(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
+
+attributes #0 = { nounwind readnone }
--- a/test/CodeGen/AMDGPU/rv7x0_count3.ll
+++ b/test/CodeGen/AMDGPU/rv7x0_count3.ll
@ -1,39 +1,52 @@
 ; RUN: llc < %s -march=r600 -show-mc-encoding  -mcpu=rv710 | FileCheck %s

 ; CHECK: TEX 9 @6 ;  encoding: [0x06,0x00,0x00,0x00,0x00,0x04,0x88,0x80]
-
 define amdgpu_vs void @test(<4 x float> inreg %reg0, <4 x float> inreg %reg1) {
-   %1 = extractelement <4 x float> %reg1, i32 0
-   %2 = extractelement <4 x float> %reg1, i32 1
-   %3 = extractelement <4 x float> %reg1, i32 2
-   %4 = extractelement <4 x float> %reg1, i32 3
-   %5 = insertelement <4 x float> undef, float %1, i32 0
-   %6 = insertelement <4 x float> %5, float %2, i32 1
-   %7 = insertelement <4 x float> %6, float %3, i32 2
-   %8 = insertelement <4 x float> %7, float %4, i32 3
-   %9 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 0, i32 0, i32 1)
-   %10 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 1, i32 0, i32 1)
-   %11 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 2, i32 0, i32 1)
-   %12 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 3, i32 0, i32 1)
-   %13 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 4, i32 0, i32 1)
-   %14 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 5, i32 0, i32 1)
-   %15 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 6, i32 0, i32 1)
-   %16 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 7, i32 0, i32 1)
-   %17 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 8, i32 0, i32 1)
-   %18 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 9, i32 0, i32 1)
-   %19 = fadd <4 x float> %9, %10
-   %20 = fadd <4 x float> %19, %11
-   %21 = fadd <4 x float> %20, %12
-   %22 = fadd <4 x float> %21, %13
-   %23 = fadd <4 x float> %22, %14
-   %24 = fadd <4 x float> %23, %15
-   %25 = fadd <4 x float> %24, %16
-   %26 = fadd <4 x float> %25, %17
-   %27 = fadd <4 x float> %26, %18
-   call void @llvm.R600.store.swizzle(<4 x float> %27, i32 0, i32 2)
-   ret void
+bb:
+  %tmp = extractelement <4 x float> %reg1, i32 0
+  %tmp1 = extractelement <4 x float> %reg1, i32 1
+  %tmp2 = extractelement <4 x float> %reg1, i32 2
+  %tmp3 = extractelement <4 x float> %reg1, i32 3
+  %tmp4 = insertelement <4 x float> undef, float %tmp, i32 0
+  %tmp5 = insertelement <4 x float> %tmp4, float %tmp1, i32 1
+  %tmp6 = insertelement <4 x float> %tmp5, float %tmp2, i32 2
+  %tmp7 = insertelement <4 x float> %tmp6, float %tmp3, i32 3
+  %tmp8 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp9 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %tmp10 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp11 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp10, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %tmp12 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp13 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp12, i32 0, i32 0, i32 0, i32 2, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %tmp14 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp15 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp14, i32 0, i32 0, i32 0, i32 3, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %tmp16 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp17 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp16, i32 0, i32 0, i32 0, i32 4, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %tmp18 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp19 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp18, i32 0, i32 0, i32 0, i32 5, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %tmp20 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp21 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp20, i32 0, i32 0, i32 0, i32 6, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %tmp22 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp23 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp22, i32 0, i32 0, i32 0, i32 7, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %tmp24 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp25 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp24, i32 0, i32 0, i32 0, i32 8, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %tmp26 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp27 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp26, i32 0, i32 0, i32 0, i32 9, i32 0, i32 1, i32 1, i32 1, i32 1)
+  %tmp28 = fadd <4 x float> %tmp9, %tmp11
+  %tmp29 = fadd <4 x float> %tmp28, %tmp13
+  %tmp30 = fadd <4 x float> %tmp29, %tmp15
+  %tmp31 = fadd <4 x float> %tmp30, %tmp17
+  %tmp32 = fadd <4 x float> %tmp31, %tmp19
+  %tmp33 = fadd <4 x float> %tmp32, %tmp21
+  %tmp34 = fadd <4 x float> %tmp33, %tmp23
+  %tmp35 = fadd <4 x float> %tmp34, %tmp25
+  %tmp36 = fadd <4 x float> %tmp35, %tmp27
+  call void @llvm.R600.store.swizzle(<4 x float> %tmp36, i32 0, i32 2)
+  ret void
 }

-declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone
-
 declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.r600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
+
+attributes #0 = { nounwind readnone }
--- a/test/CodeGen/AMDGPU/shared-op-cycle.ll
+++ b/test/CodeGen/AMDGPU/shared-op-cycle.ll
@ -17,14 +17,14 @@ define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4
   %v0 = insertelement <4 x float> undef, float %r0, i32 0
   %v1 = insertelement <4 x float> %v0, float %r1, i32 1
   %v2 = insertelement <4 x float> %v1, float %r2, i32 2
-   %res = call float @llvm.AMDGPU.dp4(<4 x float> %v2, <4 x float> %v2)
+   %res = call float @llvm.r600.dot4(<4 x float> %v2, <4 x float> %v2)
   %vecres = insertelement <4 x float> undef, float %res, i32 0
   call void @llvm.R600.store.swizzle(<4 x float> %vecres, i32 0, i32 2)
   ret void
 }

 ; Function Attrs: readnone
-declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1

 declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)