mirror of
https://github.com/RPCSX/llvm.git
synced 2025-02-04 19:38:22 +00:00
AMDGPU/R600: Delete/rename intrinsics no longer used by mesa
Use the replacement pass to update the tests, and delete old names. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@275375 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
04e6d2604d
commit
4d120e9b24
@ -29,7 +29,6 @@ class TargetMachine;
|
||||
|
||||
// R600 Passes
|
||||
FunctionPass *createR600VectorRegMerger(TargetMachine &tm);
|
||||
FunctionPass *createR600TextureIntrinsicsReplacer();
|
||||
FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm);
|
||||
FunctionPass *createR600EmitClauseMarkers();
|
||||
FunctionPass *createR600ClauseMergePass(TargetMachine &tm);
|
||||
|
@ -432,7 +432,6 @@ bool R600PassConfig::addPreISel() {
|
||||
|
||||
if (EnableR600StructurizeCFG)
|
||||
addPass(createStructurizeCFGPass());
|
||||
addPass(createR600TextureIntrinsicsReplacer());
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -61,7 +61,6 @@ add_llvm_target(AMDGPUCodeGen
|
||||
R600OptimizeVectorRegisters.cpp
|
||||
R600Packetizer.cpp
|
||||
R600RegisterInfo.cpp
|
||||
R600TextureIntrinsicsReplacer.cpp
|
||||
SIAnnotateControlFlow.cpp
|
||||
SIDebuggerInsertNops.cpp
|
||||
SIFixControlFlowLiveIntervals.cpp
|
||||
|
@ -738,7 +738,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
|
||||
};
|
||||
return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
|
||||
}
|
||||
case AMDGPUIntrinsic::AMDGPU_dp4: {
|
||||
case AMDGPUIntrinsic::r600_dot4: {
|
||||
SDValue Args[8] = {
|
||||
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
|
||||
DAG.getConstant(0, DL, MVT::i32)),
|
||||
|
@ -1332,9 +1332,7 @@ def TXD: InstR600 <
|
||||
(outs R600_Reg128:$dst),
|
||||
(ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2,
|
||||
i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
|
||||
"TXD $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget",
|
||||
[(set v4f32:$dst, (int_AMDGPU_txd v4f32:$src0, v4f32:$src1, v4f32:$src2,
|
||||
imm:$resourceId, imm:$samplerId, imm:$textureTarget))],
|
||||
"TXD $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget", [],
|
||||
NullALU > {
|
||||
let TEXInst = 1;
|
||||
}
|
||||
@ -1344,10 +1342,7 @@ def TXD_SHADOW: InstR600 <
|
||||
(ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2,
|
||||
i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
|
||||
"TXD_SHADOW $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget",
|
||||
[(set v4f32:$dst, (int_AMDGPU_txd v4f32:$src0, v4f32:$src1, v4f32:$src2,
|
||||
imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))],
|
||||
NullALU
|
||||
> {
|
||||
[], NullALU> {
|
||||
let TEXInst = 1;
|
||||
}
|
||||
} // End isPseudo = 1
|
||||
|
@ -12,18 +12,6 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// FIXME: Should migrate to using TargetPrefix that matches triple arch name.
|
||||
let TargetPrefix = "AMDGPU", isTarget = 1 in {
|
||||
def int_AMDGPU_dp4 : Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
|
||||
def int_AMDGPU_tex : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
|
||||
def int_AMDGPU_txb : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
|
||||
def int_AMDGPU_txf : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
|
||||
def int_AMDGPU_txq : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
|
||||
def int_AMDGPU_txd : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
|
||||
def int_AMDGPU_txl : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
|
||||
def int_AMDGPU_ddx : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
|
||||
def int_AMDGPU_ddy : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
|
||||
}
|
||||
|
||||
let TargetPrefix = "R600", isTarget = 1 in {
|
||||
def int_R600_store_swizzle :
|
||||
Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>;
|
||||
@ -70,4 +58,8 @@ let TargetPrefix = "r600", isTarget = 1 in {
|
||||
def int_r600_txq : TextureIntrinsicInt32Input;
|
||||
def int_r600_ddx : TextureIntrinsicFloatInput;
|
||||
def int_r600_ddy : TextureIntrinsicFloatInput;
|
||||
|
||||
def int_r600_dot4 : Intrinsic<[llvm_float_ty],
|
||||
[llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]
|
||||
>;
|
||||
} // End TargetPrefix = "r600", isTarget = 1
|
||||
|
@ -1,303 +0,0 @@
|
||||
//===-- R600TextureIntrinsicsReplacer.cpp ---------------------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
/// \file
|
||||
/// This pass translates TGSI-like texture intrinsics into R600 texture
|
||||
/// intrinsics that are closer to the hardware.
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "AMDGPU.h"
|
||||
#include "llvm/ADT/Statistic.h"
|
||||
#include "llvm/Analysis/Passes.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/GlobalValue.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/InstVisitor.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
namespace {
|
||||
class R600TextureIntrinsicsReplacer :
|
||||
public FunctionPass, public InstVisitor<R600TextureIntrinsicsReplacer> {
|
||||
static char ID;
|
||||
|
||||
Module *Mod;
|
||||
Type *FloatType;
|
||||
Type *Int32Type;
|
||||
Type *V4f32Type;
|
||||
Type *V4i32Type;
|
||||
FunctionType *TexSign;
|
||||
FunctionType *TexQSign;
|
||||
|
||||
void getAdjustmentFromTextureTarget(unsigned TextureType, bool hasLOD,
|
||||
unsigned SrcSelect[4], unsigned CT[4],
|
||||
bool &useShadowVariant) {
|
||||
enum TextureTypes {
|
||||
TEXTURE_1D = 1,
|
||||
TEXTURE_2D,
|
||||
TEXTURE_3D,
|
||||
TEXTURE_CUBE,
|
||||
TEXTURE_RECT,
|
||||
TEXTURE_SHADOW1D,
|
||||
TEXTURE_SHADOW2D,
|
||||
TEXTURE_SHADOWRECT,
|
||||
TEXTURE_1D_ARRAY,
|
||||
TEXTURE_2D_ARRAY,
|
||||
TEXTURE_SHADOW1D_ARRAY,
|
||||
TEXTURE_SHADOW2D_ARRAY,
|
||||
TEXTURE_SHADOWCUBE,
|
||||
TEXTURE_2D_MSAA,
|
||||
TEXTURE_2D_ARRAY_MSAA,
|
||||
TEXTURE_CUBE_ARRAY,
|
||||
TEXTURE_SHADOWCUBE_ARRAY
|
||||
};
|
||||
|
||||
switch (TextureType) {
|
||||
case 0:
|
||||
useShadowVariant = false;
|
||||
return;
|
||||
case TEXTURE_RECT:
|
||||
case TEXTURE_1D:
|
||||
case TEXTURE_2D:
|
||||
case TEXTURE_3D:
|
||||
case TEXTURE_CUBE:
|
||||
case TEXTURE_1D_ARRAY:
|
||||
case TEXTURE_2D_ARRAY:
|
||||
case TEXTURE_CUBE_ARRAY:
|
||||
case TEXTURE_2D_MSAA:
|
||||
case TEXTURE_2D_ARRAY_MSAA:
|
||||
useShadowVariant = false;
|
||||
break;
|
||||
case TEXTURE_SHADOW1D:
|
||||
case TEXTURE_SHADOW2D:
|
||||
case TEXTURE_SHADOWRECT:
|
||||
case TEXTURE_SHADOW1D_ARRAY:
|
||||
case TEXTURE_SHADOW2D_ARRAY:
|
||||
case TEXTURE_SHADOWCUBE:
|
||||
case TEXTURE_SHADOWCUBE_ARRAY:
|
||||
useShadowVariant = true;
|
||||
break;
|
||||
default:
|
||||
llvm_unreachable("Unknow Texture Type");
|
||||
}
|
||||
|
||||
if (TextureType == TEXTURE_RECT ||
|
||||
TextureType == TEXTURE_SHADOWRECT) {
|
||||
CT[0] = 0;
|
||||
CT[1] = 0;
|
||||
}
|
||||
|
||||
if (TextureType == TEXTURE_CUBE_ARRAY ||
|
||||
TextureType == TEXTURE_SHADOWCUBE_ARRAY)
|
||||
CT[2] = 0;
|
||||
|
||||
if (TextureType == TEXTURE_1D_ARRAY ||
|
||||
TextureType == TEXTURE_SHADOW1D_ARRAY) {
|
||||
if (hasLOD && useShadowVariant) {
|
||||
CT[1] = 0;
|
||||
} else {
|
||||
CT[2] = 0;
|
||||
SrcSelect[2] = 1;
|
||||
}
|
||||
} else if (TextureType == TEXTURE_2D_ARRAY ||
|
||||
TextureType == TEXTURE_SHADOW2D_ARRAY) {
|
||||
CT[2] = 0;
|
||||
}
|
||||
|
||||
if ((TextureType == TEXTURE_SHADOW1D ||
|
||||
TextureType == TEXTURE_SHADOW2D ||
|
||||
TextureType == TEXTURE_SHADOWRECT ||
|
||||
TextureType == TEXTURE_SHADOW1D_ARRAY) &&
|
||||
!(hasLOD && useShadowVariant))
|
||||
SrcSelect[3] = 2;
|
||||
}
|
||||
|
||||
void ReplaceCallInst(CallInst &I, FunctionType *FT, const char *Name,
|
||||
unsigned SrcSelect[4], Value *Offset[3], Value *Resource,
|
||||
Value *Sampler, unsigned CT[4], Value *Coord) {
|
||||
IRBuilder<> Builder(&I);
|
||||
Constant *Mask[] = {
|
||||
ConstantInt::get(Int32Type, SrcSelect[0]),
|
||||
ConstantInt::get(Int32Type, SrcSelect[1]),
|
||||
ConstantInt::get(Int32Type, SrcSelect[2]),
|
||||
ConstantInt::get(Int32Type, SrcSelect[3])
|
||||
};
|
||||
Value *SwizzleMask = ConstantVector::get(Mask);
|
||||
Value *SwizzledCoord =
|
||||
Builder.CreateShuffleVector(Coord, Coord, SwizzleMask);
|
||||
|
||||
Value *Args[] = {
|
||||
SwizzledCoord,
|
||||
Offset[0],
|
||||
Offset[1],
|
||||
Offset[2],
|
||||
Resource,
|
||||
Sampler,
|
||||
ConstantInt::get(Int32Type, CT[0]),
|
||||
ConstantInt::get(Int32Type, CT[1]),
|
||||
ConstantInt::get(Int32Type, CT[2]),
|
||||
ConstantInt::get(Int32Type, CT[3])
|
||||
};
|
||||
|
||||
Function *F = Mod->getFunction(Name);
|
||||
if (!F) {
|
||||
F = Function::Create(FT, GlobalValue::ExternalLinkage, Name, Mod);
|
||||
F->addFnAttr(Attribute::ReadNone);
|
||||
}
|
||||
I.replaceAllUsesWith(Builder.CreateCall(F, Args));
|
||||
I.eraseFromParent();
|
||||
}
|
||||
|
||||
void ReplaceTexIntrinsic(CallInst &I, bool hasLOD, FunctionType *FT,
|
||||
const char *VanillaInt,
|
||||
const char *ShadowInt) {
|
||||
Value *Coord = I.getArgOperand(0);
|
||||
Value *ResourceId = I.getArgOperand(1);
|
||||
Value *SamplerId = I.getArgOperand(2);
|
||||
|
||||
unsigned TextureType =
|
||||
cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
|
||||
|
||||
unsigned SrcSelect[4] = { 0, 1, 2, 3 };
|
||||
unsigned CT[4] = {1, 1, 1, 1};
|
||||
Value *Offset[3] = {
|
||||
ConstantInt::get(Int32Type, 0),
|
||||
ConstantInt::get(Int32Type, 0),
|
||||
ConstantInt::get(Int32Type, 0)
|
||||
};
|
||||
bool useShadowVariant;
|
||||
|
||||
getAdjustmentFromTextureTarget(TextureType, hasLOD, SrcSelect, CT,
|
||||
useShadowVariant);
|
||||
|
||||
ReplaceCallInst(I, FT, useShadowVariant?ShadowInt:VanillaInt, SrcSelect,
|
||||
Offset, ResourceId, SamplerId, CT, Coord);
|
||||
}
|
||||
|
||||
void ReplaceTXF(CallInst &I) {
|
||||
Value *Coord = I.getArgOperand(0);
|
||||
Value *ResourceId = I.getArgOperand(4);
|
||||
Value *SamplerId = I.getArgOperand(5);
|
||||
|
||||
unsigned TextureType =
|
||||
cast<ConstantInt>(I.getArgOperand(6))->getZExtValue();
|
||||
|
||||
unsigned SrcSelect[4] = { 0, 1, 2, 3 };
|
||||
unsigned CT[4] = {1, 1, 1, 1};
|
||||
Value *Offset[3] = {
|
||||
I.getArgOperand(1),
|
||||
I.getArgOperand(2),
|
||||
I.getArgOperand(3),
|
||||
};
|
||||
bool useShadowVariant;
|
||||
|
||||
getAdjustmentFromTextureTarget(TextureType, false, SrcSelect, CT,
|
||||
useShadowVariant);
|
||||
|
||||
ReplaceCallInst(I, TexQSign, "llvm.R600.txf", SrcSelect,
|
||||
Offset, ResourceId, SamplerId, CT, Coord);
|
||||
}
|
||||
|
||||
public:
|
||||
R600TextureIntrinsicsReplacer():
|
||||
FunctionPass(ID) {
|
||||
}
|
||||
|
||||
bool doInitialization(Module &M) override {
|
||||
LLVMContext &Ctx = M.getContext();
|
||||
Mod = &M;
|
||||
FloatType = Type::getFloatTy(Ctx);
|
||||
Int32Type = Type::getInt32Ty(Ctx);
|
||||
V4f32Type = VectorType::get(FloatType, 4);
|
||||
V4i32Type = VectorType::get(Int32Type, 4);
|
||||
Type *ArgsType[] = {
|
||||
V4f32Type,
|
||||
Int32Type,
|
||||
Int32Type,
|
||||
Int32Type,
|
||||
Int32Type,
|
||||
Int32Type,
|
||||
Int32Type,
|
||||
Int32Type,
|
||||
Int32Type,
|
||||
Int32Type,
|
||||
};
|
||||
TexSign = FunctionType::get(V4f32Type, ArgsType, /*isVarArg=*/false);
|
||||
Type *ArgsQType[] = {
|
||||
V4i32Type,
|
||||
Int32Type,
|
||||
Int32Type,
|
||||
Int32Type,
|
||||
Int32Type,
|
||||
Int32Type,
|
||||
Int32Type,
|
||||
Int32Type,
|
||||
Int32Type,
|
||||
Int32Type,
|
||||
};
|
||||
TexQSign = FunctionType::get(V4f32Type, ArgsQType, /*isVarArg=*/false);
|
||||
return false;
|
||||
}
|
||||
|
||||
bool runOnFunction(Function &F) override {
|
||||
visit(F);
|
||||
return false;
|
||||
}
|
||||
|
||||
const char *getPassName() const override {
|
||||
return "R600 Texture Intrinsics Replacer";
|
||||
}
|
||||
|
||||
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
||||
}
|
||||
|
||||
void visitCallInst(CallInst &I) {
|
||||
if (!I.getCalledFunction())
|
||||
return;
|
||||
|
||||
StringRef Name = I.getCalledFunction()->getName();
|
||||
if (Name == "llvm.AMDGPU.tex") {
|
||||
ReplaceTexIntrinsic(I, false, TexSign, "llvm.r600.tex", "llvm.r600.texc");
|
||||
return;
|
||||
}
|
||||
if (Name == "llvm.AMDGPU.txl") {
|
||||
ReplaceTexIntrinsic(I, true, TexSign, "llvm.r600.txl", "llvm.r600.txlc");
|
||||
return;
|
||||
}
|
||||
if (Name == "llvm.AMDGPU.txb") {
|
||||
ReplaceTexIntrinsic(I, true, TexSign, "llvm.r600.txb", "llvm.r600.txbc");
|
||||
return;
|
||||
}
|
||||
if (Name == "llvm.AMDGPU.txf") {
|
||||
ReplaceTXF(I);
|
||||
return;
|
||||
}
|
||||
if (Name == "llvm.AMDGPU.txq") {
|
||||
ReplaceTexIntrinsic(I, false, TexQSign, "llvm.r600.txq", "llvm.r600.txq");
|
||||
return;
|
||||
}
|
||||
if (Name == "llvm.AMDGPU.ddx") {
|
||||
ReplaceTexIntrinsic(I, false, TexSign, "llvm.r600.ddx", "llvm.r600.ddx");
|
||||
return;
|
||||
}
|
||||
if (Name == "llvm.AMDGPU.ddy") {
|
||||
ReplaceTexIntrinsic(I, false, TexSign, "llvm.r600.ddy", "llvm.r600.ddy");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
// Out-of-class definition of the pass's static ID member, which the
// constructor passes to FunctionPass.
char R600TextureIntrinsicsReplacer::ID = 0;
|
||||
|
||||
}
|
||||
|
||||
/// Factory for the R600 texture intrinsics replacer: allocates a new pass
/// instance and returns it through the generic FunctionPass interface.
FunctionPass *llvm::createR600TextureIntrinsicsReplacer() {
  FunctionPass *Replacer = new R600TextureIntrinsicsReplacer();
  return Replacer;
}
|
File diff suppressed because it is too large
Load Diff
@ -1,27 +0,0 @@
|
||||
;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
||||
|
||||
; Exactly one constant vector can be folded into dot4, which means exactly
|
||||
; 4 MOV instructions
|
||||
; CHECK: {{^}}main:
|
||||
; CHECK: MOV
|
||||
; CHECK: MOV
|
||||
; CHECK: MOV
|
||||
; CHECK: MOV
|
||||
; CHECK-NOT: MOV
|
||||
; CHECK-NOT: MOV
|
||||
; CHECK-NOT: MOV
|
||||
; CHECK-NOT: MOV
|
||||
|
||||
; Test body: loads two <4 x float> values from address space 8 (the null base
; and element 1 of a [1024 x <4 x float>] array), feeds both to
; llvm.AMDGPU.dp4, places the scalar result in lane 0 of an otherwise undef
; vector and passes that vector to llvm.R600.store.swizzle.
define void @main(float addrspace(1)* %out) {
|
||||
main_body:
|
||||
%0 = load <4 x float>, <4 x float> addrspace(8)* null
|
||||
%1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
|
||||
%2 = call float @llvm.AMDGPU.dp4(<4 x float> %0,<4 x float> %1)
|
||||
%3 = insertelement <4 x float> undef, float %2, i32 0
|
||||
call void @llvm.R600.store.swizzle(<4 x float> %3, i32 0, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
|
||||
declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
|
||||
attributes #1 = { readnone }
|
@ -9,38 +9,48 @@
|
||||
|
||||
define amdgpu_ps void @fetch_limits_r600() {
|
||||
entry:
|
||||
%0 = load <4 x float>, <4 x float> addrspace(8)* null
|
||||
%1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
|
||||
%2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
|
||||
%3 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
|
||||
%4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
|
||||
%5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
|
||||
%6 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
|
||||
%7 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
|
||||
%8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
|
||||
%res0 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %0, i32 0, i32 0, i32 1)
|
||||
%res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %1, i32 0, i32 0, i32 1)
|
||||
%res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %2, i32 0, i32 0, i32 1)
|
||||
%res3 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %3, i32 0, i32 0, i32 1)
|
||||
%res4 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %4, i32 0, i32 0, i32 1)
|
||||
%res5 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %5, i32 0, i32 0, i32 1)
|
||||
%res6 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %6, i32 0, i32 0, i32 1)
|
||||
%res7 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %7, i32 0, i32 0, i32 1)
|
||||
%res8 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 0, i32 0, i32 1)
|
||||
%a = fadd <4 x float> %res0, %res1
|
||||
%b = fadd <4 x float> %res2, %res3
|
||||
%c = fadd <4 x float> %res4, %res5
|
||||
%d = fadd <4 x float> %res6, %res7
|
||||
%e = fadd <4 x float> %res8, %a
|
||||
|
||||
%tmp = load <4 x float>, <4 x float> addrspace(8)* null
|
||||
%tmp1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
|
||||
%tmp2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
|
||||
%tmp3 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
|
||||
%tmp4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
|
||||
%tmp5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
|
||||
%tmp6 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
|
||||
%tmp7 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
|
||||
%tmp8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
|
||||
%tmp9 = shufflevector <4 x float> %tmp, <4 x float> %tmp, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%tmp10 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp9, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%tmp11 = shufflevector <4 x float> %tmp1, <4 x float> %tmp1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%tmp12 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%tmp13 = shufflevector <4 x float> %tmp2, <4 x float> %tmp2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%tmp14 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%tmp15 = shufflevector <4 x float> %tmp3, <4 x float> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%tmp16 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%tmp17 = shufflevector <4 x float> %tmp4, <4 x float> %tmp4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%tmp18 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp17, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%tmp19 = shufflevector <4 x float> %tmp5, <4 x float> %tmp5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%tmp20 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp19, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%tmp21 = shufflevector <4 x float> %tmp6, <4 x float> %tmp6, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%tmp22 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp21, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%tmp23 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%tmp24 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp23, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%tmp25 = shufflevector <4 x float> %tmp8, <4 x float> %tmp8, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%tmp26 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp25, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%a = fadd <4 x float> %tmp10, %tmp12
|
||||
%b = fadd <4 x float> %tmp14, %tmp16
|
||||
%c = fadd <4 x float> %tmp18, %tmp20
|
||||
%d = fadd <4 x float> %tmp22, %tmp24
|
||||
%e = fadd <4 x float> %tmp26, %a
|
||||
%bc = fadd <4 x float> %b, %c
|
||||
%de = fadd <4 x float> %d, %e
|
||||
|
||||
%bcde = fadd <4 x float> %bc, %de
|
||||
|
||||
call void @llvm.R600.store.swizzle(<4 x float> %bcde, i32 0, i32 1)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone
|
||||
declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
|
||||
|
||||
; Function Attrs: readnone
|
||||
declare <4 x float> @llvm.r600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
|
||||
|
||||
attributes #0 = { nounwind readnone }
|
||||
|
@ -35,45 +35,63 @@ entry:
|
||||
%14 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
|
||||
%15 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 15)
|
||||
%16 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
|
||||
%res0 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %0, i32 0, i32 0, i32 1)
|
||||
%res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %1, i32 0, i32 0, i32 1)
|
||||
%res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %2, i32 0, i32 0, i32 1)
|
||||
%res3 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %3, i32 0, i32 0, i32 1)
|
||||
%res4 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %4, i32 0, i32 0, i32 1)
|
||||
%res5 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %5, i32 0, i32 0, i32 1)
|
||||
%res6 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %6, i32 0, i32 0, i32 1)
|
||||
%res7 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %7, i32 0, i32 0, i32 1)
|
||||
%res8 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 0, i32 0, i32 1)
|
||||
%res9 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %9, i32 0, i32 0, i32 1)
|
||||
%res10 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %10, i32 0, i32 0, i32 1)
|
||||
%res11 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %11, i32 0, i32 0, i32 1)
|
||||
%res12 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %12, i32 0, i32 0, i32 1)
|
||||
%res13 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %13, i32 0, i32 0, i32 1)
|
||||
%res14 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %14, i32 0, i32 0, i32 1)
|
||||
%res15 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %15, i32 0, i32 0, i32 1)
|
||||
%res16 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %16, i32 0, i32 0, i32 1)
|
||||
%a = fadd <4 x float> %res0, %res1
|
||||
%b = fadd <4 x float> %res2, %res3
|
||||
%c = fadd <4 x float> %res4, %res5
|
||||
%d = fadd <4 x float> %res6, %res7
|
||||
%e = fadd <4 x float> %res8, %res9
|
||||
%f = fadd <4 x float> %res10, %res11
|
||||
%g = fadd <4 x float> %res12, %res13
|
||||
%h = fadd <4 x float> %res14, %res15
|
||||
%i = fadd <4 x float> %res16, %a
|
||||
|
||||
%17 = shufflevector <4 x float> %0, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%18 = call <4 x float> @llvm.r600.tex(<4 x float> %17, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%19 = shufflevector <4 x float> %1, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%20 = call <4 x float> @llvm.r600.tex(<4 x float> %19, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%21 = shufflevector <4 x float> %2, <4 x float> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%22 = call <4 x float> @llvm.r600.tex(<4 x float> %21, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%23 = shufflevector <4 x float> %3, <4 x float> %3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%24 = call <4 x float> @llvm.r600.tex(<4 x float> %23, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%25 = shufflevector <4 x float> %4, <4 x float> %4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%26 = call <4 x float> @llvm.r600.tex(<4 x float> %25, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%27 = shufflevector <4 x float> %5, <4 x float> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%28 = call <4 x float> @llvm.r600.tex(<4 x float> %27, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%29 = shufflevector <4 x float> %6, <4 x float> %6, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%30 = call <4 x float> @llvm.r600.tex(<4 x float> %29, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%31 = shufflevector <4 x float> %7, <4 x float> %7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%32 = call <4 x float> @llvm.r600.tex(<4 x float> %31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%33 = shufflevector <4 x float> %8, <4 x float> %8, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%34 = call <4 x float> @llvm.r600.tex(<4 x float> %33, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%35 = shufflevector <4 x float> %9, <4 x float> %9, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%36 = call <4 x float> @llvm.r600.tex(<4 x float> %35, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%37 = shufflevector <4 x float> %10, <4 x float> %10, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%38 = call <4 x float> @llvm.r600.tex(<4 x float> %37, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%39 = shufflevector <4 x float> %11, <4 x float> %11, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%40 = call <4 x float> @llvm.r600.tex(<4 x float> %39, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%41 = shufflevector <4 x float> %12, <4 x float> %12, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%42 = call <4 x float> @llvm.r600.tex(<4 x float> %41, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%43 = shufflevector <4 x float> %13, <4 x float> %13, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%44 = call <4 x float> @llvm.r600.tex(<4 x float> %43, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%45 = shufflevector <4 x float> %14, <4 x float> %14, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%46 = call <4 x float> @llvm.r600.tex(<4 x float> %45, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%47 = shufflevector <4 x float> %15, <4 x float> %15, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%48 = call <4 x float> @llvm.r600.tex(<4 x float> %47, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%49 = shufflevector <4 x float> %16, <4 x float> %16, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%50 = call <4 x float> @llvm.r600.tex(<4 x float> %49, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%a = fadd <4 x float> %18, %20
|
||||
%b = fadd <4 x float> %22, %24
|
||||
%c = fadd <4 x float> %26, %28
|
||||
%d = fadd <4 x float> %30, %32
|
||||
%e = fadd <4 x float> %34, %36
|
||||
%f = fadd <4 x float> %38, %40
|
||||
%g = fadd <4 x float> %42, %44
|
||||
%h = fadd <4 x float> %46, %48
|
||||
%i = fadd <4 x float> %50, %a
|
||||
%bc = fadd <4 x float> %b, %c
|
||||
%de = fadd <4 x float> %d, %e
|
||||
%fg = fadd <4 x float> %f, %g
|
||||
%hi = fadd <4 x float> %h, %i
|
||||
|
||||
%bcde = fadd <4 x float> %bc, %de
|
||||
%fghi = fadd <4 x float> %fg, %hi
|
||||
|
||||
%bcdefghi = fadd <4 x float> %bcde, %fghi
|
||||
call void @llvm.R600.store.swizzle(<4 x float> %bcdefghi, i32 0, i32 1)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone
|
||||
declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
|
||||
|
||||
; Function Attrs: readnone
|
||||
declare <4 x float> @llvm.r600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
|
||||
|
||||
attributes #0 = { readnone }
|
||||
|
@ -54,11 +54,11 @@ entry:
|
||||
; CHECK-NEXT: DOT4 * T[[GPR]].W (MASKED), 1.0
|
||||
define void @inline_literal_dot4(float addrspace(1)* %out) {
|
||||
entry:
|
||||
%0 = call float @llvm.AMDGPU.dp4(<4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
%0 = call float @llvm.r600.dot4(<4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
|
||||
store float %0, float addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
|
||||
declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1
|
||||
|
||||
attributes #1 = { readnone }
|
||||
|
@ -1,4 +1,4 @@
|
||||
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
||||
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s
|
||||
|
||||
; CHECK-LABEL: {{^}}cube:
|
||||
; CHECK: CUBE T{{[0-9]}}.X
|
||||
@ -7,51 +7,51 @@
|
||||
; CHECK: CUBE * T{{[0-9]}}.W
|
||||
define amdgpu_ps void @cube() {
|
||||
main_body:
|
||||
%0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
|
||||
%1 = extractelement <4 x float> %0, i32 3
|
||||
%2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
|
||||
%3 = extractelement <4 x float> %2, i32 0
|
||||
%4 = fdiv float %3, %1
|
||||
%5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
|
||||
%6 = extractelement <4 x float> %5, i32 1
|
||||
%7 = fdiv float %6, %1
|
||||
%8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
|
||||
%9 = extractelement <4 x float> %8, i32 2
|
||||
%10 = fdiv float %9, %1
|
||||
%11 = insertelement <4 x float> undef, float %4, i32 0
|
||||
%12 = insertelement <4 x float> %11, float %7, i32 1
|
||||
%13 = insertelement <4 x float> %12, float %10, i32 2
|
||||
%14 = insertelement <4 x float> %13, float 1.000000e+00, i32 3
|
||||
%15 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %14)
|
||||
%16 = extractelement <4 x float> %15, i32 0
|
||||
%17 = extractelement <4 x float> %15, i32 1
|
||||
%18 = extractelement <4 x float> %15, i32 2
|
||||
%19 = extractelement <4 x float> %15, i32 3
|
||||
%20 = call float @fabs(float %18)
|
||||
%21 = fdiv float 1.000000e+00, %20
|
||||
%22 = fmul float %16, %21
|
||||
%23 = fadd float %22, 1.500000e+00
|
||||
%24 = fmul float %17, %21
|
||||
%25 = fadd float %24, 1.500000e+00
|
||||
%26 = insertelement <4 x float> undef, float %25, i32 0
|
||||
%27 = insertelement <4 x float> %26, float %23, i32 1
|
||||
%28 = insertelement <4 x float> %27, float %19, i32 2
|
||||
%29 = insertelement <4 x float> %28, float %25, i32 3
|
||||
%30 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %29, i32 16, i32 0, i32 4)
|
||||
call void @llvm.R600.store.swizzle(<4 x float> %30, i32 0, i32 0)
|
||||
%tmp = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
|
||||
%tmp1 = extractelement <4 x float> %tmp, i32 3
|
||||
%tmp2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
|
||||
%tmp3 = extractelement <4 x float> %tmp2, i32 0
|
||||
%tmp4 = fdiv float %tmp3, %tmp1
|
||||
%tmp5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
|
||||
%tmp6 = extractelement <4 x float> %tmp5, i32 1
|
||||
%tmp7 = fdiv float %tmp6, %tmp1
|
||||
%tmp8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
|
||||
%tmp9 = extractelement <4 x float> %tmp8, i32 2
|
||||
%tmp10 = fdiv float %tmp9, %tmp1
|
||||
%tmp11 = insertelement <4 x float> undef, float %tmp4, i32 0
|
||||
%tmp12 = insertelement <4 x float> %tmp11, float %tmp7, i32 1
|
||||
%tmp13 = insertelement <4 x float> %tmp12, float %tmp10, i32 2
|
||||
%tmp14 = insertelement <4 x float> %tmp13, float 1.000000e+00, i32 3
|
||||
%tmp15 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %tmp14)
|
||||
%tmp16 = extractelement <4 x float> %tmp15, i32 0
|
||||
%tmp17 = extractelement <4 x float> %tmp15, i32 1
|
||||
%tmp18 = extractelement <4 x float> %tmp15, i32 2
|
||||
%tmp19 = extractelement <4 x float> %tmp15, i32 3
|
||||
%tmp20 = call float @llvm.fabs.f32(float %tmp18)
|
||||
%tmp21 = fdiv float 1.000000e+00, %tmp20
|
||||
%tmp22 = fmul float %tmp16, %tmp21
|
||||
%tmp23 = fadd float %tmp22, 1.500000e+00
|
||||
%tmp24 = fmul float %tmp17, %tmp21
|
||||
%tmp25 = fadd float %tmp24, 1.500000e+00
|
||||
%tmp26 = insertelement <4 x float> undef, float %tmp25, i32 0
|
||||
%tmp27 = insertelement <4 x float> %tmp26, float %tmp23, i32 1
|
||||
%tmp28 = insertelement <4 x float> %tmp27, float %tmp19, i32 2
|
||||
%tmp29 = insertelement <4 x float> %tmp28, float %tmp25, i32 3
|
||||
%tmp30 = shufflevector <4 x float> %tmp29, <4 x float> %tmp29, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%tmp31 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp30, i32 0, i32 0, i32 0, i32 16, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
call void @llvm.R600.store.swizzle(<4 x float> %tmp31, i32 0, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: readnone
|
||||
declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #0
|
||||
|
||||
; Function Attrs: readnone
|
||||
declare float @fabs(float) #0
|
||||
|
||||
; Function Attrs: readnone
|
||||
declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #0
|
||||
; Function Attrs: nounwind readnone
|
||||
declare float @llvm.fabs.f32(float) #0
|
||||
|
||||
declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
|
||||
|
||||
attributes #0 = { readnone }
|
||||
; Function Attrs: readnone
|
||||
declare <4 x float> @llvm.r600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
|
||||
|
||||
attributes #0 = { nounwind readnone }
|
||||
|
@ -1,42 +0,0 @@
|
||||
;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
||||
|
||||
;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
|
||||
;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
|
||||
;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
|
||||
;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
|
||||
;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:UUNN
|
||||
;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:NNNN
|
||||
;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:NNNN
|
||||
;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:UUNN
|
||||
;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYYW}} RID:0 SID:0 CT:NNUN
|
||||
;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN
|
||||
;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYYZ}} RID:0 SID:0 CT:NNUN
|
||||
;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN
|
||||
;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
|
||||
;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
|
||||
;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
|
||||
;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN
|
||||
|
||||
define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
|
||||
%addr = load <4 x float>, <4 x float> addrspace(1)* %in
|
||||
%res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %addr, i32 0, i32 0, i32 1)
|
||||
%res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res1, i32 0, i32 0, i32 2)
|
||||
%res3 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res2, i32 0, i32 0, i32 3)
|
||||
%res4 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res3, i32 0, i32 0, i32 4)
|
||||
%res5 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res4, i32 0, i32 0, i32 5)
|
||||
%res6 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res5, i32 0, i32 0, i32 6)
|
||||
%res7 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res6, i32 0, i32 0, i32 7)
|
||||
%res8 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res7, i32 0, i32 0, i32 8)
|
||||
%res9 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res8, i32 0, i32 0, i32 9)
|
||||
%res10 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res9, i32 0, i32 0, i32 10)
|
||||
%res11 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res10, i32 0, i32 0, i32 11)
|
||||
%res12 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res11, i32 0, i32 0, i32 12)
|
||||
%res13 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res12, i32 0, i32 0, i32 13)
|
||||
%res14 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res13, i32 0, i32 0, i32 14)
|
||||
%res15 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res14, i32 0, i32 0, i32 15)
|
||||
%res16 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res15, i32 0, i32 0, i32 16)
|
||||
store <4 x float> %res16, <4 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone
|
@ -1,11 +1,11 @@
|
||||
; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s
|
||||
|
||||
declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) nounwind readnone
|
||||
declare float @llvm.r600.dot4(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
define void @test_dp4(float addrspace(1)* %out, <4 x float> addrspace(1)* %a, <4 x float> addrspace(1)* %b) nounwind {
|
||||
%src0 = load <4 x float>, <4 x float> addrspace(1)* %a, align 16
|
||||
%src1 = load <4 x float>, <4 x float> addrspace(1)* %b, align 16
|
||||
%dp4 = call float @llvm.AMDGPU.dp4(<4 x float> %src0, <4 x float> %src1) nounwind readnone
|
||||
%dp4 = call float @llvm.r600.dot4(<4 x float> %src0, <4 x float> %src1) nounwind readnone
|
||||
store float %dp4, float addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
65
test/CodeGen/AMDGPU/llvm.r600.tex.ll
Normal file
65
test/CodeGen/AMDGPU/llvm.r600.tex.ll
Normal file
@ -0,0 +1,65 @@
|
||||
;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
|
||||
|
||||
;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
|
||||
;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
|
||||
;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
|
||||
;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
|
||||
;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:UUNN
|
||||
;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:NNNN
|
||||
;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:NNNN
|
||||
;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:UUNN
|
||||
;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYYW}} RID:0 SID:0 CT:NNUN
|
||||
;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN
|
||||
;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYYZ}} RID:0 SID:0 CT:NNUN
|
||||
;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN
|
||||
;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
|
||||
;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
|
||||
;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
|
||||
;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN
|
||||
|
||||
define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
|
||||
bb:
|
||||
%addr = load <4 x float>, <4 x float> addrspace(1)* %in
|
||||
%tmp = shufflevector <4 x float> %addr, <4 x float> %addr, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%tmp1 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%tmp2 = shufflevector <4 x float> %tmp1, <4 x float> %tmp1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%tmp3 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%tmp4 = shufflevector <4 x float> %tmp3, <4 x float> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%tmp5 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp4, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%tmp6 = shufflevector <4 x float> %tmp5, <4 x float> %tmp5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%tmp7 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%tmp8 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%tmp9 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1)
|
||||
%tmp10 = shufflevector <4 x float> %tmp9, <4 x float> %tmp9, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
|
||||
%tmp11 = call <4 x float> @llvm.r600.texc(<4 x float> %tmp10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%tmp12 = shufflevector <4 x float> %tmp11, <4 x float> %tmp11, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
|
||||
%tmp13 = call <4 x float> @llvm.r600.texc(<4 x float> %tmp12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%tmp14 = shufflevector <4 x float> %tmp13, <4 x float> %tmp13, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
|
||||
%tmp15 = call <4 x float> @llvm.r600.texc(<4 x float> %tmp14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1)
|
||||
%tmp16 = shufflevector <4 x float> %tmp15, <4 x float> %tmp15, <4 x i32> <i32 0, i32 1, i32 1, i32 3>
|
||||
%tmp17 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0, i32 1)
|
||||
%tmp18 = shufflevector <4 x float> %tmp17, <4 x float> %tmp17, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%tmp19 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp18, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0, i32 1)
|
||||
%tmp20 = shufflevector <4 x float> %tmp19, <4 x float> %tmp19, <4 x i32> <i32 0, i32 1, i32 1, i32 2>
|
||||
%tmp21 = call <4 x float> @llvm.r600.texc(<4 x float> %tmp20, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0, i32 1)
|
||||
%tmp22 = shufflevector <4 x float> %tmp21, <4 x float> %tmp21, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%tmp23 = call <4 x float> @llvm.r600.texc(<4 x float> %tmp22, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0, i32 1)
|
||||
%tmp24 = shufflevector <4 x float> %tmp23, <4 x float> %tmp23, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%tmp25 = call <4 x float> @llvm.r600.texc(<4 x float> %tmp24, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%tmp26 = shufflevector <4 x float> %tmp25, <4 x float> %tmp25, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%tmp27 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp26, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%tmp28 = shufflevector <4 x float> %tmp27, <4 x float> %tmp27, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%tmp29 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp28, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%tmp30 = shufflevector <4 x float> %tmp29, <4 x float> %tmp29, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%tmp31 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp30, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0, i32 1)
|
||||
store <4 x float> %tmp31, <4 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: readnone
|
||||
declare <4 x float> @llvm.r600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
|
||||
|
||||
; Function Attrs: readnone
|
||||
declare <4 x float> @llvm.r600.texc(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
|
||||
|
||||
attributes #0 = { nounwind readnone }
|
@ -88,14 +88,14 @@ main_body:
|
||||
%83 = insertelement <4 x float> %82, float %75, i32 1
|
||||
%84 = insertelement <4 x float> %83, float %77, i32 2
|
||||
%85 = insertelement <4 x float> %84, float 0.000000e+00, i32 3
|
||||
%86 = call float @llvm.AMDGPU.dp4(<4 x float> %81, <4 x float> %85)
|
||||
%86 = call float @llvm.r600.dot4(<4 x float> %81, <4 x float> %85)
|
||||
%87 = insertelement <4 x float> undef, float %86, i32 0
|
||||
call void @llvm.R600.store.swizzle(<4 x float> %87, i32 2, i32 2)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: readnone
|
||||
declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
|
||||
declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1
|
||||
|
||||
; Function Attrs: readonly
|
||||
declare float @fabs(float) #2
|
||||
|
@ -23,7 +23,7 @@ main_body:
|
||||
%15 = insertelement <4 x float> %14, float %8, i32 3
|
||||
%16 = insertelement <4 x float> %15, float %11, i32 3
|
||||
|
||||
%17 = call float @llvm.AMDGPU.dp4(<4 x float> %15,<4 x float> %16)
|
||||
%17 = call float @llvm.r600.dot4(<4 x float> %15,<4 x float> %16)
|
||||
%18 = insertelement <4 x float> undef, float %17, i32 0
|
||||
call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 2)
|
||||
ret void
|
||||
@ -52,14 +52,14 @@ main_body:
|
||||
%15 = insertelement <4 x float> %14, float %8, i32 3
|
||||
%16 = insertelement <4 x float> %15, float %11, i32 3
|
||||
|
||||
%17 = call float @llvm.AMDGPU.dp4(<4 x float> %15,<4 x float> %16)
|
||||
%17 = call float @llvm.r600.dot4(<4 x float> %15,<4 x float> %16)
|
||||
%18 = insertelement <4 x float> undef, float %17, i32 0
|
||||
call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 2)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: readnone
|
||||
declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
|
||||
declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1
|
||||
|
||||
declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
|
||||
|
||||
|
@ -11,7 +11,7 @@ main_body:
|
||||
br i1 %3, label %IF, label %ENDIF
|
||||
|
||||
IF: ; preds = %main_body
|
||||
%4 = call float @llvm.AMDGPU.dp4(<4 x float> %0, <4 x float> %0)
|
||||
%4 = call float @llvm.r600.dot4(<4 x float> %0, <4 x float> %0)
|
||||
br label %ENDIF
|
||||
|
||||
ENDIF: ; preds = %IF, %main_body
|
||||
@ -21,6 +21,6 @@ ENDIF: ; preds = %IF, %main_body
|
||||
ret void
|
||||
}
|
||||
|
||||
declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
|
||||
declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1
|
||||
declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
|
||||
attributes #1 = { readnone }
|
||||
|
@ -16,7 +16,7 @@ main_body:
|
||||
%8 = extractelement <4 x float> %reg3, i32 2
|
||||
%9 = load <4 x float>, <4 x float> addrspace(8)* null
|
||||
%10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
|
||||
%11 = call float @llvm.AMDGPU.dp4(<4 x float> %9, <4 x float> %9)
|
||||
%11 = call float @llvm.r600.dot4(<4 x float> %9, <4 x float> %9)
|
||||
%12 = fmul float %0, %3
|
||||
%13 = fadd float %12, %6
|
||||
%14 = fmul float %1, %4
|
||||
@ -29,14 +29,14 @@ main_body:
|
||||
%21 = insertelement <4 x float> %20, float %15, i32 1
|
||||
%22 = insertelement <4 x float> %21, float %17, i32 2
|
||||
%23 = insertelement <4 x float> %22, float %19, i32 3
|
||||
%24 = call float @llvm.AMDGPU.dp4(<4 x float> %23, <4 x float> %10)
|
||||
%24 = call float @llvm.r600.dot4(<4 x float> %23, <4 x float> %10)
|
||||
%25 = insertelement <4 x float> undef, float %24, i32 0
|
||||
call void @llvm.R600.store.swizzle(<4 x float> %25, i32 0, i32 2)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: readnone
|
||||
declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
|
||||
declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1
|
||||
|
||||
|
||||
declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
|
||||
|
@ -101,7 +101,7 @@ main_body:
|
||||
%93 = insertelement <4 x float> %92, float %5, i32 1
|
||||
%94 = insertelement <4 x float> %93, float %6, i32 2
|
||||
%95 = insertelement <4 x float> %94, float 0.000000e+00, i32 3
|
||||
%96 = call float @llvm.AMDGPU.dp4(<4 x float> %91, <4 x float> %95)
|
||||
%96 = call float @llvm.r600.dot4(<4 x float> %91, <4 x float> %95)
|
||||
%97 = call float @fabs(float %96)
|
||||
%98 = call float @llvm.AMDGPU.rsq.clamped.f32(float %97)
|
||||
%99 = fmul float %4, %98
|
||||
@ -137,7 +137,7 @@ main_body:
|
||||
%129 = insertelement <4 x float> %128, float %121, i32 1
|
||||
%130 = insertelement <4 x float> %129, float %123, i32 2
|
||||
%131 = insertelement <4 x float> %130, float 0.000000e+00, i32 3
|
||||
%132 = call float @llvm.AMDGPU.dp4(<4 x float> %127, <4 x float> %131)
|
||||
%132 = call float @llvm.r600.dot4(<4 x float> %127, <4 x float> %131)
|
||||
%133 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
|
||||
%134 = extractelement <4 x float> %133, i32 0
|
||||
%135 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
|
||||
@ -152,7 +152,7 @@ main_body:
|
||||
%144 = insertelement <4 x float> %143, float %136, i32 1
|
||||
%145 = insertelement <4 x float> %144, float %138, i32 2
|
||||
%146 = insertelement <4 x float> %145, float 0.000000e+00, i32 3
|
||||
%147 = call float @llvm.AMDGPU.dp4(<4 x float> %142, <4 x float> %146)
|
||||
%147 = call float @llvm.r600.dot4(<4 x float> %142, <4 x float> %146)
|
||||
%148 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
|
||||
%149 = extractelement <4 x float> %148, i32 0
|
||||
%150 = fmul float %149, %8
|
||||
@ -219,7 +219,7 @@ main_body:
|
||||
}
|
||||
|
||||
; Function Attrs: readnone
|
||||
declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
|
||||
declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1
|
||||
|
||||
; Function Attrs: readonly
|
||||
declare float @fabs(float) #2
|
||||
|
@ -1,57 +1,58 @@
|
||||
;RUN: llc < %s -march=r600 -mcpu=cayman
|
||||
; RUN: llc -march=r600 -mcpu=cayman < %s
|
||||
|
||||
define amdgpu_ps void @main(<4 x float> inreg, <4 x float> inreg) {
|
||||
define amdgpu_ps void @main(<4 x float> inreg %arg, <4 x float> inreg %arg1) {
|
||||
main_body:
|
||||
%2 = extractelement <4 x float> %0, i32 0
|
||||
%3 = extractelement <4 x float> %0, i32 1
|
||||
%4 = extractelement <4 x float> %0, i32 2
|
||||
%5 = extractelement <4 x float> %0, i32 3
|
||||
%6 = insertelement <4 x float> undef, float %2, i32 0
|
||||
%7 = insertelement <4 x float> %6, float %3, i32 1
|
||||
%8 = insertelement <4 x float> %7, float %4, i32 2
|
||||
%9 = insertelement <4 x float> %8, float %5, i32 3
|
||||
%10 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %9)
|
||||
%11 = extractelement <4 x float> %10, i32 0
|
||||
%12 = extractelement <4 x float> %10, i32 1
|
||||
%13 = extractelement <4 x float> %10, i32 2
|
||||
%14 = extractelement <4 x float> %10, i32 3
|
||||
%15 = call float @fabs(float %13)
|
||||
%16 = fdiv float 1.000000e+00, %15
|
||||
%17 = fmul float %11, %16
|
||||
%18 = fadd float %17, 1.500000e+00
|
||||
%19 = fmul float %12, %16
|
||||
%20 = fadd float %19, 1.500000e+00
|
||||
%21 = insertelement <4 x float> undef, float %20, i32 0
|
||||
%22 = insertelement <4 x float> %21, float %18, i32 1
|
||||
%23 = insertelement <4 x float> %22, float %14, i32 2
|
||||
%24 = insertelement <4 x float> %23, float %5, i32 3
|
||||
%25 = extractelement <4 x float> %24, i32 0
|
||||
%26 = extractelement <4 x float> %24, i32 1
|
||||
%27 = extractelement <4 x float> %24, i32 2
|
||||
%28 = extractelement <4 x float> %24, i32 3
|
||||
%29 = insertelement <4 x float> undef, float %25, i32 0
|
||||
%30 = insertelement <4 x float> %29, float %26, i32 1
|
||||
%31 = insertelement <4 x float> %30, float %27, i32 2
|
||||
%32 = insertelement <4 x float> %31, float %28, i32 3
|
||||
%33 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %32, i32 16, i32 0, i32 13)
|
||||
%34 = extractelement <4 x float> %33, i32 0
|
||||
%35 = insertelement <4 x float> undef, float %34, i32 0
|
||||
%36 = insertelement <4 x float> %35, float %34, i32 1
|
||||
%37 = insertelement <4 x float> %36, float %34, i32 2
|
||||
%38 = insertelement <4 x float> %37, float 1.000000e+00, i32 3
|
||||
call void @llvm.R600.store.swizzle(<4 x float> %38, i32 0, i32 0)
|
||||
%tmp = extractelement <4 x float> %arg, i32 0
|
||||
%tmp2 = extractelement <4 x float> %arg, i32 1
|
||||
%tmp3 = extractelement <4 x float> %arg, i32 2
|
||||
%tmp4 = extractelement <4 x float> %arg, i32 3
|
||||
%tmp5 = insertelement <4 x float> undef, float %tmp, i32 0
|
||||
%tmp6 = insertelement <4 x float> %tmp5, float %tmp2, i32 1
|
||||
%tmp7 = insertelement <4 x float> %tmp6, float %tmp3, i32 2
|
||||
%tmp8 = insertelement <4 x float> %tmp7, float %tmp4, i32 3
|
||||
%tmp9 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %tmp8)
|
||||
%tmp10 = extractelement <4 x float> %tmp9, i32 0
|
||||
%tmp11 = extractelement <4 x float> %tmp9, i32 1
|
||||
%tmp12 = extractelement <4 x float> %tmp9, i32 2
|
||||
%tmp13 = extractelement <4 x float> %tmp9, i32 3
|
||||
%tmp14 = call float @fabs(float %tmp12)
|
||||
%tmp15 = fdiv float 1.000000e+00, %tmp14
|
||||
%tmp16 = fmul float %tmp10, %tmp15
|
||||
%tmp17 = fadd float %tmp16, 1.500000e+00
|
||||
%tmp18 = fmul float %tmp11, %tmp15
|
||||
%tmp19 = fadd float %tmp18, 1.500000e+00
|
||||
%tmp20 = insertelement <4 x float> undef, float %tmp19, i32 0
|
||||
%tmp21 = insertelement <4 x float> %tmp20, float %tmp17, i32 1
|
||||
%tmp22 = insertelement <4 x float> %tmp21, float %tmp13, i32 2
|
||||
%tmp23 = insertelement <4 x float> %tmp22, float %tmp4, i32 3
|
||||
%tmp24 = extractelement <4 x float> %tmp23, i32 0
|
||||
%tmp25 = extractelement <4 x float> %tmp23, i32 1
|
||||
%tmp26 = extractelement <4 x float> %tmp23, i32 2
|
||||
%tmp27 = extractelement <4 x float> %tmp23, i32 3
|
||||
%tmp28 = insertelement <4 x float> undef, float %tmp24, i32 0
|
||||
%tmp29 = insertelement <4 x float> %tmp28, float %tmp25, i32 1
|
||||
%tmp30 = insertelement <4 x float> %tmp29, float %tmp26, i32 2
|
||||
%tmp31 = insertelement <4 x float> %tmp30, float %tmp27, i32 3
|
||||
%tmp32 = shufflevector <4 x float> %tmp31, <4 x float> %tmp31, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%tmp33 = call <4 x float> @llvm.r600.texc(<4 x float> %tmp32, i32 0, i32 0, i32 0, i32 16, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%tmp34 = extractelement <4 x float> %tmp33, i32 0
|
||||
%tmp35 = insertelement <4 x float> undef, float %tmp34, i32 0
|
||||
%tmp36 = insertelement <4 x float> %tmp35, float %tmp34, i32 1
|
||||
%tmp37 = insertelement <4 x float> %tmp36, float %tmp34, i32 2
|
||||
%tmp38 = insertelement <4 x float> %tmp37, float 1.000000e+00, i32 3
|
||||
call void @llvm.R600.store.swizzle(<4 x float> %tmp38, i32 0, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: readnone
|
||||
declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #1
|
||||
declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #0
|
||||
|
||||
; Function Attrs: readnone
|
||||
declare float @fabs(float) #1
|
||||
|
||||
; Function Attrs: readnone
|
||||
declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1
|
||||
declare float @fabs(float) #0
|
||||
|
||||
declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
|
||||
|
||||
attributes #1 = { readnone }
|
||||
; Function Attrs: readnone
|
||||
declare <4 x float> @llvm.r600.texc(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
|
||||
|
||||
attributes #0 = { nounwind readnone }
|
||||
|
@ -1,39 +1,52 @@
|
||||
; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=rv710 | FileCheck %s
|
||||
|
||||
; CHECK: TEX 9 @6 ; encoding: [0x06,0x00,0x00,0x00,0x00,0x04,0x88,0x80]
|
||||
|
||||
define amdgpu_vs void @test(<4 x float> inreg %reg0, <4 x float> inreg %reg1) {
|
||||
%1 = extractelement <4 x float> %reg1, i32 0
|
||||
%2 = extractelement <4 x float> %reg1, i32 1
|
||||
%3 = extractelement <4 x float> %reg1, i32 2
|
||||
%4 = extractelement <4 x float> %reg1, i32 3
|
||||
%5 = insertelement <4 x float> undef, float %1, i32 0
|
||||
%6 = insertelement <4 x float> %5, float %2, i32 1
|
||||
%7 = insertelement <4 x float> %6, float %3, i32 2
|
||||
%8 = insertelement <4 x float> %7, float %4, i32 3
|
||||
%9 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 0, i32 0, i32 1)
|
||||
%10 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 1, i32 0, i32 1)
|
||||
%11 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 2, i32 0, i32 1)
|
||||
%12 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 3, i32 0, i32 1)
|
||||
%13 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 4, i32 0, i32 1)
|
||||
%14 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 5, i32 0, i32 1)
|
||||
%15 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 6, i32 0, i32 1)
|
||||
%16 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 7, i32 0, i32 1)
|
||||
%17 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 8, i32 0, i32 1)
|
||||
%18 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 9, i32 0, i32 1)
|
||||
%19 = fadd <4 x float> %9, %10
|
||||
%20 = fadd <4 x float> %19, %11
|
||||
%21 = fadd <4 x float> %20, %12
|
||||
%22 = fadd <4 x float> %21, %13
|
||||
%23 = fadd <4 x float> %22, %14
|
||||
%24 = fadd <4 x float> %23, %15
|
||||
%25 = fadd <4 x float> %24, %16
|
||||
%26 = fadd <4 x float> %25, %17
|
||||
%27 = fadd <4 x float> %26, %18
|
||||
call void @llvm.R600.store.swizzle(<4 x float> %27, i32 0, i32 2)
|
||||
ret void
|
||||
bb:
|
||||
%tmp = extractelement <4 x float> %reg1, i32 0
|
||||
%tmp1 = extractelement <4 x float> %reg1, i32 1
|
||||
%tmp2 = extractelement <4 x float> %reg1, i32 2
|
||||
%tmp3 = extractelement <4 x float> %reg1, i32 3
|
||||
%tmp4 = insertelement <4 x float> undef, float %tmp, i32 0
|
||||
%tmp5 = insertelement <4 x float> %tmp4, float %tmp1, i32 1
|
||||
%tmp6 = insertelement <4 x float> %tmp5, float %tmp2, i32 2
|
||||
%tmp7 = insertelement <4 x float> %tmp6, float %tmp3, i32 3
|
||||
%tmp8 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%tmp9 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%tmp10 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%tmp11 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp10, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%tmp12 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%tmp13 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp12, i32 0, i32 0, i32 0, i32 2, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%tmp14 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%tmp15 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp14, i32 0, i32 0, i32 0, i32 3, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%tmp16 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%tmp17 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp16, i32 0, i32 0, i32 0, i32 4, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%tmp18 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%tmp19 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp18, i32 0, i32 0, i32 0, i32 5, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%tmp20 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%tmp21 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp20, i32 0, i32 0, i32 0, i32 6, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%tmp22 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%tmp23 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp22, i32 0, i32 0, i32 0, i32 7, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%tmp24 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%tmp25 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp24, i32 0, i32 0, i32 0, i32 8, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%tmp26 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%tmp27 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp26, i32 0, i32 0, i32 0, i32 9, i32 0, i32 1, i32 1, i32 1, i32 1)
|
||||
%tmp28 = fadd <4 x float> %tmp9, %tmp11
|
||||
%tmp29 = fadd <4 x float> %tmp28, %tmp13
|
||||
%tmp30 = fadd <4 x float> %tmp29, %tmp15
|
||||
%tmp31 = fadd <4 x float> %tmp30, %tmp17
|
||||
%tmp32 = fadd <4 x float> %tmp31, %tmp19
|
||||
%tmp33 = fadd <4 x float> %tmp32, %tmp21
|
||||
%tmp34 = fadd <4 x float> %tmp33, %tmp23
|
||||
%tmp35 = fadd <4 x float> %tmp34, %tmp25
|
||||
%tmp36 = fadd <4 x float> %tmp35, %tmp27
|
||||
call void @llvm.R600.store.swizzle(<4 x float> %tmp36, i32 0, i32 2)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone
|
||||
|
||||
declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare <4 x float> @llvm.r600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
|
||||
|
||||
attributes #0 = { nounwind readnone }
|
||||
|
@ -17,14 +17,14 @@ define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4
|
||||
%v0 = insertelement <4 x float> undef, float %r0, i32 0
|
||||
%v1 = insertelement <4 x float> %v0, float %r1, i32 1
|
||||
%v2 = insertelement <4 x float> %v1, float %r2, i32 2
|
||||
%res = call float @llvm.AMDGPU.dp4(<4 x float> %v2, <4 x float> %v2)
|
||||
%res = call float @llvm.r600.dot4(<4 x float> %v2, <4 x float> %v2)
|
||||
%vecres = insertelement <4 x float> undef, float %res, i32 0
|
||||
call void @llvm.R600.store.swizzle(<4 x float> %vecres, i32 0, i32 2)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: readnone
|
||||
declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
|
||||
declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1
|
||||
|
||||
declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user