AMDGPU: Change fdiv lowering based on !fpmath metadata
If 2.5 ulp of error is acceptable, denormals are not required, and the division is not a reciprocal (which is already handled separately), replace the fdiv with a faster lowering. Also simplify the lowering tests by using per-function subtarget features.

llvm-svn: 276051
parent 7e559dbd5a
commit 75622c9e16
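For context, the new pass keys off the standard !fpmath accuracy metadata. A minimal sketch (not part of this change, names are illustrative) of how a frontend might tag an fdiv so that the visitFDiv code below rewrites it into @llvm.amdgcn.fdiv.fast on subtargets without fp32 denormals:

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/MDBuilder.h"

    using namespace llvm;

    // Emit 'a / b' with !fpmath !{float 2.5} attached. If at least 2.5 ulp of
    // error is allowed and fp32 denormals are off, AMDGPUCodeGenPrepare's
    // visitFDiv replaces the fdiv with a call to @llvm.amdgcn.fdiv.fast.
    static Value *emitDivWithFPMath(IRBuilder<> &Builder, Value *A, Value *B) {
      MDBuilder MDB(Builder.getContext());
      MDNode *FPMath = MDB.createFPMath(2.5f); // !{float 2.500000e+00}
      return Builder.CreateFDiv(A, B, "div", FPMath);
    }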
@@ -20,6 +20,7 @@ class AMDGPUInstrPrinter;
class AMDGPUSubtarget;
class AMDGPUTargetMachine;
class FunctionPass;
class GCNTargetMachine;
struct MachineSchedContext;
class MCAsmInfo;
class raw_ostream;
@@ -50,7 +51,7 @@ FunctionPass *createSIFixSGPRCopiesPass();
FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
FunctionPass *createSIDebuggerInsertNopsPass();
FunctionPass *createSIInsertWaitsPass();
FunctionPass *createAMDGPUCodeGenPreparePass(const TargetMachine *TM = nullptr);
FunctionPass *createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM = nullptr);

ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C);
@@ -14,7 +14,9 @@
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"

#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/CodeGen/Passes.h"
@@ -30,15 +32,28 @@ using namespace llvm;
namespace {

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare> {
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const GCNTargetMachine *TM;
  const SISubtarget *ST;
  DivergenceAnalysis *DA;
  const TargetMachine *TM;
  Module *Mod;
  bool HasUnsafeFPMath;

public:
  static char ID;
  AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
    FunctionPass(ID),
    TM(TM) { }
    TM(static_cast<const GCNTargetMachine *>(TM)),
    ST(nullptr),
    DA(nullptr),
    Mod(nullptr),
    HasUnsafeFPMath(false) { }

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) {
    return false;
  }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;
@@ -55,7 +70,92 @@ public:

} // End anonymous namespace

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return false;

  // Reciprocal f32 is handled separately without denormals.
  return UnsafeDiv && CNum->isExactlyValue(+1.0);
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  // TODO: Handle half
  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
                   FMF.allowReciprocal();
  if (ST->hasFP32Denormals() && !UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo();
  Function *Decl
    = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {});

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is partially
    // constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  return true;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  return false;
}
@@ -63,10 +163,21 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (!TM || skipFunction(F))
    return false;

  ST = &TM->getSubtarget<SISubtarget>(F);
  DA = &getAnalysis<DivergenceAnalysis>();
  visit(F);
  HasUnsafeFPMath = hasUnsafeFPMath(F);

  return true;
  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
@@ -77,6 +188,6 @@ INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const TargetMachine *TM) {
FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) {
  return new AMDGPUCodeGenPrepare(TM);
}
@@ -29,16 +29,39 @@ static const char *const IntrinsicNameTable[] = {
#undef GET_INTRINSIC_NAME_TABLE
};

std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
                                         unsigned numTys) const {
  if (IntrID < Intrinsic::num_intrinsics) {
    return nullptr;
  }
namespace {
#define GET_INTRINSIC_ATTRIBUTES
#include "AMDGPUGenIntrinsics.inc"
#undef GET_INTRINSIC_ATTRIBUTES
}

StringRef AMDGPUIntrinsicInfo::getName(unsigned IntrID,
                                       ArrayRef<Type *> Tys) const {
  if (IntrID < Intrinsic::num_intrinsics)
    return StringRef();

  assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics &&
         "Invalid intrinsic ID");

  std::string Result(IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics]);
  return Result;
  return IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics];
}

std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
                                         unsigned NumTys) const {
  return getName(IntrID, makeArrayRef(Tys, NumTys)).str();
}

FunctionType *AMDGPUIntrinsicInfo::getType(LLVMContext &Context, unsigned ID,
                                           ArrayRef<Type*> Tys) const {
  // FIXME: Re-use Intrinsic::getType machinery
  switch (ID) {
  case AMDGPUIntrinsic::amdgcn_fdiv_fast: {
    Type *F32Ty = Type::getFloatTy(Context);
    return FunctionType::get(F32Ty, { F32Ty, F32Ty }, false);
  }
  default:
    llvm_unreachable("unhandled intrinsic");
  }
}

unsigned AMDGPUIntrinsicInfo::lookupName(const char *NameData,
@@ -69,7 +92,19 @@ bool AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const {
}

Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
                                              Type **Tys,
                                              unsigned numTys) const {
  llvm_unreachable("Not implemented");
                                              ArrayRef<Type *> Tys) const {
  FunctionType *FTy = getType(M->getContext(), IntrID, Tys);
  Function *F
    = cast<Function>(M->getOrInsertFunction(getName(IntrID, Tys), FTy));

  AttributeSet AS = getAttributes(M->getContext(),
                                  static_cast<AMDGPUIntrinsic::ID>(IntrID));
  F->setAttributes(AS);
  return F;
}

Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
                                              Type **Tys,
                                              unsigned NumTys) const {
  return getDeclaration(M, IntrID, makeArrayRef(Tys, NumTys));
}
@@ -34,13 +34,23 @@ enum ID {
class AMDGPUIntrinsicInfo final : public TargetIntrinsicInfo {
public:
  AMDGPUIntrinsicInfo();

  StringRef getName(unsigned IntrId, ArrayRef<Type *> Tys = None) const;

  std::string getName(unsigned IntrId, Type **Tys = nullptr,
                      unsigned numTys = 0) const override;
                      unsigned NumTys = 0) const override;

  unsigned lookupName(const char *Name, unsigned Len) const override;
  bool isOverloaded(unsigned IID) const override;
  Function *getDeclaration(Module *M, unsigned ID,
                           Type **Tys = nullptr,
                           unsigned numTys = 0) const override;
                           unsigned NumTys = 0) const override;

  Function *getDeclaration(Module *M, unsigned ID,
                           ArrayRef<Type *> = None) const;

  FunctionType *getType(LLVMContext &Context, unsigned ID,
                        ArrayRef<Type*> Tys = None) const;
};

} // end namespace llvm
@@ -309,6 +309,7 @@ public:
  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override;

  void addIRPasses() override;
  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addInstSelector() override;
@@ -499,6 +500,13 @@ void GCNPassConfig::addMachineSSAOptimization() {
  addPass(&DeadMachineInstructionElimID);
}

void GCNPassConfig::addIRPasses() {
  // TODO: May want to move later or split into an early and late one.
  addPass(createAMDGPUCodeGenPreparePass(&getGCNTargetMachine()));

  AMDGPUPassConfig::addIRPasses();
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(createSILowerI1CopiesPass());
@@ -2113,6 +2113,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
    return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
                                   Op->getVTList(), Ops, VT, MMO);
  }
  case AMDGPUIntrinsic::amdgcn_fdiv_fast: {
    return lowerFDIV_FAST(Op, DAG);
  }
  case AMDGPUIntrinsic::SI_vs_load_input:
    return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
                       Op.getOperand(1),
@@ -2427,7 +2430,8 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {

// Catch division cases where we can use shortcuts with rcp and rsq
// instructions.
SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const {
SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
@@ -2468,47 +2472,48 @@ SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const {
  return SDValue();
}

// Faster 2.5 ULP division that does not support denormals.
SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(1);
  SDValue RHS = Op.getOperand(2);

  SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);

  const APFloat K0Val(BitsToFloat(0x6f800000));
  const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);

  const APFloat K1Val(BitsToFloat(0x2f800000));
  const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);

  EVT SetCCVT =
    getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);

  SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);

  SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);

  // TODO: Should this propagate fast-math-flags?
  r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);

  // rcp does not support denormals.
  SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);

  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);

  return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
}
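The two bit patterns above are exact powers of two (0x6f800000 encodes 2^96, 0x2f800000 encodes 2^-32), so one way to read the sequence, roughly, is a scaled reciprocal that keeps the reciprocal of a very large denominator out of the denormal range that rcp flushes:

    s = \begin{cases} 2^{-32} & \text{if } |b| > 2^{96} \\ 1 & \text{otherwise} \end{cases},
    \qquad \frac{a}{b} \approx s \cdot \bigl(a \cdot \mathrm{rcp}(b \cdot s)\bigr).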

SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
  if (SDValue FastLowered = LowerFastFDIV(Op, DAG))
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;

  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);

  // faster 2.5 ulp fdiv when using -amdgpu-fast-fdiv flag
  if (EnableAMDGPUFastFDIV) {
    // This does not support denormals.
    SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);

    const APFloat K0Val(BitsToFloat(0x6f800000));
    const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);

    const APFloat K1Val(BitsToFloat(0x2f800000));
    const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);

    const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);

    EVT SetCCVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);

    SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);

    SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);

    // TODO: Should this propagate fast-math-flags?

    r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);

    // rcp does not support denormals.
    SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);

    SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);

    return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
  }

  // Generates more precise fpdiv32.
  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);

  SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
@@ -2538,7 +2543,7 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {

SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
  if (DAG.getTarget().Options.UnsafeFPMath)
    return LowerFastFDIV(Op, DAG);
    return lowerFastUnsafeFDIV(Op, DAG);

  SDLoc SL(Op);
  SDValue X = Op.getOperand(0);
@@ -36,7 +36,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
  SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const;
  SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const;
  SDValue lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const;
@@ -7,7 +7,8 @@
//
//===----------------------------------------------------------------------===//
//
// SI Intrinsic Definitions
// Backend internal SI Intrinsic Definitions. User code should not
// directly use these.
//
//===----------------------------------------------------------------------===//

@@ -177,6 +178,12 @@ let TargetPrefix = "SI", isTarget = 1 in {
} // End TargetPrefix = "SI", isTarget = 1

let TargetPrefix = "amdgcn", isTarget = 1 in {
  // Emit 2.5 ulp, no denormal division. Should only be inserted by
  // pass based on !fpmath metadata.
  def int_amdgcn_fdiv_fast : Intrinsic<
    [llvm_float_ty], [llvm_float_ty], [IntrNoMem]
  >;

  /* Control flow Intrinsics */

  def int_amdgcn_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>;
@@ -1,8 +1,242 @@
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare < %s | FileCheck %s
; RUN: opt -S -amdgpu-codegenprepare < %s
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare %s | FileCheck %s
; RUN: opt -S -amdgpu-codegenprepare %s | FileCheck -check-prefix=NOOP %s
; Make sure this doesn't crash with no triple

; CHECK-LABEL: @foo(
define void @foo() {
; NOOP-LABEL: @noop_fdiv_fpmath(
; NOOP: %md.25ulp = fdiv float %a, %b, !fpmath !0
define void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #3 {
  %md.25ulp = fdiv float %a, %b, !fpmath !0
  store volatile float %md.25ulp, float addrspace(1)* %out
  ret void
}

; CHECK-LABEL: @fdiv_fpmath(
; CHECK: %no.md = fdiv float %a, %b{{$}}
; CHECK: %md.half.ulp = fdiv float %a, %b, !fpmath !1
; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2
; CHECK: %md.25ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
; CHECK: %md.3ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !3
; CHECK: %fast.md.25ulp = call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
; CHECK: arcp.md.25ulp = call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
define void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 {
  %no.md = fdiv float %a, %b
  store volatile float %no.md, float addrspace(1)* %out

  %md.half.ulp = fdiv float %a, %b, !fpmath !1
  store volatile float %md.half.ulp, float addrspace(1)* %out

  %md.1ulp = fdiv float %a, %b, !fpmath !2
  store volatile float %md.1ulp, float addrspace(1)* %out

  %md.25ulp = fdiv float %a, %b, !fpmath !0
  store volatile float %md.25ulp, float addrspace(1)* %out

  %md.3ulp = fdiv float %a, %b, !fpmath !3
  store volatile float %md.3ulp, float addrspace(1)* %out

  %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0
  store volatile float %fast.md.25ulp, float addrspace(1)* %out

  %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0
  store volatile float %arcp.md.25ulp, float addrspace(1)* %out

  ret void
}
; CHECK-LABEL: @rcp_fdiv_fpmath(
; CHECK: %no.md = fdiv float 1.000000e+00, %x{{$}}
; CHECK: %md.half.ulp = fdiv float 1.000000e+00, %x, !fpmath !1
; CHECK: %arcp.no.md = fdiv arcp float 1.000000e+00, %x{{$}}
; CHECK: %arcp.25ulp = fdiv arcp float 1.000000e+00, %x, !fpmath !0
; CHECK: %fast.no.md = fdiv fast float 1.000000e+00, %x{{$}}
; CHECK: %fast.25ulp = fdiv fast float 1.000000e+00, %x, !fpmath !0
define void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 {
  %no.md = fdiv float 1.0, %x
  store volatile float %no.md, float addrspace(1)* %out

  %md.half.ulp = fdiv float 1.0, %x, !fpmath !1
  store volatile float %md.half.ulp, float addrspace(1)* %out

  %arcp.no.md = fdiv arcp float 1.0, %x
  store volatile float %arcp.no.md, float addrspace(1)* %out

  %arcp.25ulp = fdiv arcp float 1.0, %x, !fpmath !0
  store volatile float %arcp.25ulp, float addrspace(1)* %out

  %fast.no.md = fdiv fast float 1.0, %x
  store volatile float %fast.no.md, float addrspace(1)* %out

  %fast.25ulp = fdiv fast float 1.0, %x, !fpmath !0
  store volatile float %fast.25ulp, float addrspace(1)* %out

  ret void
}
; CHECK-LABEL: @fdiv_fpmath_vector(
; CHECK: %no.md = fdiv <2 x float> %a, %b{{$}}
; CHECK: %md.half.ulp = fdiv <2 x float> %a, %b, !fpmath !1
; CHECK: %md.1ulp = fdiv <2 x float> %a, %b, !fpmath !2

; CHECK: %[[A0:[0-9]+]] = extractelement <2 x float> %a, i64 0
; CHECK: %[[B0:[0-9]+]] = extractelement <2 x float> %b, i64 0
; CHECK: %[[FDIV0:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A0]], float %[[B0]]), !fpmath !0
; CHECK: %[[INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[FDIV0]], i64 0
; CHECK: %[[A1:[0-9]+]] = extractelement <2 x float> %a, i64 1
; CHECK: %[[B1:[0-9]+]] = extractelement <2 x float> %b, i64 1
; CHECK: %[[FDIV1:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A1]], float %[[B1]]), !fpmath !0
; CHECK: %md.25ulp = insertelement <2 x float> %[[INS0]], float %[[FDIV1]], i64 1
define void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #1 {
  %no.md = fdiv <2 x float> %a, %b
  store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out

  %md.half.ulp = fdiv <2 x float> %a, %b, !fpmath !1
  store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out

  %md.1ulp = fdiv <2 x float> %a, %b, !fpmath !2
  store volatile <2 x float> %md.1ulp, <2 x float> addrspace(1)* %out

  %md.25ulp = fdiv <2 x float> %a, %b, !fpmath !0
  store volatile <2 x float> %md.25ulp, <2 x float> addrspace(1)* %out

  ret void
}
; CHECK-LABEL: @rcp_fdiv_fpmath_vector(
; CHECK: %no.md = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}
; CHECK: %md.half.ulp = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x, !fpmath !1
; CHECK: %arcp.no.md = fdiv arcp <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}
; CHECK: %fast.no.md = fdiv fast <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}

; CHECK: extractelement <2 x float> %x
; CHECK: fdiv arcp float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
; CHECK: extractelement <2 x float> %x
; CHECK: fdiv arcp float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
; CHECK: store volatile <2 x float> %arcp.25ulp

; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
; CHECK: store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
define void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
  %no.md = fdiv <2 x float> <float 1.0, float 1.0>, %x
  store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out

  %md.half.ulp = fdiv <2 x float> <float 1.0, float 1.0>, %x, !fpmath !1
  store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out

  %arcp.no.md = fdiv arcp <2 x float> <float 1.0, float 1.0>, %x
  store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out

  %fast.no.md = fdiv fast <2 x float> <float 1.0, float 1.0>, %x
  store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out

  %arcp.25ulp = fdiv arcp <2 x float> <float 1.0, float 1.0>, %x, !fpmath !0
  store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out

  %fast.25ulp = fdiv fast <2 x float> <float 1.0, float 1.0>, %x, !fpmath !0
  store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out

  ret void
}
; CHECK-LABEL: @rcp_fdiv_fpmath_vector_nonsplat(
; CHECK: %no.md = fdiv <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x
; CHECK: %arcp.no.md = fdiv arcp <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x
; CHECK: %fast.no.md = fdiv fast <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x{{$}}

; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0
; CHECK: fdiv arcp float 1.000000e+00, %[[X0]], !fpmath !0
; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1
; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float 2.000000e+00, float %[[X1]]), !fpmath !0
; CHECK: store volatile <2 x float> %arcp.25ulp

; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0
; CHECK: fdiv fast float 1.000000e+00, %[[X0]], !fpmath !0
; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1
; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float 2.000000e+00, float %[[X1]]), !fpmath !0
; CHECK: store volatile <2 x float> %fast.25ulp
define void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
  %no.md = fdiv <2 x float> <float 1.0, float 2.0>, %x
  store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out

  %arcp.no.md = fdiv arcp <2 x float> <float 1.0, float 2.0>, %x
  store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out

  %fast.no.md = fdiv fast <2 x float> <float 1.0, float 2.0>, %x
  store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out

  %arcp.25ulp = fdiv arcp <2 x float> <float 1.0, float 2.0>, %x, !fpmath !0
  store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out

  %fast.25ulp = fdiv fast <2 x float> <float 1.0, float 2.0>, %x, !fpmath !0
  store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out

  ret void
}
; FIXME: Should be able to get fdiv for 1.0 component
; CHECK-LABEL: @rcp_fdiv_fpmath_vector_partial_constant(
; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
; CHECK: store volatile <2 x float> %arcp.25ulp

; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
; CHECK: store volatile <2 x float> %fast.25ulp
define void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 {
  %x.insert = insertelement <2 x float> %x, float 1.0, i32 0

  %arcp.25ulp = fdiv arcp <2 x float> %x.insert, %y, !fpmath !0
  store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out

  %fast.25ulp = fdiv fast <2 x float> %x.insert, %y, !fpmath !0
  store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out

  ret void
}
; CHECK-LABEL: @fdiv_fpmath_f32_denormals(
; CHECK: %no.md = fdiv float %a, %b{{$}}
; CHECK: %md.half.ulp = fdiv float %a, %b, !fpmath !1
; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2
; CHECK: %md.25ulp = fdiv float %a, %b, !fpmath !0
; CHECK: %md.3ulp = fdiv float %a, %b, !fpmath !3
; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
define void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
  %no.md = fdiv float %a, %b
  store volatile float %no.md, float addrspace(1)* %out

  %md.half.ulp = fdiv float %a, %b, !fpmath !1
  store volatile float %md.half.ulp, float addrspace(1)* %out

  %md.1ulp = fdiv float %a, %b, !fpmath !2
  store volatile float %md.1ulp, float addrspace(1)* %out

  %md.25ulp = fdiv float %a, %b, !fpmath !0
  store volatile float %md.25ulp, float addrspace(1)* %out

  %md.3ulp = fdiv float %a, %b, !fpmath !3
  store volatile float %md.3ulp, float addrspace(1)* %out

  %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0
  store volatile float %fast.md.25ulp, float addrspace(1)* %out

  %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0
  store volatile float %arcp.md.25ulp, float addrspace(1)* %out

  ret void
}

attributes #0 = { nounwind optnone noinline }
attributes #1 = { nounwind }
attributes #2 = { nounwind "target-features"="+fp32-denormals" }

; CHECK: !0 = !{float 2.500000e+00}
; CHECK: !1 = !{float 5.000000e-01}
; CHECK: !2 = !{float 1.000000e+00}
; CHECK: !3 = !{float 3.000000e+00}

!0 = !{float 2.500000e+00}
!1 = !{float 5.000000e-01}
!2 = !{float 1.000000e+00}
!3 = !{float 3.000000e+00}
@@ -1,8 +1,4 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=I754 -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -amdgpu-fast-fdiv < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=I754 -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=UNSAFE-FP -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s

; These tests check that fdiv is expanded correctly and also test that the
@@ -15,22 +11,59 @@
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS

; UNSAFE-FP: v_rcp_f32
; UNSAFE-FP: v_mul_f32_e32
; SI: v_div_scale_f32
; SI-DAG: v_div_scale_f32

; SI-DAG: v_rcp_f32
; SI-DAG: v_mul_f32

; I754-DAG: v_div_scale_f32
; I754-DAG: v_rcp_f32
; I754-DAG: v_fma_f32
; I754-DAG: v_mul_f32
; I754-DAG: v_fma_f32
; I754-DAG: v_div_fixup_f32
define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) {
; SI: v_fma_f32
; SI: v_fma_f32
; SI: v_mul_f32
; SI: v_fma_f32
; SI: v_fma_f32
; SI: v_fma_f32
; SI: v_div_fmas_f32
; SI: v_div_fixup_f32
define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 {
entry:
  %0 = fdiv float %a, %b
  store float %0, float addrspace(1)* %out
  %fdiv = fdiv float %a, %b
  store float %fdiv, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}fdiv_25ulp_f32:
; SI: v_cndmask_b32
; SI: v_mul_f32
; SI: v_rcp_f32
; SI: v_mul_f32
; SI: v_mul_f32
define void @fdiv_25ulp_f32(float addrspace(1)* %out, float %a, float %b) #0 {
entry:
  %fdiv = fdiv float %a, %b, !fpmath !0
  store float %fdiv, float addrspace(1)* %out
  ret void
}

; Use correct fdiv
; FUNC-LABEL: {{^}}fdiv_25ulp_denormals_f32:
; SI: v_fma_f32
; SI: v_div_fmas_f32
; SI: v_div_fixup_f32
define void @fdiv_25ulp_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
entry:
  %fdiv = fdiv float %a, %b, !fpmath !0
  store float %fdiv, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}fdiv_fast_denormals_f32:
; SI: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
; SI-NOT: [[RESULT]]
; SI: buffer_store_dword [[RESULT]]
define void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
entry:
  %fdiv = fdiv fast float %a, %b
  store float %fdiv, float addrspace(1)* %out
  ret void
}
@@ -38,15 +71,14 @@ entry:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS

; UNSAFE-FP: v_rcp_f32
; UNSAFE-FP: v_mul_f32_e32

; SI-DAG: v_rcp_f32
; SI-DAG: v_mul_f32
define void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) {
; SI: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
; SI-NOT: [[RESULT]]
; SI: buffer_store_dword [[RESULT]]
define void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 {
entry:
  %0 = fdiv fast float %a, %b
  store float %0, float addrspace(1)* %out
  %fdiv = fdiv fast float %a, %b
  store float %fdiv, float addrspace(1)* %out
  ret void
}

@@ -54,15 +86,14 @@ entry:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS

; UNSAFE-FP: v_rcp_f32
; UNSAFE-FP: v_mul_f32_e32

; SI-DAG: v_rcp_f32
; SI-DAG: v_mul_f32
define void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) {
; SI: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
; SI-NOT: [[RESULT]]
; SI: buffer_store_dword [[RESULT]]
define void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) #0 {
entry:
  %0 = fdiv arcp float %a, %b
  store float %0, float addrspace(1)* %out
  %fdiv = fdiv arcp float %a, %b
  store float %fdiv, float addrspace(1)* %out
  ret void
}
@@ -72,26 +103,24 @@ entry:
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS

; UNSAFE-FP: v_rcp_f32
; UNSAFE-FP: v_rcp_f32
; UNSAFE-FP: v_mul_f32_e32
; UNSAFE-FP: v_mul_f32_e32

; SI-DAG: v_rcp_f32
; SI-DAG: v_mul_f32
; SI-DAG: v_rcp_f32
; SI-DAG: v_mul_f32

; I754: v_div_scale_f32
; I754: v_div_scale_f32
; I754: v_div_scale_f32
; I754: v_div_scale_f32
; I754: v_div_fixup_f32
; I754: v_div_fixup_f32
define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
; SI: v_div_scale_f32
; SI: v_div_scale_f32
; SI: v_div_scale_f32
; SI: v_div_scale_f32
define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
entry:
  %0 = fdiv <2 x float> %a, %b
  store <2 x float> %0, <2 x float> addrspace(1)* %out
  %fdiv = fdiv <2 x float> %a, %b
  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}fdiv_ulp25_v2f32:
; SI: v_cmp_gt_f32
; SI: v_cmp_gt_f32
define void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
entry:
  %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0
  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
  ret void
}

@@ -101,19 +130,12 @@ entry:
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS

; UNSAFE-FP: v_rcp_f32
; UNSAFE-FP: v_rcp_f32
; UNSAFE-FP: v_mul_f32_e32
; UNSAFE-FP: v_mul_f32_e32

; SI-DAG: v_rcp_f32
; SI-DAG: v_mul_f32
; SI-DAG: v_rcp_f32
; SI-DAG: v_mul_f32
define void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
; SI: v_rcp_f32
; SI: v_rcp_f32
define void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
entry:
  %0 = fdiv fast <2 x float> %a, %b
  store <2 x float> %0, <2 x float> addrspace(1)* %out
  %fdiv = fdiv fast <2 x float> %a, %b
  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
  ret void
}

@@ -123,19 +145,12 @@ entry:
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS

; UNSAFE-FP: v_rcp_f32
; UNSAFE-FP: v_rcp_f32
; UNSAFE-FP: v_mul_f32_e32
; UNSAFE-FP: v_mul_f32_e32

; SI-DAG: v_rcp_f32
; SI-DAG: v_mul_f32
; SI-DAG: v_rcp_f32
; SI-DAG: v_mul_f32
define void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
; SI: v_rcp_f32
; SI: v_rcp_f32
define void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
entry:
  %0 = fdiv arcp <2 x float> %a, %b
  store <2 x float> %0, <2 x float> addrspace(1)* %out
  %fdiv = fdiv arcp <2 x float> %a, %b
  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
  ret void
}
@@ -149,37 +164,11 @@ entry:
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS

; UNSAFE-FP: v_rcp_f32_e32
; UNSAFE-FP: v_rcp_f32_e32
; UNSAFE-FP: v_rcp_f32_e32
; UNSAFE-FP: v_rcp_f32_e32
; UNSAFE-FP: v_mul_f32_e32
; UNSAFE-FP: v_mul_f32_e32
; UNSAFE-FP: v_mul_f32_e32
; UNSAFE-FP: v_mul_f32_e32

; SI-DAG: v_rcp_f32
; SI-DAG: v_mul_f32
; SI-DAG: v_rcp_f32
; SI-DAG: v_mul_f32
; SI-DAG: v_rcp_f32
; SI-DAG: v_mul_f32
; SI-DAG: v_rcp_f32
; SI-DAG: v_mul_f32

; I754: v_div_scale_f32
; I754: v_div_scale_f32
; I754: v_div_scale_f32
; I754: v_div_scale_f32
; I754: v_div_scale_f32
; I754: v_div_scale_f32
; I754: v_div_scale_f32
; I754: v_div_scale_f32
; I754: v_div_fixup_f32
; I754: v_div_fixup_f32
; I754: v_div_fixup_f32
; I754: v_div_fixup_f32
define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
; SI: v_div_fixup_f32
; SI: v_div_fixup_f32
; SI: v_div_fixup_f32
; SI: v_div_fixup_f32
define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
  %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
  %a = load <4 x float>, <4 x float> addrspace(1) * %in
  %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
@@ -198,24 +187,11 @@ define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS

; UNSAFE-FP: v_rcp_f32_e32
; UNSAFE-FP: v_rcp_f32_e32
; UNSAFE-FP: v_rcp_f32_e32
; UNSAFE-FP: v_rcp_f32_e32
; UNSAFE-FP: v_mul_f32_e32
; UNSAFE-FP: v_mul_f32_e32
; UNSAFE-FP: v_mul_f32_e32
; UNSAFE-FP: v_mul_f32_e32

; SI-DAG: v_rcp_f32
; SI-DAG: v_mul_f32
; SI-DAG: v_rcp_f32
; SI-DAG: v_mul_f32
; SI-DAG: v_rcp_f32
; SI-DAG: v_mul_f32
; SI-DAG: v_rcp_f32
; SI-DAG: v_mul_f32
define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
; SI: v_rcp_f32
; SI: v_rcp_f32
; SI: v_rcp_f32
; SI: v_rcp_f32
define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
  %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
  %a = load <4 x float>, <4 x float> addrspace(1) * %in
  %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
@@ -234,24 +210,11 @@ define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> ad
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS

; UNSAFE-FP: v_rcp_f32_e32
; UNSAFE-FP: v_rcp_f32_e32
; UNSAFE-FP: v_rcp_f32_e32
; UNSAFE-FP: v_rcp_f32_e32
; UNSAFE-FP: v_mul_f32_e32
; UNSAFE-FP: v_mul_f32_e32
; UNSAFE-FP: v_mul_f32_e32
; UNSAFE-FP: v_mul_f32_e32

; SI-DAG: v_rcp_f32
; SI-DAG: v_mul_f32
; SI-DAG: v_rcp_f32
; SI-DAG: v_mul_f32
; SI-DAG: v_rcp_f32
; SI-DAG: v_mul_f32
; SI-DAG: v_rcp_f32
; SI-DAG: v_mul_f32
define void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
; SI: v_rcp_f32
; SI: v_rcp_f32
; SI: v_rcp_f32
; SI: v_rcp_f32
define void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
  %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
  %a = load <4 x float>, <4 x float> addrspace(1) * %in
  %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
@@ -259,3 +222,9 @@ define void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> ad
  store <4 x float> %result, <4 x float> addrspace(1)* %out
  ret void
}

attributes #0 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="-fp32-denormals" }
attributes #1 = { nounwind "enable-unsafe-fp-math"="true" "target-features"="-fp32-denormals" }
attributes #2 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="+fp32-denormals" }

!0 = !{float 2.500000e+00}
test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll (new file, 18 lines)
@@ -0,0 +1,18 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s

declare float @llvm.amdgcn.fdiv.fast(float, float) #0

; CHECK-LABEL: {{^}}test_fdiv_fast:
; CHECK: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
; CHECK: v_mul_f32_e32
; CHECK: v_rcp_f32_e32
; CHECK: v_mul_f32_e32
; CHECK: v_mul_f32_e32
define void @test_fdiv_fast(float addrspace(1)* %out, float %a, float %b) #1 {
  %fdiv = call float @llvm.amdgcn.fdiv.fast(float %a, float %b)
  store float %fdiv, float addrspace(1)* %out
  ret void
}

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }