AMDGPU: Redefine clamp node as clamp 0.0-1.0

Change implementation to use max instead of add.
min/max/med3 do not flush denormals regardless of the mode,
so it is OK to use it whether or not they are enabled.

Also allow using clamp with f16, and use knowledge
of dx10_clamp.

llvm-svn: 295788
This commit is contained in:
Matt Arsenault 2017-02-21 23:35:48 +00:00
parent e6e206d4b4
commit 2fdf2a1a18
15 changed files with 713 additions and 33 deletions

View File

@ -257,6 +257,12 @@ def FeatureFP16Denormals : SubtargetFeature<"fp16-denormals",
[FeatureFP64FP16Denormals]
>;
def FeatureDX10Clamp : SubtargetFeature<"dx10-clamp",
"DX10Clamp",
"true",
"clamp modifier clamps NaNs to 0.0"
>;
def FeatureFPExceptions : SubtargetFeature<"fp-exceptions",
"FPExceptions",
"true",

View File

@ -586,7 +586,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.IEEEMode = STM.enableIEEEBit(MF);
// Make clamp modifier on NaN input returns 0.
ProgInfo.DX10Clamp = 1;
ProgInfo.DX10Clamp = STM.enableDX10Clamp();
const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
ProgInfo.ScratchSize = FrameInfo.getStackSize();

View File

@ -1012,22 +1012,29 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
EVT VT = Op.getValueType();
switch (IntrinsicID) {
default: return Op;
case AMDGPUIntrinsic::AMDGPU_clamp: // Legacy name.
return DAG.getNode(AMDGPUISD::CLAMP, DL, VT,
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
default: return Op;
case AMDGPUIntrinsic::AMDGPU_clamp: {
// Deprecated in favor of emitting min/max combo or fmed3.
ConstantFPSDNode *CSrc1 = dyn_cast<ConstantFPSDNode>(Op.getOperand(2));
ConstantFPSDNode *CSrc2 = dyn_cast<ConstantFPSDNode>(Op.getOperand(3));
if (CSrc1 && CSrc2 && CSrc1->isZero() && CSrc2->isExactlyValue(1.0))
return DAG.getNode(AMDGPUISD::CLAMP, DL, VT, Op.getOperand(1));
case AMDGPUIntrinsic::AMDGPU_bfe_i32:
return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3));
SDValue Max = DAG.getNode(ISD::FMAXNUM, DL, VT, Op.getOperand(1),
Op.getOperand(2));
return DAG.getNode(ISD::FMINNUM, DL, VT, Max, Op.getOperand(3));
}
case AMDGPUIntrinsic::AMDGPU_bfe_i32:
return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3));
case AMDGPUIntrinsic::AMDGPU_bfe_u32:
return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3));
case AMDGPUIntrinsic::AMDGPU_bfe_u32:
return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3));
}
}
@ -2445,6 +2452,28 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
SN->getBasePtr(), SN->getMemOperand());
}
SDValue AMDGPUTargetLowering::performClampCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
if (!CSrc)
return SDValue();
const APFloat &F = CSrc->getValueAPF();
APFloat Zero = APFloat::getZero(F.getSemantics());
APFloat::cmpResult Cmp0 = F.compare(Zero);
if (Cmp0 == APFloat::cmpLessThan ||
(Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) {
return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
}
APFloat One(F.getSemantics(), "1.0");
APFloat::cmpResult Cmp1 = F.compare(One);
if (Cmp1 == APFloat::cmpGreaterThan)
return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
return SDValue(CSrc, 0);
}
/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
/// binary operation \p Opc to it with the corresponding constant operands.
SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
@ -3323,6 +3352,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
return performLoadCombine(N, DCI);
case ISD::STORE:
return performStoreCombine(N, DCI);
case AMDGPUISD::CLAMP:
return performClampCombine(N, DCI);
}
return SDValue();
}

View File

@ -70,6 +70,7 @@ protected:
bool shouldCombineMemoryType(EVT VT) const;
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL,
unsigned Opc, SDValue LHS,
@ -238,7 +239,11 @@ enum NodeType : unsigned {
RETURN,
DWORDADDR,
FRACT,
/// CLAMP value between 0.0 and 1.0. NaN clamped to 0, following clamp output
/// modifier behavior with dx10_enable.
CLAMP,
// This is SETCC with the full mask result which is used for a compare with a
// result bit per item in the wavefront.
SETCC,

View File

@ -92,7 +92,7 @@ def AMDGPUfmul_legacy : SDNode<"AMDGPUISD::FMUL_LEGACY", SDTFPBinOp,
[SDNPCommutative, SDNPAssociative]
>;
def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPTernaryOp, []>;
def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>;
// out = min(a, b) a and b are floats, where a nan comparison fails.
def AMDGPUfmin_legacy : SDNode<"AMDGPUISD::FMIN_LEGACY", SDTFPBinOp,

View File

@ -452,7 +452,7 @@ class CLAMP <RegisterClass rc> : AMDGPUShaderInst <
(outs rc:$dst),
(ins rc:$src0),
"CLAMP $dst, $src0",
[(set f32:$dst, (AMDGPUclamp f32:$src0, (f32 FP_ZERO), (f32 FP_ONE)))]
[(set f32:$dst, (AMDGPUclamp f32:$src0))]
>;
class FABS <RegisterClass rc> : AMDGPUShaderInst <

View File

@ -42,7 +42,7 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
// for SI has the unhelpful behavior that it unsets everything else if you
// disable it.
SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+load-store-opt,");
SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,");
if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
@ -89,6 +89,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
FP32Denormals(false),
FP64FP16Denormals(false),
FPExceptions(false),
DX10Clamp(false),
FlatForGlobal(false),
UnalignedScratchAccess(false),
UnalignedBufferAccess(false),

View File

@ -103,6 +103,7 @@ protected:
bool FP32Denormals;
bool FP64FP16Denormals;
bool FPExceptions;
bool DX10Clamp;
bool FlatForGlobal;
bool UnalignedScratchAccess;
bool UnalignedBufferAccess;
@ -294,10 +295,6 @@ public:
return DumpCode;
}
bool enableIEEEBit(const MachineFunction &MF) const {
return AMDGPU::isCompute(MF.getFunction()->getCallingConv());
}
/// Return the amount of LDS that can be used that will not restrict the
/// occupancy lower than WaveCount.
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
@ -323,6 +320,14 @@ public:
return FPExceptions;
}
bool enableDX10Clamp() const {
return DX10Clamp;
}
bool enableIEEEBit(const MachineFunction &MF) const {
return AMDGPU::isCompute(MF.getFunction()->getCallingConv());
}
bool useFlatForGlobal() const {
return FlatForGlobal;
}

View File

@ -3995,8 +3995,10 @@ static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
return DAG.isKnownNeverNaN(Op);
}
static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
SDValue Op0, SDValue Op1) {
SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
const SDLoc &SL,
SDValue Op0,
SDValue Op1) const {
ConstantFPSDNode *K1 = dyn_cast<ConstantFPSDNode>(Op1);
if (!K1)
return SDValue();
@ -4010,6 +4012,20 @@ static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
if (Cmp == APFloat::cmpGreaterThan)
return SDValue();
// TODO: Check IEEE bit enabled?
EVT VT = K0->getValueType(0);
if (Subtarget->enableDX10Clamp()) {
// If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
// hardware fmed3 behavior converting to a min.
// FIXME: Should this be allowing -0.0?
if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
}
// No med3 for f16, but clamp is possible.
if (VT == MVT::f16)
return SDValue();
// This isn't safe with signaling NaNs because in IEEE mode, min/max on a
// signaling NaN gives a quiet NaN. The quiet NaN input to the min would then
// give the other result, which is different from med3 with a NaN input.
@ -4074,7 +4090,9 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
(Opc == AMDGPUISD::FMIN_LEGACY &&
Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
N->getValueType(0) == MVT::f32 && Op0.hasOneUse()) {
(N->getValueType(0) == MVT::f32 ||
(N->getValueType(0) == MVT::f16 && Subtarget->has16BitInsts())) &&
Op0.hasOneUse()) {
if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
return Res;
}
@ -4082,6 +4100,60 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
return SDValue();
}
static bool isClampZeroToOne(SDValue A, SDValue B) {
if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
// FIXME: Should this be allowing -0.0?
return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
(CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
}
}
return false;
}
// FIXME: Should only worry about snans for version with chain.
SDValue SITargetLowering::performFMed3Combine(SDNode *N,
DAGCombinerInfo &DCI) const {
EVT VT = N->getValueType(0);
// v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
// NaNs. With a NaN input, the order of the operands may change the result.
SelectionDAG &DAG = DCI.DAG;
SDLoc SL(N);
SDValue Src0 = N->getOperand(0);
SDValue Src1 = N->getOperand(1);
SDValue Src2 = N->getOperand(2);
if (isClampZeroToOne(Src0, Src1)) {
// const_a, const_b, x -> clamp is safe in all cases including signaling
// nans.
// FIXME: Should this be allowing -0.0?
return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
}
// FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
// handling no dx10-clamp?
if (Subtarget->enableDX10Clamp()) {
// If NaNs is clamped to 0, we are free to reorder the inputs.
if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
std::swap(Src0, Src1);
if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
std::swap(Src1, Src2);
if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
std::swap(Src0, Src1);
if (isClampZeroToOne(Src1, Src2))
return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
}
return SDValue();
}
unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
const SDNode *N0,
const SDNode *N1) const {
@ -4348,6 +4420,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
case AMDGPUISD::CVT_F32_UBYTE2:
case AMDGPUISD::CVT_F32_UBYTE3:
return performCvtF32UByteNCombine(N, DCI);
case AMDGPUISD::FMED3:
return performFMed3Combine(N, DCI);
}
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}

View File

@ -84,7 +84,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFCanonicalizeCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
SDValue Op0, SDValue Op1) const;
SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
unsigned getFusedOpcode(const SelectionDAG &DAG,
const SDNode *N0, const SDNode *N1) const;

View File

@ -625,6 +625,7 @@ def SRCMODS {
def DSTCLAMP {
int NONE = 0;
int ENABLE = 1;
}
def DSTOMOD {

View File

@ -647,12 +647,20 @@ def : BitConvert <v16f32, v16i32, VReg_512>;
/********** Src & Dst modifiers **********/
/********** =================== **********/
def : Pat <
(AMDGPUclamp (VOP3Mods0Clamp f32:$src0, i32:$src0_modifiers, i32:$omod),
(f32 FP_ZERO), (f32 FP_ONE)),
(V_ADD_F32_e64 $src0_modifiers, $src0, 0, (i32 0), 1, $omod)
// If denormals are not enabled, it only impacts the compare of the
// inputs. The output result is not flushed.
class ClampPat<Instruction inst, ValueType vt> : Pat <
(vt (AMDGPUclamp
(VOP3Mods0Clamp vt:$src0, i32:$src0_modifiers, i32:$omod))),
(inst i32:$src0_modifiers, vt:$src0,
i32:$src0_modifiers, vt:$src0, DSTCLAMP.ENABLE, $omod)
>;
// TODO: Does f64 support clamp?
def : ClampPat<V_MAX_F32_e64, f32>;
def : ClampPat<V_MAX_F16_e64, f16>;
/********** ================================ **********/
/********** Floating point absolute/negative **********/
/********** ================================ **********/

View File

@ -0,0 +1,535 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}v_clamp_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep0
%max = call float @llvm.maxnum.f32(float %a, float 0.0)
%med = call float @llvm.minnum.f32(float %max, float 1.0)
store float %med, float addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_clamp_neg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_neg_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep0
%fneg.a = fsub float -0.0, %a
%max = call float @llvm.maxnum.f32(float %fneg.a, float 0.0)
%med = call float @llvm.minnum.f32(float %max, float 1.0)
store float %med, float addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_clamp_negabs_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, -|[[A]]|, -|[[A]]| clamp{{$}}
define amdgpu_kernel void @v_clamp_negabs_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep0
%fabs.a = call float @llvm.fabs.f32(float %a)
%fneg.fabs.a = fsub float -0.0, %fabs.a
%max = call float @llvm.maxnum.f32(float %fneg.fabs.a, float 0.0)
%med = call float @llvm.minnum.f32(float %max, float 1.0)
store float %med, float addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_clamp_negzero_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_bfrev_b32_e32 [[SIGNBIT:v[0-9]+]], 1
; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[SIGNBIT]], 1.0
define amdgpu_kernel void @v_clamp_negzero_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep0
%max = call float @llvm.maxnum.f32(float %a, float -0.0)
%med = call float @llvm.minnum.f32(float %max, float 1.0)
store float %med, float addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_clamp_multi_use_max_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
define amdgpu_kernel void @v_clamp_multi_use_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep0
%max = call float @llvm.maxnum.f32(float %a, float 0.0)
%med = call float @llvm.minnum.f32(float %max, float 1.0)
store float %med, float addrspace(1)* %out.gep
store volatile float %max, float addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}v_clamp_f16:
; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
; VI: v_max_f16_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
; SI: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]]
; SI: v_max_f32_e64 v{{[0-9]+}}, [[CVT]], [[CVT]] clamp{{$}}
; SI: v_cvt_f16_f32_e32
define amdgpu_kernel void @v_clamp_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
%out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
%a = load half, half addrspace(1)* %gep0
%max = call half @llvm.maxnum.f16(half %a, half 0.0)
%med = call half @llvm.minnum.f16(half %max, half 1.0)
store half %med, half addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_clamp_neg_f16:
; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
; VI: v_max_f16_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}}
; FIXME: Better to fold neg into max
; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]]
; SI: v_max_f32_e64 v{{[0-9]+}}, [[CVT]], [[CVT]] clamp{{$}}
; SI: v_cvt_f16_f32
define amdgpu_kernel void @v_clamp_neg_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
%out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
%a = load half, half addrspace(1)* %gep0
%fneg.a = fsub half -0.0, %a
%max = call half @llvm.maxnum.f16(half %fneg.a, half 0.0)
%med = call half @llvm.minnum.f16(half %max, half 1.0)
store half %med, half addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_clamp_negabs_f16:
; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
; VI: v_max_f16_e64 v{{[0-9]+}}, -|[[A]]|, -|[[A]]| clamp{{$}}
; FIXME: Better to fold neg/abs into max
; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -|[[A]]|
; SI: v_max_f32_e64 v{{[0-9]+}}, [[CVT]], [[CVT]] clamp{{$}}
; SI: v_cvt_f16_f32_e32
define amdgpu_kernel void @v_clamp_negabs_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
%out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
%a = load half, half addrspace(1)* %gep0
%fabs.a = call half @llvm.fabs.f16(half %a)
%fneg.fabs.a = fsub half -0.0, %fabs.a
%max = call half @llvm.maxnum.f16(half %fneg.fabs.a, half 0.0)
%med = call half @llvm.minnum.f16(half %max, half 1.0)
store half %med, half addrspace(1)* %out.gep
ret void
}
; FIXME: Do f64 instructions support clamp?
; GCN-LABEL: {{^}}v_clamp_f64:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN: v_max_f64
; GCN: v_min_f64
define amdgpu_kernel void @v_clamp_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
%out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
%a = load double, double addrspace(1)* %gep0
%max = call double @llvm.maxnum.f64(double %a, double 0.0)
%med = call double @llvm.minnum.f64(double %max, double 1.0)
store double %med, double addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_clamp_neg_f64:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN: v_max_f64
; GCN: v_min_f64
define amdgpu_kernel void @v_clamp_neg_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
%out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
%a = load double, double addrspace(1)* %gep0
%fneg.a = fsub double -0.0, %a
%max = call double @llvm.maxnum.f64(double %fneg.a, double 0.0)
%med = call double @llvm.minnum.f64(double %max, double 1.0)
store double %med, double addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_clamp_negabs_f64:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN: v_max_f64
; GCN: v_min_f64
define amdgpu_kernel void @v_clamp_negabs_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
%out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
%a = load double, double addrspace(1)* %gep0
%fabs.a = call double @llvm.fabs.f64(double %a)
%fneg.fabs.a = fsub double -0.0, %fabs.a
%max = call double @llvm.maxnum.f64(double %fneg.fabs.a, double 0.0)
%med = call double @llvm.minnum.f64(double %max, double 1.0)
store double %med, double addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_clamp_med3_aby_negzero_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_med3_f32
define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep0
%med = call float @llvm.amdgcn.fmed3.f32(float -0.0, float 1.0, float %a)
store float %med, float addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_clamp_med3_aby_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_aby_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep0
%med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a)
store float %med, float addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_clamp_med3_bay_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_bay_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep0
%med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a)
store float %med, float addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_clamp_med3_yab_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_yab_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep0
%med = call float @llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 1.0)
store float %med, float addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_clamp_med3_yba_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_yba_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep0
%med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0)
store float %med, float addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_clamp_med3_ayb_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_ayb_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep0
%med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0)
store float %med, float addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_clamp_med3_bya_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_bya_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep0
%med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0)
store float %med, float addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_clamp_constants_to_one_f32:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 1.0
define amdgpu_kernel void @v_clamp_constants_to_one_f32(float addrspace(1)* %out) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 4.0)
store float %med, float addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_clamp_constants_to_zero_f32:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
define amdgpu_kernel void @v_clamp_constants_to_zero_f32(float addrspace(1)* %out) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float -4.0)
store float %med, float addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_clamp_constant_preserve_f32:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0.5
define amdgpu_kernel void @v_clamp_constant_preserve_f32(float addrspace(1)* %out) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0.5)
store float %med, float addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_clamp_constant_preserve_denorm_f32:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7fffff{{$}}
define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(float addrspace(1)* %out) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 8388607 to float))
store float %med, float addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_clamp_constant_qnan_f32:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
define amdgpu_kernel void @v_clamp_constant_qnan_f32(float addrspace(1)* %out) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000)
store float %med, float addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_clamp_constant_snan_f32:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
define amdgpu_kernel void @v_clamp_constant_snan_f32(float addrspace(1)* %out) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float))
store float %med, float addrspace(1)* %out.gep
ret void
}
; ---------------------------------------------------------------------
; Test non-default behaviors enabling snans and disabling dx10_clamp
; ---------------------------------------------------------------------
; GCN-LABEL: {{^}}v_clamp_f32_no_dx10_clamp:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0
define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep0
%max = call float @llvm.maxnum.f32(float %a, float 0.0)
%med = call float @llvm.minnum.f32(float %max, float 1.0)
store float %med, float addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_clamp_f32_snan_dx10clamp:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #3 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep0
%max = call float @llvm.maxnum.f32(float %a, float 0.0)
%med = call float @llvm.minnum.f32(float %max, float 1.0)
store float %med, float addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep0
%max = call float @llvm.maxnum.f32(float %a, float 0.0)
%med = call float @llvm.minnum.f32(float %max, float 1.0)
store float %med, float addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp_nnan_src:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0
define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep0
%add = fadd nnan float %a, 1.0
%max = call float @llvm.maxnum.f32(float %add, float 0.0)
%med = call float @llvm.minnum.f32(float %max, float 1.0)
store float %med, float addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_clamp_med3_aby_f32_no_dx10_clamp:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep0
%med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a)
store float %med, float addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_clamp_med3_bay_f32_no_dx10_clamp:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep0
%med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a)
store float %med, float addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_clamp_med3_yab_f32_no_dx10_clamp:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0
define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep0
%med = call float @llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 1.0)
store float %med, float addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_clamp_med3_yba_f32_no_dx10_clamp:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 1.0, 0
define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep0
%med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0)
store float %med, float addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_clamp_med3_ayb_f32_no_dx10_clamp:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, 0, [[A]], 1.0
define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep0
%med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0)
store float %med, float addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_clamp_med3_bya_f32_no_dx10_clamp:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, 1.0, [[A]], 0
define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep0
%med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0)
store float %med, float addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_clamp_constant_qnan_f32_no_dx10_clamp:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7fc00000
define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(float addrspace(1)* %out) #2 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000)
store float %med, float addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_clamp_constant_snan_f32_no_dx10_clamp:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7f800001
define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(float addrspace(1)* %out) #2 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float))
store float %med, float addrspace(1)* %out.gep
ret void
}
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fabs.f32(float) #1
declare float @llvm.minnum.f32(float, float) #1
declare float @llvm.maxnum.f32(float, float) #1
declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1
declare double @llvm.fabs.f64(double) #1
declare double @llvm.minnum.f64(double, double) #1
declare double @llvm.maxnum.f64(double, double) #1
declare half @llvm.fabs.f16(half) #1
declare half @llvm.minnum.f16(half, half) #1
declare half @llvm.maxnum.f16(half, half) #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind "target-features"="-dx10-clamp,-fp-exceptions" "no-nans-fp-math"="false" }
attributes #3 = { nounwind "target-features"="+dx10-clamp,+fp-exceptions" "no-nans-fp-math"="false" }
attributes #4 = { nounwind "target-features"="-dx10-clamp,+fp-exceptions" "no-nans-fp-math"="false" }

View File

@ -60,9 +60,20 @@ define void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* %
ret void
}
; GCN-LABEL: {{^}}test_no_dx10_clamp_vi:
; GCN: float_mode = 192
; GCN: enable_dx10_clamp = 0
; GCN: enable_ieee_mode = 1
define void @test_no_dx10_clamp_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #6 {
store float 0.0, float addrspace(1)* %out0
store double 0.0, double addrspace(1)* %out1
ret void
}
attributes #0 = { nounwind "target-cpu"="kaveri" }
attributes #1 = { nounwind "target-cpu"="fiji" }
attributes #2 = { nounwind "target-features"="-fp32-denormals,+fp64-fp16-denormals" }
attributes #3 = { nounwind "target-features"="+fp32-denormals,-fp64-fp16-denormals" }
attributes #4 = { nounwind "target-features"="+fp32-denormals,+fp64-fp16-denormals" }
attributes #5 = { nounwind "target-features"="-fp32-denormals,-fp64-fp16-denormals" }
attributes #6 = { nounwind "target-cpu"="fiji" "target-features"="-dx10-clamp" }

View File

@ -7,7 +7,7 @@ declare float @llvm.AMDGPU.clamp.f32(float, float, float) nounwind readnone
; FUNC-LABEL: {{^}}clamp_0_1_f32:
; SI: s_load_dword [[ARG:s[0-9]+]],
; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], [[ARG]], 0 clamp{{$}}
; SI: v_max_f32_e64 [[RESULT:v[0-9]+]], [[ARG]], [[ARG]] clamp{{$}}
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
@ -20,7 +20,7 @@ define void @clamp_0_1_f32(float addrspace(1)* %out, float %src) nounwind {
; FUNC-LABEL: {{^}}clamp_fabs_0_1_f32:
; SI: s_load_dword [[ARG:s[0-9]+]],
; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], |[[ARG]]|, 0 clamp{{$}}
; SI: v_max_f32_e64 [[RESULT:v[0-9]+]], |[[ARG]]|, |[[ARG]]| clamp{{$}}
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @clamp_fabs_0_1_f32(float addrspace(1)* %out, float %src) nounwind {
@ -32,7 +32,7 @@ define void @clamp_fabs_0_1_f32(float addrspace(1)* %out, float %src) nounwind {
; FUNC-LABEL: {{^}}clamp_fneg_0_1_f32:
; SI: s_load_dword [[ARG:s[0-9]+]],
; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], -[[ARG]], 0 clamp{{$}}
; SI: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[ARG]], -[[ARG]] clamp{{$}}
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @clamp_fneg_0_1_f32(float addrspace(1)* %out, float %src) nounwind {
@ -44,7 +44,7 @@ define void @clamp_fneg_0_1_f32(float addrspace(1)* %out, float %src) nounwind {
; FUNC-LABEL: {{^}}clamp_fneg_fabs_0_1_f32:
; SI: s_load_dword [[ARG:s[0-9]+]],
; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], -|[[ARG]]|, 0 clamp{{$}}
; SI: v_max_f32_e64 [[RESULT:v[0-9]+]], -|[[ARG]]|, -|[[ARG]]| clamp{{$}}
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @clamp_fneg_fabs_0_1_f32(float addrspace(1)* %out, float %src) nounwind {