From 28a667546a86f983d709d4821c70b7e53b4d8794 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 28 Jan 2016 20:53:42 +0000 Subject: [PATCH] AMDGPU: Match some med3 patterns llvm-svn: 259089 --- lib/Target/AMDGPU/AMDGPU.td | 6 ++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 5 +- lib/Target/AMDGPU/AMDGPUISelLowering.h | 3 + lib/Target/AMDGPU/AMDGPUInstrInfo.td | 10 ++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 6 +- lib/Target/AMDGPU/AMDGPUSubtarget.h | 5 + lib/Target/AMDGPU/SIISelLowering.cpp | 93 +++++++++++++++- lib/Target/AMDGPU/SIISelLowering.h | 3 +- lib/Target/AMDGPU/SIInstructions.td | 6 +- test/CodeGen/AMDGPU/fmed3.ll | 131 +++++++++++++++++++++++ test/CodeGen/AMDGPU/smed3.ll | 120 +++++++++++++++++++++ test/CodeGen/AMDGPU/umed3.ll | 119 ++++++++++++++++++++ 12 files changed, 494 insertions(+), 13 deletions(-) create mode 100644 test/CodeGen/AMDGPU/fmed3.ll create mode 100644 test/CodeGen/AMDGPU/smed3.ll create mode 100644 test/CodeGen/AMDGPU/umed3.ll diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index c8c550cd0e5..b842ba17675 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -169,6 +169,12 @@ def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals", [FeatureFP64] >; +def FeatureFPExceptions : SubtargetFeature<"fp-exceptions", + "FPExceptions", + "true", + "Enable floating point exceptions" +>; + def FeatureEnableHugeScratchBuffer : SubtargetFeature< "huge-scratch-buffer", "EnableHugeScratchBuffer", diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 48f61fb250b..0d5a8086fe4 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -397,7 +397,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, // SI at least has hardware support for floating point exceptions, but no way // of using or handling them is implemented. They are also optional in OpenCL // (Section 7.3) - setHasFloatingPointExceptions(false); + setHasFloatingPointExceptions(Subtarget->hasFPExceptions()); setSelectIsExpensive(false); PredictableSelectIsExpensive = false; @@ -2949,6 +2949,9 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(FMIN3) NODE_NAME_CASE(SMIN3) NODE_NAME_CASE(UMIN3) + NODE_NAME_CASE(FMED3) + NODE_NAME_CASE(SMED3) + NODE_NAME_CASE(UMED3) NODE_NAME_CASE(URECIP) NODE_NAME_CASE(DIV_SCALE) NODE_NAME_CASE(DIV_FMAS) diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index abd2b5e2c5e..34e13f56536 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -257,6 +257,9 @@ enum NodeType : unsigned { FMIN3, SMIN3, UMIN3, + FMED3, + SMED3, + UMED3, URECIP, DIV_SCALE, DIV_FMAS, diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td index 575dfe41365..5e6d3102027 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -209,6 +209,16 @@ def AMDGPUmad_i24 : SDNode<"AMDGPUISD::MAD_I24", AMDGPUDTIntTernaryOp, [] >; +def AMDGPUsmed3 : SDNode<"AMDGPUISD::SMED3", AMDGPUDTIntTernaryOp, + [] +>; + +def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp, + [] +>; + +def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>; + def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG", SDTypeProfile<0, 1, [SDTCisInt<0>]>, [SDNPHasChain, SDNPInGlue]>; diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index b0dae4a30c7..39b7030aa84 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -66,9 +66,9 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, : AMDGPUGenSubtargetInfo(TT, GPU, FS), DumpCode(false), R600ALUInst(false), HasVertexCache(false), TexVTXClauseSize(0), Gen(AMDGPUSubtarget::R600), FP64(false), - FP64Denormals(false), FP32Denormals(false), FastFMAF32(false), - HalfRate64Ops(false), CaymanISA(false), FlatAddressSpace(false), - FlatForGlobal(false), EnableIRStructurizer(true), + FP64Denormals(false), FP32Denormals(false), FPExceptions(false), + FastFMAF32(false), HalfRate64Ops(false), CaymanISA(false), + FlatAddressSpace(false), FlatForGlobal(false), EnableIRStructurizer(true), EnablePromoteAlloca(false), EnableIfCvt(true), EnableLoadStoreOpt(false), EnableUnsafeDSOffsetFolding(false), diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index 97c521949ca..109ca9f9ce1 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -66,6 +66,7 @@ private: bool FP64; bool FP64Denormals; bool FP32Denormals; + bool FPExceptions; bool FastFMAF32; bool HalfRate64Ops; bool CaymanISA; @@ -150,6 +151,10 @@ public: return FP64Denormals; } + bool hasFPExceptions() const { + return FPExceptions; + } + bool hasFastFMAF32() const { return FastFMAF32; } diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index faecf3c1da9..90f74d48065 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2131,8 +2131,70 @@ static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { } } -SDValue SITargetLowering::performMin3Max3Combine(SDNode *N, - DAGCombinerInfo &DCI) const { +static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, + SDLoc SL, + SDValue Op0, + SDValue Op1, + bool Signed) { + ConstantSDNode *K1 = dyn_cast(Op1); + if (!K1) + return SDValue(); + + ConstantSDNode *K0 = dyn_cast(Op0.getOperand(1)); + if (!K0) + return SDValue(); + + + if (Signed) { + if (K0->getAPIntValue().sge(K1->getAPIntValue())) + return SDValue(); + } else { + if (K0->getAPIntValue().uge(K1->getAPIntValue())) + return SDValue(); + } + + EVT VT = K0->getValueType(0); + return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT, + Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0)); +} + +static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) { + if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions()) + return true; + + return DAG.isKnownNeverNaN(Op); +} + +static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, + SDLoc SL, + SDValue Op0, + SDValue Op1) { + ConstantFPSDNode *K1 = dyn_cast(Op1); + if (!K1) + return SDValue(); + + ConstantFPSDNode *K0 = dyn_cast(Op0.getOperand(1)); + if (!K0) + return SDValue(); + + // Ordered >= (although NaN inputs should have folded away by now). + APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF()); + if (Cmp == APFloat::cmpGreaterThan) + return SDValue(); + + // This isn't safe with signaling NaNs because in IEEE mode, min/max on a + // signaling NaN gives a quiet NaN. The quiet NaN input to the min would then + // give the other result, which is different from med3 with a NaN input. + SDValue Var = Op0.getOperand(0); + if (!isKnownNeverSNan(DAG, Var)) + return SDValue(); + + return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), + Var, SDValue(K0, 0), SDValue(K1, 0)); +} + +SDValue SITargetLowering::performMinMaxCombine(SDNode *N, + DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; unsigned Opc = N->getOpcode(); @@ -2142,7 +2204,8 @@ SDValue SITargetLowering::performMin3Max3Combine(SDNode *N, // Only do this if the inner op has one use since this will just increases // register pressure for no benefit. - // max(max(a, b), c) + // max(max(a, b), c) -> max3(a, b, c) + // min(min(a, b), c) -> min3(a, b, c) if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { SDLoc DL(N); return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), @@ -2153,7 +2216,9 @@ SDValue SITargetLowering::performMin3Max3Combine(SDNode *N, Op1); } - // max(a, max(b, c)) + // Try commuted. + // max(a, max(b, c)) -> max3(a, b, c) + // min(a, min(b, c)) -> min3(a, b, c) if (Op1.getOpcode() == Opc && Op1.hasOneUse()) { SDLoc DL(N); return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), @@ -2164,6 +2229,24 @@ SDValue SITargetLowering::performMin3Max3Combine(SDNode *N, Op1.getOperand(1)); } + // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1) + if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) { + if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true)) + return Med3; + } + + if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) { + if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false)) + return Med3; + } + + // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1) + if (Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM && + N->getValueType(0) == MVT::f32 && Op0.hasOneUse()) { + if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1)) + return Res; + } + return SDValue(); } @@ -2217,7 +2300,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG && N->getValueType(0) != MVT::f64 && getTargetMachine().getOptLevel() > CodeGenOpt::None) - return performMin3Max3Combine(N, DCI); + return performMinMaxCombine(N, DCI); break; } diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index 4587b030cab..d321805ec46 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -54,7 +54,8 @@ class SITargetLowering : public AMDGPUTargetLowering { SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const; - SDValue performMin3Max3Combine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const; bool isLegalFlatAddressingMode(const AddrMode &AM) const; diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index a4fc2e3374c..a16491e1961 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -1695,13 +1695,13 @@ defm V_MAX3_U32 : VOP3Inst , "v_max3_u32", VOP_I32_I32_I32_I32, AMDGPUumax3 >; defm V_MED3_F32 : VOP3Inst , "v_med3_f32", - VOP_F32_F32_F32_F32 + VOP_F32_F32_F32_F32, AMDGPUfmed3 >; defm V_MED3_I32 : VOP3Inst , "v_med3_i32", - VOP_I32_I32_I32_I32 + VOP_I32_I32_I32_I32, AMDGPUsmed3 >; defm V_MED3_U32 : VOP3Inst , "v_med3_u32", - VOP_I32_I32_I32_I32 + VOP_I32_I32_I32_I32, AMDGPUumed3 >; //def V_SAD_U8 : VOP3_U8 <0x0000015a, "v_sad_u8", []>; diff --git a/test/CodeGen/AMDGPU/fmed3.ll b/test/CodeGen/AMDGPU/fmed3.ll new file mode 100644 index 00000000000..c02c0844024 --- /dev/null +++ b/test/CodeGen/AMDGPU/fmed3.ll @@ -0,0 +1,131 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=NOSNAN -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -check-prefix=SNAN -check-prefix=GCN %s + +declare i32 @llvm.r600.read.tidig.x() #0 +declare float @llvm.minnum.f32(float, float) #0 +declare float @llvm.maxnum.f32(float, float) #0 +declare double @llvm.minnum.f64(double, double) #0 +declare double @llvm.maxnum.f64(double, double) #0 + +; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_f32: +; NOSNAN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0 + +; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} +; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} +define void @v_test_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + + %max = call float @llvm.maxnum.f32(float %a, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + + store float %med, float addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_commute0_f32: +; NOSNAN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0 + +; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} +; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} +define void @v_test_fmed3_r_i_i_commute0_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + + %max = call float @llvm.maxnum.f32(float 2.0, float %a) + %med = call float @llvm.minnum.f32(float 4.0, float %max) + + store float %med, float addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_commute1_f32: +; NOSNAN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0 + +; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} +; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} +define void @v_test_fmed3_r_i_i_commute1_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + + %max = call float @llvm.maxnum.f32(float %a, float 2.0) + %med = call float @llvm.minnum.f32(float 4.0, float %max) + + store float %med, float addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_constant_order_f32: +; GCN: v_max_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} +; GCN: v_min_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} +define void @v_test_fmed3_r_i_i_constant_order_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + + %max = call float @llvm.maxnum.f32(float %a, float 4.0) + %med = call float @llvm.minnum.f32(float %max, float 2.0) + + store float %med, float addrspace(1)* %outgep + ret void +} + + +; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_multi_use_f32: +; GCN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} +; GCN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} +define void @v_test_fmed3_r_i_i_multi_use_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + + %max = call float @llvm.maxnum.f32(float %a, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + + store volatile float %med, float addrspace(1)* %outgep + store volatile float %max, float addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_f64: +; GCN: v_max_f64 {{v\[[0-9]+:[0-9]+\]}}, 2.0, {{v\[[0-9]+:[0-9]+\]}} +; GCN: v_min_f64 {{v\[[0-9]+:[0-9]+\]}}, 4.0, {{v\[[0-9]+:[0-9]+\]}} +define void @v_test_fmed3_r_i_i_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() + %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid + %outgep = getelementptr double, double addrspace(1)* %out, i32 %tid + %a = load double, double addrspace(1)* %gep0 + + %max = call double @llvm.maxnum.f64(double %a, double 2.0) + %med = call double @llvm.minnum.f64(double %max, double 4.0) + + store double %med, double addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_no_nans_f32: +; GCN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0 +define void @v_test_fmed3_r_i_i_no_nans_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 { + %tid = call i32 @llvm.r600.read.tidig.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + + %max = call float @llvm.maxnum.f32(float %a, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + + store float %med, float addrspace(1)* %outgep + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="false" } +attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" } diff --git a/test/CodeGen/AMDGPU/smed3.ll b/test/CodeGen/AMDGPU/smed3.ll new file mode 100644 index 00000000000..18c8c2c0818 --- /dev/null +++ b/test/CodeGen/AMDGPU/smed3.ll @@ -0,0 +1,120 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +declare i32 @llvm.r600.read.tidig.x() #0 + +; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i32: +; GCN: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 +define void @v_test_smed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0 + + %icmp0 = icmp sgt i32 %a, 12 + %i0 = select i1 %icmp0, i32 %a, i32 12 + + %icmp1 = icmp slt i32 %i0, 17 + %i1 = select i1 %icmp1, i32 %i0, i32 17 + + store i32 %i1, i32 addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_smed3_multi_use_r_i_i_i32: +; GCN: v_max_i32 +; GCN: v_min_i32 +define void @v_test_smed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0 + + %icmp0 = icmp sgt i32 %a, 12 + %i0 = select i1 %icmp0, i32 %a, i32 12 + + %icmp1 = icmp slt i32 %i0, 17 + %i1 = select i1 %icmp1, i32 %i0, i32 17 + + store volatile i32 %i0, i32 addrspace(1)* %outgep + store volatile i32 %i1, i32 addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_smed3_r_i_i_constant_order_i32: +; GCN: v_max_i32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}} +; GCN: v_min_i32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}} +define void @v_test_smed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0 + + %icmp0 = icmp sgt i32 %a, 17 + %i0 = select i1 %icmp0, i32 %a, i32 17 + + %icmp1 = icmp slt i32 %i0, 12 + %i1 = select i1 %icmp1, i32 %i0, i32 12 + + store i32 %i1, i32 addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_smed3_r_i_i_sign_mismatch_i32: +; GCN: v_max_u32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}} +; GCN: v_min_i32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}} +define void @v_test_smed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0 + + %icmp0 = icmp ugt i32 %a, 12 + %i0 = select i1 %icmp0, i32 %a, i32 12 + + %icmp1 = icmp slt i32 %i0, 17 + %i1 = select i1 %icmp1, i32 %i0, i32 17 + + store i32 %i1, i32 addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i64: +; GCN: v_cmp_lt_i64 +; GCN: v_cmp_gt_i64 +define void @v_test_smed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() + %gep0 = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %outgep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid + %a = load i64, i64 addrspace(1)* %gep0 + + %icmp0 = icmp sgt i64 %a, 12 + %i0 = select i1 %icmp0, i64 %a, i64 12 + + %icmp1 = icmp slt i64 %i0, 17 + %i1 = select i1 %icmp1, i64 %i0, i64 17 + + store i64 %i1, i64 addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i16: +; GCN: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 +define void @v_test_smed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() + %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid + %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid + %a = load i16, i16 addrspace(1)* %gep0 + + %icmp0 = icmp sgt i16 %a, 12 + %i0 = select i1 %icmp0, i16 %a, i16 12 + + %icmp1 = icmp slt i16 %i0, 17 + %i1 = select i1 %icmp1, i16 %i0, i16 17 + + store i16 %i1, i16 addrspace(1)* %outgep + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/umed3.ll b/test/CodeGen/AMDGPU/umed3.ll new file mode 100644 index 00000000000..3cdd5cea6d9 --- /dev/null +++ b/test/CodeGen/AMDGPU/umed3.ll @@ -0,0 +1,119 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +declare i32 @llvm.r600.read.tidig.x() #0 + +; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i32: +; GCN: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 +define void @v_test_umed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0 + + %icmp0 = icmp ugt i32 %a, 12 + %i0 = select i1 %icmp0, i32 %a, i32 12 + + %icmp1 = icmp ult i32 %i0, 17 + %i1 = select i1 %icmp1, i32 %i0, i32 17 + + store i32 %i1, i32 addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_umed3_multi_use_r_i_i_i32: +; GCN: v_max_u32 +; GCN: v_min_u32 +define void @v_test_umed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0 + + %icmp0 = icmp ugt i32 %a, 12 + %i0 = select i1 %icmp0, i32 %a, i32 12 + + %icmp1 = icmp ult i32 %i0, 17 + %i1 = select i1 %icmp1, i32 %i0, i32 17 + + store volatile i32 %i0, i32 addrspace(1)* %outgep + store volatile i32 %i1, i32 addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_umed3_r_i_i_constant_order_i32: +; GCN: v_max_u32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}} +; GCN: v_min_u32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}} +define void @v_test_umed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0 + + %icmp0 = icmp ugt i32 %a, 17 + %i0 = select i1 %icmp0, i32 %a, i32 17 + + %icmp1 = icmp ult i32 %i0, 12 + %i1 = select i1 %icmp1, i32 %i0, i32 12 + + store i32 %i1, i32 addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_umed3_r_i_i_sign_mismatch_i32: +; GCN: v_max_i32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}} +; GCN: v_min_u32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}} +define void @v_test_umed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0 + + %icmp0 = icmp sgt i32 %a, 12 + %i0 = select i1 %icmp0, i32 %a, i32 12 + + %icmp1 = icmp ult i32 %i0, 17 + %i1 = select i1 %icmp1, i32 %i0, i32 17 + + store i32 %i1, i32 addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i64: +; GCN: v_cmp_lt_u64 +; GCN: v_cmp_gt_u64 +define void @v_test_umed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() + %gep0 = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %outgep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid + %a = load i64, i64 addrspace(1)* %gep0 + + %icmp0 = icmp ugt i64 %a, 12 + %i0 = select i1 %icmp0, i64 %a, i64 12 + + %icmp1 = icmp ult i64 %i0, 17 + %i1 = select i1 %icmp1, i64 %i0, i64 17 + + store i64 %i1, i64 addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i16: +; GCN: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 +define void @v_test_umed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() + %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid + %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid + %a = load i16, i16 addrspace(1)* %gep0 + + %icmp0 = icmp ugt i16 %a, 12 + %i0 = select i1 %icmp0, i16 %a, i16 12 + + %icmp1 = icmp ult i16 %i0, 17 + %i1 = select i1 %icmp1, i16 %i0, i16 17 + + store i16 %i1, i16 addrspace(1)* %outgep + ret void +} +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind }