From 1d07592ec77957f002189a0e2b4f5ed2462c5e3d Mon Sep 17 00:00:00 2001 From: Jan Vesely Date: Thu, 22 Jan 2015 23:42:43 +0000 Subject: [PATCH] R600: Try to use lower types for 64bit division if possible v2: add and enable tests for SI Signed-off-by: Jan Vesely Reviewed-by: Matt Arsenault git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@226881 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/R600/AMDGPUISelLowering.cpp | 50 ++++-- lib/Target/R600/AMDGPUISelLowering.h | 2 +- test/CodeGen/R600/sdivrem64.ll | 218 +++++++++++++++++++++++++ test/CodeGen/R600/udivrem64.ll | 138 +++++++++++++++- 4 files changed, 393 insertions(+), 15 deletions(-) create mode 100644 test/CodeGen/R600/sdivrem64.ll diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index 95ef963fb21..e169f912624 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -1661,6 +1661,20 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero); SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one); + if (VT == MVT::i64 && + DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) && + DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) { + + SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), + LHS_Lo, RHS_Lo); + + SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(0), zero); + SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(1), zero); + Results.push_back(DIV); + Results.push_back(REM); + return; + } + // Get Speculative values SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo); SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo); @@ -1722,8 +1736,8 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, SDValue Den = Op.getOperand(1); if (VT == MVT::i32) { - if (DAG.MaskedValueIsZero(Op.getOperand(0), APInt(32, 0xff << 24)) && - DAG.MaskedValueIsZero(Op.getOperand(1), APInt(32, 0xff << 24))) { + if (DAG.MaskedValueIsZero(Num, APInt::getHighBitsSet(32, 8)) && + DAG.MaskedValueIsZero(Den, APInt::getHighBitsSet(32, 8))) { // TODO: We technically could do this for i64, but shouldn't that just be // handled by something generally reducing 64-bit division on 32-bit // values to 32-bit? @@ -1835,19 +1849,31 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op, SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); - if (VT == MVT::i32) { - if (DAG.ComputeNumSignBits(Op.getOperand(0)) > 8 && - DAG.ComputeNumSignBits(Op.getOperand(1)) > 8) { - // TODO: We technically could do this for i64, but shouldn't that just be - // handled by something generally reducing 64-bit division on 32-bit - // values to 32-bit? - return LowerDIVREM24(Op, DAG, true); - } - } - SDValue Zero = DAG.getConstant(0, VT); SDValue NegOne = DAG.getConstant(-1, VT); + if (VT == MVT::i32 && + DAG.ComputeNumSignBits(LHS) > 8 && + DAG.ComputeNumSignBits(RHS) > 8) { + return LowerDIVREM24(Op, DAG, true); + } + if (VT == MVT::i64 && + DAG.ComputeNumSignBits(LHS) > 32 && + DAG.ComputeNumSignBits(RHS) > 32) { + EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); + + //HiLo split + SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero); + SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero); + SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), + LHS_Lo, RHS_Lo); + SDValue Res[2] = { + DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)), + DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1)) + }; + return DAG.getMergeValues(Res, DL); + } + SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT); SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT); SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign); diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h index 87c40d09e8b..387a58e1e8c 100644 --- a/lib/Target/R600/AMDGPUISelLowering.h +++ b/lib/Target/R600/AMDGPUISelLowering.h @@ -43,7 +43,6 @@ private: /// \brief Split a vector store into multiple scalar stores. /// \returns The resulting chain. - SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const; @@ -90,6 +89,7 @@ protected: SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const; void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl &Results) const; diff --git a/test/CodeGen/R600/sdivrem64.ll b/test/CodeGen/R600/sdivrem64.ll new file mode 100644 index 00000000000..425ad28634d --- /dev/null +++ b/test/CodeGen/R600/sdivrem64.ll @@ -0,0 +1,218 @@ +;RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s +;RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s + +;FUNC-LABEL: {{^}}test_sdiv: +;EG: RECIP_UINT +;EG: LSHL {{.*}}, 1, +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT + +;SI: v_bfe_u32 +;SI: v_bfe_u32 +;SI: v_bfe_u32 +;SI: v_bfe_u32 +;SI: v_bfe_u32 +;SI: v_bfe_u32 +;SI: v_bfe_u32 +;SI: v_bfe_u32 +;SI: v_bfe_u32 +;SI: v_bfe_u32 +;SI: v_bfe_u32 +;SI: v_bfe_u32 +;SI: v_bfe_u32 +;SI: v_bfe_u32 +;SI: v_bfe_u32 +;SI: v_bfe_u32 +;SI: v_bfe_u32 +;SI: v_bfe_u32 +;SI: v_bfe_u32 +;SI: v_bfe_u32 +;SI: v_bfe_u32 +;SI: v_bfe_u32 +;SI: v_bfe_u32 +;SI: v_bfe_u32 +;SI: v_bfe_u32 +;SI: v_bfe_u32 +;SI: v_bfe_u32 +;SI: v_bfe_u32 +;SI: v_bfe_u32 +;SI: v_bfe_u32 +;SI-NOT: v_mad_f32 +;SI-NOT: v_lshr_64 +;SI: s_endpgm +define void @test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %result = sdiv i64 %x, %y + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_srem: +;EG: RECIP_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: AND_INT {{.*}}, 1, + +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI-NOT: v_mad_f32 +;SI-NOT: v_lshr_64 +;SI: s_endpgm +define void @test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %result = urem i64 %x, %y + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_sdiv3264: +;EG: RECIP_UINT +;EG-NOT: BFE_UINT + +;SI-NOT: s_bfe_u32 +;SI-NOT: v_mad_f32 +;SI-NOT: v_lshr_64 +;SI: s_endpgm +define void @test_sdiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %1 = ashr i64 %x, 33 + %2 = ashr i64 %y, 33 + %result = sdiv i64 %1, %2 + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_srem3264: +;EG: RECIP_UINT +;EG-NOT: BFE_UINT + +;SI-NOT: s_bfe_u32 +;SI-NOT: v_mad_f32 +;SI-NOT: v_lshr_64 +;SI: s_endpgm +define void @test_srem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %1 = ashr i64 %x, 33 + %2 = ashr i64 %y, 33 + %result = srem i64 %1, %2 + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_sdiv2464: +;EG: INT_TO_FLT +;EG: INT_TO_FLT +;EG: FLT_TO_INT +;EG-NOT: RECIP_UINT +;EG-NOT: BFE_UINT + +;SI-NOT: s_bfe_u32 +;SI: v_mad_f32 +;SI-NOT: v_lshr_64 +;SI: s_endpgm +define void @test_sdiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %1 = ashr i64 %x, 40 + %2 = ashr i64 %y, 40 + %result = sdiv i64 %1, %2 + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_srem2464: +;EG: INT_TO_FLT +;EG: INT_TO_FLT +;EG: FLT_TO_INT +;EG-NOT: RECIP_UINT +;EG-NOT: BFE_UINT + +;SI-NOT: s_bfe_u32 +;SI: v_mad_f32 +;SI-NOT: v_lshr_64 +;SI: s_endpgm +define void @test_srem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %1 = ashr i64 %x, 40 + %2 = ashr i64 %y, 40 + %result = srem i64 %1, %2 + store i64 %result, i64 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/udivrem64.ll b/test/CodeGen/R600/udivrem64.ll index a0474585987..a147e81d581 100644 --- a/test/CodeGen/R600/udivrem64.ll +++ b/test/CodeGen/R600/udivrem64.ll @@ -1,5 +1,5 @@ -;XUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s +;RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s +;RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s ;FUNC-LABEL: {{^}}test_udiv: ;EG: RECIP_UINT @@ -34,6 +34,39 @@ ;EG: BFE_UINT ;EG: BFE_UINT ;EG: BFE_UINT + +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI-NOT: v_mad_f32 +;SI-NOT: v_lshr_64 ;SI: s_endpgm define void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { %result = udiv i64 %x, %y @@ -74,9 +107,110 @@ define void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { ;EG: BFE_UINT ;EG: BFE_UINT ;EG: AND_INT {{.*}}, 1, + +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI: s_bfe_u32 +;SI-NOT: v_mad_f32 +;SI-NOT: v_lshr_64 ;SI: s_endpgm define void @test_urem(i64 addrspace(1)* %out, i64 %x, i64 %y) { %result = urem i64 %x, %y store i64 %result, i64 addrspace(1)* %out ret void } + +;FUNC-LABEL: {{^}}test_udiv3264: +;EG: RECIP_UINT +;EG-NOT: BFE_UINT + +;SI-NOT: s_bfe_u32 +;SI-NOT: v_mad_f32 +;SI-NOT: v_lshr_64 +;SI: s_endpgm +define void @test_udiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %1 = lshr i64 %x, 33 + %2 = lshr i64 %y, 33 + %result = udiv i64 %1, %2 + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_urem3264: +;EG: RECIP_UINT +;EG-NOT: BFE_UINT + +;SI-NOT: s_bfe_u32 +;SI-NOT: v_mad_f32 +;SI-NOT: v_lshr_64 +;SI: s_endpgm +define void @test_urem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %1 = lshr i64 %x, 33 + %2 = lshr i64 %y, 33 + %result = urem i64 %1, %2 + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_udiv2464: +;EG: UINT_TO_FLT +;EG: UINT_TO_FLT +;EG: FLT_TO_UINT +;EG-NOT: RECIP_UINT +;EG-NOT: BFE_UINT + +;SI-NOT: v_lshr_64 +;SI: v_mad_f32 +;SI: s_endpgm +define void @test_udiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %1 = lshr i64 %x, 40 + %2 = lshr i64 %y, 40 + %result = udiv i64 %1, %2 + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_urem2464: +;EG: UINT_TO_FLT +;EG: UINT_TO_FLT +;EG: FLT_TO_UINT +;EG-NOT: RECIP_UINT +;EG-NOT: BFE_UINT + +;SI-NOT: v_lshr_64 +;SI: v_mad_f32 +;SI: s_endpgm +define void @test_urem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %1 = lshr i64 %x, 40 + %2 = lshr i64 %y, 40 + %result = urem i64 %1, %2 + store i64 %result, i64 addrspace(1)* %out + ret void +}