AMDGPU: Avoid using 64-bit shift for i64 (shl x, 32)

This can be done with only moves, which theoretically
will optimize better later.

Although this transform increases the instruction count,
it should be code size / cycle count neutral in the worst
VALU case. It also seems to slightly improve a couple
of testcases due to other DAG combines this exposes.

This is probably slightly worse for the SALU case, so
it might be better to handle this during moveToVALU,
although then you lose some simplifications, like
the load-width reduction in the simple testcase.
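
For illustration, a minimal sketch of the transform at the VALU
level; the register assignments are hypothetical and only show the
shape of the code, they are not taken from the commit:

  ; i64 (shl x, 32) -> (build_pair 0, x), with x in v[2:3]
  ; before: one 64-bit shift
  v_lshl_b64 v[0:1], v[2:3], 32
  ; after: two 32-bit moves; the low half of the result is zero
  ; and the old low half of x becomes the new high half
  v_mov_b32_e32 v0, 0
  v_mov_b32_e32 v1, v2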

llvm-svn: 242177
Matt Arsenault 2015-07-14 18:20:33 +00:00
parent 988dd980cf
commit efaaa8cb7c
6 changed files with 117 additions and 16 deletions


@@ -406,6 +406,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
   setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
   setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
 
+  setTargetDAGCombine(ISD::SHL);
   setTargetDAGCombine(ISD::MUL);
   setTargetDAGCombine(ISD::SELECT);
   setTargetDAGCombine(ISD::SELECT_CC);
@@ -2415,6 +2416,33 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
                      SN->getBasePtr(), SN->getMemOperand());
 }
 
+SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
+                                                DAGCombinerInfo &DCI) const {
+  if (N->getValueType(0) != MVT::i64)
+    return SDValue();
+
+  // i64 (shl x, 32) -> (build_pair 0, x)
+
+  // Doing this with moves theoretically helps MI optimizations that understand
+  // copies. 2 v_mov_b32_e32 will have the same code size / cycle count as
+  // v_lshl_b64. In the SALU case, I think this is slightly worse since it
+  // doubles the code size and I'm unsure about cycle count.
+  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (!RHS || RHS->getZExtValue() != 32)
+    return SDValue();
+
+  SDValue LHS = N->getOperand(0);
+
+  SDLoc SL(N);
+  SelectionDAG &DAG = DCI.DAG;
+
+  // Extract low 32-bits.
+  SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+
+  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+  return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Zero, Lo);
+}
+
 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
   EVT VT = N->getValueType(0);
@@ -2454,6 +2482,12 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
   switch(N->getOpcode()) {
   default:
     break;
+  case ISD::SHL: {
+    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+      break;
+
+    return performShlCombine(N, DCI);
+  }
   case ISD::MUL:
     return performMulCombine(N, DCI);
   case AMDGPUISD::MUL_I24:


@@ -65,6 +65,7 @@ private:
   SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
 protected:


@@ -3,8 +3,9 @@
 declare i32 @llvm.SI.tid() readnone
 
 ; SI-LABEL: {{^}}test_array_ptr_calc:
-; SI: v_mul_lo_i32
-; SI: v_mul_hi_i32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_hi_i32
+; SI: s_endpgm
 define void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
   %tid = call i32 @llvm.SI.tid() readnone
   %a_ptr = getelementptr [1025 x i32], [1025 x i32] addrspace(1)* %inA, i32 %tid, i32 0


@@ -52,16 +52,18 @@ entry:
 ; FUNC_LABEL: {{^}}mul24_i64:
 ; EG; MUL_UINT24
 ; EG: MULHI
-; SI: v_mul_u32_u24
 ; FIXME: SI support 24-bit mulhi
-; SI: v_mul_hi_u32
-define void @mul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+; SI-DAG: v_mul_u32_u24
+; SI-DAG: v_mul_hi_u32
+
+; SI: s_endpgm
+define void @mul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b, i64 %c) {
 entry:
-  %0 = shl i64 %a, 40
-  %a_24 = lshr i64 %0, 40
-  %1 = shl i64 %b, 40
-  %b_24 = lshr i64 %1, 40
-  %2 = mul i64 %a_24, %b_24
-  store i64 %2, i64 addrspace(1)* %out
+  %tmp0 = shl i64 %a, 40
+  %a_24 = lshr i64 %tmp0, 40
+  %tmp1 = shl i64 %b, 40
+  %b_24 = lshr i64 %tmp1, 40
+  %tmp2 = mul i64 %a_24, %b_24
+  store i64 %tmp2, i64 addrspace(1)* %out
   ret void
 }


@@ -1,6 +1,9 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; XUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
+declare i32 @llvm.r600.read.tidig.x() #0
+
 ;EG: {{^}}shl_v2i32:
 ;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
@@ -178,3 +181,32 @@ define void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in
   store <4 x i64> %result, <4 x i64> addrspace(1)* %out
   ret void
 }
+
+; Make sure load width gets reduced to i32 load.
+; GCN-LABEL: {{^}}s_shl_32_i64:
+; GCN-DAG: s_load_dword [[LO_A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb{{$}}
+; GCN-DAG: s_mov_b32 s[[SLO:[0-9]+]], 0{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
+; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[LO_A]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
+define void @s_shl_32_i64(i64 addrspace(1)* %out, i64 %a) {
+  %result = shl i64 %a, 32
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_shl_32_i64:
+; GCN-DAG: buffer_load_dword v[[LO_A:[0-9]+]],
+; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[LO_A]]{{\]}}
+define void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
+  %a = load i64, i64 addrspace(1)* %gep.in
+  %result = shl i64 %a, 32
+  store i64 %result, i64 addrspace(1)* %gep.out
+  ret void
+}
+
+attributes #0 = { nounwind readnone }


@@ -1,7 +1,9 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
+declare i32 @llvm.r600.read.tidig.x() #0
+
 ; FUNC-LABEL: {{^}}lshr_i32:
 ; SI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
@@ -184,3 +186,32 @@ define void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in
   store <4 x i64> %result, <4 x i64> addrspace(1)* %out
   ret void
 }
+
+; Make sure load width gets reduced to i32 load.
+; GCN-LABEL: {{^}}s_lshr_32_i64:
+; GCN-DAG: s_load_dword [[HI_A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc{{$}}
+; GCN-DAG: s_mov_b32 s[[SHI:[0-9]+]], 0{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
+; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[HI_A]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
+define void @s_lshr_32_i64(i64 addrspace(1)* %out, i64 %a) {
+  %result = lshr i64 %a, 32
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_lshr_32_i64:
+; GCN-DAG: buffer_load_dword v[[HI_A:[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[HI_A]]:[[VHI]]{{\]}}
+define void @v_lshr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
+  %a = load i64, i64 addrspace(1)* %gep.in
+  %result = lshr i64 %a, 32
+  store i64 %result, i64 addrspace(1)* %gep.out
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
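
Reading the CHECK lines of v_lshr_32_i64 above, the expected closing
sequence is roughly the following sketch; the register numbers are
hypothetical and only the shape matters:

  ; only the high dword of the i64 input is loaded (load width reduced)
  buffer_load_dword v0, v[2:3], s[0:3], 0 addr64 offset:4
  ; the high half of the result is zero
  v_mov_b32_e32 v1, 0
  ; store the (old high half, 0) pair as the 64-bit result
  buffer_store_dwordx2 v[0:1], v[2:3], s[0:3], 0 addr64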