mirror of
https://github.com/RPCSX/llvm.git
synced 2024-12-03 01:12:59 +00:00
AMDGPU: Use generic bitreverse intrinsic
Also fix bug in vector legalization for bitreverse. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@255512 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
59dc7be11d
commit
1451e94ee0
@ -105,6 +105,7 @@ class VectorLegalizer {
|
||||
SDValue ExpandLoad(SDValue Op);
|
||||
SDValue ExpandStore(SDValue Op);
|
||||
SDValue ExpandFNEG(SDValue Op);
|
||||
SDValue ExpandBITREVERSE(SDValue Op);
|
||||
|
||||
/// \brief Implements vector promotion.
|
||||
///
|
||||
@ -280,6 +281,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
|
||||
case ISD::ROTL:
|
||||
case ISD::ROTR:
|
||||
case ISD::BSWAP:
|
||||
case ISD::BITREVERSE:
|
||||
case ISD::CTLZ:
|
||||
case ISD::CTTZ:
|
||||
case ISD::CTLZ_ZERO_UNDEF:
|
||||
@ -417,7 +419,7 @@ SDValue VectorLegalizer::Promote(SDValue Op) {
|
||||
else
|
||||
Operands[j] = Op.getOperand(j);
|
||||
}
|
||||
|
||||
|
||||
Op = DAG.getNode(Op.getOpcode(), dl, NVT, Operands, Op.getNode()->getFlags());
|
||||
if ((VT.isFloatingPoint() && NVT.isFloatingPoint()) ||
|
||||
(VT.isVector() && VT.getVectorElementType().isFloatingPoint() &&
|
||||
@ -715,6 +717,8 @@ SDValue VectorLegalizer::Expand(SDValue Op) {
|
||||
return ExpandFNEG(Op);
|
||||
case ISD::SETCC:
|
||||
return UnrollVSETCC(Op);
|
||||
case ISD::BITREVERSE:
|
||||
return ExpandBITREVERSE(Op);
|
||||
default:
|
||||
return DAG.UnrollVectorOp(Op.getNode());
|
||||
}
|
||||
@ -900,6 +904,25 @@ SDValue VectorLegalizer::ExpandBSWAP(SDValue Op) {
|
||||
return DAG.getNode(ISD::BITCAST, DL, VT, Op);
|
||||
}
|
||||
|
||||
/// \brief Expands a vector BITREVERSE node.
///
/// Either unrolls the node with DAG.UnrollVectorOp() or returns it unchanged.
///
/// \param Op the node to expand; only its value type is inspected here.
/// \returns the unrolled node when the scalar element type has a legal or
/// custom BITREVERSE, or when any of SHL, SRL, AND or OR is neither legal nor
/// custom for the vector type; otherwise \p Op itself, unmodified.
SDValue VectorLegalizer::ExpandBITREVERSE(SDValue Op) {
|
||||
EVT VT = Op.getValueType();
|
||||
|
||||
// If we have the scalar operation, it's probably cheaper to unroll it.
|
||||
if (TLI.isOperationLegalOrCustom(ISD::BITREVERSE, VT.getScalarType()))
|
||||
return DAG.UnrollVectorOp(Op.getNode());
|
||||
|
||||
// If we have the appropriate vector bit operations, it is better to use them
|
||||
// than unrolling and expanding each component.
// The check below is the negated form: unroll as soon as one of the four
// vector operations is missing for VT.
|
||||
if (!TLI.isOperationLegalOrCustom(ISD::SHL, VT) ||
|
||||
!TLI.isOperationLegalOrCustom(ISD::SRL, VT) ||
|
||||
!TLI.isOperationLegalOrCustom(ISD::AND, VT) ||
|
||||
!TLI.isOperationLegalOrCustom(ISD::OR, VT))
|
||||
return DAG.UnrollVectorOp(Op.getNode());
|
||||
|
||||
// Let LegalizeDAG handle this later.
// The node is returned as-is; no replacement is built in this function.
|
||||
return Op;
|
||||
}
|
||||
|
||||
SDValue VectorLegalizer::ExpandVSELECT(SDValue Op) {
|
||||
// Implement VSELECT in terms of XOR, AND, OR
|
||||
// on platforms which do not support blend natively.
|
||||
|
@ -1036,9 +1036,6 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
|
||||
Op.getOperand(1),
|
||||
Op.getOperand(2));
|
||||
|
||||
case AMDGPUIntrinsic::AMDGPU_brev:
|
||||
return DAG.getNode(AMDGPUISD::BREV, DL, VT, Op.getOperand(1));
|
||||
|
||||
case Intrinsic::AMDGPU_class:
|
||||
return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
|
||||
Op.getOperand(1), Op.getOperand(2));
|
||||
@ -1050,6 +1047,8 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
|
||||
return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1));
|
||||
case AMDGPUIntrinsic::AMDGPU_trunc: // Legacy name.
|
||||
return DAG.getNode(ISD::FTRUNC, DL, VT, Op.getOperand(1));
|
||||
case AMDGPUIntrinsic::AMDGPU_brev: // Legacy name
|
||||
return DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(1));
|
||||
}
|
||||
}
|
||||
|
||||
@ -2700,7 +2699,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
|
||||
NODE_NAME_CASE(BFE_I32)
|
||||
NODE_NAME_CASE(BFI)
|
||||
NODE_NAME_CASE(BFM)
|
||||
NODE_NAME_CASE(BREV)
|
||||
NODE_NAME_CASE(MUL_U24)
|
||||
NODE_NAME_CASE(MUL_I24)
|
||||
NODE_NAME_CASE(MAD_U24)
|
||||
|
@ -263,7 +263,6 @@ enum NodeType : unsigned {
|
||||
BFE_I32, // Extract range of bits with sign extension to 32-bits.
|
||||
BFI, // (src0 & src1) | (~src0 & src2)
|
||||
BFM, // Insert a range of bits into a 32-bit word.
|
||||
BREV, // Reverse bits.
|
||||
MUL_U24,
|
||||
MUL_I24,
|
||||
MAD_U24,
|
||||
|
@ -191,8 +191,6 @@ def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>;
|
||||
def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>;
|
||||
def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>;
|
||||
|
||||
def AMDGPUbrev : SDNode<"AMDGPUISD::BREV", SDTIntUnaryOp>;
|
||||
|
||||
// Signed and unsigned 24-bit multiply. The highest 8-bits are ignored when
|
||||
// performing the multiply. The result is a 32-bit value.
|
||||
def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp,
|
||||
|
@ -107,6 +107,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
|
||||
setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
|
||||
|
||||
setOperationAction(ISD::BSWAP, MVT::i32, Legal);
|
||||
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
|
||||
|
||||
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal);
|
||||
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
|
||||
|
@ -127,7 +127,7 @@ let Defs = [SCC] in {
|
||||
|
||||
|
||||
defm S_BREV_B32 : SOP1_32 <sop1<0x0b, 0x08>, "s_brev_b32",
|
||||
[(set i32:$dst, (AMDGPUbrev i32:$src0))]
|
||||
[(set i32:$dst, (bitreverse i32:$src0))]
|
||||
>;
|
||||
defm S_BREV_B64 : SOP1_64 <sop1<0x0c, 0x09>, "s_brev_b64", []>;
|
||||
|
||||
|
115
test/CodeGen/AMDGPU/bitreverse.ll
Normal file
115
test/CodeGen/AMDGPU/bitreverse.ll
Normal file
@ -0,0 +1,115 @@
|
||||
; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
|
||||
|
||||
declare i16 @llvm.bitreverse.i16(i16) #1
|
||||
declare i32 @llvm.bitreverse.i32(i32) #1
|
||||
declare i64 @llvm.bitreverse.i64(i64) #1
|
||||
|
||||
declare <2 x i32> @llvm.bitreverse.v2i32(<2 x i32>) #1
|
||||
declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) #1
|
||||
|
||||
declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) #1
|
||||
declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) #1
|
||||
|
||||
declare i32 @llvm.AMDGPU.brev(i32) #1
|
||||
|
||||
; FUNC-LABEL: {{^}}s_brev_i16:
|
||||
; SI: s_brev_b32
|
||||
; Bit-reverses the i16 argument %val with llvm.bitreverse.i16 and stores the
; result through %out.
define void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 {
|
||||
%brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
|
||||
store i16 %brev, i16 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}v_brev_i16:
|
||||
; SI: v_bfrev_b32_e32
|
||||
; Loads an i16 from %valptr, bit-reverses it with llvm.bitreverse.i16 and
; stores the result through %out.
define void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) #0 {
|
||||
%val = load i16, i16 addrspace(1)* %valptr
|
||||
%brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
|
||||
store i16 %brev, i16 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}s_brev_i32:
|
||||
; SI: s_load_dword [[VAL:s[0-9]+]],
|
||||
; SI: s_brev_b32 [[SRESULT:s[0-9]+]], [[VAL]]
|
||||
; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
|
||||
; SI: buffer_store_dword [[VRESULT]],
|
||||
; SI: s_endpgm
|
||||
; Bit-reverses the i32 argument %val with the generic llvm.bitreverse.i32
; intrinsic and stores the result through %out.
define void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) #0 {
|
||||
%brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
|
||||
store i32 %brev, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}v_brev_i32:
|
||||
; SI: buffer_load_dword [[VAL:v[0-9]+]],
|
||||
; SI: v_bfrev_b32_e32 [[RESULT:v[0-9]+]], [[VAL]]
|
||||
; SI: buffer_store_dword [[RESULT]],
|
||||
; SI: s_endpgm
|
||||
; Loads an i32 from %valptr, bit-reverses it with llvm.bitreverse.i32 and
; stores the result through %out.
define void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 {
|
||||
%val = load i32, i32 addrspace(1)* %valptr
|
||||
%brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
|
||||
store i32 %brev, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}s_brev_v2i32:
|
||||
; SI: s_brev_b32
|
||||
; SI: s_brev_b32
|
||||
; Bit-reverses each element of the <2 x i32> argument %val with
; llvm.bitreverse.v2i32 and stores the vector through %out.
define void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> %val) #0 {
|
||||
%brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
|
||||
store <2 x i32> %brev, <2 x i32> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}v_brev_v2i32:
|
||||
; SI: v_bfrev_b32_e32
|
||||
; SI: v_bfrev_b32_e32
|
||||
; Loads a <2 x i32> from %valptr, bit-reverses it with llvm.bitreverse.v2i32
; and stores the vector through %out.
define void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 {
|
||||
%val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr
|
||||
%brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
|
||||
store <2 x i32> %brev, <2 x i32> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}s_brev_i64:
|
||||
; Bit-reverses the i64 argument %val with llvm.bitreverse.i64 and stores the
; result through %out.
; NOTE(review): only a label check accompanies this function; no instruction
; patterns are verified for the i64 case.
define void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 {
|
||||
%brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
|
||||
store i64 %brev, i64 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}v_brev_i64:
|
||||
; Loads an i64 from %valptr, bit-reverses it with llvm.bitreverse.i64 and
; stores the result through %out.
; NOTE(review): only a label check accompanies this function; no instruction
; patterns are verified for the i64 case.
define void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 {
|
||||
%val = load i64, i64 addrspace(1)* %valptr
|
||||
%brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
|
||||
store i64 %brev, i64 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}s_brev_v2i64:
|
||||
; Bit-reverses each element of the <2 x i64> argument %val with
; llvm.bitreverse.v2i64 and stores the vector through %out.
; NOTE(review): only a label check accompanies this function; no instruction
; patterns are verified for the v2i64 case.
define void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %val) #0 {
|
||||
%brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
|
||||
store <2 x i64> %brev, <2 x i64> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}v_brev_v2i64:
|
||||
; Loads a <2 x i64> from %valptr, bit-reverses it with llvm.bitreverse.v2i64
; and stores the vector through %out.
; NOTE(review): only a label check accompanies this function; no instruction
; patterns are verified for the v2i64 case.
define void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 {
|
||||
%val = load <2 x i64>, <2 x i64> addrspace(1)* %valptr
|
||||
%brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
|
||||
store <2 x i64> %brev, <2 x i64> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}legacy_s_brev_i32:
|
||||
; SI: s_brev_b32
|
||||
; Bit-reverses the i32 argument %val through the target-specific
; llvm.AMDGPU.brev intrinsic (rather than llvm.bitreverse.i32) and stores the
; result through %out. Uses an inline nounwind attribute instead of group #0.
define void @legacy_s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
|
||||
%brev = call i32 @llvm.AMDGPU.brev(i32 %val) #1
|
||||
store i32 %brev, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
attributes #1 = { nounwind readnone }
|
@ -1,28 +0,0 @@
|
||||
; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
|
||||
|
||||
declare i32 @llvm.AMDGPU.brev(i32) nounwind readnone
|
||||
|
||||
; FUNC-LABEL: {{^}}s_brev_i32:
|
||||
; SI: s_load_dword [[VAL:s[0-9]+]],
|
||||
; SI: s_brev_b32 [[SRESULT:s[0-9]+]], [[VAL]]
|
||||
; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
|
||||
; SI: buffer_store_dword [[VRESULT]],
|
||||
; SI: s_endpgm
|
||||
; Bit-reverses the i32 argument %val with llvm.AMDGPU.brev and stores the
; result through %out with 4-byte alignment.
; Despite its name, %ctlz holds the bit-reversed value, not a leading-zero
; count.
define void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
|
||||
%ctlz = call i32 @llvm.AMDGPU.brev(i32 %val) nounwind readnone
|
||||
store i32 %ctlz, i32 addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}v_brev_i32:
|
||||
; SI: buffer_load_dword [[VAL:v[0-9]+]],
|
||||
; SI: v_bfrev_b32_e32 [[RESULT:v[0-9]+]], [[VAL]]
|
||||
; SI: buffer_store_dword [[RESULT]],
|
||||
; SI: s_endpgm
|
||||
; Loads an i32 from %valptr, bit-reverses it with llvm.AMDGPU.brev and stores
; the result through %out; both memory accesses use 4-byte alignment.
; Despite its name, %ctlz holds the bit-reversed value, not a leading-zero
; count.
define void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
|
||||
%val = load i32, i32 addrspace(1)* %valptr, align 4
|
||||
%ctlz = call i32 @llvm.AMDGPU.brev(i32 %val) nounwind readnone
|
||||
store i32 %ctlz, i32 addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
Loading…
Reference in New Issue
Block a user