[SelectionDAG] Attempt to split BITREVERSE vector legalization into BSWAP and BITREVERSE stages

For BITREVERSE, bit shifting/masking every bit in a vector element is a very lengthy procedure.

If the input vector type is a whole multiple of bytes wide then we can split this into a BSWAP shuffle stage (to reverse at the byte level) and then a BITREVERSE stage applied to each byte. Most vector capable targets can efficiently BSWAP using shuffles resulting in a considerable reduction in instructions.

With this patch targets would only need to implement a target specific vXi8 BITREVERSE implementation to efficiently reverse most legal vector types.

Differential Revision: http://reviews.llvm.org/D19978

llvm-svn: 269290
This commit is contained in:
Simon Pilgrim 2016-05-12 13:09:49 +00:00
parent 3f6f8de238
commit 89b89650f3
3 changed files with 1180 additions and 2782 deletions

View File

@ -860,16 +860,19 @@ SDValue VectorLegalizer::ExpandZERO_EXTEND_VECTOR_INREG(SDValue Op) {
DAG.getVectorShuffle(SrcVT, DL, Zero, Src, ShuffleMask));
}
static void createBSWAPShuffleMask(EVT VT, SmallVectorImpl<int> &ShuffleMask) {
int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
for (int I = 0, E = VT.getVectorNumElements(); I != E; ++I)
for (int J = ScalarSizeInBytes - 1; J >= 0; --J)
ShuffleMask.push_back((I * ScalarSizeInBytes) + J);
}
SDValue VectorLegalizer::ExpandBSWAP(SDValue Op) {
EVT VT = Op.getValueType();
// Generate a byte wise shuffle mask for the BSWAP.
SmallVector<int, 16> ShuffleMask;
int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
for (int I = 0, E = VT.getVectorNumElements(); I != E; ++I)
for (int J = ScalarSizeInBytes - 1; J >= 0; --J)
ShuffleMask.push_back((I * ScalarSizeInBytes) + J);
createBSWAPShuffleMask(VT, ShuffleMask);
EVT ByteVT = EVT::getVectorVT(*DAG.getContext(), MVT::i8, ShuffleMask.size());
// Only emit a shuffle if the mask is legal.
@ -890,6 +893,30 @@ SDValue VectorLegalizer::ExpandBITREVERSE(SDValue Op) {
if (TLI.isOperationLegalOrCustom(ISD::BITREVERSE, VT.getScalarType()))
return DAG.UnrollVectorOp(Op.getNode());
// If the vector element width is a whole number of bytes, test if its legal
// to BSWAP shuffle the bytes and then perform the BITREVERSE on the byte
// vector. This greatly reduces the number of bit shifts necessary.
unsigned ScalarSizeInBits = VT.getScalarSizeInBits();
if (ScalarSizeInBits > 8 && (ScalarSizeInBits % 8) == 0) {
SmallVector<int, 16> BSWAPMask;
createBSWAPShuffleMask(VT, BSWAPMask);
EVT ByteVT = EVT::getVectorVT(*DAG.getContext(), MVT::i8, BSWAPMask.size());
if (TLI.isShuffleMaskLegal(BSWAPMask, ByteVT) &&
(TLI.isOperationLegalOrCustom(ISD::BITREVERSE, ByteVT) ||
(TLI.isOperationLegalOrCustom(ISD::SHL, ByteVT) &&
TLI.isOperationLegalOrCustom(ISD::SRL, ByteVT) &&
TLI.isOperationLegalOrCustomOrPromote(ISD::AND, ByteVT) &&
TLI.isOperationLegalOrCustomOrPromote(ISD::OR, ByteVT)))) {
SDLoc DL(Op);
Op = DAG.getNode(ISD::BITCAST, DL, ByteVT, Op.getOperand(0));
Op = DAG.getVectorShuffle(ByteVT, DL, Op, DAG.getUNDEF(ByteVT),
BSWAPMask.data());
Op = DAG.getNode(ISD::BITREVERSE, DL, ByteVT, Op);
return DAG.getNode(ISD::BITCAST, DL, VT, Op);
}
}
// If we have the appropriate vector bit operations, it is better to use them
// than unrolling and expanding each component.
if (!TLI.isOperationLegalOrCustom(ISD::SHL, VT) ||

View File

@ -7,6 +7,7 @@ declare <2 x i16> @llvm.bitreverse.v2i16(<2 x i16>) readnone
define <2 x i16> @f(<2 x i16> %a) {
; CHECK-LABEL: f:
; CHECK: rev32
; CHECK: ushr
%b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %a)
ret <2 x i16> %b

File diff suppressed because it is too large Load Diff