[TTI, AArch64] Add transpose shuffle kind

This patch adds a new shuffle kind useful for transposing a 2xn matrix. These
transpose shuffle masks read corresponding even- or odd-numbered vector
elements from two n-dimensional source vectors and write each result into
consecutive elements of an n-dimensional destination vector. The transpose
shuffle kind is meant to model the TRN1 and TRN2 AArch64 instructions. As such,
this patch also considers transpose shuffles in the AArch64 implementation of
getShuffleCost.

Differential Revision: https://reviews.llvm.org/D45982

llvm-svn: 330941
This commit is contained in:
Matthew Simpson 2018-04-26 13:48:33 +00:00
parent 57ab217363
commit 0ecc1283b2
6 changed files with 131 additions and 33 deletions

View File

@ -642,6 +642,7 @@ public:
SK_Broadcast, ///< Broadcast element 0 to all other elements.
SK_Reverse, ///< Reverse the order of the vector.
SK_Alternate, ///< Choose alternate elements from vector.
SK_Transpose, ///< Transpose two vectors.
SK_InsertSubvector, ///< InsertSubvector. Index indicates start offset.
SK_ExtractSubvector,///< ExtractSubvector Index indicates start offset.
SK_PermuteTwoSrc, ///< Merge elements from two source vectors into one

View File

@ -553,11 +553,15 @@ public:
unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) {
if (Kind == TTI::SK_Alternate || Kind == TTI::SK_PermuteTwoSrc ||
Kind == TTI::SK_PermuteSingleSrc) {
switch (Kind) {
case TTI::SK_Alternate:
case TTI::SK_Transpose:
case TTI::SK_PermuteSingleSrc:
case TTI::SK_PermuteTwoSrc:
return getPermuteShuffleOverhead(Tp);
default:
return 1;
}
return 1;
}
unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,

View File

@ -683,6 +683,66 @@ static bool isAlternateVectorMask(ArrayRef<int> Mask) {
return isAlternate;
}
/// Return true if \p Mask is a transpose shuffle mask.
///
/// \param Mask the shufflevector mask to classify; its elements index into
///             the concatenation of the two source vectors.
/// \returns true only if the mask has at least two elements, a power-of-two
///          length, and matches the even (TRN1-like) or odd (TRN2-like)
///          transpose pattern described below.
static bool isTransposeVectorMask(ArrayRef<int> Mask) {
  // Transpose vector masks transpose a 2xn matrix. They read corresponding
  // even- or odd-numbered vector elements from two n-dimensional source
  // vectors and write each result into consecutive elements of an
  // n-dimensional destination vector. Two shuffles are necessary to complete
  // the transpose, one for the even elements and another for the odd elements.
  // This description closely follows how the TRN1 and TRN2 AArch64
  // instructions operate.
  //
  // For example, a simple 2x2 matrix can be transposed with:
  //
  //   ; Original matrix
  //   m0 = <a, b>
  //   m1 = <c, d>
  //
  //   ; Transposed matrix
  //   t0 = <a, c> = shufflevector m0, m1, <0, 2>
  //   t1 = <b, d> = shufflevector m0, m1, <1, 3>
  //
  // For matrices having more than two columns, the resulting nx2 transposed
  // matrix is stored in two result vectors such that one vector contains
  // interleaved elements from all the even-numbered rows and the other vector
  // contains interleaved elements from all the odd-numbered rows. For example,
  // a 2x4 matrix can be transposed with:
  //
  //   ; Original matrix
  //   m0 = <a, b, c, d>
  //   m1 = <e, f, g, h>
  //
  //   ; Transposed matrix
  //   t0 = <a, e, c, g> = shufflevector m0, m1 <0, 4, 2, 6>
  //   t1 = <b, f, d, h> = shufflevector m0, m1 <1, 5, 3, 7>
  //
  // The above explanation places limitations on what valid transpose masks can
  // look like. These limitations are defined by the checks below.
  const int NumElts = (int)Mask.size();

  // 1. The number of elements in the mask must be a power of two and at least
  //    two. The lower bound matters: a one-element mask is a power of two, and
  //    without it the Mask[1] access in check 3 would read out of bounds.
  if (NumElts < 2 || !isPowerOf2_32(NumElts))
    return false;

  // 2. The first element of the mask must be either a zero (for the
  //    even-numbered vector elements) or a one (for the odd-numbered vector
  //    elements).
  if (Mask[0] != 0 && Mask[0] != 1)
    return false;

  // 3. The difference between the first two elements must be equal to the
  //    number of elements in the mask, i.e. the second element selects the
  //    corresponding lane of the second source vector.
  if (Mask[1] - Mask[0] != NumElts)
    return false;

  // 4. The difference between consecutive even-numbered and odd-numbered
  //    elements must be equal to two.
  for (int I = 2; I < NumElts; ++I)
    if (Mask[I] - Mask[I - 2] != 2)
      return false;

  return true;
}
static TargetTransformInfo::OperandValueKind getOperandInfo(Value *V) {
TargetTransformInfo::OperandValueKind OpInfo =
TargetTransformInfo::OK_AnyValue;
@ -1139,22 +1199,26 @@ int TargetTransformInfo::getInstructionThroughput(const Instruction *I) const {
if (NumVecElems == Mask.size()) {
if (isReverseVectorMask(Mask))
return getShuffleCost(TargetTransformInfo::SK_Reverse, VecTypOp0,
0, nullptr);
return TTIImpl->getShuffleCost(TargetTransformInfo::SK_Reverse,
VecTypOp0, 0, nullptr);
if (isAlternateVectorMask(Mask))
return getShuffleCost(TargetTransformInfo::SK_Alternate,
VecTypOp0, 0, nullptr);
return TTIImpl->getShuffleCost(TargetTransformInfo::SK_Alternate,
VecTypOp0, 0, nullptr);
if (isTransposeVectorMask(Mask))
return TTIImpl->getShuffleCost(TargetTransformInfo::SK_Transpose,
VecTypOp0, 0, nullptr);
if (isZeroEltBroadcastVectorMask(Mask))
return getShuffleCost(TargetTransformInfo::SK_Broadcast,
VecTypOp0, 0, nullptr);
return TTIImpl->getShuffleCost(TargetTransformInfo::SK_Broadcast,
VecTypOp0, 0, nullptr);
if (isSingleSourceVectorMask(Mask))
return getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
VecTypOp0, 0, nullptr);
return TTIImpl->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
VecTypOp0, 0, nullptr);
return getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
VecTypOp0, 0, nullptr);
return TTIImpl->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
VecTypOp0, 0, nullptr);
}
return -1;

View File

@ -912,3 +912,30 @@ int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm);
}
/// Estimate the cost of a vector shuffle of kind \p Kind on type \p Tp.
///
/// Transpose shuffles whose legalized type appears in the table below get a
/// table-driven cost; every other request is forwarded unchanged to the base
/// implementation.
int AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                   Type *SubTp) {
  // Anything that is not a transpose has no target-specific handling here.
  if (Kind != TTI::SK_Transpose)
    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);

  // Transpose shuffle kinds can be performed with 'trn1/trn2' and 'zip1/zip2'
  // instructions, so each listed vector type is given a unit cost.
  static const CostTblEntry TransposeCosts[] = {
      // 8-bit integer elements.
      {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
      {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
      // 16-bit integer elements.
      {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
      {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
      // 32-bit and 64-bit integer elements.
      {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
      {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
      {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
      // Floating-point elements.
      {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
      {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
      {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
  };

  // Look the legalized type up in the table; on a hit, the per-entry cost is
  // scaled by the first member of the legalization result.
  std::pair<int, MVT> Legalized = TLI->getTypeLegalizationCost(DL, Tp);
  const auto *Match =
      CostTableLookup(TransposeCosts, ISD::VECTOR_SHUFFLE, Legalized.second);
  if (Match)
    return Legalized.first * Match->Cost;

  // Transposes of types not listed above also defer to the base estimate.
  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}

View File

@ -169,6 +169,8 @@ public:
int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
bool IsPairwiseForm);
int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
/// @}
};

View File

@ -2,7 +2,7 @@
; RUN: llc < %s -mtriple=aarch64--linux-gnu | FileCheck %s --check-prefix=CODE
; COST-LABEL: trn1.v8i8
; COST: Found an estimated cost of 42 for instruction: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
; CODE-LABEL: trn1.v8i8
; CODE: trn1 v0.8b, v0.8b, v1.8b
define <8 x i8> @trn1.v8i8(<8 x i8> %v0, <8 x i8> %v1) {
@ -11,7 +11,7 @@ define <8 x i8> @trn1.v8i8(<8 x i8> %v0, <8 x i8> %v1) {
}
; COST-LABEL: trn2.v8i8
; COST: Found an estimated cost of 42 for instruction: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
; CODE-LABEL: trn2.v8i8
; CODE: trn2 v0.8b, v0.8b, v1.8b
define <8 x i8> @trn2.v8i8(<8 x i8> %v0, <8 x i8> %v1) {
@ -20,7 +20,7 @@ define <8 x i8> @trn2.v8i8(<8 x i8> %v0, <8 x i8> %v1) {
}
; COST-LABEL: trn1.v16i8
; COST: Found an estimated cost of 90 for instruction: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
; CODE-LABEL: trn1.v16i8
; CODE: trn1 v0.16b, v0.16b, v1.16b
define <16 x i8> @trn1.v16i8(<16 x i8> %v0, <16 x i8> %v1) {
@ -29,7 +29,7 @@ define <16 x i8> @trn1.v16i8(<16 x i8> %v0, <16 x i8> %v1) {
}
; COST-LABEL: trn2.v16i8
; COST: Found an estimated cost of 90 for instruction: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
; CODE-LABEL: trn2.v16i8
; CODE: trn2 v0.16b, v0.16b, v1.16b
define <16 x i8> @trn2.v16i8(<16 x i8> %v0, <16 x i8> %v1) {
@ -38,7 +38,7 @@ define <16 x i8> @trn2.v16i8(<16 x i8> %v0, <16 x i8> %v1) {
}
; COST-LABEL: trn1.v4i16
; COST: Found an estimated cost of 18 for instruction: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
; CODE-LABEL: trn1.v4i16
; CODE: trn1 v0.4h, v0.4h, v1.4h
define <4 x i16> @trn1.v4i16(<4 x i16> %v0, <4 x i16> %v1) {
@ -47,7 +47,7 @@ define <4 x i16> @trn1.v4i16(<4 x i16> %v0, <4 x i16> %v1) {
}
; COST-LABEL: trn2.v4i16
; COST: Found an estimated cost of 18 for instruction: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
; CODE-LABEL: trn2.v4i16
; CODE: trn2 v0.4h, v0.4h, v1.4h
define <4 x i16> @trn2.v4i16(<4 x i16> %v0, <4 x i16> %v1) {
@ -56,7 +56,7 @@ define <4 x i16> @trn2.v4i16(<4 x i16> %v0, <4 x i16> %v1) {
}
; COST-LABEL: trn1.v8i16
; COST: Found an estimated cost of 42 for instruction: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
; CODE-LABEL: trn1.v8i16
; CODE: trn1 v0.8h, v0.8h, v1.8h
define <8 x i16> @trn1.v8i16(<8 x i16> %v0, <8 x i16> %v1) {
@ -65,7 +65,7 @@ define <8 x i16> @trn1.v8i16(<8 x i16> %v0, <8 x i16> %v1) {
}
; COST-LABEL: trn2.v8i16
; COST: Found an estimated cost of 42 for instruction: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
; CODE-LABEL: trn2.v8i16
; CODE: trn2 v0.8h, v0.8h, v1.8h
define <8 x i16> @trn2.v8i16(<8 x i16> %v0, <8 x i16> %v1) {
@ -74,7 +74,7 @@ define <8 x i16> @trn2.v8i16(<8 x i16> %v0, <8 x i16> %v1) {
}
; COST-LABEL: trn1.v2i32
; COST: Found an estimated cost of 6 for instruction: %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> <i32 0, i32 2>
; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> <i32 0, i32 2>
; CODE-LABEL: trn1.v2i32
; CODE: zip1 v0.2s, v0.2s, v1.2s
define <2 x i32> @trn1.v2i32(<2 x i32> %v0, <2 x i32> %v1) {
@ -83,7 +83,7 @@ define <2 x i32> @trn1.v2i32(<2 x i32> %v0, <2 x i32> %v1) {
}
; COST-LABEL: trn2.v2i32
; COST: Found an estimated cost of 6 for instruction: %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> <i32 1, i32 3>
; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> <i32 1, i32 3>
; CODE-LABEL: trn2.v2i32
; CODE: zip2 v0.2s, v0.2s, v1.2s
define <2 x i32> @trn2.v2i32(<2 x i32> %v0, <2 x i32> %v1) {
@ -92,7 +92,7 @@ define <2 x i32> @trn2.v2i32(<2 x i32> %v0, <2 x i32> %v1) {
}
; COST-LABEL: trn1.v4i32
; COST: Found an estimated cost of 18 for instruction: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
; CODE-LABEL: trn1.v4i32
; CODE: trn1 v0.4s, v0.4s, v1.4s
define <4 x i32> @trn1.v4i32(<4 x i32> %v0, <4 x i32> %v1) {
@ -101,7 +101,7 @@ define <4 x i32> @trn1.v4i32(<4 x i32> %v0, <4 x i32> %v1) {
}
; COST-LABEL: trn2.v4i32
; COST: Found an estimated cost of 18 for instruction: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
; CODE-LABEL: trn2.v4i32
; CODE: trn2 v0.4s, v0.4s, v1.4s
define <4 x i32> @trn2.v4i32(<4 x i32> %v0, <4 x i32> %v1) {
@ -110,7 +110,7 @@ define <4 x i32> @trn2.v4i32(<4 x i32> %v0, <4 x i32> %v1) {
}
; COST-LABEL: trn1.v2i64
; COST: Found an estimated cost of 6 for instruction: %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> <i32 0, i32 2>
; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> <i32 0, i32 2>
; CODE-LABEL: trn1.v2i64
; CODE: zip1 v0.2d, v0.2d, v1.2d
define <2 x i64> @trn1.v2i64(<2 x i64> %v0, <2 x i64> %v1) {
@ -119,7 +119,7 @@ define <2 x i64> @trn1.v2i64(<2 x i64> %v0, <2 x i64> %v1) {
}
; COST-LABEL: trn2.v2i64
; COST: Found an estimated cost of 6 for instruction: %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> <i32 1, i32 3>
; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> <i32 1, i32 3>
; CODE-LABEL: trn2.v2i64
; CODE: zip2 v0.2d, v0.2d, v1.2d
define <2 x i64> @trn2.v2i64(<2 x i64> %v0, <2 x i64> %v1) {
@ -128,7 +128,7 @@ define <2 x i64> @trn2.v2i64(<2 x i64> %v0, <2 x i64> %v1) {
}
; COST-LABEL: trn1.v2f32
; COST: Found an estimated cost of 6 for instruction: %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> <i32 0, i32 2>
; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> <i32 0, i32 2>
; CODE-LABEL: trn1.v2f32
; CODE: zip1 v0.2s, v0.2s, v1.2s
define <2 x float> @trn1.v2f32(<2 x float> %v0, <2 x float> %v1) {
@ -137,7 +137,7 @@ define <2 x float> @trn1.v2f32(<2 x float> %v0, <2 x float> %v1) {
}
; COST-LABEL: trn2.v2f32
; COST: Found an estimated cost of 6 for instruction: %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> <i32 1, i32 3>
; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> <i32 1, i32 3>
; CODE-LABEL: trn2.v2f32
; CODE: zip2 v0.2s, v0.2s, v1.2s
define <2 x float> @trn2.v2f32(<2 x float> %v0, <2 x float> %v1) {
@ -146,7 +146,7 @@ define <2 x float> @trn2.v2f32(<2 x float> %v0, <2 x float> %v1) {
}
; COST-LABEL: trn1.v4f32
; COST: Found an estimated cost of 18 for instruction: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
; CODE-LABEL: trn1.v4f32
; CODE: trn1 v0.4s, v0.4s, v1.4s
define <4 x float> @trn1.v4f32(<4 x float> %v0, <4 x float> %v1) {
@ -155,7 +155,7 @@ define <4 x float> @trn1.v4f32(<4 x float> %v0, <4 x float> %v1) {
}
; COST-LABEL: trn2.v4f32
; COST: Found an estimated cost of 18 for instruction: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
; CODE-LABEL: trn2.v4f32
; CODE: trn2 v0.4s, v0.4s, v1.4s
define <4 x float> @trn2.v4f32(<4 x float> %v0, <4 x float> %v1) {
@ -164,7 +164,7 @@ define <4 x float> @trn2.v4f32(<4 x float> %v0, <4 x float> %v1) {
}
; COST-LABEL: trn1.v2f64
; COST: Found an estimated cost of 6 for instruction: %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> <i32 0, i32 2>
; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> <i32 0, i32 2>
; CODE-LABEL: trn1.v2f64
; CODE: zip1 v0.2d, v0.2d, v1.2d
define <2 x double> @trn1.v2f64(<2 x double> %v0, <2 x double> %v1) {
@ -173,7 +173,7 @@ define <2 x double> @trn1.v2f64(<2 x double> %v0, <2 x double> %v1) {
}
; COST-LABEL: trn2.v2f64
; COST: Found an estimated cost of 6 for instruction: %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> <i32 1, i32 3>
; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> <i32 1, i32 3>
; CODE-LABEL: trn2.v2f64
; CODE: zip2 v0.2d, v0.2d, v1.2d
define <2 x double> @trn2.v2f64(<2 x double> %v0, <2 x double> %v1) {