AVX-512: Added shuffle instructions -

VPSHUFD, VPERMILPS, VMOVDDUP, VMOVLHPS, VMOVHLPS, VSHUFPS and VALIGN,
in single and double forms.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@189215 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Elena Demikhovsky 2013-08-26 12:45:35 +00:00
parent e4bf77a128
commit 92bfb54770
4 changed files with 239 additions and 34 deletions

View File

@ -3600,7 +3600,7 @@ static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
return false;
unsigned NumElts = VT.getVectorNumElements();
unsigned NumLanes = VT.getSizeInBits()/128;
unsigned NumLanes = VT.is512BitVector() ? 1: VT.getSizeInBits()/128;
unsigned NumLaneElts = NumElts/NumLanes;
// Do not handle 64-bit element shuffles with palignr.
@ -3683,10 +3683,7 @@ static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
/// specifies a shuffle of elements that is suitable for input to 128/256-bit
/// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
/// reverse of what x86 shuffles want.
static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool HasFp256,
bool Commuted = false) {
if (!HasFp256 && VT.is256BitVector())
return false;
static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
unsigned NumElems = VT.getVectorNumElements();
unsigned NumLanes = VT.getSizeInBits()/128;
@ -3695,6 +3692,10 @@ static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool HasFp256,
if (NumLaneElems != 2 && NumLaneElems != 4)
return false;
unsigned EltSize = VT.getVectorElementType().getSizeInBits();
bool symetricMaskRequired =
(VT.getSizeInBits() >= 256) && (EltSize == 32);
// VSHUFPSY divides the resulting vector into 4 chunks.
// The sources are also split into 4 chunks, and each destination
// chunk must come from a different source chunk.
@ -3714,6 +3715,7 @@ static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool HasFp256,
//
// DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0
//
SmallVector<int, 4> MaskVal(NumLaneElems, -1);
unsigned HalfLaneElems = NumLaneElems/2;
for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
for (unsigned i = 0; i != NumLaneElems; ++i) {
@ -3724,9 +3726,13 @@ static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool HasFp256,
// For VSHUFPSY, the mask of the second half must be the same as the
// first but with the appropriate offsets. This works in the same way as
// VPERMILPS works with masks.
if (NumElems != 8 || l == 0 || Mask[i] < 0)
if (!symetricMaskRequired || Idx < 0)
continue;
if (!isUndefOrEqual(Idx, Mask[i]+l))
if (MaskVal[i] < 0) {
MaskVal[i] = Idx - l;
continue;
}
if ((signed)(Idx - l) != MaskVal[i])
return false;
}
}
@ -4158,31 +4164,32 @@ static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
/// to the same elements of the low, but to the higher half of the source.
/// In VPERMILPD the two lanes could be shuffled independently of each other
/// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
if (!HasFp256)
static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
unsigned EltSize = VT.getVectorElementType().getSizeInBits();
if (VT.getSizeInBits() < 256 || EltSize < 32)
return false;
bool symetricMaskRequired = (EltSize == 32);
unsigned NumElts = VT.getVectorNumElements();
// Only match 256-bit with 32/64-bit types
if (!VT.is256BitVector() || (NumElts != 4 && NumElts != 8))
return false;
unsigned NumLanes = VT.getSizeInBits()/128;
unsigned LaneSize = NumElts/NumLanes;
// 2 or 4 elements in one lane
SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
for (unsigned l = 0; l != NumElts; l += LaneSize) {
for (unsigned i = 0; i != LaneSize; ++i) {
if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
return false;
if (NumElts != 8 || l == 0)
continue;
// VPERMILPS handling
if (Mask[i] < 0)
continue;
if (!isUndefOrEqual(Mask[i+l], Mask[i]+l))
return false;
if (symetricMaskRequired) {
if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
ExpectedMaskVal[i] = Mask[i+l] - l;
continue;
}
if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
return false;
}
}
}
return true;
}
@ -4431,10 +4438,11 @@ static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
MVT VT = SVOp->getSimpleValueType(0);
unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3;
unsigned EltSize = VT.is512BitVector() ? 1 :
VT.getVectorElementType().getSizeInBits() >> 3;
unsigned NumElts = VT.getVectorNumElements();
unsigned NumLanes = VT.getSizeInBits()/128;
unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
unsigned NumLaneElts = NumElts/NumLanes;
int Val = 0;
@ -7407,7 +7415,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
}
// Normalize the node to match x86 shuffle ops if needed
if (!V2IsUndef && (isSHUFPMask(M, VT, HasFp256, /* Commuted */ true)))
if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
return CommuteVectorShuffle(SVOp, DAG);
// The checks below are all present in isShuffleMaskLegal, but they are
@ -7430,7 +7438,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
getShufflePSHUFLWImmediate(SVOp),
DAG);
if (isSHUFPMask(M, VT, HasFp256))
if (isSHUFPMask(M, VT))
return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
getShuffleSHUFImmediate(SVOp), DAG);
@ -7449,8 +7457,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
// Handle VPERMILPS/D* permutations
if (isVPERMILPMask(M, VT, HasFp256)) {
if (HasInt256 && VT == MVT::v8i32)
if (isVPERMILPMask(M, VT)) {
if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
getShuffleSHUFImmediate(SVOp), DAG);
return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1,
@ -13621,7 +13629,7 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
return (SVT.getVectorNumElements() == 2 ||
ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
isMOVLMask(M, SVT) ||
isSHUFPMask(M, SVT, Subtarget->hasFp256()) ||
isSHUFPMask(M, SVT) ||
isPSHUFDMask(M, SVT) ||
isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) ||
@ -13646,8 +13654,8 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
if (NumElts == 4 && SVT.is128BitVector()) {
return (isMOVLMask(Mask, SVT) ||
isCommutedMOVLMask(Mask, SVT, true) ||
isSHUFPMask(Mask, SVT, Subtarget->hasFp256()) ||
isSHUFPMask(Mask, SVT, Subtarget->hasFp256(), /* Commuted */ true));
isSHUFPMask(Mask, SVT) ||
isSHUFPMask(Mask, SVT, /* Commuted */ true));
}
return false;
}

View File

@ -1621,6 +1621,45 @@ defm VPUNPCKHDQZ : avx512_unpack_int<0x6A, "vpunpckhdq", X86Unpckh, v16i32,
defm VPUNPCKHQDQZ : avx512_unpack_int<0x6D, "vpunpckhqdq", X86Unpckh, v8i64,
VR512, memopv8i64, i512mem>, EVEX_V512,
VEX_W, EVEX_CD8<64, CD8VF>;
//===----------------------------------------------------------------------===//
// AVX-512 - PSHUFD
//
// avx512_pshuf_imm defines two EVEX-encoded records for a single-source
// shuffle controlled by an 8-bit immediate:
//   ri - source vector in a register of class RC
//   mi - source vector loaded from memory through mem_frag / x86memop
// Both select on (OpVT (OpNode <source>, (i8 imm))) and share the opcode `opc`
// and the mnemonic `OpcodeStr`.
multiclass avx512_pshuf_imm<bits<8> opc, string OpcodeStr, RegisterClass RC,
SDNode OpNode, PatFrag mem_frag,
X86MemOperand x86memop, ValueType OpVT> {
// Register-source form.
def ri : AVX512Ii8<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, i8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst,
(OpVT (OpNode RC:$src1, (i8 imm:$src2))))]>,
EVEX;
// Memory-source form: the shuffled vector is loaded via mem_frag.
def mi : AVX512Ii8<opc, MRMSrcMem, (outs RC:$dst),
(ins x86memop:$src1, i8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst,
(OpVT (OpNode (mem_frag addr:$src1),
(i8 imm:$src2))))]>, EVEX;
}
// 512-bit (VR512) instantiations of avx512_pshuf_imm:
//   VPSHUFDZ   - X86PShufd on v16i32
//   VPERMILPSZ - X86VPermilp on v16f32
//   VPERMILPDZ - X86VPermilp on v8f64
defm VPSHUFDZ : avx512_pshuf_imm<0x70, "vpshufd", VR512, X86PShufd, memopv16i32,
i512mem, v16i32>, OpSize, EVEX_V512, EVEX_CD8<32, CD8VF>;
let ExeDomain = SSEPackedSingle in
defm VPERMILPSZ : avx512_pshuf_imm<0x04, "vpermilps", VR512, X86VPermilp,
memopv16f32, i512mem, v16f32>, OpSize, TA, EVEX_V512,
EVEX_CD8<32, CD8VF>;
// NOTE(review): VPERMILPDZ operates on v8f64 and carries VEX_W, yet it is
// tagged EVEX_CD8<32, CD8VF>; the other VEX_W definitions in this file (for
// example VPUNPCKHQDQZ) use EVEX_CD8<64, CD8VF>. Confirm whether 64 was
// intended here.
let ExeDomain = SSEPackedDouble in
defm VPERMILPDZ : avx512_pshuf_imm<0x05, "vpermilpd", VR512, X86VPermilp,
memopv8f64, i512mem, v8f64>, OpSize, TA, EVEX_V512,
VEX_W, EVEX_CD8<32, CD8VF>;
// Integer-typed X86VPermilp nodes reuse the register forms of the FP permutes:
// v16i32 selects VPERMILPSZri and v8i64 selects VPERMILPDZri.
def : Pat<(v16i32 (X86VPermilp VR512:$src1, (i8 imm:$imm))),
(VPERMILPSZri VR512:$src1, imm:$imm)>;
def : Pat<(v8i64 (X86VPermilp VR512:$src1, (i8 imm:$imm))),
(VPERMILPDZri VR512:$src1, imm:$imm)>;
//===----------------------------------------------------------------------===//
// AVX-512 Logical Instructions
@ -1774,8 +1813,8 @@ multiclass avx512_vptest<bits<8> opc, string OpcodeStr, RegisterClass KRC,
defm VPTESTMDZ : avx512_vptest<0x27, "vptestmd", VK16, VR512, f512mem,
memopv16i32, X86testm, v16i32>, EVEX_V512,
EVEX_CD8<32, CD8VF>;
defm VPTESTMQZ : avx512_vptest<0x27, "vptestmq", VK8, VR512, f512mem, memopv8i64,
X86testm, v8i64>, EVEX_V512, VEX_W,
defm VPTESTMQZ : avx512_vptest<0x27, "vptestmq", VK8, VR512, f512mem,
memopv8i64, X86testm, v8i64>, EVEX_V512, VEX_W,
EVEX_CD8<64, CD8VF>;
//===----------------------------------------------------------------------===//
@ -1914,3 +1953,99 @@ defm VPSRAVDZ : avx512_var_shift<0x46, "vpsravd", sra, VR512, v16i32,
defm VPSRAVQZ : avx512_var_shift<0x46, "vpsravq", sra, VR512, v8i64,
i512mem, memopv8i64>, EVEX_V512, VEX_W,
EVEX_CD8<64, CD8VF>;
//===----------------------------------------------------------------------===//
// AVX-512 - MOVDDUP
//===----------------------------------------------------------------------===//
// avx512_movddup defines the register (rr) and memory (rm) forms of an
// EVEX-encoded instruction with opcode 0x12 that selects on X86Movddup of a
// value of type VT.
multiclass avx512_movddup<string OpcodeStr, RegisterClass RC, ValueType VT,
X86MemOperand x86memop, PatFrag memop_frag> {
// Register-source form.
def rr : AVX512PDI<0x12, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set RC:$dst, (VT (X86Movddup RC:$src)))]>, EVEX;
// Memory-source form: the operand is loaded through memop_frag.
def rm : AVX512PDI<0x12, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set RC:$dst,
(VT (X86Movddup (memop_frag addr:$src))))]>, EVEX;
}
// 512-bit instantiation on v8f64.
defm VMOVDDUPZ : avx512_movddup<"vmovddup", VR512, v8f64, f512mem, memopv8f64>,
VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
// An X86Movddup whose operand is a scalar f64 load widened with
// scalar_to_vector is selected straight to the memory form.
def : Pat<(X86Movddup (v8f64 (scalar_to_vector (loadf64 addr:$src)))),
(VMOVDDUPZrm addr:$src)>;
// EVEX-encoded register-register VMOVLHPS (opcode 0x16) and VMOVHLPS (opcode
// 0x12). Both take and produce VR128X registers and carry a v4f32 selection
// pattern on X86Movlhps / X86Movhlps respectively.
def VMOVLHPSZrr : AVX512PSI<0x16, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2),
"vmovlhps{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128X:$dst, (v4f32 (X86Movlhps VR128X:$src1, VR128X:$src2)))],
IIC_SSE_MOV_LH>, EVEX_4V;
def VMOVHLPSZrr : AVX512PSI<0x12, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2),
"vmovhlps{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128X:$dst, (v4f32 (X86Movhlps VR128X:$src1, VR128X:$src2)))],
IIC_SSE_MOV_LH>, EVEX_4V;
// MOVLHPS patterns
// Integer-typed X86Movlhps nodes (v4i32, v2i64) select the same instruction.
def : Pat<(v4i32 (X86Movlhps VR128X:$src1, VR128X:$src2)),
(VMOVLHPSZrr VR128X:$src1, VR128X:$src2)>;
def : Pat<(v2i64 (X86Movlhps VR128X:$src1, VR128X:$src2)),
(VMOVLHPSZrr (v2i64 VR128X:$src1), VR128X:$src2)>;
// MOVHLPS patterns
// Only the v4i32 form is mapped here; no v2i64 pattern is defined for
// X86Movhlps in this block.
def : Pat<(v4i32 (X86Movhlps VR128X:$src1, VR128X:$src2)),
(VMOVHLPSZrr VR128X:$src1, VR128X:$src2)>;
//===----------------------------------------------------------------------===//
// VSHUFPS - VSHUFPD Operations
// avx512_shufp defines the two-source shuffle with opcode 0xC6 and an 8-bit
// immediate, selecting on X86Shufp of type `vt`:
//   rmi - second source loaded from memory through mem_frag
//   rri - both sources in registers of class RC
// `d` is the execution domain passed to the instruction class.
multiclass avx512_shufp<RegisterClass RC, X86MemOperand x86memop,
ValueType vt, string OpcodeStr, PatFrag mem_frag,
Domain d> {
// Register + memory form.
def rmi : AVX512PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, i8imm:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
(i8 imm:$src3))))], d, IIC_SSE_SHUFP>,
EVEX_4V, TB, Sched<[WriteShuffleLd, ReadAfterLd]>;
// Register + register form.
def rri : AVX512PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, i8imm:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
(i8 imm:$src3))))], d, IIC_SSE_SHUFP>,
EVEX_4V, TB, Sched<[WriteShuffle]>;
}
// 512-bit instantiations: single precision on v16f32, double precision on
// v8f64 (the latter adds OpSize and VEX_W).
defm VSHUFPSZ : avx512_shufp<VR512, f512mem, v16f32, "vshufps", memopv16f32,
SSEPackedSingle>, EVEX_V512, EVEX_CD8<32, CD8VF>;
defm VSHUFPDZ : avx512_shufp<VR512, f512mem, v8f64, "vshufpd", memopv8f64,
SSEPackedDouble>, OpSize, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
// avx512_alignr defines the register (rri) and memory (rmi) forms of an
// instruction with opcode 0x03 taking two sources and an 8-bit immediate.
// Both records have an empty selection pattern list; instruction selection
// for the register form is provided by the standalone Pat records below.
multiclass avx512_alignr<string OpcodeStr, RegisterClass RC,
X86MemOperand x86memop> {
def rri : AVX512AIi8<0x03, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, i8imm:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[]>, EVEX_4V;
def rmi : AVX512AIi8<0x03, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, i8imm:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[]>, EVEX_4V;
}
// 512-bit instantiations: VALIGND and VALIGNQ (the latter adds VEX_W).
defm VALIGND : avx512_alignr<"valignd", VR512, i512mem>,
EVEX_V512, EVEX_CD8<32, CD8VF>;
defm VALIGNQ : avx512_alignr<"valignq", VR512, i512mem>,
VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
// X86PAlignr on 512-bit vectors is selected to VALIGND (v16f32, v16i32) or
// VALIGNQ (v8f64, v8i64). Note the two source operands are swapped when
// passed to the instruction: the node's $src2 becomes the first operand.
def : Pat<(v16f32 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
(VALIGNDrri VR512:$src2, VR512:$src1, imm:$imm)>;
def : Pat<(v8f64 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
(VALIGNQrri VR512:$src2, VR512:$src1, imm:$imm)>;
def : Pat<(v16i32 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
(VALIGNDrri VR512:$src2, VR512:$src1, imm:$imm)>;
def : Pat<(v8i64 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
(VALIGNQrri VR512:$src2, VR512:$src1, imm:$imm)>;

View File

@ -1327,7 +1327,7 @@ let Predicates = [UseSSE2] in {
// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
//===----------------------------------------------------------------------===//
let AddedComplexity = 20 in {
let AddedComplexity = 20, Predicates = [UseAVX] in {
def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
"movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@ -1358,7 +1358,7 @@ let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>;
}
let Predicates = [HasAVX] in {
let Predicates = [UseAVX] in {
// MOVLHPS patterns
def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
(VMOVLHPSrr VR128:$src1, VR128:$src2)>;

View File

@ -106,6 +106,53 @@ define <16 x i32> @test11(<16 x i32> %a, <16 x i32>* %b) nounwind {
ret <16 x i32> %d
}
; test12: a <4 x i32> shuffle taking elements 0-1 of %a followed by elements
; 0-1 of %b (mask 0,1,4,5); the expected instruction is vmovlhpsz on xmm.
; CHECK-LABEL: test12
; CHECK: vmovlhpsz %xmm
; CHECK: ret
define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) nounwind {
%c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
ret <4 x i32> %c
}
; test13: single-source <16 x float> shuffle that swaps each adjacent pair of
; elements (1,0,3,2 repeated in every group of four); the expected instruction
; is vpermilps with immediate -79 (0xB1 printed as a signed byte).
; CHECK-LABEL: test13
; CHECK: vpermilps $-79, %zmm
; CHECK: ret
define <16 x float> @test13(<16 x float> %a) {
%b = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32><i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
ret <16 x float> %b
}
; test14: single-source <8 x double> shuffle whose mask (1,1,2,3,4,4,7,7) keeps
; every element inside its own pair of elements but differs from pair to pair;
; the expected instruction is vpermilpd with immediate -53 (0xCB printed as a
; signed byte).
; CHECK-LABEL: test14
; CHECK: vpermilpd $-53, %zmm
; CHECK: ret
define <8 x double> @test14(<8 x double> %a) {
%b = shufflevector <8 x double> %a, <8 x double> undef, <8 x i32><i32 1, i32 1, i32 2, i32 3, i32 4, i32 4, i32 7, i32 7>
ret <8 x double> %b
}
; test15: the same adjacent-pair swap mask as test13, applied to <16 x i32>;
; the expected instruction is the integer form vpshufd with immediate -79.
; CHECK-LABEL: test15
; CHECK: vpshufd $-79, %zmm
; CHECK: ret
define <16 x i32> @test15(<16 x i32> %a) {
%b = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32><i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
ret <16 x i32> %b
}
; test16: two-source <8 x double> shuffle selecting eight consecutive elements
; starting at index 2 of the concatenation of %a and %b (mask 2..9); the
; expected instruction is valignq with immediate 2.
; CHECK-LABEL: test16
; CHECK: valignq $2, %zmm0, %zmm1
; CHECK: ret
define <8 x double> @test16(<8 x double> %a, <8 x double> %b) nounwind {
%c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
ret <8 x double> %c
}
; test17: two-source <8 x double> shuffle that alternates elements of %a and %b
; and leaves the last three result elements undef; the expected instruction is
; vshufpd with immediate 19.
; CHECK-LABEL: test17
; CHECK: vshufpd $19, %zmm1, %zmm0
; CHECK: ret
define <8 x double> @test17(<8 x double> %a, <8 x double> %b) nounwind {
%c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 9, i32 2, i32 10, i32 5, i32 undef, i32 undef, i32 undef>
ret <8 x double> %c
}
; CHECK-LABEL: test18
; CHECK: vpunpckhdq %zmm
; CHECK: ret
@ -138,3 +185,18 @@ define <16 x float> @test21(<16 x float> %a, <16 x float> %c) {
ret <16 x float> %b
}
; test22: a <4 x i32> shuffle taking elements 2-3 of %a followed by elements
; 2-3 of %b (mask 2,3,6,7); the expected instruction is vmovhlpsz on xmm.
; CHECK-LABEL: test22
; CHECK: vmovhlpsz %xmm
; CHECK: ret
define <4 x i32> @test22(<4 x i32> %a, <4 x i32> %b) nounwind {
%c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
ret <4 x i32> %c
}
; test23: two-source <16 x float> shuffle where every group of four result
; elements takes its first two elements from %a and its last two from %c, with
; the same in-group pattern (0,0,1,2) repeated in each group; the expected
; instruction is vshufps with immediate -112 (0x90 printed as a signed byte).
; CHECK-LABEL: @test23
; CHECK: vshufps $-112, %zmm
; CHECK: ret
define <16 x float> @test23(<16 x float> %a, <16 x float> %c) {
%b = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32><i32 0, i32 0, i32 17, i32 18, i32 4, i32 4, i32 21, i32 22, i32 8, i32 8, i32 25, i32 26, i32 12, i32 12, i32 29, i32 30>
ret <16 x float> %b
}