Add support for 256-bit versions of VPERMIL instruction. This is a new

instruction introduced in AVX, which can operate on 128 and 256-bit vectors.
It considers a 256-bit vector as two independent 128-bit lanes. It can permute
any 32 or 64 elements inside a lane, and restricts the second lane to
have the same permutation of the first one. With the improved splat support
introduced early today, adding codegen for this instruction enable more
efficient 256-bit code:

Instead of:
  vextractf128  $0, %ymm0, %xmm0
  punpcklbw %xmm0, %xmm0
  punpckhbw %xmm0, %xmm0
  vinsertf128 $0, %xmm0, %ymm0, %ymm1
  vinsertf128 $1, %xmm0, %ymm1, %ymm0
  vextractf128  $1, %ymm0, %xmm1
  shufps  $1, %xmm1, %xmm1
  movss %xmm1, 28(%rsp)
  movss %xmm1, 24(%rsp)
  movss %xmm1, 20(%rsp)
  movss %xmm1, 16(%rsp)
  vextractf128  $0, %ymm0, %xmm0
  shufps  $1, %xmm0, %xmm0
  movss %xmm0, 12(%rsp)
  movss %xmm0, 8(%rsp)
  movss %xmm0, 4(%rsp)
  movss %xmm0, (%rsp)
  vmovaps (%rsp), %ymm0
We get:
  vextractf128  $0, %ymm0, %xmm0
  punpcklbw %xmm0, %xmm0
  punpckhbw %xmm0, %xmm0
  vinsertf128 $0, %xmm0, %ymm0, %ymm1
  vinsertf128 $1, %xmm0, %ymm1, %ymm0
  vpermilps $85, %ymm0, %ymm0

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@135662 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Bruno Cardoso Lopes 2011-07-21 01:55:47 +00:00
parent 9283b668a1
commit 65b74e1d00
8 changed files with 137 additions and 0 deletions

View File

@ -205,6 +205,16 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
DecodeUNPCKHPMask(4, ShuffleMask);
Src1Name = getRegName(MI->getOperand(0).getReg());
break;
case X86::VPERMILPSYri:
DecodeVPERMILPSMask(8, MI->getOperand(2).getImm(),
ShuffleMask);
Src1Name = getRegName(MI->getOperand(0).getReg());
break;
case X86::VPERMILPDYri:
DecodeVPERMILPSMask(4, MI->getOperand(2).getImm(),
ShuffleMask);
Src1Name = getRegName(MI->getOperand(0).getReg());
break;
}

View File

@ -187,4 +187,31 @@ void DecodeUNPCKLPMask(EVT VT,
}
}
void DecodeVPERMILPSMask(unsigned NElts, unsigned Imm,
SmallVectorImpl<unsigned> &ShuffleMask) {
DecodeVPERMILMask(MVT::getVectorVT(MVT::i32, NElts), Imm, ShuffleMask);
}
void DecodeVPERMILPDMask(unsigned NElts, unsigned Imm,
SmallVectorImpl<unsigned> &ShuffleMask) {
DecodeVPERMILMask(MVT::getVectorVT(MVT::i64, NElts), Imm, ShuffleMask);
}
// DecodeVPERMILMask - Decodes VPERMIL permutes for any 128-bit
// with 32/64-bit elements. For 256-bit vectors, it's considered
// as two 128 lanes and the mask of the first lane should be
// identical of the second one.
void DecodeVPERMILMask(EVT VT, unsigned Imm,
SmallVectorImpl<unsigned> &ShuffleMask) {
unsigned NumElts = VT.getVectorNumElements();
unsigned NumLanes = VT.getSizeInBits()/128;
for (unsigned l = 0; l != NumLanes; ++l) {
for (unsigned i = 0; i != NumElts/NumLanes; ++i) {
unsigned Idx = (Imm >> (i*2)) & 0x3 ;
ShuffleMask.push_back(Idx+(l*NumElts/NumLanes));
}
}
}
} // llvm namespace

View File

@ -82,6 +82,20 @@ void DecodeUNPCKLPDMask(unsigned NElts,
void DecodeUNPCKLPMask(EVT VT,
SmallVectorImpl<unsigned> &ShuffleMask);
void DecodeVPERMILPSMask(unsigned NElts, unsigned Imm,
SmallVectorImpl<unsigned> &ShuffleMask);
void DecodeVPERMILPDMask(unsigned NElts, unsigned Imm,
SmallVectorImpl<unsigned> &ShuffleMask);
// DecodeVPERMILMask - Decodes VPERMIL permutes for any 128-bit
// with 32/64-bit elements. For 256-bit vectors, it's considered
// as two 128 lanes and the mask of the first lane should be
// identical of the second one.
void DecodeVPERMILMask(EVT VT, unsigned Imm,
SmallVectorImpl<unsigned> &ShuffleMask);
} // llvm namespace
#endif

View File

@ -2747,6 +2747,7 @@ static bool isTargetShuffle(unsigned Opcode) {
case X86ISD::PUNPCKHBW:
case X86ISD::PUNPCKHDQ:
case X86ISD::PUNPCKHQDQ:
case X86ISD::VPERMIL:
return true;
}
return false;
@ -2772,6 +2773,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
case X86ISD::PSHUFD:
case X86ISD::PSHUFHW:
case X86ISD::PSHUFLW:
case X86ISD::VPERMIL:
return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
}
@ -3422,6 +3424,54 @@ bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
return ::isMOVLMask(M, N->getValueType(0));
}
/// isVPERMILMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to VPERMIL*.
static bool isVPERMILMask(const SmallVectorImpl<int> &Mask, EVT VT) {
unsigned NumElts = VT.getVectorNumElements();
unsigned NumLanes = VT.getSizeInBits()/128;
// Match any permutation of 128-bit vector with 32/64-bit types
if (NumLanes == 1) {
if (NumElts == 4 || NumElts == 2)
return true;
return false;
}
// Only match 256-bit with 32/64-bit types
if (NumElts != 8 && NumElts != 4)
return false;
// The mask on the high lane should be the same as the low. Actually,
// they can differ if any of the corresponding index in a lane is undef.
int LaneSize = NumElts/NumLanes;
for (int i = 0; i < LaneSize; ++i) {
int HighElt = i+LaneSize;
if (Mask[i] < 0 || Mask[HighElt] < 0)
continue;
if (Mask[HighElt]-Mask[i] != LaneSize)
return false;
}
return true;
}
/// getShuffleVPERMILImmediateediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_MASK mask with VPERMIL* instructions.
static unsigned getShuffleVPERMILImmediate(SDNode *N) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
EVT VT = SVOp->getValueType(0);
int NumElts = VT.getVectorNumElements();
int NumLanes = VT.getSizeInBits()/128;
unsigned Mask = 0;
for (int i = 0; i < NumElts/NumLanes /* lane size */; ++i)
Mask |= SVOp->getMaskElt(i) << (i*2);
return Mask;
}
/// isCommutedMOVL - Returns true if the shuffle mask is except the reverse
/// of what x86 movss want. X86 movs requires the lowest element to be lowest
/// element of vector 2 and the other elements to come from vector 1 in order.
@ -4097,6 +4147,10 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG,
Depth+1);
}
case X86ISD::VPERMIL:
ImmN = N->getOperand(N->getNumOperands()-1);
DecodeVPERMILMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
ShuffleMask);
default:
assert("not implemented for target shuffle node");
return SDValue();
@ -6043,6 +6097,13 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
if (NumElems == 4)
return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG);
// Handle VPERMIL permutations
if (isVPERMILMask(M, VT)) {
unsigned TargetMask = getShuffleVPERMILImmediate(SVOp);
if (VT == MVT::v8f32)
return getTargetShuffleNode(X86ISD::VPERMIL, dl, VT, V1, TargetMask, DAG);
}
return SDValue();
}
@ -9660,6 +9721,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::PUNPCKHWD: return "X86ISD::PUNPCKHWD";
case X86ISD::PUNPCKHDQ: return "X86ISD::PUNPCKHDQ";
case X86ISD::PUNPCKHQDQ: return "X86ISD::PUNPCKHQDQ";
case X86ISD::VPERMIL: return "X86ISD::VPERMIL";
case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
@ -12465,6 +12527,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::PSHUFLW:
case X86ISD::MOVSS:
case X86ISD::MOVSD:
case X86ISD::VPERMIL:
case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI);
}

View File

@ -271,6 +271,7 @@ namespace llvm {
PUNPCKHWD,
PUNPCKHDQ,
PUNPCKHQDQ,
VPERMIL,
// VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack,
// according to %al. An operator is needed so that this can be expanded

View File

@ -150,6 +150,8 @@ def X86Punpckhwd : SDNode<"X86ISD::PUNPCKHWD", SDTShuff2Op>;
def X86Punpckhdq : SDNode<"X86ISD::PUNPCKHDQ", SDTShuff2Op>;
def X86Punpckhqdq : SDNode<"X86ISD::PUNPCKHQDQ", SDTShuff2Op>;
def X86VPermil : SDNode<"X86ISD::VPERMIL", SDTShuff2OpI>;
//===----------------------------------------------------------------------===//
// SSE Complex Patterns
//===----------------------------------------------------------------------===//

View File

@ -5529,6 +5529,10 @@ def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
// The AVX version of some but not all of them are described here, and more
// should come in a near future.
// Shuffle with VPERMIL instructions
def : Pat<(v8f32 (X86VPermil VR256:$src1, (i8 imm:$imm))),
(VPERMILPSYri VR256:$src1, imm:$imm)>;
// Shuffle with PSHUFD instruction folding loads. The first two patterns match
// SSE2 loads, which are always promoted to v2i64. The last one should match
// the SSE1 case, where the only legal load is v4f32, but there is no PSHUFD

View File

@ -0,0 +1,16 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
; FIXME: use avx versions for punpcklbw and punpckhbw
; CHECK: vextractf128 $0
; CHECK-NEXT: punpcklbw
; CHECK-NEXT: punpckhbw
; CHECK-NEXT: vinsertf128 $0
; CHECK-NEXT: vinsertf128 $1
; CHECK-NEXT: vpermilps $85
define <32 x i8> @funcA(<32 x i8> %a) nounwind uwtable readnone ssp {
entry:
%shuffle = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <32 x i8> %shuffle
}