mirror of
https://github.com/RPCSX/llvm.git
synced 2024-11-26 21:20:37 +00:00
Added VPERM optimization for AVX2 shuffles
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@154761 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
8a81df1b7f
commit
73c504af9d
@ -2935,6 +2935,8 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
|
||||
case X86ISD::PSHUFHW:
|
||||
case X86ISD::PSHUFLW:
|
||||
case X86ISD::VPERMILP:
|
||||
case X86ISD::VPERMQ:
|
||||
case X86ISD::VPERMPD:
|
||||
return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
|
||||
}
|
||||
}
|
||||
@ -3976,6 +3978,27 @@ unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) {
|
||||
return Index / NumElemsPerChunk;
|
||||
}
|
||||
|
||||
/// getShuffleCLImmediate - Return the appropriate immediate to shuffle
|
||||
/// the specified VECTOR_SHUFFLE mask with VPERMQ and VPERMPD instructions.
|
||||
/// Handles 256-bit.
|
||||
static unsigned getShuffleCLImmediate(ShuffleVectorSDNode *N) {
|
||||
EVT VT = N->getValueType(0);
|
||||
|
||||
assert((VT.is256BitVector() && VT.getVectorNumElements() == 4) &&
|
||||
"Unsupported vector type for VPERMQ/VPERMPD");
|
||||
|
||||
unsigned NumElts = VT.getVectorNumElements();
|
||||
|
||||
unsigned Mask = 0;
|
||||
for (unsigned i = 0; i != NumElts; ++i) {
|
||||
int Elt = N->getMaskElt(i);
|
||||
if (Elt < 0)
|
||||
continue;
|
||||
Mask |= Elt << (i*2);
|
||||
}
|
||||
|
||||
return Mask;
|
||||
}
|
||||
/// isZeroNode - Returns true if Elt is a constant zero or a floating point
|
||||
/// constant +0.0.
|
||||
bool X86::isZeroNode(SDValue Elt) {
|
||||
@ -6627,6 +6650,20 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
|
||||
SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(Op, Subtarget, DAG);
|
||||
if (BlendOp.getNode())
|
||||
return BlendOp;
|
||||
if (V2IsUndef && HasAVX2 && (VT == MVT::v8i32 || VT == MVT::v8f32)) {
|
||||
SmallVector<SDValue,8> permclMask;
|
||||
for (unsigned i = 0; i != 8; ++i) {
|
||||
permclMask.push_back(DAG.getConstant((M[i] >= 0)?M[i]:0x80, MVT::i32));
|
||||
}
|
||||
return DAG.getNode(VT.isInteger()? X86ISD::VPERMD:X86ISD::VPERMPS, dl, VT,
|
||||
DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32,
|
||||
&permclMask[0], 8), V1);
|
||||
|
||||
}
|
||||
if (V2IsUndef && HasAVX2 && (VT == MVT::v4i64 || VT == MVT::v4f64))
|
||||
return getTargetShuffleNode(VT.isInteger()? X86ISD::VPERMQ : X86ISD::VPERMPD, dl, VT, V1,
|
||||
getShuffleCLImmediate(SVOp), DAG);
|
||||
|
||||
|
||||
//===--------------------------------------------------------------------===//
|
||||
// Since no target specific shuffle was selected for this generic one,
|
||||
@ -11141,6 +11178,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
|
||||
case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
|
||||
case X86ISD::VPERMILP: return "X86ISD::VPERMILP";
|
||||
case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
|
||||
case X86ISD::VPERMD: return "X86ISD::VPERMD";
|
||||
case X86ISD::VPERMQ: return "X86ISD::VPERMQ";
|
||||
case X86ISD::VPERMPS: return "X86ISD::VPERMPS";
|
||||
case X86ISD::VPERMPD: return "X86ISD::VPERMPD";
|
||||
case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
|
||||
case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
|
||||
case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
|
||||
|
@ -285,6 +285,10 @@ namespace llvm {
|
||||
UNPCKL,
|
||||
UNPCKH,
|
||||
VPERMILP,
|
||||
VPERMD,
|
||||
VPERMQ,
|
||||
VPERMPS,
|
||||
VPERMPD,
|
||||
VPERM2X128,
|
||||
VBROADCAST,
|
||||
|
||||
|
@ -155,6 +155,10 @@ def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>;
|
||||
def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>;
|
||||
|
||||
def X86VPermilp : SDNode<"X86ISD::VPERMILP", SDTShuff2OpI>;
|
||||
def X86VPermd : SDNode<"X86ISD::VPERMD", SDTShuff2Op>;
|
||||
def X86VPermps : SDNode<"X86ISD::VPERMPS", SDTShuff2Op>;
|
||||
def X86VPermq : SDNode<"X86ISD::VPERMQ", SDTShuff2OpI>;
|
||||
def X86VPermpd : SDNode<"X86ISD::VPERMPD", SDTShuff2OpI>;
|
||||
|
||||
def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>;
|
||||
|
||||
|
@ -1049,9 +1049,9 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
|
||||
{ X86::VPCMPGTWYrr, X86::VPCMPGTWYrm, TB_ALIGN_32 },
|
||||
{ X86::VPERM2I128rr, X86::VPERM2I128rm, TB_ALIGN_32 },
|
||||
{ X86::VPERMDYrr, X86::VPERMDYrm, TB_ALIGN_32 },
|
||||
{ X86::VPERMPDYrr, X86::VPERMPDYrm, TB_ALIGN_32 },
|
||||
{ X86::VPERMPDYri, X86::VPERMPDYmi, TB_ALIGN_32 },
|
||||
{ X86::VPERMPSYrr, X86::VPERMPSYrm, TB_ALIGN_32 },
|
||||
{ X86::VPERMQYrr, X86::VPERMQYrm, TB_ALIGN_32 },
|
||||
{ X86::VPERMQYri, X86::VPERMQYmi, TB_ALIGN_32 },
|
||||
{ X86::VPHADDDYrr, X86::VPHADDDYrm, TB_ALIGN_32 },
|
||||
{ X86::VPHADDSWrr256, X86::VPHADDSWrm256, TB_ALIGN_32 },
|
||||
{ X86::VPHADDWYrr, X86::VPHADDWYrm, TB_ALIGN_32 },
|
||||
|
@ -7746,12 +7746,12 @@ defm VPERMPS : avx2_perm<0x16, "vpermps", memopv8f32, int_x86_avx2_permps>;
|
||||
|
||||
multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
|
||||
Intrinsic Int> {
|
||||
def Yrr : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
|
||||
def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
|
||||
(ins VR256:$src1, i8imm:$src2),
|
||||
!strconcat(OpcodeStr,
|
||||
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
|
||||
[(set VR256:$dst, (Int VR256:$src1, imm:$src2))]>, VEX;
|
||||
def Yrm : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
|
||||
def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
|
||||
(ins i256mem:$src1, i8imm:$src2),
|
||||
!strconcat(OpcodeStr,
|
||||
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
|
||||
@ -7765,6 +7765,29 @@ let ExeDomain = SSEPackedDouble in
|
||||
defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", memopv4f64, int_x86_avx2_permpd>,
|
||||
VEX_W;
|
||||
|
||||
// Selection patterns that map the AVX2 permute nodes onto instructions:
//  - X86VPermd / X86VPermps take two vector operands and select
//    VPERMD / VPERMPS.
//  - X86VPermq / X86VPermpd take one vector operand plus an i8 immediate and
//    select VPERMQ / VPERMPD.
let Predicates = [HasAVX2] in {
  // Register forms.
  def : Pat<(v8i32 (X86VPermd VR256:$src1, VR256:$src2)),
            (VPERMDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8f32 (X86VPermps VR256:$src1, VR256:$src2)),
            (VPERMPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v4i64 (X86VPermq VR256:$src1, (i8 imm:$imm))),
            (VPERMQYri VR256:$src1, imm:$imm)>;
  def : Pat<(v4f64 (X86VPermpd VR256:$src1, (i8 imm:$imm))),
            (VPERMPDYri VR256:$src1, imm:$imm)>;

  // Memory forms of the two-operand permutes: $src2 is folded from memory.
  // The integer pattern must match X86VPermd (the node produced for v8i32
  // shuffles), mirroring the register form above; the load is expressed as
  // v4i64 and bitcast to v8i32.
  def : Pat<(v8i32 (X86VPermd VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))),
            (VPERMDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v8f32 (X86VPermps VR256:$src1, (memopv8f32 addr:$src2))),
            (VPERMPSYrm VR256:$src1, addr:$src2)>;

  // Memory forms of the immediate permutes: the permuted vector itself is
  // folded from memory.
  def : Pat<(v4i64 (X86VPermq (memopv4i64 addr:$src1), (i8 imm:$imm))),
            (VPERMQYmi addr:$src1, imm:$imm)>;
  def : Pat<(v4f64 (X86VPermpd (memopv4f64 addr:$src1), (i8 imm:$imm))),
            (VPERMPDYmi addr:$src1, imm:$imm)>;

}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// VPERM2I128 - Permute Floating-Point Values in 128-bit chunks
|
||||
//
|
||||
|
34
test/CodeGen/X86/avx2-vperm.ll
Executable file
34
test/CodeGen/X86/avx2-vperm.ll
Executable file
@ -0,0 +1,34 @@
|
||||
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
|
||||
|
||||
; Single-source shuffle of <8 x i32> whose mask moves elements between the two 128-bit halves (result element 1 reads source element 7); the FileCheck lines below expect a vpermd in the output.
define <8 x i32> @perm_cl_int_8x32(<8 x i32> %A) nounwind readnone {
|
||||
entry:
|
||||
; CHECK: perm_cl_int_8x32
|
||||
; CHECK: vpermd
|
||||
%B = shufflevector <8 x i32> %A, <8 x i32> undef, <8 x i32> <i32 0, i32 7, i32 2, i32 1, i32 2, i32 7, i32 6, i32 0>
|
||||
ret <8 x i32> %B
|
||||
}
|
||||
|
||||
|
||||
; Single-source shuffle of <8 x float> with several undef mask entries and elements moved between the two 128-bit halves (result element 1 reads source element 7); the FileCheck lines below expect a vpermps in the output.
define <8 x float> @perm_cl_fp_8x32(<8 x float> %A) nounwind readnone {
|
||||
entry:
|
||||
; CHECK: perm_cl_fp_8x32
|
||||
; CHECK: vpermps
|
||||
%B = shufflevector <8 x float> %A, <8 x float> undef, <8 x i32> <i32 undef, i32 7, i32 2, i32 undef, i32 4, i32 undef, i32 1, i32 6>
|
||||
ret <8 x float> %B
|
||||
}
|
||||
|
||||
; Single-source shuffle of <4 x i64> with mask <0, 3, 2, 1>, which swaps elements 1 and 3 across the two 128-bit halves; the FileCheck lines below expect a vpermq in the output.
define <4 x i64> @perm_cl_int_4x64(<4 x i64> %A) nounwind readnone {
|
||||
entry:
|
||||
; CHECK: perm_cl_int_4x64
|
||||
; CHECK: vpermq
|
||||
%B = shufflevector <4 x i64> %A, <4 x i64> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
|
||||
ret <4 x i64> %B
|
||||
}
|
||||
|
||||
; Single-source shuffle of <4 x double> with mask <0, 3, 2, 1>, which swaps elements 1 and 3 across the two 128-bit halves; the FileCheck lines below expect a vpermpd in the output.
define <4 x double> @perm_cl_fp_4x64(<4 x double> %A) nounwind readnone {
|
||||
entry:
|
||||
; CHECK: perm_cl_fp_4x64
|
||||
; CHECK: vpermpd
|
||||
%B = shufflevector <4 x double> %A, <4 x double> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
|
||||
ret <4 x double> %B
|
||||
}
|
Loading…
Reference in New Issue
Block a user