AVX-512: Added SHIFT instructions.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@188899 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Elena Demikhovsky 2013-08-21 09:36:02 +00:00
parent df40f8e8ad
commit 8ba76daba0
6 changed files with 302 additions and 6 deletions

View File

@ -11269,6 +11269,16 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
case Intrinsic::x86_avx512_kortestz:
case Intrinsic::x86_avx512_kortestc: {
unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz)? X86::COND_E: X86::COND_B;
SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1));
SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2));
SDValue CC = DAG.getConstant(X86CC, MVT::i8);
SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
// SSE/AVX shift intrinsics
case Intrinsic::x86_sse2_psll_w:
@ -12135,7 +12145,9 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
(Subtarget->hasInt256() &&
(VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16))) {
(VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16)) ||
(Subtarget->hasAVX512() &&
(VT == MVT::v8i64 || VT == MVT::v16i32))) {
if (Op.getOpcode() == ISD::SHL)
return DAG.getNode(X86ISD::VSHLI, dl, VT, R,
DAG.getConstant(ShiftAmt, MVT::i32));
@ -12297,7 +12309,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
VT == MVT::v4i32 || VT == MVT::v8i16 ||
(Subtarget->hasInt256() &&
((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) ||
VT == MVT::v8i32 || VT == MVT::v16i16))) {
VT == MVT::v8i32 || VT == MVT::v16i16)) ||
(Subtarget->hasAVX512() && (VT == MVT::v8i64 || VT == MVT::v16i32))) {
SDValue BaseShAmt;
EVT EltVT = VT.getVectorElementType();
@ -12365,6 +12378,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
case MVT::v4i64:
case MVT::v8i32:
case MVT::v16i16:
case MVT::v16i32:
case MVT::v8i64:
return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG);
}
case ISD::SRA:
@ -12374,6 +12389,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
case MVT::v8i16:
case MVT::v8i32:
case MVT::v16i16:
case MVT::v16i32:
case MVT::v8i64:
return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG);
}
case ISD::SRL:
@ -12385,6 +12402,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
case MVT::v4i64:
case MVT::v8i32:
case MVT::v16i16:
case MVT::v16i32:
case MVT::v8i64:
return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG);
}
}
@ -12393,7 +12412,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
// Special case in 32-bit mode, where i64 is expanded into high and low parts.
if (!Subtarget->is64Bit() &&
(VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
(VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) ||
(Subtarget->hasAVX512() && VT == MVT::v8i64)) &&
Amt.getOpcode() == ISD::BITCAST &&
Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
Amt = Amt.getOperand(0);
@ -12442,6 +12462,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
if (V.getNode())
return V;
if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64))
return Op;
// AVX2 has VPSLLV/VPSRAV/VPSRLV.
if (Subtarget->hasInt256()) {
if (Op.getOpcode() == ISD::SRL &&
@ -13350,6 +13372,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
case X86ISD::PTEST: return "X86ISD::PTEST";
case X86ISD::TESTP: return "X86ISD::TESTP";
case X86ISD::TESTM: return "X86ISD::TESTM";
case X86ISD::KORTEST: return "X86ISD::KORTEST";
case X86ISD::KTEST: return "X86ISD::KTEST";
case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";

View File

@ -274,7 +274,7 @@ namespace llvm {
// PCMP* - Vector integer comparisons.
PCMPEQ, PCMPGT,
// PCMP*M - Vector integer comparisons, the result is in a mask vector
// PCMP*M - Vector integer comparisons, the result is in a mask vector.
PCMPEQM, PCMPGTM,
/// CMPM, CMPMU - Vector comparison generating mask bits for fp and
@ -295,12 +295,15 @@ namespace llvm {
// MUL_IMM - X86 specific multiply by immediate.
MUL_IMM,
// PTEST - Vector bitwise comparisons
// PTEST - Vector bitwise comparisons.
PTEST,
// TESTP - Vector packed fp sign bitwise comparisons
// TESTP - Vector packed fp sign bitwise comparisons.
TESTP,
// TESTM - Vector "test" in AVX-512, the result is in a mask vector.
TESTM,
// OR/AND test for masks
KORTEST,
KTEST,

View File

@ -1691,3 +1691,144 @@ defm VDIVPDZ : avx512_fp_packed<0x5E, "divpd", fdiv, VR512, v8f64, f512mem,
SSE_ALU_ITINS_P.d, 0>,
EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
//===----------------------------------------------------------------------===//
// AVX-512 VPTESTM instructions
//===----------------------------------------------------------------------===//
multiclass avx512_vptest<bits<8> opc, string OpcodeStr, RegisterClass KRC,
RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag,
SDNode OpNode, ValueType vt> {
def rr : AVX5128I<opc, MRMSrcReg,
(outs KRC:$dst), (ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2)))]>, EVEX_4V;
def rm : AVX5128I<opc, MRMSrcMem,
(outs KRC:$dst), (ins RC:$src1, x86memop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set KRC:$dst, (OpNode (vt RC:$src1),
(bitconvert (memop_frag addr:$src2))))]>, EVEX_4V;
}
defm VPTESTMDZ : avx512_vptest<0x27, "vptestmd", VK16, VR512, f512mem,
memopv16i32, X86testm, v16i32>, EVEX_V512,
EVEX_CD8<32, CD8VF>;
defm VPTESTMQZ : avx512_vptest<0x27, "vptestmq", VK8, VR512, f512mem, memopv8i64,
X86testm, v8i64>, EVEX_V512, VEX_W,
EVEX_CD8<64, CD8VF>;
//===----------------------------------------------------------------------===//
// AVX-512 Shift instructions
//===----------------------------------------------------------------------===//
multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM,
string OpcodeStr,
SDNode OpNode, RegisterClass RC, ValueType vt,
X86MemOperand x86memop, PatFrag mem_frag> {
def ri : AVX512BIi8<opc, ImmFormR, (outs RC:$dst),
(ins RC:$src1, i32i8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (vt (OpNode RC:$src1, (i32 imm:$src2))))],
SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V;
def mi: AVX512BIi8<opc, ImmFormM, (outs RC:$dst),
(ins x86memop:$src1, i32i8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (OpNode (mem_frag addr:$src1),
(i32 imm:$src2)))], SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V;
}
multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
RegisterClass RC, ValueType vt, ValueType SrcVT,
PatFrag bc_frag> {
// src2 is always 128-bit
def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, VR128X:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (vt (OpNode RC:$src1, (SrcVT VR128X:$src2))))],
SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V;
def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, i128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (vt (OpNode RC:$src1,
(bc_frag (memopv2i64 addr:$src2)))))],
SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V;
}
defm VPSRLDZ : avx512_shift_rmi<0x72, MRM2r, MRM2m, "vpsrld", X86vsrli,
VR512, v16i32, i512mem, memopv16i32>, EVEX_V512,
EVEX_CD8<32, CD8VF>;
defm VPSRLDZ : avx512_shift_rrm<0xD2, "vpsrld", X86vsrl,
VR512, v16i32, v4i32, bc_v4i32>, EVEX_V512,
EVEX_CD8<32, CD8VQ>;
defm VPSRLQZ : avx512_shift_rmi<0x73, MRM2r, MRM2m, "vpsrlq", X86vsrli,
VR512, v8i64, i512mem, memopv8i64>, EVEX_V512,
EVEX_CD8<64, CD8VF>, VEX_W;
defm VPSRLQZ : avx512_shift_rrm<0xD3, "vpsrlq", X86vsrl,
VR512, v8i64, v2i64, bc_v2i64>, EVEX_V512,
EVEX_CD8<64, CD8VQ>, VEX_W;
defm VPSLLDZ : avx512_shift_rmi<0x72, MRM6r, MRM6m, "vpslld", X86vshli,
VR512, v16i32, i512mem, memopv16i32>, EVEX_V512,
EVEX_CD8<32, CD8VF>;
defm VPSLLDZ : avx512_shift_rrm<0xF2, "vpslld", X86vshl,
VR512, v16i32, v4i32, bc_v4i32>, EVEX_V512,
EVEX_CD8<32, CD8VQ>;
defm VPSLLQZ : avx512_shift_rmi<0x73, MRM6r, MRM6m, "vpsllq", X86vshli,
VR512, v8i64, i512mem, memopv8i64>, EVEX_V512,
EVEX_CD8<64, CD8VF>, VEX_W;
defm VPSLLQZ : avx512_shift_rrm<0xF3, "vpsllq", X86vshl,
VR512, v8i64, v2i64, bc_v2i64>, EVEX_V512,
EVEX_CD8<64, CD8VQ>, VEX_W;
defm VPSRADZ : avx512_shift_rmi<0x72, MRM4r, MRM4m, "vpsrad", X86vsrai,
VR512, v16i32, i512mem, memopv16i32>, EVEX_V512,
EVEX_CD8<32, CD8VF>;
defm VPSRADZ : avx512_shift_rrm<0xE2, "vpsrad", X86vsra,
VR512, v16i32, v4i32, bc_v4i32>, EVEX_V512,
EVEX_CD8<32, CD8VQ>;
defm VPSRAQZ : avx512_shift_rmi<0x72, MRM4r, MRM4m, "vpsraq", X86vsrai,
VR512, v8i64, i512mem, memopv8i64>, EVEX_V512,
EVEX_CD8<64, CD8VF>, VEX_W;
defm VPSRAQZ : avx512_shift_rrm<0xE2, "vpsraq", X86vsra,
VR512, v8i64, v2i64, bc_v2i64>, EVEX_V512,
EVEX_CD8<64, CD8VQ>, VEX_W;
//===-------------------------------------------------------------------===//
// Variable Bit Shifts
//===-------------------------------------------------------------------===//
multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
RegisterClass RC, ValueType vt,
X86MemOperand x86memop, PatFrag mem_frag> {
def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst,
(vt (OpNode RC:$src1, (vt RC:$src2))))]>,
EVEX_4V;
def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst,
(vt (OpNode RC:$src1, (mem_frag addr:$src2))))]>,
EVEX_4V;
}
defm VPSLLVDZ : avx512_var_shift<0x47, "vpsllvd", shl, VR512, v16i32,
i512mem, memopv16i32>, EVEX_V512,
EVEX_CD8<32, CD8VF>;
defm VPSLLVQZ : avx512_var_shift<0x47, "vpsllvq", shl, VR512, v8i64,
i512mem, memopv8i64>, EVEX_V512, VEX_W,
EVEX_CD8<64, CD8VF>;
defm VPSRLVDZ : avx512_var_shift<0x45, "vpsrlvd", srl, VR512, v16i32,
i512mem, memopv16i32>, EVEX_V512,
EVEX_CD8<32, CD8VF>;
defm VPSRLVQZ : avx512_var_shift<0x45, "vpsrlvq", srl, VR512, v8i64,
i512mem, memopv8i64>, EVEX_V512, VEX_W,
EVEX_CD8<64, CD8VF>;
defm VPSRAVDZ : avx512_var_shift<0x46, "vpsravd", sra, VR512, v16i32,
i512mem, memopv16i32>, EVEX_V512,
EVEX_CD8<32, CD8VF>;
defm VPSRAVQZ : avx512_var_shift<0x46, "vpsravq", sra, VR512, v8i64,
i512mem, memopv8i64>, EVEX_V512, VEX_W,
EVEX_CD8<64, CD8VF>;

View File

@ -149,6 +149,9 @@ def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>;
def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>;
def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>;
def X86ktest : SDNode<"X86ISD::KTEST", SDTX86CmpPTest>;
def X86testm : SDNode<"X86ISD::TESTM", SDTypeProfile<1, 2, [SDTCisVec<0>,
SDTCisVec<1>,
SDTCisSameAs<2, 1>]>>;
def X86pmuludq : SDNode<"X86ISD::PMULUDQ",
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,

View File

@ -0,0 +1,108 @@
;RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
;CHECK-LABEL: shift_16_i32
;CHECK: vpsrld
;CHECK: vpslld
;CHECK: vpsrad
;CHECK: ret
define <16 x i32> @shift_16_i32(<16 x i32> %a) {
%b = lshr <16 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%c = shl <16 x i32> %b, <i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12>
%d = ashr <16 x i32> %c, <i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12>
ret <16 x i32> %d;
}
;CHECK-LABEL: shift_8_i64
;CHECK: vpsrlq
;CHECK: vpsllq
;CHECK: vpsraq
;CHECK: ret
define <8 x i64> @shift_8_i64(<8 x i64> %a) {
%b = lshr <8 x i64> %a, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
%c = shl <8 x i64> %b, <i64 12, i64 12, i64 12, i64 12, i64 12, i64 12, i64 12, i64 12>
%d = ashr <8 x i64> %c, <i64 12, i64 12, i64 12, i64 12, i64 12, i64 12, i64 12, i64 12>
ret <8 x i64> %d;
}
; CHECK-LABEL: variable_shl4
; CHECK: vpsllvq %zmm
; CHECK: ret
define <8 x i64> @variable_shl4(<8 x i64> %x, <8 x i64> %y) {
%k = shl <8 x i64> %x, %y
ret <8 x i64> %k
}
; CHECK-LABEL: variable_shl5
; CHECK: vpsllvd %zmm
; CHECK: ret
define <16 x i32> @variable_shl5(<16 x i32> %x, <16 x i32> %y) {
%k = shl <16 x i32> %x, %y
ret <16 x i32> %k
}
; CHECK-LABEL: variable_srl0
; CHECK: vpsrlvd
; CHECK: ret
define <16 x i32> @variable_srl0(<16 x i32> %x, <16 x i32> %y) {
%k = lshr <16 x i32> %x, %y
ret <16 x i32> %k
}
; CHECK-LABEL: variable_srl2
; CHECK: psrlvq
; CHECK: ret
define <8 x i64> @variable_srl2(<8 x i64> %x, <8 x i64> %y) {
%k = lshr <8 x i64> %x, %y
ret <8 x i64> %k
}
; CHECK-LABEL: variable_sra1
; CHECK: vpsravd
; CHECK: ret
define <16 x i32> @variable_sra1(<16 x i32> %x, <16 x i32> %y) {
%k = ashr <16 x i32> %x, %y
ret <16 x i32> %k
}
; CHECK-LABEL: variable_sra2
; CHECK: vpsravq %zmm
; CHECK: ret
define <8 x i64> @variable_sra2(<8 x i64> %x, <8 x i64> %y) {
%k = ashr <8 x i64> %x, %y
ret <8 x i64> %k
}
; CHECK-LABEL: variable_sra01_load
; CHECK: vpsravd (%
; CHECK: ret
define <16 x i32> @variable_sra01_load(<16 x i32> %x, <16 x i32>* %y) {
%y1 = load <16 x i32>* %y
%k = ashr <16 x i32> %x, %y1
ret <16 x i32> %k
}
; CHECK-LABEL: variable_shl1_load
; CHECK: vpsllvd (%
; CHECK: ret
define <16 x i32> @variable_shl1_load(<16 x i32> %x, <16 x i32>* %y) {
%y1 = load <16 x i32>* %y
%k = shl <16 x i32> %x, %y1
ret <16 x i32> %k
}
; CHECK: variable_srl0_load
; CHECK: vpsrlvd (%
; CHECK: ret
define <16 x i32> @variable_srl0_load(<16 x i32> %x, <16 x i32>* %y) {
%y1 = load <16 x i32>* %y
%k = lshr <16 x i32> %x, %y1
ret <16 x i32> %k
}
; CHECK: variable_srl3_load
; CHECK: vpsrlvq (%
; CHECK: ret
define <8 x i64> @variable_srl3_load(<8 x i64> %x, <8 x i64>* %y) {
%y1 = load <8 x i64>* %y
%k = lshr <8 x i64> %x, %y1
ret <8 x i64> %k
}

View File

@ -19,3 +19,19 @@ vextracti64x4 $1, %zmm9, %ymm17
// CHECK: vextracti64x4
// CHECK: encoding: [0x62,0x73,0xfd,0x48,0x3b,0x4f,0x10,0x01]
vextracti64x4 $1, %zmm9, 512(%rdi)
// CHECK: vpsrad
// CHECK: encoding: [0x62,0xb1,0x35,0x40,0x72,0xe1,0x02]
vpsrad $2, %zmm17, %zmm25
// CHECK: vpsrad
// CHECK: encoding: [0x62,0xf1,0x35,0x40,0x72,0x64,0xb7,0x08,0x02]
vpsrad $2, 512(%rdi, %rsi, 4), %zmm25
// CHECK: vpsrad
// CHECK: encoding: [0x62,0x21,0x1d,0x48,0xe2,0xc9]
vpsrad %xmm17, %zmm12, %zmm25
// CHECK: vpsrad
// CHECK: encoding: [0x62,0x61,0x1d,0x48,0xe2,0x4c,0xb7,0x20]
vpsrad 512(%rdi, %rsi, 4), %zmm12, %zmm25