[AVX-512] Handle kor/kand/kandn/kxor/kxnor/knot intrinsics at lowering time instead of isel

Summary:
Currently we handle these intrinsics at isel with special patterns. But as they just map to normal logic operations, we should just handle them at lowering. This will expose them to DAG combine optimizations. Right now the kor-sequence test generates a bunch of regclass copies between GR16 and VK16 that the peephole optimizer and/or register coalescing are removing to keep everything in the mask domain. By handling the logic op intrinsics earlier, these copies become bitcasts in the DAG and get removed by DAG combine which seems more robust.

This should help enable my plan to stop copying between K registers and GR8/GR16. The peephole optimizer can't remove a chain of copies between K and GR32 with insert_subreg/extract_subreg present in the chain, so the kor-sequence test breaks. But this patch should dodge the problem entirely.

Reviewers: zvi, delena, RKSimon, igorb

Reviewed By: igorb

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D31056

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@298228 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Craig Topper 2017-03-19 17:11:09 +00:00
parent bc9cb1ed06
commit 4131f92ade
4 changed files with 61 additions and 47 deletions

View File

@ -19648,6 +19648,15 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
Src2, Src1);
return DAG.getBitcast(VT, Res);
}
case MASK_BINOP: {
MVT VT = Op.getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2);
return DAG.getBitcast(VT, Res);
}
case FIXUPIMMS:
case FIXUPIMMS_MASKZ:
case FIXUPIMM:
@ -19820,6 +19829,33 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
case Intrinsic::x86_avx512_knot_w: {
SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1);
SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
return DAG.getBitcast(MVT::i16, Res);
}
case Intrinsic::x86_avx512_kandn_w: {
SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
// Invert LHS for the not.
LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS,
DAG.getConstant(1, dl, MVT::v16i1));
SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS);
return DAG.getBitcast(MVT::i16, Res);
}
case Intrinsic::x86_avx512_kxnor_w: {
SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
// Invert result for the not.
Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res,
DAG.getConstant(1, dl, MVT::v16i1));
return DAG.getBitcast(MVT::i16, Res);
}
case Intrinsic::x86_sse42_pcmpistria128:
case Intrinsic::x86_sse42_pcmpestria128:
case Intrinsic::x86_sse42_pcmpistric128:

View File

@ -2381,15 +2381,6 @@ multiclass avx512_mask_unop_all<bits<8> opc, string OpcodeStr,
defm KNOT : avx512_mask_unop_all<0x44, "knot", vnot>;
multiclass avx512_mask_unop_int<string IntName, string InstName> {
let Predicates = [HasAVX512] in
def : Pat<(!cast<Intrinsic>("int_x86_avx512_"##IntName##"_w")
(i16 GR16:$src)),
(COPY_TO_REGCLASS (!cast<Instruction>(InstName##"Wrr")
(v16i1 (COPY_TO_REGCLASS GR16:$src, VK16))), GR16)>;
}
defm : avx512_mask_unop_int<"knot", "KNOT">;
// KNL does not support KMOVB, 8-bit mask is promoted to 16-bit
let Predicates = [HasAVX512, NoDQI] in
def : Pat<(vnot VK8:$src),
@ -2438,21 +2429,6 @@ defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor, 1>;
defm KANDN : avx512_mask_binop_all<0x42, "kandn", vandn, 0>;
defm KADD : avx512_mask_binop_all<0x4A, "kadd", add, 1, HasDQI>;
multiclass avx512_mask_binop_int<string IntName, string InstName> {
let Predicates = [HasAVX512] in
def : Pat<(!cast<Intrinsic>("int_x86_avx512_"##IntName##"_w")
(i16 GR16:$src1), (i16 GR16:$src2)),
(COPY_TO_REGCLASS (!cast<Instruction>(InstName##"Wrr")
(v16i1 (COPY_TO_REGCLASS GR16:$src1, VK16)),
(v16i1 (COPY_TO_REGCLASS GR16:$src2, VK16))), GR16)>;
}
defm : avx512_mask_binop_int<"kand", "KAND">;
defm : avx512_mask_binop_int<"kandn", "KANDN">;
defm : avx512_mask_binop_int<"kor", "KOR">;
defm : avx512_mask_binop_int<"kxnor", "KXNOR">;
defm : avx512_mask_binop_int<"kxor", "KXOR">;
multiclass avx512_binop_pat<SDPatternOperator VOpNode, SDPatternOperator OpNode,
Instruction Inst> {
// With AVX512F, 8-bit mask is promoted to 16-bit mask,

View File

@ -36,7 +36,7 @@ enum IntrinsicType : uint16_t {
TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
EXPAND_FROM_MEM,
TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
FIXUPIMMS_MASKZ, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK, GATHER_AVX2
FIXUPIMMS_MASKZ, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK, GATHER_AVX2, MASK_BINOP,
};
struct IntrinsicData {
@ -474,10 +474,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_cvtw2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
X86_INTRINSIC_DATA(avx512_kand_w, MASK_BINOP, ISD::AND, 0),
X86_INTRINSIC_DATA(avx512_kor_w, MASK_BINOP, ISD::OR, 0),
X86_INTRINSIC_DATA(avx512_kunpck_bw, KUNPCK, ISD::CONCAT_VECTORS, 0),
X86_INTRINSIC_DATA(avx512_kunpck_dq, KUNPCK, ISD::CONCAT_VECTORS, 0),
X86_INTRINSIC_DATA(avx512_kunpck_wd, KUNPCK, ISD::CONCAT_VECTORS, 0),
X86_INTRINSIC_DATA(avx512_kxor_w, MASK_BINOP, ISD::XOR, 0),
X86_INTRINSIC_DATA(avx512_mask_add_pd_512, INTR_TYPE_2OP_MASK, ISD::FADD,
X86ISD::FADD_RND),
X86_INTRINSIC_DATA(avx512_mask_add_ps_512, INTR_TYPE_2OP_MASK, ISD::FADD,

View File

@ -33,12 +33,12 @@ declare i16 @llvm.x86.avx512.kand.w(i16, i16) nounwind readnone
define i16 @test_kand(i16 %a0, i16 %a1) {
; CHECK-LABEL: test_kand:
; CHECK: ## BB#0:
; CHECK-NEXT: movw $8, %ax
; CHECK-NEXT: kmovw %eax, %k0
; CHECK-NEXT: kmovw %esi, %k0
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: movw $8, %ax
; CHECK-NEXT: kmovw %eax, %k2
; CHECK-NEXT: kandw %k0, %k1, %k0
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: kandw %k1, %k0, %k0
; CHECK-NEXT: kandw %k0, %k2, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: retq
%t1 = call i16 @llvm.x86.avx512.kand.w(i16 %a0, i16 8)
@ -50,12 +50,12 @@ declare i16 @llvm.x86.avx512.kandn.w(i16, i16) nounwind readnone
define i16 @test_kandn(i16 %a0, i16 %a1) {
; CHECK-LABEL: test_kandn:
; CHECK: ## BB#0:
; CHECK-NEXT: movw $8, %ax
; CHECK-NEXT: kmovw %eax, %k0
; CHECK-NEXT: kmovw %esi, %k0
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: movw $8, %ax
; CHECK-NEXT: kmovw %eax, %k2
; CHECK-NEXT: kandnw %k2, %k1, %k1
; CHECK-NEXT: kandnw %k0, %k1, %k0
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: kandnw %k1, %k0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: retq
%t1 = call i16 @llvm.x86.avx512.kandn.w(i16 %a0, i16 8)
@ -79,12 +79,12 @@ declare i16 @llvm.x86.avx512.kor.w(i16, i16) nounwind readnone
define i16 @test_kor(i16 %a0, i16 %a1) {
; CHECK-LABEL: test_kor:
; CHECK: ## BB#0:
; CHECK-NEXT: movw $8, %ax
; CHECK-NEXT: kmovw %eax, %k0
; CHECK-NEXT: kmovw %esi, %k0
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: movw $8, %ax
; CHECK-NEXT: kmovw %eax, %k2
; CHECK-NEXT: korw %k0, %k1, %k0
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: korw %k1, %k0, %k0
; CHECK-NEXT: korw %k0, %k2, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: retq
%t1 = call i16 @llvm.x86.avx512.kor.w(i16 %a0, i16 8)
@ -110,12 +110,12 @@ declare i16 @llvm.x86.avx512.kxnor.w(i16, i16) nounwind readnone
define i16 @test_kxnor(i16 %a0, i16 %a1) {
; CHECK-LABEL: test_kxnor:
; CHECK: ## BB#0:
; CHECK-NEXT: movw $8, %ax
; CHECK-NEXT: kmovw %eax, %k0
; CHECK-NEXT: kmovw %esi, %k0
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: kxnorw %k0, %k1, %k0
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: kxnorw %k1, %k0, %k0
; CHECK-NEXT: movw $8, %ax
; CHECK-NEXT: kmovw %eax, %k2
; CHECK-NEXT: kxorw %k0, %k1, %k0
; CHECK-NEXT: kxorw %k0, %k2, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: retq
%t1 = call i16 @llvm.x86.avx512.kxnor.w(i16 %a0, i16 8)
@ -127,12 +127,12 @@ declare i16 @llvm.x86.avx512.kxor.w(i16, i16) nounwind readnone
define i16 @test_kxor(i16 %a0, i16 %a1) {
; CHECK-LABEL: test_kxor:
; CHECK: ## BB#0:
; CHECK-NEXT: movw $8, %ax
; CHECK-NEXT: kmovw %eax, %k0
; CHECK-NEXT: kmovw %esi, %k0
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: movw $8, %ax
; CHECK-NEXT: kmovw %eax, %k2
; CHECK-NEXT: kxorw %k0, %k1, %k0
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: kxorw %k1, %k0, %k0
; CHECK-NEXT: kxorw %k0, %k2, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: retq
%t1 = call i16 @llvm.x86.avx512.kxor.w(i16 %a0, i16 8)