From 3fad8a772e854e08b853f90bb11b4fb653ba7272 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 31 Aug 2018 21:31:53 +0000 Subject: [PATCH] [X86] Add intrinsics for KTEST instructions. These intrinsics use the same implementation as PTEST intrinsics, but use vXi1 vectors. New clang builtins will be accompanying them shortly. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@341259 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/IntrinsicsX86.td | 18 +++++++ lib/Target/X86/X86ISelLowering.cpp | 32 ++++++++++-- test/CodeGen/X86/avx512bw-intrinsics.ll | 68 +++++++++++++++++++++++++ test/CodeGen/X86/avx512dq-intrinsics.ll | 68 +++++++++++++++++++++++++ 4 files changed, 181 insertions(+), 5 deletions(-) diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index ecea44ed6c1..7b915f77a8f 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -2773,6 +2773,24 @@ let TargetPrefix = "x86" in { Intrinsic<[llvm_v32i1_ty], [llvm_v32i1_ty, llvm_v32i1_ty], [IntrNoMem]>; def int_x86_avx512_kadd_q : Intrinsic<[llvm_v64i1_ty], [llvm_v64i1_ty, llvm_v64i1_ty], [IntrNoMem]>; + + def int_x86_avx512_ktestc_b : + Intrinsic<[llvm_i32_ty], [llvm_v8i1_ty, llvm_v8i1_ty], [IntrNoMem]>; + def int_x86_avx512_ktestc_w : + Intrinsic<[llvm_i32_ty], [llvm_v16i1_ty, llvm_v16i1_ty], [IntrNoMem]>; + def int_x86_avx512_ktestc_d : + Intrinsic<[llvm_i32_ty], [llvm_v32i1_ty, llvm_v32i1_ty], [IntrNoMem]>; + def int_x86_avx512_ktestc_q : + Intrinsic<[llvm_i32_ty], [llvm_v64i1_ty, llvm_v64i1_ty], [IntrNoMem]>; + + def int_x86_avx512_ktestz_b : + Intrinsic<[llvm_i32_ty], [llvm_v8i1_ty, llvm_v8i1_ty], [IntrNoMem]>; + def int_x86_avx512_ktestz_w : + Intrinsic<[llvm_i32_ty], [llvm_v16i1_ty, llvm_v16i1_ty], [IntrNoMem]>; + def int_x86_avx512_ktestz_d : + Intrinsic<[llvm_i32_ty], [llvm_v32i1_ty, llvm_v32i1_ty], [IntrNoMem]>; + def int_x86_avx512_ktestz_q : + Intrinsic<[llvm_i32_ty], [llvm_v64i1_ty, llvm_v64i1_ty], [IntrNoMem]>; } // Conversion ops diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 527e7b76734..5c5e7f0d9b5 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -21297,6 +21297,14 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, // ptest and testp intrinsics. The intrinsic these come from are designed to // return an integer value, not just an instruction so lower it to the ptest // or testp pattern and a setcc for the result. + case Intrinsic::x86_avx512_ktestc_b: + case Intrinsic::x86_avx512_ktestc_w: + case Intrinsic::x86_avx512_ktestc_d: + case Intrinsic::x86_avx512_ktestc_q: + case Intrinsic::x86_avx512_ktestz_b: + case Intrinsic::x86_avx512_ktestz_w: + case Intrinsic::x86_avx512_ktestz_d: + case Intrinsic::x86_avx512_ktestz_q: case Intrinsic::x86_sse41_ptestz: case Intrinsic::x86_sse41_ptestc: case Intrinsic::x86_sse41_ptestnzc: @@ -21315,15 +21323,30 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::x86_avx_vtestz_pd_256: case Intrinsic::x86_avx_vtestc_pd_256: case Intrinsic::x86_avx_vtestnzc_pd_256: { - bool IsTestPacked = false; + unsigned TestOpc = X86ISD::PTEST; X86::CondCode X86CC; switch (IntNo) { default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); + case Intrinsic::x86_avx512_ktestc_b: + case Intrinsic::x86_avx512_ktestc_w: + case Intrinsic::x86_avx512_ktestc_d: + case Intrinsic::x86_avx512_ktestc_q: + // CF = 1 + TestOpc = X86ISD::KTEST; + X86CC = X86::COND_B; + break; + case Intrinsic::x86_avx512_ktestz_b: + case Intrinsic::x86_avx512_ktestz_w: + case Intrinsic::x86_avx512_ktestz_d: + case Intrinsic::x86_avx512_ktestz_q: + TestOpc = X86ISD::KTEST; + X86CC = X86::COND_E; + break; case Intrinsic::x86_avx_vtestz_ps: case Intrinsic::x86_avx_vtestz_pd: case Intrinsic::x86_avx_vtestz_ps_256: case Intrinsic::x86_avx_vtestz_pd_256: - IsTestPacked = true; + TestOpc = X86ISD::TESTP; LLVM_FALLTHROUGH; case Intrinsic::x86_sse41_ptestz: case Intrinsic::x86_avx_ptestz_256: @@ -21334,7 +21357,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::x86_avx_vtestc_pd: case Intrinsic::x86_avx_vtestc_ps_256: case Intrinsic::x86_avx_vtestc_pd_256: - IsTestPacked = true; + TestOpc = X86ISD::TESTP; LLVM_FALLTHROUGH; case Intrinsic::x86_sse41_ptestc: case Intrinsic::x86_avx_ptestc_256: @@ -21345,7 +21368,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::x86_avx_vtestnzc_pd: case Intrinsic::x86_avx_vtestnzc_ps_256: case Intrinsic::x86_avx_vtestnzc_pd_256: - IsTestPacked = true; + TestOpc = X86ISD::TESTP; LLVM_FALLTHROUGH; case Intrinsic::x86_sse41_ptestnzc: case Intrinsic::x86_avx_ptestnzc_256: @@ -21356,7 +21379,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); - unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); SDValue SetCC = getSETCC(X86CC, Test, dl, DAG); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll index 34254bd2407..c17ba57d11a 100644 --- a/test/CodeGen/X86/avx512bw-intrinsics.ll +++ b/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -58,6 +58,74 @@ entry: } declare <64 x i1> @llvm.x86.avx512.kadd.q(<64 x i1>, <64 x i1>) +define i32 @test_x86_avx512_ktestc_d(<32 x i16> %A, <32 x i16> %B) { +; CHECK-LABEL: test_x86_avx512_ktestc_d: +; CHECK: # %bb.0: +; CHECK-NEXT: vptestmw %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x48,0x26,0xc0] +; CHECK-NEXT: vptestmw %zmm1, %zmm1, %k1 # encoding: [0x62,0xf2,0xf5,0x48,0x26,0xc9] +; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; CHECK-NEXT: ktestd %k1, %k0 # encoding: [0xc4,0xe1,0xf9,0x99,0xc1] +; CHECK-NEXT: setb %al # encoding: [0x0f,0x92,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = icmp ne <32 x i16> %A, zeroinitializer + %2 = icmp ne <32 x i16> %B, zeroinitializer + %res = call i32 @llvm.x86.avx512.ktestc.d(<32 x i1> %1, <32 x i1> %2) ; [#uses=1] + ret i32 %res +} +declare i32 @llvm.x86.avx512.ktestc.d(<32 x i1>, <32 x i1>) nounwind readnone + +define i32 @test_x86_avx512_ktestz_d(<32 x i16> %A, <32 x i16> %B) { +; CHECK-LABEL: test_x86_avx512_ktestz_d: +; CHECK: # %bb.0: +; CHECK-NEXT: vptestmw %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x48,0x26,0xc0] +; CHECK-NEXT: vptestmw %zmm1, %zmm1, %k1 # encoding: [0x62,0xf2,0xf5,0x48,0x26,0xc9] +; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; CHECK-NEXT: ktestd %k1, %k0 # encoding: [0xc4,0xe1,0xf9,0x99,0xc1] +; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = icmp ne <32 x i16> %A, zeroinitializer + %2 = icmp ne <32 x i16> %B, zeroinitializer + %res = call i32 @llvm.x86.avx512.ktestz.d(<32 x i1> %1, <32 x i1> %2) ; [#uses=1] + ret i32 %res +} +declare i32 @llvm.x86.avx512.ktestz.d(<32 x i1>, <32 x i1>) nounwind readnone + +define i32 @test_x86_avx512_ktestc_q(<64 x i8> %A, <64 x i8> %B) { +; CHECK-LABEL: test_x86_avx512_ktestc_q: +; CHECK: # %bb.0: +; CHECK-NEXT: vptestmb %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0x7d,0x48,0x26,0xc0] +; CHECK-NEXT: vptestmb %zmm1, %zmm1, %k1 # encoding: [0x62,0xf2,0x75,0x48,0x26,0xc9] +; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; CHECK-NEXT: ktestq %k1, %k0 # encoding: [0xc4,0xe1,0xf8,0x99,0xc1] +; CHECK-NEXT: setb %al # encoding: [0x0f,0x92,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = icmp ne <64 x i8> %A, zeroinitializer + %2 = icmp ne <64 x i8> %B, zeroinitializer + %res = call i32 @llvm.x86.avx512.ktestc.q(<64 x i1> %1, <64 x i1> %2) ; [#uses=1] + ret i32 %res +} +declare i32 @llvm.x86.avx512.ktestc.q(<64 x i1>, <64 x i1>) nounwind readnone + +define i32 @test_x86_avx512_ktestz_q(<64 x i8> %A, <64 x i8> %B) { +; CHECK-LABEL: test_x86_avx512_ktestz_q: +; CHECK: # %bb.0: +; CHECK-NEXT: vptestmb %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0x7d,0x48,0x26,0xc0] +; CHECK-NEXT: vptestmb %zmm1, %zmm1, %k1 # encoding: [0x62,0xf2,0x75,0x48,0x26,0xc9] +; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; CHECK-NEXT: ktestq %k1, %k0 # encoding: [0xc4,0xe1,0xf8,0x99,0xc1] +; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = icmp ne <64 x i8> %A, zeroinitializer + %2 = icmp ne <64 x i8> %B, zeroinitializer + %res = call i32 @llvm.x86.avx512.ktestz.q(<64 x i1> %1, <64 x i1> %2) ; [#uses=1] + ret i32 %res +} +declare i32 @llvm.x86.avx512.ktestz.q(<64 x i1>, <64 x i1>) nounwind readnone + define <32 x i16> @test_mask_packs_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) { ; CHECK-LABEL: test_mask_packs_epi32_rr_512: ; CHECK: # %bb.0: diff --git a/test/CodeGen/X86/avx512dq-intrinsics.ll b/test/CodeGen/X86/avx512dq-intrinsics.ll index 05c4ad5dcbe..47470fd8f16 100644 --- a/test/CodeGen/X86/avx512dq-intrinsics.ll +++ b/test/CodeGen/X86/avx512dq-intrinsics.ll @@ -48,6 +48,74 @@ entry: } declare <8 x i1> @llvm.x86.avx512.kadd.b(<8 x i1>, <8 x i1>) +define i32 @test_x86_avx512_ktestc_w(<16 x i32> %A, <16 x i32> %B) { +; CHECK-LABEL: test_x86_avx512_ktestc_w: +; CHECK: # %bb.0: +; CHECK-NEXT: vptestmd %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0x7d,0x48,0x27,0xc0] +; CHECK-NEXT: vptestmd %zmm1, %zmm1, %k1 # encoding: [0x62,0xf2,0x75,0x48,0x27,0xc9] +; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; CHECK-NEXT: ktestw %k1, %k0 # encoding: [0xc5,0xf8,0x99,0xc1] +; CHECK-NEXT: setb %al # encoding: [0x0f,0x92,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = icmp ne <16 x i32> %A, zeroinitializer + %2 = icmp ne <16 x i32> %B, zeroinitializer + %res = call i32 @llvm.x86.avx512.ktestc.w(<16 x i1> %1, <16 x i1> %2) ; [#uses=1] + ret i32 %res +} +declare i32 @llvm.x86.avx512.ktestc.w(<16 x i1>, <16 x i1>) nounwind readnone + +define i32 @test_x86_avx512_ktestz_w(<16 x i32> %A, <16 x i32> %B) { +; CHECK-LABEL: test_x86_avx512_ktestz_w: +; CHECK: # %bb.0: +; CHECK-NEXT: vptestmd %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0x7d,0x48,0x27,0xc0] +; CHECK-NEXT: vptestmd %zmm1, %zmm1, %k1 # encoding: [0x62,0xf2,0x75,0x48,0x27,0xc9] +; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; CHECK-NEXT: ktestw %k1, %k0 # encoding: [0xc5,0xf8,0x99,0xc1] +; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = icmp ne <16 x i32> %A, zeroinitializer + %2 = icmp ne <16 x i32> %B, zeroinitializer + %res = call i32 @llvm.x86.avx512.ktestz.w(<16 x i1> %1, <16 x i1> %2) ; [#uses=1] + ret i32 %res +} +declare i32 @llvm.x86.avx512.ktestz.w(<16 x i1>, <16 x i1>) nounwind readnone + +define i32 @test_x86_avx512_ktestc_b(<8 x i64> %A, <8 x i64> %B) { +; CHECK-LABEL: test_x86_avx512_ktestc_b: +; CHECK: # %bb.0: +; CHECK-NEXT: vptestmq %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x48,0x27,0xc0] +; CHECK-NEXT: vptestmq %zmm1, %zmm1, %k1 # encoding: [0x62,0xf2,0xf5,0x48,0x27,0xc9] +; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; CHECK-NEXT: ktestb %k1, %k0 # encoding: [0xc5,0xf9,0x99,0xc1] +; CHECK-NEXT: setb %al # encoding: [0x0f,0x92,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = icmp ne <8 x i64> %A, zeroinitializer + %2 = icmp ne <8 x i64> %B, zeroinitializer + %res = call i32 @llvm.x86.avx512.ktestc.b(<8 x i1> %1, <8 x i1> %2) ; [#uses=1] + ret i32 %res +} +declare i32 @llvm.x86.avx512.ktestc.b(<8 x i1>, <8 x i1>) nounwind readnone + +define i32 @test_x86_avx512_ktestz_b(<8 x i64> %A, <8 x i64> %B) { +; CHECK-LABEL: test_x86_avx512_ktestz_b: +; CHECK: # %bb.0: +; CHECK-NEXT: vptestmq %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x48,0x27,0xc0] +; CHECK-NEXT: vptestmq %zmm1, %zmm1, %k1 # encoding: [0x62,0xf2,0xf5,0x48,0x27,0xc9] +; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; CHECK-NEXT: ktestb %k1, %k0 # encoding: [0xc5,0xf9,0x99,0xc1] +; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = icmp ne <8 x i64> %A, zeroinitializer + %2 = icmp ne <8 x i64> %B, zeroinitializer + %res = call i32 @llvm.x86.avx512.ktestz.b(<8 x i1> %1, <8 x i1> %2) ; [#uses=1] + ret i32 %res +} +declare i32 @llvm.x86.avx512.ktestz.b(<8 x i1>, <8 x i1>) nounwind readnone + declare <8 x i64> @llvm.x86.avx512.mask.cvtpd2qq.512(<8 x double>, <8 x i64>, i8, i32) define <8 x i64>@test_int_x86_avx512_mask_cvt_pd2qq_512(<8 x double> %x0, <8 x i64> %x1, i8 %x2) {