From f91b2666bbb7e408323ea6c35e436eedaeae6811 Mon Sep 17 00:00:00 2001 From: Igor Breger Date: Sun, 24 Jan 2016 08:04:33 +0000 Subject: [PATCH] AVX512: VMOVDQU8/16/32/64 (load) intrinsic implementation. Differential Revision: http://reviews.llvm.org/D16137 llvm-svn: 258657 --- include/llvm/IR/IntrinsicsX86.td | 58 ++++++++++++++++-- lib/Target/X86/X86ISelLowering.cpp | 22 +++++++ lib/Target/X86/X86ISelLowering.h | 8 +++ lib/Target/X86/X86InstrAVX512.td | 8 --- lib/Target/X86/X86IntrinsicsInfo.h | 12 ++++ test/CodeGen/X86/avx512-intrinsics.ll | 36 +++++++++++ test/CodeGen/X86/avx512bw-intrinsics.ll | 61 ++++++++++++++++++- test/CodeGen/X86/avx512bwvl-intrinsics.ll | 72 ++++++++++++++++++++++ test/CodeGen/X86/avx512vl-intrinsics.ll | 73 +++++++++++++++++++++++ 9 files changed, 335 insertions(+), 15 deletions(-) diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index d083d54644f..5881e78432c 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -3028,12 +3028,58 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx2_maskload_q_256 : GCCBuiltin<"__builtin_ia32_maskloadq256">, Intrinsic<[llvm_v4i64_ty], [llvm_ptr_ty, llvm_v4i64_ty], [IntrReadArgMem]>; - def int_x86_avx512_mask_loadu_d_512 : GCCBuiltin<"__builtin_ia32_loaddqusi512_mask">, - Intrinsic<[llvm_v16i32_ty], [llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty], - [IntrReadArgMem]>; - def int_x86_avx512_mask_loadu_q_512 : GCCBuiltin<"__builtin_ia32_loaddqudi512_mask">, - Intrinsic<[llvm_v8i64_ty], [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], - [IntrReadArgMem]>; + + def int_x86_avx512_mask_loadu_b_128 : + GCCBuiltin<"__builtin_ia32_loaddquqi128_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_ptr_ty, llvm_v16i8_ty, llvm_i16_ty], [IntrReadArgMem]>; + def int_x86_avx512_mask_loadu_b_256 : + GCCBuiltin<"__builtin_ia32_loaddquqi256_mask">, + Intrinsic<[llvm_v32i8_ty], + [llvm_ptr_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrReadArgMem]>; + def int_x86_avx512_mask_loadu_b_512 : + GCCBuiltin<"__builtin_ia32_loaddquqi512_mask">, + Intrinsic<[llvm_v64i8_ty], + [llvm_ptr_ty, llvm_v64i8_ty, llvm_i64_ty], [IntrReadArgMem]>; + + def int_x86_avx512_mask_loadu_w_128 : + GCCBuiltin<"__builtin_ia32_loaddquhi128_mask">, + Intrinsic<[llvm_v8i16_ty], + [llvm_ptr_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrReadArgMem]>; + def int_x86_avx512_mask_loadu_w_256 : + GCCBuiltin<"__builtin_ia32_loaddquhi256_mask">, + Intrinsic<[llvm_v16i16_ty], + [llvm_ptr_ty, llvm_v16i16_ty, llvm_i16_ty], [IntrReadArgMem]>; + def int_x86_avx512_mask_loadu_w_512 : + GCCBuiltin<"__builtin_ia32_loaddquhi512_mask">, + Intrinsic<[llvm_v32i16_ty], + [llvm_ptr_ty, llvm_v32i16_ty, llvm_i32_ty], [IntrReadArgMem]>; + + def int_x86_avx512_mask_loadu_d_128 : + GCCBuiltin<"__builtin_ia32_loaddqusi128_mask">, + Intrinsic<[llvm_v4i32_ty], + [llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrReadArgMem]>; + def int_x86_avx512_mask_loadu_d_256 : + GCCBuiltin<"__builtin_ia32_loaddqusi256_mask">, + Intrinsic<[llvm_v8i32_ty], + [llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrReadArgMem]>; + def int_x86_avx512_mask_loadu_d_512 : + GCCBuiltin<"__builtin_ia32_loaddqusi512_mask">, + Intrinsic<[llvm_v16i32_ty], + [llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrReadArgMem]>; + + def int_x86_avx512_mask_loadu_q_128 : + GCCBuiltin<"__builtin_ia32_loaddqudi128_mask">, + Intrinsic<[llvm_v2i64_ty], + [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrReadArgMem]>; + def int_x86_avx512_mask_loadu_q_256 : + GCCBuiltin<"__builtin_ia32_loaddqudi256_mask">, + Intrinsic<[llvm_v4i64_ty], + [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrReadArgMem]>; + def int_x86_avx512_mask_loadu_q_512 : + GCCBuiltin<"__builtin_ia32_loaddqudi512_mask">, + Intrinsic<[llvm_v8i64_ty], + [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrReadArgMem]>; def int_x86_avx512_mask_load_d_128 : GCCBuiltin<"__builtin_ia32_movdqa32load128_mask">, diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 993e4dad0a6..813b54f5735 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -20582,6 +20582,28 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { } } +/// Places new result values for the node in Results (their number +/// and types must exactly match those of the original return values of +/// the node), or leaves Results empty, which indicates that the node is not +/// to be custom lowered after all. +void X86TargetLowering::LowerOperationWrapper(SDNode *N, + SmallVectorImpl &Results, + SelectionDAG &DAG) const { + SDValue Res = LowerOperation(SDValue(N, 0), DAG); + + if (!Res.getNode()) + return; + + assert((N->getNumValues() <= Res->getNumValues()) && + "Lowering returned the wrong number of results!"); + + // Places new result values base on N result number. + // In some cases (LowerSINT_TO_FP for example) Res has more result values + // than original node, chain should be dropped(last value). + for (unsigned I = 0, E = N->getNumValues(); I != E; ++I) + Results.push_back(Res.getValue(I)); +} + /// ReplaceNodeResults - Replace a node with an illegal result type /// with a new node built out of custom code. void X86TargetLowering::ReplaceNodeResults(SDNode *N, diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 182028657b6..def395896b1 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -680,6 +680,14 @@ namespace llvm { /// SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + /// Places new result values for the node in Results (their number + /// and types must exactly match those of the original return values of + /// the node), or leaves Results empty, which indicates that the node is not + /// to be custom lowered after all. + virtual void LowerOperationWrapper(SDNode *N, + SmallVectorImpl &Results, + SelectionDAG &DAG) const override; + /// Replace the results of node with an illegal result /// type with new values built out of custom code. /// diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 0b57d581fb4..96f41695ff1 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -2752,14 +2752,6 @@ defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512> avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info, HasAVX512>, XS, VEX_W, EVEX_CD8<64, CD8VF>; -def: Pat<(v16i32 (int_x86_avx512_mask_loadu_d_512 addr:$ptr, - (v16i32 immAllZerosV), GR16:$mask)), - (VMOVDQU32Zrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>; - -def: Pat<(v8i64 (int_x86_avx512_mask_loadu_q_512 addr:$ptr, - (bc_v8i64 (v16i32 immAllZerosV)), GR8:$mask)), - (VMOVDQU64Zrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>; - let AddedComplexity = 20 in { def : Pat<(v8i64 (vselect VK8WM:$mask, (bc_v8i64 (v16i32 immAllZerosV)), (v8i64 VR512:$src))), diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index 17e1fee4c7c..0adb49b2a52 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -156,12 +156,24 @@ static const IntrinsicData IntrinsicsWithChain[] = { X86_INTRINSIC_DATA(avx512_mask_load_q_128, LOADA, ISD::DELETED_NODE, 0), X86_INTRINSIC_DATA(avx512_mask_load_q_256, LOADA, ISD::DELETED_NODE, 0), X86_INTRINSIC_DATA(avx512_mask_load_q_512, LOADA, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_loadu_b_128, LOADU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_loadu_b_256, LOADU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_loadu_b_512, LOADU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_loadu_d_128, LOADU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_loadu_d_256, LOADU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_loadu_d_512, LOADU, ISD::DELETED_NODE, 0), X86_INTRINSIC_DATA(avx512_mask_loadu_pd_128, LOADU, ISD::DELETED_NODE, 0), X86_INTRINSIC_DATA(avx512_mask_loadu_pd_256, LOADU, ISD::DELETED_NODE, 0), X86_INTRINSIC_DATA(avx512_mask_loadu_pd_512, LOADU, ISD::DELETED_NODE, 0), X86_INTRINSIC_DATA(avx512_mask_loadu_ps_128, LOADU, ISD::DELETED_NODE, 0), X86_INTRINSIC_DATA(avx512_mask_loadu_ps_256, LOADU, ISD::DELETED_NODE, 0), X86_INTRINSIC_DATA(avx512_mask_loadu_ps_512, LOADU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_loadu_q_128, LOADU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_loadu_q_256, LOADU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_loadu_q_512, LOADU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_loadu_w_128, LOADU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_loadu_w_256, LOADU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_loadu_w_512, LOADU, ISD::DELETED_NODE, 0), X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8, X86ISD::VTRUNC, 0), X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_256, TRUNCATE_TO_MEM_VI8, diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll index dab0f48863b..5692401241d 100644 --- a/test/CodeGen/X86/avx512-intrinsics.ll +++ b/test/CodeGen/X86/avx512-intrinsics.ll @@ -6603,6 +6603,42 @@ define <8 x i64>@test_int_x86_avx512_mask_prorv_q_512(<8 x i64> %x0, <8 x i64> % ret <8 x i64> %res4 } +declare <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8*, <16 x i32>, i16) + +define <16 x i32> @test_mask_load_unaligned_d(i8* %ptr, i8* %ptr2, <16 x i32> %data, i16 %mask) { +; CHECK-LABEL: test_mask_load_unaligned_d: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edx, %k1 +; CHECK-NEXT: vmovdqu32 (%rdi), %zmm0 +; CHECK-NEXT: vmovdqu32 (%rsi), %zmm0 {%k1} +; CHECK-NEXT: vmovdqu32 (%rdi), %zmm1 {%k1} {z} +; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8* %ptr, <16 x i32> zeroinitializer, i16 -1) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8* %ptr2, <16 x i32> %res, i16 %mask) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8* %ptr, <16 x i32> zeroinitializer, i16 %mask) + %res4 = add <16 x i32> %res2, %res1 + ret <16 x i32> %res4 +} + +declare <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8*, <8 x i64>, i8) + +define <8 x i64> @test_mask_load_unaligned_q(i8* %ptr, i8* %ptr2, <8 x i64> %data, i8 %mask) { +; CHECK-LABEL: test_mask_load_unaligned_q: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edx, %k1 +; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0 +; CHECK-NEXT: vmovdqu64 (%rsi), %zmm0 {%k1} +; CHECK-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1} {z} +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 -1) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr2, <8 x i64> %res, i8 %mask) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 %mask) + %res4 = add <8 x i64> %res2, %res1 + ret <8 x i64> %res4 +} + declare <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32>, i8, <16 x i32>, i16) define <16 x i32>@test_int_x86_avx512_mask_prol_d_512(<16 x i32> %x0, i8 %x1, <16 x i32> %x2, i16 %x3) { diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll index a7c005ba9a1..3e7c0d949f2 100644 --- a/test/CodeGen/X86/avx512bw-intrinsics.ll +++ b/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -3087,6 +3087,66 @@ define <32 x i16>@test_int_x86_avx512_mask_psllv32hi(<32 x i16> %x0, <32 x i16> ret <32 x i16> %res4 } +declare <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8*, <32 x i16>, i32) + +define <32 x i16>@test_int_x86_avx512_mask_loadu_w_512(i8* %ptr, i8* %ptr2, <32 x i16> %x1, i32 %mask) { +; AVX512BW-LABEL: test_int_x86_avx512_mask_loadu_w_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edx, %k1 +; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqu16 (%rsi), %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm1 {%k1} {z} +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_loadu_w_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX512F-32-NEXT: vmovdqu16 (%ecx), %zmm0 +; AVX512F-32-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} +; AVX512F-32-NEXT: vmovdqu16 (%ecx), %zmm1 {%k1} {z} +; AVX512F-32-NEXT: vpaddw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %res0 = call <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8* %ptr, <32 x i16> %x1, i32 -1) + %res = call <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8* %ptr2, <32 x i16> %res0, i32 %mask) + %res1 = call <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8* %ptr, <32 x i16> zeroinitializer, i32 %mask) + %res2 = add <32 x i16> %res, %res1 + ret <32 x i16> %res2 +} + +declare <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(i8*, <64 x i8>, i64) + +define <64 x i8>@test_int_x86_avx512_mask_loadu_b_512(i8* %ptr, i8* %ptr2, <64 x i8> %x1, i64 %mask) { +; AVX512BW-LABEL: test_int_x86_avx512_mask_loadu_b_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovq %rdx, %k1 +; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm1 {%k1} {z} +; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_loadu_b_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX512F-32-NEXT: vmovdqu8 (%ecx), %zmm0 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1 +; AVX512F-32-NEXT: vmovdqu8 (%eax), %zmm0 {%k1} +; AVX512F-32-NEXT: vmovdqu8 (%ecx), %zmm1 {%k1} {z} +; AVX512F-32-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %res0 = call <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(i8* %ptr, <64 x i8> %x1, i64 -1) + %res = call <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(i8* %ptr2, <64 x i8> %res0, i64 %mask) + %res1 = call <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(i8* %ptr, <64 x i8> zeroinitializer, i64 %mask) + %res2 = add <64 x i8> %res, %res1 + ret <64 x i8> %res2 +} + declare <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8>, <32 x i16>, i32) define <32 x i16>@test_int_x86_avx512_mask_pmovzxb_w_512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2) { @@ -3244,4 +3304,3 @@ define <64 x i8>@test_int_x86_avx512_mask_movu_b_512(<64 x i8> %x0, <64 x i8> %x %res2 = add <64 x i8> %res, %res1 ret <64 x i8> %res2 } - diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/test/CodeGen/X86/avx512bwvl-intrinsics.ll index 654c4e96014..4c624e547b5 100644 --- a/test/CodeGen/X86/avx512bwvl-intrinsics.ll +++ b/test/CodeGen/X86/avx512bwvl-intrinsics.ll @@ -4947,6 +4947,78 @@ define <8 x i16>@test_int_x86_avx512_mask_psllv8_hi(<8 x i16> %x0, <8 x i16> %x1 ret <8 x i16> %res4 } +declare <8 x i16> @llvm.x86.avx512.mask.loadu.w.128(i8*, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_loadu_w_128(i8* %ptr, i8* %ptr2, <8 x i16> %x1, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_loadu_w_128: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edx, %k1 +; CHECK-NEXT: vmovdqu16 (%rdi), %xmm0 +; CHECK-NEXT: vmovdqu16 (%rsi), %xmm0 {%k1} +; CHECK-NEXT: vmovdqu16 (%rdi), %xmm1 {%k1} {z} +; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res0 = call <8 x i16> @llvm.x86.avx512.mask.loadu.w.128(i8* %ptr, <8 x i16> %x1, i8 -1) + %res = call <8 x i16> @llvm.x86.avx512.mask.loadu.w.128(i8* %ptr2, <8 x i16> %res0, i8 %mask) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.loadu.w.128(i8* %ptr, <8 x i16> zeroinitializer, i8 %mask) + %res2 = add <8 x i16> %res, %res1 + ret <8 x i16> %res2 +} + +declare <16 x i16> @llvm.x86.avx512.mask.loadu.w.256(i8*, <16 x i16>, i16) + +define <16 x i16>@test_int_x86_avx512_mask_loadu_w_256(i8* %ptr, i8* %ptr2, <16 x i16> %x1, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_loadu_w_256: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edx, %k1 +; CHECK-NEXT: vmovdqu16 (%rdi), %ymm0 +; CHECK-NEXT: vmovdqu16 (%rsi), %ymm0 {%k1} +; CHECK-NEXT: vmovdqu16 (%rdi), %ymm1 {%k1} {z} +; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %res0 = call <16 x i16> @llvm.x86.avx512.mask.loadu.w.256(i8* %ptr, <16 x i16> %x1, i16 -1) + %res = call <16 x i16> @llvm.x86.avx512.mask.loadu.w.256(i8* %ptr2, <16 x i16> %res0, i16 %mask) + %res1 = call <16 x i16> @llvm.x86.avx512.mask.loadu.w.256(i8* %ptr, <16 x i16> zeroinitializer, i16 %mask) + %res2 = add <16 x i16> %res, %res1 + ret <16 x i16> %res2 +} + +declare <16 x i8> @llvm.x86.avx512.mask.loadu.b.128(i8*, <16 x i8>, i16) + +define <16 x i8>@test_int_x86_avx512_mask_loadu_b_128(i8* %ptr, i8* %ptr2, <16 x i8> %x1, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_loadu_b_128: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edx, %k1 +; CHECK-NEXT: vmovdqu8 (%rdi), %xmm0 +; CHECK-NEXT: vmovdqu8 (%rsi), %xmm0 {%k1} +; CHECK-NEXT: vmovdqu8 (%rdi), %xmm1 {%k1} {z} +; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res0 = call <16 x i8> @llvm.x86.avx512.mask.loadu.b.128(i8* %ptr, <16 x i8> %x1, i16 -1) + %res = call <16 x i8> @llvm.x86.avx512.mask.loadu.b.128(i8* %ptr2, <16 x i8> %res0, i16 %mask) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.loadu.b.128(i8* %ptr, <16 x i8> zeroinitializer, i16 %mask) + %res2 = add <16 x i8> %res, %res1 + ret <16 x i8> %res2 +} + +declare <32 x i8> @llvm.x86.avx512.mask.loadu.b.256(i8*, <32 x i8>, i32) + +define <32 x i8>@test_int_x86_avx512_mask_loadu_b_256(i8* %ptr, i8* %ptr2, <32 x i8> %x1, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_loadu_b_256: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovd %edx, %k1 +; CHECK-NEXT: vmovdqu8 (%rdi), %ymm0 +; CHECK-NEXT: vmovdqu8 (%rsi), %ymm0 {%k1} +; CHECK-NEXT: vmovdqu8 (%rdi), %ymm1 {%k1} {z} +; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %res0 = call <32 x i8> @llvm.x86.avx512.mask.loadu.b.256(i8* %ptr, <32 x i8> %x1, i32 -1) + %res = call <32 x i8> @llvm.x86.avx512.mask.loadu.b.256(i8* %ptr2, <32 x i8> %res0, i32 %mask) + %res1 = call <32 x i8> @llvm.x86.avx512.mask.loadu.b.256(i8* %ptr, <32 x i8> zeroinitializer, i32 %mask) + %res2 = add <32 x i8> %res, %res1 + ret <32 x i8> %res2 +} + declare <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8>, <8 x i16>, i8) define <8 x i16>@test_int_x86_avx512_mask_pmovzxb_w_128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2) { diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll index d351647fcec..3955f31dc89 100644 --- a/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -6672,6 +6672,79 @@ define <4 x i64>@test_int_x86_avx512_mask_prorv_q_256(<4 x i64> %x0, <4 x i64> % %res4 = add <4 x i64> %res3, %res2 ret <4 x i64> %res4 } + +declare <4 x i32> @llvm.x86.avx512.mask.loadu.d.128(i8*, <4 x i32>, i8) + +define <4 x i32> @test_mask_load_unaligned_d_128(i8* %ptr, i8* %ptr2, <4 x i32> %data, i8 %mask) { +; CHECK-LABEL: test_mask_load_unaligned_d_128: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edx, %k1 +; CHECK-NEXT: vmovdqu32 (%rdi), %xmm0 +; CHECK-NEXT: vmovdqu32 (%rsi), %xmm0 {%k1} +; CHECK-NEXT: vmovdqu32 (%rdi), %xmm1 {%k1} {z} +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512.mask.loadu.d.128(i8* %ptr, <4 x i32> zeroinitializer, i8 -1) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.loadu.d.128(i8* %ptr2, <4 x i32> %res, i8 %mask) + %res2 = call <4 x i32> @llvm.x86.avx512.mask.loadu.d.128(i8* %ptr, <4 x i32> zeroinitializer, i8 %mask) + %res4 = add <4 x i32> %res2, %res1 + ret <4 x i32> %res4 +} + +declare <8 x i32> @llvm.x86.avx512.mask.loadu.d.256(i8*, <8 x i32>, i8) + +define <8 x i32> @test_mask_load_unaligned_d_256(i8* %ptr, i8* %ptr2, <8 x i32> %data, i8 %mask) { +; CHECK-LABEL: test_mask_load_unaligned_d_256: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edx, %k1 +; CHECK-NEXT: vmovdqu32 (%rdi), %ymm0 +; CHECK-NEXT: vmovdqu32 (%rsi), %ymm0 {%k1} +; CHECK-NEXT: vmovdqu32 (%rdi), %ymm1 {%k1} {z} +; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x i32> @llvm.x86.avx512.mask.loadu.d.256(i8* %ptr, <8 x i32> zeroinitializer, i8 -1) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.loadu.d.256(i8* %ptr2, <8 x i32> %res, i8 %mask) + %res2 = call <8 x i32> @llvm.x86.avx512.mask.loadu.d.256(i8* %ptr, <8 x i32> zeroinitializer, i8 %mask) + %res4 = add <8 x i32> %res2, %res1 + ret <8 x i32> %res4 +} + +declare <2 x i64> @llvm.x86.avx512.mask.loadu.q.128(i8*, <2 x i64>, i8) + +define <2 x i64> @test_mask_load_unaligned_q_128(i8* %ptr, i8* %ptr2, <2 x i64> %data, i8 %mask) { +; CHECK-LABEL: test_mask_load_unaligned_q_128: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edx, %k1 +; CHECK-NEXT: vmovdqu64 (%rdi), %xmm0 +; CHECK-NEXT: vmovdqu64 (%rsi), %xmm0 {%k1} +; CHECK-NEXT: vmovdqu64 (%rdi), %xmm1 {%k1} {z} +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x i64> @llvm.x86.avx512.mask.loadu.q.128(i8* %ptr, <2 x i64> zeroinitializer, i8 -1) + %res1 = call <2 x i64> @llvm.x86.avx512.mask.loadu.q.128(i8* %ptr2, <2 x i64> %res, i8 %mask) + %res2 = call <2 x i64> @llvm.x86.avx512.mask.loadu.q.128(i8* %ptr, <2 x i64> zeroinitializer, i8 %mask) + %res4 = add <2 x i64> %res2, %res1 + ret <2 x i64> %res4 +} + +declare <4 x i64> @llvm.x86.avx512.mask.loadu.q.256(i8*, <4 x i64>, i8) + +define <4 x i64> @test_mask_load_unaligned_q_256(i8* %ptr, i8* %ptr2, <4 x i64> %data, i8 %mask) { +; CHECK-LABEL: test_mask_load_unaligned_q_256: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edx, %k1 +; CHECK-NEXT: vmovdqu64 (%rdi), %ymm0 +; CHECK-NEXT: vmovdqu64 (%rsi), %ymm0 {%k1} +; CHECK-NEXT: vmovdqu64 (%rdi), %ymm1 {%k1} {z} +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x i64> @llvm.x86.avx512.mask.loadu.q.256(i8* %ptr, <4 x i64> zeroinitializer, i8 -1) + %res1 = call <4 x i64> @llvm.x86.avx512.mask.loadu.q.256(i8* %ptr2, <4 x i64> %res, i8 %mask) + %res2 = call <4 x i64> @llvm.x86.avx512.mask.loadu.q.256(i8* %ptr, <4 x i64> zeroinitializer, i8 %mask) + %res4 = add <4 x i64> %res2, %res1 + ret <4 x i64> %res4 +} + declare <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32>, i8, <4 x i32>, i8) define <4 x i32>@test_int_x86_avx512_mask_prol_d_128(<4 x i32> %x0, i8 %x1, <4 x i32> %x2, i8 %x3) {