Revert r271362 "[AVX512] Remove masked load intrinsics. Clang now emits generic masked load intrinsics instead."

Looks like something isn't quite right still. Also forgot to move the test cases to an autoupgrade test.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@271363 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Craig Topper 2016-06-01 05:57:55 +00:00
parent c1bee8aab0
commit 323e9f1870
5 changed files with 142 additions and 70 deletions

View File

@ -1935,6 +1935,46 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx_maskload_ps_256 : GCCBuiltin<"__builtin_ia32_maskloadps256">,
Intrinsic<[llvm_v8f32_ty], [llvm_ptr_ty, llvm_v8i32_ty],
[IntrReadMem, IntrArgMemOnly]>;
def int_x86_avx512_mask_loadu_ps_128 :
Intrinsic<[llvm_v4f32_ty],
[llvm_ptr_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_x86_avx512_mask_loadu_ps_256 :
Intrinsic<[llvm_v8f32_ty],
[llvm_ptr_ty, llvm_v8f32_ty, llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_x86_avx512_mask_loadu_ps_512 :
Intrinsic<[llvm_v16f32_ty],
[llvm_ptr_ty, llvm_v16f32_ty, llvm_i16_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_x86_avx512_mask_loadu_pd_128 :
Intrinsic<[llvm_v2f64_ty],
[llvm_ptr_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_x86_avx512_mask_loadu_pd_256 :
Intrinsic<[llvm_v4f64_ty],
[llvm_ptr_ty, llvm_v4f64_ty, llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_x86_avx512_mask_loadu_pd_512 :
Intrinsic<[llvm_v8f64_ty],
[llvm_ptr_ty, llvm_v8f64_ty, llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_x86_avx512_mask_load_ps_128 :
Intrinsic<[llvm_v4f32_ty],
[llvm_ptr_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_x86_avx512_mask_load_ps_256 :
Intrinsic<[llvm_v8f32_ty],
[llvm_ptr_ty, llvm_v8f32_ty, llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_x86_avx512_mask_load_ps_512 :
Intrinsic<[llvm_v16f32_ty],
[llvm_ptr_ty, llvm_v16f32_ty, llvm_i16_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_x86_avx512_mask_load_pd_128 :
Intrinsic<[llvm_v2f64_ty],
[llvm_ptr_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_x86_avx512_mask_load_pd_256 :
Intrinsic<[llvm_v4f64_ty],
[llvm_ptr_ty, llvm_v4f64_ty, llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_x86_avx512_mask_load_pd_512 :
Intrinsic<[llvm_v8f64_ty],
[llvm_ptr_ty, llvm_v8f64_ty, llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
}
// Conditional move ops
@ -2931,6 +2971,66 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx2_maskload_q_256 : GCCBuiltin<"__builtin_ia32_maskloadq256">,
Intrinsic<[llvm_v4i64_ty], [llvm_ptr_ty, llvm_v4i64_ty],
[IntrReadMem, IntrArgMemOnly]>;
def int_x86_avx512_mask_loadu_b_128 :
Intrinsic<[llvm_v16i8_ty],
[llvm_ptr_ty, llvm_v16i8_ty, llvm_i16_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_x86_avx512_mask_loadu_b_256 :
Intrinsic<[llvm_v32i8_ty],
[llvm_ptr_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_x86_avx512_mask_loadu_b_512 :
Intrinsic<[llvm_v64i8_ty],
[llvm_ptr_ty, llvm_v64i8_ty, llvm_i64_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_x86_avx512_mask_loadu_w_128 :
Intrinsic<[llvm_v8i16_ty],
[llvm_ptr_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_x86_avx512_mask_loadu_w_256 :
Intrinsic<[llvm_v16i16_ty],
[llvm_ptr_ty, llvm_v16i16_ty, llvm_i16_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_x86_avx512_mask_loadu_w_512 :
Intrinsic<[llvm_v32i16_ty],
[llvm_ptr_ty, llvm_v32i16_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_x86_avx512_mask_loadu_d_128 :
Intrinsic<[llvm_v4i32_ty],
[llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_x86_avx512_mask_loadu_d_256 :
Intrinsic<[llvm_v8i32_ty],
[llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_x86_avx512_mask_loadu_d_512 :
Intrinsic<[llvm_v16i32_ty],
[llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_x86_avx512_mask_loadu_q_128 :
Intrinsic<[llvm_v2i64_ty],
[llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_x86_avx512_mask_loadu_q_256 :
Intrinsic<[llvm_v4i64_ty],
[llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_x86_avx512_mask_loadu_q_512 :
Intrinsic<[llvm_v8i64_ty],
[llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_x86_avx512_mask_load_d_128 :
Intrinsic<[llvm_v4i32_ty],
[llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_x86_avx512_mask_load_d_256 :
Intrinsic<[llvm_v8i32_ty],
[llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_x86_avx512_mask_load_d_512 :
Intrinsic<[llvm_v16i32_ty],
[llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_x86_avx512_mask_load_q_128 :
Intrinsic<[llvm_v2i64_ty],
[llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_x86_avx512_mask_load_q_256 :
Intrinsic<[llvm_v4i64_ty],
[llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_x86_avx512_mask_load_q_512 :
Intrinsic<[llvm_v8i64_ty],
[llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
}
// Conditional store ops

View File

@ -204,16 +204,6 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
Name.startswith("x86.avx512.mask.store.w.") ||
Name.startswith("x86.avx512.mask.store.d.") ||
Name.startswith("x86.avx512.mask.store.q.") ||
Name.startswith("x86.avx512.mask.loadu.p") ||
Name.startswith("x86.avx512.mask.loadu.b.") ||
Name.startswith("x86.avx512.mask.loadu.w.") ||
Name.startswith("x86.avx512.mask.loadu.d.") ||
Name.startswith("x86.avx512.mask.loadu.q.") ||
Name.startswith("x86.avx512.mask.load.p") ||
Name.startswith("x86.avx512.mask.load.b.") ||
Name.startswith("x86.avx512.mask.load.w.") ||
Name.startswith("x86.avx512.mask.load.d.") ||
Name.startswith("x86.avx512.mask.load.q.") ||
Name == "x86.sse42.crc32.64.8" ||
Name.startswith("x86.avx.vbroadcast.s") ||
Name.startswith("x86.sse2.psll.dq") ||
@ -405,47 +395,13 @@ static Value *UpgradeMaskedStore(IRBuilder<> &Builder, LLVMContext &C,
for (unsigned i = 0; i != NumElts; ++i)
Indices[i] = i;
Mask = Builder.CreateShuffleVector(Mask, Mask,
makeArrayRef(Indices, NumElts),
"extract");
makeArrayRef(Indices, NumElts),
"extract");
}
return Builder.CreateMaskedStore(Data, Ptr, Align, Mask);
}
static Value *UpgradeMaskedLoad(IRBuilder<> &Builder, LLVMContext &C,
Value *Ptr, Value *Passthru, Value *Mask,
bool Aligned) {
// Cast the pointer to the right type.
Ptr = Builder.CreateBitCast(Ptr,
llvm::PointerType::getUnqual(Passthru->getType()));
unsigned Align =
Aligned ? cast<VectorType>(Passthru->getType())->getBitWidth() / 8 : 1;
// If the mask is all ones just emit a regular store.
if (const auto *C = dyn_cast<Constant>(Mask))
if (C->isAllOnesValue())
return Builder.CreateAlignedLoad(Ptr, Align);
// Convert the mask from an integer type to a vector of i1.
unsigned NumElts = Passthru->getType()->getVectorNumElements();
llvm::VectorType *MaskTy = llvm::VectorType::get(Builder.getInt1Ty(),
cast<IntegerType>(Mask->getType())->getBitWidth());
Mask = Builder.CreateBitCast(Mask, MaskTy);
// If we have less than 8 elements, then the starting mask was an i8 and
// we need to extract down to the right number of elements.
if (NumElts < 8) {
int Indices[4];
for (unsigned i = 0; i != NumElts; ++i)
Indices[i] = i;
Mask = Builder.CreateShuffleVector(Mask, Mask,
makeArrayRef(Indices, NumElts),
"extract");
}
return Builder.CreateMaskedLoad(Ptr, Align, Mask, Passthru);
}
// UpgradeIntrinsicCall - Upgrade a call to an old intrinsic to be a call the
// upgraded intrinsic. All argument and return casting must be provided in
// order to seamlessly integrate with existing context.
@ -569,22 +525,6 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
// Remove intrinsic.
CI->eraseFromParent();
return;
} else if (Name.startswith("llvm.x86.avx512.mask.loadu.p") ||
Name.startswith("llvm.x86.avx512.mask.loadu.b.") ||
Name.startswith("llvm.x86.avx512.mask.loadu.w.") ||
Name.startswith("llvm.x86.avx512.mask.loadu.d.") ||
Name.startswith("llvm.x86.avx512.mask.loadu.q.")) {
Rep = UpgradeMaskedLoad(Builder, C, CI->getArgOperand(0),
CI->getArgOperand(1), CI->getArgOperand(2),
/*Aligned*/false);
} else if (Name.startswith("llvm.x86.avx512.mask.load.p") ||
Name.startswith("llvm.x86.avx512.mask.load.b.") ||
Name.startswith("llvm.x86.avx512.mask.load.w.") ||
Name.startswith("llvm.x86.avx512.mask.load.d.") ||
Name.startswith("llvm.x86.avx512.mask.load.q.")) {
Rep = UpgradeMaskedStore(Builder, C, CI->getArgOperand(0),
CI->getArgOperand(1),CI->getArgOperand(2),
/*Aligned*/true);
} else if (Name.startswith("llvm.x86.xop.vpcom")) {
Intrinsic::ID intID;
if (Name.endswith("ub"))

View File

@ -148,6 +148,36 @@ static const IntrinsicData IntrinsicsWithChain[] = {
EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
X86_INTRINSIC_DATA(avx512_mask_expand_load_q_512,
EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
X86_INTRINSIC_DATA(avx512_mask_load_d_128, LOADA, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_load_d_256, LOADA, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_load_d_512, LOADA, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_load_pd_128, LOADA, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_load_pd_256, LOADA, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_load_pd_512, LOADA, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_load_ps_128, LOADA, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_load_ps_256, LOADA, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_load_ps_512, LOADA, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_load_q_128, LOADA, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_load_q_256, LOADA, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_load_q_512, LOADA, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_b_128, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_b_256, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_b_512, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_d_128, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_d_256, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_d_512, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_pd_128, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_pd_256, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_pd_512, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_ps_128, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_ps_256, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_ps_512, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_q_128, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_q_256, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_q_512, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_w_128, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_w_256, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_w_512, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8,
X86ISD::VTRUNC, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_256, TRUNCATE_TO_MEM_VI8,

View File

@ -3202,8 +3202,8 @@ declare <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8*, <32 x i16>, i32)
define <32 x i16>@test_int_x86_avx512_mask_loadu_w_512(i8* %ptr, i8* %ptr2, <32 x i16> %x1, i32 %mask) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_loadu_w_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
; AVX512BW-NEXT: kmovd %edx, %k1
; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
; AVX512BW-NEXT: vmovdqu16 (%rsi), %zmm0 {%k1}
; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm1 {%k1} {z}
; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
@ -3212,9 +3212,9 @@ define <32 x i16>@test_int_x86_avx512_mask_loadu_w_512(i8* %ptr, i8* %ptr2, <32
; AVX512F-32-LABEL: test_int_x86_avx512_mask_loadu_w_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; AVX512F-32-NEXT: vmovdqu16 (%ecx), %zmm0
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vmovdqu16 (%eax), %zmm0 {%k1}
; AVX512F-32-NEXT: vmovdqu16 (%ecx), %zmm1 {%k1} {z}
; AVX512F-32-NEXT: vpaddw %zmm1, %zmm0, %zmm0
@ -3231,8 +3231,8 @@ declare <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(i8*, <64 x i8>, i64)
define <64 x i8>@test_int_x86_avx512_mask_loadu_b_512(i8* %ptr, i8* %ptr2, <64 x i8> %x1, i64 %mask) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_loadu_b_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0
; AVX512BW-NEXT: kmovq %rdx, %k1
; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0
; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0 {%k1}
; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm1 {%k1} {z}
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
@ -3243,7 +3243,9 @@ define <64 x i8>@test_int_x86_avx512_mask_loadu_b_512(i8* %ptr, i8* %ptr2, <64 x
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; AVX512F-32-NEXT: vmovdqu8 (%ecx), %zmm0
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
; AVX512F-32-NEXT: vmovdqu8 (%eax), %zmm0 {%k1}
; AVX512F-32-NEXT: vmovdqu8 (%ecx), %zmm1 {%k1} {z}
; AVX512F-32-NEXT: vpaddb %zmm1, %zmm0, %zmm0

View File

@ -6230,8 +6230,8 @@ declare <8 x i16> @llvm.x86.avx512.mask.loadu.w.128(i8*, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_loadu_w_128(i8* %ptr, i8* %ptr2, <8 x i16> %x1, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_loadu_w_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vmovdqu16 (%rdi), %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0x6f,0x07]
; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
; CHECK-NEXT: vmovdqu16 (%rdi), %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0x6f,0x07]
; CHECK-NEXT: vmovdqu16 (%rsi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x6f,0x06]
; CHECK-NEXT: vmovdqu16 (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x6f,0x0f]
; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfd,0xc1]
@ -6248,8 +6248,8 @@ declare <16 x i16> @llvm.x86.avx512.mask.loadu.w.256(i8*, <16 x i16>, i16)
define <16 x i16>@test_int_x86_avx512_mask_loadu_w_256(i8* %ptr, i8* %ptr2, <16 x i16> %x1, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_loadu_w_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vmovdqu16 (%rdi), %ymm0 ## encoding: [0x62,0xf1,0xff,0x28,0x6f,0x07]
; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
; CHECK-NEXT: vmovdqu16 (%rdi), %ymm0 ## encoding: [0x62,0xf1,0xff,0x28,0x6f,0x07]
; CHECK-NEXT: vmovdqu16 (%rsi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x6f,0x06]
; CHECK-NEXT: vmovdqu16 (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xa9,0x6f,0x0f]
; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfd,0xc1]
@ -6266,8 +6266,8 @@ declare <16 x i8> @llvm.x86.avx512.mask.loadu.b.128(i8*, <16 x i8>, i16)
define <16 x i8>@test_int_x86_avx512_mask_loadu_b_128(i8* %ptr, i8* %ptr2, <16 x i8> %x1, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_loadu_b_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vmovdqu8 (%rdi), %xmm0 ## encoding: [0x62,0xf1,0x7f,0x08,0x6f,0x07]
; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
; CHECK-NEXT: vmovdqu8 (%rdi), %xmm0 ## encoding: [0x62,0xf1,0x7f,0x08,0x6f,0x07]
; CHECK-NEXT: vmovdqu8 (%rsi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7f,0x09,0x6f,0x06]
; CHECK-NEXT: vmovdqu8 (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0x89,0x6f,0x0f]
; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfc,0xc1]
@ -6284,8 +6284,8 @@ declare <32 x i8> @llvm.x86.avx512.mask.loadu.b.256(i8*, <32 x i8>, i32)
define <32 x i8>@test_int_x86_avx512_mask_loadu_b_256(i8* %ptr, i8* %ptr2, <32 x i8> %x1, i32 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_loadu_b_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vmovdqu8 (%rdi), %ymm0 ## encoding: [0x62,0xf1,0x7f,0x28,0x6f,0x07]
; CHECK-NEXT: kmovd %edx, %k1 ## encoding: [0xc5,0xfb,0x92,0xca]
; CHECK-NEXT: vmovdqu8 (%rdi), %ymm0 ## encoding: [0x62,0xf1,0x7f,0x28,0x6f,0x07]
; CHECK-NEXT: vmovdqu8 (%rsi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7f,0x29,0x6f,0x06]
; CHECK-NEXT: vmovdqu8 (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0xa9,0x6f,0x0f]
; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfc,0xc1]