diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
index e6d85e86798..8f64b5d8ee9 100644
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -1321,6 +1321,12 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_avx_maskload_ps_256 : GCCBuiltin<"__builtin_ia32_maskloadps256">,
         Intrinsic<[llvm_v8f32_ty], [llvm_ptr_ty, llvm_v8f32_ty],
                   [IntrReadArgMem]>;
+  def int_x86_avx512_mask_loadu_ps_512 : GCCBuiltin<"__builtin_ia32_loadups512_mask">,
+        Intrinsic<[llvm_v16f32_ty], [llvm_ptr_ty, llvm_v16f32_ty, llvm_i16_ty],
+                  [IntrReadArgMem]>;
+  def int_x86_avx512_mask_loadu_pd_512 : GCCBuiltin<"__builtin_ia32_loadupd512_mask">,
+        Intrinsic<[llvm_v8f64_ty], [llvm_ptr_ty, llvm_v8f64_ty, llvm_i8_ty],
+                  [IntrReadArgMem]>;
 }
 
 // Conditional store ops
@@ -1339,6 +1345,14 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
         GCCBuiltin<"__builtin_ia32_maskstoreps256">,
         Intrinsic<[], [llvm_ptr_ty, llvm_v8f32_ty, llvm_v8f32_ty],
                   [IntrReadWriteArgMem]>;
+  def int_x86_avx512_mask_storeu_ps_512 :
+        GCCBuiltin<"__builtin_ia32_storeups512_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v16f32_ty, llvm_i16_ty],
+                  [IntrReadWriteArgMem]>;
+  def int_x86_avx512_mask_storeu_pd_512 :
+        GCCBuiltin<"__builtin_ia32_storeupd512_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v8f64_ty, llvm_i8_ty],
+                  [IntrReadWriteArgMem]>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1753,6 +1767,12 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_avx2_maskload_q_256 : GCCBuiltin<"__builtin_ia32_maskloadq256">,
     Intrinsic<[llvm_v4i64_ty], [llvm_ptr_ty, llvm_v4i64_ty],
               [IntrReadArgMem]>;
+  def int_x86_avx512_mask_loadu_d_512 : GCCBuiltin<"__builtin_ia32_loaddqusi512_mask">,
+        Intrinsic<[llvm_v16i32_ty], [llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty],
+                  [IntrReadArgMem]>;
+  def int_x86_avx512_mask_loadu_q_512 : GCCBuiltin<"__builtin_ia32_loaddqudi512_mask">,
+        Intrinsic<[llvm_v8i64_ty], [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty],
+                  [IntrReadArgMem]>;
 }
 
 // Conditional store ops
@@ -1771,6 +1791,14 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
         GCCBuiltin<"__builtin_ia32_maskstoreq256">,
         Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_v4i64_ty],
                   [IntrReadWriteArgMem]>;
+  def int_x86_avx512_mask_storeu_d_512 :
+        GCCBuiltin<"__builtin_ia32_storedqusi512_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty],
+                  [IntrReadWriteArgMem]>;
+  def int_x86_avx512_mask_storeu_q_512 :
+        GCCBuiltin<"__builtin_ia32_storedqudi512_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty],
+                  [IntrReadWriteArgMem]>;
 }
 
 // Variable bit shift ops
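Note on usage: all eight new intrinsics share one shape: an i8* address, a
vector operand (the pass-through value for loads, the data for stores), and a
scalar integer mask with one bit per 32- or 64-bit lane. A minimal IR sketch
of how they are called follows; the function and value names are illustrative
and not part of this patch:

    declare <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8*, <16 x float>, i16)
    declare void @llvm.x86.avx512.mask.storeu.ps.512(i8*, <16 x float>, i16)

    define void @copy16f_masked(i8* %src, i8* %dst, i16 %mask) {
      ; lanes whose mask bit is 0 take the pass-through value (zeroes here)
      %v = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %src, <16 x float> zeroinitializer, i16 %mask)
      ; lanes whose mask bit is 0 leave destination memory untouched
      call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %dst, <16 x float> %v, i16 %mask)
      ret void
    }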
GCCBuiltin<"__builtin_ia32_maskstoreq256">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_v4i64_ty], [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_storeu_d_512 : + GCCBuiltin<"__builtin_ia32_storedqusi512_mask">, + Intrinsic<[], [llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_storeu_q_512 : + GCCBuiltin<"__builtin_ia32_storedqudi512_mask">, + Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; } // Variable bit shift ops diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index a4ad207ec93..0cdf1527a4b 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -1222,152 +1222,139 @@ def : Pat<(v8i1 (X86vsrli VK8:$src, (i8 imm:$imm))), // AVX-512 - Aligned and unaligned load and store // -multiclass avx512_mov_packed opc, RegisterClass RC, RegisterClass KRC, +multiclass avx512_load opc, RegisterClass RC, RegisterClass KRC, X86MemOperand x86memop, PatFrag ld_frag, - string asm, Domain d, bit IsReMaterializable = 1> { -let hasSideEffects = 0 in + string asm, Domain d, + ValueType vt, bit IsReMaterializable = 1> { +let hasSideEffects = 0 in { def rr : AVX512PI, EVEX; -let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in + def rrkz : AVX512PI, EVEX, EVEX_KZ; + } + let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in def rm : AVX512PI, EVEX; -let Constraints = "$src1 = $dst" in { + [(set (vt RC:$dst), (ld_frag addr:$src))], d>, EVEX; + let Constraints = "$src1 = $dst", hasSideEffects = 0 in { def rrk : AVX512PI, EVEX, EVEX_K; + let mayLoad = 1 in def rmk : AVX512PI, EVEX, EVEX_K; -} + } + let mayLoad = 1 in + def rmkz : AVX512PI, EVEX, EVEX_KZ; } -defm VMOVAPSZ : avx512_mov_packed<0x28, VR512, VK16WM, f512mem, alignedloadv16f32, - "vmovaps", SSEPackedSingle>, +multiclass avx512_store opc, RegisterClass RC, RegisterClass KRC, + X86MemOperand x86memop, PatFrag store_frag, + string asm, Domain d, ValueType vt> { + let isAsmParserOnly = 1, hasSideEffects = 0 in { + def rr_alt : AVX512PI, + EVEX; + let Constraints = "$src1 = $dst" in + def alt_rrk : AVX512PI, + EVEX, EVEX_K; + def alt_rrkz : AVX512PI, EVEX, EVEX_KZ; + } + let mayStore = 1 in { + def mr : AVX512PI, EVEX; + def mrk : AVX512PI, EVEX, EVEX_K; + def mrkz : AVX512PI, EVEX, EVEX_KZ; + } +} + +defm VMOVAPSZ : avx512_load<0x28, VR512, VK16WM, f512mem, alignedloadv16f32, + "vmovaps", SSEPackedSingle, v16f32>, + avx512_store<0x29, VR512, VK16WM, f512mem, alignedstore512, + "vmovaps", SSEPackedSingle, v16f32>, PS, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VMOVAPDZ : avx512_mov_packed<0x28, VR512, VK8WM, f512mem, alignedloadv8f64, - "vmovapd", SSEPackedDouble>, +defm VMOVAPDZ : avx512_load<0x28, VR512, VK8WM, f512mem, alignedloadv8f64, + "vmovapd", SSEPackedDouble, v8f64>, + avx512_store<0x29, VR512, VK8WM, f512mem, alignedstore512, + "vmovapd", SSEPackedDouble, v8f64>, PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -defm VMOVUPSZ : avx512_mov_packed<0x10, VR512, VK16WM, f512mem, loadv16f32, - "vmovups", SSEPackedSingle>, +defm VMOVUPSZ : avx512_load<0x10, VR512, VK16WM, f512mem, loadv16f32, + "vmovups", SSEPackedSingle, v16f32>, + avx512_store<0x11, VR512, VK16WM, f512mem, store, + "vmovups", SSEPackedSingle, v16f32>, PS, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VMOVUPDZ : avx512_mov_packed<0x10, VR512, VK8WM, f512mem, loadv8f64, - "vmovupd", SSEPackedDouble, 0>, +defm VMOVUPDZ : avx512_load<0x10, VR512, VK8WM, f512mem, loadv8f64, + "vmovupd", SSEPackedDouble, v8f64, 0>, + avx512_store<0x11, 
 
-def VMOVAPSZmr : AVX512PI<0x29, MRMDestMem, (outs), (ins f512mem:$dst, VR512:$src),
-                    "vmovaps\t{$src, $dst|$dst, $src}",
-                    [(alignedstore512 (v16f32 VR512:$src), addr:$dst)],
-                    SSEPackedSingle>, EVEX, EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
-def VMOVAPDZmr : AVX512PI<0x29, MRMDestMem, (outs), (ins f512mem:$dst, VR512:$src),
-                    "vmovapd\t{$src, $dst|$dst, $src}",
-                    [(alignedstore512 (v8f64 VR512:$src), addr:$dst)],
-                    SSEPackedDouble>, EVEX, EVEX_V512,
-                    PD, VEX_W, EVEX_CD8<64, CD8VF>;
-def VMOVUPSZmr : AVX512PI<0x11, MRMDestMem, (outs), (ins f512mem:$dst, VR512:$src),
-                    "vmovups\t{$src, $dst|$dst, $src}",
-                    [(store (v16f32 VR512:$src), addr:$dst)],
-                    SSEPackedSingle>, EVEX, EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
-def VMOVUPDZmr : AVX512PI<0x11, MRMDestMem, (outs), (ins f512mem:$dst, VR512:$src),
-                    "vmovupd\t{$src, $dst|$dst, $src}",
-                    [(store (v8f64 VR512:$src), addr:$dst)],
-                    SSEPackedDouble>, EVEX, EVEX_V512,
-                    PD, VEX_W, EVEX_CD8<64, CD8VF>;
+def: Pat<(v8f64 (int_x86_avx512_mask_loadu_pd_512 addr:$ptr,
+                 (bc_v8f64 (v16i32 immAllZerosV)), GR8:$mask)),
+       (VMOVUPDZrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>;
 
-let hasSideEffects = 0 in {
-  def VMOVDQA32rr : AVX512BI<0x6F, MRMSrcReg, (outs VR512:$dst),
-                             (ins VR512:$src),
-                             "vmovdqa32\t{$src, $dst|$dst, $src}", []>,
-                             EVEX, EVEX_V512;
-  def VMOVDQA64rr : AVX512BI<0x6F, MRMSrcReg, (outs VR512:$dst),
-                             (ins VR512:$src),
-                             "vmovdqa64\t{$src, $dst|$dst, $src}", []>,
-                             EVEX, EVEX_V512, VEX_W;
-let mayStore = 1 in {
-  def VMOVDQA32mr : AVX512BI<0x7F, MRMDestMem, (outs),
-                             (ins i512mem:$dst, VR512:$src),
-                             "vmovdqa32\t{$src, $dst|$dst, $src}", []>,
-                             EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
-  def VMOVDQA64mr : AVX512BI<0x7F, MRMDestMem, (outs),
-                             (ins i512mem:$dst, VR512:$src),
-                             "vmovdqa64\t{$src, $dst|$dst, $src}", []>,
-                             EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-}
-let mayLoad = 1 in {
-def VMOVDQA32rm : AVX512BI<0x6F, MRMSrcMem, (outs VR512:$dst),
-                           (ins i512mem:$src),
-                           "vmovdqa32\t{$src, $dst|$dst, $src}", []>,
-                           EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
-def VMOVDQA64rm : AVX512BI<0x6F, MRMSrcMem, (outs VR512:$dst),
-                           (ins i512mem:$src),
-                           "vmovdqa64\t{$src, $dst|$dst, $src}", []>,
-                           EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-}
-}
+def: Pat<(v16f32 (int_x86_avx512_mask_loadu_ps_512 addr:$ptr,
+                  (bc_v16f32 (v16i32 immAllZerosV)), GR16:$mask)),
+       (VMOVUPSZrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>;
 
-// 512-bit aligned load/store
-def : Pat<(alignedloadv8i64 addr:$src), (VMOVDQA64rm addr:$src)>;
-def : Pat<(alignedloadv16i32 addr:$src), (VMOVDQA32rm addr:$src)>;
+def: Pat<(int_x86_avx512_mask_storeu_ps_512 addr:$ptr, (v16f32 VR512:$src),
+          GR16:$mask),
+       (VMOVUPSZmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)),
+        VR512:$src)>;
+def: Pat<(int_x86_avx512_mask_storeu_pd_512 addr:$ptr, (v8f64 VR512:$src),
+          GR8:$mask),
+       (VMOVUPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)),
+        VR512:$src)>;
 
-def : Pat<(alignedstore512 (v8i64 VR512:$src), addr:$dst),
-          (VMOVDQA64mr addr:$dst, VR512:$src)>;
-def : Pat<(alignedstore512 (v16i32 VR512:$src), addr:$dst),
-          (VMOVDQA32mr addr:$dst, VR512:$src)>;
-
-multiclass avx512_mov_int<bits<8> load_opc, bits<8> store_opc, string asm,
-                          RegisterClass RC, RegisterClass KRC,
-                          PatFrag ld_frag, X86MemOperand x86memop> {
-let hasSideEffects = 0 in
-  def rr : AVX512XSI<load_opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
-                     !strconcat(asm, " \t{$src, $dst|$dst, $src}"), []>, EVEX;
-let canFoldAsLoad = 1 in
-  def rm : AVX512XSI<load_opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
-                     !strconcat(asm, " \t{$src, $dst|$dst, $src}"),
-                     [(set RC:$dst, (ld_frag addr:$src))]>, EVEX;
-let mayStore = 1 in
-  def mr : AVX512XSI<store_opc, MRMDestMem, (outs),
-                     (ins x86memop:$dst, RC:$src),
-                     !strconcat(asm, " \t{$src, $dst|$dst, $src}"), []>, EVEX;
-let Constraints = "$src1 = $dst" in {
-  def rrk : AVX512XSI<load_opc, MRMSrcReg, (outs RC:$dst),
-                      (ins RC:$src1, KRC:$mask, RC:$src2),
-                      !strconcat(asm,
-                      " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), []>,
-                      EVEX, EVEX_K;
-  def rmk : AVX512XSI<load_opc, MRMSrcMem, (outs RC:$dst),
-                      (ins RC:$src1, KRC:$mask, x86memop:$src2),
-                      !strconcat(asm,
-                      " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), []>, EVEX, EVEX_K;
-}
-  def rrkz : AVX512XSI<load_opc, MRMSrcReg, (outs RC:$dst),
-                       (ins KRC:$mask, RC:$src),
-                       !strconcat(asm,
-                       " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), []>,
-                       EVEX, EVEX_KZ;
-}
-
-defm VMOVDQU32 : avx512_mov_int<0x6F, 0x7F, "vmovdqu32", VR512, VK16WM,
-                                memopv16i32, i512mem>,
-                                EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VMOVDQU64 : avx512_mov_int<0x6F, 0x7F, "vmovdqu64", VR512, VK8WM,
-                                memopv8i64, i512mem>,
-                                EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-
-// 512-bit unaligned load/store
-def : Pat<(loadv8i64 addr:$src), (VMOVDQU64rm addr:$src)>;
-def : Pat<(loadv16i32 addr:$src), (VMOVDQU32rm addr:$src)>;
-
-def : Pat<(store (v8i64 VR512:$src), addr:$dst),
-          (VMOVDQU64mr addr:$dst, VR512:$src)>;
-def : Pat<(store (v16i32 VR512:$src), addr:$dst),
-          (VMOVDQU32mr addr:$dst, VR512:$src)>;
+defm VMOVDQA32: avx512_load<0x6F, VR512, VK16WM, i512mem, alignedloadv16i32,
+                            "vmovdqa32", SSEPackedInt, v16i32>,
+                avx512_store<0x7F, VR512, VK16WM, i512mem, alignedstore512,
+                             "vmovdqa32", SSEPackedInt, v16i32>,
+                PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VMOVDQA64: avx512_load<0x6F, VR512, VK8WM, i512mem, alignedloadv8i64,
+                            "vmovdqa64", SSEPackedInt, v8i64>,
+                avx512_store<0x7F, VR512, VK8WM, i512mem, alignedstore512,
+                             "vmovdqa64", SSEPackedInt, v8i64>,
+                PD, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+defm VMOVDQU32: avx512_load<0x6F, VR512, VK16WM, i512mem, load,
+                            "vmovdqu32", SSEPackedInt, v16i32>,
+                avx512_store<0x7F, VR512, VK16WM, i512mem, store,
+                             "vmovdqu32", SSEPackedInt, v16i32>,
+                XS, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VMOVDQU64: avx512_load<0x6F, VR512, VK8WM, i512mem, load,
+                            "vmovdqu64", SSEPackedInt, v8i64>,
+                avx512_store<0x7F, VR512, VK8WM, i512mem, store,
+                             "vmovdqu64", SSEPackedInt, v8i64>,
+                XS, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
 
 let AddedComplexity = 20 in {
 def : Pat<(v8i64 (vselect VK8WM:$mask, (v8i64 VR512:$src),
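Note: the def : Pat<> records above are what tie the new intrinsics to the
masked instruction variants. A loadu intrinsic whose pass-through operand is
the all-zeroes vector, written (bc_v8f64 (v16i32 immAllZerosV)) in the
pattern, selects the zero-masking load form (VMOVUPDZrmkz/VMOVUPSZrmkz); the
storeu intrinsics select the masked store form (VMOVUPDZmrk/VMOVUPSZmrk). In
both cases COPY_TO_REGCLASS moves the scalar GR8/GR16 mask into a k
write-mask register. A sketch of IR the load pattern should match, with an
illustrative function name that is not part of the patch:

    declare <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8*, <8 x double>, i8)

    define <8 x double> @loadu_pd_zeroed_lanes(i8* %ptr, i8 %mask) {
      ; zeroinitializer corresponds to the immAllZerosV operand of the
      ; pattern, so selection should produce the {z} form, e.g.
      ;   vmovupd (%rdi), %zmm0 {%k1} {z}
      %r = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask)
      ret <8 x double> %r
    }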
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index 0e39e64e431..3fb38edee5c 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -521,3 +521,19 @@ define i16 @test_vptestmd(<16 x i32> %a0, <16 x i32> %a1) {
   ret i16 %res
 }
 declare i16 @llvm.x86.avx512.mask.ptestm.d.512(<16 x i32>, <16 x i32>, i16)
+
+define void @test_store1(<16 x float> %data, i8* %ptr, i16 %mask) {
+; CHECK: vmovups {{.*}}encoding: [0x62,0xf1,0x7c,0x49,0x11,0x07]
+  call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %ptr, <16 x float> %data, i16 %mask)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.ps.512(i8*, <16 x float>, i16 )
+
+define void @test_store2(<8 x double> %data, i8* %ptr, i8 %mask) {
+; CHECK: vmovupd {{.*}}encoding: [0x62,0xf1,0xfd,0x49,0x11,0x07]
+  call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr, <8 x double> %data, i8 %mask)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.pd.512(i8*, <8 x double>, i8 )
\ No newline at end of file
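Note: the two CHECK lines above pin the complete EVEX encoding. The fourth
byte (0x49) carries the write-mask field: aaa = 001 selects k1 and the z bit
is clear, i.e. a merging masked store, which is what the storeu semantics
require. The patch adds store tests only; a load-side test could follow the
same conventions. A sketch only, with assumed register assignments, not part
of the patch:

    define <16 x float> @test_load1(i8* %ptr, i16 %mask) {
    ; CHECK: vmovups (%rdi), %zmm0 {%k1} {z}
      %res = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 %mask)
      ret <16 x float> %res
    }

    declare <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8*, <16 x float>, i16)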
diff --git a/test/MC/X86/avx512-encodings.s b/test/MC/X86/avx512-encodings.s
index e3a983d14e2..26fac83046a 100644
--- a/test/MC/X86/avx512-encodings.s
+++ b/test/MC/X86/avx512-encodings.s
@@ -713,8 +713,12 @@ vpsrad 512(%rdi, %rsi, 4), %zmm12, %zmm25
 vpbroadcastd %xmm0, %zmm1 {%k1} {z}
 
 // CHECK: vmovdqu64 {{.*}} {%k3}
-// CHECK: encoding: [0x62,0xf1,0xfe,0x4b,0x6f,0xc8]
-vmovdqu64 %zmm0, %zmm1 {%k3}
+// CHECK: encoding: [0x62,0xf1,0xfe,0x4b,0x7f,0x07]
+vmovdqu64 %zmm0, (%rdi) {%k3}
+
+// CHECK: vmovdqa32 {{.*}} {%k4}
+// CHECK: encoding: [0x62,0x61,0x7d,0x4c,0x6f,0x1e]
+vmovdqa32 (%rsi), %zmm27 {%k4}
 
 // CHECK: vmovd
 // CHECK: encoding: [0x62,0xe1,0x7d,0x08,0x7e,0x74,0x24,0xeb]