[AVX-512] Add many of the VPERM instructions to the load folding table. Move VPERMPDZri to the correct table.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@288591 91177308-0d34-0410-b5e6-96231b3b80d8
Parent: 0f9eec21e0
Commit: ad37a58d2d
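For context on how to read the table rows below: each entry pairs a register-form opcode with its load-folded memory-form counterpart, plus a flag word (0 for no constraint, or a constraint such as TB_NO_REVERSE, used when the fold cannot safely be reversed, for instance because the memory form reads fewer bytes than the full register). The sketch below is a minimal standalone model of that mapping, not LLVM's actual table machinery; all names are hypothetical stand-ins.

#include <cstdint>
#include <unordered_map>

// Hypothetical stand-ins for a few X86 opcode enumerators.
enum Opcode : unsigned { VPERMPDZri, VPERMPDZmi, VPERMDZrr, VPERMDZrm,
                         VPMOVSXBQZrr, VPMOVSXBQZrm };

constexpr uint16_t TB_NONE = 0;          // no folding constraint
constexpr uint16_t TB_NO_REVERSE = 1u;   // folded load cannot be unfolded again

struct FoldEntry {
  Opcode MemOpcode;  // load-folded counterpart of the register form
  uint16_t Flags;    // folding constraints
};

// Register-form opcode -> { memory-form opcode, flags }, mirroring the
// { RegOp, MemOp, Flags } rows added by this commit.
static const std::unordered_map<unsigned, FoldEntry> FoldTable = {
    {VPERMPDZri,   {VPERMPDZmi,   TB_NONE}},
    {VPERMDZrr,    {VPERMDZrm,    TB_NONE}},
    {VPMOVSXBQZrr, {VPMOVSXBQZrm, TB_NO_REVERSE}},
};

// Returns the load-folded form of a register-form instruction, or nullptr
// when no memory form has been registered.
inline const FoldEntry *lookupFold(unsigned RegOpcode) {
  auto It = FoldTable.find(RegOpcode);
  return It == FoldTable.end() ? nullptr : &It->second;
}

With a row present, the backend can consume a spilled operand directly from its stack slot by switching to the memory-form opcode instead of emitting a separate reload; the stack-folding tests further down check exactly that.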
@@ -883,6 +883,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMOVUPSZrr, X86::VMOVUPSZrm, 0 },
{ X86::VPABSDZrr, X86::VPABSDZrm, 0 },
{ X86::VPABSQZrr, X86::VPABSQZrm, 0 },
{ X86::VPERMPDZri, X86::VPERMPDZmi, 0 },
{ X86::VPERMQZri, X86::VPERMQZmi, 0 },
{ X86::VPMOVSXBDZrr, X86::VPMOVSXBDZrm, 0 },
{ X86::VPMOVSXBQZrr, X86::VPMOVSXBQZrm, TB_NO_REVERSE },
{ X86::VPMOVSXBWZrr, X86::VPMOVSXBWZrm, 0 },
@@ -914,6 +916,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256rm, 0 },
{ X86::VMOVUPDZ256rr, X86::VMOVUPDZ256rm, 0 },
{ X86::VMOVUPSZ256rr, X86::VMOVUPSZ256rm, 0 },
{ X86::VPERMPDZ256ri, X86::VPERMPDZ256mi, 0 },
{ X86::VPERMQZ256ri, X86::VPERMQZ256mi, 0 },
{ X86::VPMOVSXBDZ256rr, X86::VPMOVSXBDZ256rm, TB_NO_REVERSE },
{ X86::VPMOVSXBQZ256rr, X86::VPMOVSXBQZ256rm, TB_NO_REVERSE },
{ X86::VPMOVSXBWZ256rr, X86::VPMOVSXBWZ256rm, 0 },
@@ -1868,8 +1872,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPCMPUQZrri, X86::VPCMPUQZrmi, 0 },
{ X86::VPCMPUWZrri, X86::VPCMPUWZrmi, 0 },
{ X86::VPCMPWZrri, X86::VPCMPWZrmi, 0 },
{ X86::VPERMPDZri, X86::VPERMPDZmi, 0 },
{ X86::VPERMBZrr, X86::VPERMBZrm, 0 },
{ X86::VPERMDZrr, X86::VPERMDZrm, 0 },
{ X86::VPERMPDZrr, X86::VPERMPDZrm, 0 },
{ X86::VPERMPSZrr, X86::VPERMPSZrm, 0 },
{ X86::VPERMQZrr, X86::VPERMQZrm, 0 },
{ X86::VPERMWZrr, X86::VPERMWZrm, 0 },
{ X86::VPMADDUBSWZrr, X86::VPMADDUBSWZrm, 0 },
{ X86::VPMADDWDZrr, X86::VPMADDWDZrm, 0 },
{ X86::VPMAXSDZrr, X86::VPMAXSDZrm, 0 },
@@ -2031,6 +2039,14 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPCMPUWZ256rri, X86::VPCMPUWZ256rmi, 0 },
{ X86::VPCMPWZ128rri, X86::VPCMPWZ128rmi, 0 },
{ X86::VPCMPWZ256rri, X86::VPCMPWZ256rmi, 0 },
{ X86::VPERMBZ128rr, X86::VPERMBZ128rm, 0 },
{ X86::VPERMBZ256rr, X86::VPERMBZ256rm, 0 },
{ X86::VPERMDZ256rr, X86::VPERMDZ256rm, 0 },
{ X86::VPERMPDZ256rr, X86::VPERMPDZ256rm, 0 },
{ X86::VPERMPSZ256rr, X86::VPERMPSZ256rm, 0 },
{ X86::VPERMQZ256rr, X86::VPERMQZ256rm, 0 },
{ X86::VPERMWZ128rr, X86::VPERMWZ128rm, 0 },
{ X86::VPERMWZ256rr, X86::VPERMWZ256rm, 0 },
{ X86::VPMADDUBSWZ128rr, X86::VPMADDUBSWZ128rm, 0 },
{ X86::VPMADDUBSWZ256rr, X86::VPMADDUBSWZ256rm, 0 },
{ X86::VPMADDWDZ128rr, X86::VPMADDWDZ128rm, 0 },
@@ -2095,6 +2111,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VXORPSZ256rr, X86::VXORPSZ256rm, 0 },

// AVX-512 masked foldable instructions
{ X86::VPERMPDZrikz, X86::VPERMPDZmikz, 0 },
{ X86::VPERMQZrikz, X86::VPERMQZmikz, 0 },
{ X86::VPMOVSXBDZrrkz, X86::VPMOVSXBDZrmkz, 0 },
{ X86::VPMOVSXBQZrrkz, X86::VPMOVSXBQZrmkz, TB_NO_REVERSE },
{ X86::VPMOVSXBWZrrkz, X86::VPMOVSXBWZrmkz, 0 },
@@ -2112,6 +2130,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPSHUFLWZrikz, X86::VPSHUFLWZmikz, 0 },

// AVX-512VL 256-bit masked foldable instructions
{ X86::VPERMPDZ256rikz, X86::VPERMPDZ256mikz, 0 },
{ X86::VPERMQZ256rikz, X86::VPERMQZ256mikz, 0 },
{ X86::VPMOVSXBDZ256rrkz, X86::VPMOVSXBDZ256rmkz, TB_NO_REVERSE },
{ X86::VPMOVSXBQZ256rrkz, X86::VPMOVSXBQZ256rmkz, TB_NO_REVERSE },
{ X86::VPMOVSXBWZ256rrkz, X86::VPMOVSXBWZ256rmkz, 0 },
@@ -2322,6 +2342,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPANDNDZrrkz, X86::VPANDNDZrmkz, 0 },
{ X86::VPANDNQZrrkz, X86::VPANDNQZrmkz, 0 },
{ X86::VPANDQZrrkz, X86::VPANDQZrmkz, 0 },
{ X86::VPERMBZrrkz, X86::VPERMBZrmkz, 0 },
{ X86::VPERMDZrrkz, X86::VPERMDZrmkz, 0 },
{ X86::VPERMPDZrrkz, X86::VPERMPDZrmkz, 0 },
{ X86::VPERMPSZrrkz, X86::VPERMPSZrmkz, 0 },
{ X86::VPERMQZrrkz, X86::VPERMQZrmkz, 0 },
{ X86::VPERMWZrrkz, X86::VPERMWZrmkz, 0 },
{ X86::VPMADDUBSWZrrkz, X86::VPMADDUBSWZrmkz, 0 },
{ X86::VPMADDWDZrrkz, X86::VPMADDWDZrmkz, 0 },
{ X86::VPORDZrrkz, X86::VPORDZrmkz, 0 },
@@ -2391,6 +2417,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPANDNDZ256rrkz, X86::VPANDNDZ256rmkz, 0 },
{ X86::VPANDNQZ256rrkz, X86::VPANDNQZ256rmkz, 0 },
{ X86::VPANDQZ256rrkz, X86::VPANDQZ256rmkz, 0 },
{ X86::VPERMBZ256rrkz, X86::VPERMBZ256rmkz, 0 },
{ X86::VPERMDZ256rrkz, X86::VPERMDZ256rmkz, 0 },
{ X86::VPERMPDZ256rrkz, X86::VPERMPDZ256rmkz, 0 },
{ X86::VPERMPSZ256rrkz, X86::VPERMPSZ256rmkz, 0 },
{ X86::VPERMQZ256rrkz, X86::VPERMQZ256rmkz, 0 },
{ X86::VPERMWZ256rrkz, X86::VPERMWZ256rmkz, 0 },
{ X86::VPMADDUBSWZ256rrkz, X86::VPMADDUBSWZ256rmkz, 0 },
{ X86::VPMADDWDZ256rrkz, X86::VPMADDWDZ256rmkz, 0 },
{ X86::VPORDZ256rrkz, X86::VPORDZ256rmkz, 0 },
@@ -2456,6 +2488,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPANDNDZ128rrkz, X86::VPANDNDZ128rmkz, 0 },
{ X86::VPANDNQZ128rrkz, X86::VPANDNQZ128rmkz, 0 },
{ X86::VPANDQZ128rrkz, X86::VPANDQZ128rmkz, 0 },
{ X86::VPERMBZ128rrkz, X86::VPERMBZ128rmkz, 0 },
{ X86::VPERMWZ128rrkz, X86::VPERMWZ128rmkz, 0 },
{ X86::VPMADDUBSWZ128rrkz, X86::VPMADDUBSWZ128rmkz, 0 },
{ X86::VPMADDWDZ128rrkz, X86::VPMADDWDZ128rmkz, 0 },
{ X86::VPORDZ128rrkz, X86::VPORDZ128rmkz, 0 },
@@ -2489,6 +2523,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VXORPSZ128rrkz, X86::VXORPSZ128rmkz, 0 },

// AVX-512 masked foldable instructions
{ X86::VPERMPDZrik, X86::VPERMPDZmik, 0 },
{ X86::VPERMQZrik, X86::VPERMQZmik, 0 },
{ X86::VPMOVSXBDZrrk, X86::VPMOVSXBDZrmk, 0 },
{ X86::VPMOVSXBQZrrk, X86::VPMOVSXBQZrmk, TB_NO_REVERSE },
{ X86::VPMOVSXBWZrrk, X86::VPMOVSXBWZrmk, 0 },
@@ -2506,6 +2542,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPSHUFLWZrik, X86::VPSHUFLWZmik, 0 },

// AVX-512VL 256-bit masked foldable instructions
{ X86::VPERMPDZ256rik, X86::VPERMPDZ256mik, 0 },
{ X86::VPERMQZ256rik, X86::VPERMQZ256mik, 0 },
{ X86::VPMOVSXBDZ256rrk, X86::VPMOVSXBDZ256rmk, TB_NO_REVERSE },
{ X86::VPMOVSXBQZ256rrk, X86::VPMOVSXBQZ256rmk, TB_NO_REVERSE },
{ X86::VPMOVSXBWZ256rrk, X86::VPMOVSXBWZ256rmk, 0 },
@@ -2605,6 +2643,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPANDNDZrrk, X86::VPANDNDZrmk, 0 },
{ X86::VPANDNQZrrk, X86::VPANDNQZrmk, 0 },
{ X86::VPANDQZrrk, X86::VPANDQZrmk, 0 },
{ X86::VPERMBZrrk, X86::VPERMBZrmk, 0 },
{ X86::VPERMDZrrk, X86::VPERMDZrmk, 0 },
{ X86::VPERMPDZrrk, X86::VPERMPDZrmk, 0 },
{ X86::VPERMPSZrrk, X86::VPERMPSZrmk, 0 },
{ X86::VPERMQZrrk, X86::VPERMQZrmk, 0 },
{ X86::VPERMWZrrk, X86::VPERMWZrmk, 0 },
{ X86::VPMADDUBSWZrrk, X86::VPMADDUBSWZrmk, 0 },
{ X86::VPMADDWDZrrk, X86::VPMADDWDZrmk, 0 },
{ X86::VPORDZrrk, X86::VPORDZrmk, 0 },
@@ -2677,6 +2721,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPANDNDZ256rrk, X86::VPANDNDZ256rmk, 0 },
{ X86::VPANDNQZ256rrk, X86::VPANDNQZ256rmk, 0 },
{ X86::VPANDQZ256rrk, X86::VPANDQZ256rmk, 0 },
{ X86::VPERMBZ256rrk, X86::VPERMBZ256rmk, 0 },
{ X86::VPERMDZ256rrk, X86::VPERMDZ256rmk, 0 },
{ X86::VPERMPDZ256rrk, X86::VPERMPDZ256rmk, 0 },
{ X86::VPERMPSZ256rrk, X86::VPERMPSZ256rmk, 0 },
{ X86::VPERMQZ256rrk, X86::VPERMQZ256rmk, 0 },
{ X86::VPERMWZ256rrk, X86::VPERMWZ256rmk, 0 },
{ X86::VPMADDUBSWZ256rrk, X86::VPMADDUBSWZ256rmk, 0 },
{ X86::VPMADDWDZ256rrk, X86::VPMADDWDZ256rmk, 0 },
{ X86::VPORDZ256rrk, X86::VPORDZ256rmk, 0 },
@@ -2746,6 +2796,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPANDNDZ128rrk, X86::VPANDNDZ128rmk, 0 },
{ X86::VPANDNQZ128rrk, X86::VPANDNQZ128rmk, 0 },
{ X86::VPANDQZ128rrk, X86::VPANDQZ128rmk, 0 },
{ X86::VPERMBZ128rrk, X86::VPERMBZ128rmk, 0 },
{ X86::VPERMWZ128rrk, X86::VPERMWZ128rmk, 0 },
{ X86::VPMADDUBSWZ128rrk, X86::VPMADDUBSWZ128rmk, 0 },
{ X86::VPMADDWDZ128rrk, X86::VPMADDWDZ128rmk, 0 },
{ X86::VPORDZ128rrk, X86::VPORDZ128rmk, 0 },
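The test additions below exercise these rows end to end: an inline-asm nop that clobbers every vector register forces each value through a stack spill, and the CHECK lines then require the consuming vperm* instruction to show up as a "Folded Reload" reading straight from the stack slot. As a rough illustration of the decision being tested, here is a toy continuation of the sketch above (again hypothetical names, not a real LLVM API):

#include <optional>

// Toy instruction: an opcode plus whether one of its sources currently
// lives in a stack slot because the register allocator spilled it.
struct ToyInstr {
  unsigned Opcode;
  bool SrcIsStackSlot;
};

// If the source was spilled and a memory form is registered in FoldTable,
// rewrite to that form so the reload is folded into the instruction itself;
// otherwise the caller must emit an explicit reload before the register form.
inline std::optional<ToyInstr> tryFoldReload(const ToyInstr &I) {
  if (!I.SrcIsStackSlot)
    return std::nullopt;                 // nothing to fold
  if (const FoldEntry *E = lookupFold(I.Opcode))
    return ToyInstr{static_cast<unsigned>(E->MemOpcode), true};
  return std::nullopt;                   // no table row: keep the explicit reload
}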
@@ -870,7 +870,190 @@ define <16 x i32> @stack_fold_pmaddwd_zmm_maskz(<16 x i32>* %passthru, <32 x i16
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %a0, <32 x i16> %a1, <16 x i32> undef, i16 -1)
%3 = bitcast i16 %mask to <16 x i1>
; load needed to keep the operation from being scheduled about the asm block
%4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
ret <16 x i32> %4
}

define <16 x i32> @stack_fold_permd(<16 x i32> %a0, <16 x i32> %a1) {
;CHECK-LABEL: stack_fold_permd
;CHECK: vpermd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %a1, <16 x i32> %a0, <16 x i32> undef, i16 -1)
ret <16 x i32> %2
}
declare <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readonly

define <8 x double> @stack_fold_permpd(<8 x double> %a0) {
;CHECK-LABEL: stack_fold_permpd
;CHECK: vpermpd $235, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
; fadd forces execution domain
%3 = fadd <8 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0>
ret <8 x double> %3
}

define <8 x double> @stack_fold_permpd_mask(<8 x double>* %passthru, <8 x double> %a0, i8 %mask) {
;CHECK-LABEL: stack_fold_permpd_mask
;CHECK: vpermpd $235, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
%3 = bitcast i8 %mask to <8 x i1>
; load needed to keep the operation from being scheduled above the asm block
%4 = load <8 x double>, <8 x double>* %passthru
%5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
; fadd forces execution domain
%6 = fadd <8 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0>
ret <8 x double> %6
}

define <8 x double> @stack_fold_permpd_maskz(<8 x double> %a0, i8 %mask) {
;CHECK-LABEL: stack_fold_permpd_maskz
;CHECK: vpermpd $235, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
%3 = bitcast i8 %mask to <8 x i1>
%4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
ret <8 x double> %4
}

define <8 x double> @stack_fold_permpdvar(<8 x i64> %a0, <8 x double> %a1) {
;CHECK-LABEL: stack_fold_permpdvar
;CHECK: vpermpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %a1, <8 x i64> %a0, <8 x double> undef, i8 -1)
; fadd forces execution domain
%3 = fadd <8 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0>
ret <8 x double> %3
}
declare <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double>, <8 x i64>, <8 x double>, i8) nounwind readonly

define <16 x float> @stack_fold_permps(<16 x i32> %a0, <16 x float> %a1) {
;CHECK-LABEL: stack_fold_permps
;CHECK: vpermps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %a1, <16 x i32> %a0, <16 x float> undef, i16 -1)
ret <16 x float> %2
}
declare <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float>, <16 x i32>, <16 x float>, i16) nounwind readonly

define <8 x i64> @stack_fold_permq(<8 x i64> %a0) {
;CHECK-LABEL: stack_fold_permq
;CHECK: vpermq $235, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
; add forces execution domain
%3 = add <8 x i64> %2, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
ret <8 x i64> %3
}

define <8 x i64> @stack_fold_permq_mask(<8 x i64>* %passthru, <8 x i64> %a0, i8 %mask) {
;CHECK-LABEL: stack_fold_permq_mask
;CHECK: vpermq $235, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
%3 = bitcast i8 %mask to <8 x i1>
; load needed to keep the operation from being scheduled above the asm block
%4 = load <8 x i64>, <8 x i64>* %passthru
%5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
; add forces execution domain
%6 = add <8 x i64> %5, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
ret <8 x i64> %6
}

define <8 x i64> @stack_fold_permq_maskz(<8 x i64>* %passthru, <8 x i64> %a0, i8 %mask) {
;CHECK-LABEL: stack_fold_permq_maskz
;CHECK: vpermq $235, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
%3 = bitcast i8 %mask to <8 x i1>
%4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
ret <8 x i64> %4
}

define <8 x i64> @stack_fold_permqvar(<8 x i64> %a0, <8 x i64> %a1) {
;CHECK-LABEL: stack_fold_permqvar
;CHECK: vpermq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %a1, <8 x i64> %a0, <8 x i64> undef, i8 -1)
; add forces execution domain
%3 = add <8 x i64> %2, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
ret <8 x i64> %3
}
declare <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readonly

define <8 x i64> @stack_fold_permqvar_mask(<8 x i64>* %passthru, <8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
;CHECK-LABEL: stack_fold_permqvar_mask
;CHECK: vpermq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %a1, <8 x i64> %a0, <8 x i64> undef, i8 -1)
%3 = bitcast i8 %mask to <8 x i1>
; load needed to keep the operation from being scheduled above the asm block
%4 = load <8 x i64>, <8 x i64>* %passthru
%5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
; add forces execution domain
%6 = add <8 x i64> %5, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
ret <8 x i64> %6
}

define <32 x i16> @stack_fold_permwvar(<32 x i16> %a0, <32 x i16> %a1) {
;CHECK-LABEL: stack_fold_permwvar
;CHECK: vpermw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0, <32 x i16> undef, i32 -1)
ret <32 x i16> %2
}
declare <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) nounwind readonly

define <32 x i16> @stack_fold_permwvar_mask(<32 x i16>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
;CHECK-LABEL: stack_fold_permwvar_mask
;CHECK: vpermw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0, <32 x i16> undef, i32 -1)
%3 = bitcast i32 %mask to <32 x i1>
; load needed to keep the operation from being scheduled above the asm block
%4 = load <32 x i16>, <32 x i16>* %passthru
%5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
ret <32 x i16> %5
}

define <32 x i16> @stack_fold_permwvar_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
;CHECK-LABEL: stack_fold_permwvar_maskz
;CHECK: vpermw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0, <32 x i16> undef, i32 -1)
%3 = bitcast i32 %mask to <32 x i1>
%4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
ret <32 x i16> %4
}

define <64 x i8> @stack_fold_permbvar(<64 x i8> %a0, <64 x i8> %a1) {
;CHECK-LABEL: stack_fold_permbvar
;CHECK: vpermb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0, <64 x i8> undef, i64 -1)
ret <64 x i8> %2
}
declare <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) nounwind readonly

define <64 x i8> @stack_fold_permbvar_mask(<64 x i8>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
;CHECK-LABEL: stack_fold_permbvar_mask
;CHECK: vpermb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0, <64 x i8> undef, i64 -1)
%3 = bitcast i64 %mask to <64 x i1>
; load needed to keep the operation from being scheduled above the asm block
%4 = load <64 x i8>, <64 x i8>* %passthru
%5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
ret <64 x i8> %5
}

define <64 x i8> @stack_fold_permbvar_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
;CHECK-LABEL: stack_fold_permbvar_maskz
;CHECK: vpermb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0, <64 x i8> undef, i64 -1)
%3 = bitcast i64 %mask to <64 x i1>
%4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
ret <64 x i8> %4
}
@@ -1340,3 +1340,85 @@ define <8 x i32> @stack_fold_pmaddwd_ymm_maskz(<16 x i16> %a0, <16 x i16> %a1, i
%4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
ret <8 x i32> %4
}

define <8 x i32> @stack_fold_permd(<8 x i32> %a0, <8 x i32> %a1) {
;CHECK-LABEL: stack_fold_permd
;CHECK: vpermd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a1, <8 x i32> %a0)
ret <8 x i32> %2
}
declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly

define <4 x double> @stack_fold_permpd(<4 x double> %a0) {
;CHECK-LABEL: stack_fold_permpd
;CHECK: vpermpd $235, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
; fadd forces execution domain
%3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
ret <4 x double> %3
}

define <4 x double> @stack_fold_permpdvar(<4 x i64> %a0, <4 x double> %a1) {
;CHECK-LABEL: stack_fold_permpdvar
;CHECK: vpermpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %a1, <4 x i64> %a0, <4 x double> undef, i8 -1)
; fadd forces execution domain
%3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
ret <4 x double> %3
}
declare <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double>, <4 x i64>, <4 x double>, i8) nounwind readonly

define <8 x float> @stack_fold_permps(<8 x i32> %a0, <8 x float> %a1) {
;CHECK-LABEL: stack_fold_permps
;CHECK: vpermps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a1, <8 x i32> %a0)
ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly

define <4 x i64> @stack_fold_permq(<4 x i64> %a0) {
;CHECK-LABEL: stack_fold_permq
;CHECK: vpermq $235, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
; add forces execution domain
%3 = add <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1>
ret <4 x i64> %3
}

define <4 x i64> @stack_fold_permqvar(<4 x i64> %a0, <4 x i64> %a1) {
;CHECK-LABEL: stack_fold_permqvar
;CHECK: vpermq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %a1, <4 x i64> %a0, <4 x i64> undef, i8 -1)
; add forces execution domain
%3 = add <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1>
ret <4 x i64> %3
}
declare <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) nounwind readonly

define <16 x i16> @stack_fold_permwvar(<16 x i16> %a0, <16 x i16> %a1) {
;CHECK-LABEL: stack_fold_permwvar
;CHECK: vpermw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16> %a1, <16 x i16> %a0, <16 x i16> undef, i16 -1)
; add forces execution domain
%3 = add <16 x i16> %2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
ret <16 x i16> %3
}
declare <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) nounwind readonly

define <32 x i8> @stack_fold_permbvar(<32 x i8> %a0, <32 x i8> %a1) {
;CHECK-LABEL: stack_fold_permbvar
;CHECK: vpermb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8> %a1, <32 x i8> %a0, <32 x i8> undef, i32 -1)
; add forces execution domain
%3 = add <32 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
ret <32 x i8> %3
}
declare <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) nounwind readonly