[AVX-512] Add support for commuting VPERMT2(B/W/D/Q/PS/PD) to/from VPERMI2(B/W/D/Q/PS/PD).

Summary:
The index and one of the table operands can be swapped by changing the opcode to the other version. Neither of these operands is the one that can load from memory, so this can't be used to increase memory-folding opportunities.

We need to handle both the unmasked forms and the kz forms. Since the load operand isn't the one being commuted, we can commute the load and broadcast forms too.
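
To see why the swap is legal, here is a scalar model of the 512-bit dword forms (a sketch of the documented semantics, not code from this patch; the names are mine). Both opcodes compute the same function of the index and the two tables; they differ only in which of the first two inputs is tied to the destination and therefore overwritten.

#include <array>
#include <cstdint>

using V16 = std::array<uint32_t, 16>;

// Shared semantics of VPERMI2D/VPERMT2D: for each element, the low 4 index
// bits pick a lane and the 5th bit picks between the two tables.
static V16 permuteTwoTables(const V16 &Idx, const V16 &TblA, const V16 &TblB) {
  V16 Res;
  for (unsigned I = 0; I != 16; ++I) {
    unsigned Sel = Idx[I] & 31;
    Res[I] = Sel < 16 ? TblA[Sel] : TblB[Sel - 16];
  }
  return Res;
}

// VPERMT2D overwrites the first table (src1 = TblA, src2 = Idx), while
// VPERMI2D overwrites the index (src1 = Idx, src2 = TblA). Swapping src1
// and src2 while flipping T2 <-> I2 therefore produces the same value, and
// src3 (TblB), the only operand that can be a load or broadcast, stays in
// place.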

Reviewers: igorb, delena, Ayal, Farhana, RKSimon

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D25652

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@287621 91177308-0d34-0410-b5e6-96231b3b80d8
Craig Topper 2016-11-22 04:57:34 +00:00
parent 8edd5b452f
commit aa9982b218
15 changed files with 389 additions and 316 deletions


@ -1352,14 +1352,14 @@ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
defm rr: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (X86VPermi2X _.RC:$src1, _.RC:$src2, _.RC:$src3))>, EVEX_4V,
(_.VT (X86VPermi2X _.RC:$src1, _.RC:$src2, _.RC:$src3)), 1>, EVEX_4V,
AVX5128IBase;
defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (X86VPermi2X _.RC:$src1, _.RC:$src2,
(_.VT (bitconvert (_.LdFrag addr:$src3)))))>,
(_.VT (bitconvert (_.LdFrag addr:$src3))))), 1>,
EVEX_4V, AVX5128IBase;
}
}
@ -1371,8 +1371,8 @@ multiclass avx512_perm_i_mb<bits<8> opc, string OpcodeStr,
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
!strconcat("$src2, ${src3}", _.BroadcastStr ),
(_.VT (X86VPermi2X _.RC:$src1,
_.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))>,
AVX5128IBase, EVEX_4V, EVEX_B;
_.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))),
1>, AVX5128IBase, EVEX_4V, EVEX_B;
}
multiclass avx512_perm_i_sizes<bits<8> opc, string OpcodeStr,
@ -1420,14 +1420,14 @@ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
defm rr: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins IdxVT.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, _.RC:$src3))>, EVEX_4V,
AVX5128IBase;
(_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, _.RC:$src3)), 1>,
EVEX_4V, AVX5128IBase;
defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins IdxVT.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2,
(bitconvert (_.LdFrag addr:$src3))))>,
(bitconvert (_.LdFrag addr:$src3)))), 1>,
EVEX_4V, AVX5128IBase;
}
}
@ -1439,8 +1439,8 @@ multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr,
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
!strconcat("$src2, ${src3}", _.BroadcastStr ),
(_.VT (X86VPermt2 _.RC:$src1,
IdxVT.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))>,
AVX5128IBase, EVEX_4V, EVEX_B;
IdxVT.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))),
1>, AVX5128IBase, EVEX_4V, EVEX_B;
}
multiclass avx512_perm_t_sizes<bits<8> opc, string OpcodeStr,


@ -3533,6 +3533,92 @@ static bool commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1,
return true;
}
// Returns true if this is a VPERMI2 or VPERMT2 instruction that can be
// commuted.
static bool isCommutableVPERMV3Instruction(unsigned Opcode) {
#define VPERM_CASES(Suffix) \
case X86::VPERMI2##Suffix##128rr: case X86::VPERMT2##Suffix##128rr: \
case X86::VPERMI2##Suffix##256rr: case X86::VPERMT2##Suffix##256rr: \
case X86::VPERMI2##Suffix##rr: case X86::VPERMT2##Suffix##rr: \
case X86::VPERMI2##Suffix##128rm: case X86::VPERMT2##Suffix##128rm: \
case X86::VPERMI2##Suffix##256rm: case X86::VPERMT2##Suffix##256rm: \
case X86::VPERMI2##Suffix##rm: case X86::VPERMT2##Suffix##rm: \
case X86::VPERMI2##Suffix##128rrkz: case X86::VPERMT2##Suffix##128rrkz: \
case X86::VPERMI2##Suffix##256rrkz: case X86::VPERMT2##Suffix##256rrkz: \
case X86::VPERMI2##Suffix##rrkz: case X86::VPERMT2##Suffix##rrkz: \
case X86::VPERMI2##Suffix##128rmkz: case X86::VPERMT2##Suffix##128rmkz: \
case X86::VPERMI2##Suffix##256rmkz: case X86::VPERMT2##Suffix##256rmkz: \
case X86::VPERMI2##Suffix##rmkz: case X86::VPERMT2##Suffix##rmkz:
#define VPERM_CASES_BROADCAST(Suffix) \
VPERM_CASES(Suffix) \
case X86::VPERMI2##Suffix##128rmb: case X86::VPERMT2##Suffix##128rmb: \
case X86::VPERMI2##Suffix##256rmb: case X86::VPERMT2##Suffix##256rmb: \
case X86::VPERMI2##Suffix##rmb: case X86::VPERMT2##Suffix##rmb: \
case X86::VPERMI2##Suffix##128rmbkz: case X86::VPERMT2##Suffix##128rmbkz: \
case X86::VPERMI2##Suffix##256rmbkz: case X86::VPERMT2##Suffix##256rmbkz: \
case X86::VPERMI2##Suffix##rmbkz: case X86::VPERMT2##Suffix##rmbkz:
switch (Opcode) {
default: return false;
VPERM_CASES(B)
VPERM_CASES_BROADCAST(D)
VPERM_CASES_BROADCAST(PD)
VPERM_CASES_BROADCAST(PS)
VPERM_CASES_BROADCAST(Q)
VPERM_CASES(W)
return true;
}
#undef VPERM_CASES_BROADCAST
#undef VPERM_CASES
}
// Returns the commuted opcode for VPERMI2 and VPERMT2 instructions by
// switching from the I opcode to the T opcode and vice versa.
static unsigned getCommutedVPERMV3Opcode(unsigned Opcode) {
#define VPERM_CASES(Orig, New) \
case X86::Orig##128rr: return X86::New##128rr; \
case X86::Orig##128rrkz: return X86::New##128rrkz; \
case X86::Orig##128rm: return X86::New##128rm; \
case X86::Orig##128rmkz: return X86::New##128rmkz; \
case X86::Orig##256rr: return X86::New##256rr; \
case X86::Orig##256rrkz: return X86::New##256rrkz; \
case X86::Orig##256rm: return X86::New##256rm; \
case X86::Orig##256rmkz: return X86::New##256rmkz; \
case X86::Orig##rr: return X86::New##rr; \
case X86::Orig##rrkz: return X86::New##rrkz; \
case X86::Orig##rm: return X86::New##rm; \
case X86::Orig##rmkz: return X86::New##rmkz;
#define VPERM_CASES_BROADCAST(Orig, New) \
VPERM_CASES(Orig, New) \
case X86::Orig##128rmb: return X86::New##128rmb; \
case X86::Orig##128rmbkz: return X86::New##128rmbkz; \
case X86::Orig##256rmb: return X86::New##256rmb; \
case X86::Orig##256rmbkz: return X86::New##256rmbkz; \
case X86::Orig##rmb: return X86::New##rmb; \
case X86::Orig##rmbkz: return X86::New##rmbkz;
switch (Opcode) {
VPERM_CASES(VPERMI2B, VPERMT2B)
VPERM_CASES_BROADCAST(VPERMI2D, VPERMT2D)
VPERM_CASES_BROADCAST(VPERMI2PD, VPERMT2PD)
VPERM_CASES_BROADCAST(VPERMI2PS, VPERMT2PS)
VPERM_CASES_BROADCAST(VPERMI2Q, VPERMT2Q)
VPERM_CASES(VPERMI2W, VPERMT2W)
VPERM_CASES(VPERMT2B, VPERMI2B)
VPERM_CASES_BROADCAST(VPERMT2D, VPERMI2D)
VPERM_CASES_BROADCAST(VPERMT2PD, VPERMI2PD)
VPERM_CASES_BROADCAST(VPERMT2PS, VPERMI2PS)
VPERM_CASES_BROADCAST(VPERMT2Q, VPERMI2Q)
VPERM_CASES(VPERMT2W, VPERMI2W)
}
llvm_unreachable("Unreachable!");
#undef VPERM_CASES_BROADCAST
#undef VPERM_CASES
}
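// For example (illustrative, derived from the macros above):
//   getCommutedVPERMV3Opcode(X86::VPERMI2D128rr) == X86::VPERMT2D128rr
//   getCommutedVPERMV3Opcode(X86::VPERMT2PSrmbkz) == X86::VPERMI2PSrmbkz
// The width (128/256/512), form (rr/rm/rmb), and kz suffixes are preserved;
// only the I2/T2 part of the opcode flips.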
MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
unsigned OpIdx1,
unsigned OpIdx2) const {
@ -3854,7 +3940,15 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
default:
default: {
if (isCommutableVPERMV3Instruction(MI.getOpcode())) {
unsigned Opc = getCommutedVPERMV3Opcode(MI.getOpcode());
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.setDesc(get(Opc));
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
const X86InstrFMA3Group *FMA3Group =
X86InstrFMA3Info::getFMA3Group(MI.getOpcode());
if (FMA3Group) {
@ -3870,6 +3964,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
}
}
}
bool X86InstrInfo::findFMA3CommutedOpIndices(
@ -4041,12 +4136,26 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
// Handle masked instructions, since we need to skip over the mask input
// and the preserved input.
if (Desc.TSFlags & X86II::EVEX_K) {
// First assume that the first input is the mask operand and skip past it.
unsigned CommutableOpIdx1 = Desc.getNumDefs() + 1;
// If there is no preserved input we only need to skip 1 operand.
if (MI.getDesc().getOperandConstraint(Desc.getNumDefs(),
MCOI::TIED_TO) != -1)
++CommutableOpIdx1;
unsigned CommutableOpIdx2 = CommutableOpIdx1 + 1;
unsigned CommutableOpIdx2 = Desc.getNumDefs() + 2;
// Check if the first input is tied. If it isn't, we only need to skip
// the mask operand, which we did above.
if ((MI.getDesc().getOperandConstraint(Desc.getNumDefs(),
MCOI::TIED_TO) != -1)) {
// If this is a zero-masking instruction with a tied operand, we need to
// move the first index back to the first input, since this must be a
// 3-input instruction and we want the first two non-mask inputs.
// Otherwise this is a 2-input instruction with a preserved input and a
// mask, so we need to move the indices to skip one more input.
if (Desc.TSFlags & X86II::EVEX_Z)
--CommutableOpIdx1;
else {
++CommutableOpIdx1;
++CommutableOpIdx2;
}
}
if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
CommutableOpIdx1, CommutableOpIdx2))
return false;
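
A worked example of the index logic above (my annotation, assuming one def; the opcodes and operand layouts are illustrative of the usual AVX-512 encodings, not quoted from the patch):

// VPERMT2Drrkz  (EVEX_Z, tied data input):
//   dst, src1(tied table), mask, src2(index), src3
//   tied && EVEX_Z:  CommutableOpIdx1 = 1 (src1), CommutableOpIdx2 = 3 (src2)
// VADDPSZrrk    (merge masking, tied pass-through):
//   dst, src0(tied pass-through), mask, src1, src2
//   tied && !EVEX_Z: CommutableOpIdx1 = 3 (src1), CommutableOpIdx2 = 4 (src2)
// VADDPSZrrkz   (zero masking, no tied input):
//   dst, mask, src1, src2
//   not tied:        CommutableOpIdx1 = 2 (src1), CommutableOpIdx2 = 3 (src2)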


@ -369,8 +369,8 @@ define i16 @test16(i1 *%addr, i16 %a) {
; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} {z}
; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,16,11,12,13,14,15]
; KNL-NEXT: vpermt2d %zmm0, %zmm2, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm0
; KNL-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
; KNL-NEXT: vpslld $31, %zmm2, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: retq
@ -384,8 +384,8 @@ define i16 @test16(i1 *%addr, i16 %a) {
; SKX-NEXT: vpmovm2d %k1, %zmm0
; SKX-NEXT: vpmovm2d %k0, %zmm1
; SKX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,16,11,12,13,14,15]
; SKX-NEXT: vpermt2d %zmm1, %zmm2, %zmm0
; SKX-NEXT: vpmovd2m %zmm0, %k0
; SKX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
; SKX-NEXT: vpmovd2m %zmm2, %k0
; SKX-NEXT: kmovw %k0, %eax
; SKX-NEXT: retq
%x = load i1 , i1 * %addr, align 128
@ -406,8 +406,8 @@ define i8 @test17(i1 *%addr, i8 %a) {
; KNL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} {z}
; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,8,5,6,7]
; KNL-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
; KNL-NEXT: vpsllq $63, %zmm1, %zmm0
; KNL-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
; KNL-NEXT: vpsllq $63, %zmm2, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: retq
@ -421,8 +421,8 @@ define i8 @test17(i1 *%addr, i8 %a) {
; SKX-NEXT: vpmovm2q %k1, %zmm0
; SKX-NEXT: vpmovm2q %k0, %zmm1
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,8,5,6,7]
; SKX-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; SKX-NEXT: vpmovq2m %zmm0, %k0
; SKX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; SKX-NEXT: vpmovq2m %zmm2, %k0
; SKX-NEXT: kmovb %k0, %eax
; SKX-NEXT: retq
%x = load i1 , i1 * %addr, align 128
@ -1217,8 +1217,8 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32>
; SKX-NEXT: vpmovm2w %k1, %zmm0
; SKX-NEXT: vpmovm2w %k0, %zmm1
; SKX-NEXT: vmovdqu16 {{.*#+}} zmm2 = [0,1,2,3,32,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
; SKX-NEXT: vpermt2w %zmm1, %zmm2, %zmm0
; SKX-NEXT: vpmovw2m %zmm0, %k0
; SKX-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
; SKX-NEXT: vpmovw2m %zmm2, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: retq
%cmp_res_i1 = icmp ult i32 %a, %b
@ -1249,14 +1249,14 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y)
; KNL-NEXT: kmovw %eax, %k2
; KNL-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} {z}
; KNL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,8,2,3,4,5,6,7]
; KNL-NEXT: vpermt2q %zmm2, %zmm4, %zmm3
; KNL-NEXT: vpsllq $63, %zmm3, %zmm2
; KNL-NEXT: vpermi2q %zmm2, %zmm3, %zmm4
; KNL-NEXT: vpsllq $63, %zmm4, %zmm2
; KNL-NEXT: vptestmq %zmm2, %zmm2, %k2
; KNL-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} {z}
; KNL-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} {z}
; KNL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,8,3,4,5,6,7]
; KNL-NEXT: vpermt2q %zmm3, %zmm4, %zmm2
; KNL-NEXT: vpsllq $63, %zmm2, %zmm2
; KNL-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
; KNL-NEXT: vpsllq $63, %zmm4, %zmm2
; KNL-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} {z}
; KNL-NEXT: vpextrd $3, %xmm0, %eax
@ -1264,8 +1264,8 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y)
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
; KNL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,8,4,5,6,7]
; KNL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
; KNL-NEXT: vpsllq $63, %zmm2, %zmm0
; KNL-NEXT: vpermi2q %zmm0, %zmm2, %zmm1
; KNL-NEXT: vpsllq $63, %zmm1, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: retq
@ -1310,8 +1310,8 @@ define i8 @test_iinsertelement_v2i1(i32 %a, i32 %b, <2 x i64> %x , <2 x i64> %y)
; KNL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} {z}
; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
; KNL-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
; KNL-NEXT: vpsllq $63, %zmm1, %zmm0
; KNL-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
; KNL-NEXT: vpsllq $63, %zmm2, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: retq


@ -643,8 +643,8 @@ define <8 x i1> @test18(i8 %a, i16 %y) {
; KNL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} {z}
; KNL-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} {z}
; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,8,7]
; KNL-NEXT: vpermt2q %zmm2, %zmm3, %zmm1
; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
; KNL-NEXT: vpsllq $63, %zmm3, %zmm1
; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL-NEXT: kshiftlw $1, %k1, %k1
; KNL-NEXT: kshiftrw $1, %k1, %k1
@ -665,8 +665,8 @@ define <8 x i1> @test18(i8 %a, i16 %y) {
; SKX-NEXT: vpmovm2q %k0, %zmm0
; SKX-NEXT: vpmovm2q %k1, %zmm1
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7]
; SKX-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; SKX-NEXT: vpmovq2m %zmm0, %k0
; SKX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; SKX-NEXT: vpmovq2m %zmm2, %k0
; SKX-NEXT: kshiftlb $1, %k0, %k0
; SKX-NEXT: kshiftrb $1, %k0, %k0
; SKX-NEXT: kshiftlb $7, %k2, %k1


@ -8,8 +8,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32>, <16 x i32>
define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_d_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermi2d (%rdi), %zmm0, %zmm1
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: vpermt2d (%rdi), %zmm1, %zmm0
; CHECK-NEXT: retq
%x2 = load <16 x i32>, <16 x i32>* %x2p
%res = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
@ -21,8 +20,7 @@ declare <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double>, <8 x
define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1
; CHECK-NEXT: vmovapd %zmm1, %zmm0
; CHECK-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1)
ret <8 x double> %res
@ -33,8 +31,7 @@ declare <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float>, <16 x
define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1)
ret <16 x float> %res
@ -45,8 +42,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64>, <8 x i64>, <
define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_q_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: vpermt2q %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
ret <8 x i64> %res
@ -58,8 +54,7 @@ define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 {%k1} {z}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: vpermi2d (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%x2 = load <16 x i32>, <16 x i32>* %x2p
%res = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
@ -72,8 +67,7 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_pd_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %esi, %k1
; CHECK-NEXT: vpermt2pd (%rdi){1to8}, %zmm0, %zmm1 {%k1} {z}
; CHECK-NEXT: vmovapd %zmm1, %zmm0
; CHECK-NEXT: vpermi2pd (%rdi){1to8}, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%x2s = load double, double* %x2ptr
%x2ins = insertelement <8 x double> undef, double %x2s, i32 0
@ -88,8 +82,7 @@ define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0,
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_ps_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermt2ps %zmm2, %zmm0, %zmm1 {%k1} {z}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
ret <16 x float> %res
@ -102,8 +95,7 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 {%k1} {z}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: vpermi2q %zmm2, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
ret <8 x i64> %res
@ -114,8 +106,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32>, <16 x i32>
define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_d_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermt2d %zmm2, %zmm0, %zmm1
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: vpermi2d %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
ret <16 x i32> %res
@ -126,8 +117,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32>, <4 x i32>, <
define <4 x i32>@test_int_x86_avx512_mask_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_d_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermt2d %xmm2, %xmm0, %xmm1
; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0
; CHECK-NEXT: vpermi2d %xmm2, %xmm1, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
ret <4 x i32> %res
@ -139,8 +129,7 @@ define <4 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_128(<4 x i32> %x0, <4 x
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vpermt2d %xmm2, %xmm0, %xmm1 {%k1} {z}
; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0
; CHECK-NEXT: vpermi2d %xmm2, %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
ret <4 x i32> %res
@ -150,8 +139,7 @@ define <4 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_128_broadcast(<4 x i32>
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_128_broadcast:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %esi, %k1
; CHECK-NEXT: vpermt2d (%rdi){1to4}, %xmm0, %xmm1 {%k1} {z}
; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0
; CHECK-NEXT: vpermi2d (%rdi){1to4}, %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%x2s = load i32, i32* %x2ptr
%x2ins = insertelement <4 x i32> undef, i32 %x2s, i32 0
@ -165,8 +153,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32>, <8 x i32>, <
define <8 x i32>@test_int_x86_avx512_mask_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_d_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermt2d %ymm2, %ymm0, %ymm1
; CHECK-NEXT: vmovdqa64 %ymm1, %ymm0
; CHECK-NEXT: vpermi2d %ymm2, %ymm1, %ymm0
; CHECK-NEXT: retq
%res = call <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
ret <8 x i32> %res
@ -178,8 +165,7 @@ define <8 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_256(<8 x i32> %x0, <8 x
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vpermt2d %ymm2, %ymm0, %ymm1 {%k1} {z}
; CHECK-NEXT: vmovdqa64 %ymm1, %ymm0
; CHECK-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
ret <8 x i32> %res
@ -190,8 +176,7 @@ declare <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double>, <2 x
define <2 x double>@test_int_x86_avx512_mask_vpermi2var_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermi2pd %xmm2, %xmm0, %xmm1
; CHECK-NEXT: vmovapd %xmm1, %xmm0
; CHECK-NEXT: vpermt2pd %xmm2, %xmm1, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 -1)
ret <2 x double> %res
@ -202,8 +187,7 @@ declare <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double>, <4 x
define <4 x double>@test_int_x86_avx512_mask_vpermi2var_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1
; CHECK-NEXT: vmovapd %ymm1, %ymm0
; CHECK-NEXT: vpermt2pd %ymm2, %ymm1, %ymm0
; CHECK-NEXT: retq
%res = call <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1)
ret <4 x double> %res
@ -214,8 +198,7 @@ declare <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float>, <4 x i3
define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: vpermt2ps %xmm2, %xmm1, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 -1)
ret <4 x float> %res
@ -226,8 +209,7 @@ declare <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float>, <8 x i3
define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermi2ps %ymm2, %ymm0, %ymm1
; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: vpermt2ps %ymm2, %ymm1, %ymm0
; CHECK-NEXT: retq
%res = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1)
ret <8 x float> %res
@ -236,8 +218,7 @@ define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256(<8 x float> %x0, <
define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256_load(<8 x float> %x0, <8 x i32> %x1, <8 x float>* %x2p) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_256_load:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermi2ps (%rdi), %ymm0, %ymm1
; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: vpermt2ps (%rdi), %ymm1, %ymm0
; CHECK-NEXT: retq
%x2 = load <8 x float>, <8 x float>* %x2p
%res = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1)
@ -247,8 +228,7 @@ define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256_load(<8 x float> %
define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256_broadcast(<8 x float> %x0, <8 x i32> %x1, float* %x2ptr) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_256_broadcast:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermi2ps (%rdi){1to8}, %ymm0, %ymm1
; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: vpermt2ps (%rdi){1to8}, %ymm1, %ymm0
; CHECK-NEXT: retq
%x2s = load float, float* %x2ptr
%x2ins = insertelement <8 x float> undef, float %x2s, i32 0
@ -262,8 +242,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8>, <16 x i8>,
define <16 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermi2b %xmm2, %xmm0, %xmm1
; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0
; CHECK-NEXT: vpermt2b %xmm2, %xmm1, %xmm0
; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
ret <16 x i8> %res
@ -274,8 +253,7 @@ declare <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8>, <32 x i8>,
define <32 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermi2b %ymm2, %ymm0, %ymm1
; CHECK-NEXT: vmovdqa64 %ymm1, %ymm0
; CHECK-NEXT: vpermt2b %ymm2, %ymm1, %ymm0
; CHECK-NEXT: retq
%res = call <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
ret <32 x i8> %res
@ -286,8 +264,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8>, <16 x i8>,
define <16 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermt2b %xmm2, %xmm0, %xmm1
; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0
; CHECK-NEXT: vpermi2b %xmm2, %xmm1, %xmm0
; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
ret <16 x i8> %res
@ -296,8 +273,7 @@ define <16 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_128(<16 x i8> %x0, <16 x
define <16 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_128_load(<16 x i8> %x0, <16 x i8> %x1, <16 x i8>* %x2p) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_128_load:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermt2b (%rdi), %xmm0, %xmm1
; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0
; CHECK-NEXT: vpermi2b (%rdi), %xmm1, %xmm0
; CHECK-NEXT: retq
%x2 = load <16 x i8>, <16 x i8>* %x2p
%res = call <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
@ -309,8 +285,7 @@ declare <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8>, <32 x i8>,
define <32 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpermt2b %ymm2, %ymm0, %ymm1
; CHECK-NEXT: vmovdqa64 %ymm1, %ymm0
; CHECK-NEXT: vpermi2b %ymm2, %ymm1, %ymm0
; CHECK-NEXT: retq
%res = call <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
ret <32 x i8> %res
@ -322,8 +297,7 @@ define <16 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_128(<16 x i8> %x0, <16
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermt2b %xmm2, %xmm0, %xmm1 {%k1} {z}
; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0
; CHECK-NEXT: vpermi2b %xmm2, %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
ret <16 x i8> %res
@ -333,8 +307,7 @@ define <16 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_128_load(<16 x i8> %x0,
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_128_load:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpermt2b (%rdi), %xmm0, %xmm1 {%k1} {z}
; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0
; CHECK-NEXT: vpermi2b (%rdi), %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%x2 = load <16 x i8>, <16 x i8>* %x2p
%res = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
@ -347,8 +320,7 @@ define <32 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_256(<32 x i8> %x0, <32
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vpermt2b %ymm2, %ymm0, %ymm1 {%k1} {z}
; CHECK-NEXT: vmovdqa64 %ymm1, %ymm0
; CHECK-NEXT: vpermi2b %ymm2, %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <32 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
ret <32 x i8> %res
@ -358,8 +330,7 @@ define <32 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_256_load(<32 x i8> %x0,
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_256_load:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpermt2b (%rdi), %ymm0, %ymm1 {%k1} {z}
; CHECK-NEXT: vmovdqa64 %ymm1, %ymm0
; CHECK-NEXT: vpermi2b (%rdi), %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%x2 = load <32 x i8>, <32 x i8>* %x2p
%res = call <32 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)


@ -90,8 +90,7 @@ define <64 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_512(<64 x i8> %x0, <64
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovq %rdi, %k1
; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm1 {%k1} {z}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: vpermi2b %zmm2, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <64 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
ret <64 x i8> %res


@ -175,8 +175,7 @@ define <16 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_128(<16 x i8> %x0, <16
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpermt2b %xmm2, %xmm0, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7d,0xca]
; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x6f,0xc1]
; CHECK-NEXT: vpermi2b %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x89,0x75,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
ret <16 x i8> %res
@ -188,8 +187,7 @@ define <32 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_256(<32 x i8> %x0, <32
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpermt2b %ymm2, %ymm0, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7d,0xca]
; CHECK-NEXT: vmovdqa64 %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x6f,0xc1]
; CHECK-NEXT: vpermi2b %ymm2, %ymm1, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xa9,0x75,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
ret <32 x i8> %res


@ -138,19 +138,19 @@ define <8 x double> @merge_8f64_f64_12zzuuzz(double* %ptr) nounwind uwtable noin
define <8 x double> @merge_8f64_f64_1u3u5zu8(double* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8f64_f64_1u3u5zu8:
; ALL: # BB#0:
; ALL-NEXT: vmovupd 8(%rdi), %zmm0
; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1
; ALL-NEXT: vmovapd {{.*#+}} zmm2 = <0,u,2,u,4,13,u,7>
; ALL-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0
; ALL-NEXT: vmovupd 8(%rdi), %zmm1
; ALL-NEXT: vpxord %zmm2, %zmm2, %zmm2
; ALL-NEXT: vmovapd {{.*#+}} zmm0 = <0,u,2,u,4,13,u,7>
; ALL-NEXT: vpermi2pd %zmm2, %zmm1, %zmm0
; ALL-NEXT: retq
;
; X32-AVX512F-LABEL: merge_8f64_f64_1u3u5zu8:
; X32-AVX512F: # BB#0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vmovupd 8(%eax), %zmm0
; X32-AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
; X32-AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = <0,0,u,u,2,0,u,u,4,0,13,0,u,u,7,0>
; X32-AVX512F-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0
; X32-AVX512F-NEXT: vmovupd 8(%eax), %zmm1
; X32-AVX512F-NEXT: vpxord %zmm2, %zmm2, %zmm2
; X32-AVX512F-NEXT: vmovapd {{.*#+}} zmm0 = <0,0,u,u,2,0,u,u,4,0,13,0,u,u,7,0>
; X32-AVX512F-NEXT: vpermi2pd %zmm2, %zmm1, %zmm0
; X32-AVX512F-NEXT: retl
%ptr0 = getelementptr inbounds double, double* %ptr, i64 1
%ptr2 = getelementptr inbounds double, double* %ptr, i64 3
@ -225,19 +225,19 @@ define <8 x i64> @merge_8i64_i64_56zz9uzz(i64* %ptr) nounwind uwtable noinline s
define <8 x i64> @merge_8i64_i64_1u3u5zu8(i64* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8i64_i64_1u3u5zu8:
; ALL: # BB#0:
; ALL-NEXT: vmovdqu64 8(%rdi), %zmm0
; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1
; ALL-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,u,2,u,4,13,u,7>
; ALL-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; ALL-NEXT: vmovdqu64 8(%rdi), %zmm1
; ALL-NEXT: vpxord %zmm2, %zmm2, %zmm2
; ALL-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,u,2,u,4,13,u,7>
; ALL-NEXT: vpermi2q %zmm2, %zmm1, %zmm0
; ALL-NEXT: retq
;
; X32-AVX512F-LABEL: merge_8i64_i64_1u3u5zu8:
; X32-AVX512F: # BB#0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vmovdqu64 8(%eax), %zmm0
; X32-AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
; X32-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,0,u,u,2,0,u,u,4,0,13,0,u,u,7,0>
; X32-AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; X32-AVX512F-NEXT: vmovdqu64 8(%eax), %zmm1
; X32-AVX512F-NEXT: vpxord %zmm2, %zmm2, %zmm2
; X32-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,0,u,u,2,0,u,u,4,0,13,0,u,u,7,0>
; X32-AVX512F-NEXT: vpermi2q %zmm2, %zmm1, %zmm0
; X32-AVX512F-NEXT: retl
%ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
%ptr2 = getelementptr inbounds i64, i64* %ptr, i64 3
@ -334,19 +334,19 @@ define <16 x float> @merge_16f32_f32_0uu3uuuuuuuuCuEF(float* %ptr) nounwind uwta
define <16 x float> @merge_16f32_f32_0uu3zzuuuuuzCuEF(float* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
; ALL: # BB#0:
; ALL-NEXT: vmovups (%rdi), %zmm0
; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1
; ALL-NEXT: vmovaps {{.*#+}} zmm2 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
; ALL-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0
; ALL-NEXT: vmovups (%rdi), %zmm1
; ALL-NEXT: vpxord %zmm2, %zmm2, %zmm2
; ALL-NEXT: vmovaps {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
; ALL-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0
; ALL-NEXT: retq
;
; X32-AVX512F-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
; X32-AVX512F: # BB#0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vmovups (%eax), %zmm0
; X32-AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
; X32-AVX512F-NEXT: vmovaps {{.*#+}} zmm2 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
; X32-AVX512F-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0
; X32-AVX512F-NEXT: vmovups (%eax), %zmm1
; X32-AVX512F-NEXT: vpxord %zmm2, %zmm2, %zmm2
; X32-AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
; X32-AVX512F-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0
; X32-AVX512F-NEXT: retl
%ptr0 = getelementptr inbounds float, float* %ptr, i64 0
%ptr3 = getelementptr inbounds float, float* %ptr, i64 3
@ -448,19 +448,19 @@ define <16 x i32> @merge_16i32_i32_0uu3uuuuuuuuCuEF(i32* %ptr) nounwind uwtable
define <16 x i32> @merge_16i32_i32_0uu3zzuuuuuzCuEF(i32* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
; ALL: # BB#0:
; ALL-NEXT: vmovdqu32 (%rdi), %zmm0
; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1
; ALL-NEXT: vmovdqa32 {{.*#+}} zmm2 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
; ALL-NEXT: vpermt2d %zmm1, %zmm2, %zmm0
; ALL-NEXT: vmovdqu32 (%rdi), %zmm1
; ALL-NEXT: vpxord %zmm2, %zmm2, %zmm2
; ALL-NEXT: vmovdqa32 {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
; ALL-NEXT: vpermi2d %zmm2, %zmm1, %zmm0
; ALL-NEXT: retq
;
; X32-AVX512F-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
; X32-AVX512F: # BB#0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vmovdqu32 (%eax), %zmm0
; X32-AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
; X32-AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm2 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
; X32-AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm0
; X32-AVX512F-NEXT: vmovdqu32 (%eax), %zmm1
; X32-AVX512F-NEXT: vpxord %zmm2, %zmm2, %zmm2
; X32-AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
; X32-AVX512F-NEXT: vpermi2d %zmm2, %zmm1, %zmm0
; X32-AVX512F-NEXT: retl
%ptr0 = getelementptr inbounds i32, i32* %ptr, i64 0
%ptr3 = getelementptr inbounds i32, i32* %ptr, i64 3


@ -877,8 +877,8 @@ define <16 x i16> @shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_1
; AVX512VL-LABEL: shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu16 {{.*#+}} ymm2 = [0,0,0,0,20,21,22,23,8,8,8,8,28,29,30,31]
; AVX512VL-NEXT: vpermt2w %ymm0, %ymm2, %ymm1
; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0
; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa64 %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 4, i32 5, i32 6, i32 7, i32 24, i32 24, i32 24, i32 24, i32 12, i32 13, i32 14, i32 15>
ret <16 x i16> %shuffle
@ -910,8 +910,8 @@ define <16 x i16> @shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_1
; AVX512VL-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu16 {{.*#+}} ymm2 = [3,2,1,0,23,22,21,20,11,10,9,8,31,30,29,28]
; AVX512VL-NEXT: vpermt2w %ymm0, %ymm2, %ymm1
; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0
; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa64 %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 19, i32 18, i32 17, i32 16, i32 7, i32 6, i32 5, i32 4, i32 27, i32 26, i32 25, i32 24, i32 15, i32 14, i32 13, i32 12>
ret <16 x i16> %shuffle
@ -941,8 +941,8 @@ define <16 x i16> @shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_0
; AVX512VL-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu16 {{.*#+}} ymm2 = [3,2,1,0,19,18,17,16,11,10,9,8,27,26,25,24]
; AVX512VL-NEXT: vpermt2w %ymm0, %ymm2, %ymm1
; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0
; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa64 %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 19, i32 18, i32 17, i32 16, i32 3, i32 2, i32 1, i32 0, i32 27, i32 26, i32 25, i32 24, i32 11, i32 10, i32 9, i32 8>
ret <16 x i16> %shuffle
@ -3279,8 +3279,8 @@ define <16 x i16> @shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_1
; AVX512VL-LABEL: shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_11:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu16 {{.*#+}} ymm2 = [0,16,1,17,2,18,3,27,8,24,9,25,10,26,11,27]
; AVX512VL-NEXT: vpermt2w %ymm0, %ymm2, %ymm1
; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0
; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa64 %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 0, i32 17, i32 1, i32 18, i32 2, i32 19, i32 11, i32 24, i32 8, i32 25, i32 9, i32 26, i32 10, i32 27, i32 11>
ret <16 x i16> %shuffle
@ -3313,8 +3313,8 @@ define <16 x i16> @shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_1
; AVX512VL-LABEL: shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_15:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu16 {{.*#+}} ymm2 = [4,20,5,21,6,22,7,31,12,28,13,29,14,30,15,31]
; AVX512VL-NEXT: vpermt2w %ymm0, %ymm2, %ymm1
; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0
; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa64 %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 20, i32 4, i32 21, i32 5, i32 22, i32 6, i32 23, i32 15, i32 28, i32 12, i32 29, i32 13, i32 30, i32 14, i32 31, i32 15>
ret <16 x i16> %shuffle
@ -3476,8 +3476,8 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_uu_20_21_22_11_uu_uu_uu_uu_28_29_30_1
; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_uu_20_21_22_11_uu_uu_uu_uu_28_29_30_11:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu16 {{.*#+}} ymm2 = <u,u,u,u,4,5,6,27,u,u,u,u,12,13,14,27>
; AVX512VL-NEXT: vpermt2w %ymm0, %ymm2, %ymm1
; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0
; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa64 %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 20, i32 21, i32 22, i32 11, i32 undef, i32 undef, i32 undef, i32 undef, i32 28, i32 29, i32 30, i32 11>
ret <16 x i16> %shuffle
@ -3504,8 +3504,8 @@ define <16 x i16> @shuffle_v16i16_20_21_22_03_uu_uu_uu_uu_28_29_30_11_uu_uu_uu_u
; AVX512VL-LABEL: shuffle_v16i16_20_21_22_03_uu_uu_uu_uu_28_29_30_11_uu_uu_uu_uu:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu16 {{.*#+}} ymm2 = <4,5,6,19,u,u,u,u,12,13,14,27,u,u,u,u>
; AVX512VL-NEXT: vpermt2w %ymm0, %ymm2, %ymm1
; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0
; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa64 %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 20, i32 21, i32 22, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 28, i32 29, i32 30, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i16> %shuffle
@ -3628,8 +3628,8 @@ define <16 x i16> @shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_u
; AVX512VL-LABEL: shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_uu:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu16 {{.*#+}} ymm2 = <u,u,20,u,0,2,4,u,u,u,28,u,8,10,12,u>
; AVX512VL-NEXT: vpermt2w %ymm0, %ymm2, %ymm1
; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0
; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa64 %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 4, i32 undef, i32 16, i32 18, i32 20, i32 undef, i32 undef, i32 undef, i32 12, i32 undef, i32 24, i32 26, i32 28, i32 undef>
ret <16 x i16> %shuffle
@ -3774,8 +3774,8 @@ define <16 x i16> @shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_1
; AVX512VL-LABEL: shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu16 {{.*#+}} ymm2 = [3,4,5,6,7,16,17,26,11,12,13,14,15,24,25,26]
; AVX512VL-NEXT: vpermt2w %ymm0, %ymm2, %ymm1
; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0
; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa64 %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 10, i32 27, i32 28, i32 29, i32 30, i32 31, i32 8, i32 9, i32 10>
ret <16 x i16> %shuffle
@ -3938,8 +3938,8 @@ define <16 x i16> @shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_2
; AVX512VL-LABEL: shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu16 {{.*#+}} ymm2 = [21,22,23,0,1,2,3,12,29,30,31,8,9,10,11,12]
; AVX512VL-NEXT: vpermt2w %ymm0, %ymm2, %ymm1
; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0
; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa64 %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 28, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28>
ret <16 x i16> %shuffle
@ -3986,8 +3986,8 @@ define <16 x i16> @shuffle_v16i16_23_uu_03_uu_20_20_05_uu_31_uu_11_uu_28_28_13_u
; AVX512VL-LABEL: shuffle_v16i16_23_uu_03_uu_20_20_05_uu_31_uu_11_uu_28_28_13_uu:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu16 {{.*#+}} ymm2 = <7,u,19,u,4,4,21,u,15,u,27,u,12,12,29,u>
; AVX512VL-NEXT: vpermt2w %ymm0, %ymm2, %ymm1
; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0
; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa64 %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 23, i32 undef, i32 3, i32 undef, i32 20, i32 20, i32 5, i32 undef, i32 31, i32 undef, i32 11, i32 undef, i32 28, i32 28, i32 13, i32 undef>
ret <16 x i16> %shuffle
@ -4167,8 +4167,8 @@ define <16 x i16> @shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_2
; AVX512VL-LABEL: shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_25:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu16 {{.*#+}} ymm2 = [2,18,3,19,0,16,1,17,10,26,11,27,8,24,9,25]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,2,1,3]
; AVX512VL-NEXT: retq
%1 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 0, i32 16, i32 1, i32 17, i32 10, i32 26, i32 11, i32 27, i32 8, i32 24, i32 9, i32 25>
%2 = bitcast <16 x i16> %1 to <4 x i64>
@ -4257,8 +4257,8 @@ define <16 x i16> @PR24935(<16 x i16> %a, <16 x i16> %b) {
; AVX512VL-LABEL: PR24935:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu16 {{.*#+}} ymm2 = [11,10,17,13,10,7,27,0,17,25,0,12,29,20,16,8]
; AVX512VL-NEXT: vpermt2w %ymm0, %ymm2, %ymm1
; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0
; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa64 %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 27, i32 26, i32 1, i32 29, i32 26, i32 23, i32 11, i32 16, i32 1, i32 9, i32 16, i32 28, i32 13, i32 4, i32 0, i32 24>
ret <16 x i16> %shuffle


@ -312,10 +312,9 @@ define <8 x float> @shuffle_v8f32_08991abb(<8 x float> %a, <8 x float> %b) {
;
; AVX512VL-LABEL: shuffle_v8f32_08991abb:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [8,0,1,1,10,2,3,3]
; AVX512VL-NEXT: vpermt2ps %ymm0, %ymm2, %ymm1
; AVX512VL-NEXT: vmovaps %ymm1, %ymm0
; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3]
; AVX512VL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,1,1,10,2,3,3]
; AVX512VL-NEXT: vpermi2ps %ymm2, %ymm1, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
ret <8 x float> %shuffle
@ -675,8 +674,8 @@ define <8 x float> @shuffle_v8f32_c348cda0(<8 x float> %a, <8 x float> %b) {
; AVX512VL-LABEL: shuffle_v8f32_c348cda0:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [4,11,12,0,4,5,2,8]
; AVX512VL-NEXT: vpermt2ps %ymm0, %ymm2, %ymm1
; AVX512VL-NEXT: vmovaps %ymm1, %ymm0
; AVX512VL-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovaps %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 12, i32 3, i32 4, i32 8, i32 12, i32 13, i32 10, i32 0>
ret <8 x float> %shuffle
@ -1316,9 +1315,9 @@ define <8 x i32> @shuffle_v8i32_08192a3b(<8 x i32> %a, <8 x i32> %b) {
;
; AVX512VL-LABEL: shuffle_v8i32_08192a3b:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512VL-NEXT: vmovdqa32 {{.*#+}} ymm2 = [0,8,2,9,4,10,6,11]
; AVX512VL-NEXT: vpermt2d %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512VL-NEXT: vmovdqa32 {{.*#+}} ymm0 = [0,8,2,9,4,10,6,11]
; AVX512VL-NEXT: vpermi2d %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
ret <8 x i32> %shuffle
@ -1345,10 +1344,9 @@ define <8 x i32> @shuffle_v8i32_08991abb(<8 x i32> %a, <8 x i32> %b) {
;
; AVX512VL-LABEL: shuffle_v8i32_08991abb:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX512VL-NEXT: vmovdqa32 {{.*#+}} ymm2 = [8,0,1,1,10,2,3,3]
; AVX512VL-NEXT: vpermt2d %ymm0, %ymm2, %ymm1
; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0
; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; AVX512VL-NEXT: vmovdqa32 {{.*#+}} ymm0 = [8,0,1,1,10,2,3,3]
; AVX512VL-NEXT: vpermi2d %ymm2, %ymm1, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
ret <8 x i32> %shuffle
@ -1992,8 +1990,8 @@ define <8 x i32> @shuffle_v8i32_6caa87e5(<8 x i32> %a, <8 x i32> %b) {
; AVX512VL-LABEL: shuffle_v8i32_6caa87e5:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa32 {{.*#+}} ymm2 = [14,4,2,2,0,15,6,13]
; AVX512VL-NEXT: vpermt2d %ymm0, %ymm2, %ymm1
; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0
; AVX512VL-NEXT: vpermi2d %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa64 %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 6, i32 12, i32 10, i32 10, i32 8, i32 7, i32 14, i32 5>
ret <8 x i32> %shuffle


@ -262,15 +262,15 @@ define <8 x double> @shuffle_v8f64_8823cc67(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_8823cc67:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [0,0,10,11,4,4,14,15]
; AVX512F-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
; AVX512F-NEXT: vmovapd %zmm1, %zmm0
; AVX512F-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vmovapd %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_8823cc67:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [0,0,0,0,10,0,11,0,4,0,4,0,14,0,15,0]
; AVX512F-32-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
; AVX512F-32-NEXT: vmovapd %zmm1, %zmm0
; AVX512F-32-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2
; AVX512F-32-NEXT: vmovapd %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7>
ret <8 x double> %shuffle
@ -281,15 +281,15 @@ define <8 x double> @shuffle_v8f64_9832dc76(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_9832dc76:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [1,0,11,10,5,4,15,14]
; AVX512F-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
; AVX512F-NEXT: vmovapd %zmm1, %zmm0
; AVX512F-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vmovapd %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_9832dc76:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [1,0,0,0,11,0,10,0,5,0,4,0,15,0,14,0]
; AVX512F-32-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
; AVX512F-32-NEXT: vmovapd %zmm1, %zmm0
; AVX512F-32-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2
; AVX512F-32-NEXT: vmovapd %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
ret <8 x double> %shuffle
@ -300,15 +300,15 @@ define <8 x double> @shuffle_v8f64_9810dc54(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_9810dc54:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [1,0,9,8,5,4,13,12]
; AVX512F-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
; AVX512F-NEXT: vmovapd %zmm1, %zmm0
; AVX512F-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vmovapd %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_9810dc54:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [1,0,0,0,9,0,8,0,5,0,4,0,13,0,12,0]
; AVX512F-32-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
; AVX512F-32-NEXT: vmovapd %zmm1, %zmm0
; AVX512F-32-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2
; AVX512F-32-NEXT: vmovapd %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
ret <8 x double> %shuffle
@ -370,15 +370,15 @@ define <8 x double> @shuffle_v8f64_08991abb(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_08991abb:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [8,0,1,1,9,2,3,3]
; AVX512F-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
; AVX512F-NEXT: vmovapd %zmm1, %zmm0
; AVX512F-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vmovapd %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_08991abb:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [8,0,0,0,1,0,1,0,9,0,2,0,3,0,3,0]
; AVX512F-32-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
; AVX512F-32-NEXT: vmovapd %zmm1, %zmm0
; AVX512F-32-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2
; AVX512F-32-NEXT: vmovapd %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
ret <8 x double> %shuffle
@ -406,15 +406,15 @@ define <8 x double> @shuffle_v8f64_09ab1def(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_09ab1def:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [8,1,2,3,9,5,6,7]
; AVX512F-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
; AVX512F-NEXT: vmovapd %zmm1, %zmm0
; AVX512F-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vmovapd %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_09ab1def:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [8,0,1,0,2,0,3,0,9,0,5,0,6,0,7,0]
; AVX512F-32-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
; AVX512F-32-NEXT: vmovapd %zmm1, %zmm0
; AVX512F-32-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2
; AVX512F-32-NEXT: vmovapd %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
ret <8 x double> %shuffle
@ -927,15 +927,15 @@ define <8 x double> @shuffle_v8f64_c348cda0(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_c348cda0:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [4,11,12,0,4,5,2,8]
; AVX512F-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
; AVX512F-NEXT: vmovapd %zmm1, %zmm0
; AVX512F-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vmovapd %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_c348cda0:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [4,0,11,0,12,0,0,0,4,0,5,0,2,0,8,0]
; AVX512F-32-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
; AVX512F-32-NEXT: vmovapd %zmm1, %zmm0
; AVX512F-32-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2
; AVX512F-32-NEXT: vmovapd %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 12, i32 3, i32 4, i32 8, i32 12, i32 13, i32 10, i32 0>
ret <8 x double> %shuffle
@ -1180,15 +1180,15 @@ define <8 x i64> @shuffle_v8i64_81a3c5e7(<8 x i64> %a, <8 x i64> %b) {
; AVX512F-LABEL: shuffle_v8i64_81a3c5e7:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,9,2,11,4,13,6,15]
; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_81a3c5e7:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,9,0,2,0,11,0,4,0,13,0,6,0,15,0]
; AVX512F-32-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
ret <8 x i64> %shuffle
@ -1233,15 +1233,15 @@ define <8 x i64> @shuffle_v8i64_8823cc67(<8 x i64> %a, <8 x i64> %b) {
; AVX512F-LABEL: shuffle_v8i64_8823cc67:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,10,11,4,4,14,15]
; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_8823cc67:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,10,0,11,0,4,0,4,0,14,0,15,0]
; AVX512F-32-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7>
ret <8 x i64> %shuffle
@ -1252,15 +1252,15 @@ define <8 x i64> @shuffle_v8i64_9832dc76(<8 x i64> %a, <8 x i64> %b) {
; AVX512F-LABEL: shuffle_v8i64_9832dc76:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,11,10,5,4,15,14]
; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_9832dc76:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,0,0,11,0,10,0,5,0,4,0,15,0,14,0]
; AVX512F-32-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
ret <8 x i64> %shuffle
@ -1271,15 +1271,15 @@ define <8 x i64> @shuffle_v8i64_9810dc54(<8 x i64> %a, <8 x i64> %b) {
; AVX512F-LABEL: shuffle_v8i64_9810dc54:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,9,8,5,4,13,12]
; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_9810dc54:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,0,0,9,0,8,0,5,0,4,0,13,0,12,0]
; AVX512F-32-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
ret <8 x i64> %shuffle
@ -1341,15 +1341,15 @@ define <8 x i64> @shuffle_v8i64_08991abb(<8 x i64> %a, <8 x i64> %b) {
; AVX512F-LABEL: shuffle_v8i64_08991abb:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,0,1,1,9,2,3,3]
; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_08991abb:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,0,0,0,1,0,1,0,9,0,2,0,3,0,3,0]
; AVX512F-32-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
ret <8 x i64> %shuffle
@ -1377,15 +1377,15 @@ define <8 x i64> @shuffle_v8i64_09ab1def(<8 x i64> %a, <8 x i64> %b) {
; AVX512F-LABEL: shuffle_v8i64_09ab1def:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,1,2,3,9,5,6,7]
; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_09ab1def:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,0,1,0,2,0,3,0,9,0,5,0,6,0,7,0]
; AVX512F-32-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
ret <8 x i64> %shuffle
@ -1914,15 +1914,15 @@ define <8 x i64> @shuffle_v8i64_6caa87e5(<8 x i64> %a, <8 x i64> %b) {
; AVX512F-LABEL: shuffle_v8i64_6caa87e5:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,4,2,2,0,15,6,13]
; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_6caa87e5:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,0,4,0,2,0,2,0,0,0,15,0,6,0,13,0]
; AVX512F-32-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 6, i32 12, i32 10, i32 10, i32 8, i32 7, i32 14, i32 5>
ret <8 x i64> %shuffle
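All of the register-form rewrites above are instances of one identity: swapping the index operand with the tied table operand while flipping VPERMT2 <-> VPERMI2 reads the same two tables with the same indices. A minimal sketch of that identity at the intrinsic level (hypothetical function name, not a test from this commit; with an all-ones mask the two calls should be interchangeable):

; Hedged sketch: %d should fold to zero, since both calls permute the
; concatenation of %t0 and %t1 by %idx; only which operand is tied differs.
define <8 x i64> @sketch_commute_identity(<8 x i64> %idx, <8 x i64> %t0, <8 x i64> %t1) {
  %t = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> %idx, <8 x i64> %t0, <8 x i64> %t1, i8 -1)
  %i = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %t0, <8 x i64> %idx, <8 x i64> %t1, i8 -1)
  %d = xor <8 x i64> %t, %i
  ret <8 x i64> %d
}
declare <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
declare <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)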


@ -123,18 +123,18 @@ define <8 x double> @combine_vpermt2var_8f64_identity_mask(<8 x double> %x0, <8
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovd %eax, %k1
; X32-NEXT: vmovapd {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
; X32-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 {%k1} {z}
; X32-NEXT: vmovapd {{.*#+}} zmm1 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
; X32-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
; X32-NEXT: vpermi2pd %zmm1, %zmm0, %zmm2 {%k1} {z}
; X32-NEXT: vmovapd {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
; X32-NEXT: vpermi2pd %zmm2, %zmm2, %zmm0 {%k1} {z}
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_8f64_identity_mask:
; X64: # BB#0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovapd {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
; X64-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 {%k1} {z}
; X64-NEXT: vmovapd {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8]
; X64-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: vpermi2pd %zmm1, %zmm0, %zmm2 {%k1} {z}
; X64-NEXT: vmovapd {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
; X64-NEXT: vpermi2pd %zmm2, %zmm2, %zmm0 {%k1} {z}
; X64-NEXT: retq
%res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x0, <8 x double> %x1, i8 %m)
%res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, <8 x double> %res0, i8 %m)
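The {%k1} {z} hunks above exercise the kz forms: with zero-masking the masked-off lanes are zero no matter which operand is tied to the destination, so the same swap stays legal under a live mask. A minimal sketch (hypothetical test; the codegen comment is an assumption modeled on the updated checks above):

; Hedged sketch of the kz-form commute. Assumed codegen, by analogy with
; the checks above: vpermi2pd %zmm1, %zmm0, %zmm2 {%k1} {z}.
define <8 x double> @sketch_kz_commute(<8 x double> %t0, <8 x double> %t1, i8 %m) {
  %r = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %t0, <8 x double> %t1, i8 %m)
  ret <8 x double> %r
}
declare <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64>, <8 x double>, <8 x double>, i8)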
@ -190,9 +190,9 @@ define <8 x i64> @combine_vpermt2var_8i64_identity(<8 x i64> %x0, <8 x i64> %x1)
; X32-LABEL: combine_vpermt2var_8i64_identity:
; X32: # BB#0:
; X32-NEXT: vmovdqa64 {{.*#+}} zmm2 = <u,u,6,0,5,0,4,0,3,0,2,0,1,0,0,0>
; X32-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; X32-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,14,0,5,0,12,0,3,0,10,0,1,0,8,0>
; X32-NEXT: vpermt2q %zmm0, %zmm1, %zmm0
; X32-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; X32-NEXT: vmovdqa64 {{.*#+}} zmm0 = <u,u,14,0,5,0,12,0,3,0,10,0,1,0,8,0>
; X32-NEXT: vpermi2q %zmm2, %zmm2, %zmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_8i64_identity:
@ -208,18 +208,18 @@ define <8 x i64> @combine_vpermt2var_8i64_identity_mask(<8 x i64> %x0, <8 x i64>
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovd %eax, %k1
; X32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
; X32-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 {%k1} {z}
; X32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
; X32-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
; X32-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 {%k1} {z}
; X32-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
; X32-NEXT: vpermi2q %zmm2, %zmm2, %zmm0 {%k1} {z}
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_8i64_identity_mask:
; X64: # BB#0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
; X64-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 {%k1} {z}
; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8]
; X64-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 {%k1} {z}
; X64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
; X64-NEXT: vpermi2q %zmm2, %zmm2, %zmm0 {%k1} {z}
; X64-NEXT: retq
%res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x0, <8 x i64> %x1, i8 %m)
%res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, <8 x i64> %res0, i8 %m)
@ -243,18 +243,18 @@ define <16 x float> @combine_vpermt2var_16f32_identity_mask(<16 x float> %x0, <1
; X32: # BB#0:
; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: vmovaps {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X32-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 {%k1} {z}
; X32-NEXT: vmovaps {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X32-NEXT: vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
; X32-NEXT: vpermi2ps %zmm1, %zmm0, %zmm2 {%k1} {z}
; X32-NEXT: vmovaps {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X32-NEXT: vpermi2ps %zmm2, %zmm2, %zmm0 {%k1} {z}
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16f32_identity_mask:
; X64: # BB#0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovaps {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 {%k1} {z}
; X64-NEXT: vmovaps {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X64-NEXT: vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: vpermi2ps %zmm1, %zmm0, %zmm2 {%k1} {z}
; X64-NEXT: vmovaps {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X64-NEXT: vpermi2ps %zmm2, %zmm2, %zmm0 {%k1} {z}
; X64-NEXT: retq
%res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x float> %x0, <16 x float> %x1, i16 %m)
%res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x float> %res0, <16 x float> %res0, i16 %m)
@ -280,17 +280,17 @@ define <16 x float> @combine_vpermt2var_16f32_vmovddup_load(<16 x float> *%p0, <
; X32-LABEL: combine_vpermt2var_16f32_vmovddup_load:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovaps (%eax), %zmm1
; X32-NEXT: vmovaps {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X32-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
; X32-NEXT: vmovaps (%eax), %zmm2
; X32-NEXT: vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X32-NEXT: vpermi2ps %zmm0, %zmm2, %zmm1
; X32-NEXT: vmovaps %zmm1, %zmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovddup_load:
; X64: # BB#0:
; X64-NEXT: vmovaps (%rdi), %zmm1
; X64-NEXT: vmovaps {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X64-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
; X64-NEXT: vmovaps (%rdi), %zmm2
; X64-NEXT: vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X64-NEXT: vpermi2ps %zmm0, %zmm2, %zmm1
; X64-NEXT: vmovaps %zmm1, %zmm0
; X64-NEXT: retq
%x0 = load <16 x float>, <16 x float> *%p0
@ -319,18 +319,18 @@ define <16 x float> @combine_vpermt2var_16f32_vmovddup_mask_load(<16 x float> *%
; X32: # BB#0:
; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovaps (%eax), %zmm1
; X32-NEXT: vmovaps {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X32-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1 {%k1} {z}
; X32-NEXT: vmovaps (%eax), %zmm2
; X32-NEXT: vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X32-NEXT: vpermi2ps %zmm0, %zmm2, %zmm1 {%k1} {z}
; X32-NEXT: vmovaps %zmm1, %zmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovddup_mask_load:
; X64: # BB#0:
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vmovaps (%rdi), %zmm1
; X64-NEXT: vmovaps {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X64-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1 {%k1} {z}
; X64-NEXT: vmovaps (%rdi), %zmm2
; X64-NEXT: vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X64-NEXT: vpermi2ps %zmm0, %zmm2, %zmm1 {%k1} {z}
; X64-NEXT: vmovaps %zmm1, %zmm0
; X64-NEXT: retq
%x0 = load <16 x float>, <16 x float> *%p0
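The two load tests above show why the memory operand never moves: it is always $src3, so only the two register operands trade places and the loaded table simply materializes in a different register. The same reasoning lets the broadcast (rmb) forms commute; a hedged sketch follows (hypothetical test, and whether the splat actually folds to a {1to8} memory operand here is an assumption):

; Hedged sketch: the splat sits in the $src3 slot, so the commuted form
; should keep it as a broadcast memory operand (assumed codegen along the
; lines of: vpermi2pd (%rdi){1to8}, %zmm0, %zmm1).
define <8 x double> @sketch_broadcast_commute(double* %p, <8 x double> %t0) {
  %s = load double, double* %p
  %ins = insertelement <8 x double> undef, double %s, i32 0
  %bc = shufflevector <8 x double> %ins, <8 x double> undef, <8 x i32> zeroinitializer
  %r = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 8, i64 2, i64 8, i64 4, i64 8, i64 6, i64 8>, <8 x double> %t0, <8 x double> %bc, i8 -1)
  ret <8 x double> %r
}
declare <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64>, <8 x double>, <8 x double>, i8)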
@ -521,18 +521,18 @@ define <16 x i32> @combine_vpermt2var_16i32_identity_mask(<16 x i32> %x0, <16 x
; X32: # BB#0:
; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: vmovdqa32 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X32-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 {%k1} {z}
; X32-NEXT: vmovdqa32 {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X32-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z}
; X32-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 {%k1} {z}
; X32-NEXT: vmovdqa32 {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X32-NEXT: vpermi2d %zmm2, %zmm2, %zmm0 {%k1} {z}
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16i32_identity_mask:
; X64: # BB#0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovdqa32 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 {%k1} {z}
; X64-NEXT: vmovdqa32 {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X64-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 {%k1} {z}
; X64-NEXT: vmovdqa32 {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X64-NEXT: vpermi2d %zmm2, %zmm2, %zmm0 {%k1} {z}
; X64-NEXT: retq
%res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x i32> %x0, <16 x i32> %x1, i16 %m)
%res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x i32> %res0, <16 x i32> %res0, i16 %m)
@ -556,18 +556,18 @@ define <32 x i16> @combine_vpermt2var_32i16_identity_mask(<32 x i16> %x0, <32 x
; X32: # BB#0:
; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; X32-NEXT: vmovdqu16 {{.*#+}} zmm2 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X32-NEXT: vpermt2w %zmm1, %zmm2, %zmm0 {%k1} {z}
; X32-NEXT: vmovdqu16 {{.*#+}} zmm1 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
; X32-NEXT: vpermt2w %zmm0, %zmm1, %zmm0 {%k1} {z}
; X32-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 {%k1} {z}
; X32-NEXT: vmovdqu16 {{.*#+}} zmm0 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
; X32-NEXT: vpermi2w %zmm2, %zmm2, %zmm0 {%k1} {z}
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_32i16_identity_mask:
; X64: # BB#0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vmovdqu16 {{.*#+}} zmm2 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-NEXT: vpermt2w %zmm1, %zmm2, %zmm0 {%k1} {z}
; X64-NEXT: vmovdqu16 {{.*#+}} zmm1 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
; X64-NEXT: vpermt2w %zmm0, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 {%k1} {z}
; X64-NEXT: vmovdqu16 {{.*#+}} zmm0 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
; X64-NEXT: vpermi2w %zmm2, %zmm2, %zmm0 {%k1} {z}
; X64-NEXT: retq
%res0 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <32 x i16> %x0, <32 x i16> %x1, i32 %m)
%res1 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 63, i16 30, i16 61, i16 28, i16 59, i16 26, i16 57, i16 24, i16 55, i16 22, i16 53, i16 20, i16 51, i16 18, i16 49, i16 16, i16 47, i16 46, i16 13, i16 44, i16 11, i16 42, i16 9, i16 40, i16 7, i16 38, i16 5, i16 36, i16 3, i16 34, i16 1, i16 32>, <32 x i16> %res0, <32 x i16> %res0, i32 %m)
@ -938,9 +938,9 @@ define <8 x i64> @combine_vpermt2var_8i64_as_vpermq(<8 x i64> %x0, <8 x i64> %x1
; X32-LABEL: combine_vpermt2var_8i64_as_vpermq:
; X32: # BB#0:
; X32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0]
; X32-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; X32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,0,5,0,14,0,7,0,8,0,1,0,10,0,3,0]
; X32-NEXT: vpermt2q %zmm0, %zmm1, %zmm0
; X32-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; X32-NEXT: vmovdqa64 {{.*#+}} zmm0 = [12,0,5,0,14,0,7,0,8,0,1,0,10,0,3,0]
; X32-NEXT: vpermi2q %zmm2, %zmm2, %zmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_8i64_as_vpermq:
@ -1008,15 +1008,15 @@ define <8 x double> @combine_vpermi2var_vpermt2var_8f64_as_vperm2(<8 x double> %
; X32-LABEL: combine_vpermi2var_vpermt2var_8f64_as_vperm2:
; X32: # BB#0:
; X32-NEXT: vmovapd {{.*#+}} zmm2 = [4,0,14,0,3,0,12,0,7,0,8,0,0,0,15,0]
; X32-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
; X32-NEXT: vmovapd %zmm1, %zmm0
; X32-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2
; X32-NEXT: vmovapd %zmm2, %zmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermi2var_vpermt2var_8f64_as_vperm2:
; X64: # BB#0:
; X64-NEXT: vmovapd {{.*#+}} zmm2 = [4,14,3,12,7,8,0,15]
; X64-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
; X64-NEXT: vmovapd %zmm1, %zmm0
; X64-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2
; X64-NEXT: vmovapd %zmm2, %zmm0
; X64-NEXT: retq
%res0 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> <i64 15, i64 0, i64 8, i64 7, i64 12, i64 6, i64 11, i64 4>, <8 x double> %x1, i8 -1)
%res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 12, i64 5, i64 14, i64 7, i64 8, i64 1, i64 10, i64 3>, <8 x double> %res0, <8 x double> %res0, i8 -1)
@ -1044,15 +1044,15 @@ define <32 x i16> @combine_vpermt2var_vpermi2var_32i16_as_permw(<32 x i16> %x0,
; X32-LABEL: combine_vpermt2var_vpermi2var_32i16_as_permw:
; X32: # BB#0:
; X32-NEXT: vmovdqu16 {{.*#+}} zmm2 = [17,39,19,38,21,37,23,36,25,35,27,34,29,33,31,32,1,47,3,46,5,45,7,44,9,43,11,42,13,41,15,40]
; X32-NEXT: vpermt2w %zmm0, %zmm2, %zmm1
; X32-NEXT: vmovdqa64 %zmm1, %zmm0
; X32-NEXT: vpermi2w %zmm0, %zmm1, %zmm2
; X32-NEXT: vmovdqa64 %zmm2, %zmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_vpermi2var_32i16_as_permw:
; X64: # BB#0:
; X64-NEXT: vmovdqu16 {{.*#+}} zmm2 = [17,39,19,38,21,37,23,36,25,35,27,34,29,33,31,32,1,47,3,46,5,45,7,44,9,43,11,42,13,41,15,40]
; X64-NEXT: vpermt2w %zmm0, %zmm2, %zmm1
; X64-NEXT: vmovdqa64 %zmm1, %zmm0
; X64-NEXT: vpermi2w %zmm0, %zmm1, %zmm2
; X64-NEXT: vmovdqa64 %zmm2, %zmm0
; X64-NEXT: retq
%res0 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 0, i16 63, i16 1, i16 61, i16 2, i16 59, i16 3, i16 57, i16 4, i16 55, i16 5, i16 53, i16 6, i16 51, i16 7, i16 49, i16 8, i16 47, i16 9, i16 45, i16 10, i16 43, i16 11, i16 41, i16 12, i16 39, i16 13, i16 37, i16 14, i16 35, i16 15, i16 33>, <32 x i16> %x0, <32 x i16> %x1, i32 -1)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %res0, <32 x i16> <i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0, i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16>, <32 x i16> %res0, i32 -1)


@ -23,18 +23,18 @@ define <16 x i16> @combine_vpermt2var_16i16_identity_mask(<16 x i16> %x0, <16 x
; X32: # BB#0:
; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: vmovdqu16 {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X32-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 {%k1} {z}
; X32-NEXT: vmovdqu16 {{.*#+}} ymm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X32-NEXT: vpermt2w %ymm0, %ymm1, %ymm0 {%k1} {z}
; X32-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 {%k1} {z}
; X32-NEXT: vmovdqu16 {{.*#+}} ymm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X32-NEXT: vpermi2w %ymm2, %ymm2, %ymm0 {%k1} {z}
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16i16_identity_mask:
; X64: # BB#0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovdqu16 {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 {%k1} {z}
; X64-NEXT: vmovdqu16 {{.*#+}} ymm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X64-NEXT: vpermt2w %ymm0, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 {%k1} {z}
; X64-NEXT: vmovdqu16 {{.*#+}} ymm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X64-NEXT: vpermi2w %ymm2, %ymm2, %ymm0 {%k1} {z}
; X64-NEXT: retq
%res0 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> <i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <16 x i16> %x0, <16 x i16> %x1, i16 %m)
%res1 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> <i16 15, i16 30, i16 13, i16 28, i16 11, i16 26, i16 9, i16 24, i16 7, i16 22, i16 5, i16 20, i16 3, i16 18, i16 1, i16 16>, <16 x i16> %res0, <16 x i16> %res0, i16 %m)


@ -38,18 +38,18 @@ define <16 x i8> @combine_vpermt2var_16i8_identity_mask(<16 x i8> %x0, <16 x i8>
; X32: # BB#0:
; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: vmovdqu8 {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X32-NEXT: vpermt2b %xmm1, %xmm2, %xmm0 {%k1} {z}
; X32-NEXT: vmovdqu8 {{.*#+}} xmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X32-NEXT: vpermt2b %xmm0, %xmm1, %xmm0 {%k1} {z}
; X32-NEXT: vpermi2b %xmm1, %xmm0, %xmm2 {%k1} {z}
; X32-NEXT: vmovdqu8 {{.*#+}} xmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X32-NEXT: vpermi2b %xmm2, %xmm2, %xmm0 {%k1} {z}
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16i8_identity_mask:
; X64: # BB#0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovdqu8 {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-NEXT: vpermt2b %xmm1, %xmm2, %xmm0 {%k1} {z}
; X64-NEXT: vmovdqu8 {{.*#+}} xmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X64-NEXT: vpermt2b %xmm0, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT: vpermi2b %xmm1, %xmm0, %xmm2 {%k1} {z}
; X64-NEXT: vmovdqu8 {{.*#+}} xmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X64-NEXT: vpermi2b %xmm2, %xmm2, %xmm0 {%k1} {z}
; X64-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <16 x i8> %x0, <16 x i8> %x1, i16 %m)
%res1 = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> <i8 15, i8 30, i8 13, i8 28, i8 11, i8 26, i8 9, i8 24, i8 7, i8 22, i8 5, i8 20, i8 3, i8 18, i8 1, i8 16>, <16 x i8> %res0, <16 x i8> %res0, i16 %m)
@ -109,8 +109,7 @@ define <16 x i8> @combine_vpermt2var_vpermi2var_16i8_as_vperm2(<16 x i8> %x0, <1
; X32-NEXT: vmovdqu8 {{.*#+}} xmm2 = [0,31,2,29,4,27,6,25,8,23,10,21,12,19,14,17]
; X32-NEXT: vpermi2b %xmm1, %xmm0, %xmm2
; X32-NEXT: vmovdqu8 {{.*#+}} xmm0 = [0,17,2,18,4,19,6,21,8,23,10,25,12,27,14,29]
; X32-NEXT: vpermt2b %xmm2, %xmm0, %xmm2
; X32-NEXT: vmovdqa64 %xmm2, %xmm0
; X32-NEXT: vpermi2b %xmm2, %xmm2, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_vpermi2var_16i8_as_vperm2:
@ -118,8 +117,7 @@ define <16 x i8> @combine_vpermt2var_vpermi2var_16i8_as_vperm2(<16 x i8> %x0, <1
; X64-NEXT: vmovdqu8 {{.*#+}} xmm2 = [0,31,2,29,4,27,6,25,8,23,10,21,12,19,14,17]
; X64-NEXT: vpermi2b %xmm1, %xmm0, %xmm2
; X64-NEXT: vmovdqu8 {{.*#+}} xmm0 = [0,17,2,18,4,19,6,21,8,23,10,25,12,27,14,29]
; X64-NEXT: vpermt2b %xmm2, %xmm0, %xmm2
; X64-NEXT: vmovdqa64 %xmm2, %xmm0
; X64-NEXT: vpermi2b %xmm2, %xmm2, %xmm0
; X64-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> <i8 0, i8 31, i8 2, i8 29, i8 4, i8 27, i8 6, i8 25, i8 8, i8 23, i8 10, i8 21, i8 12, i8 19, i8 14, i8 17>, <16 x i8> %x1, i16 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> <i8 0, i8 17, i8 2, i8 18, i8 4, i8 19, i8 6, i8 21, i8 8, i8 23, i8 10, i8 25, i8 12, i8 27, i8 14, i8 29>, <16 x i8> %res0, <16 x i8> %res0, i16 -1)
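The 16i8 hunks just above also show the payoff when both table inputs are the same value: after commuting, the register holding the constant index vector doubles as the destination and the trailing vmovdqa64 copy vanishes. A hedged sketch of that shape (hypothetical function name; indices reused from the test above):

; Hedged sketch: both tables are %v, so the commuted vpermi2b can write
; straight into the register holding the index constant, with no copy.
define <16 x i8> @sketch_self_permute(<16 x i8> %v) {
  %r = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> <i8 0, i8 17, i8 2, i8 18, i8 4, i8 19, i8 6, i8 21, i8 8, i8 23, i8 10, i8 25, i8 12, i8 27, i8 14, i8 29>, <16 x i8> %v, <16 x i8> %v, i16 -1)
  ret <16 x i8> %r
}
declare <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)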


@ -105,8 +105,8 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <1
; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} {z}
; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} {z}
; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm2
; AVX512F-NEXT: vpslld $31, %zmm2, %zmm1
; AVX512F-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; AVX512F-NEXT: vpslld $31, %zmm3, %zmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
@ -119,8 +119,8 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <1
; VL_BW_DQ-NEXT: vpmovm2d %k1, %zmm0
; VL_BW_DQ-NEXT: vpmovm2d %k0, %zmm1
; VL_BW_DQ-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm1
; VL_BW_DQ-NEXT: vpmovd2m %zmm1, %k0
; VL_BW_DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
; VL_BW_DQ-NEXT: vpmovd2m %zmm2, %k0
; VL_BW_DQ-NEXT: vpmovm2b %k0, %xmm0
; VL_BW_DQ-NEXT: retq
%a2 = icmp eq <16 x i32> %a, %a1
@ -189,8 +189,8 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: retq
@ -201,8 +201,8 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
; VL_BW_DQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
; VL_BW_DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
; VL_BW_DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
; VL_BW_DQ-NEXT: retq
%b = bitcast i8 %a to <8 x i1>
@ -245,8 +245,8 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: retq
@ -257,8 +257,8 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
; VL_BW_DQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
; VL_BW_DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
; VL_BW_DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
; VL_BW_DQ-NEXT: retq
%b = bitcast i8 %a to <8 x i1>
@ -307,8 +307,8 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} {z}
; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1]
; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm0
; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: retq
@ -321,8 +321,8 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
; VL_BW_DQ-NEXT: vpmovm2q %k1, %zmm0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm1
; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1]
; VL_BW_DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
; VL_BW_DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
; VL_BW_DQ-NEXT: retq
%b = bitcast i8 %a to <8 x i1>
@ -340,8 +340,8 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,1,2,3,4,5,6,7]
; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: retq