Avoid false dependencies of undef machine operands

This patch helps avoid false dependencies on undef registers by updating a machine instruction's undef operand to use either a register that the instruction truly depends on, or a register with clearance higher than Pref.

Pseudo example:

loop:
xmm0 = ...
xmm1 = vcvtsi2sdl eax, xmm0<undef>
... = inst xmm0
jmp loop

In this example, selecting xmm0 as the undef register creates a false dependency between loop iterations.
This false dependency cannot be broken by inserting an xor before the vcvtsi2sdl, because xmm0 is live at that point.
Selecting a different register instead of xmm0, preferably one that is not used in the loop, eliminates the problem.
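
As an illustration only, here is a minimal self-contained C++ sketch of that heuristic; the names (pickUndefReg, Instr, lastDef) are invented for this sketch and are not the pass's real data structures. The idea: first try to hide the false dependency behind a true register dependency of the same instruction; otherwise pick the register with the most clearance, stopping as soon as the clearance exceeds the preferred threshold.

#include <cstdio>
#include <vector>

struct Instr {
  std::vector<int> useRegs; // registers the instruction truly reads
  int undefReg;             // register currently named by the undef operand
};

// lastDef[r] = index of the instruction that last wrote register r.
// curInstr   = index of the instruction whose undef operand is being fixed.
// pref       = desired clearance (cf. the undef-reg-clearance option).
int pickUndefReg(const Instr &mi, const std::vector<int> &lastDef,
                 int curInstr, int pref) {
  // 1) Prefer a register the instruction truly reads: the false dependency
  //    is then hidden behind a true dependency.
  if (!mi.useRegs.empty())
    return mi.useRegs.front();

  // 2) Otherwise take the register with the highest clearance, stopping early
  //    once the clearance already exceeds the preference threshold.
  int bestReg = mi.undefReg;
  int bestClearance = 0;
  for (int r = 0; r < (int)lastDef.size(); ++r) {
    int clearance = curInstr - lastDef[r];
    if (clearance <= bestClearance)
      continue;
    bestClearance = clearance;
    bestReg = r;
    if (bestClearance > pref)
      break;
  }
  return bestReg;
}

int main() {
  // Register 0 (think xmm0) was written one instruction ago inside the loop;
  // register 4 has not been written for a long time, so it wins.
  std::vector<int> lastDef = {99, 80, 75, 60, -100};
  Instr cvt{{}, /*undefReg=*/0}; // no true register dependencies
  std::printf("picked r%d\n",
              pickUndefReg(cvt, lastDef, /*curInstr=*/100, /*pref=*/64));
  return 0;
}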

Differential Revision: https://reviews.llvm.org/D22466



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@278321 91177308-0d34-0410-b5e6-96231b3b80d8
Marina Yatsina 2016-08-11 07:32:08 +00:00
parent 9c5342337a
commit ac9ca3bbe7
8 changed files with 320 additions and 243 deletions


@ -203,6 +203,8 @@ private:
void processDefs(MachineInstr*, bool Kill);
void visitSoftInstr(MachineInstr*, unsigned mask);
void visitHardInstr(MachineInstr*, unsigned domain);
void pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
unsigned Pref);
bool shouldBreakDependence(MachineInstr*, unsigned OpIdx, unsigned Pref);
void processUndefReads(MachineBasicBlock*);
};
@ -473,6 +475,56 @@ void ExeDepsFix::visitInstr(MachineInstr *MI) {
processDefs(MI, !DomP.first);
}
/// \brief Helps avoid false dependencies on undef registers by updating the
/// machine instructions' undef operand to use a register that the instruction
/// is truly dependent on, or use a register with clearance higher than Pref.
void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
unsigned Pref) {
MachineOperand &MO = MI->getOperand(OpIdx);
assert(MO.isUndef() && "Expected undef machine operand");
unsigned OriginalReg = MO.getReg();
// Update only undef operands that are mapped to one register.
if (AliasMap[OriginalReg].size() != 1)
return;
// Get the undef operand's register class
const TargetRegisterClass *OpRC =
TII->getRegClass(MI->getDesc(), OpIdx, TRI, *MF);
// If the instruction has a true dependency, we can hide the false dependency
// behind it.
for (MachineOperand &CurrMO : MI->operands()) {
if (!CurrMO.isReg() || CurrMO.isDef() || CurrMO.isUndef() ||
!OpRC->contains(CurrMO.getReg()))
continue;
// We found a true dependency - replace the undef register with the true
// dependency.
MO.setReg(CurrMO.getReg());
return;
}
// Go over all registers in the register class and find the register with
// max clearance or clearance higher than Pref.
unsigned MaxClearance = 0;
unsigned MaxClearanceReg = OriginalReg;
for (unsigned rx = 0; rx < OpRC->getNumRegs(); ++rx) {
unsigned Clearance = CurInstr - LiveRegs[rx].Def;
if (Clearance <= MaxClearance)
continue;
MaxClearance = Clearance;
MaxClearanceReg = OpRC->getRegister(rx);
if (MaxClearance > Pref)
break;
}
// Update the operand if we found a register with better clearance.
if (MaxClearanceReg != OriginalReg)
MO.setReg(MaxClearanceReg);
}
/// \brief Return true if it makes sense to break dependence on a partial def
/// or undef use.
bool ExeDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx,
@ -510,6 +562,7 @@ void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) {
unsigned OpNum;
unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI);
if (Pref) {
pickBestRegisterForUndef(MI, OpNum, Pref);
if (shouldBreakDependence(MI, OpNum, Pref))
UndefReads.push_back(std::make_pair(MI, OpNum));
}


@ -68,7 +68,7 @@ static cl::opt<unsigned>
UndefRegClearance("undef-reg-clearance",
cl::desc("How many idle instructions we would like before "
"certain undef register reads"),
cl::init(64), cl::Hidden);
cl::init(128), cl::Hidden);
enum {
// Select which memory operand is being unfolded.
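
The preferred clearance before such an undef register read doubles here from 64 to 128 idle instructions; since this is a cl::opt, the value can still be overridden through the hidden undef-reg-clearance option defined above. As a rough model only (undefReadIsRisky is a made-up helper, not the pass's real API), the threshold simply compares the distance from the register's last write against the preference:

#include <cstdio>

// Sketch: clearance = instructions executed since the register was last
// written; a read of an undef operand only stops being treated as risky once
// the clearance reaches the preferred value.
static bool undefReadIsRisky(int curInstr, int lastDef, int pref) {
  return (curInstr - lastDef) < pref;
}

int main() {
  // 100 idle instructions was enough under the old default (64) but is not
  // under the new one (128).
  std::printf("old: %d  new: %d\n", undefReadIsRisky(200, 100, 64),
              undefReadIsRisky(200, 100, 128));
  return 0;
}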


@ -16,28 +16,27 @@ define <8 x double> @sltof864(<8 x i64> %a) {
; KNL: ## BB#0:
; KNL-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; KNL-NEXT: vpextrq $1, %xmm1, %rax
; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
; KNL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
; KNL-NEXT: vmovq %xmm1, %rax
; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
; KNL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; KNL-NEXT: vextracti32x4 $2, %zmm0, %xmm2
; KNL-NEXT: vpextrq $1, %xmm2, %rax
; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm3
; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3
; KNL-NEXT: vmovq %xmm2, %rax
; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2
; KNL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; KNL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; KNL-NEXT: vextracti32x4 $1, %zmm0, %xmm2
; KNL-NEXT: vpextrq $1, %xmm2, %rax
; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm3
; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3
; KNL-NEXT: vmovq %xmm2, %rax
; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2
; KNL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; KNL-NEXT: vpextrq $1, %xmm0, %rax
; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm3
; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3
; KNL-NEXT: vmovq %xmm0, %rax
; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0
; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm0
; KNL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; KNL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
@ -56,15 +55,14 @@ define <4 x double> @sltof464(<4 x i64> %a) {
; KNL: ## BB#0:
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
; KNL-NEXT: vpextrq $1, %xmm1, %rax
; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
; KNL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
; KNL-NEXT: vmovq %xmm1, %rax
; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
; KNL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; KNL-NEXT: vpextrq $1, %xmm0, %rax
; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
; KNL-NEXT: vmovq %xmm0, %rax
; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0
; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
; KNL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; KNL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; KNL-NEXT: retq
@ -81,12 +79,11 @@ define <2 x float> @sltof2f32(<2 x i64> %a) {
; KNL-LABEL: sltof2f32:
; KNL: ## BB#0:
; KNL-NEXT: vpextrq $1, %xmm0, %rax
; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; KNL-NEXT: vmovq %xmm0, %rax
; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0
; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; KNL-NEXT: retq
@ -105,17 +102,16 @@ define <4 x float> @sltof4f32_mem(<4 x i64>* %a) {
; KNL: ## BB#0:
; KNL-NEXT: vmovdqu (%rdi), %ymm0
; KNL-NEXT: vpextrq $1, %xmm0, %rax
; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; KNL-NEXT: vmovq %xmm0, %rax
; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vmovq %xmm0, %rax
; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; KNL-NEXT: vpextrq $1, %xmm0, %rax
; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0
; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; KNL-NEXT: retq
;
@ -186,17 +182,16 @@ define <4 x float> @sltof432(<4 x i64> %a) {
; KNL-LABEL: sltof432:
; KNL: ## BB#0:
; KNL-NEXT: vpextrq $1, %xmm0, %rax
; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; KNL-NEXT: vmovq %xmm0, %rax
; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vmovq %xmm0, %rax
; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; KNL-NEXT: vpextrq $1, %xmm0, %rax
; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0
; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; KNL-NEXT: retq
;
@ -884,12 +879,11 @@ define <2 x float> @sitofp_2i1_float(<2 x float> %a) {
; KNL-NEXT: movl $-1, %eax
; KNL-NEXT: movl $0, %edx
; KNL-NEXT: cmovnel %eax, %edx
; KNL-NEXT: vcvtsi2ssl %edx, %xmm0, %xmm1
; KNL-NEXT: vcvtsi2ssl %edx, %xmm2, %xmm1
; KNL-NEXT: vmovq %xmm0, %rdx
; KNL-NEXT: testb $1, %dl
; KNL-NEXT: cmovnel %eax, %ecx
; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0
; KNL-NEXT: vcvtsi2ssl %ecx, %xmm0, %xmm0
; KNL-NEXT: vcvtsi2ssl %ecx, %xmm2, %xmm0
; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; KNL-NEXT: retq
;
@ -1091,11 +1085,10 @@ define <2 x float> @uitofp_2i1_float(<2 x i32> %a) {
; KNL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; KNL-NEXT: vpextrq $1, %xmm0, %rax
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: vcvtsi2ssl %eax, %xmm0, %xmm1
; KNL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm1
; KNL-NEXT: vmovq %xmm0, %rax
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0
; KNL-NEXT: vcvtsi2ssl %eax, %xmm0, %xmm0
; KNL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0
; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; KNL-NEXT: retq
;


@ -126,6 +126,7 @@ loop:
%i = phi i64 [ 1, %entry ], [ %inc, %loop ]
%s1 = phi i64 [ %vx, %entry ], [ %s2, %loop ]
%fi = sitofp i64 %i to double
tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"()
%vy = load double, double* %y
%fipy = fadd double %fi, %vy
%iipy = fptosi double %fipy to i64
@ -174,6 +175,7 @@ for.body3:
store double %mul11, double* %arrayidx13, align 8
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1024
tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"()
br i1 %exitcond, label %for.inc14, label %for.body3
for.inc14: ; preds = %for.body3
@ -193,7 +195,7 @@ for.end16: ; preds = %for.inc14
;SSE-NEXT: movsd [[XMM0]],
;AVX-LABEL:@loopdep3
;AVX: vxorps [[XMM0:%xmm[0-9]+]], [[XMM0]]
;AVX-NEXT: vcvtsi2sdl {{.*}}, [[XMM0]], [[XMM0]]
;AVX-NEXT: vcvtsi2sdl {{.*}}, [[XMM0]], {{%xmm[0-9]+}}
;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]]
;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]]
;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]]
@ -202,10 +204,76 @@ for.end16: ; preds = %for.inc14
define double @inlineasmdep(i64 %arg) {
top:
tail call void asm sideeffect "", "~{xmm0},~{dirflag},~{fpsr},~{flags}"()
tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"()
tail call void asm sideeffect "", "~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{dirflag},~{fpsr},~{flags}"()
tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"()
tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"()
%tmp1 = sitofp i64 %arg to double
ret double %tmp1
;AVX-LABEL:@inlineasmdep
;AVX: vxorps [[XMM0:%xmm[0-9]+]], [[XMM0]], [[XMM0]]
;AVX-NEXT: vcvtsi2sdq {{.*}}, [[XMM0]], {{%xmm[0-9]+}}
}
; Make sure we are making a smart choice regarding undef registers and
; hiding the false dependency behind a true dependency
define double @truedeps(float %arg) {
top:
tail call void asm sideeffect "", "~{xmm6},~{dirflag},~{fpsr},~{flags}"()
tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"()
tail call void asm sideeffect "", "~{xmm4},~{xmm5},~{xmm7},~{dirflag},~{fpsr},~{flags}"()
tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"()
tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"()
%tmp1 = fpext float %arg to double
ret double %tmp1
;AVX-LABEL:@truedeps
;AVX-NOT: vxorps
;AVX: vcvtss2sd [[XMM0:%xmm[0-9]+]], [[XMM0]], {{%xmm[0-9]+}}
}
; Make sure we are making a smart choice regarding undef registers and
; choosing the register with the highest clearence
define double @clearence(i64 %arg) {
top:
tail call void asm sideeffect "", "~{xmm6},~{dirflag},~{fpsr},~{flags}"()
tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"()
tail call void asm sideeffect "", "~{xmm4},~{xmm5},~{xmm7},~{dirflag},~{fpsr},~{flags}"()
tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"()
tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"()
%tmp1 = sitofp i64 %arg to double
ret double %tmp1
;AVX-LABEL:@clearence
;AVX: vxorps [[XMM6:%xmm6]], [[XMM6]], [[XMM6]]
;AVX-NEXT: vcvtsi2sdq {{.*}}, [[XMM6]], {{%xmm[0-9]+}}
}
; Make sure we are making a smart choice regarding undef registers in order to
; avoid a cyclic dependence on a write to the same register in a previous
; iteration, especially when we cannot zero out the undef register because it
; is alive.
define i64 @loopclearence(i64* nocapture %x, double* nocapture %y) nounwind {
entry:
%vx = load i64, i64* %x
br label %loop
loop:
%i = phi i64 [ 1, %entry ], [ %inc, %loop ]
%s1 = phi i64 [ %vx, %entry ], [ %s2, %loop ]
%fi = sitofp i64 %i to double
tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"()
tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"()
tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"()
%vy = load double, double* %y
%fipy = fadd double %fi, %vy
%iipy = fptosi double %fipy to i64
%s2 = add i64 %s1, %iipy
%inc = add nsw i64 %i, 1
%exitcond = icmp eq i64 %inc, 156250000
br i1 %exitcond, label %ret, label %loop
ret:
ret i64 %s2
;AVX-LABEL:@loopclearence
;Registers 4-7 are not used and therefore one of them should be chosen
;AVX-NOT: {{%xmm[4-7]}}
;AVX: vcvtsi2sdq {{.*}}, [[XMM4_7:%xmm[4-7]]], {{%xmm[0-9]+}}
;AVX-NOT: [[XMM4_7]]
}


@ -26,7 +26,7 @@ target triple = "x86_64-pc-win32-elf"
; Copy the result in a temporary.
; Note: Technically the regalloc could have been smarter and this move not required,
; which would have hidden the bug.
; CHECK-NEXT: vmovapd %xmm0, [[TMP:%xmm[0-9]+]]
; CHECK: vmovapd %xmm0, [[TMP:%xmm[0-9]+]]
; Crush xmm0.
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK: movl $339772768, %e[[INDIRECT_CALL2:[a-z]+]]
@ -37,6 +37,7 @@ target triple = "x86_64-pc-win32-elf"
define double @foo(i64 %arg) {
top:
%tmp = call double inttoptr (i64 339752784 to double (double, double)*)(double 1.000000e+00, double 0.000000e+00)
tail call void asm sideeffect "", "x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"(double %tmp)
%tmp1 = sitofp i64 %arg to double
call void inttoptr (i64 339772768 to void (double, double)*)(double %tmp, double %tmp1)
%tmp3 = fadd double %tmp1, %tmp


@ -299,7 +299,7 @@ define half @test_f80trunc_nodagcombine() #0 {
; CHECK-F16C-NEXT: movswl (%rsi), %eax
; CHECK-F16C-NEXT: vmovd %eax, %xmm0
; CHECK-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; CHECK-F16C-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm1
; CHECK-F16C-NEXT: vcvtsi2ssl %edi, %xmm1, %xmm1
; CHECK-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; CHECK-F16C-NEXT: vcvtph2ps %xmm1, %xmm1
; CHECK-F16C-NEXT: vaddss %xmm1, %xmm0, %xmm0


@ -39,16 +39,15 @@ define void @signum64a(<2 x double>*) {
; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vcmpltpd %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpextrq $1, %xmm2, %rax
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm3
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3
; AVX1-NEXT: vmovq %xmm2, %rax
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX1-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm1
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm0
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vsubpd %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vmovapd %xmm0, (%rdi)
@ -60,16 +59,15 @@ define void @signum64a(<2 x double>*) {
; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vcmpltpd %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vpextrq $1, %xmm2, %rax
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm3
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3
; AVX2-NEXT: vmovq %xmm2, %rax
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2
; AVX2-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX2-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm1
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm0
; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vsubpd %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vmovapd %xmm0, (%rdi)


@ -28,10 +28,9 @@ define <2 x double> @sitofp_2i64_to_2f64(<2 x i64> %a) {
; AVX-LABEL: sitofp_2i64_to_2f64:
; AVX: # BB#0:
; AVX-NEXT: vpextrq $1, %xmm0, %rax
; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
; AVX-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
; AVX-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0
; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
%cvt = sitofp <2 x i64> %a to <2 x double>
@ -209,15 +208,14 @@ define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) {
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpextrq $1, %xmm1, %rax
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
; AVX1-NEXT: vmovq %xmm1, %rax
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
@ -226,15 +224,14 @@ define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) {
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpextrq $1, %xmm1, %rax
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
; AVX2-NEXT: vmovq %xmm1, %rax
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@ -243,15 +240,14 @@ define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) {
; AVX512: # BB#0:
; AVX512-NEXT: vextracti32x4 $1, %ymm0, %xmm1
; AVX512-NEXT: vpextrq $1, %xmm1, %rax
; AVX512-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
; AVX512-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
; AVX512-NEXT: vmovq %xmm1, %rax
; AVX512-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
; AVX512-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512-NEXT: vpextrq $1, %xmm0, %rax
; AVX512-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
; AVX512-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
; AVX512-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX512-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
@ -941,12 +937,11 @@ define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) {
; AVX-LABEL: sitofp_2i64_to_4f32:
; AVX: # BB#0:
; AVX-NEXT: vpextrq $1, %xmm0, %rax
; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-NEXT: retq
@ -974,12 +969,11 @@ define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; AVX-LABEL: sitofp_4i64_to_4f32_undef:
; AVX: # BB#0:
; AVX-NEXT: vpextrq $1, %xmm0, %rax
; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-NEXT: retq
@ -1140,17 +1134,16 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) {
; AVX1-LABEL: sitofp_4i64_to_4f32:
; AVX1: # BB#0:
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@ -1158,17 +1151,16 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) {
; AVX2-LABEL: sitofp_4i64_to_4f32:
; AVX2: # BB#0:
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@ -1176,17 +1168,16 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) {
; AVX512-LABEL: sitofp_4i64_to_4f32:
; AVX512: # BB#0:
; AVX512-NEXT: vpextrq $1, %xmm0, %rax
; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX512-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX512-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX512-NEXT: vextracti32x4 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX512-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX512-NEXT: vpextrq $1, %xmm0, %rax
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX512-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512-NEXT: retq
%cvt = sitofp <4 x i64> %a to <4 x float>
@ -1377,12 +1368,12 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
; VEX-NEXT: testq %rax, %rax
; VEX-NEXT: js .LBB38_1
; VEX-NEXT: # BB#2:
; VEX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; VEX-NEXT: jmp .LBB38_3
; VEX-NEXT: .LBB38_1:
; VEX-NEXT: shrq %rax
; VEX-NEXT: orq %rax, %rcx
; VEX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
; VEX-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1
; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1
; VEX-NEXT: .LBB38_3:
; VEX-NEXT: vmovq %xmm0, %rax
@ -1391,14 +1382,12 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
; VEX-NEXT: testq %rax, %rax
; VEX-NEXT: js .LBB38_4
; VEX-NEXT: # BB#5:
; VEX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; VEX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; VEX-NEXT: jmp .LBB38_6
; VEX-NEXT: .LBB38_4:
; VEX-NEXT: shrq %rax
; VEX-NEXT: orq %rax, %rcx
; VEX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; VEX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
; VEX-NEXT: vcvtsi2ssq %rcx, %xmm2, %xmm0
; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0
; VEX-NEXT: .LBB38_6:
; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
@ -1406,7 +1395,7 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
; VEX-NEXT: testq %rax, %rax
; VEX-NEXT: js .LBB38_8
; VEX-NEXT: # BB#7:
; VEX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
; VEX-NEXT: .LBB38_8:
; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
@ -1485,12 +1474,12 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; VEX-NEXT: testq %rax, %rax
; VEX-NEXT: js .LBB39_1
; VEX-NEXT: # BB#2:
; VEX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; VEX-NEXT: jmp .LBB39_3
; VEX-NEXT: .LBB39_1:
; VEX-NEXT: shrq %rax
; VEX-NEXT: orq %rax, %rcx
; VEX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
; VEX-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1
; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1
; VEX-NEXT: .LBB39_3:
; VEX-NEXT: vmovq %xmm0, %rax
@ -1499,14 +1488,12 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; VEX-NEXT: testq %rax, %rax
; VEX-NEXT: js .LBB39_4
; VEX-NEXT: # BB#5:
; VEX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; VEX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; VEX-NEXT: jmp .LBB39_6
; VEX-NEXT: .LBB39_4:
; VEX-NEXT: shrq %rax
; VEX-NEXT: orq %rax, %rcx
; VEX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; VEX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
; VEX-NEXT: vcvtsi2ssq %rcx, %xmm2, %xmm0
; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0
; VEX-NEXT: .LBB39_6:
; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
@ -1514,7 +1501,7 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; VEX-NEXT: testq %rax, %rax
; VEX-NEXT: js .LBB39_8
; VEX-NEXT: # BB#7:
; VEX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
; VEX-NEXT: .LBB39_8:
; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
@ -1782,12 +1769,12 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB45_1
; AVX1-NEXT: # BB#2:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX1-NEXT: jmp .LBB45_3
; AVX1-NEXT: .LBB45_1:
; AVX1-NEXT: shrq %rax
; AVX1-NEXT: orq %rax, %rcx
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1
; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .LBB45_3:
; AVX1-NEXT: vmovq %xmm0, %rax
@ -1796,12 +1783,12 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB45_4
; AVX1-NEXT: # BB#5:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX1-NEXT: jmp .LBB45_6
; AVX1-NEXT: .LBB45_4:
; AVX1-NEXT: shrq %rax
; AVX1-NEXT: orq %rax, %rcx
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm2, %xmm2
; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX1-NEXT: .LBB45_6:
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
@ -1812,12 +1799,12 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB45_7
; AVX1-NEXT: # BB#8:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX1-NEXT: jmp .LBB45_9
; AVX1-NEXT: .LBB45_7:
; AVX1-NEXT: shrq %rax
; AVX1-NEXT: orq %rax, %rcx
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm2
; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX1-NEXT: .LBB45_9:
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
@ -1827,16 +1814,14 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB45_10
; AVX1-NEXT: # BB#11:
; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
; AVX1-NEXT: .LBB45_10:
; AVX1-NEXT: shrq %rax
; AVX1-NEXT: orq %rax, %rcx
; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm0
; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: vzeroupper
@ -1850,12 +1835,12 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB45_1
; AVX2-NEXT: # BB#2:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX2-NEXT: jmp .LBB45_3
; AVX2-NEXT: .LBB45_1:
; AVX2-NEXT: shrq %rax
; AVX2-NEXT: orq %rax, %rcx
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1
; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1
; AVX2-NEXT: .LBB45_3:
; AVX2-NEXT: vmovq %xmm0, %rax
@ -1864,12 +1849,12 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB45_4
; AVX2-NEXT: # BB#5:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX2-NEXT: jmp .LBB45_6
; AVX2-NEXT: .LBB45_4:
; AVX2-NEXT: shrq %rax
; AVX2-NEXT: orq %rax, %rcx
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm2, %xmm2
; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX2-NEXT: .LBB45_6:
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
@ -1880,12 +1865,12 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB45_7
; AVX2-NEXT: # BB#8:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX2-NEXT: jmp .LBB45_9
; AVX2-NEXT: .LBB45_7:
; AVX2-NEXT: shrq %rax
; AVX2-NEXT: orq %rax, %rcx
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm2
; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX2-NEXT: .LBB45_9:
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
@ -1895,16 +1880,14 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB45_10
; AVX2-NEXT: # BB#11:
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX2-NEXT: .LBB45_10:
; AVX2-NEXT: shrq %rax
; AVX2-NEXT: orq %rax, %rcx
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm0
; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: vzeroupper
@ -2118,10 +2101,9 @@ define <2 x double> @sitofp_load_2i64_to_2f64(<2 x i64> *%a) {
; VEX: # BB#0:
; VEX-NEXT: vmovdqa (%rdi), %xmm0
; VEX-NEXT: vpextrq $1, %xmm0, %rax
; VEX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
; VEX-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1
; VEX-NEXT: vmovq %xmm0, %rax
; VEX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; VEX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
; VEX-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0
; VEX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; VEX-NEXT: retq
;
@ -2129,10 +2111,9 @@ define <2 x double> @sitofp_load_2i64_to_2f64(<2 x i64> *%a) {
; AVX512: # BB#0:
; AVX512-NEXT: vmovdqa64 (%rdi), %xmm0
; AVX512-NEXT: vpextrq $1, %xmm0, %rax
; AVX512-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
; AVX512-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
; AVX512-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0
; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: retq
%ld = load <2 x i64>, <2 x i64> *%a
@ -2231,15 +2212,14 @@ define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) {
; AVX1-NEXT: vmovaps (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpextrq $1, %xmm1, %rax
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
; AVX1-NEXT: vmovq %xmm1, %rax
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
@ -2249,15 +2229,14 @@ define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) {
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpextrq $1, %xmm1, %rax
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
; AVX2-NEXT: vmovq %xmm1, %rax
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@ -2267,15 +2246,14 @@ define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) {
; AVX512-NEXT: vmovdqa64 (%rdi), %ymm0
; AVX512-NEXT: vextracti32x4 $1, %ymm0, %xmm1
; AVX512-NEXT: vpextrq $1, %xmm1, %rax
; AVX512-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
; AVX512-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
; AVX512-NEXT: vmovq %xmm1, %rax
; AVX512-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
; AVX512-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512-NEXT: vpextrq $1, %xmm0, %rax
; AVX512-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
; AVX512-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
; AVX512-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX512-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
@ -2756,17 +2734,16 @@ define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@ -2775,17 +2752,16 @@ define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@ -2794,17 +2770,16 @@ define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; AVX512: # BB#0:
; AVX512-NEXT: vmovdqa64 (%rdi), %ymm0
; AVX512-NEXT: vpextrq $1, %xmm0, %rax
; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX512-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX512-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX512-NEXT: vextracti32x4 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX512-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX512-NEXT: vpextrq $1, %xmm0, %rax
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX512-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512-NEXT: retq
%ld = load <4 x i64>, <4 x i64> *%a
@ -2912,29 +2887,28 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX1-NEXT: vpextrq $1, %xmm1, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX1-NEXT: vmovq %xmm1, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vmovq %xmm1, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
; AVX1-NEXT: vpextrq $1, %xmm1, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm1
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
@ -2944,29 +2918,28 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT: vpextrq $1, %xmm1, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX2-NEXT: vmovq %xmm1, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT: vmovq %xmm1, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
; AVX2-NEXT: vpextrq $1, %xmm1, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm1
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@ -2976,29 +2949,28 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm1
; AVX512-NEXT: vpextrq $1, %xmm1, %rax
; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX512-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX512-NEXT: vmovq %xmm1, %rax
; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX512-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1
; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm2
; AVX512-NEXT: vmovq %xmm2, %rax
; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
; AVX512-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
; AVX512-NEXT: vpextrq $1, %xmm2, %rax
; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX512-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2
; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
; AVX512-NEXT: vpextrq $1, %xmm0, %rax
; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX512-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
; AVX512-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512-NEXT: vextracti32x4 $1, %zmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
; AVX512-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
; AVX512-NEXT: vpextrq $1, %xmm0, %rax
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX512-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0
; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
; AVX512-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
@ -3186,12 +3158,12 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB74_1
; AVX1-NEXT: # BB#2:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX1-NEXT: jmp .LBB74_3
; AVX1-NEXT: .LBB74_1:
; AVX1-NEXT: shrq %rax
; AVX1-NEXT: orq %rax, %rcx
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1
; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .LBB74_3:
; AVX1-NEXT: vmovq %xmm0, %rax
@ -3200,12 +3172,12 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB74_4
; AVX1-NEXT: # BB#5:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX1-NEXT: jmp .LBB74_6
; AVX1-NEXT: .LBB74_4:
; AVX1-NEXT: shrq %rax
; AVX1-NEXT: orq %rax, %rcx
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm2, %xmm2
; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX1-NEXT: .LBB74_6:
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
@ -3216,12 +3188,12 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB74_7
; AVX1-NEXT: # BB#8:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX1-NEXT: jmp .LBB74_9
; AVX1-NEXT: .LBB74_7:
; AVX1-NEXT: shrq %rax
; AVX1-NEXT: orq %rax, %rcx
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm2
; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX1-NEXT: .LBB74_9:
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
@ -3231,16 +3203,14 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB74_10
; AVX1-NEXT: # BB#11:
; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
; AVX1-NEXT: .LBB74_10:
; AVX1-NEXT: shrq %rax
; AVX1-NEXT: orq %rax, %rcx
; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm0
; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: vzeroupper
@ -3255,12 +3225,12 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB74_1
; AVX2-NEXT: # BB#2:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX2-NEXT: jmp .LBB74_3
; AVX2-NEXT: .LBB74_1:
; AVX2-NEXT: shrq %rax
; AVX2-NEXT: orq %rax, %rcx
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1
; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1
; AVX2-NEXT: .LBB74_3:
; AVX2-NEXT: vmovq %xmm0, %rax
@ -3269,12 +3239,12 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB74_4
; AVX2-NEXT: # BB#5:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX2-NEXT: jmp .LBB74_6
; AVX2-NEXT: .LBB74_4:
; AVX2-NEXT: shrq %rax
; AVX2-NEXT: orq %rax, %rcx
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm2, %xmm2
; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX2-NEXT: .LBB74_6:
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
@ -3285,12 +3255,12 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB74_7
; AVX2-NEXT: # BB#8:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX2-NEXT: jmp .LBB74_9
; AVX2-NEXT: .LBB74_7:
; AVX2-NEXT: shrq %rax
; AVX2-NEXT: orq %rax, %rcx
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm2
; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX2-NEXT: .LBB74_9:
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
@ -3300,16 +3270,14 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB74_10
; AVX2-NEXT: # BB#11:
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX2-NEXT: .LBB74_10:
; AVX2-NEXT: shrq %rax
; AVX2-NEXT: orq %rax, %rcx
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm0
; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: vzeroupper
@ -3581,12 +3549,12 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB78_1
; AVX1-NEXT: # BB#2:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX1-NEXT: jmp .LBB78_3
; AVX1-NEXT: .LBB78_1:
; AVX1-NEXT: shrq %rax
; AVX1-NEXT: orq %rax, %rcx
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1
; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .LBB78_3:
; AVX1-NEXT: vmovq %xmm2, %rax
@ -3595,12 +3563,12 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB78_4
; AVX1-NEXT: # BB#5:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
; AVX1-NEXT: jmp .LBB78_6
; AVX1-NEXT: .LBB78_4:
; AVX1-NEXT: shrq %rax
; AVX1-NEXT: orq %rax, %rcx
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm3
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm3
; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3
; AVX1-NEXT: .LBB78_6:
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
@ -3610,12 +3578,12 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB78_7
; AVX1-NEXT: # BB#8:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm4
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4
; AVX1-NEXT: jmp .LBB78_9
; AVX1-NEXT: .LBB78_7:
; AVX1-NEXT: shrq %rax
; AVX1-NEXT: orq %rax, %rcx
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm4
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm4, %xmm4
; AVX1-NEXT: vaddss %xmm4, %xmm4, %xmm4
; AVX1-NEXT: .LBB78_9:
; AVX1-NEXT: vpextrq $1, %xmm2, %rax
@ -3624,12 +3592,12 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB78_10
; AVX1-NEXT: # BB#11:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2
; AVX1-NEXT: jmp .LBB78_12
; AVX1-NEXT: .LBB78_10:
; AVX1-NEXT: shrq %rax
; AVX1-NEXT: orq %rax, %rcx
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm5, %xmm2
; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX1-NEXT: .LBB78_12:
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
@ -3638,12 +3606,12 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB78_13
; AVX1-NEXT: # BB#14:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5
; AVX1-NEXT: jmp .LBB78_15
; AVX1-NEXT: .LBB78_13:
; AVX1-NEXT: shrq %rax
; AVX1-NEXT: orq %rax, %rcx
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm5
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm5, %xmm5
; AVX1-NEXT: vaddss %xmm5, %xmm5, %xmm5
; AVX1-NEXT: .LBB78_15:
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3]
@ -3653,12 +3621,12 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB78_16
; AVX1-NEXT: # BB#17:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3
; AVX1-NEXT: jmp .LBB78_18
; AVX1-NEXT: .LBB78_16:
; AVX1-NEXT: shrq %rax
; AVX1-NEXT: orq %rax, %rcx
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm3
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm6, %xmm3
; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3
; AVX1-NEXT: .LBB78_18:
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3]
@ -3670,14 +3638,12 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB78_19
; AVX1-NEXT: # BB#20:
; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5
; AVX1-NEXT: jmp .LBB78_21
; AVX1-NEXT: .LBB78_19:
; AVX1-NEXT: shrq %rax
; AVX1-NEXT: orq %rax, %rcx
; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm6, %xmm0
; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm5
; AVX1-NEXT: .LBB78_21:
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0]
@ -3688,12 +3654,12 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB78_22
; AVX1-NEXT: # BB#23:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2
; AVX1-NEXT: jmp .LBB78_24
; AVX1-NEXT: .LBB78_22:
; AVX1-NEXT: shrq %rax
; AVX1-NEXT: orq %rax, %rcx
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm6, %xmm2
; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX1-NEXT: .LBB78_24:
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
@ -3710,12 +3676,12 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB78_1
; AVX2-NEXT: # BB#2:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX2-NEXT: jmp .LBB78_3
; AVX2-NEXT: .LBB78_1:
; AVX2-NEXT: shrq %rax
; AVX2-NEXT: orq %rax, %rcx
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1
; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1
; AVX2-NEXT: .LBB78_3:
; AVX2-NEXT: vmovq %xmm2, %rax
@ -3724,12 +3690,12 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB78_4
; AVX2-NEXT: # BB#5:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
; AVX2-NEXT: jmp .LBB78_6
; AVX2-NEXT: .LBB78_4:
; AVX2-NEXT: shrq %rax
; AVX2-NEXT: orq %rax, %rcx
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm3
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm3
; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3
; AVX2-NEXT: .LBB78_6:
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
@ -3739,12 +3705,12 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB78_7
; AVX2-NEXT: # BB#8:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm4
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4
; AVX2-NEXT: jmp .LBB78_9
; AVX2-NEXT: .LBB78_7:
; AVX2-NEXT: shrq %rax
; AVX2-NEXT: orq %rax, %rcx
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm4
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm4, %xmm4
; AVX2-NEXT: vaddss %xmm4, %xmm4, %xmm4
; AVX2-NEXT: .LBB78_9:
; AVX2-NEXT: vpextrq $1, %xmm2, %rax
@ -3753,12 +3719,12 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB78_10
; AVX2-NEXT: # BB#11:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2
; AVX2-NEXT: jmp .LBB78_12
; AVX2-NEXT: .LBB78_10:
; AVX2-NEXT: shrq %rax
; AVX2-NEXT: orq %rax, %rcx
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm5, %xmm2
; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX2-NEXT: .LBB78_12:
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
@ -3767,12 +3733,12 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB78_13
; AVX2-NEXT: # BB#14:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5
; AVX2-NEXT: jmp .LBB78_15
; AVX2-NEXT: .LBB78_13:
; AVX2-NEXT: shrq %rax
; AVX2-NEXT: orq %rax, %rcx
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm5
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm5, %xmm5
; AVX2-NEXT: vaddss %xmm5, %xmm5, %xmm5
; AVX2-NEXT: .LBB78_15:
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3]
@ -3782,12 +3748,12 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB78_16
; AVX2-NEXT: # BB#17:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3
; AVX2-NEXT: jmp .LBB78_18
; AVX2-NEXT: .LBB78_16:
; AVX2-NEXT: shrq %rax
; AVX2-NEXT: orq %rax, %rcx
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm3
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm6, %xmm3
; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3
; AVX2-NEXT: .LBB78_18:
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3]
@ -3799,14 +3765,12 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB78_19
; AVX2-NEXT: # BB#20:
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5
; AVX2-NEXT: jmp .LBB78_21
; AVX2-NEXT: .LBB78_19:
; AVX2-NEXT: shrq %rax
; AVX2-NEXT: orq %rax, %rcx
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm6, %xmm0
; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm5
; AVX2-NEXT: .LBB78_21:
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0]
@ -3817,12 +3781,12 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB78_22
; AVX2-NEXT: # BB#23:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2
; AVX2-NEXT: jmp .LBB78_24
; AVX2-NEXT: .LBB78_22:
; AVX2-NEXT: shrq %rax
; AVX2-NEXT: orq %rax, %rcx
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm6, %xmm2
; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX2-NEXT: .LBB78_24:
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]