[DAGCombiner] Set the right SDLoc on a newly-created zextload (1/N)

Setting the right SDLoc on a newly-created zextload fixes a line table
bug which resulted in non-linear stepping behavior.

Several backend tests contained CHECK lines which relied on the IROrder
inherited from the wrong SDLoc. This patch breaks that dependence where
feasible and regenerates test cases where not.

In some cases, changing a node's IROrder may alter register allocation
and spill behavior. This can affect performance. I have chosen not to
prevent this by applying a "known good" IROrder to SDLocs, as this may
hide a more general bug in the scheduler, or cause regressions on other
test inputs.
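
For reference, here is the combine being touched, with the corrected location and a gloss on why it matters. This is a sketch following the first hunk below, not a complete excerpt of visitZERO_EXTEND; N is the zero_extend node, N0 its load operand, and an SDLoc bundles a DebugLoc with an IROrder:

  // fold (zext (load x)) -> (zextload x)
  // Building the zextload with SDLoc(N) hands it the zero_extend's
  // IROrder and source location; SDLoc(LN0) hands it the load's, i.e.
  // the instruction the new node actually replaces. The debugger then
  // steps to the load's line instead of jumping ahead to the zext's.
  LoadSDNode *LN0 = cast<LoadSDNode>(N0);
  SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN0), VT,
                                   LN0->getChain(), LN0->getBasePtr(),
                                   N0.getValueType(), LN0->getMemOperand());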

rdar://33755881, Part of: llvm.org/PR37262

Differential Revision: https://reviews.llvm.org/D45995

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@331300 91177308-0d34-0410-b5e6-96231b3b80d8
Author: Vedant Kumar
Date: 2018-05-01 19:26:15 +00:00
Parent: 53f34ec97a
Commit: f3a5c86e78
17 changed files with 258 additions and 222 deletions


@@ -8094,7 +8094,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0));
if (DoXform) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT,
SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN0), VT,
LN0->getChain(),
LN0->getBasePtr(), N0.getValueType(),
LN0->getMemOperand());


@@ -24,36 +24,35 @@ define [2 x i64] @test_i64x2_align(i32, [2 x i64] %arg, i32 %after) {
@var64 = global i64 0, align 8
; Check stack slots are 64-bit at all times.
define void @test_stack_slots([8 x i32], i1 %bool, i8 %char, i16 %short,
i32 %int, i64 %long) {
; CHECK-LABEL: test_stack_slots:
; CHECK-DAG: ldr w[[ext1:[0-9]+]], [sp, #24]
; CHECK-DAG: ldrh w[[ext2:[0-9]+]], [sp, #16]
; CHECK-DAG: ldrb w[[ext3:[0-9]+]], [sp, #8]
; CHECK-DAG: ldr x[[ext4:[0-9]+]], [sp, #32]
; CHECK-DAG: ldrb w[[ext5:[0-9]+]], [sp]
; CHECK-DAG: and x[[ext5]], x[[ext5]], #0x1
%ext_bool = zext i1 %bool to i64
store volatile i64 %ext_bool, i64* @var64, align 8
; Part of last store. Blasted scheduler.
; CHECK: ldr [[LONG:x[0-9]+]], [sp, #32]
; CHECK: ldrb w[[EXT:[0-9]+]], [sp]
; CHECK: and x[[EXTED:[0-9]+]], x[[EXT]], #0x1
; CHECK: str x[[EXTED]], [{{x[0-9]+}}, :lo12:var64]
; CHECK: str x[[ext5]], [{{x[0-9]+}}, :lo12:var64]
%ext_char = zext i8 %char to i64
store volatile i64 %ext_char, i64* @var64, align 8
; CHECK: ldrb w[[EXT:[0-9]+]], [sp, #8]
; CHECK: str x[[EXT]], [{{x[0-9]+}}, :lo12:var64]
; CHECK: str x[[ext3]], [{{x[0-9]+}}, :lo12:var64]
%ext_short = zext i16 %short to i64
store volatile i64 %ext_short, i64* @var64, align 8
; CHECK: ldrh w[[EXT:[0-9]+]], [sp, #16]
; CHECK: str x[[EXT]], [{{x[0-9]+}}, :lo12:var64]
; CHECK: str x[[ext2]], [{{x[0-9]+}}, :lo12:var64]
%ext_int = zext i32 %int to i64
store volatile i64 %ext_int, i64* @var64, align 8
; CHECK: ldr{{b?}} w[[EXT:[0-9]+]], [sp, #24]
; CHECK: str x[[EXT]], [{{x[0-9]+}}, :lo12:var64]
; CHECK: str x[[ext1]], [{{x[0-9]+}}, :lo12:var64]
store volatile i64 %long, i64* @var64, align 8
; CHECK: str [[LONG]], [{{x[0-9]+}}, :lo12:var64]
; CHECK: str x[[ext4]], [{{x[0-9]+}}, :lo12:var64]
ret void
}


@@ -67,14 +67,14 @@ define i32 @ldur_int(i32* %a) nounwind {
; Test sext + zext clustering.
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: ldp_half_sext_zext_int:%bb.0
; CHECK: Cluster ld/st SU(3) - SU(4)
; CHECK: SU(3): %{{[0-9]+}}:gpr64 = LDRSWui
; CHECK: SU(4): undef %{{[0-9]+}}.sub_32:gpr64 = LDRWui
; CHECK: Cluster ld/st SU(4) - SU(3)
; CHECK: SU(3): undef %{{[0-9]+}}.sub_32:gpr64 = LDRWui
; CHECK: SU(4): %{{[0-9]+}}:gpr64 = LDRSWui
; EXYNOSM1: ********** MI Scheduling **********
; EXYNOSM1-LABEL: ldp_half_sext_zext_int:%bb.0
; EXYNOSM1: Cluster ld/st SU(3) - SU(4)
; EXYNOSM1: SU(3): %{{[0-9]+}}:gpr64 = LDRSWui
; EXYNOSM1: SU(4): undef %{{[0-9]+}}.sub_32:gpr64 = LDRWui
; EXYNOSM1: Cluster ld/st SU(4) - SU(3)
; EXYNOSM1: SU(3): undef %{{[0-9]+}}.sub_32:gpr64 = LDRWui
; EXYNOSM1: SU(4): %{{[0-9]+}}:gpr64 = LDRSWui
define i64 @ldp_half_sext_zext_int(i64* %q, i32* %p) nounwind {
%tmp0 = load i64, i64* %q, align 4
%tmp = load i32, i32* %p, align 4


@@ -240,9 +240,9 @@ define <4 x i32> @zextload_v8i8tov8i32_fake_update(<4 x i8>** %ptr) {
;CHECK-LABEL: zextload_v8i8tov8i32_fake_update:
;CHECK: ldr r[[PTRREG:[0-9]+]], [r0]
;CHECK: vld1.32 {{{d[0-9]+}}[0]}, [r[[PTRREG]]:32]
;CHECK: add.w r[[INCREG:[0-9]+]], r[[PTRREG]], #16
;CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
;CHECK: vmovl.u16 {{q[0-9]+}}, {{d[0-9]+}}
;CHECK: add.w r[[INCREG:[0-9]+]], r[[PTRREG]], #16
;CHECK: str r[[INCREG]], [r0]
%A = load <4 x i8>*, <4 x i8>** %ptr
%lA = load <4 x i8>, <4 x i8>* %A, align 4


@@ -178,34 +178,34 @@ define void @call_floatarg(float %f1, double %d2, float %f5, double *%p) {
;; endian, since the 64-bit math needs to be split
; CHECK-LABEL: i64arg:
; CHECK: save %sp, -96, %sp
; CHECK-BE: ld [%fp+100], %g2
; CHECK-BE-NEXT: ld [%fp+96], %g3
; CHECK-BE-NEXT: ld [%fp+92], %g4
; CHECK-BE: ld [%fp+104], %g2
; CHECK-BE-NEXT: ld [%fp+100], %g3
; CHECK-BE-NEXT: ld [%fp+96], %g4
; CHECK-BE-NEXT: ld [%fp+92], %l0
; CHECK-BE-NEXT: addcc %i1, %i2, %i1
; CHECK-BE-NEXT: addxcc %i0, 0, %i0
; CHECK-BE-NEXT: addcc %i4, %i1, %i1
; CHECK-BE-NEXT: addxcc %i3, %i0, %i0
; CHECK-BE-NEXT: addcc %g4, %i1, %i1
; CHECK-BE-NEXT: ld [%fp+104], %i2
; CHECK-BE-NEXT: addcc %l0, %i1, %i1
; CHECK-BE-NEXT: addxcc %i5, %i0, %i0
; CHECK-BE-NEXT: addcc %g3, %i1, %i1
; CHECK-BE-NEXT: addxcc %g4, %i0, %i0
; CHECK-BE-NEXT: addcc %g2, %i1, %i1
; CHECK-BE-NEXT: addxcc %g3, %i0, %i0
; CHECK-BE-NEXT: addcc %i2, %i1, %i1
; CHECK-BE-NEXT: addxcc %i0, 0, %i0
;
; CHECK-LE: ld [%fp+96], %g2
; CHECK-LE-NEXT: ld [%fp+100], %g3
; CHECK-LE-NEXT: ld [%fp+92], %g4
; CHECK-LE: ld [%fp+104], %g2
; CHECK-LE-NEXT: ld [%fp+96], %g3
; CHECK-LE-NEXT: ld [%fp+100], %g4
; CHECK-LE-NEXT: ld [%fp+92], %l0
; CHECK-LE-NEXT: addcc %i0, %i2, %i0
; CHECK-LE-NEXT: addxcc %i1, 0, %i1
; CHECK-LE-NEXT: addcc %i3, %i0, %i0
; CHECK-LE-NEXT: addxcc %i4, %i1, %i1
; CHECK-LE-NEXT: addcc %i5, %i0, %i0
; CHECK-LE-NEXT: ld [%fp+104], %i2
; CHECK-LE-NEXT: addxcc %l0, %i1, %i1
; CHECK-LE-NEXT: addcc %g3, %i0, %i0
; CHECK-LE-NEXT: addxcc %g4, %i1, %i1
; CHECK-LE-NEXT: addcc %g2, %i0, %i0
; CHECK-LE-NEXT: addxcc %g3, %i1, %i1
; CHECK-LE-NEXT: addcc %i2, %i0, %i0
; CHECK-LE-NEXT: addxcc %i1, 0, %i1
; CHECK-NEXT: restore


@@ -63,14 +63,14 @@ define void @call_intarg(i32 %i0, i8* %i1) {
; HARD: faddd %f6,
; HARD: fadds %f31, [[F]]
; SOFT: save %sp, -176, %sp
; SOFT: ld [%fp+2299], %i4
; SOFT: ld [%fp+2307], %i5
; SOFT: srl %i0, 0, %o0
; SOFT-NEXT: call __extendsfdf2
; SOFT: mov %o0, %o1
; SOFT: mov %i1, %o0
; SOFT: mov %i2, %o0
; SOFT: mov %i3, %o0
; SOFT: ld [%fp+2299], %o0
; SOFT: ld [%fp+2307], %o1
define double @floatarg(float %a0, ; %f1
double %a1, ; %d2
double %a2, ; %d4


@@ -2408,71 +2408,70 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
; AVX2-NEXT: pushq %r12
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: subq $16, %rsp
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-NEXT: vpextrq $1, %xmm3, %rcx
; AVX2-NEXT: vmovq %xmm3, %rax
; AVX2-NEXT: vpextrq $1, %xmm2, %rbx
; AVX2-NEXT: vmovq %xmm2, %rdx
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpextrq $1, %xmm2, %rdi
; AVX2-NEXT: vmovq %xmm2, %r11
; AVX2-NEXT: vpextrq $1, %xmm1, %r13
; AVX2-NEXT: vmovq %xmm1, %r12
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpextrq $1, %xmm2, %rbp
; AVX2-NEXT: vmovq %xmm2, %r10
; AVX2-NEXT: vpextrq $1, %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX2-NEXT: vmovq %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpextrq $1, %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX2-NEXT: vmovq %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX2-NEXT: vpextrq $1, %xmm4, %r15
; AVX2-NEXT: addq %rcx, %r15
; AVX2-NEXT: vmovq %xmm4, %r9
; AVX2-NEXT: addq %rax, %r9
; AVX2-NEXT: vpextrq $1, %xmm3, %rax
; AVX2-NEXT: vpextrq $1, %xmm4, %rbx
; AVX2-NEXT: vmovq %xmm4, %rbp
; AVX2-NEXT: vpextrq $1, %xmm3, %rdi
; AVX2-NEXT: vmovq %xmm3, %rcx
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-NEXT: vpextrq $1, %xmm3, %rdx
; AVX2-NEXT: vmovq %xmm3, %r9
; AVX2-NEXT: vpextrq $1, %xmm2, %r11
; AVX2-NEXT: vmovq %xmm2, %r12
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-NEXT: vpextrq $1, %xmm3, %r15
; AVX2-NEXT: vmovq %xmm3, %rsi
; AVX2-NEXT: vpextrq $1, %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX2-NEXT: vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpextrq $1, %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX2-NEXT: vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX2-NEXT: vpextrq $1, %xmm4, %rax
; AVX2-NEXT: addq %rbx, %rax
; AVX2-NEXT: movq %rax, %rbx
; AVX2-NEXT: vmovq %xmm3, %rax
; AVX2-NEXT: addq %rdx, %rax
; AVX2-NEXT: movq %rax, %r8
; AVX2-NEXT: vmovq %xmm4, %r13
; AVX2-NEXT: addq %rbp, %r13
; AVX2-NEXT: vpextrq $1, %xmm3, %r10
; AVX2-NEXT: addq %rdi, %r10
; AVX2-NEXT: vmovq %xmm3, %r14
; AVX2-NEXT: addq %rcx, %r14
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-NEXT: vpextrq $1, %xmm3, %rax
; AVX2-NEXT: addq %rdi, %rax
; AVX2-NEXT: addq %rdx, %rax
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: vmovq %xmm3, %rax
; AVX2-NEXT: vmovq %xmm3, %r8
; AVX2-NEXT: addq %r9, %r8
; AVX2-NEXT: vpextrq $1, %xmm2, %rax
; AVX2-NEXT: addq %r11, %rax
; AVX2-NEXT: movq %rax, %r11
; AVX2-NEXT: vpextrq $1, %xmm2, %r14
; AVX2-NEXT: addq %r13, %r14
; AVX2-NEXT: vmovq %xmm2, %rax
; AVX2-NEXT: addq %r12, %rax
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-NEXT: vpextrq $1, %xmm3, %rax
; AVX2-NEXT: addq %rbp, %rax
; AVX2-NEXT: addq %r15, %rax
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: vmovq %xmm3, %rax
; AVX2-NEXT: addq %r10, %rax
; AVX2-NEXT: addq %rsi, %rax
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: vpextrq $1, %xmm2, %rax
; AVX2-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
@@ -2480,36 +2479,36 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
; AVX2-NEXT: vmovq %xmm2, %rax
; AVX2-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpextrq $1, %xmm2, %rbp
; AVX2-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
; AVX2-NEXT: vmovq %xmm2, %r10
; AVX2-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vpextrq $1, %xmm1, %rdi
; AVX2-NEXT: vmovq %xmm2, %r9
; AVX2-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
; AVX2-NEXT: vpextrq $1, %xmm1, %rax
; AVX2-NEXT: vpextrq $1, %xmm0, %rdi
; AVX2-NEXT: addq %rax, %rdi
; AVX2-NEXT: vmovq %xmm0, %rdx
; AVX2-NEXT: vmovq %xmm1, %rsi
; AVX2-NEXT: vmovq %xmm1, %rdx
; AVX2-NEXT: vmovq %xmm0, %rsi
; AVX2-NEXT: addq %rdx, %rsi
; AVX2-NEXT: addq $-1, %r15
; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: movl $0, %eax
; AVX2-NEXT: adcq $-1, %rax
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: addq $-1, %r9
; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: movl $0, %eax
; AVX2-NEXT: adcq $-1, %rax
; AVX2-NEXT: movq %rax, (%rsp) # 8-byte Spill
; AVX2-NEXT: addq $-1, %rbx
; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: movl $0, %eax
; AVX2-NEXT: adcq $-1, %rax
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: addq $-1, %r8
; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: addq $-1, %r13
; AVX2-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: movl $0, %eax
; AVX2-NEXT: adcq $-1, %rax
; AVX2-NEXT: movq %rax, (%rsp) # 8-byte Spill
; AVX2-NEXT: addq $-1, %r10
; AVX2-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: movl $0, %eax
; AVX2-NEXT: adcq $-1, %rax
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: addq $-1, %r14
; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: movl $0, %r13d
; AVX2-NEXT: adcq $-1, %r13
; AVX2-NEXT: addq $-1, %rcx
@@ -2517,12 +2516,12 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
; AVX2-NEXT: movl $0, %eax
; AVX2-NEXT: adcq $-1, %rax
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: addq $-1, %r11
; AVX2-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: addq $-1, %r8
; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: movl $0, %r15d
; AVX2-NEXT: adcq $-1, %r15
; AVX2-NEXT: addq $-1, %r14
; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: addq $-1, %r11
; AVX2-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: movl $0, %ebx
; AVX2-NEXT: adcq $-1, %rbx
; AVX2-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
@@ -2546,9 +2545,9 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
; AVX2-NEXT: addq $-1, %rbp
; AVX2-NEXT: movl $0, %r14d
; AVX2-NEXT: adcq $-1, %r14
; AVX2-NEXT: addq $-1, %r10
; AVX2-NEXT: movl $0, %r9d
; AVX2-NEXT: adcq $-1, %r9
; AVX2-NEXT: addq $-1, %r9
; AVX2-NEXT: movl $0, %r10d
; AVX2-NEXT: adcq $-1, %r10
; AVX2-NEXT: addq $-1, %rdi
; AVX2-NEXT: movl $0, %edx
; AVX2-NEXT: adcq $-1, %rdx
@@ -2558,7 +2557,7 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
; AVX2-NEXT: shldq $63, %rsi, %rax
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: shldq $63, %rdi, %rdx
; AVX2-NEXT: shldq $63, %r10, %r9
; AVX2-NEXT: shldq $63, %r9, %r10
; AVX2-NEXT: shldq $63, %rbp, %r14
; AVX2-NEXT: shldq $63, %rcx, %r11
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
@@ -2566,8 +2565,8 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; AVX2-NEXT: shldq $63, %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
; AVX2-NEXT: shldq $63, %rcx, %r10
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
; AVX2-NEXT: shldq $63, %rcx, %r9
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; AVX2-NEXT: shldq $63, %rcx, %r8
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
@@ -2596,13 +2595,13 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
; AVX2-NEXT: vmovq %r15, %xmm13
; AVX2-NEXT: vmovq %rbx, %xmm14
; AVX2-NEXT: vmovq %r8, %xmm15
; AVX2-NEXT: vmovq %r10, %xmm0
; AVX2-NEXT: vmovq %r9, %xmm0
; AVX2-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Folded Reload
; AVX2-NEXT: # xmm1 = mem[0],zero
; AVX2-NEXT: vmovq %r12, %xmm2
; AVX2-NEXT: vmovq %r11, %xmm3
; AVX2-NEXT: vmovq %r14, %xmm4
; AVX2-NEXT: vmovq %r9, %xmm5
; AVX2-NEXT: vmovq %r10, %xmm5
; AVX2-NEXT: vmovq %rdx, %xmm6
; AVX2-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 8-byte Folded Reload
; AVX2-NEXT: # xmm7 = mem[0],zero
@@ -2658,58 +2657,58 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
; AVX512-NEXT: pushq %rbx
; AVX512-NEXT: subq $24, %rsp
; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX512-NEXT: vpextrq $1, %xmm4, %rbx
; AVX512-NEXT: vmovq %xmm4, %rbp
; AVX512-NEXT: vpextrq $1, %xmm3, %rdi
; AVX512-NEXT: vmovq %xmm3, %rsi
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512-NEXT: vpextrq $1, %xmm3, %rcx
; AVX512-NEXT: vmovq %xmm3, %rax
; AVX512-NEXT: vpextrq $1, %xmm2, %rbx
; AVX512-NEXT: vmovq %xmm2, %rbp
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512-NEXT: vpextrq $1, %xmm2, %rdi
; AVX512-NEXT: vmovq %xmm2, %r8
; AVX512-NEXT: vpextrq $1, %xmm1, %r13
; AVX512-NEXT: vmovq %xmm1, %r12
; AVX512-NEXT: vpextrq $1, %xmm3, %rdx
; AVX512-NEXT: vmovq %xmm3, %r8
; AVX512-NEXT: vpextrq $1, %xmm2, %r13
; AVX512-NEXT: vmovq %xmm2, %r12
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512-NEXT: vpextrq $1, %xmm2, %r15
; AVX512-NEXT: vmovq %xmm2, %r14
; AVX512-NEXT: vpextrq $1, %xmm1, %rdx
; AVX512-NEXT: vmovq %xmm1, %r9
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512-NEXT: vpextrq $1, %xmm3, %r15
; AVX512-NEXT: vmovq %xmm3, %r14
; AVX512-NEXT: vpextrq $1, %xmm2, %r9
; AVX512-NEXT: vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpextrq $1, %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX512-NEXT: vmovq %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512-NEXT: vpextrq $1, %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX512-NEXT: vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX512-NEXT: vpextrq $1, %xmm4, %rsi
; AVX512-NEXT: addq %rcx, %rsi
; AVX512-NEXT: vmovq %xmm4, %rcx
; AVX512-NEXT: addq %rax, %rcx
; AVX512-NEXT: vpextrq $1, %xmm3, %rax
; AVX512-NEXT: vpextrq $1, %xmm4, %rax
; AVX512-NEXT: addq %rbx, %rax
; AVX512-NEXT: movq %rax, %rbx
; AVX512-NEXT: vmovq %xmm3, %rax
; AVX512-NEXT: vmovq %xmm4, %rax
; AVX512-NEXT: addq %rbp, %rax
; AVX512-NEXT: movq %rax, %r10
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512-NEXT: movq %rax, %rbp
; AVX512-NEXT: vpextrq $1, %xmm3, %rax
; AVX512-NEXT: addq %rdi, %rax
; AVX512-NEXT: movq %rax, %rdi
; AVX512-NEXT: vmovq %xmm3, %r10
; AVX512-NEXT: addq %rsi, %r10
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512-NEXT: vpextrq $1, %xmm3, %rcx
; AVX512-NEXT: addq %rdx, %rcx
; AVX512-NEXT: vmovq %xmm3, %rax
; AVX512-NEXT: addq %r8, %rax
; AVX512-NEXT: movq %rax, %r8
; AVX512-NEXT: vpextrq $1, %xmm2, %rbp
; AVX512-NEXT: addq %r13, %rbp
; AVX512-NEXT: vpextrq $1, %xmm2, %rsi
; AVX512-NEXT: addq %r13, %rsi
; AVX512-NEXT: vmovq %xmm2, %r11
; AVX512-NEXT: addq %r12, %r11
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1
@@ -2724,10 +2723,10 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
; AVX512-NEXT: addq %r14, %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: vpextrq $1, %xmm2, %rax
; AVX512-NEXT: addq %rdx, %rax
; AVX512-NEXT: addq %r9, %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: vmovq %xmm2, %rax
; AVX512-NEXT: addq %r9, %rax
; AVX512-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
@@ -2742,8 +2741,23 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vmovq %xmm1, %rdx
; AVX512-NEXT: addq %rax, %rdx
; AVX512-NEXT: addq $-1, %rsi
; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: addq $-1, %rbx
; AVX512-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: movl $0, %eax
; AVX512-NEXT: adcq $-1, %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: addq $-1, %rbp
; AVX512-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: movl $0, %eax
; AVX512-NEXT: adcq $-1, %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: addq $-1, %rdi
; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: movl $0, %eax
; AVX512-NEXT: adcq $-1, %rax
; AVX512-NEXT: movq %rax, (%rsp) # 8-byte Spill
; AVX512-NEXT: addq $-1, %r10
; AVX512-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: movl $0, %eax
; AVX512-NEXT: adcq $-1, %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -2752,28 +2766,13 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
; AVX512-NEXT: movl $0, %eax
; AVX512-NEXT: adcq $-1, %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: addq $-1, %rbx
; AVX512-NEXT: movq %rbx, (%rsp) # 8-byte Spill
; AVX512-NEXT: movl $0, %eax
; AVX512-NEXT: adcq $-1, %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: addq $-1, %r10
; AVX512-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: movl $0, %eax
; AVX512-NEXT: adcq $-1, %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: addq $-1, %rdi
; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: movl $0, %eax
; AVX512-NEXT: adcq $-1, %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: addq $-1, %r8
; AVX512-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: movl $0, %eax
; AVX512-NEXT: adcq $-1, %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: addq $-1, %rbp
; AVX512-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: addq $-1, %rsi
; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: movl $0, %r13d
; AVX512-NEXT: adcq $-1, %r13
; AVX512-NEXT: addq $-1, %r11
@@ -2833,8 +2832,8 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX512-NEXT: shldq $63, %rdx, %rax
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
; AVX512-NEXT: movq (%rsp), %rdx # 8-byte Reload
; AVX512-NEXT: movq (%rsp), %r14 # 8-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX512-NEXT: shldq $63, %rdx, %r14
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload


@@ -24,13 +24,13 @@ define i32 @t(i8* %ref_frame_ptr, i32 %ref_frame_stride, i32 %idxX, i32 %idxY) n
; X64-NEXT: imull %ecx, %esi
; X64-NEXT: leal (%rsi,%rdx), %eax
; X64-NEXT: cltq
; X64-NEXT: movl (%rdi,%rax), %eax
; X64-NEXT: leal 4(%rsi,%rdx), %ecx
; X64-NEXT: movslq %ecx, %rcx
; X64-NEXT: movzwl (%rdi,%rcx), %ecx
; X64-NEXT: shlq $32, %rcx
; X64-NEXT: movl (%rdi,%rax), %eax
; X64-NEXT: orq %rcx, %rax
; X64-NEXT: movq %rax, %xmm0
; X64-NEXT: orq %rax, %rcx
; X64-NEXT: movq %rcx, %xmm0
; X64-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
; X64-NEXT: movd %xmm0, %eax


@@ -1,4 +1,5 @@
; RUN: llc < %s | FileCheck %s
; RUN: llc < %s | FileCheck %s -check-prefix=ASM
; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -stop-after livedebugvalues -o - | FileCheck %s -check-prefix=MIR
; PR9055
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32"
target triple = "i686-pc-linux-gnu"
@@ -7,17 +8,51 @@ target triple = "i686-pc-linux-gnu"
@g_98 = common global %struct.S0 zeroinitializer, align 4
define void @foo() nounwind {
; CHECK: movzbl
; CHECK-NOT: movzbl
; CHECK: calll
define void @foo() nounwind !dbg !5 {
; ASM: movzbl
; ASM-NOT: movzbl
; ASM: calll
entry:
%tmp17 = load i8, i8* getelementptr inbounds (%struct.S0, %struct.S0* @g_98, i32 0, i32 1, i32 0), align 4
%tmp54 = zext i8 %tmp17 to i32
%foo = load i32, i32* bitcast (i8* getelementptr inbounds (%struct.S0, %struct.S0* @g_98, i32 0, i32 1, i32 0) to i32*), align 4
%conv.i = trunc i32 %foo to i8
tail call void @func_12(i32 %tmp54, i8 zeroext %conv.i) nounwind
ret void
%tmp17 = load i8, i8* getelementptr inbounds (%struct.S0, %struct.S0* @g_98, i32 0, i32 1, i32 0), align 4, !dbg !14
%tmp54 = zext i8 %tmp17 to i32, !dbg !15
%foo = load i32, i32* bitcast (i8* getelementptr inbounds (%struct.S0, %struct.S0* @g_98, i32 0, i32 1, i32 0) to i32*), align 4, !dbg !16
; MIR: renamable $edi = MOVZX32rr8 renamable $al, debug-location !16
%conv.i = trunc i32 %foo to i8, !dbg !17
tail call void @func_12(i32 %tmp54, i8 zeroext %conv.i) #0, !dbg !18
call void @llvm.dbg.value(metadata i8 %tmp17, metadata !8, metadata !DIExpression()), !dbg !14
call void @llvm.dbg.value(metadata i32 %tmp54, metadata !10, metadata !DIExpression()), !dbg !15
call void @llvm.dbg.value(metadata i32 %foo, metadata !12, metadata !DIExpression()), !dbg !16
call void @llvm.dbg.value(metadata i8 %conv.i, metadata !13, metadata !DIExpression()), !dbg !17
ret void, !dbg !19
}
declare void @func_12(i32, i8 zeroext)
declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.debugify = !{!3, !4}
!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
!1 = !DIFile(filename: "/Users/vsk/src/llvm.org-master/llvm/test/CodeGen/X86/fold-zext-trunc.ll", directory: "/")
!2 = !{}
!3 = !{i32 6}
!4 = !{i32 4}
!5 = distinct !DISubprogram(name: "foo", linkageName: "foo", scope: null, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: true, unit: !0, variables: !7)
!6 = !DISubroutineType(types: !2)
!7 = !{!8, !10, !12, !13}
!8 = !DILocalVariable(name: "1", scope: !5, file: !1, line: 1, type: !9)
!9 = !DIBasicType(name: "ty8", size: 8, encoding: DW_ATE_unsigned)
!10 = !DILocalVariable(name: "2", scope: !5, file: !1, line: 2, type: !11)
!11 = !DIBasicType(name: "ty32", size: 32, encoding: DW_ATE_unsigned)
!12 = !DILocalVariable(name: "3", scope: !5, file: !1, line: 3, type: !11)
!13 = !DILocalVariable(name: "4", scope: !5, file: !1, line: 4, type: !9)
!14 = !DILocation(line: 1, column: 1, scope: !5)
!15 = !DILocation(line: 2, column: 1, scope: !5)
!16 = !DILocation(line: 3, column: 1, scope: !5)
!17 = !DILocation(line: 4, column: 1, scope: !5)
!18 = !DILocation(line: 5, column: 1, scope: !5)
!19 = !DILocation(line: 6, column: 1, scope: !5)
!20 = !{i32 2, !"Debug Info Version", i32 3}
!llvm.module.flags = !{!20}


@@ -51,8 +51,8 @@ define <4 x float> @knownbits_insert_uitofp(<4 x i32> %a0, i16 %a1, i16 %a2) nou
; X32: # %bb.0:
; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovd %eax, %xmm0
; X32-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
; X32-NEXT: vmovd %ecx, %xmm0
; X32-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
; X32-NEXT: vcvtdq2ps %xmm0, %xmm0
; X32-NEXT: retl


@@ -4,8 +4,8 @@
define i64 @test1(i32 %xx, i32 %test) nounwind {
; CHECK-LABEL: test1:
; CHECK: # %bb.0:
; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl
; CHECK-NEXT: andb $7, %cl
; CHECK-NEXT: movl %edx, %eax
; CHECK-NEXT: shll %cl, %eax


@@ -915,7 +915,7 @@ define i32 @load_i32_by_i8_base_offset_index(i8* %arg, i32 %i) {
; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl 12(%eax,%ecx), %eax
; CHECK-NEXT: movl 12(%ecx,%eax), %eax
; CHECK-NEXT: retl
;
; CHECK64-LABEL: load_i32_by_i8_base_offset_index:
@@ -960,7 +960,7 @@ define i32 @load_i32_by_i8_base_offset_index_2(i8* %arg, i32 %i) {
; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl 13(%eax,%ecx), %eax
; CHECK-NEXT: movl 13(%ecx,%eax), %eax
; CHECK-NEXT: retl
;
; CHECK64-LABEL: load_i32_by_i8_base_offset_index_2:
@@ -1016,7 +1016,7 @@ define i32 @load_i32_by_i8_zaext_loads(i8* %arg, i32 %arg1) {
; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl 12(%eax,%ecx), %eax
; CHECK-NEXT: movl 12(%ecx,%eax), %eax
; CHECK-NEXT: retl
;
; CHECK64-LABEL: load_i32_by_i8_zaext_loads:
@@ -1072,7 +1072,7 @@ define i32 @load_i32_by_i8_zsext_loads(i8* %arg, i32 %arg1) {
; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl 12(%eax,%ecx), %eax
; CHECK-NEXT: movl 12(%ecx,%eax), %eax
; CHECK-NEXT: retl
;
; CHECK64-LABEL: load_i32_by_i8_zsext_loads:


@@ -17,8 +17,8 @@ define i64 @f1(i32 %a, i32 %b) {
define i64 @f2(i32 %a, i32* %p) {
; CHECK-LABEL: f2:
; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: mulxl (%eax), %eax, %edx
; CHECK-NEXT: retl
%b = load i32, i32* %p


@@ -9,10 +9,10 @@ define <3 x i16> @zext_i8(<3 x i8>) {
; SSE3-LABEL: zext_i8:
; SSE3: # %bb.0:
; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; SSE3-NEXT: movd %eax, %xmm0
; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; SSE3-NEXT: pinsrw $1, %eax, %xmm0
; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %edx
; SSE3-NEXT: movd %edx, %xmm0
; SSE3-NEXT: pinsrw $1, %ecx, %xmm0
; SSE3-NEXT: pinsrw $2, %eax, %xmm0
; SSE3-NEXT: pextrw $0, %xmm0, %eax
; SSE3-NEXT: pextrw $1, %xmm0, %edx
@@ -71,10 +71,10 @@ define <3 x i16> @sext_i8(<3 x i8>) {
; SSE3-LABEL: sext_i8:
; SSE3: # %bb.0:
; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; SSE3-NEXT: movd %eax, %xmm0
; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; SSE3-NEXT: pinsrw $1, %eax, %xmm0
; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %edx
; SSE3-NEXT: movd %edx, %xmm0
; SSE3-NEXT: pinsrw $1, %ecx, %xmm0
; SSE3-NEXT: pinsrw $2, %eax, %xmm0
; SSE3-NEXT: psllw $8, %xmm0
; SSE3-NEXT: psraw $8, %xmm0


@@ -60,13 +60,13 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
; X86-SSE2-NEXT: movd %edx, %xmm0
; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X86-SSE2-NEXT: movzbl 2(%ecx), %ecx
; X86-SSE2-NEXT: movdqa %xmm0, (%esp)
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE2-NEXT: shll $8, %edx
; X86-SSE2-NEXT: movzbl (%esp), %esi
; X86-SSE2-NEXT: orl %edx, %esi
; X86-SSE2-NEXT: movd %esi, %xmm0
; X86-SSE2-NEXT: movzbl 2(%ecx), %ecx
; X86-SSE2-NEXT: pinsrw $1, %ecx, %xmm0
; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
@@ -108,13 +108,13 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
; X64-SSE2-NEXT: movq %rax, %xmm0
; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X64-SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %eax
; X64-SSE2-NEXT: shll $8, %eax
; X64-SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
; X64-SSE2-NEXT: orl %eax, %ecx
; X64-SSE2-NEXT: movd %ecx, %xmm0
; X64-SSE2-NEXT: movzbl 2(%rsi), %eax
; X64-SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
; X64-SSE2-NEXT: shll $8, %ecx
; X64-SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
; X64-SSE2-NEXT: orl %ecx, %edx
; X64-SSE2-NEXT: movd %edx, %xmm0
; X64-SSE2-NEXT: pinsrw $1, %eax, %xmm0
; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]


@@ -86,13 +86,13 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
; X86-SSE2-NEXT: movd %edx, %xmm0
; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X86-SSE2-NEXT: movzbl 2(%ecx), %ecx
; X86-SSE2-NEXT: movdqa %xmm0, (%esp)
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE2-NEXT: shll $8, %edx
; X86-SSE2-NEXT: movzbl (%esp), %esi
; X86-SSE2-NEXT: orl %edx, %esi
; X86-SSE2-NEXT: movd %esi, %xmm0
; X86-SSE2-NEXT: movzbl 2(%ecx), %ecx
; X86-SSE2-NEXT: pinsrw $1, %ecx, %xmm0
; X86-SSE2-NEXT: pxor %xmm1, %xmm1
; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
@@ -133,13 +133,13 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
; X64-SSE2-NEXT: movq %rax, %xmm0
; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X64-SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %eax
; X64-SSE2-NEXT: shll $8, %eax
; X64-SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
; X64-SSE2-NEXT: orl %eax, %ecx
; X64-SSE2-NEXT: movd %ecx, %xmm0
; X64-SSE2-NEXT: movzbl 2(%rsi), %eax
; X64-SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
; X64-SSE2-NEXT: shll $8, %ecx
; X64-SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
; X64-SSE2-NEXT: orl %ecx, %edx
; X64-SSE2-NEXT: movd %edx, %xmm0
; X64-SSE2-NEXT: pinsrw $1, %eax, %xmm0
; X64-SSE2-NEXT: pxor %xmm1, %xmm1
; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]


@@ -57,11 +57,14 @@ entry:
; WIN32: calll _manyargs
; WIN32-LABEL: _manyargs:
; WIN32-DAG: movsbl 4(%esp),
; WIN32-DAG: movswl 8(%esp),
; WIN32-DAG: movzbl 12(%esp),
; WIN32-DAG: movzwl 16(%esp),
; WIN32-DAG: movzbl 20(%esp),
; WIN32-DAG: movzwl 24(%esp),
; WIN32: pushl %ebx
; WIN32: pushl %edi
; WIN32: pushl %esi
; WIN32-DAG: movsbl 16(%esp),
; WIN32-DAG: movswl 20(%esp),
; WIN32-DAG: movzbl 24(%esp),
; WIN32-DAG: movzwl 28(%esp),
; WIN32-DAG: movzbl 32(%esp),
; WIN32-DAG: movzwl 36(%esp),
; WIN32: retl