[DAGCombiner] Add rotate-extract tests

Add new tests from D47681 to current codegen. Also added i686 codegen tests.

llvm-svn: 337445
This commit is contained in:
Simon Pilgrim 2018-07-19 09:27:34 +00:00
parent 478934d29a
commit d77296c9a0
3 changed files with 750 additions and 0 deletions

View File

@ -0,0 +1,148 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s
; Check that under certain conditions we can factor out a rotate
; from the following idioms:
; (a*c0) >> s1 | (a*c1)
; (a/c0) << s1 | (a/c1)
; This targets cases where instcombine has folded a shl/srl/mul/udiv
; with one of the shifts from the rotate idiom
; i64: ((i << 3) >> 57) | (i << 10), which is rotl(i << 3, 7) because
; (i << 3) << 7 == i << 10. The assertions below record the current output,
; which is lsl + bfxil rather than a rotate instruction.
define i64 @ror_extract_shl(i64 %i) nounwind {
; CHECK-LABEL: ror_extract_shl:
; CHECK: // %bb.0:
; CHECK-NEXT: lsl x8, x0, #10
; CHECK-NEXT: bfxil x8, x0, #54, #7
; CHECK-NEXT: mov x0, x8
; CHECK-NEXT: ret
%lhs_mul = shl i64 %i, 3
%rhs_mul = shl i64 %i, 10
%lhs_shift = lshr i64 %lhs_mul, 57
%out = or i64 %lhs_shift, %rhs_mul
ret i64 %out
}
; i32: (i >> 7) | ((i >> 3) << 28), which is rotr(i >> 3, 4) because
; (i >> 3) >> 4 == i >> 7. The recorded output is a ror by 7 followed by an
; and that clears bits 25..27 (the bits of i that i >> 3 discarded).
define i32 @ror_extract_shrl(i32 %i) nounwind {
; CHECK-LABEL: ror_extract_shrl:
; CHECK: // %bb.0:
; CHECK-NEXT: ror w8, w0, #7
; CHECK-NEXT: and w0, w8, #0xf1ffffff
; CHECK-NEXT: ret
%lhs_div = lshr i32 %i, 7
%rhs_div = lshr i32 %i, 3
%rhs_shift = shl i32 %rhs_div, 28
%out = or i32 %lhs_div, %rhs_shift
ret i32 %out
}
; i32: ((i * 9) >> 25) | (i * 1152). Since 1152 == 9 * 128, the right-hand
; multiply is (i * 9) << 7, so the whole expression is rotl(i * 9, 7).
; The recorded output computes i * 9 with a shifted add and rotates right by 25.
define i32 @ror_extract_mul(i32 %i) nounwind {
; CHECK-LABEL: ror_extract_mul:
; CHECK: // %bb.0:
; CHECK-NEXT: add w8, w0, w0, lsl #3
; CHECK-NEXT: ror w0, w8, #25
; CHECK-NEXT: ret
%lhs_mul = mul i32 %i, 9
%rhs_mul = mul i32 %i, 1152
%lhs_shift = lshr i32 %lhs_mul, 25
%out = or i32 %lhs_shift, %rhs_mul
ret i32 %out
}
; i64: ((i / 3) << 60) | (i / 48). Since 48 == 3 * 16, i / 48 is (i / 3) >> 4,
; so the expression is rotr(i / 3, 4). The recorded output performs the
; division with a umulh by a magic constant, then a ror by 5 and a mask.
define i64 @ror_extract_udiv(i64 %i) nounwind {
; CHECK-LABEL: ror_extract_udiv:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x8, #-6148914691236517206
; CHECK-NEXT: movk x8, #43691
; CHECK-NEXT: umulh x8, x0, x8
; CHECK-NEXT: ror x8, x8, #5
; CHECK-NEXT: and x0, x8, #0xf7ffffffffffffff
; CHECK-NEXT: ret
%lhs_div = udiv i64 %i, 3
%rhs_div = udiv i64 %i, 48
%lhs_shift = shl i64 %lhs_div, 60
%out = or i64 %lhs_shift, %rhs_div
ret i64 %out
}
; i64: ((i * 1152) & 160) | ((i * 9) >> 57). This is the rotl(i * 9, 7) idiom
; (1152 == 9 * 128) with an extra and-mask applied to the left-shifted half.
; The recorded output does not use a rotate instruction.
define i64 @ror_extract_mul_with_mask(i64 %i) nounwind {
; CHECK-LABEL: ror_extract_mul_with_mask:
; CHECK: // %bb.0:
; CHECK-NEXT: add w8, w0, w0, lsl #3
; CHECK-NEXT: lsl w8, w8, #7
; CHECK-NEXT: add x9, x0, x0, lsl #3
; CHECK-NEXT: and x0, x8, #0x80
; CHECK-NEXT: bfxil x0, x9, #57, #7
; CHECK-NEXT: ret
%lhs_mul = mul i64 %i, 1152
%rhs_mul = mul i64 %i, 9
%lhs_and = and i64 %lhs_mul, 160
%rhs_shift = lshr i64 %rhs_mul, 57
%out = or i64 %lhs_and, %rhs_shift
ret i64 %out
}
; Result would undershift
; Negative test, i64: ((i << 5) >> 57) | (i << 10). A rotate-left by 7 of
; (i << 5) would need i << 12 on the other side, but the IR has i << 10,
; so the two halves do not form a rotate of a common value.
define i64 @no_extract_shl(i64 %i) nounwind {
; CHECK-LABEL: no_extract_shl:
; CHECK: // %bb.0:
; CHECK-NEXT: lsl x8, x0, #10
; CHECK-NEXT: bfxil x8, x0, #52, #7
; CHECK-NEXT: mov x0, x8
; CHECK-NEXT: ret
%lhs_mul = shl i64 %i, 5
%rhs_mul = shl i64 %i, 10
%lhs_shift = lshr i64 %lhs_mul, 57
%out = or i64 %lhs_shift, %rhs_mul
ret i64 %out
}
; Result would overshift
; Negative test, i32: ((i >> 3) << 28) | (i >> 9). A rotate-right by 4 of
; (i >> 3) would need i >> 7 on the other side, but the IR has i >> 9,
; so no rotate can be factored out.
define i32 @no_extract_shrl(i32 %i) nounwind {
; CHECK-LABEL: no_extract_shrl:
; CHECK: // %bb.0:
; CHECK-NEXT: lsr w8, w0, #3
; CHECK-NEXT: lsr w0, w0, #9
; CHECK-NEXT: bfi w0, w8, #28, #4
; CHECK-NEXT: ret
%lhs_div = lshr i32 %i, 3
%rhs_div = lshr i32 %i, 9
%lhs_shift = shl i32 %lhs_div, 28
%out = or i32 %lhs_shift, %rhs_div
ret i32 %out
}
; Can factor 128 from 2304, but result is 18 instead of 9
; Negative test, i64: (i * 2304) | ((i * 9) >> 57). The lshr by 57 pairs with
; a left shift by 7 (a factor of 128), but 2304 / 128 == 18, which does not
; match the multiplier 9 used on the shifted side.
define i64 @no_extract_mul(i64 %i) nounwind {
; CHECK-LABEL: no_extract_mul:
; CHECK: // %bb.0:
; CHECK-NEXT: add x8, x0, x0, lsl #3
; CHECK-NEXT: lsr x0, x8, #57
; CHECK-NEXT: bfi x0, x8, #8, #56
; CHECK-NEXT: ret
%lhs_mul = mul i64 %i, 2304
%rhs_mul = mul i64 %i, 9
%rhs_shift = lshr i64 %rhs_mul, 57
%out = or i64 %lhs_mul, %rhs_shift
ret i64 %out
}
; Can't evenly factor 16 from 49
; Negative test, i32: ((i / 3) << 28) | (i / 49). The shl by 28 pairs with a
; right shift by 4 (a divide by 16), but 49 is not a multiple of 16, so the
; two divisions cannot be expressed as one value rotated.
define i32 @no_extract_udiv(i32 %i) nounwind {
; CHECK-LABEL: no_extract_udiv:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #43691
; CHECK-NEXT: mov w9, #33437
; CHECK-NEXT: movk w8, #43690, lsl #16
; CHECK-NEXT: movk w9, #21399, lsl #16
; CHECK-NEXT: umull x8, w0, w8
; CHECK-NEXT: umull x9, w0, w9
; CHECK-NEXT: lsr x8, x8, #33
; CHECK-NEXT: lsr x9, x9, #32
; CHECK-NEXT: extr w0, w8, w9, #4
; CHECK-NEXT: ret
%lhs_div = udiv i32 %i, 3
%rhs_div = udiv i32 %i, 49
%lhs_shift = shl i32 %lhs_div, 28
%out = or i32 %lhs_shift, %rhs_div
ret i32 %out
}

View File

@ -0,0 +1,317 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X64
; Check that under certain conditions we can factor out a rotate
; from the following idioms:
; (a*c0) >> s1 | (a*c1)
; (a/c0) << s1 | (a/c1)
; This targets cases where instcombine has folded a shl/srl/mul/udiv
; with one of the shifts from the rotate idiom
; Per i32 lane: ((i << 3) >> 25) | (i << 10), which is rotl(i << 3, 7) because
; (i << 3) << 7 == i << 10. The assertions below record the current output,
; which uses two left shifts, a right shift and an or rather than a rotate.
; Marked nounwind like every other test function in this file.
define <4 x i32> @vroll_v4i32_extract_shl(<4 x i32> %i) nounwind {
; CHECK-LABEL: vroll_v4i32_extract_shl:
; CHECK: # %bb.0:
; CHECK-NEXT: vpslld $3, %xmm0, %xmm1
; CHECK-NEXT: vpslld $10, %xmm0, %xmm0
; CHECK-NEXT: vpsrld $25, %xmm1, %xmm1
; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%lhs_mul = shl <4 x i32> %i, <i32 3, i32 3, i32 3, i32 3>
%rhs_mul = shl <4 x i32> %i, <i32 10, i32 10, i32 10, i32 10>
%lhs_shift = lshr <4 x i32> %lhs_mul, <i32 25, i32 25, i32 25, i32 25>
%out = or <4 x i32> %lhs_shift, %rhs_mul
ret <4 x i32> %out
}
; Per i64 lane: (i >> 40) | ((i >> 5) << 29), which is rotl(i >> 5, 29) because
; (i >> 5) >> 35 == i >> 40. Both runs record a vprolq by 24 on the widened
; zmm register followed by a vpand; they differ only in how the mask is loaded.
define <4 x i64> @vrolq_v4i64_extract_shrl(<4 x i64> %i) nounwind {
; X86-LABEL: vrolq_v4i64_extract_shrl:
; X86: # %bb.0:
; X86-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; X86-NEXT: vprolq $24, %zmm0, %zmm0
; X86-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: vrolq_v4i64_extract_shrl:
; X64: # %bb.0:
; X64-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-NEXT: vprolq $24, %zmm0, %zmm0
; X64-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744073189457919,18446744073189457919,18446744073189457919,18446744073189457919]
; X64-NEXT: vpand %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%lhs_div = lshr <4 x i64> %i, <i64 40, i64 40, i64 40, i64 40>
%rhs_div = lshr <4 x i64> %i, <i64 5, i64 5, i64 5, i64 5>
%rhs_shift = shl <4 x i64> %rhs_div, <i64 29, i64 29, i64 29, i64 29>
%out = or <4 x i64> %lhs_div, %rhs_shift
ret <4 x i64> %out
}
; Per i32 lane: (i * 640) | ((i * 10) >> 26). Since 640 == 10 * 64, the left
; operand is (i * 10) << 6, so the expression is rotl(i * 10, 6).
; The recorded output keeps both multiplies and does not use a rotate.
define <8 x i32> @vroll_extract_mul(<8 x i32> %i) nounwind {
; CHECK-LABEL: vroll_extract_mul:
; CHECK: # %bb.0:
; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 = [640,640,640,640,640,640,640,640]
; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm1
; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm2 = [10,10,10,10,10,10,10,10]
; CHECK-NEXT: vpmulld %ymm2, %ymm0, %ymm0
; CHECK-NEXT: vpsrld $26, %ymm0, %ymm0
; CHECK-NEXT: vpor %ymm0, %ymm1, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%lhs_mul = mul <8 x i32> %i, <i32 640, i32 640, i32 640, i32 640, i32 640, i32 640, i32 640, i32 640>
%rhs_mul = mul <8 x i32> %i, <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
%rhs_shift = lshr <8 x i32> %rhs_mul, <i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26>
%out = or <8 x i32> %lhs_mul, %rhs_shift
ret <8 x i32> %out
}
; Per i64 lane: ((i / 3) << 57) | (i / 384). Since 384 == 3 * 128, i / 384 is
; (i / 3) >> 7, so the expression is rotr(i / 3, 7).
; The recorded i686 output makes four __udivdi3 calls (two per divisor); the
; recorded x86-64 output scalarizes the division with mulq by a magic constant.
; Neither uses a rotate instruction.
define <2 x i64> @vrolq_extract_udiv(<2 x i64> %i) nounwind {
; X86-LABEL: vrolq_extract_udiv:
; X86: # %bb.0:
; X86-NEXT: subl $60, %esp
; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $3, {{[0-9]+}}(%esp)
; X86-NEXT: calll __udivdi3
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: vextractps $2, %xmm0, (%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $3, {{[0-9]+}}(%esp)
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0
; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: calll __udivdi3
; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
; X86-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $384, {{[0-9]+}}(%esp) # imm = 0x180
; X86-NEXT: calll __udivdi3
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: vextractps $2, %xmm0, (%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $384, {{[0-9]+}}(%esp) # imm = 0x180
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0
; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: calll __udivdi3
; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
; X86-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT: vpsllq $57, %xmm1, %xmm1
; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
; X86-NEXT: addl $60, %esp
; X86-NEXT: retl
;
; X64-LABEL: vrolq_extract_udiv:
; X64: # %bb.0:
; X64-NEXT: vpextrq $1, %xmm0, %rax
; X64-NEXT: movabsq $-6148914691236517205, %rsi # imm = 0xAAAAAAAAAAAAAAAB
; X64-NEXT: mulq %rsi
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq %rdx, %rax
; X64-NEXT: shrq %rax
; X64-NEXT: vmovq %rax, %xmm1
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: mulq %rsi
; X64-NEXT: movq %rdx, %rax
; X64-NEXT: shrq %rax
; X64-NEXT: vmovq %rax, %xmm0
; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: shrq $8, %rcx
; X64-NEXT: vmovq %rcx, %xmm1
; X64-NEXT: shrq $8, %rdx
; X64-NEXT: vmovq %rdx, %xmm2
; X64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; X64-NEXT: vpsllq $57, %xmm0, %xmm0
; X64-NEXT: vpor %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%lhs_div = udiv <2 x i64> %i, <i64 3, i64 3>
%rhs_div = udiv <2 x i64> %i, <i64 384, i64 384>
%lhs_shift = shl <2 x i64> %lhs_div, <i64 57, i64 57>
%out = or <2 x i64> %lhs_shift, %rhs_div
ret <2 x i64> %out
}
; Per i32 lane: ((i * 1152) & 160) | ((i * 9) >> 25). This is the
; rotl(i * 9, 7) idiom (1152 == 9 * 128) with an and-mask on the shifted half.
; NOTE(review): the function name says "vrolw" although the lanes are i32.
; The recorded output keeps both multiplies and does not use a rotate.
define <4 x i32> @vrolw_extract_mul_with_mask(<4 x i32> %i) nounwind {
; CHECK-LABEL: vrolw_extract_mul_with_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1152,1152,1152,1152]
; CHECK-NEXT: vpmulld %xmm1, %xmm0, %xmm1
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm2 = [9,9,9,9]
; CHECK-NEXT: vpmulld %xmm2, %xmm0, %xmm0
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm2 = [160,160,160,160]
; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1
; CHECK-NEXT: vpsrld $25, %xmm0, %xmm0
; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%lhs_mul = mul <4 x i32> %i, <i32 1152, i32 1152, i32 1152, i32 1152>
%rhs_mul = mul <4 x i32> %i, <i32 9, i32 9, i32 9, i32 9>
%lhs_and = and <4 x i32> %lhs_mul, <i32 160, i32 160, i32 160, i32 160>
%rhs_shift = lshr <4 x i32> %rhs_mul, <i32 25, i32 25, i32 25, i32 25>
%out = or <4 x i32> %lhs_and, %rhs_shift
ret <4 x i32> %out
}
; Per i16 lane: (i * 640) | ((i * 10) >> 10). Arithmetically this matches
; rotl(i * 10, 6), since 640 == 10 * 64 and 16 - 6 == 10.
; NOTE(review): "illegal" presumably means a <32 x i16> rotate is not legal
; for this target, so no extraction is expected - confirm against the patch.
; The recorded output is two vpmullw, a vpsrlw and a vporq.
define <32 x i16> @illegal_no_extract_mul(<32 x i16> %i) nounwind {
; X86-LABEL: illegal_no_extract_mul:
; X86: # %bb.0:
; X86-NEXT: vpmullw {{\.LCPI.*}}, %zmm0, %zmm1
; X86-NEXT: vpmullw {{\.LCPI.*}}, %zmm0, %zmm0
; X86-NEXT: vpsrlw $10, %zmm0, %zmm0
; X86-NEXT: vporq %zmm0, %zmm1, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: illegal_no_extract_mul:
; X64: # %bb.0:
; X64-NEXT: vpmullw {{.*}}(%rip), %zmm0, %zmm1
; X64-NEXT: vpmullw {{.*}}(%rip), %zmm0, %zmm0
; X64-NEXT: vpsrlw $10, %zmm0, %zmm0
; X64-NEXT: vporq %zmm0, %zmm1, %zmm0
; X64-NEXT: retq
%lhs_mul = mul <32 x i16> %i, <i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640>
%rhs_mul = mul <32 x i16> %i, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
%rhs_shift = lshr <32 x i16> %rhs_mul, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
%out = or <32 x i16> %lhs_mul, %rhs_shift
ret <32 x i16> %out
}
; Result would undershift
; Negative test, per i64 lane: ((i << 11) >> 50) | (i << 24). A rotate-left
; by 14 of (i << 11) would need i << 25 on the other side, but the IR has
; i << 24, so the halves do not form a rotate of a common value.
define <4 x i64> @no_extract_shl(<4 x i64> %i) nounwind {
; CHECK-LABEL: no_extract_shl:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllq $11, %ymm0, %ymm1
; CHECK-NEXT: vpsllq $24, %ymm0, %ymm0
; CHECK-NEXT: vpsrlq $50, %ymm1, %ymm1
; CHECK-NEXT: vpor %ymm0, %ymm1, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%lhs_mul = shl <4 x i64> %i, <i64 11, i64 11, i64 11, i64 11>
%rhs_mul = shl <4 x i64> %i, <i64 24, i64 24, i64 24, i64 24>
%lhs_shift = lshr <4 x i64> %lhs_mul, <i64 50, i64 50, i64 50, i64 50>
%out = or <4 x i64> %lhs_shift, %rhs_mul
ret <4 x i64> %out
}
; Result would overshift
; Negative test, per i32 lane: ((i >> 3) << 28) | (i >> 9). A rotate-right
; by 4 of (i >> 3) would need i >> 7 on the other side, but the IR has
; i >> 9, so no rotate can be factored out.
define <4 x i32> @no_extract_shrl(<4 x i32> %i) nounwind {
; CHECK-LABEL: no_extract_shrl:
; CHECK: # %bb.0:
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4026531840,4026531840,4026531840,4026531840]
; CHECK-NEXT: vpslld $25, %xmm0, %xmm2
; CHECK-NEXT: vpand %xmm1, %xmm2, %xmm1
; CHECK-NEXT: vpsrld $9, %xmm0, %xmm0
; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%lhs_div = lshr <4 x i32> %i, <i32 3, i32 3, i32 3, i32 3>
%rhs_div = lshr <4 x i32> %i, <i32 9, i32 9, i32 9, i32 9>
%lhs_shift = shl <4 x i32> %lhs_div, <i32 28, i32 28, i32 28, i32 28>
%out = or <4 x i32> %lhs_shift, %rhs_div
ret <4 x i32> %out
}
; Can factor 512 from 1536, but result is 3 instead of 9
; Negative test, per i32 lane: (i * 1536) | ((i * 9) >> 23). The lshr by 23
; pairs with a left shift by 9 (a factor of 512), but 1536 / 512 == 3, which
; does not match the multiplier 9 used on the shifted side.
define <8 x i32> @no_extract_mul(<8 x i32> %i) nounwind {
; CHECK-LABEL: no_extract_mul:
; CHECK: # %bb.0:
; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1536,1536,1536,1536,1536,1536,1536,1536]
; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm1
; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm2 = [9,9,9,9,9,9,9,9]
; CHECK-NEXT: vpmulld %ymm2, %ymm0, %ymm0
; CHECK-NEXT: vpsrld $23, %ymm0, %ymm0
; CHECK-NEXT: vpor %ymm0, %ymm1, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%lhs_mul = mul <8 x i32> %i, <i32 1536, i32 1536, i32 1536, i32 1536, i32 1536, i32 1536, i32 1536, i32 1536>
%rhs_mul = mul <8 x i32> %i, <i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9>
%rhs_shift = lshr <8 x i32> %rhs_mul, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
%out = or <8 x i32> %lhs_mul, %rhs_shift
ret <8 x i32> %out
}
; Can't evenly factor 256 from 770
; Negative test, per i64 lane: ((i / 3) << 56) | (i / 770). The shl by 56
; pairs with a right shift by 8 (a divide by 256), but 770 is not a multiple
; of 256, so the two divisions cannot be expressed as one value rotated.
; The recorded i686 output makes four __udivdi3 calls; the recorded x86-64
; output uses mulq with a separate magic constant for each divisor.
define <2 x i64> @no_extract_udiv(<2 x i64> %i) nounwind {
; X86-LABEL: no_extract_udiv:
; X86: # %bb.0:
; X86-NEXT: subl $60, %esp
; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $3, {{[0-9]+}}(%esp)
; X86-NEXT: calll __udivdi3
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: vextractps $2, %xmm0, (%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $3, {{[0-9]+}}(%esp)
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0
; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: calll __udivdi3
; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
; X86-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $770, {{[0-9]+}}(%esp) # imm = 0x302
; X86-NEXT: calll __udivdi3
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: vextractps $2, %xmm0, (%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $770, {{[0-9]+}}(%esp) # imm = 0x302
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0
; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: calll __udivdi3
; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
; X86-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT: vpsllq $56, %xmm1, %xmm1
; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
; X86-NEXT: addl $60, %esp
; X86-NEXT: retl
;
; X64-LABEL: no_extract_udiv:
; X64: # %bb.0:
; X64-NEXT: vpextrq $1, %xmm0, %rcx
; X64-NEXT: movabsq $-6148914691236517205, %rdi # imm = 0xAAAAAAAAAAAAAAAB
; X64-NEXT: movq %rcx, %rax
; X64-NEXT: mulq %rdi
; X64-NEXT: shrq %rdx
; X64-NEXT: vmovq %rdx, %xmm1
; X64-NEXT: vmovq %xmm0, %rsi
; X64-NEXT: movq %rsi, %rax
; X64-NEXT: mulq %rdi
; X64-NEXT: shrq %rdx
; X64-NEXT: vmovq %rdx, %xmm0
; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: movabsq $-6180857105216966645, %rdi # imm = 0xAA392F35DC17F00B
; X64-NEXT: movq %rcx, %rax
; X64-NEXT: mulq %rdi
; X64-NEXT: shrq $9, %rdx
; X64-NEXT: vmovq %rdx, %xmm1
; X64-NEXT: movq %rsi, %rax
; X64-NEXT: mulq %rdi
; X64-NEXT: shrq $9, %rdx
; X64-NEXT: vmovq %rdx, %xmm2
; X64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; X64-NEXT: vpsllq $56, %xmm0, %xmm0
; X64-NEXT: vpor %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%lhs_div = udiv <2 x i64> %i, <i64 3, i64 3>
%rhs_div = udiv <2 x i64> %i, <i64 770, i64 770>
%lhs_shift = shl <2 x i64> %lhs_div, <i64 56, i64 56>
%out = or <2 x i64> %lhs_shift, %rhs_div
ret <2 x i64> %out
}

View File

@ -0,0 +1,285 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=CHECK,X64
; Check that under certain conditions we can factor out a rotate
; from the following idioms:
; (a*c0) >> s1 | (a*c1)
; (a/c0) << s1 | (a/c1)
; This targets cases where instcombine has folded a shl/srl/mul/udiv
; with one of the shifts from the rotate idiom
; i64: ((i << 3) >> 57) | (i << 10), which is rotl(i << 3, 7) because
; (i << 3) << 7 == i << 10. Neither recorded output uses a rotate: the i686
; run splits the value across two 32-bit registers, and the x86-64 run uses
; lea/shl/shr/or.
define i64 @rolq_extract_shl(i64 %i) nounwind {
; X86-LABEL: rolq_extract_shl:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: leal (,%edx,8), %eax
; X86-NEXT: shldl $10, %ecx, %edx
; X86-NEXT: shll $10, %ecx
; X86-NEXT: shrl $25, %eax
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: rolq_extract_shl:
; X64: # %bb.0:
; X64-NEXT: leaq (,%rdi,8), %rax
; X64-NEXT: shlq $10, %rdi
; X64-NEXT: shrq $57, %rax
; X64-NEXT: orq %rdi, %rax
; X64-NEXT: retq
%lhs_mul = shl i64 %i, 3
%rhs_mul = shl i64 %i, 10
%lhs_shift = lshr i64 %lhs_mul, 57
%out = or i64 %lhs_shift, %rhs_mul
ret i64 %out
}
; i16: (i >> 7) | ((i >> 3) << 12), which is rotr(i >> 3, 4) because
; (i >> 3) >> 4 == i >> 7. Both recorded outputs are a rolw by 9 followed by
; an and with 0xF1FF, which clears bits 9..11.
define i16 @rolw_extract_shrl(i16 %i) nounwind {
; X86-LABEL: rolw_extract_shrl:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: rolw $9, %ax
; X86-NEXT: andl $61951, %eax # imm = 0xF1FF
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
; X64-LABEL: rolw_extract_shrl:
; X64: # %bb.0:
; X64-NEXT: rolw $9, %di
; X64-NEXT: andl $61951, %edi # imm = 0xF1FF
; X64-NEXT: movl %edi, %eax
; X64-NEXT: retq
%lhs_div = lshr i16 %i, 7
%rhs_div = lshr i16 %i, 3
%rhs_shift = shl i16 %rhs_div, 12
%out = or i16 %lhs_div, %rhs_shift
ret i16 %out
}
; i32: ((i * 9) >> 25) | (i * 1152). Since 1152 == 9 * 128, the right-hand
; multiply is (i * 9) << 7, so the expression is rotl(i * 9, 7).
; The recorded outputs compute both products with lea/shl and or them
; together; no rotate instruction is used.
define i32 @roll_extract_mul(i32 %i) nounwind {
; X86-LABEL: roll_extract_mul:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: leal (%ecx,%ecx,8), %eax
; X86-NEXT: shll $7, %ecx
; X86-NEXT: leal (%ecx,%ecx,8), %ecx
; X86-NEXT: shrl $25, %eax
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: roll_extract_mul:
; X64: # %bb.0:
; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: leal (%rdi,%rdi,8), %eax
; X64-NEXT: shll $7, %edi
; X64-NEXT: leal (%rdi,%rdi,8), %ecx
; X64-NEXT: shrl $25, %eax
; X64-NEXT: orl %ecx, %eax
; X64-NEXT: retq
%lhs_mul = mul i32 %i, 9
%rhs_mul = mul i32 %i, 1152
%lhs_shift = lshr i32 %lhs_mul, 25
%out = or i32 %lhs_shift, %rhs_mul
ret i32 %out
}
; i8: ((i / 3) << 4) | (i / 48). Since 48 == 3 * 16, i / 48 is (i / 3) >> 4,
; so the expression is a rotate of (i / 3) by 4 (half the i8 width).
; The recorded outputs do the division with an imull by 171 plus shifts and
; do not use a rotate instruction.
define i8 @rolb_extract_udiv(i8 %i) nounwind {
; X86-LABEL: rolb_extract_udiv:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: imull $171, %eax, %eax
; X86-NEXT: movb %ah, %cl
; X86-NEXT: shlb $3, %cl
; X86-NEXT: andb $-16, %cl
; X86-NEXT: shrl $13, %eax
; X86-NEXT: orb %cl, %al
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: retl
;
; X64-LABEL: rolb_extract_udiv:
; X64: # %bb.0:
; X64-NEXT: movzbl %dil, %eax
; X64-NEXT: imull $171, %eax, %eax
; X64-NEXT: movl %eax, %ecx
; X64-NEXT: shrl $8, %ecx
; X64-NEXT: shlb $3, %cl
; X64-NEXT: andb $-16, %cl
; X64-NEXT: shrl $13, %eax
; X64-NEXT: orb %cl, %al
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
%lhs_div = udiv i8 %i, 3
%rhs_div = udiv i8 %i, 48
%lhs_shift = shl i8 %lhs_div, 4
%out = or i8 %lhs_shift, %rhs_div
ret i8 %out
}
; i64: ((i * 1152) & 160) | ((i * 9) >> 57). This is the rotl(i * 9, 7) idiom
; (1152 == 9 * 128) with an extra and-mask applied to the left-shifted half.
; Neither recorded output uses a rotate; the mask shows up as a movzbl of the
; low byte in both runs.
define i64 @rolq_extract_mul_with_mask(i64 %i) nounwind {
; X86-LABEL: rolq_extract_mul_with_mask:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: shll $7, %ecx
; X86-NEXT: leal (%ecx,%ecx,8), %ecx
; X86-NEXT: movl $9, %edx
; X86-NEXT: mull %edx
; X86-NEXT: leal (%esi,%esi,8), %eax
; X86-NEXT: addl %edx, %eax
; X86-NEXT: movzbl %cl, %ecx
; X86-NEXT: shrl $25, %eax
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
; X64-LABEL: rolq_extract_mul_with_mask:
; X64: # %bb.0:
; X64-NEXT: leaq (%rdi,%rdi,8), %rax
; X64-NEXT: # kill: def $edi killed $edi killed $rdi def $rdi
; X64-NEXT: shll $7, %edi
; X64-NEXT: leal (%rdi,%rdi,8), %ecx
; X64-NEXT: movzbl %cl, %ecx
; X64-NEXT: shrq $57, %rax
; X64-NEXT: orq %rcx, %rax
; X64-NEXT: retq
%lhs_mul = mul i64 %i, 1152
%rhs_mul = mul i64 %i, 9
%lhs_and = and i64 %lhs_mul, 160
%rhs_shift = lshr i64 %rhs_mul, 57
%out = or i64 %lhs_and, %rhs_shift
ret i64 %out
}
; Result would undershift
; Negative test, i64: ((i << 5) >> 57) | (i << 10). A rotate-left by 7 of
; (i << 5) would need i << 12 on the other side, but the IR has i << 10,
; so the two halves do not form a rotate of a common value.
define i64 @no_extract_shl(i64 %i) nounwind {
; X86-LABEL: no_extract_shl:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl %edx, %eax
; X86-NEXT: shll $5, %eax
; X86-NEXT: shldl $10, %ecx, %edx
; X86-NEXT: shll $10, %ecx
; X86-NEXT: shrl $25, %eax
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: no_extract_shl:
; X64: # %bb.0:
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: shlq $5, %rax
; X64-NEXT: shlq $10, %rdi
; X64-NEXT: shrq $57, %rax
; X64-NEXT: leaq (%rax,%rdi), %rax
; X64-NEXT: retq
%lhs_mul = shl i64 %i, 5
%rhs_mul = shl i64 %i, 10
%lhs_shift = lshr i64 %lhs_mul, 57
%out = or i64 %lhs_shift, %rhs_mul
ret i64 %out
}
; Result would overshift
; Negative test, i32: ((i >> 3) << 28) | (i >> 9). A rotate-right by 4 of
; (i >> 3) would need i >> 7 on the other side, but the IR has i >> 9,
; so no rotate can be factored out.
define i32 @no_extract_shrl(i32 %i) nounwind {
; X86-LABEL: no_extract_shrl:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $-8, %ecx
; X86-NEXT: shll $25, %ecx
; X86-NEXT: shrl $9, %eax
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: no_extract_shrl:
; X64: # %bb.0:
; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andl $-8, %eax
; X64-NEXT: shll $25, %eax
; X64-NEXT: shrl $9, %edi
; X64-NEXT: leal (%rdi,%rax), %eax
; X64-NEXT: retq
%lhs_div = lshr i32 %i, 3
%rhs_div = lshr i32 %i, 9
%lhs_shift = shl i32 %lhs_div, 28
%out = or i32 %lhs_shift, %rhs_div
ret i32 %out
}
; Can factor 128 from 2304, but result is 18 instead of 9
; Negative test, i16: (i * 2304) | ((i * 9) >> 9). The lshr by 9 pairs with a
; left shift by 7 (a factor of 128) at this width, but 2304 / 128 == 18,
; which does not match the multiplier 9 used on the shifted side.
define i16 @no_extract_mul(i16 %i) nounwind {
; X86-LABEL: no_extract_mul:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,8), %ecx
; X86-NEXT: shll $8, %eax
; X86-NEXT: leal (%eax,%eax,8), %edx
; X86-NEXT: movzwl %cx, %eax
; X86-NEXT: shrl $9, %eax
; X86-NEXT: orl %edx, %eax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
; X64-LABEL: no_extract_mul:
; X64: # %bb.0:
; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: leal (%rdi,%rdi,8), %eax
; X64-NEXT: # kill: def $edi killed $edi killed $rdi def $rdi
; X64-NEXT: shll $8, %edi
; X64-NEXT: leal (%rdi,%rdi,8), %ecx
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: shrl $9, %eax
; X64-NEXT: orl %ecx, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
%lhs_mul = mul i16 %i, 2304
%rhs_mul = mul i16 %i, 9
%rhs_shift = lshr i16 %rhs_mul, 9
%out = or i16 %lhs_mul, %rhs_shift
ret i16 %out
}
; Can't evenly factor 16 from 49
; Negative test, i8: ((i / 3) << 4) | (i / 49). The shl by 4 pairs with a
; right shift by 4 (a divide by 16), but 49 is not a multiple of 16, so the
; two divisions cannot be expressed as one value rotated.
; The recorded outputs use a separate imull magic constant for each divisor.
define i8 @no_extract_udiv(i8 %i) nounwind {
; X86-LABEL: no_extract_udiv:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: imull $171, %eax, %ecx
; X86-NEXT: shlb $3, %ch
; X86-NEXT: andb $-16, %ch
; X86-NEXT: imull $79, %eax, %edx
; X86-NEXT: subb %dh, %al
; X86-NEXT: shrb %al
; X86-NEXT: addb %dh, %al
; X86-NEXT: shrb $5, %al
; X86-NEXT: orb %ch, %al
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: retl
;
; X64-LABEL: no_extract_udiv:
; X64: # %bb.0:
; X64-NEXT: movzbl %dil, %eax
; X64-NEXT: imull $171, %eax, %ecx
; X64-NEXT: shrl $8, %ecx
; X64-NEXT: shlb $3, %cl
; X64-NEXT: andb $-16, %cl
; X64-NEXT: imull $79, %eax, %edx
; X64-NEXT: shrl $8, %edx
; X64-NEXT: subb %dl, %al
; X64-NEXT: shrb %al
; X64-NEXT: addb %dl, %al
; X64-NEXT: shrb $5, %al
; X64-NEXT: orb %cl, %al
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
%lhs_div = udiv i8 %i, 3
%rhs_div = udiv i8 %i, 49
%lhs_shift = shl i8 %lhs_div, 4
%out = or i8 %lhs_shift, %rhs_div
ret i8 %out
}