From 21cdcd7b2b6980f15a7bfe0ee191ddf6498251b7 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 26 Jan 2019 06:27:04 +0000 Subject: [PATCH] [X86] Autoupgrade some of the intrinsics used by stack folding tests that have been previously removed. llvm-svn: 352271 --- .../CodeGen/X86/stack-folding-fp-avx512vl.ll | 45 ++++++------- .../CodeGen/X86/stack-folding-fp-sse42.ll | 20 +++--- .../CodeGen/X86/stack-folding-int-avx1.ll | 1 - .../CodeGen/X86/stack-folding-int-avx512.ll | 66 ++++++++++--------- .../CodeGen/X86/stack-folding-int-avx512vl.ll | 57 ++++++++-------- 5 files changed, 96 insertions(+), 93 deletions(-) diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll index fd8bed3c82e7..0cb72d16245b 100644 --- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll +++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll @@ -639,73 +639,65 @@ define <4 x float> @stack_fold_vpermt2ps(<4 x float> %x0, <4 x i32> %x1, <4 x fl ;CHECK-LABEL: stack_fold_vpermt2ps ;CHECK: vpermt2ps {{-?[0-9]*}}(%rsp), %xmm1, %xmm0 # 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %res = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 -1) - ret <4 x float> %res + %2 = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2) + ret <4 x float> %2 } -declare <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float>, <4 x i32>, <4 x float>, i8) define <4 x float> @stack_fold_vpermi2ps(<4 x i32> %x0, <4 x float> %x1, <4 x float> %x2) { ;CHECK-LABEL: stack_fold_vpermi2ps ;CHECK: vpermi2ps {{-?[0-9]*}}(%rsp), %xmm1, %xmm0 # 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %res = call <4 x float> @llvm.x86.avx512.mask.vpermt2var.ps.128(<4 x i32> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) - ret <4 x float> %res + %2 = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %x1, <4 x i32> %x0, <4 x float> %x2) + ret <4 x float> %2 } -declare <4 x float> @llvm.x86.avx512.mask.vpermt2var.ps.128(<4 x i32>, <4 x float>, <4 x float>, i8) define <2 x double> @stack_fold_vpermt2pd(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2) { ;CHECK-LABEL: stack_fold_vpermt2pd ;CHECK: vpermt2pd {{-?[0-9]*}}(%rsp), %xmm1, %xmm0 # 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %res = call <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 -1) - ret <2 x double> %res + %2 = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2) + ret <2 x double> %2 } -declare <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double>, <2 x i64>, <2 x double>, i8) define <2 x double> @stack_fold_vpermi2pd(<2 x i64> %x0, <2 x double> %x1, <2 x double> %x2) { ;CHECK-LABEL: stack_fold_vpermi2pd ;CHECK: vpermi2pd {{-?[0-9]*}}(%rsp), %xmm1, %xmm0 # 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %res = call <2 x double> @llvm.x86.avx512.mask.vpermt2var.pd.128(<2 x i64> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) - ret <2 x double> %res + %2 = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %x1, <2 x i64> %x0, <2 x double> %x2) + ret <2 x double> %2 } -declare <2 x double> @llvm.x86.avx512.mask.vpermt2var.pd.128(<2 x i64>, <2 x double>, <2 x double>, i8) define <8 x float> @stack_fold_vpermt2ps_ymm(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2) { ;CHECK-LABEL: stack_fold_vpermt2ps_ymm ;CHECK: vpermt2ps {{-?[0-9]*}}(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %res = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1) - ret <8 x float> %res + %2 = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2) + ret <8 x float> %2 } -declare <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float>, <8 x i32>, <8 x float>, i8) define <8 x float> @stack_fold_vpermi2ps_ymm(<8 x i32> %x0, <8 x float> %x1, <8 x float> %x2) { ;CHECK-LABEL: stack_fold_vpermi2ps_ymm ;CHECK: vpermi2ps {{-?[0-9]*}}(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %res = call <8 x float> @llvm.x86.avx512.mask.vpermt2var.ps.256(<8 x i32> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) - ret <8 x float> %res + %2 = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %x1, <8 x i32> %x0, <8 x float> %x2) + ret <8 x float> %2 } -declare <8 x float> @llvm.x86.avx512.mask.vpermt2var.ps.256(<8 x i32>, <8 x float>, <8 x float>, i8) define <4 x double> @stack_fold_vpermt2pd_ymm(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) { ;CHECK-LABEL: stack_fold_vpermt2pd_ymm ;CHECK: vpermt2pd {{-?[0-9]*}}(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %res = call <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1) - ret <4 x double> %res + %2 = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) + ret <4 x double> %2 } -declare <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double>, <4 x i64>, <4 x double>, i8) define <4 x double> @stack_fold_vpermi2pd_ymm(<4 x i64> %x0, <4 x double> %x1, <4 x double> %x2) { ;CHECK-LABEL: stack_fold_vpermi2pd_ymm ;CHECK: vpermi2pd {{-?[0-9]*}}(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %res = call <4 x double> @llvm.x86.avx512.mask.vpermt2var.pd.256(<4 x i64> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) - ret <4 x double> %res + %2 = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %x1, <4 x i64> %x0, <4 x double> %x2) + ret <4 x double> %2 } -declare <4 x double> @llvm.x86.avx512.mask.vpermt2var.pd.256(<4 x i64>, <4 x double>, <4 x double>, i8) define <4 x double> @stack_fold_permpd(<4 x double> %a0) { ;CHECK-LABEL: stack_fold_permpd @@ -815,5 +807,10 @@ define <8 x float> @stack_fold_permilpsvar_ymm_maskz(<8 x float> %a0, <8 x i32> ret <8 x float> %4 } +declare <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float>, <4 x i32>, <4 x float>) +declare <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double>, <2 x i64>, <2 x double>) +declare <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float>, <8 x i32>, <8 x float>) +declare <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double>, <4 x i64>, <4 x double>) + attributes #0 = { "unsafe-fp-math"="false" } attributes #1 = { "unsafe-fp-math"="true" } diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-sse42.ll b/llvm/test/CodeGen/X86/stack-folding-fp-sse42.ll index 3604aaa35fc7..4599c4d931a2 100644 --- a/llvm/test/CodeGen/X86/stack-folding-fp-sse42.ll +++ b/llvm/test/CodeGen/X86/stack-folding-fp-sse42.ll @@ -1091,19 +1091,17 @@ define <2 x double> @stack_fold_sqrtpd(<2 x double> %a0) { ;CHECK-LABEL: stack_fold_sqrtpd ;CHECK: sqrtpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0) + %2 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a0) ret <2 x double> %2 } -declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone define <4 x float> @stack_fold_sqrtps(<4 x float> %a0) { ;CHECK-LABEL: stack_fold_sqrtps ;CHECK: sqrtps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0) + %2 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a0) ret <4 x float> %2 } -declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone define double @stack_fold_sqrtsd(double %a0) optsize { ;CHECK-LABEL: stack_fold_sqrtsd @@ -1118,12 +1116,13 @@ define <2 x double> @stack_fold_sqrtsd_int(<2 x double> %a0, <2 x double> %a1) o ;CHECK-LABEL: stack_fold_sqrtsd_int ;CHECK: sqrtsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a1) - %3 = extractelement <2 x double> %2, i32 0 - %4 = insertelement <2 x double> %a0, double %3, i32 0 - ret <2 x double> %4 + %2 = extractelement <2 x double> %a1, i64 0 + %3 = call double @llvm.sqrt.f64(double %2) + %4 = insertelement <2 x double> %a1, double %3, i64 0 + %5 = extractelement <2 x double> %4, i32 0 + %6 = insertelement <2 x double> %a0, double %5, i32 0 + ret <2 x double> %6 } -declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone define float @stack_fold_sqrtss(float %a0) minsize { ;CHECK-LABEL: stack_fold_sqrtss @@ -1302,5 +1301,8 @@ define <4 x float> @stack_fold_xorps(<4 x float> %a0, <4 x float> %a1) { ret <4 x float> %6 } +declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) + attributes #0 = { "unsafe-fp-math"="false" } attributes #1 = { "unsafe-fp-math"="true" } diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avx1.ll b/llvm/test/CodeGen/X86/stack-folding-int-avx1.ll index c694fc169afa..04d62d4bc25e 100644 --- a/llvm/test/CodeGen/X86/stack-folding-int-avx1.ll +++ b/llvm/test/CodeGen/X86/stack-folding-int-avx1.ll @@ -678,7 +678,6 @@ define <8 x i16> @stack_fold_pminuw(<8 x i16> %a0, <8 x i16> %a1) { %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1 ret <8 x i16> %3 } -declare <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16>, <8 x i16>) nounwind readnone define <2 x i64> @stack_fold_pmuldq(<4 x i32> %a0, <4 x i32> %a1) { ;CHECK-LABEL: stack_fold_pmuldq diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll b/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll index 3efc63b38baf..a899475c9d80 100644 --- a/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll +++ b/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll @@ -20,7 +20,7 @@ define <16 x i32> @stack_fold_valignd_mask(<16 x i32> %a, <16 x i32> %b, <16 x i ;CHECK-LABEL: stack_fold_valignd_mask ;CHECK: valignd $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> + %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> %3 = bitcast i16 %mask to <16 x i1> %4 = load <16 x i32>, <16 x i32>* %passthru %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4 @@ -31,7 +31,7 @@ define <16 x i32> @stack_fold_valignd_maskz(<16 x i32> %a, <16 x i32> %b, i16 %m ;CHECK-LABEL: stack_fold_valignd_maskz ;CHECK: valignd $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> + %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> %3 = bitcast i16 %mask to <16 x i1> %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer ret <16 x i32> %4 @@ -74,7 +74,7 @@ define <64 x i8> @stack_fold_pavgb(<64 x i8> %a0, <64 x i8> %a1) { %3 = zext <64 x i8> %a1 to <64 x i16> %4 = add <64 x i16> %2, %3 %5 = add <64 x i16> %4, - %6 = lshr <64 x i16> %5, + %6 = lshr <64 x i16> %5, %7 = trunc <64 x i16> %6 to <64 x i8> ret <64 x i8> %7 } @@ -87,7 +87,7 @@ define <64 x i8> @stack_fold_pavgb_mask(<64 x i8>* %passthru, <64 x i8> %a0, <64 %3 = zext <64 x i8> %a0 to <64 x i16> %4 = zext <64 x i8> %a1 to <64 x i16> %5 = add <64 x i16> %3, %4 - %6 = add <64 x i16> %5, + %6 = add <64 x i16> %5, %7 = lshr <64 x i16> %6, %8 = trunc <64 x i16> %7 to <64 x i8> %9 = bitcast i64 %mask to <64 x i1> @@ -102,7 +102,7 @@ define <64 x i8> @stack_fold_pavgb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask %2 = zext <64 x i8> %a0 to <64 x i16> %3 = zext <64 x i8> %a1 to <64 x i16> %4 = add <64 x i16> %2, %3 - %5 = add <64 x i16> %4, + %5 = add <64 x i16> %4, %6 = lshr <64 x i16> %5, %7 = trunc <64 x i16> %6 to <64 x i8> %8 = bitcast i64 %mask to <64 x i1> @@ -118,7 +118,7 @@ define <32 x i16> @stack_fold_pavgw(<32 x i16> %a0, <32 x i16> %a1) { %3 = zext <32 x i16> %a1 to <32 x i32> %4 = add <32 x i32> %2, %3 %5 = add <32 x i32> %4, - %6 = lshr <32 x i32> %5, + %6 = lshr <32 x i32> %5, %7 = trunc <32 x i32> %6 to <32 x i16> ret <32 x i16> %7 } @@ -131,8 +131,8 @@ define <32 x i16> @stack_fold_pavgw_mask(<32 x i16>* %passthru, <32 x i16> %a0, %3 = zext <32 x i16> %a0 to <32 x i32> %4 = zext <32 x i16> %a1 to <32 x i32> %5 = add <32 x i32> %3, %4 - %6 = add <32 x i32> %5, - %7 = lshr <32 x i32> %6, + %6 = add <32 x i32> %5, + %7 = lshr <32 x i32> %6, %8 = trunc <32 x i32> %7 to <32 x i16> %9 = bitcast i32 %mask to <32 x i1> %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> %2 @@ -146,8 +146,8 @@ define <32 x i16> @stack_fold_pavgw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %m %2 = zext <32 x i16> %a0 to <32 x i32> %3 = zext <32 x i16> %a1 to <32 x i32> %4 = add <32 x i32> %2, %3 - %5 = add <32 x i32> %4, - %6 = lshr <32 x i32> %5, + %5 = add <32 x i32> %4, + %6 = lshr <32 x i32> %5, %7 = trunc <32 x i32> %6 to <32 x i16> %8 = bitcast i32 %mask to <32 x i1> %9 = select <32 x i1> %8, <32 x i16> %7, <32 x i16> zeroinitializer @@ -223,7 +223,6 @@ define <64 x i8> @stack_fold_pabsb(<64 x i8> %a0) { %4 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %3 ret <64 x i8> %4 } -declare <64 x i8> @llvm.x86.avx512.mask.pabs.b.512(<64 x i8>, <64 x i8>, i64) nounwind readnone define <64 x i8> @stack_fold_pabsb_mask(<64 x i8> %passthru, <64 x i8> %a0, i64 %mask) { ;CHECK-LABEL: stack_fold_pabsb_mask @@ -458,25 +457,23 @@ define <64 x i8> @stack_fold_paddsb(<64 x i8> %a0, <64 x i8> %a1) { ;CHECK-LABEL: stack_fold_paddsb ;CHECK: vpaddsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <64 x i8> @llvm.x86.avx512.mask.padds.b.512(<64 x i8> %a0, <64 x i8> %a1, <64 x i8> undef, i64 -1) + %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1) ret <64 x i8> %2 } -declare <64 x i8> @llvm.x86.avx512.mask.padds.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) nounwind readnone define <32 x i16> @stack_fold_paddsw(<32 x i16> %a0, <32 x i16> %a1) { ;CHECK-LABEL: stack_fold_paddsw ;CHECK: vpaddsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a0, <32 x i16> %a1, <32 x i16> undef, i32 -1) + %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1) ret <32 x i16> %2 } -declare <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) define <64 x i8> @stack_fold_paddusb(<64 x i8> %a0, <64 x i8> %a1) { ;CHECK-LABEL: stack_fold_paddusb ;CHECK: vpaddusb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <64 x i8> @llvm.x86.avx512.mask.paddus.b.512(<64 x i8> %a0, <64 x i8> %a1, <64 x i8> undef, i64 -1) + %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1) ret <64 x i8> %2 } declare <64 x i8> @llvm.x86.avx512.mask.paddus.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) nounwind readnone @@ -485,7 +482,7 @@ define <32 x i16> @stack_fold_paddusw(<32 x i16> %a0, <32 x i16> %a1) { ;CHECK-LABEL: stack_fold_paddusw ;CHECK: vpaddusw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a0, <32 x i16> %a1, <32 x i16> undef, i32 -1) + %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1) ret <32 x i16> %2 } declare <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) @@ -531,7 +528,7 @@ define <16 x i32> @stack_fold_vpconflictd(<16 x i32> %a0) { ;CHECK-LABEL: stack_fold_vpconflictd ;CHECK: vpconflictd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32> %a0, <16 x i32> undef, i16 -1) + %2 = call <16 x i32> @llvm.x86.avx512.conflict.d.512(<16 x i32> %a0) ret <16 x i32> %2 } declare <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly @@ -540,7 +537,7 @@ define <8 x i64> @stack_fold_vpconflictq(<8 x i64> %a0) { ;CHECK-LABEL: stack_fold_vpconflictq ;CHECK: vpconflictq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64> %a0, <8 x i64> undef, i8 -1) + %2 = call <8 x i64> @llvm.x86.avx512.conflict.q.512(<8 x i64> %a0) ret <8 x i64> %2 } declare <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64>, <8 x i64>, i8) nounwind readnone @@ -892,19 +889,17 @@ define <16 x i32> @stack_fold_vplzcntd(<16 x i32> %a0) { ;CHECK-LABEL: stack_fold_vplzcntd ;CHECK: vplzcntd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a0) + %2 = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a0, i1 false) ret <16 x i32> %2 } -declare <16 x i32> @llvm.ctlz.v16i32(<16 x i32>) nounwind readonly define <8 x i64> @stack_fold_vplzcntq(<8 x i64> %a0) { ;CHECK-LABEL: stack_fold_vplzcntq ;CHECK: vplzcntq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a0) + %2 = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a0, i1 false) ret <8 x i64> %2 } -declare <8 x i64> @llvm.ctlz.v8i64(<8 x i64>) nounwind readnone define <32 x i16> @stack_fold_pmaddubsw_zmm(<64 x i8> %a0, <64 x i8> %a1) { ;CHECK-LABEL: stack_fold_pmaddubsw_zmm @@ -1743,37 +1738,33 @@ define <64 x i8> @stack_fold_psubsb(<64 x i8> %a0, <64 x i8> %a1) { ;CHECK-LABEL: stack_fold_psubsb ;CHECK: vpsubsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <64 x i8> @llvm.x86.avx512.mask.psubs.b.512(<64 x i8> %a0, <64 x i8> %a1, <64 x i8> undef, i64 -1) + %2 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1) ret <64 x i8> %2 } -declare <64 x i8> @llvm.x86.avx512.mask.psubs.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) nounwind readnone define <32 x i16> @stack_fold_psubsw(<32 x i16> %a0, <32 x i16> %a1) { ;CHECK-LABEL: stack_fold_psubsw ;CHECK: vpsubsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a0, <32 x i16> %a1, <32 x i16> undef, i32 -1) + %2 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1) ret <32 x i16> %2 } -declare <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) define <64 x i8> @stack_fold_psubusb(<64 x i8> %a0, <64 x i8> %a1) { ;CHECK-LABEL: stack_fold_psubusb ;CHECK: vpsubusb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <64 x i8> @llvm.x86.avx512.mask.psubus.b.512(<64 x i8> %a0, <64 x i8> %a1, <64 x i8> undef, i64 -1) + %2 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1) ret <64 x i8> %2 } -declare <64 x i8> @llvm.x86.avx512.mask.psubus.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) nounwind readnone define <32 x i16> @stack_fold_psubusw(<32 x i16> %a0, <32 x i16> %a1) { ;CHECK-LABEL: stack_fold_psubusw ;CHECK: vpsubusw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a0, <32 x i16> %a1, <32 x i16> undef, i32 -1) + %2 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1) ret <32 x i16> %2 } -declare <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) define <32 x i16> @stack_fold_psubw(<32 x i16> %a0, <32 x i16> %a1) { ;CHECK-LABEL: stack_fold_psubw @@ -1883,3 +1874,16 @@ define <64 x i8> @stack_fold_punpckhbw_maskz_zmm(<64 x i8> %a0, <64 x i8> %a1, i %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer ret <64 x i8> %4 } + +declare <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8>, <64 x i8>) +declare <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16>, <32 x i16>) +declare <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8>, <64 x i8>) +declare <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16>, <32 x i16>) +declare <16 x i32> @llvm.x86.avx512.conflict.d.512(<16 x i32>) +declare <8 x i64> @llvm.x86.avx512.conflict.q.512(<8 x i64>) +declare <16 x i32> @llvm.ctlz.v16i32(<16 x i32>, i1) +declare <8 x i64> @llvm.ctlz.v8i64(<8 x i64>, i1) +declare <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8>, <64 x i8>) +declare <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16>, <32 x i16>) +declare <64 x i8> @llvm.usub.sat.v64i8(<64 x i8>, <64 x i8>) +declare <32 x i16> @llvm.usub.sat.v32i16(<32 x i16>, <32 x i16>) diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avx512vl.ll b/llvm/test/CodeGen/X86/stack-folding-int-avx512vl.ll index a3d7da21c5fd..67a02d5b9b6e 100644 --- a/llvm/test/CodeGen/X86/stack-folding-int-avx512vl.ll +++ b/llvm/test/CodeGen/X86/stack-folding-int-avx512vl.ll @@ -12,7 +12,7 @@ define <8 x i32> @stack_fold_valignd_ymm(<8 x i32> %a, <8 x i32> %b) { ;CHECK-LABEL: stack_fold_valignd_ymm ;CHECK: valignd $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> + %2 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %2 } @@ -20,7 +20,7 @@ define <8 x i32> @stack_fold_valignd_ymm_mask(<8 x i32> %a, <8 x i32> %b, <8 x i ;CHECK-LABEL: stack_fold_valignd_ymm_mask ;CHECK: valignd $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> + %2 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> %3 = bitcast i8 %mask to <8 x i1> %4 = load <8 x i32>, <8 x i32>* %passthru %5 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> %4 @@ -31,7 +31,7 @@ define <8 x i32> @stack_fold_valignd_ymm_maskz(<8 x i32> %a, <8 x i32> %b, i8 %m ;CHECK-LABEL: stack_fold_valignd_ymm_maskz ;CHECK: valignd $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> + %2 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> %3 = bitcast i8 %mask to <8 x i1> %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer ret <8 x i32> %4 @@ -65,8 +65,8 @@ define <32 x i8> @stack_fold_pavgb_ymm(<32 x i8> %a0, <32 x i8> %a1) { %2 = zext <32 x i8> %a0 to <32 x i16> %3 = zext <32 x i8> %a1 to <32 x i16> %4 = add <32 x i16> %2, %3 - %5 = add <32 x i16> %4, - %6 = lshr <32 x i16> %5, + %5 = add <32 x i16> %4, + %6 = lshr <32 x i16> %5, %7 = trunc <32 x i16> %6 to <32 x i8> ret <32 x i8> %7 } @@ -88,50 +88,46 @@ define <16 x i16> @stack_fold_pavgw_ymm(<16 x i16> %a0, <16 x i16> %a1) { ;CHECK-LABEL: stack_fold_pavgw_ymm ;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = zext <16 x i16> %a0 to <16 x i32> - %3 = zext <16 x i16> %a1 to <16 x i32> - %4 = add <16 x i32> %2, %3 - %5 = add <16 x i32> %4, - %6 = lshr <16 x i32> %5, - %7 = trunc <16 x i32> %6 to <16 x i16> - ret <16 x i16> %7 + %2 = zext <16 x i16> %a0 to <16 x i32> + %3 = zext <16 x i16> %a1 to <16 x i32> + %4 = add <16 x i32> %2, %3 + %5 = add <16 x i32> %4, + %6 = lshr <16 x i32> %5, + %7 = trunc <16 x i32> %6 to <16 x i16> + ret <16 x i16> %7 } define <4 x i32> @stack_fold_vpconflictd(<4 x i32> %a0) { ;CHECK-LABEL: stack_fold_vpconflictd ;CHECK: vpconflictd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx512.mask.conflict.d.128(<4 x i32> %a0, <4 x i32> undef, i8 -1) + %2 = call <4 x i32> @llvm.x86.avx512.conflict.d.128(<4 x i32> %a0) ret <4 x i32> %2 } -declare <4 x i32> @llvm.x86.avx512.mask.conflict.d.128(<4 x i32>, <4 x i32>, i8) nounwind readonly define <8 x i32> @stack_fold_vpconflictd_ymm(<8 x i32> %a0) { ;CHECK-LABEL: stack_fold_vpconflictd_ymm ;CHECK: vpconflictd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx512.mask.conflict.d.256(<8 x i32> %a0, <8 x i32> undef, i8 -1) + %2 = call <8 x i32> @llvm.x86.avx512.conflict.d.256(<8 x i32> %a0) ret <8 x i32> %2 } -declare <8 x i32> @llvm.x86.avx512.mask.conflict.d.256(<8 x i32>, <8 x i32>, i8) nounwind readonly define <2 x i64> @stack_fold_vpconflictq(<2 x i64> %a0) { ;CHECK-LABEL: stack_fold_vpconflictq ;CHECK: vpconflictq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <2 x i64> @llvm.x86.avx512.mask.conflict.q.128(<2 x i64> %a0, <2 x i64> undef, i8 -1) + %2 = call <2 x i64> @llvm.x86.avx512.conflict.q.128(<2 x i64> %a0) ret <2 x i64> %2 } -declare <2 x i64> @llvm.x86.avx512.mask.conflict.q.128(<2 x i64>, <2 x i64>, i8) nounwind readnone define <4 x i64> @stack_fold_vpconflictq_ymm(<4 x i64> %a0) { ;CHECK-LABEL: stack_fold_vpconflictq_ymm ;CHECK: vpconflictq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <4 x i64> @llvm.x86.avx512.mask.conflict.q.256(<4 x i64> %a0, <4 x i64> undef, i8 -1) + %2 = call <4 x i64> @llvm.x86.avx512.conflict.q.256(<4 x i64> %a0) ret <4 x i64> %2 } -declare <4 x i64> @llvm.x86.avx512.mask.conflict.q.256(<4 x i64>, <4 x i64>, i8) nounwind readnone define <4 x i32> @stack_fold_extracti32x4(<8 x i16> %a0, <8 x i32> %a1) { ;CHECK-LABEL: stack_fold_extracti32x4 @@ -766,37 +762,33 @@ define <4 x i32> @stack_fold_vplzcntd(<4 x i32> %a0) { ;CHECK-LABEL: stack_fold_vplzcntd ;CHECK: vplzcntd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a0) + %2 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a0, i1 false) ret <4 x i32> %2 } -declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>) nounwind readonly define <8 x i32> @stack_fold_vplzcntd_ymm(<8 x i32> %a0) { ;CHECK-LABEL: stack_fold_vplzcntd_ymm ;CHECK: vplzcntd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a0) + %2 = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a0, i1 false) ret <8 x i32> %2 } -declare <8 x i32> @llvm.ctlz.v8i32(<8 x i32>) nounwind readonly define <2 x i64> @stack_fold_vplzcntq(<2 x i64> %a0) { ;CHECK-LABEL: stack_fold_vplzcntq ;CHECK: vplzcntq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a0) + %2 = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a0, i1 false) ret <2 x i64> %2 } -declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>) nounwind readnone define <4 x i64> @stack_fold_vplzcntq_ymm(<4 x i64> %a0) { ;CHECK-LABEL: stack_fold_vplzcntq_ymm ;CHECK: vplzcntq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a0) + %2 = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a0, i1 false) ret <4 x i64> %2 } -declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>) nounwind readnone define <8 x i16> @stack_fold_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1) { ;CHECK-LABEL: stack_fold_pmaddubsw @@ -2519,3 +2511,12 @@ define <8 x i32> @stack_fold_shufi32x4_maskz(<8 x i32> %a, <8 x i32> %b, i8 %mas %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer ret <8 x i32> %4 } + +declare <4 x i32> @llvm.x86.avx512.conflict.d.128(<4 x i32>) +declare <8 x i32> @llvm.x86.avx512.conflict.d.256(<8 x i32>) +declare <2 x i64> @llvm.x86.avx512.conflict.q.128(<2 x i64>) +declare <4 x i64> @llvm.x86.avx512.conflict.q.256(<4 x i64>) +declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) +declare <8 x i32> @llvm.ctlz.v8i32(<8 x i32>, i1) +declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) +declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1)