Match types of accumulator and result for llvm.experimental.vector.reduce.fadd/fmul

The scalar start/accumulator value of the fadd and fmul reductions
must match the result type of the reduction, as well as the element
type of the input vector. Although this was not explicitly specified
in the LangRef, it was taken for granted by the code implementing the
reductions. The patch also updates the LangRef to state this
constraint explicitly.
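
As an illustration (types chosen arbitrarily for the example), the following
call is well-formed because the result, the scalar accumulator, and the
vector element type all agree:

  %sum = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %v)

whereas a mismatched instantiation such as
@llvm.experimental.vector.reduce.fadd.f32.f64.v2f64 (float result, double
accumulator) is now rejected by the verifier, as the new test below shows.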

Reviewed By: aemerson, nikic

Differential Revision: https://reviews.llvm.org/D60260


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@361133 91177308-0d34-0410-b5e6-96231b3b80d8
Sander de Smalen 2019-05-20 09:54:06 +00:00
parent 8d0ebfc10c
commit e75dee78f7
10 changed files with 190 additions and 156 deletions


@@ -13580,7 +13580,8 @@ Arguments:
 """"""""""
 The first argument to this intrinsic is a scalar accumulator value, which is
 only used when there are no fast-math flags attached. This argument may be undef
-when fast-math flags are used.
+when fast-math flags are used. The type of the accumulator matches the
+element-type of the vector input.
 The second argument must be a vector of floating-point values.
@@ -13643,7 +13644,8 @@ Arguments:
 """"""""""
 The first argument to this intrinsic is a scalar accumulator value, which is
 only used when there are no fast-math flags attached. This argument may be undef
-when fast-math flags are used.
+when fast-math flags are used. The type of the accumulator matches the
+element-type of the vector input.
 The second argument must be a vector of floating-point values.
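
The distinction matters in practice: in the ordered form the accumulator is
the start value of the reduction, while in the fast-math form it is ignored
and may be undef. A small sketch (value names are arbitrary):

  ; Ordered reduction: %acc contributes to the result.
  %ord = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %v)
  ; Unordered (fast-math) reduction: the accumulator is ignored and may be undef.
  %unord = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %v)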


@@ -1134,11 +1134,11 @@ def int_memset_element_unordered_atomic
 //===------------------------ Reduction Intrinsics ------------------------===//
 //
 def int_experimental_vector_reduce_fadd : Intrinsic<[llvm_anyfloat_ty],
-                                                    [llvm_anyfloat_ty,
+                                                    [LLVMMatchType<0>,
                                                      llvm_anyvector_ty],
                                                     [IntrNoMem]>;
 def int_experimental_vector_reduce_fmul : Intrinsic<[llvm_anyfloat_ty],
-                                                    [llvm_anyfloat_ty,
+                                                    [LLVMMatchType<0>,
                                                      llvm_anyvector_ty],
                                                     [IntrNoMem]>;
 def int_experimental_vector_reduce_add : Intrinsic<[llvm_anyint_ty],
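
Because LLVMMatchType<0> ties the accumulator to the first overloaded type
(the result), the accumulator no longer contributes its own suffix to the
mangled intrinsic name. That is why the tests below rename the intrinsics,
for example:

  declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float, <4 x float>) ; old mangling
  declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>)     ; new mangling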


@@ -321,8 +321,7 @@ static CallInst *getReductionIntrinsic(IRBuilderBase *Builder, Intrinsic::ID ID,
 CallInst *IRBuilderBase::CreateFAddReduce(Value *Acc, Value *Src) {
   Module *M = GetInsertBlock()->getParent()->getParent();
   Value *Ops[] = {Acc, Src};
-  Type *Tys[] = {Src->getType()->getVectorElementType(), Acc->getType(),
-                 Src->getType()};
+  Type *Tys[] = {Acc->getType(), Src->getType()};
   auto Decl = Intrinsic::getDeclaration(
       M, Intrinsic::experimental_vector_reduce_fadd, Tys);
   return createCallHelper(Decl, Ops, this);
@@ -331,8 +330,7 @@ CallInst *IRBuilderBase::CreateFAddReduce(Value *Acc, Value *Src) {
 CallInst *IRBuilderBase::CreateFMulReduce(Value *Acc, Value *Src) {
   Module *M = GetInsertBlock()->getParent()->getParent();
   Value *Ops[] = {Acc, Src};
-  Type *Tys[] = {Src->getType()->getVectorElementType(), Acc->getType(),
-                 Src->getType()};
+  Type *Tys[] = {Acc->getType(), Src->getType()};
   auto Decl = Intrinsic::getDeclaration(
       M, Intrinsic::experimental_vector_reduce_fmul, Tys);
   return createCallHelper(Decl, Ops, this);
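
With only two overloaded types left, a builder call such as
CreateFAddReduce(Acc, Src) for a float accumulator and a <4 x float> source
now emits IR along these lines (an illustrative sketch, not a line from this
patch):

  %rdx = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %src)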


@@ -0,0 +1,34 @@
+; RUN: not opt -S < %s 2>&1 | FileCheck %s
+
+; CHECK: Intrinsic has incorrect argument type!
+; CHECK-NEXT: float (double, <2 x double>)* @llvm.experimental.vector.reduce.fadd.f32.f64.v2f64
+define float @fadd_invalid_scalar_res(double %acc, <2 x double> %in) {
+  %res = call float @llvm.experimental.vector.reduce.fadd.f32.f64.v2f64(double %acc, <2 x double> %in)
+  ret float %res
+}
+
+; CHECK: Intrinsic has incorrect argument type!
+; CHECK-NEXT: double (float, <2 x double>)* @llvm.experimental.vector.reduce.fadd.f64.f32.v2f64
+define double @fadd_invalid_scalar_start(float %acc, <2 x double> %in) {
+  %res = call double @llvm.experimental.vector.reduce.fadd.f64.f32.v2f64(float %acc, <2 x double> %in)
+  ret double %res
+}
+
+; CHECK: Intrinsic has incorrect argument type!
+; CHECK-NEXT: <2 x double> (double, <2 x double>)* @llvm.experimental.vector.reduce.fadd.v2f64.f64.v2f64
+define <2 x double> @fadd_invalid_vector_res(double %acc, <2 x double> %in) {
+  %res = call <2 x double> @llvm.experimental.vector.reduce.fadd.v2f64.f64.v2f64(double %acc, <2 x double> %in)
+  ret <2 x double> %res
+}
+
+; CHECK: Intrinsic has incorrect argument type!
+; CHECK-NEXT: double (<2 x double>, <2 x double>)* @llvm.experimental.vector.reduce.fadd.f64.v2f64.v2f64
+define double @fadd_invalid_vector_start(<2 x double> %in, <2 x double> %acc) {
+  %res = call double @llvm.experimental.vector.reduce.fadd.f64.v2f64.v2f64(<2 x double> %acc, <2 x double> %in)
+  ret double %res
+}
+
+declare float @llvm.experimental.vector.reduce.fadd.f32.f64.v2f64(double %acc, <2 x double> %in)
+declare double @llvm.experimental.vector.reduce.fadd.f64.f32.v2f64(float %acc, <2 x double> %in)
+declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64.v2f64(<2 x double> %acc, <2 x double> %in)
+declare <2 x double> @llvm.experimental.vector.reduce.fadd.v2f64.f64.v2f64(double %acc, <2 x double> %in)
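
For contrast, a correctly typed declaration of the same reduction, which the
verifier accepts (shown here for comparison only; it is not part of the test
file above):

  declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double, <2 x double>)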


@@ -5,7 +5,7 @@ define float @add_HalfS(<2 x float> %bin.rdx) {
 ; CHECK-LABEL: add_HalfS:
 ; CHECK: faddp s0, v0.2s
 ; CHECK-NEXT: ret
-  %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v2f32(<2 x float> undef, <2 x float> %bin.rdx)
+  %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float undef, <2 x float> %bin.rdx)
   ret float %r
 }
@@ -23,7 +23,7 @@ define half @add_HalfH(<4 x half> %bin.rdx) {
 ; CHECKNOFP16-NOT: fadd h{{[0-9]+}}
 ; CHECKNOFP16-NOT: fadd v{{[0-9]+}}.{{[0-9]}}h
 ; CHECKNOFP16: ret
-  %r = call fast half @llvm.experimental.vector.reduce.fadd.f16.v4f16(<4 x half> undef, <4 x half> %bin.rdx)
+  %r = call fast half @llvm.experimental.vector.reduce.fadd.f16.v4f16(half undef, <4 x half> %bin.rdx)
   ret half %r
 }
@@ -45,7 +45,7 @@ define half @add_H(<8 x half> %bin.rdx) {
 ; CHECKNOFP16-NOT: fadd h{{[0-9]+}}
 ; CHECKNOFP16-NOT: fadd v{{[0-9]+}}.{{[0-9]}}h
 ; CHECKNOFP16: ret
-  %r = call fast half @llvm.experimental.vector.reduce.fadd.f16.v8f16(<8 x half> undef, <8 x half> %bin.rdx)
+  %r = call fast half @llvm.experimental.vector.reduce.fadd.f16.v8f16(half undef, <8 x half> %bin.rdx)
   ret half %r
 }
@@ -55,7 +55,7 @@ define float @add_S(<4 x float> %bin.rdx) {
 ; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT: faddp s0, v0.2s
 ; CHECK-NEXT: ret
-  %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(<4 x float> undef, <4 x float> %bin.rdx)
+  %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %bin.rdx)
   ret float %r
 }
@@ -63,7 +63,7 @@ define double @add_D(<2 x double> %bin.rdx) {
 ; CHECK-LABEL: add_D:
 ; CHECK: faddp d0, v0.2d
 ; CHECK-NEXT: ret
-  %r = call fast double @llvm.experimental.vector.reduce.fadd.f64.v2f64(<2 x double> undef, <2 x double> %bin.rdx)
+  %r = call fast double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double undef, <2 x double> %bin.rdx)
   ret double %r
 }
@@ -84,7 +84,7 @@ define half @add_2H(<16 x half> %bin.rdx) {
 ; CHECKNOFP16-NOT: fadd h{{[0-9]+}}
 ; CHECKNOFP16-NOT: fadd v{{[0-9]+}}.{{[0-9]}}h
 ; CHECKNOFP16: ret
-  %r = call fast half @llvm.experimental.vector.reduce.fadd.f16.v16f16(<16 x half> undef, <16 x half> %bin.rdx)
+  %r = call fast half @llvm.experimental.vector.reduce.fadd.f16.v16f16(half undef, <16 x half> %bin.rdx)
   ret half %r
 }
@@ -95,7 +95,7 @@ define float @add_2S(<8 x float> %bin.rdx) {
 ; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT: faddp s0, v0.2s
 ; CHECK-NEXT: ret
-  %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v8f32(<8 x float> undef, <8 x float> %bin.rdx)
+  %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float undef, <8 x float> %bin.rdx)
   ret float %r
 }
@@ -104,16 +104,16 @@ define double @add_2D(<4 x double> %bin.rdx) {
 ; CHECK: fadd v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: faddp d0, v0.2d
 ; CHECK-NEXT: ret
-  %r = call fast double @llvm.experimental.vector.reduce.fadd.f64.v4f64(<4 x double> undef, <4 x double> %bin.rdx)
+  %r = call fast double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double undef, <4 x double> %bin.rdx)
   ret double %r
 }
 
 ; Function Attrs: nounwind readnone
-declare half @llvm.experimental.vector.reduce.fadd.f16.v4f16(<4 x half>, <4 x half>)
-declare half @llvm.experimental.vector.reduce.fadd.f16.v8f16(<8 x half>, <8 x half>)
-declare half @llvm.experimental.vector.reduce.fadd.f16.v16f16(<16 x half>, <16 x half>)
-declare float @llvm.experimental.vector.reduce.fadd.f32.v2f32(<2 x float>, <2 x float>)
-declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(<4 x float>, <4 x float>)
-declare float @llvm.experimental.vector.reduce.fadd.f32.v8f32(<8 x float>, <8 x float>)
-declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64(<2 x double>, <2 x double>)
-declare double @llvm.experimental.vector.reduce.fadd.f64.v4f64(<4 x double>, <4 x double>)
+declare half @llvm.experimental.vector.reduce.fadd.f16.v4f16(half, <4 x half>)
+declare half @llvm.experimental.vector.reduce.fadd.f16.v8f16(half, <8 x half>)
+declare half @llvm.experimental.vector.reduce.fadd.f16.v16f16(half, <16 x half>)
+declare float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float, <2 x float>)
+declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
+declare float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
+declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double, <2 x double>)
+declare double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double, <4 x double>)


@@ -1628,8 +1628,8 @@ define float @extract_extract01_v4f32_fadd_f32_uses3(<4 x float> %x, float* %p1,
 ; Repeat tests from general reductions to verify output for hoppy targets:
 ; PR38971: https://bugs.llvm.org/show_bug.cgi?id=38971
-declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float, <8 x float>)
-declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double, <4 x double>)
+declare float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
+declare double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double, <4 x double>)
 
 define float @fadd_reduce_v8f32(float %a0, <8 x float> %a1) {
 ; SSE3-SLOW-LABEL: fadd_reduce_v8f32:
@@ -1671,7 +1671,7 @@ define float @fadd_reduce_v8f32(float %a0, <8 x float> %a1) {
 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT: vzeroupper
 ; AVX-FAST-NEXT: retq
-  %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1)
+  %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1)
   ret float %r
 }
@@ -1707,7 +1707,7 @@ define double @fadd_reduce_v4f64(double %a0, <4 x double> %a1) {
 ; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT: vzeroupper
 ; AVX-FAST-NEXT: retq
-  %r = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1)
+  %r = call fast double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1)
   ret double %r
 }


@@ -47,7 +47,7 @@ define float @test_v2f32(float %a0, <2 x float> %a1) {
 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float %a0, <2 x float> %a1)
+  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float %a0, <2 x float> %a1)
   ret float %1
 }
@@ -101,7 +101,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) {
 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float %a0, <4 x float> %a1)
+  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %a1)
   ret float %1
 }
@@ -169,7 +169,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {
 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1)
+  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1)
   ret float %1
 }
@@ -246,7 +246,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {
 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float %a0, <16 x float> %a1)
+  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float %a0, <16 x float> %a1)
   ret float %1
 }
@@ -291,7 +291,7 @@ define float @test_v2f32_zero(<2 x float> %a0) {
 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float 0.0, <2 x float> %a0)
+  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float 0.0, <2 x float> %a0)
   ret float %1
 }
@@ -346,7 +346,7 @@ define float @test_v4f32_zero(<4 x float> %a0) {
 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float 0.0, <4 x float> %a0)
+  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float 0.0, <4 x float> %a0)
   ret float %1
 }
@@ -415,7 +415,7 @@ define float @test_v8f32_zero(<8 x float> %a0) {
 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float 0.0, <8 x float> %a0)
+  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float 0.0, <8 x float> %a0)
   ret float %1
 }
@@ -493,7 +493,7 @@ define float @test_v16f32_zero(<16 x float> %a0) {
 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float 0.0, <16 x float> %a0)
+  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a0)
   ret float %1
 }
@@ -538,7 +538,7 @@ define float @test_v2f32_undef(<2 x float> %a0) {
 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float undef, <2 x float> %a0)
+  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float undef, <2 x float> %a0)
   ret float %1
 }
@@ -593,7 +593,7 @@ define float @test_v4f32_undef(<4 x float> %a0) {
 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> %a0)
+  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %a0)
   ret float %1
 }
@@ -662,7 +662,7 @@ define float @test_v8f32_undef(<8 x float> %a0) {
 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float undef, <8 x float> %a0)
+  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float undef, <8 x float> %a0)
   ret float %1
 }
@@ -740,7 +740,7 @@ define float @test_v16f32_undef(<16 x float> %a0) {
 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float undef, <16 x float> %a0)
+  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float undef, <16 x float> %a0)
   ret float %1
 }
@@ -778,7 +778,7 @@ define double @test_v2f64(double %a0, <2 x double> %a1) {
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
 ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double %a0, <2 x double> %a1)
+  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double %a0, <2 x double> %a1)
   ret double %1
 }
@@ -825,7 +825,7 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {
 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1)
+  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1)
   ret double %1
 }
@@ -879,7 +879,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double %a0, <8 x double> %a1)
+  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v8f64(double %a0, <8 x double> %a1)
   ret double %1
 }
@@ -944,7 +944,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double %a0, <16 x double> %a1)
+  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v16f64(double %a0, <16 x double> %a1)
   ret double %1
 }
@@ -983,7 +983,7 @@ define double @test_v2f64_zero(<2 x double> %a0) {
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double 0.0, <2 x double> %a0)
+  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double 0.0, <2 x double> %a0)
   ret double %1
 }
@@ -1031,7 +1031,7 @@ define double @test_v4f64_zero(<4 x double> %a0) {
 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double 0.0, <4 x double> %a0)
+  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double 0.0, <4 x double> %a0)
   ret double %1
 }
@@ -1086,7 +1086,7 @@ define double @test_v8f64_zero(<8 x double> %a0) {
 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double 0.0, <8 x double> %a0)
+  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v8f64(double 0.0, <8 x double> %a0)
   ret double %1
 }
@@ -1151,7 +1151,7 @@ define double @test_v16f64_zero(<16 x double> %a0) {
 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double 0.0, <16 x double> %a0)
+  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v16f64(double 0.0, <16 x double> %a0)
   ret double %1
 }
@@ -1190,7 +1190,7 @@ define double @test_v2f64_undef(<2 x double> %a0) {
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double undef, <2 x double> %a0)
+  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double undef, <2 x double> %a0)
   ret double %1
 }
@@ -1238,7 +1238,7 @@ define double @test_v4f64_undef(<4 x double> %a0) {
 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double undef, <4 x double> %a0)
+  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double undef, <4 x double> %a0)
   ret double %1
 }
@@ -1293,7 +1293,7 @@ define double @test_v8f64_undef(<8 x double> %a0) {
 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double undef, <8 x double> %a0)
+  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v8f64(double undef, <8 x double> %a0)
   ret double %1
 }
@@ -1358,16 +1358,16 @@ define double @test_v16f64_undef(<16 x double> %a0) {
 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double undef, <16 x double> %a0)
+  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v16f64(double undef, <16 x double> %a0)
   ret double %1
 }
 
-declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float, <2 x float>)
-declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float, <4 x float>)
-declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float, <8 x float>)
-declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float, <16 x float>)
+declare float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float, <2 x float>)
+declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
+declare float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
+declare float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float, <16 x float>)
 
-declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double, <2 x double>)
-declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double, <4 x double>)
-declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double, <8 x double>)
-declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double, <16 x double>)
+declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double, <2 x double>)
+declare double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double, <4 x double>)
+declare double @llvm.experimental.vector.reduce.fadd.f64.v8f64(double, <8 x double>)
+declare double @llvm.experimental.vector.reduce.fadd.f64.v16f64(double, <16 x double>)


@@ -39,7 +39,7 @@ define float @test_v2f32(float %a0, <2 x float> %a1) {
 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float %a0, <2 x float> %a1)
+  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float %a0, <2 x float> %a1)
   ret float %1
 }
@@ -90,7 +90,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) {
 ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float %a0, <4 x float> %a1)
+  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %a1)
   ret float %1
 }
@@ -176,7 +176,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {
 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1)
+  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1)
   ret float %1
 }
@@ -327,7 +327,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {
 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float %a0, <16 x float> %a1)
+  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float %a0, <16 x float> %a1)
   ret float %1
 }
@@ -367,7 +367,7 @@ define float @test_v2f32_zero(<2 x float> %a0) {
 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float 0.0, <2 x float> %a0)
+  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float 0.0, <2 x float> %a0)
   ret float %1
 }
@@ -422,7 +422,7 @@ define float @test_v4f32_zero(<4 x float> %a0) {
 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float 0.0, <4 x float> %a0)
+  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float 0.0, <4 x float> %a0)
   ret float %1
 }
@@ -512,7 +512,7 @@ define float @test_v8f32_zero(<8 x float> %a0) {
 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float 0.0, <8 x float> %a0)
+  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float 0.0, <8 x float> %a0)
   ret float %1
 }
@@ -667,7 +667,7 @@ define float @test_v16f32_zero(<16 x float> %a0) {
 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float 0.0, <16 x float> %a0)
+  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a0)
   ret float %1
 }
@@ -699,7 +699,7 @@ define float @test_v2f32_undef(<2 x float> %a0) {
 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX512-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float undef, <2 x float> %a0)
+  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float undef, <2 x float> %a0)
   ret float %1
 }
@@ -746,7 +746,7 @@ define float @test_v4f32_undef(<4 x float> %a0) {
 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> %a0)
+  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %a0)
   ret float %1
 }
@@ -828,7 +828,7 @@ define float @test_v8f32_undef(<8 x float> %a0) {
 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float undef, <8 x float> %a0)
+  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float undef, <8 x float> %a0)
   ret float %1
 }
@@ -975,7 +975,7 @@ define float @test_v16f32_undef(<16 x float> %a0) {
 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float undef, <16 x float> %a0)
+  %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float undef, <16 x float> %a0)
   ret float %1
 }
@@ -1004,7 +1004,7 @@ define double @test_v2f64(double %a0, <2 x double> %a1) {
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double %a0, <2 x double> %a1)
+  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double %a0, <2 x double> %a1)
   ret double %1
 }
@@ -1042,7 +1042,7 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {
 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1)
+  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1)
   ret double %1
 }
@@ -1101,7 +1101,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double %a0, <8 x double> %a1)
+  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v8f64(double %a0, <8 x double> %a1)
   ret double %1
 }
@@ -1202,7 +1202,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double %a0, <16 x double> %a1)
+  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v16f64(double %a0, <16 x double> %a1)
   ret double %1
 }
@@ -1234,7 +1234,7 @@ define double @test_v2f64_zero(<2 x double> %a0) {
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double 0.0, <2 x double> %a0)
+  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double 0.0, <2 x double> %a0)
   ret double %1
 }
@@ -1275,7 +1275,7 @@ define double @test_v4f64_zero(<4 x double> %a0) {
 ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double 0.0, <4 x double> %a0)
+  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double 0.0, <4 x double> %a0)
   ret double %1
 }
@@ -1337,7 +1337,7 @@ define double @test_v8f64_zero(<8 x double> %a0) {
 ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double 0.0, <8 x double> %a0)
+  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v8f64(double 0.0, <8 x double> %a0)
   ret double %1
 }
@@ -1440,7 +1440,7 @@ define double @test_v16f64_zero(<16 x double> %a0) {
 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double 0.0, <16 x double> %a0)
+  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v16f64(double 0.0, <16 x double> %a0)
   ret double %1
 }
@@ -1466,7 +1466,7 @@ define double @test_v2f64_undef(<2 x double> %a0) {
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512-NEXT: vaddsd {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double undef, <2 x double> %a0)
+  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double undef, <2 x double> %a0)
   ret double %1
 }
@@ -1501,7 +1501,7 @@ define double @test_v4f64_undef(<4 x double> %a0) {
 ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double undef, <4 x double> %a0)
+  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double undef, <4 x double> %a0)
   ret double %1
 }
@@ -1557,7 +1557,7 @@ define double @test_v8f64_undef(<8 x double> %a0) {
 ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double undef, <8 x double> %a0)
+  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v8f64(double undef, <8 x double> %a0)
   ret double %1
 }
@@ -1654,16 +1654,16 @@ define double @test_v16f64_undef(<16 x double> %a0) {
 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double undef, <16 x double> %a0)
+  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v16f64(double undef, <16 x double> %a0)
   ret double %1
 }
 
-declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float, <2 x float>)
-declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float, <4 x float>)
-declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float, <8 x float>)
-declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float, <16 x float>)
+declare float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float, <2 x float>)
+declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
+declare float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
+declare float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float, <16 x float>)
 
-declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double, <2 x double>)
-declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double, <4 x double>)
-declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double, <8 x double>)
-declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double, <16 x double>)
+declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double, <2 x double>)
+declare double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double, <4 x double>)
+declare double @llvm.experimental.vector.reduce.fadd.f64.v8f64(double, <8 x double>)
+declare double @llvm.experimental.vector.reduce.fadd.f64.v16f64(double, <16 x double>)


@@ -35,7 +35,7 @@ define float @test_v2f32(float %a0, <2 x float> %a1) {
 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
 ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float %a0, <2 x float> %a1)
+  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v2f32(float %a0, <2 x float> %a1)
   ret float %1
 }
@@ -74,7 +74,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) {
 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float %a0, <4 x float> %a1)
+  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %a0, <4 x float> %a1)
   ret float %1
 }
@@ -121,7 +121,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {
 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float %a0, <8 x float> %a1)
+  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v8f32(float %a0, <8 x float> %a1)
   ret float %1
 }
@@ -175,7 +175,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {
 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float %a0, <16 x float> %a1)
+  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v16f32(float %a0, <16 x float> %a1)
   ret float %1
 }
@@ -209,7 +209,7 @@ define float @test_v2f32_zero(<2 x float> %a0) {
 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float 1.0, <2 x float> %a0)
+  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %a0)
   ret float %1
 }
@@ -249,7 +249,7 @@ define float @test_v4f32_zero(<4 x float> %a0) {
 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float 1.0, <4 x float> %a0)
+  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a0)
   ret float %1
 }
@@ -297,7 +297,7 @@ define float @test_v8f32_zero(<8 x float> %a0) {
 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float 1.0, <8 x float> %a0)
+  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a0)
   ret float %1
 }
@@ -352,7 +352,7 @@ define float @test_v16f32_zero(<16 x float> %a0) {
 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float 1.0, <16 x float> %a0)
+  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v16f32(float 1.0, <16 x float> %a0)
   ret float %1
 }
@@ -386,7 +386,7 @@ define float @test_v2f32_undef(<2 x float> %a0) {
 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float undef, <2 x float> %a0)
+  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v2f32(float undef, <2 x float> %a0)
   ret float %1
 }
@@ -426,7 +426,7 @@ define float @test_v4f32_undef(<4 x float> %a0) {
 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float undef, <4 x float> %a0)
+  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %a0)
   ret float %1
 }
@@ -474,7 +474,7 @@ define float @test_v8f32_undef(<8 x float> %a0) {
 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float undef, <8 x float> %a0)
+  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v8f32(float undef, <8 x float> %a0)
   ret float %1
 }
@@ -529,7 +529,7 @@ define float @test_v16f32_undef(<16 x float> %a0) {
 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float undef, <16 x float> %a0)
+  %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v16f32(float undef, <16 x float> %a0)
   ret float %1
 }
@@ -556,7 +556,7 @@ define double @test_v2f64(double %a0, <2 x double> %a1) {
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
 ; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double %a0, <2 x double> %a1)
+  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double %a0, <2 x double> %a1)
   ret double %1
 }
@@ -586,7 +586,7 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {
 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double %a0, <4 x double> %a1)
+  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v4f64(double %a0, <4 x double> %a1)
   ret double %1
 }
@@ -621,7 +621,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double %a0, <8 x double> %a1)
+  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v8f64(double %a0, <8 x double> %a1)
   ret double %1
 }
@@ -663,7 +663,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double %a0, <16 x double> %a1)
+  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v16f64(double %a0, <16 x double> %a1)
   ret double %1
 }
@@ -691,7 +691,7 @@ define double @test_v2f64_zero(<2 x double> %a0) {
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double 1.0, <2 x double> %a0)
+  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %a0)
   ret double %1
 }
@@ -722,7 +722,7 @@ define double @test_v4f64_zero(<4 x double> %a0) {
 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double 1.0, <4 x double> %a0)
+  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %a0)
   ret double %1
 }
@@ -758,7 +758,7 @@ define double @test_v8f64_zero(<8 x double> %a0) {
 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double 1.0, <8 x double> %a0)
+  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v8f64(double 1.0, <8 x double> %a0)
   ret double %1
 }
@@ -800,7 +800,7 @@ define double @test_v16f64_zero(<16 x double> %a0) {
 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double 1.0, <16 x double> %a0)
+  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v16f64(double 1.0, <16 x double> %a0)
   ret double %1
 }
@@ -828,7 +828,7 @@ define double @test_v2f64_undef(<2 x double> %a0) {
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double undef, <2 x double> %a0)
+  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double undef, <2 x double> %a0)
   ret double %1
 }
@@ -859,7 +859,7 @@ define double @test_v4f64_undef(<4 x double> %a0) {
 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double undef, <4 x double> %a0)
+  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v4f64(double undef, <4 x double> %a0)
   ret double %1
 }
@@ -895,7 +895,7 @@ define double @test_v8f64_undef(<8 x double> %a0) {
 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double undef, <8 x double> %a0)
+  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v8f64(double undef, <8 x double> %a0)
   ret double %1
 }
@@ -937,16 +937,16 @@ define double @test_v16f64_undef(<16 x double> %a0) {
 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double undef, <16 x double> %a0)
+  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v16f64(double undef, <16 x double> %a0)
   ret double %1
 }
 
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float, <2 x float>)
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float, <4 x float>)
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float, <8 x float>)
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float, <16 x float>)
+declare float @llvm.experimental.vector.reduce.fmul.f32.v2f32(float, <2 x float>)
+declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
+declare float @llvm.experimental.vector.reduce.fmul.f32.v8f32(float, <8 x float>)
+declare float @llvm.experimental.vector.reduce.fmul.f32.v16f32(float, <16 x float>)
 
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double, <2 x double>)
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double, <4 x double>)
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double, <8 x double>)
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double, <16 x double>)
+declare double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double, <2 x double>)
+declare double @llvm.experimental.vector.reduce.fmul.f64.v4f64(double, <4 x double>)
+declare double @llvm.experimental.vector.reduce.fmul.f64.v8f64(double, <8 x double>)
+declare double @llvm.experimental.vector.reduce.fmul.f64.v16f64(double, <16 x double>)


@@ -38,7 +38,7 @@ define float @test_v2f32(float %a0, <2 x float> %a1) {
 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float %a0, <2 x float> %a1)
+  %1 = call float @llvm.experimental.vector.reduce.fmul.f32.v2f32(float %a0, <2 x float> %a1)
   ret float %1
 }
@@ -89,7 +89,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) {
 ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float %a0, <4 x float> %a1)
+  %1 = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %a0, <4 x float> %a1)
   ret float %1
 }
@@ -175,7 +175,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {
 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float %a0, <8 x float> %a1)
+  %1 = call float @llvm.experimental.vector.reduce.fmul.f32.v8f32(float %a0, <8 x float> %a1)
   ret float %1
 }
@@ -326,7 +326,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {
 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float %a0, <16 x float> %a1)
+  %1 = call float @llvm.experimental.vector.reduce.fmul.f32.v16f32(float %a0, <16 x float> %a1)
   ret float %1
 }
@@ -360,7 +360,7 @@ define float @test_v2f32_one(<2 x float> %a0) {
 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float 1.0, <2 x float> %a0)
+  %1 = call float @llvm.experimental.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %a0)
   ret float %1
 }
@@ -407,7 +407,7 @@ define float @test_v4f32_one(<4 x float> %a0) {
 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float 1.0, <4 x float> %a0)
+  %1 = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a0)
   ret float %1
 }
@@ -489,7 +489,7 @@ define float @test_v8f32_one(<8 x float> %a0) {
 ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float 1.0, <8 x float> %a0)
+  %1 = call float @llvm.experimental.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a0)
   ret float %1
 }
@@ -636,7 +636,7 @@ define float @test_v16f32_one(<16 x float> %a0) {
 ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float 1.0, <16 x float> %a0)
+  %1 = call float @llvm.experimental.vector.reduce.fmul.f32.v16f32(float 1.0, <16 x float> %a0)
   ret float %1
 }
@@ -668,7 +668,7 @@ define float @test_v2f32_undef(<2 x float> %a0) {
 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float undef, <2 x float> %a0)
+  %1 = call float @llvm.experimental.vector.reduce.fmul.f32.v2f32(float undef, <2 x float> %a0)
   ret float %1
 }
@@ -715,7 +715,7 @@ define float @test_v4f32_undef(<4 x float> %a0) {
 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float undef, <4 x float> %a0)
+  %1 = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %a0)
   ret float %1
 }
@@ -797,7 +797,7 @@ define float @test_v8f32_undef(<8 x float> %a0) {
 ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float undef, <8 x float> %a0)
+  %1 = call float @llvm.experimental.vector.reduce.fmul.f32.v8f32(float undef, <8 x float> %a0)
   ret float %1
 }
@@ -944,7 +944,7 @@ define float @test_v16f32_undef(<16 x float> %a0) {
 ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float undef, <16 x float> %a0)
+  %1 = call float @llvm.experimental.vector.reduce.fmul.f32.v16f32(float undef, <16 x float> %a0)
   ret float %1
 }
@@ -973,7 +973,7 @@ define double @test_v2f64(double %a0, <2 x double> %a1) {
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double %a0, <2 x double> %a1)
+  %1 = call double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double %a0, <2 x double> %a1)
   ret double %1
 }
@@ -1011,7 +1011,7 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {
 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double %a0, <4 x double> %a1)
+  %1 = call double @llvm.experimental.vector.reduce.fmul.f64.v4f64(double %a0, <4 x double> %a1)
   ret double %1
 }
@@ -1070,7 +1070,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double %a0, <8 x double> %a1)
+  %1 = call double @llvm.experimental.vector.reduce.fmul.f64.v8f64(double %a0, <8 x double> %a1)
   ret double %1
 }
@@ -1171,7 +1171,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double %a0, <16 x double> %a1)
+  %1 = call double @llvm.experimental.vector.reduce.fmul.f64.v16f64(double %a0, <16 x double> %a1)
   ret double %1
 }
@@ -1199,7 +1199,7 @@ define double @test_v2f64_one(<2 x double> %a0) {
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double 1.0, <2 x double> %a0)
+  %1 = call double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %a0)
   ret double %1
 }
@@ -1236,7 +1236,7 @@ define double @test_v4f64_one(<4 x double> %a0) {
 ; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double 1.0, <4 x double> %a0)
+  %1 = call double @llvm.experimental.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %a0)
   ret double %1
 }
@@ -1294,7 +1294,7 @@ define double @test_v8f64_one(<8 x double> %a0) {
 ; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double 1.0, <8 x double> %a0)
+  %1 = call double @llvm.experimental.vector.reduce.fmul.f64.v8f64(double 1.0, <8 x double> %a0)
   ret double %1
 }
@@ -1392,7 +1392,7 @@ define double @test_v16f64_one(<16 x double> %a0) {
 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double 1.0, <16 x double> %a0)
+  %1 = call double @llvm.experimental.vector.reduce.fmul.f64.v16f64(double 1.0, <16 x double> %a0)
   ret double %1
 }
@@ -1418,7 +1418,7 @@ define double @test_v2f64_undef(<2 x double> %a0) {
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512-NEXT: vmulsd {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double undef, <2 x double> %a0)
+  %1 = call double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double undef, <2 x double> %a0)
   ret double %1
 }
@@ -1453,7 +1453,7 @@ define double @test_v4f64_undef(<4 x double> %a0) {
 ; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double undef, <4 x double> %a0)
+  %1 = call double @llvm.experimental.vector.reduce.fmul.f64.v4f64(double undef, <4 x double> %a0)
   ret double %1
 }
@@ -1509,7 +1509,7 @@ define double @test_v8f64_undef(<8 x double> %a0) {
 ; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double undef, <8 x double> %a0)
+  %1 = call double @llvm.experimental.vector.reduce.fmul.f64.v8f64(double undef, <8 x double> %a0)
   ret double %1
 }
@@ -1606,16 +1606,16 @@ define double @test_v16f64_undef(<16 x double> %a0) {
 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double undef, <16 x double> %a0)
+  %1 = call double @llvm.experimental.vector.reduce.fmul.f64.v16f64(double undef, <16 x double> %a0)
   ret double %1
 }
 
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float, <2 x float>)
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float, <4 x float>)
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float, <8 x float>)
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float, <16 x float>)
+declare float @llvm.experimental.vector.reduce.fmul.f32.v2f32(float, <2 x float>)
+declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
+declare float @llvm.experimental.vector.reduce.fmul.f32.v8f32(float, <8 x float>)
+declare float @llvm.experimental.vector.reduce.fmul.f32.v16f32(float, <16 x float>)
 
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double, <2 x double>)
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double, <4 x double>)
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double, <8 x double>)
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double, <16 x double>)
+declare double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double, <2 x double>)
+declare double @llvm.experimental.vector.reduce.fmul.f64.v4f64(double, <4 x double>)
+declare double @llvm.experimental.vector.reduce.fmul.f64.v8f64(double, <8 x double>)
+declare double @llvm.experimental.vector.reduce.fmul.f64.v16f64(double, <16 x double>)