diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index 40c85302161..b99f2a1b7f3 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -1912,23 +1912,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; - def int_x86_fma_vfmaddsub_ps : // FIXME: remove this intrinsic. - Intrinsic<[llvm_v4f32_ty], - [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty], - [IntrNoMem]>; - def int_x86_fma_vfmaddsub_pd : // FIXME: remove this intrinsic. - Intrinsic<[llvm_v2f64_ty], - [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty], - [IntrNoMem]>; - def int_x86_fma_vfmaddsub_ps_256 : // FIXME: remove this intrinsic. - Intrinsic<[llvm_v8f32_ty], - [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty], - [IntrNoMem]>; - def int_x86_fma_vfmaddsub_pd_256 : // FIXME: remove this intrinsic. - Intrinsic<[llvm_v4f64_ty], - [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty], - [IntrNoMem]>; - def int_x86_avx512_mask_vfmadd_pd_128 : // FIXME: remove this intrinsic. 
Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty], diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp index 4ef514d7b73..912af63d776 100644 --- a/lib/IR/AutoUpgrade.cpp +++ b/lib/IR/AutoUpgrade.cpp @@ -76,6 +76,7 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) { Name=="ssse3.pabs.d.128" || // Added in 6.0 Name.startswith("fma.vfmadd.") || // Added in 7.0 Name.startswith("fma.vfmsub.") || // Added in 7.0 + Name.startswith("fma.vfmaddsub.") || // Added in 7.0 Name.startswith("fma.vfmsubadd.") || // Added in 7.0 Name.startswith("fma.vfnmadd.") || // Added in 7.0 Name.startswith("fma.vfnmsub.") || // Added in 7.0 @@ -2778,25 +2779,30 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { if (IsScalar) Rep = Builder.CreateInsertElement(CI->getArgOperand(0), Rep, (uint64_t)0); - } else if (IsX86 && Name.startswith("fma.vfmsubadd.p")) { - // Handle FSUBADD. - unsigned VecWidth = CI->getType()->getPrimitiveSizeInBits(); - unsigned EltWidth = CI->getType()->getScalarSizeInBits(); - Intrinsic::ID IID; - if (VecWidth == 128 && EltWidth == 32) - IID = Intrinsic::x86_fma_vfmaddsub_ps; - else if (VecWidth == 128 && EltWidth == 64) - IID = Intrinsic::x86_fma_vfmaddsub_pd; - else if (VecWidth == 256 && EltWidth == 32) - IID = Intrinsic::x86_fma_vfmaddsub_ps_256; - else if (VecWidth == 256 && EltWidth == 64) - IID = Intrinsic::x86_fma_vfmaddsub_pd_256; - else - llvm_unreachable("Unexpected intrinsic"); - Value *Arg2 = Builder.CreateFNeg(CI->getArgOperand(2)); - Value *Ops[] = { CI->getArgOperand(0), CI->getArgOperand(1), Arg2 }; - Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID), - Ops); + } else if (IsX86 && (Name.startswith("fma.vfmaddsub.p") || + Name.startswith("fma.vfmsubadd.p"))) { + bool IsSubAdd = Name[7] == 's'; + int NumElts = CI->getType()->getVectorNumElements(); + + + + Value *Ops[] = { CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2) }; + + 
Function *FMA = Intrinsic::getDeclaration(CI->getModule(), Intrinsic::fma, + Ops[0]->getType()); + Value *Odd = Builder.CreateCall(FMA, Ops); + Ops[2] = Builder.CreateFNeg(Ops[2]); + Value *Even = Builder.CreateCall(FMA, Ops); + + if (IsSubAdd) + std::swap(Even, Odd); + + SmallVector<uint32_t, 32> Idxs(NumElts); + for (int i = 0; i != NumElts; ++i) + Idxs[i] = i + (i % 2) * NumElts; + + Rep = Builder.CreateShuffleVector(Even, Odd, Idxs); } else if (IsX86 && (Name.startswith("avx512.mask.pternlog.") || Name.startswith("avx512.maskz.pternlog."))) { bool ZeroMask = Name[11] == 'z'; diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index af26fe3a6c8..3e936f9de3d 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -1249,10 +1249,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_vpshrd_w_512, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0), X86_INTRINSIC_DATA(bmi_bextr_32, INTR_TYPE_2OP, X86ISD::BEXTR, 0), X86_INTRINSIC_DATA(bmi_bextr_64, INTR_TYPE_2OP, X86ISD::BEXTR, 0), - X86_INTRINSIC_DATA(fma_vfmaddsub_pd, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), - X86_INTRINSIC_DATA(fma_vfmaddsub_pd_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), - X86_INTRINSIC_DATA(fma_vfmaddsub_ps, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), - X86_INTRINSIC_DATA(fma_vfmaddsub_ps_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(fma4_vfmadd_sd, INTR_TYPE_3OP, X86ISD::FMADD4S, 0), X86_INTRINSIC_DATA(fma4_vfmadd_ss, INTR_TYPE_3OP, X86ISD::FMADD4S, 0), X86_INTRINSIC_DATA(sse_cmp_ps, INTR_TYPE_3OP, X86ISD::CMPP, 0), diff --git a/test/CodeGen/X86/fma-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/fma-intrinsics-x86-upgrade.ll new file mode 100644 index 00000000000..5e30726555e --- /dev/null +++ b/test/CodeGen/X86/fma-intrinsics-x86-upgrade.ll @@ -0,0 +1,1036 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+fma,-fma4 -show-mc-encoding | 
FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,-fma4 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AVX512VL +; RUN: llc < %s -mtriple=x86_64-pc-windows -mattr=+fma,-fma4 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-WIN + +; VFMADD +define <4 x float> @test_x86_fma_vfmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ss: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa9,0xc2] +; CHECK-FMA-NEXT: # xmm0 = (xmm1 * xmm0) + xmm2 +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_ss: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa9,0xc2] +; CHECK-AVX512VL-NEXT: # xmm0 = (xmm1 * xmm0) + xmm2 +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ss: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01] +; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08] +; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-FMA-WIN-NEXT: vfmadd132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x99,0x02] +; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm0 * mem) + xmm1 +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) + ret <4 x float> %res +} + +define <4 x float> @test_x86_fma_vfmadd_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfmadd_bac_ss: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0x79,0xa9,0xca] +; CHECK-FMA-NEXT: # xmm1 = (xmm0 * xmm1) + xmm2 +; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 # 
encoding: [0xc5,0xf8,0x28,0xc1] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_bac_ss: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa9,0xca] +; CHECK-AVX512VL-NEXT: # xmm1 = (xmm0 * xmm1) + xmm2 +; CHECK-AVX512VL-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_bac_ss: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08] +; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-FMA-WIN-NEXT: vfmadd132ss (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x99,0x01] +; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm0 * mem) + xmm1 +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2) + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>) + +define <2 x double> @test_x86_fma_vfmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfmadd_sd: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa9,0xc2] +; CHECK-FMA-NEXT: # xmm0 = (xmm1 * xmm0) + xmm2 +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_sd: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa9,0xc2] +; CHECK-AVX512VL-NEXT: # xmm0 = (xmm1 * xmm0) + xmm2 +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_sd: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01] 
+; CHECK-FMA-WIN-NEXT: vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08] +; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero +; CHECK-FMA-WIN-NEXT: vfmadd132sd (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x99,0x02] +; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm0 * mem) + xmm1 +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) + ret <2 x double> %res +} + +define <2 x double> @test_x86_fma_vfmadd_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfmadd_bac_sd: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0xf9,0xa9,0xca] +; CHECK-FMA-NEXT: # xmm1 = (xmm0 * xmm1) + xmm2 +; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_bac_sd: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa9,0xca] +; CHECK-AVX512VL-NEXT: # xmm1 = (xmm0 * xmm1) + xmm2 +; CHECK-AVX512VL-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_bac_sd: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08] +; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero +; CHECK-FMA-WIN-NEXT: vfmadd132sd (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x99,0x01] +; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm0 * mem) + xmm1 +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2) + ret <2 x double> %res +} +declare <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>) + 
+define <4 x float> @test_x86_fma_vfmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ps: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa8,0xc2] +; CHECK-FMA-NEXT: # xmm0 = (xmm1 * xmm0) + xmm2 +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_ps: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa8,0xc2] +; CHECK-AVX512VL-NEXT: # xmm0 = (xmm1 * xmm0) + xmm2 +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ps: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa8,0x00] +; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) + mem +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>) + +define <2 x double> @test_x86_fma_vfmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfmadd_pd: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa8,0xc2] +; CHECK-FMA-NEXT: # xmm0 = (xmm1 * xmm0) + xmm2 +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_pd: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa8,0xc2] +; CHECK-AVX512VL-NEXT: # xmm0 = (xmm1 * xmm0) + xmm2 +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_pd: +; 
CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfmadd213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa8,0x00] +; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) + mem +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) + ret <2 x double> %res +} +declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>) + +define <8 x float> @test_x86_fma_vfmadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ps_256: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xa8,0xc2] +; CHECK-FMA-NEXT: # ymm0 = (ymm1 * ymm0) + ymm2 +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_ps_256: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa8,0xc2] +; CHECK-AVX512VL-NEXT: # ymm0 = (ymm1 * ymm0) + ymm2 +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ps_256: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa8,0x00] +; CHECK-FMA-WIN-NEXT: # ymm0 = (ymm1 * ymm0) + mem +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) + ret <8 x float> %res +} +declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) + +define <4 x double> @test_x86_fma_vfmadd_pd_256(<4 x double> %a0, <4 x double> %a1, 
<4 x double> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfmadd_pd_256: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xa8,0xc2] +; CHECK-FMA-NEXT: # ymm0 = (ymm1 * ymm0) + ymm2 +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_pd_256: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa8,0xc2] +; CHECK-AVX512VL-NEXT: # ymm0 = (ymm1 * ymm0) + ymm2 +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_pd_256: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfmadd213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xa8,0x00] +; CHECK-FMA-WIN-NEXT: # ymm0 = (ymm1 * ymm0) + mem +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) + ret <4 x double> %res +} +declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>) + +; VFMSUB +define <4 x float> @test_x86_fma_vfmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ss: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xab,0xc2] +; CHECK-FMA-NEXT: # xmm0 = (xmm1 * xmm0) - xmm2 +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_ss: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xab,0xc2] +; CHECK-AVX512VL-NEXT: # xmm0 = (xmm1 * xmm0) - xmm2 +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_ss: +; CHECK-FMA-WIN: # %bb.0: +; 
CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01] +; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08] +; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-FMA-WIN-NEXT: vfmsub132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9b,0x02] +; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm0 * mem) - xmm1 +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) + ret <4 x float> %res +} + +define <4 x float> @test_x86_fma_vfmsub_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfmsub_bac_ss: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfmsub213ss %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0x79,0xab,0xca] +; CHECK-FMA-NEXT: # xmm1 = (xmm0 * xmm1) - xmm2 +; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_bac_ss: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfmsub213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xab,0xca] +; CHECK-AVX512VL-NEXT: # xmm1 = (xmm0 * xmm1) - xmm2 +; CHECK-AVX512VL-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_bac_ss: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08] +; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-FMA-WIN-NEXT: vfmsub132ss (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9b,0x01] +; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm0 * mem) - xmm1 +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2) + ret <4 x float> %res +} 
+declare <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>) + +define <2 x double> @test_x86_fma_vfmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfmsub_sd: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xab,0xc2] +; CHECK-FMA-NEXT: # xmm0 = (xmm1 * xmm0) - xmm2 +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_sd: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xab,0xc2] +; CHECK-AVX512VL-NEXT: # xmm0 = (xmm1 * xmm0) - xmm2 +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_sd: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01] +; CHECK-FMA-WIN-NEXT: vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08] +; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero +; CHECK-FMA-WIN-NEXT: vfmsub132sd (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9b,0x02] +; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm0 * mem) - xmm1 +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) + ret <2 x double> %res +} + +define <2 x double> @test_x86_fma_vfmsub_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfmsub_bac_sd: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0xf9,0xab,0xca] +; CHECK-FMA-NEXT: # xmm1 = (xmm0 * xmm1) - xmm2 +; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_bac_sd: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xab,0xca] +; 
CHECK-AVX512VL-NEXT: # xmm1 = (xmm0 * xmm1) - xmm2 +; CHECK-AVX512VL-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_bac_sd: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08] +; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero +; CHECK-FMA-WIN-NEXT: vfmsub132sd (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9b,0x01] +; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm0 * mem) - xmm1 +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2) + ret <2 x double> %res +} +declare <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>) + +define <4 x float> @test_x86_fma_vfmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ps: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xaa,0xc2] +; CHECK-FMA-NEXT: # xmm0 = (xmm1 * xmm0) - xmm2 +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_ps: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xaa,0xc2] +; CHECK-AVX512VL-NEXT: # xmm0 = (xmm1 * xmm0) - xmm2 +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_ps: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xaa,0x00] +; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) - mem +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call 
<4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>) + +define <2 x double> @test_x86_fma_vfmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfmsub_pd: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xaa,0xc2] +; CHECK-FMA-NEXT: # xmm0 = (xmm1 * xmm0) - xmm2 +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_pd: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xaa,0xc2] +; CHECK-AVX512VL-NEXT: # xmm0 = (xmm1 * xmm0) - xmm2 +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_pd: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfmsub213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xaa,0x00] +; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) - mem +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) + ret <2 x double> %res +} +declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>) + +define <8 x float> @test_x86_fma_vfmsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ps_256: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xaa,0xc2] +; CHECK-FMA-NEXT: # ymm0 = (ymm1 * ymm0) - ymm2 +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_ps_256: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfmsub213ps %ymm2, %ymm1, 
%ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xaa,0xc2] +; CHECK-AVX512VL-NEXT: # ymm0 = (ymm1 * ymm0) - ymm2 +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_ps_256: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xaa,0x00] +; CHECK-FMA-WIN-NEXT: # ymm0 = (ymm1 * ymm0) - mem +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) + ret <8 x float> %res +} +declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>) + +define <4 x double> @test_x86_fma_vfmsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfmsub_pd_256: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xaa,0xc2] +; CHECK-FMA-NEXT: # ymm0 = (ymm1 * ymm0) - ymm2 +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_pd_256: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xaa,0xc2] +; CHECK-AVX512VL-NEXT: # ymm0 = (ymm1 * ymm0) - ymm2 +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_pd_256: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfmsub213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xaa,0x00] +; CHECK-FMA-WIN-NEXT: # ymm0 = (ymm1 * ymm0) - mem +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a0, <4 
x double> %a1, <4 x double> %a2) + ret <4 x double> %res +} +declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>) + +; VFNMADD +define <4 x float> @test_x86_fma_vfnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ss: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xad,0xc2] +; CHECK-FMA-NEXT: # xmm0 = -(xmm1 * xmm0) + xmm2 +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_ss: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xad,0xc2] +; CHECK-AVX512VL-NEXT: # xmm0 = -(xmm1 * xmm0) + xmm2 +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_ss: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01] +; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08] +; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-FMA-WIN-NEXT: vfnmadd132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9d,0x02] +; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm0 * mem) + xmm1 +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) + ret <4 x float> %res +} + +define <4 x float> @test_x86_fma_vfnmadd_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_bac_ss: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0x79,0xad,0xca] +; CHECK-FMA-NEXT: # xmm1 = -(xmm0 * xmm1) + xmm2 +; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_bac_ss: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfnmadd213ss 
%xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xad,0xca] +; CHECK-AVX512VL-NEXT: # xmm1 = -(xmm0 * xmm1) + xmm2 +; CHECK-AVX512VL-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_bac_ss: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08] +; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-FMA-WIN-NEXT: vfnmadd132ss (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9d,0x01] +; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm0 * mem) + xmm1 +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2) + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float>, <4 x float>, <4 x float>) + +define <2 x double> @test_x86_fma_vfnmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_sd: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xad,0xc2] +; CHECK-FMA-NEXT: # xmm0 = -(xmm1 * xmm0) + xmm2 +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_sd: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xad,0xc2] +; CHECK-AVX512VL-NEXT: # xmm0 = -(xmm1 * xmm0) + xmm2 +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_sd: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01] +; CHECK-FMA-WIN-NEXT: vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08] +; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero +; CHECK-FMA-WIN-NEXT: vfnmadd132sd (%rdx), %xmm1, %xmm0 
# encoding: [0xc4,0xe2,0xf1,0x9d,0x02] +; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm0 * mem) + xmm1 +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) + ret <2 x double> %res +} + +define <2 x double> @test_x86_fma_vfnmadd_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_bac_sd: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfnmadd213sd %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0xf9,0xad,0xca] +; CHECK-FMA-NEXT: # xmm1 = -(xmm0 * xmm1) + xmm2 +; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_bac_sd: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfnmadd213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xad,0xca] +; CHECK-AVX512VL-NEXT: # xmm1 = -(xmm0 * xmm1) + xmm2 +; CHECK-AVX512VL-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_bac_sd: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08] +; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero +; CHECK-FMA-WIN-NEXT: vfnmadd132sd (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9d,0x01] +; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm0 * mem) + xmm1 +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2) + ret <2 x double> %res +} +declare <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double>, <2 x double>, <2 x double>) + +define <4 x float> @test_x86_fma_vfnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ps: +; CHECK-FMA: # 
%bb.0: +; CHECK-FMA-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xac,0xc2] +; CHECK-FMA-NEXT: # xmm0 = -(xmm1 * xmm0) + xmm2 +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_ps: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xac,0xc2] +; CHECK-AVX512VL-NEXT: # xmm0 = -(xmm1 * xmm0) + xmm2 +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_ps: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xac,0x00] +; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm1 * xmm0) + mem +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>) + +define <2 x double> @test_x86_fma_vfnmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_pd: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xac,0xc2] +; CHECK-FMA-NEXT: # xmm0 = -(xmm1 * xmm0) + xmm2 +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_pd: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xac,0xc2] +; CHECK-AVX512VL-NEXT: # xmm0 = -(xmm1 * xmm0) + xmm2 +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_pd: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), 
%xmm0 # encoding: [0xc5,0xf9,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfnmadd213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xac,0x00] +; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm1 * xmm0) + mem +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) + ret <2 x double> %res +} +declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>) + +define <8 x float> @test_x86_fma_vfnmadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ps_256: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xac,0xc2] +; CHECK-FMA-NEXT: # ymm0 = -(ymm1 * ymm0) + ymm2 +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_ps_256: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xac,0xc2] +; CHECK-AVX512VL-NEXT: # ymm0 = -(ymm1 * ymm0) + ymm2 +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_ps_256: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xac,0x00] +; CHECK-FMA-WIN-NEXT: # ymm0 = -(ymm1 * ymm0) + mem +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) + ret <8 x float> %res +} +declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) + +define <4 x double> @test_x86_fma_vfnmadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_pd_256: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfnmadd213pd 
%ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xac,0xc2] +; CHECK-FMA-NEXT: # ymm0 = -(ymm1 * ymm0) + ymm2 +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_pd_256: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xac,0xc2] +; CHECK-AVX512VL-NEXT: # ymm0 = -(ymm1 * ymm0) + ymm2 +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_pd_256: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfnmadd213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xac,0x00] +; CHECK-FMA-WIN-NEXT: # ymm0 = -(ymm1 * ymm0) + mem +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) + ret <4 x double> %res +} +declare <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>) + +; VFNMSUB +define <4 x float> @test_x86_fma_vfnmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ss: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xaf,0xc2] +; CHECK-FMA-NEXT: # xmm0 = -(xmm1 * xmm0) - xmm2 +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_ss: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xaf,0xc2] +; CHECK-AVX512VL-NEXT: # xmm0 = -(xmm1 * xmm0) - xmm2 +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_ss: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01] +; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # 
encoding: [0xc4,0xc1,0x7a,0x10,0x08] +; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-FMA-WIN-NEXT: vfnmsub132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9f,0x02] +; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm0 * mem) - xmm1 +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) + ret <4 x float> %res +} + +define <4 x float> @test_x86_fma_vfnmsub_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_bac_ss: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0x79,0xaf,0xca] +; CHECK-FMA-NEXT: # xmm1 = -(xmm0 * xmm1) - xmm2 +; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_bac_ss: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xaf,0xca] +; CHECK-AVX512VL-NEXT: # xmm1 = -(xmm0 * xmm1) - xmm2 +; CHECK-AVX512VL-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_bac_ss: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08] +; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-FMA-WIN-NEXT: vfnmsub132ss (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9f,0x01] +; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm0 * mem) - xmm1 +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2) + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>) + +define <2 x double> 
@test_x86_fma_vfnmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_sd: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xaf,0xc2] +; CHECK-FMA-NEXT: # xmm0 = -(xmm1 * xmm0) - xmm2 +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_sd: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xaf,0xc2] +; CHECK-AVX512VL-NEXT: # xmm0 = -(xmm1 * xmm0) - xmm2 +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_sd: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01] +; CHECK-FMA-WIN-NEXT: vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08] +; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero +; CHECK-FMA-WIN-NEXT: vfnmsub132sd (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9f,0x02] +; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm0 * mem) - xmm1 +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) + ret <2 x double> %res +} + +define <2 x double> @test_x86_fma_vfnmsub_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_bac_sd: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0xf9,0xaf,0xca] +; CHECK-FMA-NEXT: # xmm1 = -(xmm0 * xmm1) - xmm2 +; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_bac_sd: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xaf,0xca] +; CHECK-AVX512VL-NEXT: # xmm1 = -(xmm0 * xmm1) - xmm2 +; CHECK-AVX512VL-NEXT: vmovapd %xmm1, %xmm0 
# EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_bac_sd: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08] +; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero +; CHECK-FMA-WIN-NEXT: vfnmsub132sd (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9f,0x01] +; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm0 * mem) - xmm1 +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2) + ret <2 x double> %res +} +declare <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>) + +define <4 x float> @test_x86_fma_vfnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ps: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xae,0xc2] +; CHECK-FMA-NEXT: # xmm0 = -(xmm1 * xmm0) - xmm2 +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_ps: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xae,0xc2] +; CHECK-AVX512VL-NEXT: # xmm0 = -(xmm1 * xmm0) - xmm2 +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_ps: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xae,0x00] +; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm1 * xmm0) - mem +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> 
%a2) + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>) + +define <2 x double> @test_x86_fma_vfnmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_pd: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xae,0xc2] +; CHECK-FMA-NEXT: # xmm0 = -(xmm1 * xmm0) - xmm2 +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_pd: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xae,0xc2] +; CHECK-AVX512VL-NEXT: # xmm0 = -(xmm1 * xmm0) - xmm2 +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_pd: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfnmsub213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xae,0x00] +; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm1 * xmm0) - mem +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) + ret <2 x double> %res +} +declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>) + +define <8 x float> @test_x86_fma_vfnmsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ps_256: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xae,0xc2] +; CHECK-FMA-NEXT: # ymm0 = -(ymm1 * ymm0) - ymm2 +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_ps_256: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: 
[0xc4,0xe2,0x75,0xae,0xc2] +; CHECK-AVX512VL-NEXT: # ymm0 = -(ymm1 * ymm0) - ymm2 +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_ps_256: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xae,0x00] +; CHECK-FMA-WIN-NEXT: # ymm0 = -(ymm1 * ymm0) - mem +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) + ret <8 x float> %res +} +declare <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>) + +define <4 x double> @test_x86_fma_vfnmsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_pd_256: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xae,0xc2] +; CHECK-FMA-NEXT: # ymm0 = -(ymm1 * ymm0) - ymm2 +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_pd_256: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xae,0xc2] +; CHECK-AVX512VL-NEXT: # ymm0 = -(ymm1 * ymm0) - ymm2 +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_pd_256: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfnmsub213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xae,0x00] +; CHECK-FMA-WIN-NEXT: # ymm0 = -(ymm1 * ymm0) - mem +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x 
double> %a2) + ret <4 x double> %res +} +declare <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>) + +; VFMADDSUB +define <4 x float> @test_x86_fma_vfmaddsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_ps: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa6,0xc2] +; CHECK-FMA-NEXT: # xmm0 = (xmm1 * xmm0) +/- xmm2 +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmaddsub_ps: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa6,0xc2] +; CHECK-AVX512VL-NEXT: # xmm0 = (xmm1 * xmm0) +/- xmm2 +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_ps: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa6,0x00] +; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) +/- mem +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float>, <4 x float>, <4 x float>) + +define <2 x double> @test_x86_fma_vfmaddsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_pd: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa6,0xc2] +; CHECK-FMA-NEXT: # xmm0 = (xmm1 * xmm0) +/- xmm2 +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmaddsub_pd: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX 
Compression encoding: [0xc4,0xe2,0xf1,0xa6,0xc2] +; CHECK-AVX512VL-NEXT: # xmm0 = (xmm1 * xmm0) +/- xmm2 +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_pd: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfmaddsub213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa6,0x00] +; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) +/- mem +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) + ret <2 x double> %res +} +declare <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double>, <2 x double>, <2 x double>) + +define <8 x float> @test_x86_fma_vfmaddsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_ps_256: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xa6,0xc2] +; CHECK-FMA-NEXT: # ymm0 = (ymm1 * ymm0) +/- ymm2 +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmaddsub_ps_256: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa6,0xc2] +; CHECK-AVX512VL-NEXT: # ymm0 = (ymm1 * ymm0) +/- ymm2 +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_ps_256: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa6,0x00] +; CHECK-FMA-WIN-NEXT: # ymm0 = (ymm1 * ymm0) +/- mem +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 
x float> %a0, <8 x float> %a1, <8 x float> %a2) + ret <8 x float> %res +} +declare <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>) + +define <4 x double> @test_x86_fma_vfmaddsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_pd_256: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xa6,0xc2] +; CHECK-FMA-NEXT: # ymm0 = (ymm1 * ymm0) +/- ymm2 +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmaddsub_pd_256: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa6,0xc2] +; CHECK-AVX512VL-NEXT: # ymm0 = (ymm1 * ymm0) +/- ymm2 +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_pd_256: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfmaddsub213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xa6,0x00] +; CHECK-FMA-WIN-NEXT: # ymm0 = (ymm1 * ymm0) +/- mem +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) + ret <4 x double> %res +} +declare <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>) + +; VFMSUBADD +define <4 x float> @test_x86_fma_vfmsubadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_ps: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa7,0xc2] +; CHECK-FMA-NEXT: # xmm0 = (xmm1 * xmm0) -/+ xmm2 +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsubadd_ps: +; CHECK-AVX512VL: # %bb.0: +; 
CHECK-AVX512VL-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa7,0xc2] +; CHECK-AVX512VL-NEXT: # xmm0 = (xmm1 * xmm0) -/+ xmm2 +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_ps: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa7,0x00] +; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) -/+ mem +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float>, <4 x float>, <4 x float>) + +define <2 x double> @test_x86_fma_vfmsubadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_pd: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa7,0xc2] +; CHECK-FMA-NEXT: # xmm0 = (xmm1 * xmm0) -/+ xmm2 +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsubadd_pd: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa7,0xc2] +; CHECK-AVX512VL-NEXT: # xmm0 = (xmm1 * xmm0) -/+ xmm2 +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_pd: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfmsubadd213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa7,0x00] +; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) -/+ mem +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = 
call <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) + ret <2 x double> %res +} +declare <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double>, <2 x double>, <2 x double>) + +define <8 x float> @test_x86_fma_vfmsubadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_ps_256: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xa7,0xc2] +; CHECK-FMA-NEXT: # ymm0 = (ymm1 * ymm0) -/+ ymm2 +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsubadd_ps_256: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa7,0xc2] +; CHECK-AVX512VL-NEXT: # ymm0 = (ymm1 * ymm0) -/+ ymm2 +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_ps_256: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa7,0x00] +; CHECK-FMA-WIN-NEXT: # ymm0 = (ymm1 * ymm0) -/+ mem +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) + ret <8 x float> %res +} +declare <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>) + +define <4 x double> @test_x86_fma_vfmsubadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_pd_256: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xa7,0xc2] +; CHECK-FMA-NEXT: # ymm0 = (ymm1 * ymm0) -/+ ymm2 +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsubadd_pd_256: 
+; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa7,0xc2] +; CHECK-AVX512VL-NEXT: # ymm0 = (ymm1 * ymm0) -/+ ymm2 +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] +; +; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_pd_256: +; CHECK-FMA-WIN: # %bb.0: +; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfmsubadd213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xa7,0x00] +; CHECK-FMA-WIN-NEXT: # ymm0 = (ymm1 * ymm0) -/+ mem +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) + ret <4 x double> %res +} +declare <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>) + +attributes #0 = { nounwind } diff --git a/test/CodeGen/X86/fma-intrinsics-x86.ll b/test/CodeGen/X86/fma-intrinsics-x86.ll index 5e30726555e..ec4936cf4ce 100644 --- a/test/CodeGen/X86/fma-intrinsics-x86.ll +++ b/test/CodeGen/X86/fma-intrinsics-x86.ll @@ -25,8 +25,12 @@ define <4 x float> @test_x86_fma_vfmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 ; CHECK-FMA-WIN-NEXT: vfmadd132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x99,0x02] ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm0 * mem) + xmm1 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) - ret <4 x float> %res + %1 = extractelement <4 x float> %a0, i64 0 + %2 = extractelement <4 x float> %a1, i64 0 + %3 = extractelement <4 x float> %a2, i64 0 + %4 = call float @llvm.fma.f32(float %1, float %2, float %3) + %5 = insertelement <4 x float> %a0, float %4, i64 0 + ret <4 x float> %5 } define <4 x float> @test_x86_fma_vfmadd_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { @@ -52,10 +56,13 @@ define 
<4 x float> @test_x86_fma_vfmadd_bac_ss(<4 x float> %a0, <4 x float> %a1, ; CHECK-FMA-WIN-NEXT: vfmadd132ss (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x99,0x01] ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm0 * mem) + xmm1 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2) - ret <4 x float> %res + %1 = extractelement <4 x float> %a1, i64 0 + %2 = extractelement <4 x float> %a0, i64 0 + %3 = extractelement <4 x float> %a2, i64 0 + %4 = call float @llvm.fma.f32(float %1, float %2, float %3) + %5 = insertelement <4 x float> %a1, float %4, i64 0 + ret <4 x float> %5 } -declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>) define <2 x double> @test_x86_fma_vfmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmadd_sd: @@ -78,8 +85,12 @@ define <2 x double> @test_x86_fma_vfmadd_sd(<2 x double> %a0, <2 x double> %a1, ; CHECK-FMA-WIN-NEXT: vfmadd132sd (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x99,0x02] ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm0 * mem) + xmm1 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) - ret <2 x double> %res + %1 = extractelement <2 x double> %a0, i64 0 + %2 = extractelement <2 x double> %a1, i64 0 + %3 = extractelement <2 x double> %a2, i64 0 + %4 = call double @llvm.fma.f64(double %1, double %2, double %3) + %5 = insertelement <2 x double> %a0, double %4, i64 0 + ret <2 x double> %5 } define <2 x double> @test_x86_fma_vfmadd_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { @@ -105,10 +116,13 @@ define <2 x double> @test_x86_fma_vfmadd_bac_sd(<2 x double> %a0, <2 x double> % ; CHECK-FMA-WIN-NEXT: vfmadd132sd (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x99,0x01] ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm0 * mem) + xmm1 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - 
%res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2) - ret <2 x double> %res + %1 = extractelement <2 x double> %a1, i64 0 + %2 = extractelement <2 x double> %a0, i64 0 + %3 = extractelement <2 x double> %a2, i64 0 + %4 = call double @llvm.fma.f64(double %1, double %2, double %3) + %5 = insertelement <2 x double> %a1, double %4, i64 0 + ret <2 x double> %5 } -declare <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>) define <4 x float> @test_x86_fma_vfmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ps: @@ -130,10 +144,9 @@ define <4 x float> @test_x86_fma_vfmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 ; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa8,0x00] ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) + mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) - ret <4 x float> %res + %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) + ret <4 x float> %1 } -declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>) define <2 x double> @test_x86_fma_vfmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmadd_pd: @@ -155,10 +168,9 @@ define <2 x double> @test_x86_fma_vfmadd_pd(<2 x double> %a0, <2 x double> %a1, ; CHECK-FMA-WIN-NEXT: vfmadd213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa8,0x00] ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) + mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) - ret <2 x double> %res + %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) + ret <2 x double> %1 } -declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, 
<2 x double>, <2 x double>) define <8 x float> @test_x86_fma_vfmadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ps_256: @@ -180,10 +192,9 @@ define <8 x float> @test_x86_fma_vfmadd_ps_256(<8 x float> %a0, <8 x float> %a1, ; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa8,0x00] ; CHECK-FMA-WIN-NEXT: # ymm0 = (ymm1 * ymm0) + mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) - ret <8 x float> %res + %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) + ret <8 x float> %1 } -declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) define <4 x double> @test_x86_fma_vfmadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmadd_pd_256: @@ -205,10 +216,9 @@ define <4 x double> @test_x86_fma_vfmadd_pd_256(<4 x double> %a0, <4 x double> % ; CHECK-FMA-WIN-NEXT: vfmadd213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xa8,0x00] ; CHECK-FMA-WIN-NEXT: # ymm0 = (ymm1 * ymm0) + mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) - ret <4 x double> %res + %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) + ret <4 x double> %1 } -declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>) ; VFMSUB define <4 x float> @test_x86_fma_vfmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { @@ -232,8 +242,13 @@ define <4 x float> @test_x86_fma_vfmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 ; CHECK-FMA-WIN-NEXT: vfmsub132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9b,0x02] ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm0 * mem) - xmm1 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = 
call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) - ret <4 x float> %res + %1 = extractelement <4 x float> %a0, i64 0 + %2 = extractelement <4 x float> %a1, i64 0 + %3 = extractelement <4 x float> %a2, i64 0 + %4 = fsub float -0.000000e+00, %3 + %5 = call float @llvm.fma.f32(float %1, float %2, float %4) + %6 = insertelement <4 x float> %a0, float %5, i64 0 + ret <4 x float> %6 } define <4 x float> @test_x86_fma_vfmsub_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { @@ -259,10 +274,14 @@ define <4 x float> @test_x86_fma_vfmsub_bac_ss(<4 x float> %a0, <4 x float> %a1, ; CHECK-FMA-WIN-NEXT: vfmsub132ss (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9b,0x01] ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm0 * mem) - xmm1 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2) - ret <4 x float> %res + %1 = extractelement <4 x float> %a1, i64 0 + %2 = extractelement <4 x float> %a0, i64 0 + %3 = extractelement <4 x float> %a2, i64 0 + %4 = fsub float -0.000000e+00, %3 + %5 = call float @llvm.fma.f32(float %1, float %2, float %4) + %6 = insertelement <4 x float> %a1, float %5, i64 0 + ret <4 x float> %6 } -declare <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>) define <2 x double> @test_x86_fma_vfmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmsub_sd: @@ -285,8 +304,13 @@ define <2 x double> @test_x86_fma_vfmsub_sd(<2 x double> %a0, <2 x double> %a1, ; CHECK-FMA-WIN-NEXT: vfmsub132sd (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9b,0x02] ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm0 * mem) - xmm1 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) - ret <2 x double> %res + %1 = extractelement <2 x double> %a0, i64 0 + %2 = extractelement <2 x double> %a1, i64 0 
+ %3 = extractelement <2 x double> %a2, i64 0 + %4 = fsub double -0.000000e+00, %3 + %5 = call double @llvm.fma.f64(double %1, double %2, double %4) + %6 = insertelement <2 x double> %a0, double %5, i64 0 + ret <2 x double> %6 } define <2 x double> @test_x86_fma_vfmsub_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { @@ -312,10 +336,14 @@ define <2 x double> @test_x86_fma_vfmsub_bac_sd(<2 x double> %a0, <2 x double> % ; CHECK-FMA-WIN-NEXT: vfmsub132sd (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9b,0x01] ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm0 * mem) - xmm1 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2) - ret <2 x double> %res + %1 = extractelement <2 x double> %a1, i64 0 + %2 = extractelement <2 x double> %a0, i64 0 + %3 = extractelement <2 x double> %a2, i64 0 + %4 = fsub double -0.000000e+00, %3 + %5 = call double @llvm.fma.f64(double %1, double %2, double %4) + %6 = insertelement <2 x double> %a1, double %5, i64 0 + ret <2 x double> %6 } -declare <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>) define <4 x float> @test_x86_fma_vfmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ps: @@ -337,10 +365,10 @@ define <4 x float> @test_x86_fma_vfmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 ; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xaa,0x00] ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) - mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) - ret <4 x float> %res + %1 = fsub <4 x float> , %a2 + %2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %1) + ret <4 x float> %2 } -declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>) define <2 x double> @test_x86_fma_vfmsub_pd(<2 
x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmsub_pd: @@ -362,10 +390,10 @@ define <2 x double> @test_x86_fma_vfmsub_pd(<2 x double> %a0, <2 x double> %a1, ; CHECK-FMA-WIN-NEXT: vfmsub213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xaa,0x00] ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) - mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) - ret <2 x double> %res + %1 = fsub <2 x double> , %a2 + %2 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %1) + ret <2 x double> %2 } -declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>) define <8 x float> @test_x86_fma_vfmsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ps_256: @@ -387,10 +415,10 @@ define <8 x float> @test_x86_fma_vfmsub_ps_256(<8 x float> %a0, <8 x float> %a1, ; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xaa,0x00] ; CHECK-FMA-WIN-NEXT: # ymm0 = (ymm1 * ymm0) - mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) - ret <8 x float> %res + %1 = fsub <8 x float> , %a2 + %2 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %a1, <8 x float> %1) + ret <8 x float> %2 } -declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>) define <4 x double> @test_x86_fma_vfmsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmsub_pd_256: @@ -412,10 +440,10 @@ define <4 x double> @test_x86_fma_vfmsub_pd_256(<4 x double> %a0, <4 x double> % ; CHECK-FMA-WIN-NEXT: vfmsub213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xaa,0x00] ; CHECK-FMA-WIN-NEXT: # ymm0 = (ymm1 * ymm0) - mem ; CHECK-FMA-WIN-NEXT: retq # encoding: 
[0xc3] - %res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) - ret <4 x double> %res + %1 = fsub <4 x double> , %a2 + %2 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %1) + ret <4 x double> %2 } -declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>) ; VFNMADD define <4 x float> @test_x86_fma_vfnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { @@ -439,8 +467,13 @@ define <4 x float> @test_x86_fma_vfnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 ; CHECK-FMA-WIN-NEXT: vfnmadd132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9d,0x02] ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm0 * mem) + xmm1 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) - ret <4 x float> %res + %1 = extractelement <4 x float> %a0, i64 0 + %2 = extractelement <4 x float> %a1, i64 0 + %3 = extractelement <4 x float> %a2, i64 0 + %4 = fsub float -0.000000e+00, %2 + %5 = call float @llvm.fma.f32(float %1, float %4, float %3) + %6 = insertelement <4 x float> %a0, float %5, i64 0 + ret <4 x float> %6 } define <4 x float> @test_x86_fma_vfnmadd_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { @@ -466,10 +499,14 @@ define <4 x float> @test_x86_fma_vfnmadd_bac_ss(<4 x float> %a0, <4 x float> %a1 ; CHECK-FMA-WIN-NEXT: vfnmadd132ss (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9d,0x01] ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm0 * mem) + xmm1 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2) - ret <4 x float> %res + %1 = extractelement <4 x float> %a1, i64 0 + %2 = extractelement <4 x float> %a0, i64 0 + %3 = extractelement <4 x float> %a2, i64 0 + %4 = fsub float -0.000000e+00, %2 + %5 = call float @llvm.fma.f32(float %1, float %4, float %3) + %6 = insertelement 
<4 x float> %a1, float %5, i64 0 + ret <4 x float> %6 } -declare <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float>, <4 x float>, <4 x float>) define <2 x double> @test_x86_fma_vfnmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_sd: @@ -492,8 +529,13 @@ define <2 x double> @test_x86_fma_vfnmadd_sd(<2 x double> %a0, <2 x double> %a1, ; CHECK-FMA-WIN-NEXT: vfnmadd132sd (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9d,0x02] ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm0 * mem) + xmm1 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) - ret <2 x double> %res + %1 = extractelement <2 x double> %a0, i64 0 + %2 = extractelement <2 x double> %a1, i64 0 + %3 = extractelement <2 x double> %a2, i64 0 + %4 = fsub double -0.000000e+00, %2 + %5 = call double @llvm.fma.f64(double %1, double %4, double %3) + %6 = insertelement <2 x double> %a0, double %5, i64 0 + ret <2 x double> %6 } define <2 x double> @test_x86_fma_vfnmadd_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { @@ -519,10 +561,14 @@ define <2 x double> @test_x86_fma_vfnmadd_bac_sd(<2 x double> %a0, <2 x double> ; CHECK-FMA-WIN-NEXT: vfnmadd132sd (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9d,0x01] ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm0 * mem) + xmm1 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2) - ret <2 x double> %res + %1 = extractelement <2 x double> %a1, i64 0 + %2 = extractelement <2 x double> %a0, i64 0 + %3 = extractelement <2 x double> %a2, i64 0 + %4 = fsub double -0.000000e+00, %2 + %5 = call double @llvm.fma.f64(double %1, double %4, double %3) + %6 = insertelement <2 x double> %a1, double %5, i64 0 + ret <2 x double> %6 } -declare <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double>, <2 x double>, <2 x double>) define <4 x float> 
@test_x86_fma_vfnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ps: @@ -544,10 +590,10 @@ define <4 x float> @test_x86_fma_vfnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 ; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xac,0x00] ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm1 * xmm0) + mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) - ret <4 x float> %res + %1 = fsub <4 x float> , %a0 + %2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %1, <4 x float> %a1, <4 x float> %a2) + ret <4 x float> %2 } -declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>) define <2 x double> @test_x86_fma_vfnmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_pd: @@ -569,10 +615,10 @@ define <2 x double> @test_x86_fma_vfnmadd_pd(<2 x double> %a0, <2 x double> %a1, ; CHECK-FMA-WIN-NEXT: vfnmadd213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xac,0x00] ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm1 * xmm0) + mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) - ret <2 x double> %res + %1 = fsub <2 x double> , %a0 + %2 = call <2 x double> @llvm.fma.v2f64(<2 x double> %1, <2 x double> %a1, <2 x double> %a2) + ret <2 x double> %2 } -declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>) define <8 x float> @test_x86_fma_vfnmadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ps_256: @@ -594,10 +640,10 @@ define <8 x float> @test_x86_fma_vfnmadd_ps_256(<8 x float> %a0, <8 x float> %a1 ; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xac,0x00] ; CHECK-FMA-WIN-NEXT: # ymm0 = -(ymm1 * ymm0) + mem ; 
CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) - ret <8 x float> %res + %1 = fsub <8 x float> , %a0 + %2 = call <8 x float> @llvm.fma.v8f32(<8 x float> %1, <8 x float> %a1, <8 x float> %a2) + ret <8 x float> %2 } -declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) define <4 x double> @test_x86_fma_vfnmadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_pd_256: @@ -619,10 +665,10 @@ define <4 x double> @test_x86_fma_vfnmadd_pd_256(<4 x double> %a0, <4 x double> ; CHECK-FMA-WIN-NEXT: vfnmadd213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xac,0x00] ; CHECK-FMA-WIN-NEXT: # ymm0 = -(ymm1 * ymm0) + mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) - ret <4 x double> %res + %1 = fsub <4 x double> , %a0 + %2 = call <4 x double> @llvm.fma.v4f64(<4 x double> %1, <4 x double> %a1, <4 x double> %a2) + ret <4 x double> %2 } -declare <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>) ; VFNMSUB define <4 x float> @test_x86_fma_vfnmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { @@ -646,8 +692,14 @@ define <4 x float> @test_x86_fma_vfnmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 ; CHECK-FMA-WIN-NEXT: vfnmsub132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9f,0x02] ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm0 * mem) - xmm1 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) - ret <4 x float> %res + %1 = extractelement <4 x float> %a0, i64 0 + %2 = extractelement <4 x float> %a1, i64 0 + %3 = extractelement <4 x float> %a2, i64 0 + %4 = fsub float -0.000000e+00, %2 + %5 = fsub float -0.000000e+00, %3 + %6 = call float 
@llvm.fma.f32(float %1, float %4, float %5) + %7 = insertelement <4 x float> %a0, float %6, i64 0 + ret <4 x float> %7 } define <4 x float> @test_x86_fma_vfnmsub_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { @@ -673,10 +725,15 @@ define <4 x float> @test_x86_fma_vfnmsub_bac_ss(<4 x float> %a0, <4 x float> %a1 ; CHECK-FMA-WIN-NEXT: vfnmsub132ss (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9f,0x01] ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm0 * mem) - xmm1 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2) - ret <4 x float> %res + %1 = extractelement <4 x float> %a1, i64 0 + %2 = extractelement <4 x float> %a0, i64 0 + %3 = extractelement <4 x float> %a2, i64 0 + %4 = fsub float -0.000000e+00, %2 + %5 = fsub float -0.000000e+00, %3 + %6 = call float @llvm.fma.f32(float %1, float %4, float %5) + %7 = insertelement <4 x float> %a1, float %6, i64 0 + ret <4 x float> %7 } -declare <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>) define <2 x double> @test_x86_fma_vfnmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_sd: @@ -699,8 +756,14 @@ define <2 x double> @test_x86_fma_vfnmsub_sd(<2 x double> %a0, <2 x double> %a1, ; CHECK-FMA-WIN-NEXT: vfnmsub132sd (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9f,0x02] ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm0 * mem) - xmm1 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) - ret <2 x double> %res + %1 = extractelement <2 x double> %a0, i64 0 + %2 = extractelement <2 x double> %a1, i64 0 + %3 = extractelement <2 x double> %a2, i64 0 + %4 = fsub double -0.000000e+00, %2 + %5 = fsub double -0.000000e+00, %3 + %6 = call double @llvm.fma.f64(double %1, double %4, double %5) + %7 = insertelement <2 x double> %a0, double %6, i64 0 + ret <2 x 
double> %7 } define <2 x double> @test_x86_fma_vfnmsub_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { @@ -726,10 +789,15 @@ define <2 x double> @test_x86_fma_vfnmsub_bac_sd(<2 x double> %a0, <2 x double> ; CHECK-FMA-WIN-NEXT: vfnmsub132sd (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9f,0x01] ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm0 * mem) - xmm1 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2) - ret <2 x double> %res + %1 = extractelement <2 x double> %a1, i64 0 + %2 = extractelement <2 x double> %a0, i64 0 + %3 = extractelement <2 x double> %a2, i64 0 + %4 = fsub double -0.000000e+00, %2 + %5 = fsub double -0.000000e+00, %3 + %6 = call double @llvm.fma.f64(double %1, double %4, double %5) + %7 = insertelement <2 x double> %a1, double %6, i64 0 + ret <2 x double> %7 } -declare <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>) define <4 x float> @test_x86_fma_vfnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ps: @@ -751,10 +819,11 @@ define <4 x float> @test_x86_fma_vfnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 ; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xae,0x00] ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm1 * xmm0) - mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) - ret <4 x float> %res + %1 = fsub <4 x float> , %a0 + %2 = fsub <4 x float> , %a2 + %3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %1, <4 x float> %a1, <4 x float> %2) + ret <4 x float> %3 } -declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>) define <2 x double> @test_x86_fma_vfnmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_pd: @@ -776,10 +845,11 @@ define <2 x 
double> @test_x86_fma_vfnmsub_pd(<2 x double> %a0, <2 x double> %a1, ; CHECK-FMA-WIN-NEXT: vfnmsub213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xae,0x00] ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm1 * xmm0) - mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) - ret <2 x double> %res + %1 = fsub <2 x double> , %a0 + %2 = fsub <2 x double> , %a2 + %3 = call <2 x double> @llvm.fma.v2f64(<2 x double> %1, <2 x double> %a1, <2 x double> %2) + ret <2 x double> %3 } -declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>) define <8 x float> @test_x86_fma_vfnmsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ps_256: @@ -801,10 +871,11 @@ define <8 x float> @test_x86_fma_vfnmsub_ps_256(<8 x float> %a0, <8 x float> %a1 ; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xae,0x00] ; CHECK-FMA-WIN-NEXT: # ymm0 = -(ymm1 * ymm0) - mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) - ret <8 x float> %res + %1 = fsub <8 x float> , %a0 + %2 = fsub <8 x float> , %a2 + %3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %1, <8 x float> %a1, <8 x float> %2) + ret <8 x float> %3 } -declare <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>) define <4 x double> @test_x86_fma_vfnmsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_pd_256: @@ -826,10 +897,11 @@ define <4 x double> @test_x86_fma_vfnmsub_pd_256(<4 x double> %a0, <4 x double> ; CHECK-FMA-WIN-NEXT: vfnmsub213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xae,0x00] ; CHECK-FMA-WIN-NEXT: # ymm0 = -(ymm1 * ymm0) - mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = call <4 x double> 
@llvm.x86.fma.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) - ret <4 x double> %res + %1 = fsub <4 x double> , %a0 + %2 = fsub <4 x double> , %a2 + %3 = call <4 x double> @llvm.fma.v4f64(<4 x double> %1, <4 x double> %a1, <4 x double> %2) + ret <4 x double> %3 } -declare <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>) ; VFMADDSUB define <4 x float> @test_x86_fma_vfmaddsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { @@ -852,10 +924,12 @@ define <4 x float> @test_x86_fma_vfmaddsub_ps(<4 x float> %a0, <4 x float> %a1, ; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa6,0x00] ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) +/- mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) - ret <4 x float> %res + %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) + %2 = fsub <4 x float> , %a2 + %3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %2) + %4 = shufflevector <4 x float> %3, <4 x float> %1, <4 x i32> + ret <4 x float> %4 } -declare <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float>, <4 x float>, <4 x float>) define <2 x double> @test_x86_fma_vfmaddsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_pd: @@ -877,10 +951,12 @@ define <2 x double> @test_x86_fma_vfmaddsub_pd(<2 x double> %a0, <2 x double> %a ; CHECK-FMA-WIN-NEXT: vfmaddsub213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa6,0x00] ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) +/- mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) - ret <2 x double> %res + %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) + %2 = fsub <2 x 
double> , %a2 + %3 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %2) + %4 = shufflevector <2 x double> %3, <2 x double> %1, <2 x i32> + ret <2 x double> %4 } -declare <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double>, <2 x double>, <2 x double>) define <8 x float> @test_x86_fma_vfmaddsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_ps_256: @@ -902,10 +978,12 @@ define <8 x float> @test_x86_fma_vfmaddsub_ps_256(<8 x float> %a0, <8 x float> % ; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa6,0x00] ; CHECK-FMA-WIN-NEXT: # ymm0 = (ymm1 * ymm0) +/- mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = call <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) - ret <8 x float> %res + %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) + %2 = fsub <8 x float> , %a2 + %3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %a1, <8 x float> %2) + %4 = shufflevector <8 x float> %3, <8 x float> %1, <8 x i32> + ret <8 x float> %4 } -declare <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>) define <4 x double> @test_x86_fma_vfmaddsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_pd_256: @@ -927,10 +1005,12 @@ define <4 x double> @test_x86_fma_vfmaddsub_pd_256(<4 x double> %a0, <4 x double ; CHECK-FMA-WIN-NEXT: vfmaddsub213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xa6,0x00] ; CHECK-FMA-WIN-NEXT: # ymm0 = (ymm1 * ymm0) +/- mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) - ret <4 x double> %res + %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) + %2 = fsub <4 x double> , %a2 + %3 = call <4 x 
double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %2) + %4 = shufflevector <4 x double> %3, <4 x double> %1, <4 x i32> + ret <4 x double> %4 } -declare <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>) ; VFMSUBADD define <4 x float> @test_x86_fma_vfmsubadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { @@ -953,10 +1033,12 @@ define <4 x float> @test_x86_fma_vfmsubadd_ps(<4 x float> %a0, <4 x float> %a1, ; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa7,0x00] ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) -/+ mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = call <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) - ret <4 x float> %res + %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) + %2 = fsub <4 x float> , %a2 + %3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %2) + %4 = shufflevector <4 x float> %1, <4 x float> %3, <4 x i32> + ret <4 x float> %4 } -declare <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float>, <4 x float>, <4 x float>) define <2 x double> @test_x86_fma_vfmsubadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_pd: @@ -978,10 +1060,12 @@ define <2 x double> @test_x86_fma_vfmsubadd_pd(<2 x double> %a0, <2 x double> %a ; CHECK-FMA-WIN-NEXT: vfmsubadd213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa7,0x00] ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) -/+ mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = call <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) - ret <2 x double> %res + %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) + %2 = fsub <2 x double> , %a2 + %3 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %2) + %4 = 
shufflevector <2 x double> %1, <2 x double> %3, <2 x i32> + ret <2 x double> %4 } -declare <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double>, <2 x double>, <2 x double>) define <8 x float> @test_x86_fma_vfmsubadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_ps_256: @@ -1003,10 +1087,12 @@ define <8 x float> @test_x86_fma_vfmsubadd_ps_256(<8 x float> %a0, <8 x float> % ; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa7,0x00] ; CHECK-FMA-WIN-NEXT: # ymm0 = (ymm1 * ymm0) -/+ mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = call <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) - ret <8 x float> %res + %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) + %2 = fsub <8 x float> , %a2 + %3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %a1, <8 x float> %2) + %4 = shufflevector <8 x float> %1, <8 x float> %3, <8 x i32> + ret <8 x float> %4 } -declare <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>) define <4 x double> @test_x86_fma_vfmsubadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_pd_256: @@ -1028,9 +1114,18 @@ define <4 x double> @test_x86_fma_vfmsubadd_pd_256(<4 x double> %a0, <4 x double ; CHECK-FMA-WIN-NEXT: vfmsubadd213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xa7,0x00] ; CHECK-FMA-WIN-NEXT: # ymm0 = (ymm1 * ymm0) -/+ mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] - %res = call <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) - ret <4 x double> %res + %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) + %2 = fsub <4 x double> , %a2 + %3 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %2) + %4 = shufflevector <4 x double> 
%1, <4 x double> %3, <4 x i32> + ret <4 x double> %4 } -declare <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>) + +declare float @llvm.fma.f32(float, float, float) +declare double @llvm.fma.f64(double, double, double) +declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) +declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) +declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>) +declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) attributes #0 = { nounwind } diff --git a/test/CodeGen/X86/fma4-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/fma4-intrinsics-x86-upgrade.ll new file mode 100644 index 00000000000..1c9c5ba749a --- /dev/null +++ b/test/CodeGen/X86/fma4-intrinsics-x86-upgrade.ll @@ -0,0 +1,250 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+fma4,-fma -show-mc-encoding | FileCheck %s --check-prefix=CHECK +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+fma4,+fma -show-mc-encoding | FileCheck %s --check-prefix=CHECK + +define <4 x float> @test_x86_fma_vfmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { +; CHECK-LABEL: test_x86_fma_vfmadd_ps: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmaddps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x68,0xc2,0x10] +; CHECK-NEXT: retq # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>) + +define <2 x double> @test_x86_fma_vfmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { +; CHECK-LABEL: test_x86_fma_vfmadd_pd: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmaddpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x69,0xc2,0x10] +; CHECK-NEXT: retq # encoding: [0xc3] + %res = call <2 x double> 
@llvm.x86.fma.vfmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) + ret <2 x double> %res +} +declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>) + +define <8 x float> @test_x86_fma_vfmadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 { +; CHECK-LABEL: test_x86_fma_vfmadd_ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmaddps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x68,0xc2,0x10] +; CHECK-NEXT: retq # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) + ret <8 x float> %res +} +declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) + +define <4 x double> @test_x86_fma_vfmadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { +; CHECK-LABEL: test_x86_fma_vfmadd_pd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x69,0xc2,0x10] +; CHECK-NEXT: retq # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) + ret <4 x double> %res +} +declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>) + +; VFMSUB +define <4 x float> @test_x86_fma_vfmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { +; CHECK-LABEL: test_x86_fma_vfmsub_ps: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x6c,0xc2,0x10] +; CHECK-NEXT: retq # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>) + +define <2 x double> @test_x86_fma_vfmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { +; CHECK-LABEL: test_x86_fma_vfmsub_pd: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: 
[0xc4,0xe3,0xf9,0x6d,0xc2,0x10] +; CHECK-NEXT: retq # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) + ret <2 x double> %res +} +declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>) + +define <8 x float> @test_x86_fma_vfmsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 { +; CHECK-LABEL: test_x86_fma_vfmsub_ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmsubps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x6c,0xc2,0x10] +; CHECK-NEXT: retq # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) + ret <8 x float> %res +} +declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>) + +define <4 x double> @test_x86_fma_vfmsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { +; CHECK-LABEL: test_x86_fma_vfmsub_pd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x6d,0xc2,0x10] +; CHECK-NEXT: retq # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) + ret <4 x double> %res +} +declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>) + +; VFNMADD +define <4 x float> @test_x86_fma_vfnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { +; CHECK-LABEL: test_x86_fma_vfnmadd_ps: +; CHECK: # %bb.0: +; CHECK-NEXT: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x78,0xc2,0x10] +; CHECK-NEXT: retq # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>) + +define <2 x double> @test_x86_fma_vfnmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { +; CHECK-LABEL: 
test_x86_fma_vfnmadd_pd: +; CHECK: # %bb.0: +; CHECK-NEXT: vfnmaddpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x79,0xc2,0x10] +; CHECK-NEXT: retq # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) + ret <2 x double> %res +} +declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>) + +define <8 x float> @test_x86_fma_vfnmadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 { +; CHECK-LABEL: test_x86_fma_vfnmadd_ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x78,0xc2,0x10] +; CHECK-NEXT: retq # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) + ret <8 x float> %res +} +declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) + +define <4 x double> @test_x86_fma_vfnmadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { +; CHECK-LABEL: test_x86_fma_vfnmadd_pd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x79,0xc2,0x10] +; CHECK-NEXT: retq # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) + ret <4 x double> %res +} +declare <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>) + +; VFNMSUB +define <4 x float> @test_x86_fma_vfnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { +; CHECK-LABEL: test_x86_fma_vfnmsub_ps: +; CHECK: # %bb.0: +; CHECK-NEXT: vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x7c,0xc2,0x10] +; CHECK-NEXT: retq # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>) + +define 
<2 x double> @test_x86_fma_vfnmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { +; CHECK-LABEL: test_x86_fma_vfnmsub_pd: +; CHECK: # %bb.0: +; CHECK-NEXT: vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x7d,0xc2,0x10] +; CHECK-NEXT: retq # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) + ret <2 x double> %res +} +declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>) + +define <8 x float> @test_x86_fma_vfnmsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 { +; CHECK-LABEL: test_x86_fma_vfnmsub_ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vfnmsubps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x7c,0xc2,0x10] +; CHECK-NEXT: retq # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) + ret <8 x float> %res +} +declare <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>) + +define <4 x double> @test_x86_fma_vfnmsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { +; CHECK-LABEL: test_x86_fma_vfnmsub_pd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x7d,0xc2,0x10] +; CHECK-NEXT: retq # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) + ret <4 x double> %res +} +declare <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>) + +; VFMADDSUB +define <4 x float> @test_x86_fma_vfmaddsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { +; CHECK-LABEL: test_x86_fma_vfmaddsub_ps: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x5c,0xc2,0x10] +; CHECK-NEXT: retq # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> 
%a2) + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float>, <4 x float>, <4 x float>) + +define <2 x double> @test_x86_fma_vfmaddsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { +; CHECK-LABEL: test_x86_fma_vfmaddsub_pd: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x5d,0xc2,0x10] +; CHECK-NEXT: retq # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) + ret <2 x double> %res +} +declare <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double>, <2 x double>, <2 x double>) + +define <8 x float> @test_x86_fma_vfmaddsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 { +; CHECK-LABEL: test_x86_fma_vfmaddsub_ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x5c,0xc2,0x10] +; CHECK-NEXT: retq # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) + ret <8 x float> %res +} +declare <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>) + +define <4 x double> @test_x86_fma_vfmaddsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { +; CHECK-LABEL: test_x86_fma_vfmaddsub_pd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x5d,0xc2,0x10] +; CHECK-NEXT: retq # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) + ret <4 x double> %res +} +declare <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>) + +; VFMSUBADD +define <4 x float> @test_x86_fma_vfmsubadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { +; CHECK-LABEL: test_x86_fma_vfmsubadd_ps: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: 
[0xc4,0xe3,0xf9,0x5e,0xc2,0x10] +; CHECK-NEXT: retq # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float>, <4 x float>, <4 x float>) + +define <2 x double> @test_x86_fma_vfmsubadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { +; CHECK-LABEL: test_x86_fma_vfmsubadd_pd: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x5f,0xc2,0x10] +; CHECK-NEXT: retq # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) + ret <2 x double> %res +} +declare <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double>, <2 x double>, <2 x double>) + +define <8 x float> @test_x86_fma_vfmsubadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 { +; CHECK-LABEL: test_x86_fma_vfmsubadd_ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x5e,0xc2,0x10] +; CHECK-NEXT: retq # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) + ret <8 x float> %res +} +declare <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>) + +define <4 x double> @test_x86_fma_vfmsubadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { +; CHECK-LABEL: test_x86_fma_vfmsubadd_pd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x5f,0xc2,0x10] +; CHECK-NEXT: retq # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) + ret <4 x double> %res +} +declare <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>) + +attributes #0 = { nounwind } diff --git a/test/CodeGen/X86/fma4-intrinsics-x86.ll 
b/test/CodeGen/X86/fma4-intrinsics-x86.ll index ee6a7ec1b55..96e285bc25f 100644 --- a/test/CodeGen/X86/fma4-intrinsics-x86.ll +++ b/test/CodeGen/X86/fma4-intrinsics-x86.ll @@ -46,40 +46,36 @@ define <4 x float> @test_x86_fma_vfmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 ; CHECK: # %bb.0: ; CHECK-NEXT: vfmaddps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x68,0xc2,0x10] ; CHECK-NEXT: retq # encoding: [0xc3] - %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) - ret <4 x float> %res + %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) + ret <4 x float> %1 } -declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>) define <2 x double> @test_x86_fma_vfmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; CHECK-LABEL: test_x86_fma_vfmadd_pd: ; CHECK: # %bb.0: ; CHECK-NEXT: vfmaddpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x69,0xc2,0x10] ; CHECK-NEXT: retq # encoding: [0xc3] - %res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) - ret <2 x double> %res + %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) + ret <2 x double> %1 } -declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>) define <8 x float> @test_x86_fma_vfmadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 { ; CHECK-LABEL: test_x86_fma_vfmadd_ps_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vfmaddps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x68,0xc2,0x10] ; CHECK-NEXT: retq # encoding: [0xc3] - %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) - ret <8 x float> %res + %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) + ret <8 x float> %1 } -declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) 
define <4 x double> @test_x86_fma_vfmadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { ; CHECK-LABEL: test_x86_fma_vfmadd_pd_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x69,0xc2,0x10] ; CHECK-NEXT: retq # encoding: [0xc3] - %res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) - ret <4 x double> %res + %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) + ret <4 x double> %1 } -declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>) ; VFMSUB define <4 x float> @test_x86_fma_vfmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { @@ -87,40 +83,40 @@ define <4 x float> @test_x86_fma_vfmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 ; CHECK: # %bb.0: ; CHECK-NEXT: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x6c,0xc2,0x10] ; CHECK-NEXT: retq # encoding: [0xc3] - %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) - ret <4 x float> %res + %1 = fsub <4 x float> , %a2 + %2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %1) + ret <4 x float> %2 } -declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>) define <2 x double> @test_x86_fma_vfmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; CHECK-LABEL: test_x86_fma_vfmsub_pd: ; CHECK: # %bb.0: ; CHECK-NEXT: vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x6d,0xc2,0x10] ; CHECK-NEXT: retq # encoding: [0xc3] - %res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) - ret <2 x double> %res + %1 = fsub <2 x double> , %a2 + %2 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %1) + ret <2 x double> %2 } -declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 
x double>) define <8 x float> @test_x86_fma_vfmsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 { ; CHECK-LABEL: test_x86_fma_vfmsub_ps_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vfmsubps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x6c,0xc2,0x10] ; CHECK-NEXT: retq # encoding: [0xc3] - %res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) - ret <8 x float> %res + %1 = fsub <8 x float> , %a2 + %2 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %a1, <8 x float> %1) + ret <8 x float> %2 } -declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>) define <4 x double> @test_x86_fma_vfmsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { ; CHECK-LABEL: test_x86_fma_vfmsub_pd_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x6d,0xc2,0x10] ; CHECK-NEXT: retq # encoding: [0xc3] - %res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) - ret <4 x double> %res + %1 = fsub <4 x double> , %a2 + %2 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %1) + ret <4 x double> %2 } -declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>) ; VFNMADD define <4 x float> @test_x86_fma_vfnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { @@ -128,40 +124,40 @@ define <4 x float> @test_x86_fma_vfnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 ; CHECK: # %bb.0: ; CHECK-NEXT: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x78,0xc2,0x10] ; CHECK-NEXT: retq # encoding: [0xc3] - %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) - ret <4 x float> %res + %1 = fsub <4 x float> , %a0 + %2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %1, <4 x float> %a1, <4 x float> %a2) + ret <4 x float> %2 } -declare <4 x float> 
@llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>) define <2 x double> @test_x86_fma_vfnmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; CHECK-LABEL: test_x86_fma_vfnmadd_pd: ; CHECK: # %bb.0: ; CHECK-NEXT: vfnmaddpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x79,0xc2,0x10] ; CHECK-NEXT: retq # encoding: [0xc3] - %res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) - ret <2 x double> %res + %1 = fsub <2 x double> , %a0 + %2 = call <2 x double> @llvm.fma.v2f64(<2 x double> %1, <2 x double> %a1, <2 x double> %a2) + ret <2 x double> %2 } -declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>) define <8 x float> @test_x86_fma_vfnmadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 { ; CHECK-LABEL: test_x86_fma_vfnmadd_ps_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x78,0xc2,0x10] ; CHECK-NEXT: retq # encoding: [0xc3] - %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) - ret <8 x float> %res + %1 = fsub <8 x float> , %a0 + %2 = call <8 x float> @llvm.fma.v8f32(<8 x float> %1, <8 x float> %a1, <8 x float> %a2) + ret <8 x float> %2 } -declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) define <4 x double> @test_x86_fma_vfnmadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { ; CHECK-LABEL: test_x86_fma_vfnmadd_pd_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x79,0xc2,0x10] ; CHECK-NEXT: retq # encoding: [0xc3] - %res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) - ret <4 x double> %res + %1 = fsub <4 x double> , %a0 + %2 = call <4 x double> @llvm.fma.v4f64(<4 x double> %1, <4 x double> %a1, <4 x double> %a2) + ret <4 x double> %2 } -declare <4 x double> 
@llvm.x86.fma.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>) ; VFNMSUB define <4 x float> @test_x86_fma_vfnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { @@ -169,40 +165,44 @@ define <4 x float> @test_x86_fma_vfnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 ; CHECK: # %bb.0: ; CHECK-NEXT: vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x7c,0xc2,0x10] ; CHECK-NEXT: retq # encoding: [0xc3] - %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) - ret <4 x float> %res + %1 = fsub <4 x float> , %a0 + %2 = fsub <4 x float> , %a2 + %3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %1, <4 x float> %a1, <4 x float> %2) + ret <4 x float> %3 } -declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>) define <2 x double> @test_x86_fma_vfnmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; CHECK-LABEL: test_x86_fma_vfnmsub_pd: ; CHECK: # %bb.0: ; CHECK-NEXT: vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x7d,0xc2,0x10] ; CHECK-NEXT: retq # encoding: [0xc3] - %res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) - ret <2 x double> %res + %1 = fsub <2 x double> , %a0 + %2 = fsub <2 x double> , %a2 + %3 = call <2 x double> @llvm.fma.v2f64(<2 x double> %1, <2 x double> %a1, <2 x double> %2) + ret <2 x double> %3 } -declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>) define <8 x float> @test_x86_fma_vfnmsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 { ; CHECK-LABEL: test_x86_fma_vfnmsub_ps_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vfnmsubps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x7c,0xc2,0x10] ; CHECK-NEXT: retq # encoding: [0xc3] - %res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) - ret <8 x float> %res + %1 = fsub <8 x float> , %a0 + %2 = fsub <8 x float> , %a2 + 
%3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %1, <8 x float> %a1, <8 x float> %2) + ret <8 x float> %3 } -declare <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>) define <4 x double> @test_x86_fma_vfnmsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { ; CHECK-LABEL: test_x86_fma_vfnmsub_pd_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x7d,0xc2,0x10] ; CHECK-NEXT: retq # encoding: [0xc3] - %res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) - ret <4 x double> %res + %1 = fsub <4 x double> , %a0 + %2 = fsub <4 x double> , %a2 + %3 = call <4 x double> @llvm.fma.v4f64(<4 x double> %1, <4 x double> %a1, <4 x double> %2) + ret <4 x double> %3 } -declare <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>) ; VFMADDSUB define <4 x float> @test_x86_fma_vfmaddsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { @@ -210,40 +210,48 @@ define <4 x float> @test_x86_fma_vfmaddsub_ps(<4 x float> %a0, <4 x float> %a1, ; CHECK: # %bb.0: ; CHECK-NEXT: vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x5c,0xc2,0x10] ; CHECK-NEXT: retq # encoding: [0xc3] - %res = call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) - ret <4 x float> %res + %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) + %2 = fsub <4 x float> , %a2 + %3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %2) + %4 = shufflevector <4 x float> %3, <4 x float> %1, <4 x i32> + ret <4 x float> %4 } -declare <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float>, <4 x float>, <4 x float>) define <2 x double> @test_x86_fma_vfmaddsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; CHECK-LABEL: test_x86_fma_vfmaddsub_pd: ; CHECK: # %bb.0: ; CHECK-NEXT: vfmaddsubpd %xmm2, %xmm1, 
%xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x5d,0xc2,0x10] ; CHECK-NEXT: retq # encoding: [0xc3] - %res = call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) - ret <2 x double> %res + %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) + %2 = fsub <2 x double> , %a2 + %3 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %2) + %4 = shufflevector <2 x double> %3, <2 x double> %1, <2 x i32> + ret <2 x double> %4 } -declare <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double>, <2 x double>, <2 x double>) define <8 x float> @test_x86_fma_vfmaddsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 { ; CHECK-LABEL: test_x86_fma_vfmaddsub_ps_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x5c,0xc2,0x10] ; CHECK-NEXT: retq # encoding: [0xc3] - %res = call <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) - ret <8 x float> %res + %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) + %2 = fsub <8 x float> , %a2 + %3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %a1, <8 x float> %2) + %4 = shufflevector <8 x float> %3, <8 x float> %1, <8 x i32> + ret <8 x float> %4 } -declare <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>) define <4 x double> @test_x86_fma_vfmaddsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { ; CHECK-LABEL: test_x86_fma_vfmaddsub_pd_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x5d,0xc2,0x10] ; CHECK-NEXT: retq # encoding: [0xc3] - %res = call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) - ret <4 x double> %res + %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) + %2 = 
fsub <4 x double> , %a2 + %3 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %2) + %4 = shufflevector <4 x double> %3, <4 x double> %1, <4 x i32> + ret <4 x double> %4 } -declare <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>) ; VFMSUBADD define <4 x float> @test_x86_fma_vfmsubadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { @@ -251,39 +259,52 @@ define <4 x float> @test_x86_fma_vfmsubadd_ps(<4 x float> %a0, <4 x float> %a1, ; CHECK: # %bb.0: ; CHECK-NEXT: vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x5e,0xc2,0x10] ; CHECK-NEXT: retq # encoding: [0xc3] - %res = call <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) - ret <4 x float> %res + %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) + %2 = fsub <4 x float> , %a2 + %3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %2) + %4 = shufflevector <4 x float> %1, <4 x float> %3, <4 x i32> + ret <4 x float> %4 } -declare <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float>, <4 x float>, <4 x float>) define <2 x double> @test_x86_fma_vfmsubadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; CHECK-LABEL: test_x86_fma_vfmsubadd_pd: ; CHECK: # %bb.0: ; CHECK-NEXT: vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x5f,0xc2,0x10] ; CHECK-NEXT: retq # encoding: [0xc3] - %res = call <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) - ret <2 x double> %res + %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) + %2 = fsub <2 x double> , %a2 + %3 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %2) + %4 = shufflevector <2 x double> %1, <2 x double> %3, <2 x i32> + ret <2 x double> %4 } -declare <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double>, <2 x 
double>, <2 x double>) define <8 x float> @test_x86_fma_vfmsubadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 { ; CHECK-LABEL: test_x86_fma_vfmsubadd_ps_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x5e,0xc2,0x10] ; CHECK-NEXT: retq # encoding: [0xc3] - %res = call <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) - ret <8 x float> %res + %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) + %2 = fsub <8 x float> , %a2 + %3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %a1, <8 x float> %2) + %4 = shufflevector <8 x float> %1, <8 x float> %3, <8 x i32> + ret <8 x float> %4 } -declare <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>) define <4 x double> @test_x86_fma_vfmsubadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { ; CHECK-LABEL: test_x86_fma_vfmsubadd_pd_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x5f,0xc2,0x10] ; CHECK-NEXT: retq # encoding: [0xc3] - %res = call <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) - ret <4 x double> %res + %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) + %2 = fsub <4 x double> , %a2 + %3 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %2) + %4 = shufflevector <4 x double> %1, <4 x double> %3, <4 x i32> + ret <4 x double> %4 } -declare <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>) + +declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #2 +declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) #2 +declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>) #2 +declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x 
double>, <4 x double>) #2 attributes #0 = { nounwind }