[TLI][AArch64] Add mappings to vectorized functions from ArmPL

Arm Performance Libraries include a math library which provides
vectorized versions of common math functions.
This patch allows it to be used from clang and llvm via -fveclib=ArmPL or
-vector-library=ArmPL, so that loops containing such calls can be vectorized.
The executable needs to be linked against the amath library.
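As an illustration only (not part of this patch), end-to-end use could look
roughly like the sketch below; the file name and exact driver invocations are
hypothetical, assuming an AArch64 Linux target with ArmPL installed:

// cos_loop.cpp - hypothetical example of using -fveclib=ArmPL.
// Compile: clang++ -O2 --target=aarch64-linux-gnu -fveclib=ArmPL -c cos_loop.cpp
//          (or drive the middle end directly with
//           opt -vector-library=ArmPL -passes=inject-tli-mappings,loop-vectorize)
// Link:    clang++ cos_loop.o -lamath -lm -o cos_loop
#include <cmath>

void apply_cos(const double *in, double *out, int n) {
  for (int i = 0; i < n; ++i)
    out[i] = std::cos(in[i]); // candidate for armpl_vcosq_f64 (NEON) or armpl_svcos_f64_x (SVE)
}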

Arm Performance Libraries are available at:
https://developer.arm.com/Tools%20and%20Software/Arm%20Performance%20Libraries

Reviewed by: paulwalker-arm
Differential Revision: https://reviews.llvm.org/D154508
Maciej Gabka 2023-07-04 08:43:07 +00:00
parent 221c5fb0a4
commit 5b0e19a7ab
12 changed files with 2908 additions and 12 deletions


@@ -55,13 +55,14 @@ public:
};
enum VectorLibrary {
NoLibrary, // Don't use any vector library.
Accelerate, // Use the Accelerate framework.
LIBMVEC, // GLIBC vector math library.
MASSV, // IBM MASS vector library.
SVML, // Intel short vector math library.
SLEEF, // SLEEF SIMD Library for Evaluating Elementary Functions.
Darwin_libsystem_m // Use Darwin's libsytem_m vector functions.
NoLibrary, // Don't use any vector library.
Accelerate, // Use the Accelerate framework.
LIBMVEC, // GLIBC vector math library.
MASSV, // IBM MASS vector library.
SVML, // Intel short vector math library.
SLEEF, // SLEEF SIMD Library for Evaluating Elementary Functions.
Darwin_libsystem_m, // Use Darwin's libsytem_m vector functions.
ArmPL // Arm Performance Libraries.
};
enum ObjCDispatchMethodKind {


@@ -2621,10 +2621,10 @@ def fno_experimental_isel : Flag<["-"], "fno-experimental-isel">, Group<f_clang_
Alias<fno_global_isel>;
def fveclib : Joined<["-"], "fveclib=">, Group<f_Group>, Flags<[CC1Option]>,
HelpText<"Use the given vector functions library">,
Values<"Accelerate,libmvec,MASSV,SVML,SLEEF,Darwin_libsystem_m,none">,
Values<"Accelerate,libmvec,MASSV,SVML,SLEEF,Darwin_libsystem_m,ArmPL,none">,
NormalizedValuesScope<"CodeGenOptions">,
NormalizedValues<["Accelerate", "LIBMVEC", "MASSV", "SVML", "SLEEF",
"Darwin_libsystem_m", "NoLibrary"]>,
"Darwin_libsystem_m", "ArmPL", "NoLibrary"]>,
MarshallingInfoEnum<CodeGenOpts<"VecLib">, "NoLibrary">;
def fno_lax_vector_conversions : Flag<["-"], "fno-lax-vector-conversions">, Group<f_Group>,
Alias<flax_vector_conversions_EQ>, AliasArgs<["none"]>;


@@ -284,6 +284,10 @@ static TargetLibraryInfoImpl *createTLII(llvm::Triple &TargetTriple,
TLII->addVectorizableFunctionsFromVecLib(
TargetLibraryInfoImpl::DarwinLibSystemM, TargetTriple);
break;
case CodeGenOptions::ArmPL:
TLII->addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::ArmPL,
TargetTriple);
break;
default:
break;
}
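For reference, a tool embedding LLVM can register the same mappings
programmatically through the two-argument addVectorizableFunctionsFromVecLib
overload used above; a minimal sketch (the triple and header path are
assumptions, not part of the patch):

#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/TargetParser/Triple.h"

// Makes the armpl_* fixed-width and scalable mappings visible to the
// vectorizer via TLI, mirroring -fveclib=ArmPL (clang) and
// -vector-library=ArmPL (opt).
llvm::TargetLibraryInfoImpl makeArmPLTLII() {
  llvm::Triple TT("aarch64-unknown-linux-gnu");
  llvm::TargetLibraryInfoImpl TLII(TT);
  TLII.addVectorizableFunctionsFromVecLib(llvm::TargetLibraryInfoImpl::ArmPL, TT);
  return TLII;
}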


@@ -5334,7 +5334,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
Triple.getArch() != llvm::Triple::x86_64)
D.Diag(diag::err_drv_unsupported_opt_for_target)
<< Name << Triple.getArchName();
} else if (Name == "SLEEF") {
} else if (Name == "SLEEF" || Name == "ArmPL") {
if (Triple.getArch() != llvm::Triple::aarch64 &&
Triple.getArch() != llvm::Triple::aarch64_be)
D.Diag(diag::err_drv_unsupported_opt_for_target)


@@ -80,6 +80,7 @@
// FLTOALL-NEXT: thin
// RUN: %clang --autocomplete=-fveclib= | FileCheck %s -check-prefix=FVECLIBALL
// FVECLIBALL: Accelerate
// FVECLIBALL-NEXT: ArmPL
// FVECLIBALL-NEXT: Darwin_libsystem_m
// FVECLIBALL-NEXT: libmvec
// FVECLIBALL-NEXT: MASSV


@@ -4,6 +4,7 @@
// RUN: %clang -### -c -fveclib=MASSV %s 2>&1 | FileCheck -check-prefix CHECK-MASSV %s
// RUN: %clang -### -c -fveclib=Darwin_libsystem_m %s 2>&1 | FileCheck -check-prefix CHECK-DARWIN_LIBSYSTEM_M %s
// RUN: %clang -### -c --target=aarch64-none-none -fveclib=SLEEF %s 2>&1 | FileCheck -check-prefix CHECK-SLEEF %s
// RUN: %clang -### -c --target=aarch64-none-none -fveclib=ArmPL %s 2>&1 | FileCheck -check-prefix CHECK-ARMPL %s
// RUN: not %clang -c -fveclib=something %s 2>&1 | FileCheck -check-prefix CHECK-INVALID %s
// CHECK-NOLIB: "-fveclib=none"
@@ -12,10 +13,12 @@
// CHECK-MASSV: "-fveclib=MASSV"
// CHECK-DARWIN_LIBSYSTEM_M: "-fveclib=Darwin_libsystem_m"
// CHECK-SLEEF: "-fveclib=SLEEF"
// CHECK-ARMPL: "-fveclib=ArmPL"
// CHECK-INVALID: error: invalid value 'something' in '-fveclib=something'
// RUN: not %clang --target=x86-none-none -c -fveclib=SLEEF %s 2>&1 | FileCheck -check-prefix CHECK-ERROR %s
// RUN: not %clang --target=x86-none-none -c -fveclib=ArmPL %s 2>&1 | FileCheck -check-prefix CHECK-ERROR %s
// RUN: not %clang --target=aarch64-none-none -c -fveclib=LIBMVEC-X86 %s 2>&1 | FileCheck -check-prefix CHECK-ERROR %s
// RUN: not %clang --target=aarch64-none-none -c -fveclib=SVML %s 2>&1 | FileCheck -check-prefix CHECK-ERROR %s
// CHECK-ERROR: unsupported option {{.*}} for target


@@ -96,7 +96,8 @@ public:
LIBMVEC_X86, // GLIBC Vector Math library.
MASSV, // IBM MASS vector library.
SVML, // Intel short vector math library.
SLEEFGNUABI // SLEEF - SIMD Library for Evaluating Elementary Functions.
SLEEFGNUABI, // SLEEF - SIMD Library for Evaluating Elementary Functions.
ArmPL // Arm Performance Libraries.
};
TargetLibraryInfoImpl();


@@ -682,6 +682,228 @@ TLI_DEFINE_VECFUNC("tanhf", "_ZGVsMxv_tanhf", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("tgamma", "_ZGVsMxv_tgamma", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("tgammaf", "_ZGVsMxv_tgammaf", SCALABLE(4), MASKED)
#elif defined(TLI_DEFINE_ARMPL_VECFUNCS)
TLI_DEFINE_VECFUNC("acos", "armpl_vacosq_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("acosf", "armpl_vacosq_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("acos", "armpl_svacos_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("acosf", "armpl_svacos_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("acosh", "armpl_vacoshq_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("acoshf", "armpl_vacoshq_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("acosh", "armpl_svacosh_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("acoshf", "armpl_svacosh_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("asin", "armpl_vasinq_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("asinf", "armpl_vasinq_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("asin", "armpl_svasin_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("asinf", "armpl_svasin_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("asinh", "armpl_vasinhq_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("asinhf", "armpl_vasinhq_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("asinh", "armpl_svasinh_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("asinhf", "armpl_svasinh_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("atan", "armpl_vatanq_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("atanf", "armpl_vatanq_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("atan", "armpl_svatan_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("atanf", "armpl_svatan_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("atan2", "armpl_vatan2q_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("atan2f", "armpl_vatan2q_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("atan2", "armpl_svatan2_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("atan2f", "armpl_svatan2_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("atanh", "armpl_vatanhq_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("atanhf", "armpl_vatanhq_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("atanh", "armpl_svatanh_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("atanhf", "armpl_svatanh_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("cbrt", "armpl_vcbrtq_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("cbrtf", "armpl_vcbrtq_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("cbrt", "armpl_svcbrt_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("cbrtf", "armpl_svcbrt_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("copysign", "armpl_vcopysignq_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("copysignf", "armpl_vcopysignq_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("copysign", "armpl_svcopysign_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("copysignf", "armpl_svcopysign_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("cos", "armpl_vcosq_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("cosf", "armpl_vcosq_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("cos", "armpl_svcos_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("cosf", "armpl_svcos_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("llvm.cos.f64", "armpl_vcosq_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("llvm.cos.f32", "armpl_vcosq_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("llvm.cos.f64", "armpl_svcos_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("llvm.cos.f32", "armpl_svcos_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("cosh", "armpl_vcoshq_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("coshf", "armpl_vcoshq_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("cosh", "armpl_svcosh_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("coshf", "armpl_svcosh_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("erf", "armpl_verfq_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("erff", "armpl_verfq_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("erf", "armpl_sverf_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("erff", "armpl_sverf_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("erfc", "armpl_verfcq_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("erfcf", "armpl_verfcq_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("erfc", "armpl_sverfc_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("erfcf", "armpl_sverfc_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("exp", "armpl_vexpq_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("expf", "armpl_vexpq_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("exp", "armpl_svexp_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("expf", "armpl_svexp_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("llvm.exp.f64", "armpl_vexpq_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("llvm.exp.f32", "armpl_vexpq_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("llvm.exp.f64", "armpl_svexp_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("llvm.exp.f32", "armpl_svexp_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("exp2", "armpl_vexp2q_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("exp2f", "armpl_vexp2q_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("exp2", "armpl_svexp2_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("exp2f", "armpl_svexp2_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("llvm.exp2.f64", "armpl_vexp2q_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("llvm.exp2.f32", "armpl_vexp2q_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("llvm.exp2.f64", "armpl_svexp2_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("llvm.exp2.f32", "armpl_svexp2_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("exp10", "armpl_vexp10q_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("exp10f", "armpl_vexp10q_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("exp10", "armpl_svexp10_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("exp10f", "armpl_svexp10_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("expm1", "armpl_vexpm1q_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("expm1f", "armpl_vexpm1q_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("expm1", "armpl_svexpm1_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("expm1f", "armpl_svexpm1_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("fdim", "armpl_vfdimq_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("fdimf", "armpl_vfdimq_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("fdim", "armpl_svfdim_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("fdimf", "armpl_svfdim_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("fma", "armpl_vfmaq_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("fmaf", "armpl_vfmaq_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("fma", "armpl_svfma_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("fmaf", "armpl_svfma_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("fmin", "armpl_vfminq_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("fminf", "armpl_vfminq_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("fmin", "armpl_svfmin_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("fminf", "armpl_svfmin_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("fmod", "armpl_vfmodq_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("fmodf", "armpl_vfmodq_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("fmod", "armpl_svfmod_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("fmodf", "armpl_svfmod_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("hypot", "armpl_vhypotq_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("hypotf", "armpl_vhypotq_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("hypot", "armpl_svhypot_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("hypotf", "armpl_svhypot_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("lgamma", "armpl_vlgammaq_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("lgammaf", "armpl_vlgammaq_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("lgamma", "armpl_svlgamma_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("lgammaf", "armpl_svlgamma_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("log", "armpl_vlogq_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("logf", "armpl_vlogq_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("log", "armpl_svlog_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("logf", "armpl_svlog_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("llvm.log.f64", "armpl_vlogq_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("llvm.log.f32", "armpl_vlogq_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("llvm.log.f64", "armpl_svlog_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("llvm.log.f32", "armpl_svlog_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("log1p", "armpl_vlog1pq_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("log1pf", "armpl_vlog1pq_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("log1p", "armpl_svlog1p_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("log1pf", "armpl_svlog1p_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("log2", "armpl_vlog2q_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("log2f", "armpl_vlog2q_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("log2", "armpl_svlog2_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("log2f", "armpl_svlog2_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("llvm.log2.f64", "armpl_vlog2q_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("llvm.log2.f32", "armpl_vlog2q_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("llvm.log2.f64", "armpl_svlog2_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("llvm.log2.f32", "armpl_svlog2_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("log10", "armpl_vlog10q_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("log10f", "armpl_vlog10q_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("log10", "armpl_svlog10_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("log10f", "armpl_svlog10_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("llvm.log10.f64", "armpl_vlog10q_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("llvm.log10.f32", "armpl_vlog10q_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("llvm.log10.f64", "armpl_svlog10_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("llvm.log10.f32", "armpl_svlog10_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("nextafter", "armpl_vnextafterq_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("nextafterf", "armpl_vnextafterq_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("nextafter", "armpl_svnextafter_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("nextafterf", "armpl_svnextafter_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("pow", "armpl_vpowq_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("powf", "armpl_vpowq_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("pow", "armpl_svpow_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("powf", "armpl_svpow_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("llvm.pow.f64", "armpl_vpowq_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("llvm.pow.f32", "armpl_vpowq_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("llvm.pow.f64", "armpl_svpow_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("llvm.pow.f32", "armpl_svpow_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("sin", "armpl_vsinq_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("sinf", "armpl_vsinq_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("sin", "armpl_svsin_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("sinf", "armpl_svsin_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("llvm.sin.f64", "armpl_vsinq_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("llvm.sin.f32", "armpl_vsinq_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("llvm.sin.f64", "armpl_svsin_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("llvm.sin.f32", "armpl_svsin_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("sinh", "armpl_vsinhq_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("sinhf", "armpl_vsinhq_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("sinh", "armpl_svsinh_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("sinhf", "armpl_svsinh_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("sinpi", "armpl_vsinpiq_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("sinpif", "armpl_vsinpiq_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("sinpi", "armpl_svsinpi_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("sinpif", "armpl_svsinpi_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("sqrt", "armpl_vsqrtq_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("sqrtf", "armpl_vsqrtq_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("sqrt", "armpl_svsqrt_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("sqrtf", "armpl_svsqrt_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("tan", "armpl_vtanq_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("tanf", "armpl_vtanq_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("tan", "armpl_svtan_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("tanf", "armpl_svtan_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("tanh", "armpl_vtanhq_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("tanhf", "armpl_vtanhq_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("tanh", "armpl_svtanh_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("tanhf", "armpl_svtanh_f32_x", SCALABLE(4), MASKED)
TLI_DEFINE_VECFUNC("tgamma", "armpl_vtgammaq_f64", FIXED(2), NOMASK)
TLI_DEFINE_VECFUNC("tgammaf", "armpl_vtgammaq_f32", FIXED(4), NOMASK)
TLI_DEFINE_VECFUNC("tgamma", "armpl_svtgamma_f64_x", SCALABLE(2), MASKED)
TLI_DEFINE_VECFUNC("tgammaf", "armpl_svtgamma_f32_x", SCALABLE(4), MASKED)
#else
#error "Must choose which vector library functions are to be defined."
#endif
@@ -701,3 +923,4 @@ TLI_DEFINE_VECFUNC("tgammaf", "_ZGVsMxv_tgammaf", SCALABLE(4), MASKED)
#undef TLI_DEFINE_SLEEFGNUABI_VF4_VECFUNCS
#undef TLI_DEFINE_SLEEFGNUABI_SCALABLE_VECFUNCS
#undef TLI_DEFINE_MASSV_VECFUNCS_NAMES
#undef TLI_DEFINE_ARMPL_VECFUNCS
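For reference, each libm entry above is paired with a NEON variant (FIXED,
NOMASK, operating on 128-bit vectors) and an SVE variant (SCALABLE, MASKED,
taking a trailing predicate, as the calls in the tests below show). A sketch of
the cos pair in C++; the exact prototypes are an assumption here, the
authoritative declarations being in ArmPL's own headers:

#include <arm_neon.h>
#include <arm_sve.h> // needs SVE enabled, e.g. -march=armv8-a+sve

extern "C" {
// "cos" -> "armpl_vcosq_f64", FIXED(2), NOMASK: plain 2 x double NEON vector.
float64x2_t armpl_vcosq_f64(float64x2_t x);
// "cos" -> "armpl_svcos_f64_x", SCALABLE(2), MASKED: scalable vector plus predicate.
svfloat64_t armpl_svcos_f64_x(svfloat64_t x, svbool_t pg);
}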


@@ -33,7 +33,9 @@ static cl::opt<TargetLibraryInfoImpl::VectorLibrary> ClVectorLibrary(
clEnumValN(TargetLibraryInfoImpl::SVML, "SVML",
"Intel SVML library"),
clEnumValN(TargetLibraryInfoImpl::SLEEFGNUABI, "sleefgnuabi",
"SIMD Library for Evaluating Elementary Functions")));
"SIMD Library for Evaluating Elementary Functions"),
clEnumValN(TargetLibraryInfoImpl::ArmPL, "ArmPL",
"Arm Performance Libraries")));
StringLiteral const TargetLibraryInfoImpl::StandardNames[LibFunc::NumLibFuncs] =
{
@@ -1215,6 +1217,23 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib(
}
break;
}
case ArmPL: {
const VecDesc VecFuncs[] = {
#define TLI_DEFINE_ARMPL_VECFUNCS
#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, MASK) {SCAL, VEC, VF, MASK},
#include "llvm/Analysis/VecFuncs.def"
};
switch (TargetTriple.getArch()) {
default:
break;
case llvm::Triple::aarch64:
case llvm::Triple::aarch64_be:
addVectorizableFunctions(VecFuncs);
break;
}
break;
}
case NoLibrary:
break;
}


@@ -0,0 +1,380 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
; RUN: opt -S -vector-library=ArmPL -replace-with-veclib < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
;
; The replace-with-veclib pass does not work with scalable types, thus
; the mappings aren't utilised. Tests will need to be regenerated when the
; pass is improved.
;
declare <2 x double> @llvm.cos.v2f64(<2 x double>)
declare <4 x float> @llvm.cos.v4f32(<4 x float>)
declare <vscale x 2 x double> @llvm.cos.nxv2f64(<vscale x 2 x double>)
declare <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float>)
define <2 x double> @llvm_cos_f64(<2 x double> %in) {
; CHECK-LABEL: define <2 x double> @llvm_cos_f64
; CHECK-SAME: (<2 x double> [[IN:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @armpl_vcosq_f64(<2 x double> [[IN]])
; CHECK-NEXT: ret <2 x double> [[TMP1]]
;
%1 = call fast <2 x double> @llvm.cos.v2f64(<2 x double> %in)
ret <2 x double> %1
}
define <4 x float> @llvm_cos_f32(<4 x float> %in) {
; CHECK-LABEL: define <4 x float> @llvm_cos_f32
; CHECK-SAME: (<4 x float> [[IN:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vcosq_f32(<4 x float> [[IN]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
%1 = call fast <4 x float> @llvm.cos.v4f32(<4 x float> %in)
ret <4 x float> %1
}
define <vscale x 2 x double> @llvm_cos_vscale_f64(<vscale x 2 x double> %in) #0 {
; CHECK-LABEL: define <vscale x 2 x double> @llvm_cos_vscale_f64
; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.cos.nxv2f64(<vscale x 2 x double> [[IN]])
; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
;
%1 = call fast <vscale x 2 x double> @llvm.cos.nxv2f64(<vscale x 2 x double> %in)
ret <vscale x 2 x double> %1
}
define <vscale x 4 x float> @llvm_cos_vscale_f32(<vscale x 4 x float> %in) #0 {
; CHECK-LABEL: define <vscale x 4 x float> @llvm_cos_vscale_f32
; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float> [[IN]])
; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
;
%1 = call fast <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float> %in)
ret <vscale x 4 x float> %1
}
declare <2 x double> @llvm.sin.v2f64(<2 x double>)
declare <4 x float> @llvm.sin.v4f32(<4 x float>)
declare <vscale x 2 x double> @llvm.sin.nxv2f64(<vscale x 2 x double>)
declare <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float>)
define <2 x double> @llvm_sin_f64(<2 x double> %in) {
; CHECK-LABEL: define <2 x double> @llvm_sin_f64
; CHECK-SAME: (<2 x double> [[IN:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @armpl_vsinq_f64(<2 x double> [[IN]])
; CHECK-NEXT: ret <2 x double> [[TMP1]]
;
%1 = call fast <2 x double> @llvm.sin.v2f64(<2 x double> %in)
ret <2 x double> %1
}
define <4 x float> @llvm_sin_f32(<4 x float> %in) {
; CHECK-LABEL: define <4 x float> @llvm_sin_f32
; CHECK-SAME: (<4 x float> [[IN:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vsinq_f32(<4 x float> [[IN]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
%1 = call fast <4 x float> @llvm.sin.v4f32(<4 x float> %in)
ret <4 x float> %1
}
define <vscale x 2 x double> @llvm_sin_vscale_f64(<vscale x 2 x double> %in) #0 {
; CHECK-LABEL: define <vscale x 2 x double> @llvm_sin_vscale_f64
; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.sin.nxv2f64(<vscale x 2 x double> [[IN]])
; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
;
%1 = call fast <vscale x 2 x double> @llvm.sin.nxv2f64(<vscale x 2 x double> %in)
ret <vscale x 2 x double> %1
}
define <vscale x 4 x float> @llvm_sin_vscale_f32(<vscale x 4 x float> %in) #0 {
; CHECK-LABEL: define <vscale x 4 x float> @llvm_sin_vscale_f32
; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float> [[IN]])
; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
;
%1 = call fast <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float> %in)
ret <vscale x 4 x float> %1
}
declare <2 x double> @llvm.exp.v2f64(<2 x double>)
declare <4 x float> @llvm.exp.v4f32(<4 x float>)
declare <vscale x 2 x double> @llvm.exp.nxv2f64(<vscale x 2 x double>)
declare <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float>)
define <2 x double> @llvm_exp_f64(<2 x double> %in) {
; CHECK-LABEL: define <2 x double> @llvm_exp_f64
; CHECK-SAME: (<2 x double> [[IN:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @armpl_vexpq_f64(<2 x double> [[IN]])
; CHECK-NEXT: ret <2 x double> [[TMP1]]
;
%1 = call fast <2 x double> @llvm.exp.v2f64(<2 x double> %in)
ret <2 x double> %1
}
define <4 x float> @llvm_exp_f32(<4 x float> %in) {
; CHECK-LABEL: define <4 x float> @llvm_exp_f32
; CHECK-SAME: (<4 x float> [[IN:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vexpq_f32(<4 x float> [[IN]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
%1 = call fast <4 x float> @llvm.exp.v4f32(<4 x float> %in)
ret <4 x float> %1
}
define <vscale x 2 x double> @llvm_exp_vscale_f64(<vscale x 2 x double> %in) #0 {
; CHECK-LABEL: define <vscale x 2 x double> @llvm_exp_vscale_f64
; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.exp.nxv2f64(<vscale x 2 x double> [[IN]])
; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
;
%1 = call fast <vscale x 2 x double> @llvm.exp.nxv2f64(<vscale x 2 x double> %in)
ret <vscale x 2 x double> %1
}
define <vscale x 4 x float> @llvm_exp_vscale_f32(<vscale x 4 x float> %in) #0 {
; CHECK-LABEL: define <vscale x 4 x float> @llvm_exp_vscale_f32
; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float> [[IN]])
; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
;
%1 = call fast <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float> %in)
ret <vscale x 4 x float> %1
}
declare <2 x double> @llvm.exp2.v2f64(<2 x double>)
declare <4 x float> @llvm.exp2.v4f32(<4 x float>)
declare <vscale x 2 x double> @llvm.exp2.nxv2f64(<vscale x 2 x double>)
declare <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float>)
define <2 x double> @llvm_exp2_f64(<2 x double> %in) {
; CHECK-LABEL: define <2 x double> @llvm_exp2_f64
; CHECK-SAME: (<2 x double> [[IN:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @armpl_vexp2q_f64(<2 x double> [[IN]])
; CHECK-NEXT: ret <2 x double> [[TMP1]]
;
%1 = call fast <2 x double> @llvm.exp2.v2f64(<2 x double> %in)
ret <2 x double> %1
}
define <4 x float> @llvm_exp2_f32(<4 x float> %in) {
; CHECK-LABEL: define <4 x float> @llvm_exp2_f32
; CHECK-SAME: (<4 x float> [[IN:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vexp2q_f32(<4 x float> [[IN]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
%1 = call fast <4 x float> @llvm.exp2.v4f32(<4 x float> %in)
ret <4 x float> %1
}
define <vscale x 2 x double> @llvm_exp2_vscale_f64(<vscale x 2 x double> %in) #0 {
; CHECK-LABEL: define <vscale x 2 x double> @llvm_exp2_vscale_f64
; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.exp2.nxv2f64(<vscale x 2 x double> [[IN]])
; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
;
%1 = call fast <vscale x 2 x double> @llvm.exp2.nxv2f64(<vscale x 2 x double> %in)
ret <vscale x 2 x double> %1
}
define <vscale x 4 x float> @llvm_exp2_vscale_f32(<vscale x 4 x float> %in) #0 {
; CHECK-LABEL: define <vscale x 4 x float> @llvm_exp2_vscale_f32
; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> [[IN]])
; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
;
%1 = call fast <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> %in)
ret <vscale x 4 x float> %1
}
declare <2 x double> @llvm.log.v2f64(<2 x double>)
declare <4 x float> @llvm.log.v4f32(<4 x float>)
declare <vscale x 2 x double> @llvm.log.nxv2f64(<vscale x 2 x double>)
declare <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float>)
define <2 x double> @llvm_log_f64(<2 x double> %in) {
; CHECK-LABEL: define <2 x double> @llvm_log_f64
; CHECK-SAME: (<2 x double> [[IN:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @armpl_vlogq_f64(<2 x double> [[IN]])
; CHECK-NEXT: ret <2 x double> [[TMP1]]
;
%1 = call fast <2 x double> @llvm.log.v2f64(<2 x double> %in)
ret <2 x double> %1
}
define <4 x float> @llvm_log_f32(<4 x float> %in) {
; CHECK-LABEL: define <4 x float> @llvm_log_f32
; CHECK-SAME: (<4 x float> [[IN:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vlogq_f32(<4 x float> [[IN]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
%1 = call fast <4 x float> @llvm.log.v4f32(<4 x float> %in)
ret <4 x float> %1
}
define <vscale x 2 x double> @llvm_log_vscale_f64(<vscale x 2 x double> %in) #0 {
; CHECK-LABEL: define <vscale x 2 x double> @llvm_log_vscale_f64
; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.log.nxv2f64(<vscale x 2 x double> [[IN]])
; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
;
%1 = call fast <vscale x 2 x double> @llvm.log.nxv2f64(<vscale x 2 x double> %in)
ret <vscale x 2 x double> %1
}
define <vscale x 4 x float> @llvm_log_vscale_f32(<vscale x 4 x float> %in) #0 {
; CHECK-LABEL: define <vscale x 4 x float> @llvm_log_vscale_f32
; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> [[IN]])
; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
;
%1 = call fast <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> %in)
ret <vscale x 4 x float> %1
}
declare <2 x double> @llvm.log2.v2f64(<2 x double>)
declare <4 x float> @llvm.log2.v4f32(<4 x float>)
declare <vscale x 2 x double> @llvm.log2.nxv2f64(<vscale x 2 x double>)
declare <vscale x 4 x float> @llvm.log2.nxv4f32(<vscale x 4 x float>)
define <2 x double> @llvm_log2_f64(<2 x double> %in) {
; CHECK-LABEL: define <2 x double> @llvm_log2_f64
; CHECK-SAME: (<2 x double> [[IN:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @armpl_vlog2q_f64(<2 x double> [[IN]])
; CHECK-NEXT: ret <2 x double> [[TMP1]]
;
%1 = call fast <2 x double> @llvm.log2.v2f64(<2 x double> %in)
ret <2 x double> %1
}
define <4 x float> @llvm_log2_f32(<4 x float> %in) {
; CHECK-LABEL: define <4 x float> @llvm_log2_f32
; CHECK-SAME: (<4 x float> [[IN:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vlog2q_f32(<4 x float> [[IN]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
%1 = call fast <4 x float> @llvm.log2.v4f32(<4 x float> %in)
ret <4 x float> %1
}
define <vscale x 2 x double> @llvm_log2_vscale_f64(<vscale x 2 x double> %in) #0 {
; CHECK-LABEL: define <vscale x 2 x double> @llvm_log2_vscale_f64
; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.log2.nxv2f64(<vscale x 2 x double> [[IN]])
; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
;
%1 = call fast <vscale x 2 x double> @llvm.log2.nxv2f64(<vscale x 2 x double> %in)
ret <vscale x 2 x double> %1
}
define <vscale x 4 x float> @llvm_log2_vscale_f32(<vscale x 4 x float> %in) #0 {
; CHECK-LABEL: define <vscale x 4 x float> @llvm_log2_vscale_f32
; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.log2.nxv4f32(<vscale x 4 x float> [[IN]])
; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
;
%1 = call fast <vscale x 4 x float> @llvm.log2.nxv4f32(<vscale x 4 x float> %in)
ret <vscale x 4 x float> %1
}
declare <2 x double> @llvm.log10.v2f64(<2 x double>)
declare <4 x float> @llvm.log10.v4f32(<4 x float>)
declare <vscale x 2 x double> @llvm.log10.nxv2f64(<vscale x 2 x double>)
declare <vscale x 4 x float> @llvm.log10.nxv4f32(<vscale x 4 x float>)
define <2 x double> @llvm_log10_f64(<2 x double> %in) {
; CHECK-LABEL: define <2 x double> @llvm_log10_f64
; CHECK-SAME: (<2 x double> [[IN:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @armpl_vlog10q_f64(<2 x double> [[IN]])
; CHECK-NEXT: ret <2 x double> [[TMP1]]
;
%1 = call fast <2 x double> @llvm.log10.v2f64(<2 x double> %in)
ret <2 x double> %1
}
define <4 x float> @llvm_log10_f32(<4 x float> %in) {
; CHECK-LABEL: define <4 x float> @llvm_log10_f32
; CHECK-SAME: (<4 x float> [[IN:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @armpl_vlog10q_f32(<4 x float> [[IN]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
%1 = call fast <4 x float> @llvm.log10.v4f32(<4 x float> %in)
ret <4 x float> %1
}
define <vscale x 2 x double> @llvm_log10_vscale_f64(<vscale x 2 x double> %in) #0 {
; CHECK-LABEL: define <vscale x 2 x double> @llvm_log10_vscale_f64
; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.log10.nxv2f64(<vscale x 2 x double> [[IN]])
; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
;
%1 = call fast <vscale x 2 x double> @llvm.log10.nxv2f64(<vscale x 2 x double> %in)
ret <vscale x 2 x double> %1
}
define <vscale x 4 x float> @llvm_log10_vscale_f32(<vscale x 4 x float> %in) #0 {
; CHECK-LABEL: define <vscale x 4 x float> @llvm_log10_vscale_f32
; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.log10.nxv4f32(<vscale x 4 x float> [[IN]])
; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
;
%1 = call fast <vscale x 4 x float> @llvm.log10.nxv4f32(<vscale x 4 x float> %in)
ret <vscale x 4 x float> %1
}
declare <2 x double> @llvm.pow.v2f64(<2 x double>, <2 x double>)
declare <4 x float> @llvm.pow.v4f32(<4 x float>, <4 x float>)
declare <vscale x 2 x double> @llvm.pow.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
declare <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
;
; There is a bug in the replace-with-veclib pass: for intrinsics which take
; more than one argument but have just one overloaded type, it incorrectly
; reconstructs the scalar name. For pow specifically it is searching for:
; llvm.pow.f64.f64 and llvm.pow.f32.f32
;
define <2 x double> @llvm_pow_f64(<2 x double> %in, <2 x double> %power) {
; CHECK-LABEL: define <2 x double> @llvm_pow_f64
; CHECK-SAME: (<2 x double> [[IN:%.*]], <2 x double> [[POWER:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @llvm.pow.v2f64(<2 x double> [[IN]], <2 x double> [[POWER]])
; CHECK-NEXT: ret <2 x double> [[TMP1]]
;
%1 = call fast <2 x double> @llvm.pow.v2f64(<2 x double> %in, <2 x double> %power)
ret <2 x double> %1
}
define <4 x float> @llvm_pow_f32(<4 x float> %in, <4 x float> %power) {
; CHECK-LABEL: define <4 x float> @llvm_pow_f32
; CHECK-SAME: (<4 x float> [[IN:%.*]], <4 x float> [[POWER:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.pow.v4f32(<4 x float> [[IN]], <4 x float> [[POWER]])
; CHECK-NEXT: ret <4 x float> [[TMP1]]
;
%1 = call fast <4 x float> @llvm.pow.v4f32(<4 x float> %in, <4 x float> %power)
ret <4 x float> %1
}
define <vscale x 2 x double> @llvm_pow_vscale_f64(<vscale x 2 x double> %in, <vscale x 2 x double> %power) #0 {
; CHECK-LABEL: define <vscale x 2 x double> @llvm_pow_vscale_f64
; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x double> [[POWER:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.pow.nxv2f64(<vscale x 2 x double> [[IN]], <vscale x 2 x double> [[POWER]])
; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
;
%1 = call fast <vscale x 2 x double> @llvm.pow.nxv2f64(<vscale x 2 x double> %in, <vscale x 2 x double> %power)
ret <vscale x 2 x double> %1
}
define <vscale x 4 x float> @llvm_pow_vscale_f32(<vscale x 4 x float> %in, <vscale x 4 x float> %power) #0 {
; CHECK-LABEL: define <vscale x 4 x float> @llvm_pow_vscale_f32
; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x float> [[POWER:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float> [[IN]], <vscale x 4 x float> [[POWER]])
; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
;
%1 = call fast <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float> %in, <vscale x 4 x float> %power)
ret <vscale x 4 x float> %1
}
attributes #0 = { "target-features"="+sve" }

File diff suppressed because it is too large.


@@ -0,0 +1,418 @@
; RUN: opt -vector-library=ArmPL -passes=inject-tli-mappings,loop-vectorize -S < %s | FileCheck %s --check-prefixes=CHECK,NEON
; RUN: opt -mattr=+sve -vector-library=ArmPL -passes=inject-tli-mappings,loop-vectorize -S < %s | FileCheck %s --check-prefixes=CHECK,SVE
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-unknown-linux-gnu"
; These tests check that LV can vectorize loops containing llvm math intrinsics
; using the TLI mappings for both fixed-width and scalable vectorization.
declare double @llvm.cos.f64(double)
declare float @llvm.cos.f32(float)
define void @cos_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
; CHECK-LABEL: @cos_f64(
; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vcosq_f64(<2 x double> [[TMP4:%.*]])
; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svcos_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
; CHECK: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
%in = load double, ptr %in.gep, align 8
%call = tail call double @llvm.cos.f64(double %in)
%out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
store double %call, ptr %out.gep, align 8
%iv.next = add nuw nsw i64 %iv, 1
%exitcond = icmp eq i64 %iv.next, 1000
br i1 %exitcond, label %for.end, label %for.body
for.end:
ret void
}
define void @cos_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
; CHECK-LABEL: @cos_f32(
; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vcosq_f32(<4 x float> [[TMP4:%.*]])
; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svcos_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
; CHECK: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
%in = load float, ptr %in.gep, align 8
%call = tail call float @llvm.cos.f32(float %in)
%out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
store float %call, ptr %out.gep, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond = icmp eq i64 %iv.next, 1000
br i1 %exitcond, label %for.end, label %for.body
for.end:
ret void
}
declare double @llvm.exp.f64(double)
declare float @llvm.exp.f32(float)
define void @exp_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
; CHECK-LABEL: @exp_f64(
; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vexpq_f64(<2 x double> [[TMP4:%.*]])
; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svexp_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
; CHECK: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
%in = load double, ptr %in.gep, align 8
%call = tail call double @llvm.exp.f64(double %in)
%out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
store double %call, ptr %out.gep, align 8
%iv.next = add nuw nsw i64 %iv, 1
%exitcond = icmp eq i64 %iv.next, 1000
br i1 %exitcond, label %for.end, label %for.body
for.end:
ret void
}
define void @exp_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
; CHECK-LABEL: @exp_f32(
; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vexpq_f32(<4 x float> [[TMP4:%.*]])
; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svexp_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
; CHECK: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
%in = load float, ptr %in.gep, align 8
%call = tail call float @llvm.exp.f32(float %in)
%out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
store float %call, ptr %out.gep, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond = icmp eq i64 %iv.next, 1000
br i1 %exitcond, label %for.end, label %for.body
for.end:
ret void
}
declare double @llvm.exp2.f64(double)
declare float @llvm.exp2.f32(float)
define void @exp2_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
; CHECK-LABEL: @exp2_f64(
; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vexp2q_f64(<2 x double> [[TMP4:%.*]])
; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svexp2_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
; CHECK: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
%in = load double, ptr %in.gep, align 8
%call = tail call double @llvm.exp2.f64(double %in)
%out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
store double %call, ptr %out.gep, align 8
%iv.next = add nuw nsw i64 %iv, 1
%exitcond = icmp eq i64 %iv.next, 1000
br i1 %exitcond, label %for.end, label %for.body
for.end:
ret void
}
define void @exp2_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
; CHECK-LABEL: @exp2_f32(
; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vexp2q_f32(<4 x float> [[TMP4:%.*]])
; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svexp2_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
; CHECK: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
%in = load float, ptr %in.gep, align 8
%call = tail call float @llvm.exp2.f32(float %in)
%out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
store float %call, ptr %out.gep, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond = icmp eq i64 %iv.next, 1000
br i1 %exitcond, label %for.end, label %for.body
for.end:
ret void
}
declare double @llvm.log.f64(double)
declare float @llvm.log.f32(float)
define void @log_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
; CHECK-LABEL: @log_f64(
; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vlogq_f64(<2 x double> [[TMP4:%.*]])
; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svlog_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
; CHECK: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
%in = load double, ptr %in.gep, align 8
%call = tail call double @llvm.log.f64(double %in)
%out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
store double %call, ptr %out.gep, align 8
%iv.next = add nuw nsw i64 %iv, 1
%exitcond = icmp eq i64 %iv.next, 1000
br i1 %exitcond, label %for.end, label %for.body
for.end:
ret void
}
define void @log_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
; CHECK-LABEL: @log_f32(
; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vlogq_f32(<4 x float> [[TMP4:%.*]])
; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svlog_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
; CHECK: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
%in = load float, ptr %in.gep, align 8
%call = tail call float @llvm.log.f32(float %in)
%out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
store float %call, ptr %out.gep, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond = icmp eq i64 %iv.next, 1000
br i1 %exitcond, label %for.end, label %for.body
for.end:
ret void
}
declare double @llvm.log2.f64(double)
declare float @llvm.log2.f32(float)
define void @log2_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
; CHECK-LABEL: @log2_f64(
; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vlog2q_f64(<2 x double> [[TMP4:%.*]])
; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svlog2_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
; CHECK: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
%in = load double, ptr %in.gep, align 8
%call = tail call double @llvm.log2.f64(double %in)
%out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
store double %call, ptr %out.gep, align 8
%iv.next = add nuw nsw i64 %iv, 1
%exitcond = icmp eq i64 %iv.next, 1000
br i1 %exitcond, label %for.end, label %for.body
for.end:
ret void
}
define void @log2_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
; CHECK-LABEL: @log2_f32(
; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vlog2q_f32(<4 x float> [[TMP4:%.*]])
; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svlog2_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
; CHECK: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
%in = load float, ptr %in.gep, align 8
%call = tail call float @llvm.log2.f32(float %in)
%out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
store float %call, ptr %out.gep, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond = icmp eq i64 %iv.next, 1000
br i1 %exitcond, label %for.end, label %for.body
for.end:
ret void
}
declare double @llvm.log10.f64(double)
declare float @llvm.log10.f32(float)
define void @log10_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
; CHECK-LABEL: @log10_f64(
; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vlog10q_f64(<2 x double> [[TMP4:%.*]])
; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svlog10_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
; CHECK: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
%in = load double, ptr %in.gep, align 8
%call = tail call double @llvm.log10.f64(double %in)
%out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
store double %call, ptr %out.gep, align 8
%iv.next = add nuw nsw i64 %iv, 1
%exitcond = icmp eq i64 %iv.next, 1000
br i1 %exitcond, label %for.end, label %for.body
for.end:
ret void
}
define void @log10_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
; CHECK-LABEL: @log10_f32(
; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vlog10q_f32(<4 x float> [[TMP4:%.*]])
; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svlog10_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
; CHECK: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
%in = load float, ptr %in.gep, align 8
%call = tail call float @llvm.log10.f32(float %in)
%out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
store float %call, ptr %out.gep, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond = icmp eq i64 %iv.next, 1000
br i1 %exitcond, label %for.end, label %for.body
for.end:
ret void
}
declare double @llvm.sin.f64(double)
declare float @llvm.sin.f32(float)
define void @sin_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
; CHECK-LABEL: @sin_f64(
; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vsinq_f64(<2 x double> [[TMP4:%.*]])
; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svsin_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
; CHECK: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
%in = load double, ptr %in.gep, align 8
%call = tail call double @llvm.sin.f64(double %in)
%out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
store double %call, ptr %out.gep, align 8
%iv.next = add nuw nsw i64 %iv, 1
%exitcond = icmp eq i64 %iv.next, 1000
br i1 %exitcond, label %for.end, label %for.body
for.end:
ret void
}
define void @sin_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
; CHECK-LABEL: @sin_f32(
; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vsinq_f32(<4 x float> [[TMP4:%.*]])
; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svsin_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
; CHECK: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
%in = load float, ptr %in.gep, align 8
%call = tail call float @llvm.sin.f32(float %in)
%out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
store float %call, ptr %out.gep, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond = icmp eq i64 %iv.next, 1000
br i1 %exitcond, label %for.end, label %for.body
for.end:
ret void
}
declare double @llvm.pow.f64(double, double)
declare float @llvm.pow.f32(float, float)
define void @pow_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
; CHECK-LABEL: @pow_f64(
; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vpowq_f64(<2 x double> [[TMP4:%.*]], <2 x double> [[TMP4:%.*]])
; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svpow_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
; CHECK: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
%in = load double, ptr %in.gep, align 8
%call = tail call double @llvm.pow.f64(double %in, double %in)
%out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
store double %call, ptr %out.gep, align 8
%iv.next = add nuw nsw i64 %iv, 1
%exitcond = icmp eq i64 %iv.next, 1000
br i1 %exitcond, label %for.end, label %for.body
for.end:
ret void
}
define void @pow_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
; CHECK-LABEL: @pow_f32(
; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vpowq_f32(<4 x float> [[TMP4:%.*]], <4 x float> [[TMP4:%.*]])
; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svpow_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
; CHECK: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
%in = load float, ptr %in.gep, align 8
%call = tail call float @llvm.pow.f32(float %in, float %in)
%out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
store float %call, ptr %out.gep, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond = icmp eq i64 %iv.next, 1000
br i1 %exitcond, label %for.end, label %for.body
for.end:
ret void
}