diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index e2b612f5a..6b33407a6 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -56,6 +56,7 @@ ADD_HEYOKA_BENCHMARK(event_overhead)
 ADD_HEYOKA_BENCHMARK(ss_event_overhead)
 ADD_HEYOKA_BENCHMARK(h_oscillator_lt)
 ADD_HEYOKA_BENCHMARK(mb)
+ADD_HEYOKA_BENCHMARK(kepE_bench)
 ADD_HEYOKA_BENCHMARK(vsop2013_elliptic)
 ADD_HEYOKA_BENCHMARK(vsop2013_cartesian)
 ADD_HEYOKA_BENCHMARK(elp2000_cartesian)
diff --git a/benchmark/kepE_bench.cpp b/benchmark/kepE_bench.cpp
new file mode 100644
index 000000000..ab5b9f567
--- /dev/null
+++ b/benchmark/kepE_bench.cpp
@@ -0,0 +1,126 @@
+// Copyright 2020, 2021, 2022, 2023 Francesco Biscani (bluescarni@gmail.com), Dario Izzo (dario.izzo@gmail.com)
+//
+// This file is part of the heyoka library.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <algorithm>
+#include <cmath>
+#include <iostream>
+#include <random>
+#include <stdexcept>
+#include <vector>
+
+#include <boost/math/constants/constants.hpp>
+#include <boost/program_options.hpp>
+
+#include <fmt/format.h>
+
+#include <spdlog/spdlog.h>
+#include <spdlog/stopwatch.h>
+
+#include <heyoka/expression.hpp>
+#include <heyoka/kw.hpp>
+#include <heyoka/llvm_state.hpp>
+#include <heyoka/logging.hpp>
+#include <heyoka/math/kepE.hpp>
+#include <heyoka/variable.hpp>
+
+using namespace heyoka;
+
+int main(int argc, char *argv[])
+{
+    namespace po = boost::program_options;
+
+    double ecc{};
+    unsigned seed{};
+    bool fast_math{};
+
+    po::options_description desc("Options");
+
+    desc.add_options()("help", "produce help message")("ecc", po::value<double>(&ecc)->default_value(0.1),
+                                                       "eccentricity")(
+        "seed", po::value<unsigned>(&seed)->default_value(42u),
+        "random seed")("fast-math", po::value<bool>(&fast_math)->default_value(true), "fast math mode");
+
+    po::variables_map vm;
+    po::store(po::parse_command_line(argc, argv, desc), vm);
+    po::notify(vm);
+
+    if (vm.count("help") != 0u) {
+        std::cout << desc << "\n";
+        return 0;
+    }
+
+    if (!std::isfinite(ecc) || ecc < 0 || ecc >= 1) {
+        throw std::invalid_argument(fmt::format("Invalid eccentricity value: {}", ecc));
+    }
+
+    constexpr auto N = 1'000'000ul;
+
+    std::cout << std::boolalpha;
+    std::cout << "Eccentricity: " << ecc << '\n';
+    std::cout << "fast_math : " << fast_math << '\n';
+    std::cout << "N : " << N << "\n\n";
+
+    // RNG setup.
+    std::mt19937 rng(seed);
+    std::uniform_real_distribution<double> Mdist(0, 2 * boost::math::constants::pi<double>());
+
+    // Data setup.
+    std::vector<double> e_vec, M_vec, out_vec, out_vec_batch;
+    e_vec.resize(N, ecc);
+    M_vec.resize(N);
+    out_vec.resize(N);
+    out_vec_batch.resize(N);
+    std::generate(M_vec.begin(), M_vec.end(), [&rng, &Mdist]() { return Mdist(rng); });
+
+    // cfunc setup.
+    auto [e, M] = make_vars("e", "M");
+
+    llvm_state s{kw::fast_math = fast_math};
+    const auto batch_size = recommended_simd_size<double>();
+    add_cfunc<double>(s, "f_scalar", {kepE(e, M)}, kw::vars = {e, M});
+    add_cfunc<double>(s, "f_batch", {kepE(e, M)}, kw::vars = {e, M}, kw::batch_size = batch_size);
+    s.compile();
+
+    auto *f_sc = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(
+        s.jit_lookup("f_scalar"));
+    auto *f_ba
+        = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(s.jit_lookup("f_batch"));
+
+    // Fetch the logger.
+    create_logger();
+    set_logger_level_trace();
+    auto logger = spdlog::get("heyoka");
+
+    // Scalar runtime.
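+    // NOTE: compiled functions produced by add_cfunc() take four pointer arguments
+    // (outputs, inputs, parameter values, time). This benchmark uses no runtime
+    // parameters and no time dependency, hence the two nullptr arguments below.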
+    spdlog::stopwatch sw;
+
+    for (auto i = 0ull; i < N; ++i) {
+        double ins[] = {e_vec[i], M_vec[i]};
+        f_sc(out_vec.data() + i, ins, nullptr, nullptr);
+    }
+
+    logger->trace("Scalar run took: {}s", sw);
+
+    std::vector<double> batch_buffer(batch_size * 2ul);
+    auto *batch_b_ptr = batch_buffer.data();
+
+    sw.reset();
+
+    for (auto i = 0ull; i < N - N % batch_size; i += batch_size) {
+        std::copy(e_vec.data() + i, e_vec.data() + i + batch_size, batch_b_ptr);
+        std::copy(M_vec.data() + i, M_vec.data() + i + batch_size, batch_b_ptr + batch_size);
+        f_ba(out_vec_batch.data() + i, batch_b_ptr, nullptr, nullptr);
+    }
+
+    logger->trace("Batch run took: {}s", sw);
+
+    std::cout.precision(16);
+    for (auto i = 0u; i < 20u; ++i) {
+        std::cout << out_vec[i] << " vs " << out_vec_batch[i] << '\n';
+    }
+}
diff --git a/doc/advanced_tutorials.rst b/doc/advanced_tutorials.rst
index 02833671f..a4e899986 100644
--- a/doc/advanced_tutorials.rst
+++ b/doc/advanced_tutorials.rst
@@ -27,6 +27,7 @@ the tutorials should not be hard to follow.
    tut_batch_mode
    tut_extended_precision
    tut_arbitrary_precision
+   tut_single_precision
    tut_s11n
    tut_ensemble
    tut_parallel_mode
diff --git a/doc/changelog.rst b/doc/changelog.rst
index ec2b2ffad..4f053952d 100644
--- a/doc/changelog.rst
+++ b/doc/changelog.rst
@@ -7,8 +7,7 @@ Changelog
 New
 ~~~
 
-- Add the step callback (batch) set classes to compose
-  step callbacks
+- Add step callback set classes to compose step callbacks
   (`#366 <https://github.com/bluescarni/heyoka/pull/366>`__).
 - Add support for single-precision computations
   (`#363 <https://github.com/bluescarni/heyoka/pull/363>`__).
@@ -18,6 +17,10 @@ New
 Changes
 ~~~~~~~
 
+- When the ``fast_math`` mode is active, the SIMD-vectorised
+  mathematical functions now use low-precision implementations.
+  This can lead to substantial performance increases in batch mode
+  (`#367 <https://github.com/bluescarni/heyoka/pull/367>`__).
 - Initialising a step callback or a callable from an empty
   function object (e.g., a null pointer, an empty
   ``std::function``, etc.) now results in an empty object
diff --git a/doc/tut_single_precision.rst b/doc/tut_single_precision.rst
new file mode 100644
index 000000000..99677aa89
--- /dev/null
+++ b/doc/tut_single_precision.rst
@@ -0,0 +1,107 @@
+.. _tut_single_precision:
+
+Computations in single precision
+================================
+
+.. versionadded:: 3.2.0
+
+In previous tutorials we saw how heyoka, in addition to the standard
+`double precision <https://en.wikipedia.org/wiki/Double-precision_floating-point_format>`__,
+also supports computations in :ref:`extended precision <tut_extended_precision>` and
+:ref:`arbitrary precision <tut_arbitrary_precision>`. Starting with version 3.2.0, heyoka
+also supports computations in `single precision <https://en.wikipedia.org/wiki/Single-precision_floating-point_format>`__.
+
+Single-precision computations can lead to substantial performance benefits when high accuracy is not required.
+In particular, single-precision :ref:`batch mode <tut_batch_mode>` can use a SIMD width twice as large as
+in double precision, doubling the computational throughput.
+In scalar computations, the use of single precision halves the memory usage with respect to double precision,
+which can help alleviate performance issues in large ODE systems. This can be particularly noticeable in applications such as
+:external:ref:`neural ODEs `.
+
+In C++, single-precision values are usually represented via the standard floating-point type ``float``.
+Correspondingly, and similarly to what is explained in the :ref:`extended precision <tut_extended_precision>`
+tutorial, single-precision computations are activated by passing the ``float`` template parameter to functions
+and classes in the heyoka API.
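A minimal sketch of what this looks like in practice (the dynamics and numerical values below are
illustrative assumptions, not taken from this patch):

.. code-block:: c++

   // Symbolic variables for the simple pendulum.
   auto [x, v] = make_vars("x", "v");

   // Single-precision adaptive integrator: note the float template
   // parameter and the "f" suffix on all numerical constants.
   auto ta = taylor_adaptive<float>{// Equations of motion.
                                    {prime(x) = v, prime(v) = -9.8f * sin(x)},
                                    // Initial conditions.
                                    {0.05f, 0.025f}};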
+
+A simple example
+----------------
+
+In order to verify that heyoka is indeed able to work in single precision, we will be monitoring the evolution
+of the energy constant in a low-precision numerical integration of the simple pendulum.
+
+Let us begin as usual with the definition of the dynamical equations and the creation of the integrator object:
+
+.. literalinclude:: ../tutorial/single_precision.cpp
+   :language: c++
+   :lines: 18-29
+
+In order to activate single precision, we created an integrator object of type ``taylor_adaptive<float>`` - that is,
+we specified ``float``, instead of the usual ``double``, as the (only) template parameter for the ``taylor_adaptive`` class template.
+Note that we specified a single-precision initial state via the use of the ``f`` suffix for the numerical constants.
+Note also that, when operating in single precision,
+*all* numerical values encapsulated in an integrator are represented in single precision - this includes not only the state vector,
+but also the time coordinate, the tolerance, the Taylor coefficients, etc. Similarly to double-precision integrators, the default value
+of the tolerance is the machine epsilon of ``float``.
+
+Next, we define a small helper function that will allow us to monitor the evolution of the energy constant
+throughout the integration:
+
+.. literalinclude:: ../tutorial/single_precision.cpp
+   :language: c++
+   :lines: 31-37
+
+Before starting the integration, we compute and store the initial energy for later use:
+
+.. literalinclude:: ../tutorial/single_precision.cpp
+   :language: c++
+   :lines: 39-40
+
+We can now begin a step-by-step integration. At the end of each step, we will compute
+and print to screen the relative energy error:
+
+.. literalinclude:: ../tutorial/single_precision.cpp
+   :language: c++
+   :lines: 42-49
+
+.. code-block:: console
+
+   Relative energy error: 1.48183e-07
+   Relative energy error: 5.29227e-08
+   Relative energy error: 6.08611e-08
+   Relative energy error: 1.79937e-07
+   Relative energy error: 1.74645e-07
+   Relative energy error: 2.24921e-07
+   Relative energy error: 2.4609e-07
+   Relative energy error: 1.1643e-07
+   Relative energy error: 1.79937e-07
+   Relative energy error: 1.40245e-07
+   Relative energy error: 2.54029e-07
+   Relative energy error: 1.84899e-07
+   Relative energy error: 1.83245e-07
+   Relative energy error: 1.56122e-07
+   Relative energy error: 2.22275e-07
+   Relative energy error: 1.61414e-07
+   Relative energy error: 2.11691e-07
+   Relative energy error: 2.88428e-07
+   Relative energy error: 2.93721e-07
+   Relative energy error: 1.82583e-07
+
+The console output indeed confirms that energy is conserved at the level of the epsilon of the
+single-precision format (that is, :math:`\sim 10^{-7}`).
+
+Other classes and functions
+---------------------------
+
+Besides the adaptive integrator, several other classes and functions in heyoka can be used in single precision.
+
+The :ref:`event classes `, for instance, can be constructed in single precision by passing ``float``
+as the template parameter (instead of ``double``). Note that the precision of an event
+must match the precision of the integrator object in which the event is used, otherwise an error will be produced
+at compilation time.
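As a sketch of how this could look (the event equation and callback below are illustrative assumptions,
not code from this patch), a single-precision non-terminal event might be created as follows and then
passed to the integrator's constructor via the ``kw::nt_events`` keyword argument:

.. code-block:: c++

   // Detect the zero crossings of the x coordinate, in single precision.
   // Note the float template parameter and the single-precision time argument.
   auto ev = nt_event<float>(x, [](taylor_adaptive<float> &, float time, int) {
       std::cout << "Zero crossing detected at t = " << time << '\n';
   });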
+
+Full code listing
+-----------------
+
+.. literalinclude:: ../tutorial/single_precision.cpp
+   :language: c++
+   :lines: 9-
diff --git a/include/heyoka/detail/vector_math.hpp b/include/heyoka/detail/vector_math.hpp
index c0d4fe2e7..8e55eeceb 100644
--- a/include/heyoka/detail/vector_math.hpp
+++ b/include/heyoka/detail/vector_math.hpp
@@ -27,6 +27,12 @@ struct vf_info {
     // The vfabi attribute corresponding
     // to the vector function.
     std::string vf_abi_attr;
+    // The corresponding low-precision versions
+    // of the above. These will be empty if
+    // the low-precision counterpart is
+    // not available.
+    std::string lp_name;
+    std::string lp_vf_abi_attr;
     // Number of SIMD lanes.
     std::uint32_t width = 0;
     // Number of arguments.
diff --git a/src/detail/llvm_helpers.cpp b/src/detail/llvm_helpers.cpp
index b9c68a0c3..f02166d6f 100644
--- a/src/detail/llvm_helpers.cpp
+++ b/src/detail/llvm_helpers.cpp
@@ -301,6 +301,9 @@ llvm::CallInst *llvm_add_vfabi_attrs(llvm_state &s, llvm::CallInst *call, const
     auto &context = s.context();
     auto &builder = s.builder();
 
+    // Are we in fast math mode?
+    const auto use_fast_math = builder.getFastMathFlags().isFast();
+
     if (!vfi.empty()) {
         // There exist vector variants of the scalar function.
         auto &md = s.module();
@@ -313,7 +316,11 @@ llvm::CallInst *llvm_add_vfabi_attrs(llvm_state &s, llvm::CallInst *call, const
         std::vector<std::string> vf_abi_strs;
         vf_abi_strs.reserve(vfi.size());
         for (const auto &el : vfi) {
-            vf_abi_strs.push_back(el.vf_abi_attr);
+            // Fetch the vf_abi attr string (either the low-precision
+            // or standard version).
+            const auto &vf_abi_attr
+                = (use_fast_math && !el.lp_vf_abi_attr.empty()) ? el.lp_vf_abi_attr : el.vf_abi_attr;
+            vf_abi_strs.push_back(vf_abi_attr);
         }
 #if LLVM_VERSION_MAJOR >= 14
         call->addFnAttr(llvm::Attribute::get(context, "vector-function-abi-variant",
@@ -341,6 +348,10 @@ llvm::CallInst *llvm_add_vfabi_attrs(llvm_state &s, llvm::CallInst *call, const
             assert(el.width > 0u);
             assert(el.nargs == num_args);
 
+            // Fetch the vector function name from el (either the low-precision
+            // or standard version).
+            const auto &el_name = (use_fast_math && !el.lp_name.empty()) ? el.lp_name : el.name;
+
             // The vector type for the current variant.
             auto *cur_vec_t = make_vector_type(scal_t, el.width);
 
@@ -352,11 +363,11 @@ llvm::CallInst *llvm_add_vfabi_attrs(llvm_state &s, llvm::CallInst *call, const
                                                 false);
 
             // Try to lookup the variant in the module.
-            auto *vf_ptr = md.getFunction(el.name);
+            auto *vf_ptr = md.getFunction(el_name);
 
             if (vf_ptr == nullptr) {
                 // The declaration of the variant is not there yet, create it.
-                vf_ptr = llvm_func_create(vec_ft, llvm::Function::ExternalLinkage, el.name, &md);
+                vf_ptr = llvm_func_create(vec_ft, llvm::Function::ExternalLinkage, el_name, &md);
 
                 // NOTE: setting the attributes on the vector variant is not strictly required
                 // for the auto-vectorizer to work. However, in other parts of the code, the vector
@@ -380,7 +391,7 @@ llvm::CallInst *llvm_add_vfabi_attrs(llvm_state &s, llvm::CallInst *call, const
             //
             // https://llvm.org/docs/LangRef.html#the-llvm-used-global-variable
             // https://godbolt.org/z/1neaG4bYj
-            const auto dummy_name = fmt::format("heyoka.dummy_vector_call.{}", el.name);
+            const auto dummy_name = fmt::format("heyoka.dummy_vector_call.{}", el_name);
 
             if (auto *dummy_ptr = md.getFunction(dummy_name); dummy_ptr == nullptr) {
                 // The dummy function has not been defined yet, do it.
@@ -546,6 +557,9 @@ llvm::Value *llvm_math_intr(llvm_state &s, const std::string &intr_name,
 
     auto &builder = s.builder();
 
+    // Are we in fast math mode?
+    const auto use_fast_math = builder.getFastMathFlags().isFast();
+
     if (llvm_stype_can_use_math_intrinsics(s, scal_t)) {
         // We can use the LLVM intrinsics for the given scalar type.
@@ -567,10 +581,15 @@
         if (vfi_it != vfi.end() && vfi_it->width == vector_width) {
             // A vector implementation with precisely the correct width is available, use it.
             assert(vfi_it->nargs == nargs);
+
+            // Fetch the vector function name (either the low-precision
+            // or standard version).
+            const auto &vf_name = (use_fast_math && !vfi_it->lp_name.empty()) ? vfi_it->lp_name : vfi_it->name;
+
             // NOTE: make sure to use the same attributes as the scalar intrinsic for the vector
             // call. This ensures that the vector variant is declared with the same attributes as those that would
             // be declared by invoking llvm_add_vfabi_attrs() on the scalar invocation.
-            return llvm_invoke_external(s, vfi_it->name, vec_t, {args...}, s_intr->getAttributes());
+            return llvm_invoke_external(s, vf_name, vec_t, {args...}, s_intr->getAttributes());
         }
 
         if (!vfi.empty()) {
@@ -682,6 +701,11 @@ llvm::Value *llvm_math_cmath(llvm_state &s, const std::string &base_name, Args *
     const std::array arg_types = {args->getType()...};
     assert(((args->getType() == arg_types[0]) && ...));
 
+    auto &builder = s.builder();
+
+    // Are we in fast math mode?
+    const auto use_fast_math = builder.getFastMathFlags().isFast();
+
     // Determine the type and scalar type of the arguments.
     auto *x_t = arg_types[0];
     auto *scal_t = x_t->getScalarType();
@@ -711,7 +735,12 @@
         if (vfi_it != vfi.end() && vfi_it->width == vector_width) {
             // A vector implementation with precisely the correct width is available, use it.
             assert(vfi_it->nargs == nargs);
-            return llvm_invoke_external(s, vfi_it->name, vec_t, {args...}, attrs);
+
+            // Fetch the vector function name (either the low-precision
+            // or standard version).
+            const auto &vf_name = (use_fast_math && !vfi_it->lp_name.empty()) ? vfi_it->lp_name : vfi_it->name;
+
+            return llvm_invoke_external(s, vf_name, vec_t, {args...}, attrs);
         }
 
         // A vector implementation with the correct width is **not** available: scalarise the
@@ -732,7 +761,7 @@
     // NOTE: this handles only the scalar case.
     if (llvm_is_real(x_t) != 0) {
         auto *f = real_nary_op(s, x_t, "mpfr_" + base_name, boost::numeric_cast<unsigned>(nargs));
-        return s.builder().CreateCall(f, {args...});
+        return builder.CreateCall(f, {args...});
     }
 
 #endif
diff --git a/src/detail/vector_math.cpp b/src/detail/vector_math.cpp
index 1f47450a1..4a0674b8d 100644
--- a/src/detail/vector_math.cpp
+++ b/src/detail/vector_math.cpp
@@ -36,12 +36,14 @@ using vf_map_t = std::unordered_map<std::string, std::vector<vf_info>>;
 // but at the moment we have only SLEEF.
 #if defined(HEYOKA_WITH_SLEEF)
 
-auto make_vfinfo(const char *s_name, std::string v_name, std::uint32_t width, std::uint32_t nargs)
+auto make_vfinfo(const char *s_name, std::string v_name, std::string lp_v_name, std::uint32_t width,
+                 std::uint32_t nargs)
 {
     assert(nargs == 1u || nargs == 2u);
 
-    auto ret = vf_info{std::move(v_name), {}, width, nargs};
+    auto ret = vf_info{std::move(v_name), {}, std::move(lp_v_name), {}, width, nargs};
 
     ret.vf_abi_attr = fmt::format("_ZGV_LLVM_N{}{}_{}({})", width, nargs == 1u ? "v" : "vv", s_name, ret.name);
+    ret.lp_vf_abi_attr = fmt::format("_ZGV_LLVM_N{}{}_{}({})", width, nargs == 1u ?
"v" : "vv", s_name, ret.lp_name); return ret; } @@ -49,6 +51,23 @@ auto make_vfinfo(const char *s_name, std::string v_name, std::uint32_t width, st #if defined(HEYOKA_WITH_SLEEF) +// NOTE: helper to fetch the suffix of the low-precision version of the mathematical +// function "sleef_base_name" in SLEEF. +// NOTE: by default, the low-precision versions are denoted by the "u35" suffix +// (indicating 3.5 ULPs of precision). For some functions, the "u35" versions are not available +// and we return the standard-precision suffix instead ("u10"). +auto sleef_get_lp_suffix(const std::string &sleef_base_name) -> std::string +{ + static const std::unordered_map lp_suffix_map + = {{"acosh", "u10"}, {"asinh", "u10"}, {"atanh", "u10"}, {"erf", "u10"}, {"exp", "u10"}, {"pow", "u10"}}; + + if (auto it = lp_suffix_map.find(sleef_base_name); it == lp_suffix_map.end()) { + return "u35"; + } else { + return it->second; + } +} + // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) auto add_vfinfo_sleef(vf_map_t &retval, const char *scalar_name, const char *sleef_base_name, std::string_view sleef_tp, std::uint32_t nargs = 1) @@ -59,6 +78,8 @@ auto add_vfinfo_sleef(vf_map_t &retval, const char *scalar_name, const char *sle auto make_sleef_vfinfo = [&](std::uint32_t width, const char *iset) { return make_vfinfo(scalar_name, fmt::format("Sleef_{}{}{}_u10{}", sleef_base_name, sleef_tp, width, iset), + fmt::format("Sleef_{}{}{}_{}{}", sleef_base_name, sleef_tp, width, + sleef_get_lp_suffix(sleef_base_name), iset), width, nargs); }; diff --git a/test/acos.cpp b/test/acos.cpp index 241fd9167..0d721aeb7 100644 --- a/test/acos.cpp +++ b/test/acos.cpp @@ -229,124 +229,128 @@ TEST_CASE("normalise") // Tests to check vectorisation via the vector-function-abi-variant machinery. TEST_CASE("vfabi double") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b] = make_vars("a", "b"); + auto [a, b] = make_vars("a", "b"); - add_cfunc(s, "cfunc", {acos(a), acos(b)}); + add_cfunc(s, "cfunc", {acos(a), acos(b)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cfunc")); - const std::vector ins{.1, .2}; - std::vector outs(2u, 0.); + const std::vector ins{.1, .2}; + std::vector outs(2u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::acos(.1))); - REQUIRE(outs[1] == approximately(std::acos(.2))); + REQUIRE(outs[0] == approximately(std::acos(.1))); + REQUIRE(outs[1] == approximately(std::acos(.2))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. 
#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@acos", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@acos", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 2 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 2 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } - if (tf.aarch64) { - REQUIRE(count == 3u); - } + if (tf.aarch64) { + REQUIRE(count == 3u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 3u); - // } + // if (tf.vsx) { + // REQUIRE(count == 3u); + // } #endif + } } TEST_CASE("vfabi float") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b, c, d] = make_vars("a", "b", "c", "d"); + auto [a, b, c, d] = make_vars("a", "b", "c", "d"); - add_cfunc(s, "cfunc", {acos(a), acos(b), acos(c), acos(d)}); + add_cfunc(s, "cfunc", {acos(a), acos(b), acos(c), acos(d)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr + = reinterpret_cast(s.jit_lookup("cfunc")); - const std::vector ins{.1f, .2f, .3f, .4f}; - std::vector outs(4u, 0.); + const std::vector ins{.1f, .2f, .3f, .4f}; + std::vector outs(4u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::acos(.1f))); - REQUIRE(outs[1] == approximately(std::acos(.2f))); - REQUIRE(outs[2] == approximately(std::acos(.3f))); - REQUIRE(outs[3] == approximately(std::acos(.4f))); + REQUIRE(outs[0] == approximately(std::acos(.1f))); + REQUIRE(outs[1] == approximately(std::acos(.2f))); + REQUIRE(outs[2] == approximately(std::acos(.3f))); + REQUIRE(outs[3] == approximately(std::acos(.4f))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. 
#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@acosf", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@acosf", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 4 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 5u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 4 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 5u); + } - if (tf.aarch64) { - REQUIRE(count == 5u); - } + if (tf.aarch64) { + REQUIRE(count == 5u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 5u); - // } + // if (tf.vsx) { + // REQUIRE(count == 5u); + // } #endif + } } diff --git a/test/acosh.cpp b/test/acosh.cpp index d70ec1bb8..d629b6158 100644 --- a/test/acosh.cpp +++ b/test/acosh.cpp @@ -229,125 +229,129 @@ TEST_CASE("normalise") // Tests to check vectorisation via the vector-function-abi-variant machinery. TEST_CASE("vfabi double") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b] = make_vars("a", "b"); + auto [a, b] = make_vars("a", "b"); - add_cfunc(s, "cfunc", {acosh(a), acosh(b)}); + add_cfunc(s, "cfunc", {acosh(a), acosh(b)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cfunc")); - const std::vector ins{1.1, 1.2}; - std::vector outs(2u, 0.); + const std::vector ins{1.1, 1.2}; + std::vector outs(2u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::acosh(1.1))); - REQUIRE(outs[1] == approximately(std::acosh(1.2))); + REQUIRE(outs[0] == approximately(std::acosh(1.1))); + REQUIRE(outs[1] == approximately(std::acosh(1.2))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. 
#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@acosh", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@acosh", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 2 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 2 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } - if (tf.aarch64) { - REQUIRE(count == 3u); - } + if (tf.aarch64) { + REQUIRE(count == 3u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 3u); - // } + // if (tf.vsx) { + // REQUIRE(count == 3u); + // } #endif + } } TEST_CASE("vfabi float") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b, c, d] = make_vars("a", "b", "c", "d"); + auto [a, b, c, d] = make_vars("a", "b", "c", "d"); - add_cfunc(s, "cfunc", {acosh(a), acosh(b), acosh(c), acosh(d)}); + add_cfunc(s, "cfunc", {acosh(a), acosh(b), acosh(c), acosh(d)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr + = reinterpret_cast(s.jit_lookup("cfunc")); - const std::vector ins{1.1f, 1.2f, 1.3f, 1.4f}; - std::vector outs(4u, 0.); + const std::vector ins{1.1f, 1.2f, 1.3f, 1.4f}; + std::vector outs(4u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::acosh(1.1f))); - REQUIRE(outs[1] == approximately(std::acosh(1.2f))); - REQUIRE(outs[2] == approximately(std::acosh(1.3f))); - REQUIRE(outs[3] == approximately(std::acosh(1.4f))); + REQUIRE(outs[0] == approximately(std::acosh(1.1f))); + REQUIRE(outs[1] == approximately(std::acosh(1.2f))); + REQUIRE(outs[2] == approximately(std::acosh(1.3f))); + REQUIRE(outs[3] == approximately(std::acosh(1.4f))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. 
#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@acoshf", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@acoshf", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 4 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 5u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 4 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 5u); + } - // NOTE: LLVM16 is currently the version tested in the CI on arm64. - if (tf.aarch64) { - REQUIRE(count == 5u); - } + // NOTE: LLVM16 is currently the version tested in the CI on arm64. + if (tf.aarch64) { + REQUIRE(count == 5u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 5u); - // } + // if (tf.vsx) { + // REQUIRE(count == 5u); + // } #endif + } } diff --git a/test/asin.cpp b/test/asin.cpp index 713df6695..6c05c82aa 100644 --- a/test/asin.cpp +++ b/test/asin.cpp @@ -229,124 +229,128 @@ TEST_CASE("normalise") // Tests to check vectorisation via the vector-function-abi-variant machinery. TEST_CASE("vfabi double") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b] = make_vars("a", "b"); + auto [a, b] = make_vars("a", "b"); - add_cfunc(s, "cfunc", {asin(a), asin(b)}); + add_cfunc(s, "cfunc", {asin(a), asin(b)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cfunc")); - const std::vector ins{.1, .2}; - std::vector outs(2u, 0.); + const std::vector ins{.1, .2}; + std::vector outs(2u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::asin(.1))); - REQUIRE(outs[1] == approximately(std::asin(.2))); + REQUIRE(outs[0] == approximately(std::asin(.1))); + REQUIRE(outs[1] == approximately(std::asin(.2))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. 
#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@asin", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@asin", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 2 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 2 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } - if (tf.aarch64) { - REQUIRE(count == 3u); - } + if (tf.aarch64) { + REQUIRE(count == 3u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 3u); - // } + // if (tf.vsx) { + // REQUIRE(count == 3u); + // } #endif + } } TEST_CASE("vfabi float") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b, c, d] = make_vars("a", "b", "c", "d"); + auto [a, b, c, d] = make_vars("a", "b", "c", "d"); - add_cfunc(s, "cfunc", {asin(a), asin(b), asin(c), asin(d)}); + add_cfunc(s, "cfunc", {asin(a), asin(b), asin(c), asin(d)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr + = reinterpret_cast(s.jit_lookup("cfunc")); - const std::vector ins{.1f, .2f, .3f, .4f}; - std::vector outs(4u, 0.); + const std::vector ins{.1f, .2f, .3f, .4f}; + std::vector outs(4u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::asin(.1f))); - REQUIRE(outs[1] == approximately(std::asin(.2f))); - REQUIRE(outs[2] == approximately(std::asin(.3f))); - REQUIRE(outs[3] == approximately(std::asin(.4f))); + REQUIRE(outs[0] == approximately(std::asin(.1f))); + REQUIRE(outs[1] == approximately(std::asin(.2f))); + REQUIRE(outs[2] == approximately(std::asin(.3f))); + REQUIRE(outs[3] == approximately(std::asin(.4f))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. 
#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@asinf", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@asinf", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 4 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 5u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 4 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 5u); + } - if (tf.aarch64) { - REQUIRE(count == 5u); - } + if (tf.aarch64) { + REQUIRE(count == 5u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 5u); - // } + // if (tf.vsx) { + // REQUIRE(count == 5u); + // } #endif + } } diff --git a/test/asinh.cpp b/test/asinh.cpp index f44bfe307..7cc6e91ee 100644 --- a/test/asinh.cpp +++ b/test/asinh.cpp @@ -229,125 +229,129 @@ TEST_CASE("normalise") // Tests to check vectorisation via the vector-function-abi-variant machinery. TEST_CASE("vfabi double") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b] = make_vars("a", "b"); + auto [a, b] = make_vars("a", "b"); - add_cfunc(s, "cfunc", {asinh(a), asinh(b)}); + add_cfunc(s, "cfunc", {asinh(a), asinh(b)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cfunc")); - const std::vector ins{1.1, 1.2}; - std::vector outs(2u, 0.); + const std::vector ins{1.1, 1.2}; + std::vector outs(2u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::asinh(1.1))); - REQUIRE(outs[1] == approximately(std::asinh(1.2))); + REQUIRE(outs[0] == approximately(std::asinh(1.1))); + REQUIRE(outs[1] == approximately(std::asinh(1.2))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. 
#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@asinh", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@asinh", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 2 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 2 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } - if (tf.aarch64) { - REQUIRE(count == 3u); - } + if (tf.aarch64) { + REQUIRE(count == 3u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 3u); - // } + // if (tf.vsx) { + // REQUIRE(count == 3u); + // } #endif + } } TEST_CASE("vfabi float") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b, c, d] = make_vars("a", "b", "c", "d"); + auto [a, b, c, d] = make_vars("a", "b", "c", "d"); - add_cfunc(s, "cfunc", {asinh(a), asinh(b), asinh(c), asinh(d)}); + add_cfunc(s, "cfunc", {asinh(a), asinh(b), asinh(c), asinh(d)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr + = reinterpret_cast(s.jit_lookup("cfunc")); - const std::vector ins{1.1f, 1.2f, 1.3f, 1.4f}; - std::vector outs(4u, 0.); + const std::vector ins{1.1f, 1.2f, 1.3f, 1.4f}; + std::vector outs(4u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::asinh(1.1f))); - REQUIRE(outs[1] == approximately(std::asinh(1.2f))); - REQUIRE(outs[2] == approximately(std::asinh(1.3f))); - REQUIRE(outs[3] == approximately(std::asinh(1.4f))); + REQUIRE(outs[0] == approximately(std::asinh(1.1f))); + REQUIRE(outs[1] == approximately(std::asinh(1.2f))); + REQUIRE(outs[2] == approximately(std::asinh(1.3f))); + REQUIRE(outs[3] == approximately(std::asinh(1.4f))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. 
#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@asinhf", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@asinhf", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 4 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 5u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 4 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 5u); + } - // NOTE: LLVM16 is currently the version tested in the CI on arm64. - if (tf.aarch64) { - REQUIRE(count == 5u); - } + // NOTE: LLVM16 is currently the version tested in the CI on arm64. + if (tf.aarch64) { + REQUIRE(count == 5u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 5u); - // } + // if (tf.vsx) { + // REQUIRE(count == 5u); + // } #endif + } } diff --git a/test/atan.cpp b/test/atan.cpp index 6d69c7b20..8dcdcd519 100644 --- a/test/atan.cpp +++ b/test/atan.cpp @@ -223,125 +223,129 @@ TEST_CASE("normalise") // Tests to check vectorisation via the vector-function-abi-variant machinery. TEST_CASE("vfabi double") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b] = make_vars("a", "b"); + auto [a, b] = make_vars("a", "b"); - add_cfunc(s, "cfunc", {atan(a), atan(b)}); + add_cfunc(s, "cfunc", {atan(a), atan(b)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cfunc")); - const std::vector ins{.1, .2}; - std::vector outs(2u, 0.); + const std::vector ins{.1, .2}; + std::vector outs(2u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::atan(.1))); - REQUIRE(outs[1] == approximately(std::atan(.2))); + REQUIRE(outs[0] == approximately(std::atan(.1))); + REQUIRE(outs[1] == approximately(std::atan(.2))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. 
#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@atan", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@atan", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 2 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 2 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } - if (tf.aarch64) { - REQUIRE(count == 3u); - } + if (tf.aarch64) { + REQUIRE(count == 3u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 3u); - // } + // if (tf.vsx) { + // REQUIRE(count == 3u); + // } #endif + } } TEST_CASE("vfabi float") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b, c, d] = make_vars("a", "b", "c", "d"); + auto [a, b, c, d] = make_vars("a", "b", "c", "d"); - add_cfunc(s, "cfunc", {atan(a), atan(b), atan(c), atan(d)}); + add_cfunc(s, "cfunc", {atan(a), atan(b), atan(c), atan(d)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr + = reinterpret_cast(s.jit_lookup("cfunc")); - const std::vector ins{.1f, .2f, .3f, .4f}; - std::vector outs(4u, 0.); + const std::vector ins{.1f, .2f, .3f, .4f}; + std::vector outs(4u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::atan(.1f))); - REQUIRE(outs[1] == approximately(std::atan(.2f))); - REQUIRE(outs[2] == approximately(std::atan(.3f))); - REQUIRE(outs[3] == approximately(std::atan(.4f))); + REQUIRE(outs[0] == approximately(std::atan(.1f))); + REQUIRE(outs[1] == approximately(std::atan(.2f))); + REQUIRE(outs[2] == approximately(std::atan(.3f))); + REQUIRE(outs[3] == approximately(std::atan(.4f))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. 
#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@atanf", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@atanf", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 4 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 5u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 4 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 5u); + } - // NOTE: LLVM16 is currently the version tested in the CI on arm64. - if (tf.aarch64) { - REQUIRE(count == 5u); - } + // NOTE: LLVM16 is currently the version tested in the CI on arm64. + if (tf.aarch64) { + REQUIRE(count == 5u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 5u); - // } + // if (tf.vsx) { + // REQUIRE(count == 5u); + // } #endif + } } diff --git a/test/atan2.cpp b/test/atan2.cpp index e64618142..86f9af26d 100644 --- a/test/atan2.cpp +++ b/test/atan2.cpp @@ -334,124 +334,128 @@ TEST_CASE("normalise") // Tests to check vectorisation via the vector-function-abi-variant machinery. TEST_CASE("vfabi double") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b] = make_vars("a", "b"); + auto [a, b] = make_vars("a", "b"); - add_cfunc(s, "cfunc", {atan2(a, .3), atan2(b, .4)}); + add_cfunc(s, "cfunc", {atan2(a, .3), atan2(b, .4)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cfunc")); - const std::vector ins{.1, .2}; - std::vector outs(2u, 0.); + const std::vector ins{.1, .2}; + std::vector outs(2u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::atan2(.1, .3))); - REQUIRE(outs[1] == approximately(std::atan2(.2, .4))); + REQUIRE(outs[0] == approximately(std::atan2(.1, .3))); + REQUIRE(outs[1] == approximately(std::atan2(.2, .4))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. 
#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@atan2", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@atan2", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 2 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 2 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } - if (tf.aarch64) { - REQUIRE(count == 3u); - } + if (tf.aarch64) { + REQUIRE(count == 3u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 3u); - // } + // if (tf.vsx) { + // REQUIRE(count == 3u); + // } #endif + } } TEST_CASE("vfabi float") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b, c, d] = make_vars("a", "b", "c", "d"); + auto [a, b, c, d] = make_vars("a", "b", "c", "d"); - add_cfunc(s, "cfunc", {atan2(a, .5f), atan2(b, .6f), atan2(c, .7f), atan2(d, .8f)}); + add_cfunc(s, "cfunc", {atan2(a, .5f), atan2(b, .6f), atan2(c, .7f), atan2(d, .8f)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr + = reinterpret_cast(s.jit_lookup("cfunc")); - const std::vector ins{.1f, .2f, .3f, .4f}; - std::vector outs(4u, 0.); + const std::vector ins{.1f, .2f, .3f, .4f}; + std::vector outs(4u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::atan2(.1f, .5f))); - REQUIRE(outs[1] == approximately(std::atan2(.2f, .6f))); - REQUIRE(outs[2] == approximately(std::atan2(.3f, .7f))); - REQUIRE(outs[3] == approximately(std::atan2(.4f, .8f))); + REQUIRE(outs[0] == approximately(std::atan2(.1f, .5f))); + REQUIRE(outs[1] == approximately(std::atan2(.2f, .6f))); + REQUIRE(outs[2] == approximately(std::atan2(.3f, .7f))); + REQUIRE(outs[3] == approximately(std::atan2(.4f, .8f))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. 
#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@atan2f", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@atan2f", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 4 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 5u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 4 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 5u); + } - if (tf.aarch64) { - REQUIRE(count == 5u); - } + if (tf.aarch64) { + REQUIRE(count == 5u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 5u); - // } + // if (tf.vsx) { + // REQUIRE(count == 5u); + // } #endif + } } diff --git a/test/atanh.cpp b/test/atanh.cpp index 98efb30b0..069e02e48 100644 --- a/test/atanh.cpp +++ b/test/atanh.cpp @@ -223,124 +223,128 @@ TEST_CASE("normalise") // Tests to check vectorisation via the vector-function-abi-variant machinery. TEST_CASE("vfabi double") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b] = make_vars("a", "b"); + auto [a, b] = make_vars("a", "b"); - add_cfunc(s, "cfunc", {atanh(a), atanh(b)}); + add_cfunc(s, "cfunc", {atanh(a), atanh(b)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cfunc")); - const std::vector ins{.1, .2}; - std::vector outs(2u, 0.); + const std::vector ins{.1, .2}; + std::vector outs(2u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::atanh(.1))); - REQUIRE(outs[1] == approximately(std::atanh(.2))); + REQUIRE(outs[0] == approximately(std::atanh(.1))); + REQUIRE(outs[1] == approximately(std::atanh(.2))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. 
#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@atanh", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@atanh", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 2 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 2 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } - if (tf.aarch64) { - REQUIRE(count == 3u); - } + if (tf.aarch64) { + REQUIRE(count == 3u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 3u); - // } + // if (tf.vsx) { + // REQUIRE(count == 3u); + // } #endif + } } TEST_CASE("vfabi float") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b, c, d] = make_vars("a", "b", "c", "d"); + auto [a, b, c, d] = make_vars("a", "b", "c", "d"); - add_cfunc(s, "cfunc", {atanh(a), atanh(b), atanh(c), atanh(d)}); + add_cfunc(s, "cfunc", {atanh(a), atanh(b), atanh(c), atanh(d)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr + = reinterpret_cast(s.jit_lookup("cfunc")); - const std::vector ins{.1f, .2f, .3f, .4f}; - std::vector outs(4u, 0.); + const std::vector ins{.1f, .2f, .3f, .4f}; + std::vector outs(4u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::atanh(.1f))); - REQUIRE(outs[1] == approximately(std::atanh(.2f))); - REQUIRE(outs[2] == approximately(std::atanh(.3f))); - REQUIRE(outs[3] == approximately(std::atanh(.4f))); + REQUIRE(outs[0] == approximately(std::atanh(.1f))); + REQUIRE(outs[1] == approximately(std::atanh(.2f))); + REQUIRE(outs[2] == approximately(std::atanh(.3f))); + REQUIRE(outs[3] == approximately(std::atanh(.4f))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. 
#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@atanhf", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@atanhf", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 4 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 5u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 4 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 5u); + } - if (tf.aarch64) { - REQUIRE(count == 5u); - } + if (tf.aarch64) { + REQUIRE(count == 5u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 5u); - // } + // if (tf.vsx) { + // REQUIRE(count == 5u); + // } #endif + } } diff --git a/test/cos.cpp b/test/cos.cpp index 8846d465a..a8f085421 100644 --- a/test/cos.cpp +++ b/test/cos.cpp @@ -246,130 +246,134 @@ TEST_CASE("normalise") // Tests to check vectorisation via the vector-function-abi-variant machinery. 
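(All the vfabi test cases in this patch follow the same pattern: a compiled function is built with SLP vectorisation enabled, evaluated to check the numerical results, and then the textual IR of the llvm_state is scanned for occurrences of the scalar symbol. When the vector-function-abi-variant attributes map the scalar calls onto SLEEF vector implementations, only the declaration plus the calls in the strided cfunc survive, which is why the expected counts are small fixed numbers such as 3 or 5. The helper below is a minimal, standalone sketch of that counting idiom; the function name is illustrative and not part of the patch.)

#include <string>

#include <boost/algorithm/string.hpp>

// Count case-insensitive occurrences of a symbol name (e.g. "@llvm.cos.f64")
// in the textual IR of a compiled llvm_state, using the same Boost.StringAlgo
// find_iterator idiom as the tests in this patch.
unsigned count_ir_occurrences(const std::string &ir, const std::string &name)
{
    using string_find_iterator = boost::find_iterator<std::string::const_iterator>;

    auto count = 0u;
    for (auto it = boost::make_find_iterator(ir, boost::first_finder(name, boost::is_iequal()));
         it != string_find_iterator(); ++it) {
        ++count;
    }

    return count;
}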
TEST_CASE("vfabi double") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b] = make_vars("a", "b"); + auto [a, b] = make_vars("a", "b"); - add_cfunc(s, "cfunc", {cos(a), cos(b)}); + add_cfunc(s, "cfunc", {cos(a), cos(b)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cfunc")); - const std::vector ins{1., 2.}; - std::vector outs(2u, 0.); + const std::vector ins{1., 2.}; + std::vector outs(2u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::cos(1.))); - REQUIRE(outs[1] == approximately(std::cos(2.))); + REQUIRE(outs[0] == approximately(std::cos(1.))); + REQUIRE(outs[1] == approximately(std::cos(2.))); #if defined(HEYOKA_WITH_SLEEF) - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.cos.f64", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.cos.f64", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 2 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 2 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } #if LLVM_VERSION_MAJOR >= 16 - // NOTE: LLVM16 is currently the version tested in the CI on arm64. - if (tf.aarch64) { - REQUIRE(count == 3u); - } + // NOTE: LLVM16 is currently the version tested in the CI on arm64. + if (tf.aarch64) { + REQUIRE(count == 3u); + } #endif - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. 
- // if (tf.vsx) { - // REQUIRE(count == 3u); - // } + // if (tf.vsx) { + // REQUIRE(count == 3u); + // } #endif + } } TEST_CASE("vfabi float") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b, c, d] = make_vars("a", "b", "c", "d"); + auto [a, b, c, d] = make_vars("a", "b", "c", "d"); - add_cfunc(s, "cfunc", {cos(a), cos(b), cos(c), cos(d)}); + add_cfunc(s, "cfunc", {cos(a), cos(b), cos(c), cos(d)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr + = reinterpret_cast(s.jit_lookup("cfunc")); - const std::vector ins{1., 2., 3., 4.}; - std::vector outs(4u, 0.); + const std::vector ins{1., 2., 3., 4.}; + std::vector outs(4u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::cos(1.f))); - REQUIRE(outs[1] == approximately(std::cos(2.f))); - REQUIRE(outs[2] == approximately(std::cos(3.f))); - REQUIRE(outs[3] == approximately(std::cos(4.f))); + REQUIRE(outs[0] == approximately(std::cos(1.f))); + REQUIRE(outs[1] == approximately(std::cos(2.f))); + REQUIRE(outs[2] == approximately(std::cos(3.f))); + REQUIRE(outs[3] == approximately(std::cos(4.f))); #if defined(HEYOKA_WITH_SLEEF) - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.cos.f32", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.cos.f32", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 4 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 5u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 4 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 5u); + } #if LLVM_VERSION_MAJOR >= 16 - // NOTE: LLVM16 is currently the version tested in the CI on arm64. - if (tf.aarch64) { - REQUIRE(count == 5u); - } + // NOTE: LLVM16 is currently the version tested in the CI on arm64. + if (tf.aarch64) { + REQUIRE(count == 5u); + } #endif - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. 
- // if (tf.vsx) { - // REQUIRE(count == 5u); - // } + // if (tf.vsx) { + // REQUIRE(count == 5u); + // } #endif + } } diff --git a/test/cosh.cpp b/test/cosh.cpp index c6ca5c8b9..91a246bd0 100644 --- a/test/cosh.cpp +++ b/test/cosh.cpp @@ -223,124 +223,128 @@ TEST_CASE("normalise") // Tests to check vectorisation via the vector-function-abi-variant machinery. TEST_CASE("vfabi double") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b] = make_vars("a", "b"); + auto [a, b] = make_vars("a", "b"); - add_cfunc(s, "cfunc", {cosh(a), cosh(b)}); + add_cfunc(s, "cfunc", {cosh(a), cosh(b)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cfunc")); - const std::vector ins{1., 2.}; - std::vector outs(2u, 0.); + const std::vector ins{1., 2.}; + std::vector outs(2u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::cosh(1.))); - REQUIRE(outs[1] == approximately(std::cosh(2.))); + REQUIRE(outs[0] == approximately(std::cosh(1.))); + REQUIRE(outs[1] == approximately(std::cosh(2.))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. #if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@cosh", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@cosh", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 2 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 2 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } - if (tf.aarch64) { - REQUIRE(count == 3u); - } + if (tf.aarch64) { + REQUIRE(count == 3u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. 
- // if (tf.vsx) { - // REQUIRE(count == 3u); - // } + // if (tf.vsx) { + // REQUIRE(count == 3u); + // } #endif + } } TEST_CASE("vfabi float") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b, c, d] = make_vars("a", "b", "c", "d"); + auto [a, b, c, d] = make_vars("a", "b", "c", "d"); - add_cfunc(s, "cfunc", {cosh(a), cosh(b), cosh(c), cosh(d)}); + add_cfunc(s, "cfunc", {cosh(a), cosh(b), cosh(c), cosh(d)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr + = reinterpret_cast(s.jit_lookup("cfunc")); - const std::vector ins{1., 2., 3., 4.}; - std::vector outs(4u, 0.); + const std::vector ins{1., 2., 3., 4.}; + std::vector outs(4u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::cosh(1.f))); - REQUIRE(outs[1] == approximately(std::cosh(2.f))); - REQUIRE(outs[2] == approximately(std::cosh(3.f))); - REQUIRE(outs[3] == approximately(std::cosh(4.f))); + REQUIRE(outs[0] == approximately(std::cosh(1.f))); + REQUIRE(outs[1] == approximately(std::cosh(2.f))); + REQUIRE(outs[2] == approximately(std::cosh(3.f))); + REQUIRE(outs[3] == approximately(std::cosh(4.f))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. #if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@coshf", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@coshf", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 4 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 5u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 4 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 5u); + } - if (tf.aarch64) { - REQUIRE(count == 5u); - } + if (tf.aarch64) { + REQUIRE(count == 5u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. 
- // if (tf.vsx) { - // REQUIRE(count == 5u); - // } + // if (tf.vsx) { + // REQUIRE(count == 5u); + // } #endif + } } diff --git a/test/elp2000.cpp b/test/elp2000.cpp index a3c163a88..5c6e2ed18 100644 --- a/test/elp2000.cpp +++ b/test/elp2000.cpp @@ -7,7 +7,6 @@ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. #include -#include #include #include @@ -17,8 +16,10 @@ #include #include "catch.hpp" +#include "test_utils.hpp" using namespace heyoka; +using namespace heyoka_test; using namespace heyoka::model; TEST_CASE("basic") @@ -57,9 +58,9 @@ TEST_CASE("basic") const double tm = (date - 2451545.0) / (36525); cf_ptr(out, nullptr, nullptr, &tm); - REQUIRE(std::abs(out[0] - ref[i][0]) < 1e-10); - REQUIRE(std::abs(out[1] - ref[i][1]) < 1e-10); - REQUIRE(std::abs(out[2] - ref[i][2]) < 1e-10); + REQUIRE(out[0] == approximately(ref[i][0], 1000.)); + REQUIRE(out[1] == approximately(ref[i][1], 1000.)); + REQUIRE(out[2] == approximately(ref[i][2], 1000.)); } } @@ -92,9 +93,9 @@ TEST_CASE("fk5") const double tm = (date - 2451545.0) / (36525); cf_ptr(out, nullptr, nullptr, &tm); - REQUIRE(std::abs(out[0] - ref[i][0]) < 1e-10); - REQUIRE(std::abs(out[1] - ref[i][1]) < 1e-10); - REQUIRE(std::abs(out[2] - ref[i][2]) < 1e-10); + REQUIRE(out[0] == approximately(ref[i][0], 1000.)); + REQUIRE(out[1] == approximately(ref[i][1], 1000.)); + REQUIRE(out[2] == approximately(ref[i][2], 1000.)); } } diff --git a/test/erf.cpp b/test/erf.cpp index 239aac9ed..f201836cd 100644 --- a/test/erf.cpp +++ b/test/erf.cpp @@ -230,124 +230,128 @@ TEST_CASE("normalise") // Tests to check vectorisation via the vector-function-abi-variant machinery. TEST_CASE("vfabi double") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b] = make_vars("a", "b"); + auto [a, b] = make_vars("a", "b"); - add_cfunc(s, "cfunc", {erf(a), erf(b)}); + add_cfunc(s, "cfunc", {erf(a), erf(b)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cfunc")); - const std::vector ins{.1, .2}; - std::vector outs(2u, 0.); + const std::vector ins{.1, .2}; + std::vector outs(2u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::erf(.1))); - REQUIRE(outs[1] == approximately(std::erf(.2))); + REQUIRE(outs[0] == approximately(std::erf(.1))); + REQUIRE(outs[1] == approximately(std::erf(.2))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. #if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@erf", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@erf", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. 
- if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 2 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 2 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } - if (tf.aarch64) { - REQUIRE(count == 3u); - } + if (tf.aarch64) { + REQUIRE(count == 3u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 3u); - // } + // if (tf.vsx) { + // REQUIRE(count == 3u); + // } #endif + } } TEST_CASE("vfabi float") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b, c, d] = make_vars("a", "b", "c", "d"); + auto [a, b, c, d] = make_vars("a", "b", "c", "d"); - add_cfunc(s, "cfunc", {erf(a), erf(b), erf(c), erf(d)}); + add_cfunc(s, "cfunc", {erf(a), erf(b), erf(c), erf(d)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr + = reinterpret_cast(s.jit_lookup("cfunc")); - const std::vector ins{.1f, .2f, .3f, .4f}; - std::vector outs(4u, 0.); + const std::vector ins{.1f, .2f, .3f, .4f}; + std::vector outs(4u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::erf(.1f))); - REQUIRE(outs[1] == approximately(std::erf(.2f))); - REQUIRE(outs[2] == approximately(std::erf(.3f))); - REQUIRE(outs[3] == approximately(std::erf(.4f))); + REQUIRE(outs[0] == approximately(std::erf(.1f))); + REQUIRE(outs[1] == approximately(std::erf(.2f))); + REQUIRE(outs[2] == approximately(std::erf(.3f))); + REQUIRE(outs[3] == approximately(std::erf(.4f))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. #if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@erff", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@erff", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 4 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 5u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. 
+ if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 4 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 5u); + } - if (tf.aarch64) { - REQUIRE(count == 5u); - } + if (tf.aarch64) { + REQUIRE(count == 5u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 5u); - // } + // if (tf.vsx) { + // REQUIRE(count == 5u); + // } #endif + } } diff --git a/test/exp.cpp b/test/exp.cpp index 768234042..ef7b8da3e 100644 --- a/test/exp.cpp +++ b/test/exp.cpp @@ -237,130 +237,134 @@ TEST_CASE("normalise") // Tests to check vectorisation via the vector-function-abi-variant machinery. TEST_CASE("vfabi double") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b] = make_vars("a", "b"); + auto [a, b] = make_vars("a", "b"); - add_cfunc(s, "cfunc", {exp(a), exp(b)}); + add_cfunc(s, "cfunc", {exp(a), exp(b)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cfunc")); - const std::vector ins{1., 2.}; - std::vector outs(2u, 0.); + const std::vector ins{1., 2.}; + std::vector outs(2u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::exp(1.))); - REQUIRE(outs[1] == approximately(std::exp(2.))); + REQUIRE(outs[0] == approximately(std::exp(1.))); + REQUIRE(outs[1] == approximately(std::exp(2.))); #if defined(HEYOKA_WITH_SLEEF) - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.exp.f64", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.exp.f64", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 2 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 2 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } #if LLVM_VERSION_MAJOR >= 16 - // NOTE: LLVM16 is currently the version tested in the CI on arm64. - if (tf.aarch64) { - REQUIRE(count == 3u); - } + // NOTE: LLVM16 is currently the version tested in the CI on arm64. 
+ if (tf.aarch64) { + REQUIRE(count == 3u); + } #endif - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 3u); - // } + // if (tf.vsx) { + // REQUIRE(count == 3u); + // } #endif + } } TEST_CASE("vfabi float") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b, c, d] = make_vars("a", "b", "c", "d"); + auto [a, b, c, d] = make_vars("a", "b", "c", "d"); - add_cfunc(s, "cfunc", {exp(a), exp(b), exp(c), exp(d)}); + add_cfunc(s, "cfunc", {exp(a), exp(b), exp(c), exp(d)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr + = reinterpret_cast(s.jit_lookup("cfunc")); - const std::vector ins{1., 2., 3., 4.}; - std::vector outs(4u, 0.); + const std::vector ins{1., 2., 3., 4.}; + std::vector outs(4u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::exp(1.f))); - REQUIRE(outs[1] == approximately(std::exp(2.f))); - REQUIRE(outs[2] == approximately(std::exp(3.f))); - REQUIRE(outs[3] == approximately(std::exp(4.f))); + REQUIRE(outs[0] == approximately(std::exp(1.f))); + REQUIRE(outs[1] == approximately(std::exp(2.f))); + REQUIRE(outs[2] == approximately(std::exp(3.f))); + REQUIRE(outs[3] == approximately(std::exp(4.f))); #if defined(HEYOKA_WITH_SLEEF) - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.exp.f32", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.exp.f32", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 4 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 5u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 4 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 5u); + } #if LLVM_VERSION_MAJOR >= 16 - // NOTE: LLVM16 is currently the version tested in the CI on arm64. - if (tf.aarch64) { - REQUIRE(count == 5u); - } + // NOTE: LLVM16 is currently the version tested in the CI on arm64. + if (tf.aarch64) { + REQUIRE(count == 5u); + } #endif - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. 
+ // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 5u); - // } + // if (tf.vsx) { + // REQUIRE(count == 5u); + // } #endif + } } diff --git a/test/log.cpp b/test/log.cpp index 4d5a88626..276a9aaa2 100644 --- a/test/log.cpp +++ b/test/log.cpp @@ -213,130 +213,134 @@ TEST_CASE("normalise") // Tests to check vectorisation via the vector-function-abi-variant machinery. TEST_CASE("vfabi double") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b] = make_vars("a", "b"); + auto [a, b] = make_vars("a", "b"); - add_cfunc(s, "cfunc", {log(a), log(b)}); + add_cfunc(s, "cfunc", {log(a), log(b)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cfunc")); - const std::vector ins{1., 2.}; - std::vector outs(2u, 0.); + const std::vector ins{1., 2.}; + std::vector outs(2u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::log(1.))); - REQUIRE(outs[1] == approximately(std::log(2.))); + REQUIRE(outs[0] == approximately(std::log(1.))); + REQUIRE(outs[1] == approximately(std::log(2.))); #if defined(HEYOKA_WITH_SLEEF) - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.log.f64", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.log.f64", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 2 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 2 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } #if LLVM_VERSION_MAJOR >= 16 - // NOTE: LLVM16 is currently the version tested in the CI on arm64. - if (tf.aarch64) { - REQUIRE(count == 3u); - } + // NOTE: LLVM16 is currently the version tested in the CI on arm64. + if (tf.aarch64) { + REQUIRE(count == 3u); + } #endif - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. 
- // if (tf.vsx) { - // REQUIRE(count == 3u); - // } + // if (tf.vsx) { + // REQUIRE(count == 3u); + // } #endif + } } TEST_CASE("vfabi float") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b, c, d] = make_vars("a", "b", "c", "d"); + auto [a, b, c, d] = make_vars("a", "b", "c", "d"); - add_cfunc(s, "cfunc", {log(a), log(b), log(c), log(d)}); + add_cfunc(s, "cfunc", {log(a), log(b), log(c), log(d)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr + = reinterpret_cast(s.jit_lookup("cfunc")); - const std::vector ins{1., 2., 3., 4.}; - std::vector outs(4u, 0.); + const std::vector ins{1., 2., 3., 4.}; + std::vector outs(4u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::log(1.f))); - REQUIRE(outs[1] == approximately(std::log(2.f))); - REQUIRE(outs[2] == approximately(std::log(3.f))); - REQUIRE(outs[3] == approximately(std::log(4.f))); + REQUIRE(outs[0] == approximately(std::log(1.f))); + REQUIRE(outs[1] == approximately(std::log(2.f))); + REQUIRE(outs[2] == approximately(std::log(3.f))); + REQUIRE(outs[3] == approximately(std::log(4.f))); #if defined(HEYOKA_WITH_SLEEF) - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.log.f32", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.log.f32", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 4 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 5u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 4 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 5u); + } #if LLVM_VERSION_MAJOR >= 16 - // NOTE: LLVM16 is currently the version tested in the CI on arm64. - if (tf.aarch64) { - REQUIRE(count == 5u); - } + // NOTE: LLVM16 is currently the version tested in the CI on arm64. + if (tf.aarch64) { + REQUIRE(count == 5u); + } #endif - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. 
- // if (tf.vsx) { - // REQUIRE(count == 5u); - // } + // if (tf.vsx) { + // REQUIRE(count == 5u); + // } #endif + } } diff --git a/test/pow.cpp b/test/pow.cpp index 27fbfbd87..a6e0ae183 100644 --- a/test/pow.cpp +++ b/test/pow.cpp @@ -448,129 +448,133 @@ TEST_CASE("pow overloads") // Tests to check vectorisation via the vector-function-abi-variant machinery. TEST_CASE("vfabi double") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b] = make_vars("a", "b"); + auto [a, b] = make_vars("a", "b"); - add_cfunc(s, "cfunc", {pow(a, .1), pow(b, .2)}); + add_cfunc(s, "cfunc", {pow(a, .1), pow(b, .2)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cfunc")); - const std::vector ins{1., 2.}; - std::vector outs(2u, 0.); + const std::vector ins{1., 2.}; + std::vector outs(2u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::pow(1., .1))); - REQUIRE(outs[1] == approximately(std::pow(2., .2))); + REQUIRE(outs[0] == approximately(std::pow(1., .1))); + REQUIRE(outs[1] == approximately(std::pow(2., .2))); #if defined(HEYOKA_WITH_SLEEF) - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.pow.f64", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.pow.f64", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 2 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 2 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } #if LLVM_VERSION_MAJOR >= 16 - // NOTE: LLVM16 is currently the version tested in the CI on arm64. - if (tf.aarch64) { - REQUIRE(count == 3u); - } + // NOTE: LLVM16 is currently the version tested in the CI on arm64. + if (tf.aarch64) { + REQUIRE(count == 3u); + } #endif - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. 
- // if (tf.vsx) { - // REQUIRE(count == 3u); - // } + // if (tf.vsx) { + // REQUIRE(count == 3u); + // } #endif + } } TEST_CASE("vfabi float") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b, c, d] = make_vars("a", "b", "c", "d"); + auto [a, b, c, d] = make_vars("a", "b", "c", "d"); - add_cfunc(s, "cfunc", {pow(a, .6f), pow(b, .7f), pow(c, .8f), pow(d, .9f)}); + add_cfunc(s, "cfunc", {pow(a, .6f), pow(b, .7f), pow(c, .8f), pow(d, .9f)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr + = reinterpret_cast(s.jit_lookup("cfunc")); - const std::vector ins{.1f, .2f, .3f, .4f}; - std::vector outs(4u, 0.); + const std::vector ins{.1f, .2f, .3f, .4f}; + std::vector outs(4u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::pow(.1f, .6f))); - REQUIRE(outs[1] == approximately(std::pow(.2f, .7f))); - REQUIRE(outs[2] == approximately(std::pow(.3f, .8f))); - REQUIRE(outs[3] == approximately(std::pow(.4f, .9f))); + REQUIRE(outs[0] == approximately(std::pow(.1f, .6f))); + REQUIRE(outs[1] == approximately(std::pow(.2f, .7f))); + REQUIRE(outs[2] == approximately(std::pow(.3f, .8f))); + REQUIRE(outs[3] == approximately(std::pow(.4f, .9f))); #if defined(HEYOKA_WITH_SLEEF) - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.pow.f32", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.pow.f32", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 4 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 5u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 4 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 5u); + } #if LLVM_VERSION_MAJOR >= 16 - if (tf.aarch64) { - REQUIRE(count == 5u); - } + if (tf.aarch64) { + REQUIRE(count == 5u); + } #endif - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 5u); - // } + // if (tf.vsx) { + // REQUIRE(count == 5u); + // } #endif + } } diff --git a/test/sin.cpp b/test/sin.cpp index 8948d49db..ccbf816a2 100644 --- a/test/sin.cpp +++ b/test/sin.cpp @@ -232,338 +232,342 @@ TEST_CASE("normalise") // Tests to check vectorisation via the vector-function-abi-variant machinery. 
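(For reference, the change applied to every vfabi test in this patch is the same: the test body is wrapped in a loop over fast_math = {false, true}, so that the vectorisation and accuracy checks are exercised in both configurations of the llvm_state. The standalone sketch below shows that pattern in isolation. The template arguments and the compiled-function pointer signature are elided in the flattened patch text above; the ones written out here are assumptions based on heyoka's public double-precision cfunc API, not text taken from the patch.)

#include <cmath>
#include <iostream>
#include <vector>

#include <heyoka/heyoka.hpp>

int main()
{
    using namespace heyoka;

    auto [a, b] = make_vars("a", "b");

    for (auto fast_math : {false, true}) {
        // SLP vectorisation groups the scalar calls into SIMD calls; fast_math
        // toggles the relaxed floating-point mode, as in the tests above.
        llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math};

        // Assumed template argument: double-precision compiled function.
        add_cfunc<double>(s, "cfunc", {sin(a), sin(b)});
        s.compile();

        // Assumed cfunc signature: (outputs, inputs, pars, time).
        auto *cf = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(
            s.jit_lookup("cfunc"));

        const std::vector<double> ins{1., 2.};
        std::vector<double> outs(2u, 0.);
        cf(outs.data(), ins.data(), nullptr, nullptr);

        // Compare against the standard library results.
        std::cout << outs[0] - std::sin(1.) << ' ' << outs[1] - std::sin(2.) << '\n';
    }
}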
TEST_CASE("vfabi double") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b] = make_vars("a", "b"); + auto [a, b] = make_vars("a", "b"); - add_cfunc(s, "cfunc", {sin(a), sin(b)}); + add_cfunc(s, "cfunc", {sin(a), sin(b)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cfunc")); - const std::vector ins{1., 2.}; - std::vector outs(2u, 0.); + const std::vector ins{1., 2.}; + std::vector outs(2u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::sin(1.))); - REQUIRE(outs[1] == approximately(std::sin(2.))); + REQUIRE(outs[0] == approximately(std::sin(1.))); + REQUIRE(outs[1] == approximately(std::sin(2.))); #if defined(HEYOKA_WITH_SLEEF) - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.sin.f64", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.sin.f64", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 2 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 2 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } #if LLVM_VERSION_MAJOR >= 16 - // NOTE: LLVM16 is currently the version tested in the CI on arm64. - if (tf.aarch64) { - REQUIRE(count == 3u); - } + // NOTE: LLVM16 is currently the version tested in the CI on arm64. + if (tf.aarch64) { + REQUIRE(count == 3u); + } #endif - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 3u); - // } + // if (tf.vsx) { + // REQUIRE(count == 3u); + // } - // Some more extensive testing specific to x86, only for this function. - auto [c, d, e] = make_vars("c", "d", "e"); + // Some more extensive testing specific to x86, only for this function. 
+ auto [c, d, e] = make_vars("c", "d", "e"); - llvm_state s2{kw::slp_vectorize = true}; + llvm_state s2{kw::slp_vectorize = true}; - add_cfunc(s2, "cfunc1", {sin(a), sin(b), sin(c), sin(d)}); - add_cfunc(s2, "cfunc2", {sin(a), sin(b), sin(c), sin(d), sin(e)}); + add_cfunc(s2, "cfunc1", {sin(a), sin(b), sin(c), sin(d)}); + add_cfunc(s2, "cfunc2", {sin(a), sin(b), sin(c), sin(d), sin(e)}); - s2.compile(); + s2.compile(); - auto *cf1_ptr - = reinterpret_cast(s2.jit_lookup("cfunc1")); - auto *cf2_ptr - = reinterpret_cast(s2.jit_lookup("cfunc2")); + auto *cf1_ptr = reinterpret_cast( + s2.jit_lookup("cfunc1")); + auto *cf2_ptr = reinterpret_cast( + s2.jit_lookup("cfunc2")); - const std::vector ins2{1., 2., 3., 4., 5.}; - std::vector outs2(5u, 0.); + const std::vector ins2{1., 2., 3., 4., 5.}; + std::vector outs2(5u, 0.); - cf1_ptr(outs2.data(), ins2.data(), nullptr, nullptr); + cf1_ptr(outs2.data(), ins2.data(), nullptr, nullptr); - REQUIRE(outs2[0] == approximately(std::sin(1.))); - REQUIRE(outs2[1] == approximately(std::sin(2.))); - REQUIRE(outs2[2] == approximately(std::sin(3.))); - REQUIRE(outs2[3] == approximately(std::sin(4.))); + REQUIRE(outs2[0] == approximately(std::sin(1.))); + REQUIRE(outs2[1] == approximately(std::sin(2.))); + REQUIRE(outs2[2] == approximately(std::sin(3.))); + REQUIRE(outs2[3] == approximately(std::sin(4.))); - cf2_ptr(outs2.data(), ins2.data(), nullptr, nullptr); + cf2_ptr(outs2.data(), ins2.data(), nullptr, nullptr); - REQUIRE(outs2[0] == approximately(std::sin(1.))); - REQUIRE(outs2[1] == approximately(std::sin(2.))); - REQUIRE(outs2[2] == approximately(std::sin(3.))); - REQUIRE(outs2[3] == approximately(std::sin(4.))); - REQUIRE(outs2[4] == approximately(std::sin(5.))); + REQUIRE(outs2[0] == approximately(std::sin(1.))); + REQUIRE(outs2[1] == approximately(std::sin(2.))); + REQUIRE(outs2[2] == approximately(std::sin(3.))); + REQUIRE(outs2[3] == approximately(std::sin(4.))); + REQUIRE(outs2[4] == approximately(std::sin(5.))); - ir = s2.get_ir(); + ir = s2.get_ir(); - count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.sin.f64", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.sin.f64", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - if (tf.avx) { - // NOTE: occurrences of the scalar version: - // - 4 + 5 calls in the strided cfuncs, - // - 1 declaration, - // - 1 call to deal with the remainder in the - // 5-argument version. - REQUIRE(count == 11u); - } + if (tf.avx) { + // NOTE: occurrences of the scalar version: + // - 4 + 5 calls in the strided cfuncs, + // - 1 declaration, + // - 1 call to deal with the remainder in the + // 5-argument version. + REQUIRE(count == 11u); + } - // NOTE: this next test seems to work properly starting - // from LLVM 13. + // NOTE: this next test seems to work properly starting + // from LLVM 13. #if LLVM_VERSION_MAJOR >= 13 - // Check that the autovec works also on batch sizes which do not correspond - // exactly to an available vector width. - llvm_state s3{kw::slp_vectorize = true}; + // Check that the autovec works also on batch sizes which do not correspond + // exactly to an available vector width. 
+ llvm_state s3{kw::slp_vectorize = true}; - add_cfunc(s3, "cfunc", {sin(a)}, kw::batch_size = 3u); + add_cfunc(s3, "cfunc", {sin(a)}, kw::batch_size = 3u); - s3.compile(); + s3.compile(); - auto *cf3_ptr - = reinterpret_cast(s3.jit_lookup("cfunc")); + auto *cf3_ptr = reinterpret_cast( + s3.jit_lookup("cfunc")); - std::vector ins3 = {1., 2., 3.}, outs3 = {0., 0., 0.}; + std::vector ins3 = {1., 2., 3.}, outs3 = {0., 0., 0.}; - cf3_ptr(outs3.data(), ins3.data(), nullptr, nullptr); + cf3_ptr(outs3.data(), ins3.data(), nullptr, nullptr); - REQUIRE(outs3[0] == approximately(std::sin(1.))); - REQUIRE(outs3[1] == approximately(std::sin(2.))); - REQUIRE(outs3[2] == approximately(std::sin(3.))); + REQUIRE(outs3[0] == approximately(std::sin(1.))); + REQUIRE(outs3[1] == approximately(std::sin(2.))); + REQUIRE(outs3[2] == approximately(std::sin(3.))); - ir = s3.get_ir(); + ir = s3.get_ir(); - count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.sin.f64", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.sin.f64", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 1 call in the remainder of the unstrided cfunc, - // - 1 call in the remainder of the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 1 call in the remainder of the unstrided cfunc, + // - 1 call in the remainder of the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } #if LLVM_VERSION_MAJOR >= 16 - if (tf.aarch64) { - REQUIRE(count == 3u); - } + if (tf.aarch64) { + REQUIRE(count == 3u); + } #endif #endif #endif + } } TEST_CASE("vfabi float") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b, c, d] = make_vars("a", "b", "c", "d"); + auto [a, b, c, d] = make_vars("a", "b", "c", "d"); - add_cfunc(s, "cfunc", {sin(a), sin(b), sin(c), sin(d)}); + add_cfunc(s, "cfunc", {sin(a), sin(b), sin(c), sin(d)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr + = reinterpret_cast(s.jit_lookup("cfunc")); - const std::vector ins{1., 2., 3., 4.}; - std::vector outs(4u, 0.); + const std::vector ins{1., 2., 3., 4.}; + std::vector outs(4u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::sin(1.f))); - REQUIRE(outs[1] == approximately(std::sin(2.f))); - REQUIRE(outs[2] == approximately(std::sin(3.f))); - REQUIRE(outs[3] == approximately(std::sin(4.f))); + REQUIRE(outs[0] == approximately(std::sin(1.f))); + REQUIRE(outs[1] == approximately(std::sin(2.f))); + REQUIRE(outs[2] == approximately(std::sin(3.f))); + REQUIRE(outs[3] == approximately(std::sin(4.f))); #if defined(HEYOKA_WITH_SLEEF) - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.sin.f32", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = 
boost::make_find_iterator(ir, boost::first_finder("@llvm.sin.f32", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 4 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 5u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 4 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 5u); + } #if LLVM_VERSION_MAJOR >= 16 - // NOTE: LLVM16 is currently the version tested in the CI on arm64. - if (tf.aarch64) { - REQUIRE(count == 5u); - } + // NOTE: LLVM16 is currently the version tested in the CI on arm64. + if (tf.aarch64) { + REQUIRE(count == 5u); + } #endif - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 5u); - // } + // if (tf.vsx) { + // REQUIRE(count == 5u); + // } - // Some more extensive testing specific to x86, only for this function. - auto [e, f, g, h, i] = make_vars("e", "f", "g", "h", "i"); + // Some more extensive testing specific to x86, only for this function. + auto [e, f, g, h, i] = make_vars("e", "f", "g", "h", "i"); - llvm_state s2{kw::slp_vectorize = true}; + llvm_state s2{kw::slp_vectorize = true}; - add_cfunc(s2, "cfunc1", {sin(a), sin(b), sin(c), sin(d), sin(e), sin(f), sin(g), sin(h)}); - add_cfunc(s2, "cfunc2", {sin(a), sin(b), sin(c), sin(d), sin(e), sin(f), sin(g), sin(h), sin(i)}); + add_cfunc(s2, "cfunc1", {sin(a), sin(b), sin(c), sin(d), sin(e), sin(f), sin(g), sin(h)}); + add_cfunc(s2, "cfunc2", {sin(a), sin(b), sin(c), sin(d), sin(e), sin(f), sin(g), sin(h), sin(i)}); - s2.compile(); + s2.compile(); - auto *cf1_ptr - = reinterpret_cast(s2.jit_lookup("cfunc1")); - auto *cf2_ptr - = reinterpret_cast(s2.jit_lookup("cfunc2")); + auto *cf1_ptr + = reinterpret_cast(s2.jit_lookup("cfunc1")); + auto *cf2_ptr + = reinterpret_cast(s2.jit_lookup("cfunc2")); - const std::vector ins2{1., 2., 3., 4., 5., 6., 7., 8., 9.}; - std::vector outs2(9u, 0.); + const std::vector ins2{1., 2., 3., 4., 5., 6., 7., 8., 9.}; + std::vector outs2(9u, 0.); - cf1_ptr(outs2.data(), ins2.data(), nullptr, nullptr); + cf1_ptr(outs2.data(), ins2.data(), nullptr, nullptr); - REQUIRE(outs2[0] == approximately(std::sin(1.f))); - REQUIRE(outs2[1] == approximately(std::sin(2.f))); - REQUIRE(outs2[2] == approximately(std::sin(3.f))); - REQUIRE(outs2[3] == approximately(std::sin(4.f))); - REQUIRE(outs2[4] == approximately(std::sin(5.f))); - REQUIRE(outs2[5] == approximately(std::sin(6.f))); - REQUIRE(outs2[6] == approximately(std::sin(7.f))); - REQUIRE(outs2[7] == approximately(std::sin(8.f))); + REQUIRE(outs2[0] == approximately(std::sin(1.f))); + REQUIRE(outs2[1] == approximately(std::sin(2.f))); + REQUIRE(outs2[2] == approximately(std::sin(3.f))); + REQUIRE(outs2[3] == approximately(std::sin(4.f))); + REQUIRE(outs2[4] == approximately(std::sin(5.f))); + REQUIRE(outs2[5] == approximately(std::sin(6.f))); + 
REQUIRE(outs2[6] == approximately(std::sin(7.f))); + REQUIRE(outs2[7] == approximately(std::sin(8.f))); - cf2_ptr(outs2.data(), ins2.data(), nullptr, nullptr); + cf2_ptr(outs2.data(), ins2.data(), nullptr, nullptr); - REQUIRE(outs2[0] == approximately(std::sin(1.f))); - REQUIRE(outs2[1] == approximately(std::sin(2.f))); - REQUIRE(outs2[2] == approximately(std::sin(3.f))); - REQUIRE(outs2[3] == approximately(std::sin(4.f))); - REQUIRE(outs2[4] == approximately(std::sin(5.f))); - REQUIRE(outs2[5] == approximately(std::sin(6.f))); - REQUIRE(outs2[6] == approximately(std::sin(7.f))); - REQUIRE(outs2[7] == approximately(std::sin(8.f))); - REQUIRE(outs2[8] == approximately(std::sin(9.f))); + REQUIRE(outs2[0] == approximately(std::sin(1.f))); + REQUIRE(outs2[1] == approximately(std::sin(2.f))); + REQUIRE(outs2[2] == approximately(std::sin(3.f))); + REQUIRE(outs2[3] == approximately(std::sin(4.f))); + REQUIRE(outs2[4] == approximately(std::sin(5.f))); + REQUIRE(outs2[5] == approximately(std::sin(6.f))); + REQUIRE(outs2[6] == approximately(std::sin(7.f))); + REQUIRE(outs2[7] == approximately(std::sin(8.f))); + REQUIRE(outs2[8] == approximately(std::sin(9.f))); - ir = s2.get_ir(); + ir = s2.get_ir(); - count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.sin.f32", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.sin.f32", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - if (tf.avx) { - // NOTE: occurrences of the scalar version: - // - 8 + 9 calls in the strided cfuncs, - // - 1 declaration, - // - 1 call to deal with the remainder in the - // 9-argument version. - REQUIRE(count == 19u); - } + if (tf.avx) { + // NOTE: occurrences of the scalar version: + // - 8 + 9 calls in the strided cfuncs, + // - 1 declaration, + // - 1 call to deal with the remainder in the + // 9-argument version. + REQUIRE(count == 19u); + } - // NOTE: this next test seems to work properly starting - // from LLVM 13. + // NOTE: this next test seems to work properly starting + // from LLVM 13. #if LLVM_VERSION_MAJOR >= 13 - // Check that the autovec works also on batch sizes which do not correspond - // exactly to an available vector width. - llvm_state s3{kw::slp_vectorize = true}; + // Check that the autovec works also on batch sizes which do not correspond + // exactly to an available vector width. 
+ llvm_state s3{kw::slp_vectorize = true}; - add_cfunc(s3, "cfunc", {sin(a)}, kw::batch_size = 5u); + add_cfunc(s3, "cfunc", {sin(a)}, kw::batch_size = 5u); - s3.compile(); + s3.compile(); - auto *cf3_ptr - = reinterpret_cast(s3.jit_lookup("cfunc")); + auto *cf3_ptr + = reinterpret_cast(s3.jit_lookup("cfunc")); - std::vector ins3 = {1., 2., 3., 4., 5.}, outs3 = {0., 0., 0., 0., 0.}; + std::vector ins3 = {1., 2., 3., 4., 5.}, outs3 = {0., 0., 0., 0., 0.}; - cf3_ptr(outs3.data(), ins3.data(), nullptr, nullptr); + cf3_ptr(outs3.data(), ins3.data(), nullptr, nullptr); - REQUIRE(outs3[0] == approximately(std::sin(1.f))); - REQUIRE(outs3[1] == approximately(std::sin(2.f))); - REQUIRE(outs3[2] == approximately(std::sin(3.f))); - REQUIRE(outs3[3] == approximately(std::sin(4.f))); - REQUIRE(outs3[4] == approximately(std::sin(5.f))); + REQUIRE(outs3[0] == approximately(std::sin(1.f))); + REQUIRE(outs3[1] == approximately(std::sin(2.f))); + REQUIRE(outs3[2] == approximately(std::sin(3.f))); + REQUIRE(outs3[3] == approximately(std::sin(4.f))); + REQUIRE(outs3[4] == approximately(std::sin(5.f))); - ir = s3.get_ir(); + ir = s3.get_ir(); - count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.sin.f32", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.sin.f32", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 1 call in the remainder of the unstrided cfunc, - // - 1 call in the remainder of the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 1 call in the remainder of the unstrided cfunc, + // - 1 call in the remainder of the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } #if LLVM_VERSION_MAJOR >= 16 - if (tf.aarch64) { - REQUIRE(count == 3u); - } + if (tf.aarch64) { + REQUIRE(count == 3u); + } #endif #endif #endif + } } diff --git a/test/sinh.cpp b/test/sinh.cpp index a0a218c82..fd8cad405 100644 --- a/test/sinh.cpp +++ b/test/sinh.cpp @@ -223,312 +223,317 @@ TEST_CASE("normalise") // Tests to check vectorisation via the vector-function-abi-variant machinery. TEST_CASE("vfabi double") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b] = make_vars("a", "b"); + auto [a, b] = make_vars("a", "b"); - add_cfunc(s, "cfunc", {sinh(a), sinh(b)}); + add_cfunc(s, "cfunc", {sinh(a), sinh(b)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cfunc")); - const std::vector ins{1., 2.}; - std::vector outs(2u, 0.); + const std::vector ins{1., 2.}; + std::vector outs(2u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::sinh(1.))); - REQUIRE(outs[1] == approximately(std::sinh(2.))); + REQUIRE(outs[0] == approximately(std::sinh(1.))); + REQUIRE(outs[1] == approximately(std::sinh(2.))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. 
#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@sinh", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@sinh", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 2 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 2 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } - if (tf.aarch64) { - REQUIRE(count == 3u); - } + if (tf.aarch64) { + REQUIRE(count == 3u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 3u); - // } + // if (tf.vsx) { + // REQUIRE(count == 3u); + // } - // Some more extensive testing specific to x86, only for this function. - auto [c, d, e] = make_vars("c", "d", "e"); + // Some more extensive testing specific to x86, only for this function. 
+ auto [c, d, e] = make_vars("c", "d", "e"); - llvm_state s2{kw::slp_vectorize = true}; + llvm_state s2{kw::slp_vectorize = true}; - add_cfunc(s2, "cfunc1", {sinh(a), sinh(b), sinh(c), sinh(d)}); - add_cfunc(s2, "cfunc2", {sinh(a), sinh(b), sinh(c), sinh(d), sinh(e)}); + add_cfunc(s2, "cfunc1", {sinh(a), sinh(b), sinh(c), sinh(d)}); + add_cfunc(s2, "cfunc2", {sinh(a), sinh(b), sinh(c), sinh(d), sinh(e)}); - s2.compile(); + s2.compile(); - auto *cf1_ptr - = reinterpret_cast(s2.jit_lookup("cfunc1")); - auto *cf2_ptr - = reinterpret_cast(s2.jit_lookup("cfunc2")); + auto *cf1_ptr = reinterpret_cast( + s2.jit_lookup("cfunc1")); + auto *cf2_ptr = reinterpret_cast( + s2.jit_lookup("cfunc2")); - const std::vector ins2{1., 2., 3., 4., 5.}; - std::vector outs2(5u, 0.); + const std::vector ins2{1., 2., 3., 4., 5.}; + std::vector outs2(5u, 0.); - cf1_ptr(outs2.data(), ins2.data(), nullptr, nullptr); + cf1_ptr(outs2.data(), ins2.data(), nullptr, nullptr); - REQUIRE(outs2[0] == approximately(std::sinh(1.))); - REQUIRE(outs2[1] == approximately(std::sinh(2.))); - REQUIRE(outs2[2] == approximately(std::sinh(3.))); - REQUIRE(outs2[3] == approximately(std::sinh(4.))); + REQUIRE(outs2[0] == approximately(std::sinh(1.))); + REQUIRE(outs2[1] == approximately(std::sinh(2.))); + REQUIRE(outs2[2] == approximately(std::sinh(3.))); + REQUIRE(outs2[3] == approximately(std::sinh(4.))); - cf2_ptr(outs2.data(), ins2.data(), nullptr, nullptr); + cf2_ptr(outs2.data(), ins2.data(), nullptr, nullptr); - REQUIRE(outs2[0] == approximately(std::sinh(1.))); - REQUIRE(outs2[1] == approximately(std::sinh(2.))); - REQUIRE(outs2[2] == approximately(std::sinh(3.))); - REQUIRE(outs2[3] == approximately(std::sinh(4.))); - REQUIRE(outs2[4] == approximately(std::sinh(5.))); + REQUIRE(outs2[0] == approximately(std::sinh(1.))); + REQUIRE(outs2[1] == approximately(std::sinh(2.))); + REQUIRE(outs2[2] == approximately(std::sinh(3.))); + REQUIRE(outs2[3] == approximately(std::sinh(4.))); + REQUIRE(outs2[4] == approximately(std::sinh(5.))); - ir = s2.get_ir(); + ir = s2.get_ir(); - count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@sinh", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@sinh", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - if (tf.avx) { - // NOTE: occurrences of the scalar version: - // - 4 + 5 calls in the strided cfuncs, - // - 1 declaration, - // - 1 call to deal with the remainder in the - // 5-argument version. - REQUIRE(count == 11u); - } + if (tf.avx) { + // NOTE: occurrences of the scalar version: + // - 4 + 5 calls in the strided cfuncs, + // - 1 declaration, + // - 1 call to deal with the remainder in the + // 5-argument version. + REQUIRE(count == 11u); + } - // Check that the autovec works also on batch sizes which do not correspond - // exactly to an available vector width. - llvm_state s3{kw::slp_vectorize = true}; + // Check that the autovec works also on batch sizes which do not correspond + // exactly to an available vector width. 
+ llvm_state s3{kw::slp_vectorize = true}; - add_cfunc(s3, "cfunc", {sinh(a)}, kw::batch_size = 3u); + add_cfunc(s3, "cfunc", {sinh(a)}, kw::batch_size = 3u); - s3.compile(); + s3.compile(); - auto *cf3_ptr - = reinterpret_cast(s3.jit_lookup("cfunc")); + auto *cf3_ptr = reinterpret_cast( + s3.jit_lookup("cfunc")); - std::vector ins3 = {1., 2., 3.}, outs3 = {0., 0., 0.}; + std::vector ins3 = {1., 2., 3.}, outs3 = {0., 0., 0.}; - cf3_ptr(outs3.data(), ins3.data(), nullptr, nullptr); + cf3_ptr(outs3.data(), ins3.data(), nullptr, nullptr); - REQUIRE(outs3[0] == approximately(std::sinh(1.))); - REQUIRE(outs3[1] == approximately(std::sinh(2.))); - REQUIRE(outs3[2] == approximately(std::sinh(3.))); + REQUIRE(outs3[0] == approximately(std::sinh(1.))); + REQUIRE(outs3[1] == approximately(std::sinh(2.))); + REQUIRE(outs3[2] == approximately(std::sinh(3.))); - ir = s3.get_ir(); + ir = s3.get_ir(); - count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@sinh", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@sinh", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 1 call in the remainder of the unstrided cfunc, - // - 1 call in the remainder of the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 1 call in the remainder of the unstrided cfunc, + // - 1 call in the remainder of the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } - if (tf.aarch64) { - REQUIRE(count == 3u); - } + if (tf.aarch64) { + REQUIRE(count == 3u); + } #endif + } } TEST_CASE("vfabi float") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b, c, d] = make_vars("a", "b", "c", "d"); + auto [a, b, c, d] = make_vars("a", "b", "c", "d"); - add_cfunc(s, "cfunc", {sinh(a), sinh(b), sinh(c), sinh(d)}); + add_cfunc(s, "cfunc", {sinh(a), sinh(b), sinh(c), sinh(d)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr + = reinterpret_cast(s.jit_lookup("cfunc")); - const std::vector ins{1., 2., 3., 4.}; - std::vector outs(4u, 0.); + const std::vector ins{1., 2., 3., 4.}; + std::vector outs(4u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::sinh(1.f))); - REQUIRE(outs[1] == approximately(std::sinh(2.f))); - REQUIRE(outs[2] == approximately(std::sinh(3.f))); - REQUIRE(outs[3] == approximately(std::sinh(4.f))); + REQUIRE(outs[0] == approximately(std::sinh(1.f))); + REQUIRE(outs[1] == approximately(std::sinh(2.f))); + REQUIRE(outs[2] == approximately(std::sinh(3.f))); + REQUIRE(outs[3] == approximately(std::sinh(4.f))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. 
#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@sinhf", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@sinhf", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 4 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 5u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 4 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 5u); + } - if (tf.aarch64) { - REQUIRE(count == 5u); - } + if (tf.aarch64) { + REQUIRE(count == 5u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 3u); - // } + // if (tf.vsx) { + // REQUIRE(count == 3u); + // } - // Some more extensive testing specific to x86, only for this function. - auto [e, f, g, h, i] = make_vars("e", "f", "g", "h", "i"); + // Some more extensive testing specific to x86, only for this function. 
+ auto [e, f, g, h, i] = make_vars("e", "f", "g", "h", "i"); - llvm_state s2{kw::slp_vectorize = true}; + llvm_state s2{kw::slp_vectorize = true}; - add_cfunc(s2, "cfunc1", {sinh(a), sinh(b), sinh(c), sinh(d), sinh(e), sinh(f), sinh(g), sinh(h)}); - add_cfunc(s2, "cfunc2", {sinh(a), sinh(b), sinh(c), sinh(d), sinh(e), sinh(f), sinh(g), sinh(h), sinh(i)}); + add_cfunc(s2, "cfunc1", {sinh(a), sinh(b), sinh(c), sinh(d), sinh(e), sinh(f), sinh(g), sinh(h)}); + add_cfunc(s2, "cfunc2", + {sinh(a), sinh(b), sinh(c), sinh(d), sinh(e), sinh(f), sinh(g), sinh(h), sinh(i)}); - s2.compile(); + s2.compile(); - auto *cf1_ptr - = reinterpret_cast(s2.jit_lookup("cfunc1")); - auto *cf2_ptr - = reinterpret_cast(s2.jit_lookup("cfunc2")); + auto *cf1_ptr + = reinterpret_cast(s2.jit_lookup("cfunc1")); + auto *cf2_ptr + = reinterpret_cast(s2.jit_lookup("cfunc2")); - const std::vector ins2{1., 2., 3., 4., 5., 6., 7., 8., 9.}; - std::vector outs2(9u, 0.); + const std::vector ins2{1., 2., 3., 4., 5., 6., 7., 8., 9.}; + std::vector outs2(9u, 0.); - cf1_ptr(outs2.data(), ins2.data(), nullptr, nullptr); + cf1_ptr(outs2.data(), ins2.data(), nullptr, nullptr); - REQUIRE(outs2[0] == approximately(std::sinh(1.f))); - REQUIRE(outs2[1] == approximately(std::sinh(2.f))); - REQUIRE(outs2[2] == approximately(std::sinh(3.f))); - REQUIRE(outs2[3] == approximately(std::sinh(4.f))); - REQUIRE(outs2[4] == approximately(std::sinh(5.f))); - REQUIRE(outs2[5] == approximately(std::sinh(6.f))); - REQUIRE(outs2[6] == approximately(std::sinh(7.f))); - REQUIRE(outs2[7] == approximately(std::sinh(8.f))); + REQUIRE(outs2[0] == approximately(std::sinh(1.f))); + REQUIRE(outs2[1] == approximately(std::sinh(2.f))); + REQUIRE(outs2[2] == approximately(std::sinh(3.f))); + REQUIRE(outs2[3] == approximately(std::sinh(4.f))); + REQUIRE(outs2[4] == approximately(std::sinh(5.f))); + REQUIRE(outs2[5] == approximately(std::sinh(6.f))); + REQUIRE(outs2[6] == approximately(std::sinh(7.f))); + REQUIRE(outs2[7] == approximately(std::sinh(8.f))); - cf2_ptr(outs2.data(), ins2.data(), nullptr, nullptr); + cf2_ptr(outs2.data(), ins2.data(), nullptr, nullptr); - REQUIRE(outs2[0] == approximately(std::sinh(1.f))); - REQUIRE(outs2[1] == approximately(std::sinh(2.f))); - REQUIRE(outs2[2] == approximately(std::sinh(3.f))); - REQUIRE(outs2[3] == approximately(std::sinh(4.f))); - REQUIRE(outs2[4] == approximately(std::sinh(5.f))); - REQUIRE(outs2[5] == approximately(std::sinh(6.f))); - REQUIRE(outs2[6] == approximately(std::sinh(7.f))); - REQUIRE(outs2[7] == approximately(std::sinh(8.f))); - REQUIRE(outs2[8] == approximately(std::sinh(9.f))); + REQUIRE(outs2[0] == approximately(std::sinh(1.f))); + REQUIRE(outs2[1] == approximately(std::sinh(2.f))); + REQUIRE(outs2[2] == approximately(std::sinh(3.f))); + REQUIRE(outs2[3] == approximately(std::sinh(4.f))); + REQUIRE(outs2[4] == approximately(std::sinh(5.f))); + REQUIRE(outs2[5] == approximately(std::sinh(6.f))); + REQUIRE(outs2[6] == approximately(std::sinh(7.f))); + REQUIRE(outs2[7] == approximately(std::sinh(8.f))); + REQUIRE(outs2[8] == approximately(std::sinh(9.f))); - ir = s2.get_ir(); + ir = s2.get_ir(); - count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@sinhf", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@sinhf", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - if (tf.avx) { - // NOTE: occurrences of the scalar version: - // - 8 + 9 calls 
in the strided cfuncs, - // - 1 declaration, - // - 1 call to deal with the remainder in the - // 9-argument version. - REQUIRE(count == 19u); - } + if (tf.avx) { + // NOTE: occurrences of the scalar version: + // - 8 + 9 calls in the strided cfuncs, + // - 1 declaration, + // - 1 call to deal with the remainder in the + // 9-argument version. + REQUIRE(count == 19u); + } - // Check that the autovec works also on batch sizes which do not correspond - // exactly to an available vector width. - llvm_state s3{kw::slp_vectorize = true}; + // Check that the autovec works also on batch sizes which do not correspond + // exactly to an available vector width. + llvm_state s3{kw::slp_vectorize = true}; - add_cfunc(s3, "cfunc", {sinh(a)}, kw::batch_size = 5u); + add_cfunc(s3, "cfunc", {sinh(a)}, kw::batch_size = 5u); - s3.compile(); + s3.compile(); - auto *cf3_ptr - = reinterpret_cast(s3.jit_lookup("cfunc")); + auto *cf3_ptr + = reinterpret_cast(s3.jit_lookup("cfunc")); - std::vector ins3 = {1., 2., 3., 4., 5.}, outs3 = {0., 0., 0., 0., 0.}; + std::vector ins3 = {1., 2., 3., 4., 5.}, outs3 = {0., 0., 0., 0., 0.}; - cf3_ptr(outs3.data(), ins3.data(), nullptr, nullptr); + cf3_ptr(outs3.data(), ins3.data(), nullptr, nullptr); - REQUIRE(outs3[0] == approximately(std::sinh(1.f))); - REQUIRE(outs3[1] == approximately(std::sinh(2.f))); - REQUIRE(outs3[2] == approximately(std::sinh(3.f))); - REQUIRE(outs3[3] == approximately(std::sinh(4.f))); - REQUIRE(outs3[4] == approximately(std::sinh(5.f))); + REQUIRE(outs3[0] == approximately(std::sinh(1.f))); + REQUIRE(outs3[1] == approximately(std::sinh(2.f))); + REQUIRE(outs3[2] == approximately(std::sinh(3.f))); + REQUIRE(outs3[3] == approximately(std::sinh(4.f))); + REQUIRE(outs3[4] == approximately(std::sinh(5.f))); - ir = s3.get_ir(); + ir = s3.get_ir(); - count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@sinhf", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@sinhf", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 1 call in the remainder of the unstrided cfunc, - // - 1 call in the remainder of the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 1 call in the remainder of the unstrided cfunc, + // - 1 call in the remainder of the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } - if (tf.aarch64) { - REQUIRE(count == 3u); - } + if (tf.aarch64) { + REQUIRE(count == 3u); + } #endif + } } diff --git a/test/tan.cpp b/test/tan.cpp index 66788af21..57a08d954 100644 --- a/test/tan.cpp +++ b/test/tan.cpp @@ -222,124 +222,128 @@ TEST_CASE("normalise") // Tests to check vectorisation via the vector-function-abi-variant machinery. 
TEST_CASE("vfabi double") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b] = make_vars("a", "b"); + auto [a, b] = make_vars("a", "b"); - add_cfunc(s, "cfunc", {tan(a), tan(b)}); + add_cfunc(s, "cfunc", {tan(a), tan(b)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cfunc")); - const std::vector ins{.1, .2}; - std::vector outs(2u, 0.); + const std::vector ins{.1, .2}; + std::vector outs(2u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::tan(.1))); - REQUIRE(outs[1] == approximately(std::tan(.2))); + REQUIRE(outs[0] == approximately(std::tan(.1))); + REQUIRE(outs[1] == approximately(std::tan(.2))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. #if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@tan", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@tan", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 2 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 2 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } - if (tf.aarch64) { - REQUIRE(count == 3u); - } + if (tf.aarch64) { + REQUIRE(count == 3u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. 
- // if (tf.vsx) { - // REQUIRE(count == 3u); - // } + // if (tf.vsx) { + // REQUIRE(count == 3u); + // } #endif + } } TEST_CASE("vfabi float") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b, c, d] = make_vars("a", "b", "c", "d"); + auto [a, b, c, d] = make_vars("a", "b", "c", "d"); - add_cfunc(s, "cfunc", {tan(a), tan(b), tan(c), tan(d)}); + add_cfunc(s, "cfunc", {tan(a), tan(b), tan(c), tan(d)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr + = reinterpret_cast(s.jit_lookup("cfunc")); - const std::vector ins{1., 2., 3., 4.}; - std::vector outs(4u, 0.); + const std::vector ins{1., 2., 3., 4.}; + std::vector outs(4u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::tan(1.f))); - REQUIRE(outs[1] == approximately(std::tan(2.f))); - REQUIRE(outs[2] == approximately(std::tan(3.f))); - REQUIRE(outs[3] == approximately(std::tan(4.f))); + REQUIRE(outs[0] == approximately(std::tan(1.f))); + REQUIRE(outs[1] == approximately(std::tan(2.f))); + REQUIRE(outs[2] == approximately(std::tan(3.f))); + REQUIRE(outs[3] == approximately(std::tan(4.f))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. #if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@tanf", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@tanf", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 4 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 5u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 4 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 5u); + } - if (tf.aarch64) { - REQUIRE(count == 5u); - } + if (tf.aarch64) { + REQUIRE(count == 5u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. 
- // if (tf.vsx) { - // REQUIRE(count == 5u); - // } + // if (tf.vsx) { + // REQUIRE(count == 5u); + // } #endif + } } diff --git a/test/tanh.cpp b/test/tanh.cpp index 43b6f6b2b..b68f011a0 100644 --- a/test/tanh.cpp +++ b/test/tanh.cpp @@ -223,124 +223,128 @@ TEST_CASE("normalise") // Tests to check vectorisation via the vector-function-abi-variant machinery. TEST_CASE("vfabi double") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b] = make_vars("a", "b"); + auto [a, b] = make_vars("a", "b"); - add_cfunc(s, "cfunc", {tanh(a), tanh(b)}); + add_cfunc(s, "cfunc", {tanh(a), tanh(b)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cfunc")); - const std::vector ins{.1, .2}; - std::vector outs(2u, 0.); + const std::vector ins{.1, .2}; + std::vector outs(2u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::tanh(.1))); - REQUIRE(outs[1] == approximately(std::tanh(.2))); + REQUIRE(outs[0] == approximately(std::tanh(.1))); + REQUIRE(outs[1] == approximately(std::tanh(.2))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. #if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@tanh", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@tanh", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 2 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 2 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } - if (tf.aarch64) { - REQUIRE(count == 3u); - } + if (tf.aarch64) { + REQUIRE(count == 3u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. 
- // if (tf.vsx) { - // REQUIRE(count == 3u); - // } + // if (tf.vsx) { + // REQUIRE(count == 3u); + // } #endif + } } TEST_CASE("vfabi float") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b, c, d] = make_vars("a", "b", "c", "d"); + auto [a, b, c, d] = make_vars("a", "b", "c", "d"); - add_cfunc(s, "cfunc", {tanh(a), tanh(b), tanh(c), tanh(d)}); + add_cfunc(s, "cfunc", {tanh(a), tanh(b), tanh(c), tanh(d)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr + = reinterpret_cast(s.jit_lookup("cfunc")); - const std::vector ins{1., 2., 3., 4.}; - std::vector outs(4u, 0.); + const std::vector ins{1., 2., 3., 4.}; + std::vector outs(4u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::tanh(1.f))); - REQUIRE(outs[1] == approximately(std::tanh(2.f))); - REQUIRE(outs[2] == approximately(std::tanh(3.f))); - REQUIRE(outs[3] == approximately(std::tanh(4.f))); + REQUIRE(outs[0] == approximately(std::tanh(1.f))); + REQUIRE(outs[1] == approximately(std::tanh(2.f))); + REQUIRE(outs[2] == approximately(std::tanh(3.f))); + REQUIRE(outs[3] == approximately(std::tanh(4.f))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. #if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@tanhf", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@tanhf", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 4 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 5u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 4 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 5u); + } - if (tf.aarch64) { - REQUIRE(count == 5u); - } + if (tf.aarch64) { + REQUIRE(count == 5u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. 
-    // if (tf.vsx) {
-    //     REQUIRE(count == 5u);
-    // }
+        // if (tf.vsx) {
+        //     REQUIRE(count == 5u);
+        // }

 #endif
+    }
 }
diff --git a/test/taylor_sincos.cpp b/test/taylor_sincos.cpp
index 8efebd5fd..a8dc59bf0 100644
--- a/test/taylor_sincos.cpp
+++ b/test/taylor_sincos.cpp
@@ -47,10 +47,11 @@ const auto fp_types = std::tuple{};

 template <typename T, typename U>
-void compare_batch_scalar(std::initializer_list<U> sys, unsigned opt_level, bool high_accuracy, bool compact_mode)
+void compare_batch_scalar(std::initializer_list<U> sys, unsigned opt_level, bool high_accuracy, bool compact_mode,
+                          bool fast_math)
 {
     for (auto batch_size : {2u, 4u, 8u, 5u}) {
-        llvm_state s{kw::opt_level = opt_level};
+        llvm_state s{kw::opt_level = opt_level, kw::fast_math = fast_math};

         taylor_add_jet<T>(s, "jet_batch", sys, 3, batch_size, high_accuracy, compact_mode);
         taylor_add_jet<T>(s, "jet_scalar", sys, 3, 1, high_accuracy, compact_mode);
@@ -98,7 +99,7 @@ TEST_CASE("taylor sincos decompose bug 00")

 TEST_CASE("taylor sincos")
 {
-    auto tester = [](auto fp_x, unsigned opt_level, bool high_accuracy, bool compact_mode) {
+    auto tester = [](auto fp_x, unsigned opt_level, bool high_accuracy, bool compact_mode, bool fast_math) {
         using std::sin;
         using std::cos;

@@ -108,7 +109,7 @@ TEST_CASE("taylor sincos")

         // Number-number tests.
         {
-            llvm_state s{kw::opt_level = opt_level};
+            llvm_state s{kw::opt_level = opt_level, kw::fast_math = fast_math};

             taylor_add_jet<fp_t>(s, "jet", {sin(expression{number{fp_t{2}}}) + cos(expression{number{fp_t{3}}}), x + y},
                                  1, 1, high_accuracy, compact_mode);
@@ -129,7 +130,7 @@ TEST_CASE("taylor sincos")
         }

         {
-            llvm_state s{kw::opt_level = opt_level};
+            llvm_state s{kw::opt_level = opt_level, kw::fast_math = fast_math};

             taylor_add_jet<fp_t>(s, "jet", {sin(par[0]) + cos(par[1]), x + y}, 1, 1, high_accuracy, compact_mode);
@@ -151,7 +152,7 @@ TEST_CASE("taylor sincos")
         }

         {
-            llvm_state s{kw::opt_level = opt_level};
+            llvm_state s{kw::opt_level = opt_level, kw::fast_math = fast_math};

             taylor_add_jet<fp_t>(s, "jet", {sin(expression{number{fp_t{2}}}) + cos(expression{number{fp_t{3}}}), x + y},
                                  1, 2, high_accuracy, compact_mode);
@@ -179,7 +180,7 @@ TEST_CASE("taylor sincos")
         }

         {
-            llvm_state s{kw::opt_level = opt_level};
+            llvm_state s{kw::opt_level = opt_level, kw::fast_math = fast_math};

             taylor_add_jet<fp_t>(s, "jet", {sin(par[0]) + cos(par[1]), x + y}, 1, 2, high_accuracy, compact_mode);
@@ -208,7 +209,7 @@ TEST_CASE("taylor sincos")
         }

         {
-            llvm_state s{kw::opt_level = opt_level};
+            llvm_state s{kw::opt_level = opt_level, kw::fast_math = fast_math};

             taylor_add_jet<fp_t>(s, "jet", {sin(expression{number{fp_t{2}}}) + cos(expression{number{fp_t{3}}}), x + y},
                                  2, 1, high_accuracy, compact_mode);
@@ -231,7 +232,7 @@ TEST_CASE("taylor sincos")
         }

         {
-            llvm_state s{kw::opt_level = opt_level};
+            llvm_state s{kw::opt_level = opt_level, kw::fast_math = fast_math};

             taylor_add_jet<fp_t>(s, "jet", {sin(expression{number{fp_t{2}}}) + cos(expression{number{fp_t{3}}}), x + y},
                                  2, 2, high_accuracy, compact_mode);
@@ -265,7 +266,7 @@ TEST_CASE("taylor sincos")
         }

         {
-            llvm_state s{kw::opt_level = opt_level};
+            llvm_state s{kw::opt_level = opt_level, kw::fast_math = fast_math};

             taylor_add_jet<fp_t>(s, "jet", {sin(expression{number{fp_t{2}}}) + cos(expression{number{fp_t{3}}}), x + y},
                                  3, 3, high_accuracy, compact_mode);
@@ -313,7 +314,7 @@ TEST_CASE("taylor sincos")
         }

         {
-            llvm_state s{kw::opt_level = opt_level};
+            llvm_state s{kw::opt_level = opt_level, kw::fast_math = fast_math};

             taylor_add_jet<fp_t>(s, "jet", {sin(par[0]) + cos(par[1]), x + y}, 3, 3, high_accuracy, compact_mode);
@@ -363,11 +364,11 @@ TEST_CASE("taylor sincos")
@@ TEST_CASE("taylor sincos") // Do the batch/scalar comparison. compare_batch_scalar({sin(expression{number{fp_t{2}}}) + cos(expression{number{fp_t{3}}}), x + y}, - opt_level, high_accuracy, compact_mode); + opt_level, high_accuracy, compact_mode, fast_math); // Variable tests. { - llvm_state s{kw::opt_level = opt_level}; + llvm_state s{kw::opt_level = opt_level, kw::fast_math = fast_math}; taylor_add_jet(s, "jet", {sin(y + 1_dbl), cos(x + 1_dbl)}, 1, 1, high_accuracy, compact_mode); @@ -387,7 +388,7 @@ TEST_CASE("taylor sincos") } { - llvm_state s{kw::opt_level = opt_level}; + llvm_state s{kw::opt_level = opt_level, kw::fast_math = fast_math}; taylor_add_jet(s, "jet", {sin(y + 1_dbl), cos(x + 1_dbl)}, 1, 2, high_accuracy, compact_mode); @@ -414,7 +415,7 @@ TEST_CASE("taylor sincos") } { - llvm_state s{kw::opt_level = opt_level}; + llvm_state s{kw::opt_level = opt_level, kw::fast_math = fast_math}; taylor_add_jet(s, "jet", {sin(y + 1_dbl), cos(x + 1_dbl)}, 2, 1, high_accuracy, compact_mode); @@ -436,7 +437,7 @@ TEST_CASE("taylor sincos") } { - llvm_state s{kw::opt_level = opt_level}; + llvm_state s{kw::opt_level = opt_level, kw::fast_math = fast_math}; taylor_add_jet(s, "jet", {sin(y), cos(x)}, 2, 2, high_accuracy, compact_mode); @@ -469,7 +470,7 @@ TEST_CASE("taylor sincos") } { - llvm_state s{kw::opt_level = opt_level}; + llvm_state s{kw::opt_level = opt_level, kw::fast_math = fast_math}; taylor_add_jet(s, "jet", {sin(y), cos(x)}, 3, 3, high_accuracy, compact_mode); @@ -522,15 +523,17 @@ TEST_CASE("taylor sincos") } // Do the batch/scalar comparison. - compare_batch_scalar({sin(y), cos(x)}, opt_level, high_accuracy, compact_mode); + compare_batch_scalar({sin(y), cos(x)}, opt_level, high_accuracy, compact_mode, fast_math); }; for (auto cm : {false, true}) { for (auto f : {false, true}) { - tuple_for_each(fp_types, [&tester, f, cm](auto x) { tester(x, 0, f, cm); }); - tuple_for_each(fp_types, [&tester, f, cm](auto x) { tester(x, 1, f, cm); }); - tuple_for_each(fp_types, [&tester, f, cm](auto x) { tester(x, 2, f, cm); }); - tuple_for_each(fp_types, [&tester, f, cm](auto x) { tester(x, 3, f, cm); }); + for (auto fm : {false, true}) { + tuple_for_each(fp_types, [&tester, f, cm, fm](auto x) { tester(x, 0, f, cm, fm); }); + tuple_for_each(fp_types, [&tester, f, cm, fm](auto x) { tester(x, 1, f, cm, fm); }); + tuple_for_each(fp_types, [&tester, f, cm, fm](auto x) { tester(x, 2, f, cm, fm); }); + tuple_for_each(fp_types, [&tester, f, cm, fm](auto x) { tester(x, 3, f, cm, fm); }); + } } } } diff --git a/tutorial/CMakeLists.txt b/tutorial/CMakeLists.txt index b027f63b5..1c8b2fd95 100644 --- a/tutorial/CMakeLists.txt +++ b/tutorial/CMakeLists.txt @@ -29,6 +29,7 @@ ADD_HEYOKA_TUTORIAL(s11n_event) ADD_HEYOKA_TUTORIAL(ensemble) ADD_HEYOKA_TUTORIAL(par_mode) ADD_HEYOKA_TUTORIAL(extended_precision) +ADD_HEYOKA_TUTORIAL(single_precision) if(HEYOKA_WITH_MPPP AND mp++_WITH_MPFR) ADD_HEYOKA_TUTORIAL(arbitrary_precision) diff --git a/tutorial/single_precision.cpp b/tutorial/single_precision.cpp new file mode 100644 index 000000000..f3bb901be --- /dev/null +++ b/tutorial/single_precision.cpp @@ -0,0 +1,50 @@ +// Copyright 2020, 2021, 2022, 2023 Francesco Biscani (bluescarni@gmail.com), Dario Izzo (dario.izzo@gmail.com) +// +// This file is part of the heyoka library. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+
+#include <cmath>
+#include <iostream>
+
+#include <heyoka/heyoka.hpp>
+
+using namespace heyoka;
+
+int main()
+{
+    // Create the symbolic variables x and v.
+    auto [x, v] = make_vars("x", "v");
+
+    // Create the integrator object
+    // in single precision.
+    auto ta = taylor_adaptive<float>{// Definition of the ODE system:
+                                     // x' = v
+                                     // v' = -9.8 * sin(x)
+                                     {prime(x) = v, prime(v) = -9.8 * sin(x)},
+                                     // Initial conditions
+                                     // for x and v.
+                                     {-1.f, 0.f}};
+
+    // Create a small helper to compute the energy constant
+    // from the state vector.
+    auto compute_energy = [](const auto &sv) {
+        using std::cos;
+
+        return (sv[1] * sv[1]) / 2 + 9.8 * (1 - cos(sv[0]));
+    };
+
+    // Compute and store the initial energy.
+    const auto orig_E = compute_energy(ta.get_state());
+
+    // Integrate for a few timesteps.
+    for (auto i = 0; i < 20; ++i) {
+        using std::abs;
+
+        ta.step();
+
+        std::cout << "Relative energy error: " << abs((orig_E - compute_energy(ta.get_state())) / orig_E) << '\n';
+    }
+}
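
As a companion to the new tutorial and to the slp_vectorize/fast_math combinations exercised by the updated vfabi tests, here is a minimal standalone sketch (not part of the patch) of how the two options can be combined with a single-precision compiled function. The master header name and the cfunc pointer signature (outputs, inputs, parameters, time) are assumptions inferred from the usage shown above, so treat this strictly as an illustration of the tested pattern.

// Minimal sketch: single-precision cfunc with SLP vectorisation and fast math enabled.
#include <cmath>
#include <iostream>
#include <vector>

#include <heyoka/heyoka.hpp>

using namespace heyoka;

int main()
{
    auto [a, b, c, d] = make_vars("a", "b", "c", "d");

    // Enable both SLP vectorisation and fast math, as in the test loops above.
    llvm_state s{kw::slp_vectorize = true, kw::fast_math = true};

    // Compile a single-precision function returning the sine of each input.
    add_cfunc<float>(s, "cfunc", {sin(a), sin(b), sin(c), sin(d)});
    s.compile();

    // Fetch the compiled function: outputs, inputs, parameters, time.
    auto *cf_ptr
        = reinterpret_cast<void (*)(float *, const float *, const float *, const float *)>(s.jit_lookup("cfunc"));

    const std::vector<float> ins{1.f, 2.f, 3.f, 4.f};
    std::vector<float> outs(4u, 0.f);

    cf_ptr(outs.data(), ins.data(), nullptr, nullptr);

    // Fast math relaxes floating-point strictness, so the results may differ
    // slightly from the standard library values.
    for (auto i = 0u; i < 4u; ++i) {
        std::cout << outs[i] << " vs " << std::sin(ins[i]) << '\n';
    }
}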