diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index e2b612f5a..6b33407a6 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -56,6 +56,7 @@ ADD_HEYOKA_BENCHMARK(event_overhead)
 ADD_HEYOKA_BENCHMARK(ss_event_overhead)
 ADD_HEYOKA_BENCHMARK(h_oscillator_lt)
 ADD_HEYOKA_BENCHMARK(mb)
+ADD_HEYOKA_BENCHMARK(kepE_bench)
 ADD_HEYOKA_BENCHMARK(vsop2013_elliptic)
 ADD_HEYOKA_BENCHMARK(vsop2013_cartesian)
 ADD_HEYOKA_BENCHMARK(elp2000_cartesian)
diff --git a/benchmark/kepE_bench.cpp b/benchmark/kepE_bench.cpp
new file mode 100644
index 000000000..ab5b9f567
--- /dev/null
+++ b/benchmark/kepE_bench.cpp
@@ -0,0 +1,126 @@
+// Copyright 2020, 2021, 2022, 2023 Francesco Biscani (bluescarni@gmail.com), Dario Izzo (dario.izzo@gmail.com)
+//
+// This file is part of the heyoka library.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <algorithm>
+#include <cmath>
+#include <iostream>
+#include <random>
+#include <stdexcept>
+#include <vector>
+
+#include <boost/math/constants/constants.hpp>
+#include <boost/program_options.hpp>
+
+#include <fmt/format.h>
+
+#include <spdlog/spdlog.h>
+#include <spdlog/stopwatch.h>
+
+#include <heyoka/expression.hpp>
+#include <heyoka/kw.hpp>
+#include <heyoka/llvm_state.hpp>
+#include <heyoka/logging.hpp>
+#include <heyoka/math/kepE.hpp>
+#include <heyoka/variable.hpp>
+
+using namespace heyoka;
+
+int main(int argc, char *argv[])
+{
+    namespace po = boost::program_options;
+
+    double ecc{};
+    unsigned seed{};
+    bool fast_math{};
+
+    po::options_description desc("Options");
+
+    desc.add_options()("help", "produce help message")("ecc", po::value<double>(&ecc)->default_value(0.1),
+                                                       "eccentricity")(
+        "seed", po::value<unsigned>(&seed)->default_value(42u),
+        "random seed")("fast-math", po::value<bool>(&fast_math)->default_value(true), "fast math mode");
+
+    po::variables_map vm;
+    po::store(po::parse_command_line(argc, argv, desc), vm);
+    po::notify(vm);
+
+    if (vm.count("help") != 0u) {
+        std::cout << desc << "\n";
+        return 0;
+    }
+
+    if (!std::isfinite(ecc) || ecc < 0 || ecc >= 1) {
+        throw std::invalid_argument(fmt::format("Invalid eccentricity value: {}", ecc));
+    }
+
+    constexpr auto N = 1'000'000ul;
+
+    std::cout << std::boolalpha;
+    std::cout << "Eccentricity: " << ecc << '\n';
+    std::cout << "fast_math : " << fast_math << '\n';
+    std::cout << "N : " << N << "\n\n";
+
+    // RNG setup.
+    std::mt19937 rng(seed);
+    std::uniform_real_distribution<double> Mdist(0, 2 * boost::math::constants::pi<double>());
+
+    // Data setup.
+    std::vector<double> e_vec, M_vec, out_vec, out_vec_batch;
+    e_vec.resize(N, ecc);
+    M_vec.resize(N);
+    out_vec.resize(N);
+    out_vec_batch.resize(N);
+    std::generate(M_vec.begin(), M_vec.end(), [&rng, &Mdist]() { return Mdist(rng); });
+
+    // cfunc setup.
+    auto [e, M] = make_vars("e", "M");
+
+    llvm_state s{kw::fast_math = fast_math};
+    const auto batch_size = recommended_simd_size<double>();
+    add_cfunc<double>(s, "f_scalar", {kepE(e, M)}, kw::vars = {e, M});
+    add_cfunc<double>(s, "f_batch", {kepE(e, M)}, kw::vars = {e, M}, kw::batch_size = batch_size);
+    s.compile();
+
+    auto *f_sc = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(
+        s.jit_lookup("f_scalar"));
+    auto *f_ba
+        = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(s.jit_lookup("f_batch"));
+
+    // Fetch the logger.
+    create_logger();
+    set_logger_level_trace();
+    auto logger = spdlog::get("heyoka");
+
+    // Scalar runtime.
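+    // NOTE: compiled functions produced by add_cfunc() take four pointer arguments
+    // (outputs, inputs, parameter values, time). This benchmark uses no runtime
+    // parameters and no time dependency, hence the two nullptr arguments below.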
+    spdlog::stopwatch sw;
+
+    for (auto i = 0ull; i < N; ++i) {
+        double ins[] = {e_vec[i], M_vec[i]};
+        f_sc(out_vec.data() + i, ins, nullptr, nullptr);
+    }
+
+    logger->trace("Scalar run took: {}s", sw);
+
+    std::vector<double> batch_buffer(batch_size * 2ul);
+    auto *batch_b_ptr = batch_buffer.data();
+
+    sw.reset();
+
+    for (auto i = 0ull; i < N - N % batch_size; i += batch_size) {
+        std::copy(e_vec.data() + i, e_vec.data() + i + batch_size, batch_b_ptr);
+        std::copy(M_vec.data() + i, M_vec.data() + i + batch_size, batch_b_ptr + batch_size);
+        f_ba(out_vec_batch.data() + i, batch_b_ptr, nullptr, nullptr);
+    }
+
+    logger->trace("Batch run took: {}s", sw);
+
+    std::cout.precision(16);
+    for (auto i = 0u; i < 20u; ++i) {
+        std::cout << out_vec[i] << " vs " << out_vec_batch[i] << '\n';
+    }
+}
diff --git a/doc/advanced_tutorials.rst b/doc/advanced_tutorials.rst
index 02833671f..a4e899986 100644
--- a/doc/advanced_tutorials.rst
+++ b/doc/advanced_tutorials.rst
@@ -27,6 +27,7 @@ the tutorials should not be hard to follow.
    tut_batch_mode
    tut_extended_precision
    tut_arbitrary_precision
+   tut_single_precision
    tut_s11n
    tut_ensemble
    tut_parallel_mode
diff --git a/doc/changelog.rst b/doc/changelog.rst
index ec2b2ffad..4f053952d 100644
--- a/doc/changelog.rst
+++ b/doc/changelog.rst
@@ -7,8 +7,7 @@ Changelog
 New
 ~~~
 
-- Add the step callback (batch) set classes to compose
-  step callbacks
+- Add step callback set classes to compose step callbacks
   (`#366 <https://github.com/bluescarni/heyoka/pull/366>`__).
 - Add support for single-precision computations
   (`#363 <https://github.com/bluescarni/heyoka/pull/363>`__).
@@ -18,6 +17,10 @@ New
 Changes
 ~~~~~~~
 
+- When the ``fast_math`` mode is active, the SIMD-vectorised
+  mathematical functions now use low-precision implementations.
+  This can lead to substantial performance increases in batch mode
+  (`#367 <https://github.com/bluescarni/heyoka/pull/367>`__).
 - Initialising a step callback or a callable from an empty
   function object (e.g., a null pointer, an empty
   ``std::function``, etc.) now results in an empty object
diff --git a/doc/tut_single_precision.rst b/doc/tut_single_precision.rst
new file mode 100644
index 000000000..99677aa89
--- /dev/null
+++ b/doc/tut_single_precision.rst
@@ -0,0 +1,107 @@
+.. _tut_single_precision:
+
+Computations in single precision
+================================
+
+.. versionadded:: 3.2.0
+
+In previous tutorials we saw how heyoka, in addition to the standard
+`double precision <https://en.wikipedia.org/wiki/Double-precision_floating-point_format>`__,
+also supports computations in :ref:`extended precision <tut_extended_precision>` and
+:ref:`arbitrary precision <tut_arbitrary_precision>`. Starting with version 3.2.0, heyoka
+also supports computations in `single precision <https://en.wikipedia.org/wiki/Single-precision_floating-point_format>`__.
+
+Single-precision computations can lead to substantial performance benefits when high accuracy is not required.
+In particular, single-precision :ref:`batch mode <tut_batch_mode>` can use a SIMD width twice as large as
+in double precision, doubling the computational throughput.
+In scalar computations, the use of single precision halves the memory usage with respect to double precision,
+which can help alleviate performance issues in large ODE systems. This can be particularly noticeable in applications such as
+:external:ref:`neural ODEs `.
+
+In C++, single-precision values are usually represented via the standard floating-point type ``float``.
+Correspondingly, and similarly to what is explained in the :ref:`extended precision <tut_extended_precision>`
+tutorial, single-precision computations are activated by passing the ``float`` template parameter to functions
+and classes in the heyoka API.
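A minimal sketch of what this looks like in practice (the dynamics and numerical values below are
illustrative assumptions, not taken from this patch):

.. code-block:: c++

   // Symbolic variables for the simple pendulum.
   auto [x, v] = make_vars("x", "v");

   // Single-precision adaptive integrator: note the float template
   // parameter and the "f" suffix on all numerical constants.
   auto ta = taylor_adaptive<float>{// Equations of motion.
                                    {prime(x) = v, prime(v) = -9.8f * sin(x)},
                                    // Initial conditions.
                                    {0.05f, 0.025f}};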
+
+A simple example
+----------------
+
+In order to verify that heyoka is indeed able to work in single precision, we will be monitoring the evolution
+of the energy constant in a low-precision numerical integration of the simple pendulum.
+
+Let us begin as usual with the definition of the dynamical equations and the creation of the integrator object:
+
+.. literalinclude:: ../tutorial/single_precision.cpp
+   :language: c++
+   :lines: 18-29
+
+In order to activate single precision, we created an integrator object of type ``taylor_adaptive<float>`` - that is,
+we specified ``float``, instead of the usual ``double``, as the (only) template parameter for the ``taylor_adaptive`` class template.
+Note that we specified a single-precision initial state via the use of the ``f`` suffix for the numerical constants.
+Note also that, when operating in single precision,
+*all* numerical values encapsulated in an integrator are represented in single precision - this includes not only the state vector,
+but also the time coordinate, the tolerance, the Taylor coefficients, etc. Similarly to double-precision integrators, the default value
+of the tolerance is the machine epsilon of ``float``.
+
+Next, we define a small helper function that will allow us to monitor the evolution of the energy constant
+throughout the integration:
+
+.. literalinclude:: ../tutorial/single_precision.cpp
+   :language: c++
+   :lines: 31-37
+
+Before starting the integration, we compute and store the initial energy for later use:
+
+.. literalinclude:: ../tutorial/single_precision.cpp
+   :language: c++
+   :lines: 39-40
+
+We can now begin a step-by-step integration. At the end of each step, we will compute
+and print to screen the relative energy error:
+
+.. literalinclude:: ../tutorial/single_precision.cpp
+   :language: c++
+   :lines: 42-49
+
+.. code-block:: console
+
+   Relative energy error: 1.48183e-07
+   Relative energy error: 5.29227e-08
+   Relative energy error: 6.08611e-08
+   Relative energy error: 1.79937e-07
+   Relative energy error: 1.74645e-07
+   Relative energy error: 2.24921e-07
+   Relative energy error: 2.4609e-07
+   Relative energy error: 1.1643e-07
+   Relative energy error: 1.79937e-07
+   Relative energy error: 1.40245e-07
+   Relative energy error: 2.54029e-07
+   Relative energy error: 1.84899e-07
+   Relative energy error: 1.83245e-07
+   Relative energy error: 1.56122e-07
+   Relative energy error: 2.22275e-07
+   Relative energy error: 1.61414e-07
+   Relative energy error: 2.11691e-07
+   Relative energy error: 2.88428e-07
+   Relative energy error: 2.93721e-07
+   Relative energy error: 1.82583e-07
+
+The console output indeed confirms that energy is conserved at the level of the epsilon of the
+single-precision format (that is, :math:`\sim 10^{-7}`).
+
+Other classes and functions
+---------------------------
+
+Besides the adaptive integrator, several other classes and functions in heyoka can be used in single precision.
+
+The :ref:`event classes `, for instance, can be constructed in single precision by passing ``float``
+as the template parameter (instead of ``double``). Note that the precision of an event
+must match the precision of the integrator object in which the event is used, otherwise an error will be produced
+at compilation time.
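As a sketch of how this could look (the event equation and callback below are illustrative assumptions,
not code from this patch), a single-precision non-terminal event might be created as follows and then
passed to the integrator's constructor via the ``kw::nt_events`` keyword argument:

.. code-block:: c++

   // Detect the zero crossings of the x coordinate, in single precision.
   // Note the float template parameter and the single-precision time argument.
   auto ev = nt_event<float>(x, [](taylor_adaptive<float> &, float time, int) {
       std::cout << "Zero crossing detected at t = " << time << '\n';
   });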
+
+Full code listing
+-----------------
+
+.. literalinclude:: ../tutorial/single_precision.cpp
+   :language: c++
+   :lines: 9-
diff --git a/include/heyoka/detail/vector_math.hpp b/include/heyoka/detail/vector_math.hpp
index c0d4fe2e7..8e55eeceb 100644
--- a/include/heyoka/detail/vector_math.hpp
+++ b/include/heyoka/detail/vector_math.hpp
@@ -27,6 +27,12 @@ struct vf_info {
     // The vfabi attribute corresponding
     // to the vector function.
     std::string vf_abi_attr;
+    // The corresponding low-precision versions
+    // of the above. These will be empty if
+    // the low-precision counterpart is
+    // not available.
+    std::string lp_name;
+    std::string lp_vf_abi_attr;
     // Number of SIMD lanes.
     std::uint32_t width = 0;
     // Number of arguments.
diff --git a/src/detail/llvm_helpers.cpp b/src/detail/llvm_helpers.cpp
index b9c68a0c3..f02166d6f 100644
--- a/src/detail/llvm_helpers.cpp
+++ b/src/detail/llvm_helpers.cpp
@@ -301,6 +301,9 @@ llvm::CallInst *llvm_add_vfabi_attrs(llvm_state &s, llvm::CallInst *call, const
     auto &context = s.context();
     auto &builder = s.builder();
 
+    // Are we in fast math mode?
+    const auto use_fast_math = builder.getFastMathFlags().isFast();
+
     if (!vfi.empty()) {
         // There exist vector variants of the scalar function.
         auto &md = s.module();
@@ -313,7 +316,11 @@ llvm::CallInst *llvm_add_vfabi_attrs(llvm_state &s, llvm::CallInst *call, const
         std::vector<std::string> vf_abi_strs;
         vf_abi_strs.reserve(vfi.size());
         for (const auto &el : vfi) {
-            vf_abi_strs.push_back(el.vf_abi_attr);
+            // Fetch the vf_abi attr string (either the low-precision
+            // or standard version).
+            const auto &vf_abi_attr
+                = (use_fast_math && !el.lp_vf_abi_attr.empty()) ? el.lp_vf_abi_attr : el.vf_abi_attr;
+            vf_abi_strs.push_back(vf_abi_attr);
         }
 #if LLVM_VERSION_MAJOR >= 14
         call->addFnAttr(llvm::Attribute::get(context, "vector-function-abi-variant",
@@ -341,6 +348,10 @@ llvm::CallInst *llvm_add_vfabi_attrs(llvm_state &s, llvm::CallInst *call, const
             assert(el.width > 0u);
             assert(el.nargs == num_args);
 
+            // Fetch the vector function name from el (either the low-precision
+            // or standard version).
+            const auto &el_name = (use_fast_math && !el.lp_name.empty()) ? el.lp_name : el.name;
+
             // The vector type for the current variant.
             auto *cur_vec_t = make_vector_type(scal_t, el.width);
 
@@ -352,11 +363,11 @@ llvm::CallInst *llvm_add_vfabi_attrs(llvm_state &s, llvm::CallInst *call, const
                                                 false);
 
             // Try to lookup the variant in the module.
-            auto *vf_ptr = md.getFunction(el.name);
+            auto *vf_ptr = md.getFunction(el_name);
 
             if (vf_ptr == nullptr) {
                 // The declaration of the variant is not there yet, create it.
-                vf_ptr = llvm_func_create(vec_ft, llvm::Function::ExternalLinkage, el.name, &md);
+                vf_ptr = llvm_func_create(vec_ft, llvm::Function::ExternalLinkage, el_name, &md);
 
                 // NOTE: setting the attributes on the vector variant is not strictly required
                 // for the auto-vectorizer to work. However, in other parts of the code, the vector
@@ -380,7 +391,7 @@ llvm::CallInst *llvm_add_vfabi_attrs(llvm_state &s, llvm::CallInst *call, const
             //
             // https://llvm.org/docs/LangRef.html#the-llvm-used-global-variable
             // https://godbolt.org/z/1neaG4bYj
-            const auto dummy_name = fmt::format("heyoka.dummy_vector_call.{}", el.name);
+            const auto dummy_name = fmt::format("heyoka.dummy_vector_call.{}", el_name);
 
             if (auto *dummy_ptr = md.getFunction(dummy_name); dummy_ptr == nullptr) {
                 // The dummy function has not been defined yet, do it.
@@ -546,6 +557,9 @@ llvm::Value *llvm_math_intr(llvm_state &s, const std::string &intr_name,
 
     auto &builder = s.builder();
 
+    // Are we in fast math mode?
+    const auto use_fast_math = builder.getFastMathFlags().isFast();
+
     if (llvm_stype_can_use_math_intrinsics(s, scal_t)) {
         // We can use the LLVM intrinsics for the given scalar type.
@@ -567,10 +581,15 @@
         if (vfi_it != vfi.end() && vfi_it->width == vector_width) {
             // A vector implementation with precisely the correct width is available, use it.
             assert(vfi_it->nargs == nargs);
+
+            // Fetch the vector function name (either the low-precision
+            // or standard version).
+            const auto &vf_name = (use_fast_math && !vfi_it->lp_name.empty()) ? vfi_it->lp_name : vfi_it->name;
+
             // NOTE: make sure to use the same attributes as the scalar intrinsic for the vector
             // call. This ensures that the vector variant is declared with the same attributes as those that would
             // be declared by invoking llvm_add_vfabi_attrs() on the scalar invocation.
-            return llvm_invoke_external(s, vfi_it->name, vec_t, {args...}, s_intr->getAttributes());
+            return llvm_invoke_external(s, vf_name, vec_t, {args...}, s_intr->getAttributes());
         }
 
         if (!vfi.empty()) {
@@ -682,6 +701,11 @@ llvm::Value *llvm_math_cmath(llvm_state &s, const std::string &base_name, Args *
     const std::array arg_types = {args->getType()...};
     assert(((args->getType() == arg_types[0]) && ...));
 
+    auto &builder = s.builder();
+
+    // Are we in fast math mode?
+    const auto use_fast_math = builder.getFastMathFlags().isFast();
+
     // Determine the type and scalar type of the arguments.
     auto *x_t = arg_types[0];
     auto *scal_t = x_t->getScalarType();
@@ -711,7 +735,12 @@
         if (vfi_it != vfi.end() && vfi_it->width == vector_width) {
             // A vector implementation with precisely the correct width is available, use it.
             assert(vfi_it->nargs == nargs);
-            return llvm_invoke_external(s, vfi_it->name, vec_t, {args...}, attrs);
+
+            // Fetch the vector function name (either the low-precision
+            // or standard version).
+            const auto &vf_name = (use_fast_math && !vfi_it->lp_name.empty()) ? vfi_it->lp_name : vfi_it->name;
+
+            return llvm_invoke_external(s, vf_name, vec_t, {args...}, attrs);
         }
 
         // A vector implementation with the correct width is **not** available: scalarise the
@@ -732,7 +761,7 @@
     // NOTE: this handles only the scalar case.
     if (llvm_is_real(x_t) != 0) {
         auto *f = real_nary_op(s, x_t, "mpfr_" + base_name, boost::numeric_cast<unsigned>(nargs));
-        return s.builder().CreateCall(f, {args...});
+        return builder.CreateCall(f, {args...});
     }
 
 #endif
diff --git a/src/detail/vector_math.cpp b/src/detail/vector_math.cpp
index 1f47450a1..4a0674b8d 100644
--- a/src/detail/vector_math.cpp
+++ b/src/detail/vector_math.cpp
@@ -36,12 +36,14 @@ using vf_map_t = std::unordered_map<std::string, std::vector<vf_info>>;
 // but at the moment we have only SLEEF.
 #if defined(HEYOKA_WITH_SLEEF)
 
-auto make_vfinfo(const char *s_name, std::string v_name, std::uint32_t width, std::uint32_t nargs)
+auto make_vfinfo(const char *s_name, std::string v_name, std::string lp_v_name, std::uint32_t width,
+                 std::uint32_t nargs)
 {
     assert(nargs == 1u || nargs == 2u);
 
-    auto ret = vf_info{std::move(v_name), {}, width, nargs};
+    auto ret = vf_info{std::move(v_name), {}, std::move(lp_v_name), {}, width, nargs};
 
     ret.vf_abi_attr = fmt::format("_ZGV_LLVM_N{}{}_{}({})", width, nargs == 1u ? "v" : "vv", s_name, ret.name);
+    ret.lp_vf_abi_attr = fmt::format("_ZGV_LLVM_N{}{}_{}({})", width, nargs == 1u ?
"v" : "vv", s_name, ret.lp_name); return ret; } @@ -49,6 +51,23 @@ auto make_vfinfo(const char *s_name, std::string v_name, std::uint32_t width, st #if defined(HEYOKA_WITH_SLEEF) +// NOTE: helper to fetch the suffix of the low-precision version of the mathematical +// function "sleef_base_name" in SLEEF. +// NOTE: by default, the low-precision versions are denoted by the "u35" suffix +// (indicating 3.5 ULPs of precision). For some functions, the "u35" versions are not available +// and we return the standard-precision suffix instead ("u10"). +auto sleef_get_lp_suffix(const std::string &sleef_base_name) -> std::string +{ + static const std::unordered_map lp_suffix_map + = {{"acosh", "u10"}, {"asinh", "u10"}, {"atanh", "u10"}, {"erf", "u10"}, {"exp", "u10"}, {"pow", "u10"}}; + + if (auto it = lp_suffix_map.find(sleef_base_name); it == lp_suffix_map.end()) { + return "u35"; + } else { + return it->second; + } +} + // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) auto add_vfinfo_sleef(vf_map_t &retval, const char *scalar_name, const char *sleef_base_name, std::string_view sleef_tp, std::uint32_t nargs = 1) @@ -59,6 +78,8 @@ auto add_vfinfo_sleef(vf_map_t &retval, const char *scalar_name, const char *sle auto make_sleef_vfinfo = [&](std::uint32_t width, const char *iset) { return make_vfinfo(scalar_name, fmt::format("Sleef_{}{}{}_u10{}", sleef_base_name, sleef_tp, width, iset), + fmt::format("Sleef_{}{}{}_{}{}", sleef_base_name, sleef_tp, width, + sleef_get_lp_suffix(sleef_base_name), iset), width, nargs); }; diff --git a/test/acos.cpp b/test/acos.cpp index 241fd9167..0d721aeb7 100644 --- a/test/acos.cpp +++ b/test/acos.cpp @@ -229,124 +229,128 @@ TEST_CASE("normalise") // Tests to check vectorisation via the vector-function-abi-variant machinery. TEST_CASE("vfabi double") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b] = make_vars("a", "b"); + auto [a, b] = make_vars("a", "b"); - add_cfunc(s, "cfunc", {acos(a), acos(b)}); + add_cfunc(s, "cfunc", {acos(a), acos(b)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cfunc")); - const std::vector ins{.1, .2}; - std::vector outs(2u, 0.); + const std::vector ins{.1, .2}; + std::vector outs(2u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::acos(.1))); - REQUIRE(outs[1] == approximately(std::acos(.2))); + REQUIRE(outs[0] == approximately(std::acos(.1))); + REQUIRE(outs[1] == approximately(std::acos(.2))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. 
#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@acos", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@acos", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 2 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 2 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } - if (tf.aarch64) { - REQUIRE(count == 3u); - } + if (tf.aarch64) { + REQUIRE(count == 3u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 3u); - // } + // if (tf.vsx) { + // REQUIRE(count == 3u); + // } #endif + } } TEST_CASE("vfabi float") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b, c, d] = make_vars("a", "b", "c", "d"); + auto [a, b, c, d] = make_vars("a", "b", "c", "d"); - add_cfunc(s, "cfunc", {acos(a), acos(b), acos(c), acos(d)}); + add_cfunc(s, "cfunc", {acos(a), acos(b), acos(c), acos(d)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr + = reinterpret_cast(s.jit_lookup("cfunc")); - const std::vector ins{.1f, .2f, .3f, .4f}; - std::vector outs(4u, 0.); + const std::vector ins{.1f, .2f, .3f, .4f}; + std::vector outs(4u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::acos(.1f))); - REQUIRE(outs[1] == approximately(std::acos(.2f))); - REQUIRE(outs[2] == approximately(std::acos(.3f))); - REQUIRE(outs[3] == approximately(std::acos(.4f))); + REQUIRE(outs[0] == approximately(std::acos(.1f))); + REQUIRE(outs[1] == approximately(std::acos(.2f))); + REQUIRE(outs[2] == approximately(std::acos(.3f))); + REQUIRE(outs[3] == approximately(std::acos(.4f))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. 
#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@acosf", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@acosf", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 4 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 5u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 4 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 5u); + } - if (tf.aarch64) { - REQUIRE(count == 5u); - } + if (tf.aarch64) { + REQUIRE(count == 5u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 5u); - // } + // if (tf.vsx) { + // REQUIRE(count == 5u); + // } #endif + } } diff --git a/test/acosh.cpp b/test/acosh.cpp index d70ec1bb8..d629b6158 100644 --- a/test/acosh.cpp +++ b/test/acosh.cpp @@ -229,125 +229,129 @@ TEST_CASE("normalise") // Tests to check vectorisation via the vector-function-abi-variant machinery. TEST_CASE("vfabi double") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b] = make_vars("a", "b"); + auto [a, b] = make_vars("a", "b"); - add_cfunc(s, "cfunc", {acosh(a), acosh(b)}); + add_cfunc(s, "cfunc", {acosh(a), acosh(b)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cfunc")); - const std::vector ins{1.1, 1.2}; - std::vector outs(2u, 0.); + const std::vector ins{1.1, 1.2}; + std::vector outs(2u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::acosh(1.1))); - REQUIRE(outs[1] == approximately(std::acosh(1.2))); + REQUIRE(outs[0] == approximately(std::acosh(1.1))); + REQUIRE(outs[1] == approximately(std::acosh(1.2))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. 
#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@acosh", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@acosh", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 2 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 2 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } - if (tf.aarch64) { - REQUIRE(count == 3u); - } + if (tf.aarch64) { + REQUIRE(count == 3u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 3u); - // } + // if (tf.vsx) { + // REQUIRE(count == 3u); + // } #endif + } } TEST_CASE("vfabi float") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b, c, d] = make_vars("a", "b", "c", "d"); + auto [a, b, c, d] = make_vars("a", "b", "c", "d"); - add_cfunc(s, "cfunc", {acosh(a), acosh(b), acosh(c), acosh(d)}); + add_cfunc(s, "cfunc", {acosh(a), acosh(b), acosh(c), acosh(d)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr + = reinterpret_cast(s.jit_lookup("cfunc")); - const std::vector ins{1.1f, 1.2f, 1.3f, 1.4f}; - std::vector outs(4u, 0.); + const std::vector ins{1.1f, 1.2f, 1.3f, 1.4f}; + std::vector outs(4u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::acosh(1.1f))); - REQUIRE(outs[1] == approximately(std::acosh(1.2f))); - REQUIRE(outs[2] == approximately(std::acosh(1.3f))); - REQUIRE(outs[3] == approximately(std::acosh(1.4f))); + REQUIRE(outs[0] == approximately(std::acosh(1.1f))); + REQUIRE(outs[1] == approximately(std::acosh(1.2f))); + REQUIRE(outs[2] == approximately(std::acosh(1.3f))); + REQUIRE(outs[3] == approximately(std::acosh(1.4f))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. 
#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@acoshf", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@acoshf", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 4 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 5u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 4 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 5u); + } - // NOTE: LLVM16 is currently the version tested in the CI on arm64. - if (tf.aarch64) { - REQUIRE(count == 5u); - } + // NOTE: LLVM16 is currently the version tested in the CI on arm64. + if (tf.aarch64) { + REQUIRE(count == 5u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 5u); - // } + // if (tf.vsx) { + // REQUIRE(count == 5u); + // } #endif + } } diff --git a/test/asin.cpp b/test/asin.cpp index 713df6695..6c05c82aa 100644 --- a/test/asin.cpp +++ b/test/asin.cpp @@ -229,124 +229,128 @@ TEST_CASE("normalise") // Tests to check vectorisation via the vector-function-abi-variant machinery. TEST_CASE("vfabi double") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b] = make_vars("a", "b"); + auto [a, b] = make_vars("a", "b"); - add_cfunc(s, "cfunc", {asin(a), asin(b)}); + add_cfunc(s, "cfunc", {asin(a), asin(b)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cfunc")); - const std::vector ins{.1, .2}; - std::vector outs(2u, 0.); + const std::vector ins{.1, .2}; + std::vector outs(2u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::asin(.1))); - REQUIRE(outs[1] == approximately(std::asin(.2))); + REQUIRE(outs[0] == approximately(std::asin(.1))); + REQUIRE(outs[1] == approximately(std::asin(.2))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. 
#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@asin", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@asin", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 2 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 2 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } - if (tf.aarch64) { - REQUIRE(count == 3u); - } + if (tf.aarch64) { + REQUIRE(count == 3u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 3u); - // } + // if (tf.vsx) { + // REQUIRE(count == 3u); + // } #endif + } } TEST_CASE("vfabi float") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b, c, d] = make_vars("a", "b", "c", "d"); + auto [a, b, c, d] = make_vars("a", "b", "c", "d"); - add_cfunc(s, "cfunc", {asin(a), asin(b), asin(c), asin(d)}); + add_cfunc(s, "cfunc", {asin(a), asin(b), asin(c), asin(d)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr + = reinterpret_cast(s.jit_lookup("cfunc")); - const std::vector ins{.1f, .2f, .3f, .4f}; - std::vector outs(4u, 0.); + const std::vector ins{.1f, .2f, .3f, .4f}; + std::vector outs(4u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::asin(.1f))); - REQUIRE(outs[1] == approximately(std::asin(.2f))); - REQUIRE(outs[2] == approximately(std::asin(.3f))); - REQUIRE(outs[3] == approximately(std::asin(.4f))); + REQUIRE(outs[0] == approximately(std::asin(.1f))); + REQUIRE(outs[1] == approximately(std::asin(.2f))); + REQUIRE(outs[2] == approximately(std::asin(.3f))); + REQUIRE(outs[3] == approximately(std::asin(.4f))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. 
#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@asinf", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@asinf", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 4 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 5u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 4 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 5u); + } - if (tf.aarch64) { - REQUIRE(count == 5u); - } + if (tf.aarch64) { + REQUIRE(count == 5u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 5u); - // } + // if (tf.vsx) { + // REQUIRE(count == 5u); + // } #endif + } } diff --git a/test/asinh.cpp b/test/asinh.cpp index f44bfe307..7cc6e91ee 100644 --- a/test/asinh.cpp +++ b/test/asinh.cpp @@ -229,125 +229,129 @@ TEST_CASE("normalise") // Tests to check vectorisation via the vector-function-abi-variant machinery. TEST_CASE("vfabi double") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b] = make_vars("a", "b"); + auto [a, b] = make_vars("a", "b"); - add_cfunc(s, "cfunc", {asinh(a), asinh(b)}); + add_cfunc(s, "cfunc", {asinh(a), asinh(b)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cfunc")); - const std::vector ins{1.1, 1.2}; - std::vector outs(2u, 0.); + const std::vector ins{1.1, 1.2}; + std::vector outs(2u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::asinh(1.1))); - REQUIRE(outs[1] == approximately(std::asinh(1.2))); + REQUIRE(outs[0] == approximately(std::asinh(1.1))); + REQUIRE(outs[1] == approximately(std::asinh(1.2))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. 
#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@asinh", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@asinh", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 2 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 2 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } - if (tf.aarch64) { - REQUIRE(count == 3u); - } + if (tf.aarch64) { + REQUIRE(count == 3u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 3u); - // } + // if (tf.vsx) { + // REQUIRE(count == 3u); + // } #endif + } } TEST_CASE("vfabi float") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b, c, d] = make_vars("a", "b", "c", "d"); + auto [a, b, c, d] = make_vars("a", "b", "c", "d"); - add_cfunc(s, "cfunc", {asinh(a), asinh(b), asinh(c), asinh(d)}); + add_cfunc(s, "cfunc", {asinh(a), asinh(b), asinh(c), asinh(d)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr + = reinterpret_cast(s.jit_lookup("cfunc")); - const std::vector ins{1.1f, 1.2f, 1.3f, 1.4f}; - std::vector outs(4u, 0.); + const std::vector ins{1.1f, 1.2f, 1.3f, 1.4f}; + std::vector outs(4u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::asinh(1.1f))); - REQUIRE(outs[1] == approximately(std::asinh(1.2f))); - REQUIRE(outs[2] == approximately(std::asinh(1.3f))); - REQUIRE(outs[3] == approximately(std::asinh(1.4f))); + REQUIRE(outs[0] == approximately(std::asinh(1.1f))); + REQUIRE(outs[1] == approximately(std::asinh(1.2f))); + REQUIRE(outs[2] == approximately(std::asinh(1.3f))); + REQUIRE(outs[3] == approximately(std::asinh(1.4f))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. 
#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@asinhf", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@asinhf", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 4 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 5u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 4 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 5u); + } - // NOTE: LLVM16 is currently the version tested in the CI on arm64. - if (tf.aarch64) { - REQUIRE(count == 5u); - } + // NOTE: LLVM16 is currently the version tested in the CI on arm64. + if (tf.aarch64) { + REQUIRE(count == 5u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 5u); - // } + // if (tf.vsx) { + // REQUIRE(count == 5u); + // } #endif + } } diff --git a/test/atan.cpp b/test/atan.cpp index 6d69c7b20..8dcdcd519 100644 --- a/test/atan.cpp +++ b/test/atan.cpp @@ -223,125 +223,129 @@ TEST_CASE("normalise") // Tests to check vectorisation via the vector-function-abi-variant machinery. TEST_CASE("vfabi double") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b] = make_vars("a", "b"); + auto [a, b] = make_vars("a", "b"); - add_cfunc(s, "cfunc", {atan(a), atan(b)}); + add_cfunc(s, "cfunc", {atan(a), atan(b)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cfunc")); - const std::vector ins{.1, .2}; - std::vector outs(2u, 0.); + const std::vector ins{.1, .2}; + std::vector outs(2u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::atan(.1))); - REQUIRE(outs[1] == approximately(std::atan(.2))); + REQUIRE(outs[0] == approximately(std::atan(.1))); + REQUIRE(outs[1] == approximately(std::atan(.2))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. 
#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@atan", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@atan", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 2 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 2 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } - if (tf.aarch64) { - REQUIRE(count == 3u); - } + if (tf.aarch64) { + REQUIRE(count == 3u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 3u); - // } + // if (tf.vsx) { + // REQUIRE(count == 3u); + // } #endif + } } TEST_CASE("vfabi float") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b, c, d] = make_vars("a", "b", "c", "d"); + auto [a, b, c, d] = make_vars("a", "b", "c", "d"); - add_cfunc(s, "cfunc", {atan(a), atan(b), atan(c), atan(d)}); + add_cfunc(s, "cfunc", {atan(a), atan(b), atan(c), atan(d)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr + = reinterpret_cast(s.jit_lookup("cfunc")); - const std::vector ins{.1f, .2f, .3f, .4f}; - std::vector outs(4u, 0.); + const std::vector ins{.1f, .2f, .3f, .4f}; + std::vector outs(4u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::atan(.1f))); - REQUIRE(outs[1] == approximately(std::atan(.2f))); - REQUIRE(outs[2] == approximately(std::atan(.3f))); - REQUIRE(outs[3] == approximately(std::atan(.4f))); + REQUIRE(outs[0] == approximately(std::atan(.1f))); + REQUIRE(outs[1] == approximately(std::atan(.2f))); + REQUIRE(outs[2] == approximately(std::atan(.3f))); + REQUIRE(outs[3] == approximately(std::atan(.4f))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. 
#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@atanf", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@atanf", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 4 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 5u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 4 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 5u); + } - // NOTE: LLVM16 is currently the version tested in the CI on arm64. - if (tf.aarch64) { - REQUIRE(count == 5u); - } + // NOTE: LLVM16 is currently the version tested in the CI on arm64. + if (tf.aarch64) { + REQUIRE(count == 5u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 5u); - // } + // if (tf.vsx) { + // REQUIRE(count == 5u); + // } #endif + } } diff --git a/test/atan2.cpp b/test/atan2.cpp index e64618142..86f9af26d 100644 --- a/test/atan2.cpp +++ b/test/atan2.cpp @@ -334,124 +334,128 @@ TEST_CASE("normalise") // Tests to check vectorisation via the vector-function-abi-variant machinery. TEST_CASE("vfabi double") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b] = make_vars("a", "b"); + auto [a, b] = make_vars("a", "b"); - add_cfunc(s, "cfunc", {atan2(a, .3), atan2(b, .4)}); + add_cfunc(s, "cfunc", {atan2(a, .3), atan2(b, .4)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cfunc")); - const std::vector ins{.1, .2}; - std::vector outs(2u, 0.); + const std::vector ins{.1, .2}; + std::vector outs(2u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::atan2(.1, .3))); - REQUIRE(outs[1] == approximately(std::atan2(.2, .4))); + REQUIRE(outs[0] == approximately(std::atan2(.1, .3))); + REQUIRE(outs[1] == approximately(std::atan2(.2, .4))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. 
#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@atan2", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@atan2", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 2 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 2 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } - if (tf.aarch64) { - REQUIRE(count == 3u); - } + if (tf.aarch64) { + REQUIRE(count == 3u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 3u); - // } + // if (tf.vsx) { + // REQUIRE(count == 3u); + // } #endif + } } TEST_CASE("vfabi float") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b, c, d] = make_vars("a", "b", "c", "d"); + auto [a, b, c, d] = make_vars("a", "b", "c", "d"); - add_cfunc(s, "cfunc", {atan2(a, .5f), atan2(b, .6f), atan2(c, .7f), atan2(d, .8f)}); + add_cfunc(s, "cfunc", {atan2(a, .5f), atan2(b, .6f), atan2(c, .7f), atan2(d, .8f)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr + = reinterpret_cast(s.jit_lookup("cfunc")); - const std::vector ins{.1f, .2f, .3f, .4f}; - std::vector outs(4u, 0.); + const std::vector ins{.1f, .2f, .3f, .4f}; + std::vector outs(4u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::atan2(.1f, .5f))); - REQUIRE(outs[1] == approximately(std::atan2(.2f, .6f))); - REQUIRE(outs[2] == approximately(std::atan2(.3f, .7f))); - REQUIRE(outs[3] == approximately(std::atan2(.4f, .8f))); + REQUIRE(outs[0] == approximately(std::atan2(.1f, .5f))); + REQUIRE(outs[1] == approximately(std::atan2(.2f, .6f))); + REQUIRE(outs[2] == approximately(std::atan2(.3f, .7f))); + REQUIRE(outs[3] == approximately(std::atan2(.4f, .8f))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. 
#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@atan2f", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@atan2f", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 4 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 5u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 4 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 5u); + } - if (tf.aarch64) { - REQUIRE(count == 5u); - } + if (tf.aarch64) { + REQUIRE(count == 5u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 5u); - // } + // if (tf.vsx) { + // REQUIRE(count == 5u); + // } #endif + } } diff --git a/test/atanh.cpp b/test/atanh.cpp index 98efb30b0..069e02e48 100644 --- a/test/atanh.cpp +++ b/test/atanh.cpp @@ -223,124 +223,128 @@ TEST_CASE("normalise") // Tests to check vectorisation via the vector-function-abi-variant machinery. TEST_CASE("vfabi double") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b] = make_vars("a", "b"); + auto [a, b] = make_vars("a", "b"); - add_cfunc(s, "cfunc", {atanh(a), atanh(b)}); + add_cfunc(s, "cfunc", {atanh(a), atanh(b)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cfunc")); - const std::vector ins{.1, .2}; - std::vector outs(2u, 0.); + const std::vector ins{.1, .2}; + std::vector outs(2u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::atanh(.1))); - REQUIRE(outs[1] == approximately(std::atanh(.2))); + REQUIRE(outs[0] == approximately(std::atanh(.1))); + REQUIRE(outs[1] == approximately(std::atanh(.2))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. 
#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@atanh", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@atanh", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 2 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 2 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } - if (tf.aarch64) { - REQUIRE(count == 3u); - } + if (tf.aarch64) { + REQUIRE(count == 3u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 3u); - // } + // if (tf.vsx) { + // REQUIRE(count == 3u); + // } #endif + } } TEST_CASE("vfabi float") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b, c, d] = make_vars("a", "b", "c", "d"); + auto [a, b, c, d] = make_vars("a", "b", "c", "d"); - add_cfunc(s, "cfunc", {atanh(a), atanh(b), atanh(c), atanh(d)}); + add_cfunc(s, "cfunc", {atanh(a), atanh(b), atanh(c), atanh(d)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr + = reinterpret_cast(s.jit_lookup("cfunc")); - const std::vector ins{.1f, .2f, .3f, .4f}; - std::vector outs(4u, 0.); + const std::vector ins{.1f, .2f, .3f, .4f}; + std::vector outs(4u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::atanh(.1f))); - REQUIRE(outs[1] == approximately(std::atanh(.2f))); - REQUIRE(outs[2] == approximately(std::atanh(.3f))); - REQUIRE(outs[3] == approximately(std::atanh(.4f))); + REQUIRE(outs[0] == approximately(std::atanh(.1f))); + REQUIRE(outs[1] == approximately(std::atanh(.2f))); + REQUIRE(outs[2] == approximately(std::atanh(.3f))); + REQUIRE(outs[3] == approximately(std::atanh(.4f))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. 
#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@atanhf", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@atanhf", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 4 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 5u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 4 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 5u); + } - if (tf.aarch64) { - REQUIRE(count == 5u); - } + if (tf.aarch64) { + REQUIRE(count == 5u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 5u); - // } + // if (tf.vsx) { + // REQUIRE(count == 5u); + // } #endif + } } diff --git a/test/cos.cpp b/test/cos.cpp index 8846d465a..a8f085421 100644 --- a/test/cos.cpp +++ b/test/cos.cpp @@ -246,130 +246,134 @@ TEST_CASE("normalise") // Tests to check vectorisation via the vector-function-abi-variant machinery. 
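(All the vfabi test cases in this patch follow the same pattern: a compiled function is built with SLP vectorisation enabled, evaluated to check the numerical results, and then the textual IR of the llvm_state is scanned for occurrences of the scalar symbol. When the vector-function-abi-variant attributes map the scalar calls onto SLEEF vector implementations, only the declaration plus the calls in the strided cfunc survive, which is why the expected counts are small fixed numbers such as 3 or 5. The helper below is a minimal, standalone sketch of that counting idiom; the function name is illustrative and not part of the patch.)

#include <string>

#include <boost/algorithm/string.hpp>

// Count case-insensitive occurrences of a symbol name (e.g. "@llvm.cos.f64")
// in the textual IR of a compiled llvm_state, using the same Boost.StringAlgo
// find_iterator idiom as the tests in this patch.
unsigned count_ir_occurrences(const std::string &ir, const std::string &name)
{
    using string_find_iterator = boost::find_iterator<std::string::const_iterator>;

    auto count = 0u;
    for (auto it = boost::make_find_iterator(ir, boost::first_finder(name, boost::is_iequal()));
         it != string_find_iterator(); ++it) {
        ++count;
    }

    return count;
}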
TEST_CASE("vfabi double") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b] = make_vars("a", "b"); + auto [a, b] = make_vars("a", "b"); - add_cfunc(s, "cfunc", {cos(a), cos(b)}); + add_cfunc(s, "cfunc", {cos(a), cos(b)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cfunc")); - const std::vector ins{1., 2.}; - std::vector outs(2u, 0.); + const std::vector ins{1., 2.}; + std::vector outs(2u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::cos(1.))); - REQUIRE(outs[1] == approximately(std::cos(2.))); + REQUIRE(outs[0] == approximately(std::cos(1.))); + REQUIRE(outs[1] == approximately(std::cos(2.))); #if defined(HEYOKA_WITH_SLEEF) - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.cos.f64", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.cos.f64", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 2 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 2 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } #if LLVM_VERSION_MAJOR >= 16 - // NOTE: LLVM16 is currently the version tested in the CI on arm64. - if (tf.aarch64) { - REQUIRE(count == 3u); - } + // NOTE: LLVM16 is currently the version tested in the CI on arm64. + if (tf.aarch64) { + REQUIRE(count == 3u); + } #endif - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. 
- // if (tf.vsx) { - // REQUIRE(count == 3u); - // } + // if (tf.vsx) { + // REQUIRE(count == 3u); + // } #endif + } } TEST_CASE("vfabi float") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b, c, d] = make_vars("a", "b", "c", "d"); + auto [a, b, c, d] = make_vars("a", "b", "c", "d"); - add_cfunc(s, "cfunc", {cos(a), cos(b), cos(c), cos(d)}); + add_cfunc(s, "cfunc", {cos(a), cos(b), cos(c), cos(d)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr + = reinterpret_cast(s.jit_lookup("cfunc")); - const std::vector ins{1., 2., 3., 4.}; - std::vector outs(4u, 0.); + const std::vector ins{1., 2., 3., 4.}; + std::vector outs(4u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::cos(1.f))); - REQUIRE(outs[1] == approximately(std::cos(2.f))); - REQUIRE(outs[2] == approximately(std::cos(3.f))); - REQUIRE(outs[3] == approximately(std::cos(4.f))); + REQUIRE(outs[0] == approximately(std::cos(1.f))); + REQUIRE(outs[1] == approximately(std::cos(2.f))); + REQUIRE(outs[2] == approximately(std::cos(3.f))); + REQUIRE(outs[3] == approximately(std::cos(4.f))); #if defined(HEYOKA_WITH_SLEEF) - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.cos.f32", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.cos.f32", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 4 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 5u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 4 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 5u); + } #if LLVM_VERSION_MAJOR >= 16 - // NOTE: LLVM16 is currently the version tested in the CI on arm64. - if (tf.aarch64) { - REQUIRE(count == 5u); - } + // NOTE: LLVM16 is currently the version tested in the CI on arm64. + if (tf.aarch64) { + REQUIRE(count == 5u); + } #endif - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. 
- // if (tf.vsx) { - // REQUIRE(count == 5u); - // } + // if (tf.vsx) { + // REQUIRE(count == 5u); + // } #endif + } } diff --git a/test/cosh.cpp b/test/cosh.cpp index c6ca5c8b9..91a246bd0 100644 --- a/test/cosh.cpp +++ b/test/cosh.cpp @@ -223,124 +223,128 @@ TEST_CASE("normalise") // Tests to check vectorisation via the vector-function-abi-variant machinery. TEST_CASE("vfabi double") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b] = make_vars("a", "b"); + auto [a, b] = make_vars("a", "b"); - add_cfunc(s, "cfunc", {cosh(a), cosh(b)}); + add_cfunc(s, "cfunc", {cosh(a), cosh(b)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cfunc")); - const std::vector ins{1., 2.}; - std::vector outs(2u, 0.); + const std::vector ins{1., 2.}; + std::vector outs(2u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::cosh(1.))); - REQUIRE(outs[1] == approximately(std::cosh(2.))); + REQUIRE(outs[0] == approximately(std::cosh(1.))); + REQUIRE(outs[1] == approximately(std::cosh(2.))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. #if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@cosh", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@cosh", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 2 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 2 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } - if (tf.aarch64) { - REQUIRE(count == 3u); - } + if (tf.aarch64) { + REQUIRE(count == 3u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. 
- // if (tf.vsx) { - // REQUIRE(count == 3u); - // } + // if (tf.vsx) { + // REQUIRE(count == 3u); + // } #endif + } } TEST_CASE("vfabi float") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b, c, d] = make_vars("a", "b", "c", "d"); + auto [a, b, c, d] = make_vars("a", "b", "c", "d"); - add_cfunc(s, "cfunc", {cosh(a), cosh(b), cosh(c), cosh(d)}); + add_cfunc(s, "cfunc", {cosh(a), cosh(b), cosh(c), cosh(d)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr + = reinterpret_cast(s.jit_lookup("cfunc")); - const std::vector ins{1., 2., 3., 4.}; - std::vector outs(4u, 0.); + const std::vector ins{1., 2., 3., 4.}; + std::vector outs(4u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::cosh(1.f))); - REQUIRE(outs[1] == approximately(std::cosh(2.f))); - REQUIRE(outs[2] == approximately(std::cosh(3.f))); - REQUIRE(outs[3] == approximately(std::cosh(4.f))); + REQUIRE(outs[0] == approximately(std::cosh(1.f))); + REQUIRE(outs[1] == approximately(std::cosh(2.f))); + REQUIRE(outs[2] == approximately(std::cosh(3.f))); + REQUIRE(outs[3] == approximately(std::cosh(4.f))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. #if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@coshf", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@coshf", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 4 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 5u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 4 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 5u); + } - if (tf.aarch64) { - REQUIRE(count == 5u); - } + if (tf.aarch64) { + REQUIRE(count == 5u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. 
- // if (tf.vsx) { - // REQUIRE(count == 5u); - // } + // if (tf.vsx) { + // REQUIRE(count == 5u); + // } #endif + } } diff --git a/test/elp2000.cpp b/test/elp2000.cpp index a3c163a88..5c6e2ed18 100644 --- a/test/elp2000.cpp +++ b/test/elp2000.cpp @@ -7,7 +7,6 @@ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. #include -#include #include #include @@ -17,8 +16,10 @@ #include #include "catch.hpp" +#include "test_utils.hpp" using namespace heyoka; +using namespace heyoka_test; using namespace heyoka::model; TEST_CASE("basic") @@ -57,9 +58,9 @@ TEST_CASE("basic") const double tm = (date - 2451545.0) / (36525); cf_ptr(out, nullptr, nullptr, &tm); - REQUIRE(std::abs(out[0] - ref[i][0]) < 1e-10); - REQUIRE(std::abs(out[1] - ref[i][1]) < 1e-10); - REQUIRE(std::abs(out[2] - ref[i][2]) < 1e-10); + REQUIRE(out[0] == approximately(ref[i][0], 1000.)); + REQUIRE(out[1] == approximately(ref[i][1], 1000.)); + REQUIRE(out[2] == approximately(ref[i][2], 1000.)); } } @@ -92,9 +93,9 @@ TEST_CASE("fk5") const double tm = (date - 2451545.0) / (36525); cf_ptr(out, nullptr, nullptr, &tm); - REQUIRE(std::abs(out[0] - ref[i][0]) < 1e-10); - REQUIRE(std::abs(out[1] - ref[i][1]) < 1e-10); - REQUIRE(std::abs(out[2] - ref[i][2]) < 1e-10); + REQUIRE(out[0] == approximately(ref[i][0], 1000.)); + REQUIRE(out[1] == approximately(ref[i][1], 1000.)); + REQUIRE(out[2] == approximately(ref[i][2], 1000.)); } } diff --git a/test/erf.cpp b/test/erf.cpp index 239aac9ed..f201836cd 100644 --- a/test/erf.cpp +++ b/test/erf.cpp @@ -230,124 +230,128 @@ TEST_CASE("normalise") // Tests to check vectorisation via the vector-function-abi-variant machinery. TEST_CASE("vfabi double") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b] = make_vars("a", "b"); + auto [a, b] = make_vars("a", "b"); - add_cfunc(s, "cfunc", {erf(a), erf(b)}); + add_cfunc(s, "cfunc", {erf(a), erf(b)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cfunc")); - const std::vector ins{.1, .2}; - std::vector outs(2u, 0.); + const std::vector ins{.1, .2}; + std::vector outs(2u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::erf(.1))); - REQUIRE(outs[1] == approximately(std::erf(.2))); + REQUIRE(outs[0] == approximately(std::erf(.1))); + REQUIRE(outs[1] == approximately(std::erf(.2))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. #if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@erf", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@erf", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. 
- if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 2 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 2 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } - if (tf.aarch64) { - REQUIRE(count == 3u); - } + if (tf.aarch64) { + REQUIRE(count == 3u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 3u); - // } + // if (tf.vsx) { + // REQUIRE(count == 3u); + // } #endif + } } TEST_CASE("vfabi float") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b, c, d] = make_vars("a", "b", "c", "d"); + auto [a, b, c, d] = make_vars("a", "b", "c", "d"); - add_cfunc(s, "cfunc", {erf(a), erf(b), erf(c), erf(d)}); + add_cfunc(s, "cfunc", {erf(a), erf(b), erf(c), erf(d)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr + = reinterpret_cast(s.jit_lookup("cfunc")); - const std::vector ins{.1f, .2f, .3f, .4f}; - std::vector outs(4u, 0.); + const std::vector ins{.1f, .2f, .3f, .4f}; + std::vector outs(4u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::erf(.1f))); - REQUIRE(outs[1] == approximately(std::erf(.2f))); - REQUIRE(outs[2] == approximately(std::erf(.3f))); - REQUIRE(outs[3] == approximately(std::erf(.4f))); + REQUIRE(outs[0] == approximately(std::erf(.1f))); + REQUIRE(outs[1] == approximately(std::erf(.2f))); + REQUIRE(outs[2] == approximately(std::erf(.3f))); + REQUIRE(outs[3] == approximately(std::erf(.4f))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. #if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@erff", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@erff", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 4 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 5u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. 
+ if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 4 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 5u); + } - if (tf.aarch64) { - REQUIRE(count == 5u); - } + if (tf.aarch64) { + REQUIRE(count == 5u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 5u); - // } + // if (tf.vsx) { + // REQUIRE(count == 5u); + // } #endif + } } diff --git a/test/exp.cpp b/test/exp.cpp index 768234042..ef7b8da3e 100644 --- a/test/exp.cpp +++ b/test/exp.cpp @@ -237,130 +237,134 @@ TEST_CASE("normalise") // Tests to check vectorisation via the vector-function-abi-variant machinery. TEST_CASE("vfabi double") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b] = make_vars("a", "b"); + auto [a, b] = make_vars("a", "b"); - add_cfunc(s, "cfunc", {exp(a), exp(b)}); + add_cfunc(s, "cfunc", {exp(a), exp(b)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cfunc")); - const std::vector ins{1., 2.}; - std::vector outs(2u, 0.); + const std::vector ins{1., 2.}; + std::vector outs(2u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::exp(1.))); - REQUIRE(outs[1] == approximately(std::exp(2.))); + REQUIRE(outs[0] == approximately(std::exp(1.))); + REQUIRE(outs[1] == approximately(std::exp(2.))); #if defined(HEYOKA_WITH_SLEEF) - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.exp.f64", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.exp.f64", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 2 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 2 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } #if LLVM_VERSION_MAJOR >= 16 - // NOTE: LLVM16 is currently the version tested in the CI on arm64. - if (tf.aarch64) { - REQUIRE(count == 3u); - } + // NOTE: LLVM16 is currently the version tested in the CI on arm64. 
+ if (tf.aarch64) { + REQUIRE(count == 3u); + } #endif - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 3u); - // } + // if (tf.vsx) { + // REQUIRE(count == 3u); + // } #endif + } } TEST_CASE("vfabi float") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b, c, d] = make_vars("a", "b", "c", "d"); + auto [a, b, c, d] = make_vars("a", "b", "c", "d"); - add_cfunc(s, "cfunc", {exp(a), exp(b), exp(c), exp(d)}); + add_cfunc(s, "cfunc", {exp(a), exp(b), exp(c), exp(d)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr + = reinterpret_cast(s.jit_lookup("cfunc")); - const std::vector ins{1., 2., 3., 4.}; - std::vector outs(4u, 0.); + const std::vector ins{1., 2., 3., 4.}; + std::vector outs(4u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::exp(1.f))); - REQUIRE(outs[1] == approximately(std::exp(2.f))); - REQUIRE(outs[2] == approximately(std::exp(3.f))); - REQUIRE(outs[3] == approximately(std::exp(4.f))); + REQUIRE(outs[0] == approximately(std::exp(1.f))); + REQUIRE(outs[1] == approximately(std::exp(2.f))); + REQUIRE(outs[2] == approximately(std::exp(3.f))); + REQUIRE(outs[3] == approximately(std::exp(4.f))); #if defined(HEYOKA_WITH_SLEEF) - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.exp.f32", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.exp.f32", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 4 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 5u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 4 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 5u); + } #if LLVM_VERSION_MAJOR >= 16 - // NOTE: LLVM16 is currently the version tested in the CI on arm64. - if (tf.aarch64) { - REQUIRE(count == 5u); - } + // NOTE: LLVM16 is currently the version tested in the CI on arm64. + if (tf.aarch64) { + REQUIRE(count == 5u); + } #endif - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. 
+ // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 5u); - // } + // if (tf.vsx) { + // REQUIRE(count == 5u); + // } #endif + } } diff --git a/test/log.cpp b/test/log.cpp index 4d5a88626..276a9aaa2 100644 --- a/test/log.cpp +++ b/test/log.cpp @@ -213,130 +213,134 @@ TEST_CASE("normalise") // Tests to check vectorisation via the vector-function-abi-variant machinery. TEST_CASE("vfabi double") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b] = make_vars("a", "b"); + auto [a, b] = make_vars("a", "b"); - add_cfunc(s, "cfunc", {log(a), log(b)}); + add_cfunc(s, "cfunc", {log(a), log(b)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cfunc")); - const std::vector ins{1., 2.}; - std::vector outs(2u, 0.); + const std::vector ins{1., 2.}; + std::vector outs(2u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::log(1.))); - REQUIRE(outs[1] == approximately(std::log(2.))); + REQUIRE(outs[0] == approximately(std::log(1.))); + REQUIRE(outs[1] == approximately(std::log(2.))); #if defined(HEYOKA_WITH_SLEEF) - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.log.f64", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.log.f64", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 2 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 2 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } #if LLVM_VERSION_MAJOR >= 16 - // NOTE: LLVM16 is currently the version tested in the CI on arm64. - if (tf.aarch64) { - REQUIRE(count == 3u); - } + // NOTE: LLVM16 is currently the version tested in the CI on arm64. + if (tf.aarch64) { + REQUIRE(count == 3u); + } #endif - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. 
- // if (tf.vsx) { - // REQUIRE(count == 3u); - // } + // if (tf.vsx) { + // REQUIRE(count == 3u); + // } #endif + } } TEST_CASE("vfabi float") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b, c, d] = make_vars("a", "b", "c", "d"); + auto [a, b, c, d] = make_vars("a", "b", "c", "d"); - add_cfunc(s, "cfunc", {log(a), log(b), log(c), log(d)}); + add_cfunc(s, "cfunc", {log(a), log(b), log(c), log(d)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr + = reinterpret_cast(s.jit_lookup("cfunc")); - const std::vector ins{1., 2., 3., 4.}; - std::vector outs(4u, 0.); + const std::vector ins{1., 2., 3., 4.}; + std::vector outs(4u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::log(1.f))); - REQUIRE(outs[1] == approximately(std::log(2.f))); - REQUIRE(outs[2] == approximately(std::log(3.f))); - REQUIRE(outs[3] == approximately(std::log(4.f))); + REQUIRE(outs[0] == approximately(std::log(1.f))); + REQUIRE(outs[1] == approximately(std::log(2.f))); + REQUIRE(outs[2] == approximately(std::log(3.f))); + REQUIRE(outs[3] == approximately(std::log(4.f))); #if defined(HEYOKA_WITH_SLEEF) - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.log.f32", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.log.f32", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 4 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 5u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 4 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 5u); + } #if LLVM_VERSION_MAJOR >= 16 - // NOTE: LLVM16 is currently the version tested in the CI on arm64. - if (tf.aarch64) { - REQUIRE(count == 5u); - } + // NOTE: LLVM16 is currently the version tested in the CI on arm64. + if (tf.aarch64) { + REQUIRE(count == 5u); + } #endif - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. 
- // if (tf.vsx) { - // REQUIRE(count == 5u); - // } + // if (tf.vsx) { + // REQUIRE(count == 5u); + // } #endif + } } diff --git a/test/pow.cpp b/test/pow.cpp index 27fbfbd87..a6e0ae183 100644 --- a/test/pow.cpp +++ b/test/pow.cpp @@ -448,129 +448,133 @@ TEST_CASE("pow overloads") // Tests to check vectorisation via the vector-function-abi-variant machinery. TEST_CASE("vfabi double") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b] = make_vars("a", "b"); + auto [a, b] = make_vars("a", "b"); - add_cfunc(s, "cfunc", {pow(a, .1), pow(b, .2)}); + add_cfunc(s, "cfunc", {pow(a, .1), pow(b, .2)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cfunc")); - const std::vector ins{1., 2.}; - std::vector outs(2u, 0.); + const std::vector ins{1., 2.}; + std::vector outs(2u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::pow(1., .1))); - REQUIRE(outs[1] == approximately(std::pow(2., .2))); + REQUIRE(outs[0] == approximately(std::pow(1., .1))); + REQUIRE(outs[1] == approximately(std::pow(2., .2))); #if defined(HEYOKA_WITH_SLEEF) - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.pow.f64", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.pow.f64", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 2 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 2 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } #if LLVM_VERSION_MAJOR >= 16 - // NOTE: LLVM16 is currently the version tested in the CI on arm64. - if (tf.aarch64) { - REQUIRE(count == 3u); - } + // NOTE: LLVM16 is currently the version tested in the CI on arm64. + if (tf.aarch64) { + REQUIRE(count == 3u); + } #endif - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. 
- // if (tf.vsx) { - // REQUIRE(count == 3u); - // } + // if (tf.vsx) { + // REQUIRE(count == 3u); + // } #endif + } } TEST_CASE("vfabi float") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b, c, d] = make_vars("a", "b", "c", "d"); + auto [a, b, c, d] = make_vars("a", "b", "c", "d"); - add_cfunc(s, "cfunc", {pow(a, .6f), pow(b, .7f), pow(c, .8f), pow(d, .9f)}); + add_cfunc(s, "cfunc", {pow(a, .6f), pow(b, .7f), pow(c, .8f), pow(d, .9f)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr + = reinterpret_cast(s.jit_lookup("cfunc")); - const std::vector ins{.1f, .2f, .3f, .4f}; - std::vector outs(4u, 0.); + const std::vector ins{.1f, .2f, .3f, .4f}; + std::vector outs(4u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::pow(.1f, .6f))); - REQUIRE(outs[1] == approximately(std::pow(.2f, .7f))); - REQUIRE(outs[2] == approximately(std::pow(.3f, .8f))); - REQUIRE(outs[3] == approximately(std::pow(.4f, .9f))); + REQUIRE(outs[0] == approximately(std::pow(.1f, .6f))); + REQUIRE(outs[1] == approximately(std::pow(.2f, .7f))); + REQUIRE(outs[2] == approximately(std::pow(.3f, .8f))); + REQUIRE(outs[3] == approximately(std::pow(.4f, .9f))); #if defined(HEYOKA_WITH_SLEEF) - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.pow.f32", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.pow.f32", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 4 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 5u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 4 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 5u); + } #if LLVM_VERSION_MAJOR >= 16 - if (tf.aarch64) { - REQUIRE(count == 5u); - } + if (tf.aarch64) { + REQUIRE(count == 5u); + } #endif - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 5u); - // } + // if (tf.vsx) { + // REQUIRE(count == 5u); + // } #endif + } } diff --git a/test/sin.cpp b/test/sin.cpp index 8948d49db..ccbf816a2 100644 --- a/test/sin.cpp +++ b/test/sin.cpp @@ -232,338 +232,342 @@ TEST_CASE("normalise") // Tests to check vectorisation via the vector-function-abi-variant machinery. 
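(For reference, the change applied to every vfabi test in this patch is the same: the test body is wrapped in a loop over fast_math = {false, true}, so that the vectorisation and accuracy checks are exercised in both configurations of the llvm_state. The standalone sketch below shows that pattern in isolation. The template arguments and the compiled-function pointer signature are elided in the flattened patch text above; the ones written out here are assumptions based on heyoka's public double-precision cfunc API, not text taken from the patch.)

#include <cmath>
#include <iostream>
#include <vector>

#include <heyoka/heyoka.hpp>

int main()
{
    using namespace heyoka;

    auto [a, b] = make_vars("a", "b");

    for (auto fast_math : {false, true}) {
        // SLP vectorisation groups the scalar calls into SIMD calls; fast_math
        // toggles the relaxed floating-point mode, as in the tests above.
        llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math};

        // Assumed template argument: double-precision compiled function.
        add_cfunc<double>(s, "cfunc", {sin(a), sin(b)});
        s.compile();

        // Assumed cfunc signature: (outputs, inputs, pars, time).
        auto *cf = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(
            s.jit_lookup("cfunc"));

        const std::vector<double> ins{1., 2.};
        std::vector<double> outs(2u, 0.);
        cf(outs.data(), ins.data(), nullptr, nullptr);

        // Compare against the standard library results.
        std::cout << outs[0] - std::sin(1.) << ' ' << outs[1] - std::sin(2.) << '\n';
    }
}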
TEST_CASE("vfabi double") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b] = make_vars("a", "b"); + auto [a, b] = make_vars("a", "b"); - add_cfunc(s, "cfunc", {sin(a), sin(b)}); + add_cfunc(s, "cfunc", {sin(a), sin(b)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cfunc")); - const std::vector ins{1., 2.}; - std::vector outs(2u, 0.); + const std::vector ins{1., 2.}; + std::vector outs(2u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::sin(1.))); - REQUIRE(outs[1] == approximately(std::sin(2.))); + REQUIRE(outs[0] == approximately(std::sin(1.))); + REQUIRE(outs[1] == approximately(std::sin(2.))); #if defined(HEYOKA_WITH_SLEEF) - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.sin.f64", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.sin.f64", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 2 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 2 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } #if LLVM_VERSION_MAJOR >= 16 - // NOTE: LLVM16 is currently the version tested in the CI on arm64. - if (tf.aarch64) { - REQUIRE(count == 3u); - } + // NOTE: LLVM16 is currently the version tested in the CI on arm64. + if (tf.aarch64) { + REQUIRE(count == 3u); + } #endif - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 3u); - // } + // if (tf.vsx) { + // REQUIRE(count == 3u); + // } - // Some more extensive testing specific to x86, only for this function. - auto [c, d, e] = make_vars("c", "d", "e"); + // Some more extensive testing specific to x86, only for this function. 
+ auto [c, d, e] = make_vars("c", "d", "e"); - llvm_state s2{kw::slp_vectorize = true}; + llvm_state s2{kw::slp_vectorize = true}; - add_cfunc(s2, "cfunc1", {sin(a), sin(b), sin(c), sin(d)}); - add_cfunc(s2, "cfunc2", {sin(a), sin(b), sin(c), sin(d), sin(e)}); + add_cfunc(s2, "cfunc1", {sin(a), sin(b), sin(c), sin(d)}); + add_cfunc(s2, "cfunc2", {sin(a), sin(b), sin(c), sin(d), sin(e)}); - s2.compile(); + s2.compile(); - auto *cf1_ptr - = reinterpret_cast(s2.jit_lookup("cfunc1")); - auto *cf2_ptr - = reinterpret_cast(s2.jit_lookup("cfunc2")); + auto *cf1_ptr = reinterpret_cast( + s2.jit_lookup("cfunc1")); + auto *cf2_ptr = reinterpret_cast( + s2.jit_lookup("cfunc2")); - const std::vector ins2{1., 2., 3., 4., 5.}; - std::vector outs2(5u, 0.); + const std::vector ins2{1., 2., 3., 4., 5.}; + std::vector outs2(5u, 0.); - cf1_ptr(outs2.data(), ins2.data(), nullptr, nullptr); + cf1_ptr(outs2.data(), ins2.data(), nullptr, nullptr); - REQUIRE(outs2[0] == approximately(std::sin(1.))); - REQUIRE(outs2[1] == approximately(std::sin(2.))); - REQUIRE(outs2[2] == approximately(std::sin(3.))); - REQUIRE(outs2[3] == approximately(std::sin(4.))); + REQUIRE(outs2[0] == approximately(std::sin(1.))); + REQUIRE(outs2[1] == approximately(std::sin(2.))); + REQUIRE(outs2[2] == approximately(std::sin(3.))); + REQUIRE(outs2[3] == approximately(std::sin(4.))); - cf2_ptr(outs2.data(), ins2.data(), nullptr, nullptr); + cf2_ptr(outs2.data(), ins2.data(), nullptr, nullptr); - REQUIRE(outs2[0] == approximately(std::sin(1.))); - REQUIRE(outs2[1] == approximately(std::sin(2.))); - REQUIRE(outs2[2] == approximately(std::sin(3.))); - REQUIRE(outs2[3] == approximately(std::sin(4.))); - REQUIRE(outs2[4] == approximately(std::sin(5.))); + REQUIRE(outs2[0] == approximately(std::sin(1.))); + REQUIRE(outs2[1] == approximately(std::sin(2.))); + REQUIRE(outs2[2] == approximately(std::sin(3.))); + REQUIRE(outs2[3] == approximately(std::sin(4.))); + REQUIRE(outs2[4] == approximately(std::sin(5.))); - ir = s2.get_ir(); + ir = s2.get_ir(); - count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.sin.f64", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.sin.f64", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - if (tf.avx) { - // NOTE: occurrences of the scalar version: - // - 4 + 5 calls in the strided cfuncs, - // - 1 declaration, - // - 1 call to deal with the remainder in the - // 5-argument version. - REQUIRE(count == 11u); - } + if (tf.avx) { + // NOTE: occurrences of the scalar version: + // - 4 + 5 calls in the strided cfuncs, + // - 1 declaration, + // - 1 call to deal with the remainder in the + // 5-argument version. + REQUIRE(count == 11u); + } - // NOTE: this next test seems to work properly starting - // from LLVM 13. + // NOTE: this next test seems to work properly starting + // from LLVM 13. #if LLVM_VERSION_MAJOR >= 13 - // Check that the autovec works also on batch sizes which do not correspond - // exactly to an available vector width. - llvm_state s3{kw::slp_vectorize = true}; + // Check that the autovec works also on batch sizes which do not correspond + // exactly to an available vector width. 
+ llvm_state s3{kw::slp_vectorize = true}; - add_cfunc(s3, "cfunc", {sin(a)}, kw::batch_size = 3u); + add_cfunc(s3, "cfunc", {sin(a)}, kw::batch_size = 3u); - s3.compile(); + s3.compile(); - auto *cf3_ptr - = reinterpret_cast(s3.jit_lookup("cfunc")); + auto *cf3_ptr = reinterpret_cast( + s3.jit_lookup("cfunc")); - std::vector ins3 = {1., 2., 3.}, outs3 = {0., 0., 0.}; + std::vector ins3 = {1., 2., 3.}, outs3 = {0., 0., 0.}; - cf3_ptr(outs3.data(), ins3.data(), nullptr, nullptr); + cf3_ptr(outs3.data(), ins3.data(), nullptr, nullptr); - REQUIRE(outs3[0] == approximately(std::sin(1.))); - REQUIRE(outs3[1] == approximately(std::sin(2.))); - REQUIRE(outs3[2] == approximately(std::sin(3.))); + REQUIRE(outs3[0] == approximately(std::sin(1.))); + REQUIRE(outs3[1] == approximately(std::sin(2.))); + REQUIRE(outs3[2] == approximately(std::sin(3.))); - ir = s3.get_ir(); + ir = s3.get_ir(); - count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.sin.f64", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.sin.f64", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 1 call in the remainder of the unstrided cfunc, - // - 1 call in the remainder of the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 1 call in the remainder of the unstrided cfunc, + // - 1 call in the remainder of the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } #if LLVM_VERSION_MAJOR >= 16 - if (tf.aarch64) { - REQUIRE(count == 3u); - } + if (tf.aarch64) { + REQUIRE(count == 3u); + } #endif #endif #endif + } } TEST_CASE("vfabi float") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b, c, d] = make_vars("a", "b", "c", "d"); + auto [a, b, c, d] = make_vars("a", "b", "c", "d"); - add_cfunc(s, "cfunc", {sin(a), sin(b), sin(c), sin(d)}); + add_cfunc(s, "cfunc", {sin(a), sin(b), sin(c), sin(d)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr + = reinterpret_cast(s.jit_lookup("cfunc")); - const std::vector ins{1., 2., 3., 4.}; - std::vector outs(4u, 0.); + const std::vector ins{1., 2., 3., 4.}; + std::vector outs(4u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::sin(1.f))); - REQUIRE(outs[1] == approximately(std::sin(2.f))); - REQUIRE(outs[2] == approximately(std::sin(3.f))); - REQUIRE(outs[3] == approximately(std::sin(4.f))); + REQUIRE(outs[0] == approximately(std::sin(1.f))); + REQUIRE(outs[1] == approximately(std::sin(2.f))); + REQUIRE(outs[2] == approximately(std::sin(3.f))); + REQUIRE(outs[3] == approximately(std::sin(4.f))); #if defined(HEYOKA_WITH_SLEEF) - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.sin.f32", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = 
boost::make_find_iterator(ir, boost::first_finder("@llvm.sin.f32", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 4 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 5u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 4 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 5u); + } #if LLVM_VERSION_MAJOR >= 16 - // NOTE: LLVM16 is currently the version tested in the CI on arm64. - if (tf.aarch64) { - REQUIRE(count == 5u); - } + // NOTE: LLVM16 is currently the version tested in the CI on arm64. + if (tf.aarch64) { + REQUIRE(count == 5u); + } #endif - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 5u); - // } + // if (tf.vsx) { + // REQUIRE(count == 5u); + // } - // Some more extensive testing specific to x86, only for this function. - auto [e, f, g, h, i] = make_vars("e", "f", "g", "h", "i"); + // Some more extensive testing specific to x86, only for this function. + auto [e, f, g, h, i] = make_vars("e", "f", "g", "h", "i"); - llvm_state s2{kw::slp_vectorize = true}; + llvm_state s2{kw::slp_vectorize = true}; - add_cfunc(s2, "cfunc1", {sin(a), sin(b), sin(c), sin(d), sin(e), sin(f), sin(g), sin(h)}); - add_cfunc(s2, "cfunc2", {sin(a), sin(b), sin(c), sin(d), sin(e), sin(f), sin(g), sin(h), sin(i)}); + add_cfunc(s2, "cfunc1", {sin(a), sin(b), sin(c), sin(d), sin(e), sin(f), sin(g), sin(h)}); + add_cfunc(s2, "cfunc2", {sin(a), sin(b), sin(c), sin(d), sin(e), sin(f), sin(g), sin(h), sin(i)}); - s2.compile(); + s2.compile(); - auto *cf1_ptr - = reinterpret_cast(s2.jit_lookup("cfunc1")); - auto *cf2_ptr - = reinterpret_cast(s2.jit_lookup("cfunc2")); + auto *cf1_ptr + = reinterpret_cast(s2.jit_lookup("cfunc1")); + auto *cf2_ptr + = reinterpret_cast(s2.jit_lookup("cfunc2")); - const std::vector ins2{1., 2., 3., 4., 5., 6., 7., 8., 9.}; - std::vector outs2(9u, 0.); + const std::vector ins2{1., 2., 3., 4., 5., 6., 7., 8., 9.}; + std::vector outs2(9u, 0.); - cf1_ptr(outs2.data(), ins2.data(), nullptr, nullptr); + cf1_ptr(outs2.data(), ins2.data(), nullptr, nullptr); - REQUIRE(outs2[0] == approximately(std::sin(1.f))); - REQUIRE(outs2[1] == approximately(std::sin(2.f))); - REQUIRE(outs2[2] == approximately(std::sin(3.f))); - REQUIRE(outs2[3] == approximately(std::sin(4.f))); - REQUIRE(outs2[4] == approximately(std::sin(5.f))); - REQUIRE(outs2[5] == approximately(std::sin(6.f))); - REQUIRE(outs2[6] == approximately(std::sin(7.f))); - REQUIRE(outs2[7] == approximately(std::sin(8.f))); + REQUIRE(outs2[0] == approximately(std::sin(1.f))); + REQUIRE(outs2[1] == approximately(std::sin(2.f))); + REQUIRE(outs2[2] == approximately(std::sin(3.f))); + REQUIRE(outs2[3] == approximately(std::sin(4.f))); + REQUIRE(outs2[4] == approximately(std::sin(5.f))); + REQUIRE(outs2[5] == approximately(std::sin(6.f))); + 
REQUIRE(outs2[6] == approximately(std::sin(7.f))); + REQUIRE(outs2[7] == approximately(std::sin(8.f))); - cf2_ptr(outs2.data(), ins2.data(), nullptr, nullptr); + cf2_ptr(outs2.data(), ins2.data(), nullptr, nullptr); - REQUIRE(outs2[0] == approximately(std::sin(1.f))); - REQUIRE(outs2[1] == approximately(std::sin(2.f))); - REQUIRE(outs2[2] == approximately(std::sin(3.f))); - REQUIRE(outs2[3] == approximately(std::sin(4.f))); - REQUIRE(outs2[4] == approximately(std::sin(5.f))); - REQUIRE(outs2[5] == approximately(std::sin(6.f))); - REQUIRE(outs2[6] == approximately(std::sin(7.f))); - REQUIRE(outs2[7] == approximately(std::sin(8.f))); - REQUIRE(outs2[8] == approximately(std::sin(9.f))); + REQUIRE(outs2[0] == approximately(std::sin(1.f))); + REQUIRE(outs2[1] == approximately(std::sin(2.f))); + REQUIRE(outs2[2] == approximately(std::sin(3.f))); + REQUIRE(outs2[3] == approximately(std::sin(4.f))); + REQUIRE(outs2[4] == approximately(std::sin(5.f))); + REQUIRE(outs2[5] == approximately(std::sin(6.f))); + REQUIRE(outs2[6] == approximately(std::sin(7.f))); + REQUIRE(outs2[7] == approximately(std::sin(8.f))); + REQUIRE(outs2[8] == approximately(std::sin(9.f))); - ir = s2.get_ir(); + ir = s2.get_ir(); - count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.sin.f32", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.sin.f32", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - if (tf.avx) { - // NOTE: occurrences of the scalar version: - // - 8 + 9 calls in the strided cfuncs, - // - 1 declaration, - // - 1 call to deal with the remainder in the - // 9-argument version. - REQUIRE(count == 19u); - } + if (tf.avx) { + // NOTE: occurrences of the scalar version: + // - 8 + 9 calls in the strided cfuncs, + // - 1 declaration, + // - 1 call to deal with the remainder in the + // 9-argument version. + REQUIRE(count == 19u); + } - // NOTE: this next test seems to work properly starting - // from LLVM 13. + // NOTE: this next test seems to work properly starting + // from LLVM 13. #if LLVM_VERSION_MAJOR >= 13 - // Check that the autovec works also on batch sizes which do not correspond - // exactly to an available vector width. - llvm_state s3{kw::slp_vectorize = true}; + // Check that the autovec works also on batch sizes which do not correspond + // exactly to an available vector width. 
+ llvm_state s3{kw::slp_vectorize = true}; - add_cfunc(s3, "cfunc", {sin(a)}, kw::batch_size = 5u); + add_cfunc(s3, "cfunc", {sin(a)}, kw::batch_size = 5u); - s3.compile(); + s3.compile(); - auto *cf3_ptr - = reinterpret_cast(s3.jit_lookup("cfunc")); + auto *cf3_ptr + = reinterpret_cast(s3.jit_lookup("cfunc")); - std::vector ins3 = {1., 2., 3., 4., 5.}, outs3 = {0., 0., 0., 0., 0.}; + std::vector ins3 = {1., 2., 3., 4., 5.}, outs3 = {0., 0., 0., 0., 0.}; - cf3_ptr(outs3.data(), ins3.data(), nullptr, nullptr); + cf3_ptr(outs3.data(), ins3.data(), nullptr, nullptr); - REQUIRE(outs3[0] == approximately(std::sin(1.f))); - REQUIRE(outs3[1] == approximately(std::sin(2.f))); - REQUIRE(outs3[2] == approximately(std::sin(3.f))); - REQUIRE(outs3[3] == approximately(std::sin(4.f))); - REQUIRE(outs3[4] == approximately(std::sin(5.f))); + REQUIRE(outs3[0] == approximately(std::sin(1.f))); + REQUIRE(outs3[1] == approximately(std::sin(2.f))); + REQUIRE(outs3[2] == approximately(std::sin(3.f))); + REQUIRE(outs3[3] == approximately(std::sin(4.f))); + REQUIRE(outs3[4] == approximately(std::sin(5.f))); - ir = s3.get_ir(); + ir = s3.get_ir(); - count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.sin.f32", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@llvm.sin.f32", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 1 call in the remainder of the unstrided cfunc, - // - 1 call in the remainder of the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 1 call in the remainder of the unstrided cfunc, + // - 1 call in the remainder of the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } #if LLVM_VERSION_MAJOR >= 16 - if (tf.aarch64) { - REQUIRE(count == 3u); - } + if (tf.aarch64) { + REQUIRE(count == 3u); + } #endif #endif #endif + } } diff --git a/test/sinh.cpp b/test/sinh.cpp index a0a218c82..fd8cad405 100644 --- a/test/sinh.cpp +++ b/test/sinh.cpp @@ -223,312 +223,317 @@ TEST_CASE("normalise") // Tests to check vectorisation via the vector-function-abi-variant machinery. TEST_CASE("vfabi double") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b] = make_vars("a", "b"); + auto [a, b] = make_vars("a", "b"); - add_cfunc(s, "cfunc", {sinh(a), sinh(b)}); + add_cfunc(s, "cfunc", {sinh(a), sinh(b)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cfunc")); - const std::vector ins{1., 2.}; - std::vector outs(2u, 0.); + const std::vector ins{1., 2.}; + std::vector outs(2u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::sinh(1.))); - REQUIRE(outs[1] == approximately(std::sinh(2.))); + REQUIRE(outs[0] == approximately(std::sinh(1.))); + REQUIRE(outs[1] == approximately(std::sinh(2.))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. 
#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@sinh", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@sinh", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 2 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 2 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } - if (tf.aarch64) { - REQUIRE(count == 3u); - } + if (tf.aarch64) { + REQUIRE(count == 3u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 3u); - // } + // if (tf.vsx) { + // REQUIRE(count == 3u); + // } - // Some more extensive testing specific to x86, only for this function. - auto [c, d, e] = make_vars("c", "d", "e"); + // Some more extensive testing specific to x86, only for this function. 
+ auto [c, d, e] = make_vars("c", "d", "e"); - llvm_state s2{kw::slp_vectorize = true}; + llvm_state s2{kw::slp_vectorize = true}; - add_cfunc(s2, "cfunc1", {sinh(a), sinh(b), sinh(c), sinh(d)}); - add_cfunc(s2, "cfunc2", {sinh(a), sinh(b), sinh(c), sinh(d), sinh(e)}); + add_cfunc(s2, "cfunc1", {sinh(a), sinh(b), sinh(c), sinh(d)}); + add_cfunc(s2, "cfunc2", {sinh(a), sinh(b), sinh(c), sinh(d), sinh(e)}); - s2.compile(); + s2.compile(); - auto *cf1_ptr - = reinterpret_cast(s2.jit_lookup("cfunc1")); - auto *cf2_ptr - = reinterpret_cast(s2.jit_lookup("cfunc2")); + auto *cf1_ptr = reinterpret_cast( + s2.jit_lookup("cfunc1")); + auto *cf2_ptr = reinterpret_cast( + s2.jit_lookup("cfunc2")); - const std::vector ins2{1., 2., 3., 4., 5.}; - std::vector outs2(5u, 0.); + const std::vector ins2{1., 2., 3., 4., 5.}; + std::vector outs2(5u, 0.); - cf1_ptr(outs2.data(), ins2.data(), nullptr, nullptr); + cf1_ptr(outs2.data(), ins2.data(), nullptr, nullptr); - REQUIRE(outs2[0] == approximately(std::sinh(1.))); - REQUIRE(outs2[1] == approximately(std::sinh(2.))); - REQUIRE(outs2[2] == approximately(std::sinh(3.))); - REQUIRE(outs2[3] == approximately(std::sinh(4.))); + REQUIRE(outs2[0] == approximately(std::sinh(1.))); + REQUIRE(outs2[1] == approximately(std::sinh(2.))); + REQUIRE(outs2[2] == approximately(std::sinh(3.))); + REQUIRE(outs2[3] == approximately(std::sinh(4.))); - cf2_ptr(outs2.data(), ins2.data(), nullptr, nullptr); + cf2_ptr(outs2.data(), ins2.data(), nullptr, nullptr); - REQUIRE(outs2[0] == approximately(std::sinh(1.))); - REQUIRE(outs2[1] == approximately(std::sinh(2.))); - REQUIRE(outs2[2] == approximately(std::sinh(3.))); - REQUIRE(outs2[3] == approximately(std::sinh(4.))); - REQUIRE(outs2[4] == approximately(std::sinh(5.))); + REQUIRE(outs2[0] == approximately(std::sinh(1.))); + REQUIRE(outs2[1] == approximately(std::sinh(2.))); + REQUIRE(outs2[2] == approximately(std::sinh(3.))); + REQUIRE(outs2[3] == approximately(std::sinh(4.))); + REQUIRE(outs2[4] == approximately(std::sinh(5.))); - ir = s2.get_ir(); + ir = s2.get_ir(); - count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@sinh", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@sinh", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - if (tf.avx) { - // NOTE: occurrences of the scalar version: - // - 4 + 5 calls in the strided cfuncs, - // - 1 declaration, - // - 1 call to deal with the remainder in the - // 5-argument version. - REQUIRE(count == 11u); - } + if (tf.avx) { + // NOTE: occurrences of the scalar version: + // - 4 + 5 calls in the strided cfuncs, + // - 1 declaration, + // - 1 call to deal with the remainder in the + // 5-argument version. + REQUIRE(count == 11u); + } - // Check that the autovec works also on batch sizes which do not correspond - // exactly to an available vector width. - llvm_state s3{kw::slp_vectorize = true}; + // Check that the autovec works also on batch sizes which do not correspond + // exactly to an available vector width. 
+ llvm_state s3{kw::slp_vectorize = true}; - add_cfunc(s3, "cfunc", {sinh(a)}, kw::batch_size = 3u); + add_cfunc(s3, "cfunc", {sinh(a)}, kw::batch_size = 3u); - s3.compile(); + s3.compile(); - auto *cf3_ptr - = reinterpret_cast(s3.jit_lookup("cfunc")); + auto *cf3_ptr = reinterpret_cast( + s3.jit_lookup("cfunc")); - std::vector ins3 = {1., 2., 3.}, outs3 = {0., 0., 0.}; + std::vector ins3 = {1., 2., 3.}, outs3 = {0., 0., 0.}; - cf3_ptr(outs3.data(), ins3.data(), nullptr, nullptr); + cf3_ptr(outs3.data(), ins3.data(), nullptr, nullptr); - REQUIRE(outs3[0] == approximately(std::sinh(1.))); - REQUIRE(outs3[1] == approximately(std::sinh(2.))); - REQUIRE(outs3[2] == approximately(std::sinh(3.))); + REQUIRE(outs3[0] == approximately(std::sinh(1.))); + REQUIRE(outs3[1] == approximately(std::sinh(2.))); + REQUIRE(outs3[2] == approximately(std::sinh(3.))); - ir = s3.get_ir(); + ir = s3.get_ir(); - count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@sinh", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@sinh", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 1 call in the remainder of the unstrided cfunc, - // - 1 call in the remainder of the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 1 call in the remainder of the unstrided cfunc, + // - 1 call in the remainder of the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } - if (tf.aarch64) { - REQUIRE(count == 3u); - } + if (tf.aarch64) { + REQUIRE(count == 3u); + } #endif + } } TEST_CASE("vfabi float") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b, c, d] = make_vars("a", "b", "c", "d"); + auto [a, b, c, d] = make_vars("a", "b", "c", "d"); - add_cfunc(s, "cfunc", {sinh(a), sinh(b), sinh(c), sinh(d)}); + add_cfunc(s, "cfunc", {sinh(a), sinh(b), sinh(c), sinh(d)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr + = reinterpret_cast(s.jit_lookup("cfunc")); - const std::vector ins{1., 2., 3., 4.}; - std::vector outs(4u, 0.); + const std::vector ins{1., 2., 3., 4.}; + std::vector outs(4u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::sinh(1.f))); - REQUIRE(outs[1] == approximately(std::sinh(2.f))); - REQUIRE(outs[2] == approximately(std::sinh(3.f))); - REQUIRE(outs[3] == approximately(std::sinh(4.f))); + REQUIRE(outs[0] == approximately(std::sinh(1.f))); + REQUIRE(outs[1] == approximately(std::sinh(2.f))); + REQUIRE(outs[2] == approximately(std::sinh(3.f))); + REQUIRE(outs[3] == approximately(std::sinh(4.f))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. 
#if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@sinhf", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@sinhf", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 4 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 5u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 4 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 5u); + } - if (tf.aarch64) { - REQUIRE(count == 5u); - } + if (tf.aarch64) { + REQUIRE(count == 5u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. - // if (tf.vsx) { - // REQUIRE(count == 3u); - // } + // if (tf.vsx) { + // REQUIRE(count == 3u); + // } - // Some more extensive testing specific to x86, only for this function. - auto [e, f, g, h, i] = make_vars("e", "f", "g", "h", "i"); + // Some more extensive testing specific to x86, only for this function. 
+ auto [e, f, g, h, i] = make_vars("e", "f", "g", "h", "i"); - llvm_state s2{kw::slp_vectorize = true}; + llvm_state s2{kw::slp_vectorize = true}; - add_cfunc(s2, "cfunc1", {sinh(a), sinh(b), sinh(c), sinh(d), sinh(e), sinh(f), sinh(g), sinh(h)}); - add_cfunc(s2, "cfunc2", {sinh(a), sinh(b), sinh(c), sinh(d), sinh(e), sinh(f), sinh(g), sinh(h), sinh(i)}); + add_cfunc(s2, "cfunc1", {sinh(a), sinh(b), sinh(c), sinh(d), sinh(e), sinh(f), sinh(g), sinh(h)}); + add_cfunc(s2, "cfunc2", + {sinh(a), sinh(b), sinh(c), sinh(d), sinh(e), sinh(f), sinh(g), sinh(h), sinh(i)}); - s2.compile(); + s2.compile(); - auto *cf1_ptr - = reinterpret_cast(s2.jit_lookup("cfunc1")); - auto *cf2_ptr - = reinterpret_cast(s2.jit_lookup("cfunc2")); + auto *cf1_ptr + = reinterpret_cast(s2.jit_lookup("cfunc1")); + auto *cf2_ptr + = reinterpret_cast(s2.jit_lookup("cfunc2")); - const std::vector ins2{1., 2., 3., 4., 5., 6., 7., 8., 9.}; - std::vector outs2(9u, 0.); + const std::vector ins2{1., 2., 3., 4., 5., 6., 7., 8., 9.}; + std::vector outs2(9u, 0.); - cf1_ptr(outs2.data(), ins2.data(), nullptr, nullptr); + cf1_ptr(outs2.data(), ins2.data(), nullptr, nullptr); - REQUIRE(outs2[0] == approximately(std::sinh(1.f))); - REQUIRE(outs2[1] == approximately(std::sinh(2.f))); - REQUIRE(outs2[2] == approximately(std::sinh(3.f))); - REQUIRE(outs2[3] == approximately(std::sinh(4.f))); - REQUIRE(outs2[4] == approximately(std::sinh(5.f))); - REQUIRE(outs2[5] == approximately(std::sinh(6.f))); - REQUIRE(outs2[6] == approximately(std::sinh(7.f))); - REQUIRE(outs2[7] == approximately(std::sinh(8.f))); + REQUIRE(outs2[0] == approximately(std::sinh(1.f))); + REQUIRE(outs2[1] == approximately(std::sinh(2.f))); + REQUIRE(outs2[2] == approximately(std::sinh(3.f))); + REQUIRE(outs2[3] == approximately(std::sinh(4.f))); + REQUIRE(outs2[4] == approximately(std::sinh(5.f))); + REQUIRE(outs2[5] == approximately(std::sinh(6.f))); + REQUIRE(outs2[6] == approximately(std::sinh(7.f))); + REQUIRE(outs2[7] == approximately(std::sinh(8.f))); - cf2_ptr(outs2.data(), ins2.data(), nullptr, nullptr); + cf2_ptr(outs2.data(), ins2.data(), nullptr, nullptr); - REQUIRE(outs2[0] == approximately(std::sinh(1.f))); - REQUIRE(outs2[1] == approximately(std::sinh(2.f))); - REQUIRE(outs2[2] == approximately(std::sinh(3.f))); - REQUIRE(outs2[3] == approximately(std::sinh(4.f))); - REQUIRE(outs2[4] == approximately(std::sinh(5.f))); - REQUIRE(outs2[5] == approximately(std::sinh(6.f))); - REQUIRE(outs2[6] == approximately(std::sinh(7.f))); - REQUIRE(outs2[7] == approximately(std::sinh(8.f))); - REQUIRE(outs2[8] == approximately(std::sinh(9.f))); + REQUIRE(outs2[0] == approximately(std::sinh(1.f))); + REQUIRE(outs2[1] == approximately(std::sinh(2.f))); + REQUIRE(outs2[2] == approximately(std::sinh(3.f))); + REQUIRE(outs2[3] == approximately(std::sinh(4.f))); + REQUIRE(outs2[4] == approximately(std::sinh(5.f))); + REQUIRE(outs2[5] == approximately(std::sinh(6.f))); + REQUIRE(outs2[6] == approximately(std::sinh(7.f))); + REQUIRE(outs2[7] == approximately(std::sinh(8.f))); + REQUIRE(outs2[8] == approximately(std::sinh(9.f))); - ir = s2.get_ir(); + ir = s2.get_ir(); - count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@sinhf", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@sinhf", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - if (tf.avx) { - // NOTE: occurrences of the scalar version: - // - 8 + 9 calls 
in the strided cfuncs, - // - 1 declaration, - // - 1 call to deal with the remainder in the - // 9-argument version. - REQUIRE(count == 19u); - } + if (tf.avx) { + // NOTE: occurrences of the scalar version: + // - 8 + 9 calls in the strided cfuncs, + // - 1 declaration, + // - 1 call to deal with the remainder in the + // 9-argument version. + REQUIRE(count == 19u); + } - // Check that the autovec works also on batch sizes which do not correspond - // exactly to an available vector width. - llvm_state s3{kw::slp_vectorize = true}; + // Check that the autovec works also on batch sizes which do not correspond + // exactly to an available vector width. + llvm_state s3{kw::slp_vectorize = true}; - add_cfunc(s3, "cfunc", {sinh(a)}, kw::batch_size = 5u); + add_cfunc(s3, "cfunc", {sinh(a)}, kw::batch_size = 5u); - s3.compile(); + s3.compile(); - auto *cf3_ptr - = reinterpret_cast(s3.jit_lookup("cfunc")); + auto *cf3_ptr + = reinterpret_cast(s3.jit_lookup("cfunc")); - std::vector ins3 = {1., 2., 3., 4., 5.}, outs3 = {0., 0., 0., 0., 0.}; + std::vector ins3 = {1., 2., 3., 4., 5.}, outs3 = {0., 0., 0., 0., 0.}; - cf3_ptr(outs3.data(), ins3.data(), nullptr, nullptr); + cf3_ptr(outs3.data(), ins3.data(), nullptr, nullptr); - REQUIRE(outs3[0] == approximately(std::sinh(1.f))); - REQUIRE(outs3[1] == approximately(std::sinh(2.f))); - REQUIRE(outs3[2] == approximately(std::sinh(3.f))); - REQUIRE(outs3[3] == approximately(std::sinh(4.f))); - REQUIRE(outs3[4] == approximately(std::sinh(5.f))); + REQUIRE(outs3[0] == approximately(std::sinh(1.f))); + REQUIRE(outs3[1] == approximately(std::sinh(2.f))); + REQUIRE(outs3[2] == approximately(std::sinh(3.f))); + REQUIRE(outs3[3] == approximately(std::sinh(4.f))); + REQUIRE(outs3[4] == approximately(std::sinh(5.f))); - ir = s3.get_ir(); + ir = s3.get_ir(); - count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@sinhf", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@sinhf", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 1 call in the remainder of the unstrided cfunc, - // - 1 call in the remainder of the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 1 call in the remainder of the unstrided cfunc, + // - 1 call in the remainder of the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } - if (tf.aarch64) { - REQUIRE(count == 3u); - } + if (tf.aarch64) { + REQUIRE(count == 3u); + } #endif + } } diff --git a/test/tan.cpp b/test/tan.cpp index 66788af21..57a08d954 100644 --- a/test/tan.cpp +++ b/test/tan.cpp @@ -222,124 +222,128 @@ TEST_CASE("normalise") // Tests to check vectorisation via the vector-function-abi-variant machinery. 
TEST_CASE("vfabi double") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b] = make_vars("a", "b"); + auto [a, b] = make_vars("a", "b"); - add_cfunc(s, "cfunc", {tan(a), tan(b)}); + add_cfunc(s, "cfunc", {tan(a), tan(b)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cfunc")); - const std::vector ins{.1, .2}; - std::vector outs(2u, 0.); + const std::vector ins{.1, .2}; + std::vector outs(2u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::tan(.1))); - REQUIRE(outs[1] == approximately(std::tan(.2))); + REQUIRE(outs[0] == approximately(std::tan(.1))); + REQUIRE(outs[1] == approximately(std::tan(.2))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. #if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@tan", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@tan", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 2 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 2 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } - if (tf.aarch64) { - REQUIRE(count == 3u); - } + if (tf.aarch64) { + REQUIRE(count == 3u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. 
- // if (tf.vsx) { - // REQUIRE(count == 3u); - // } + // if (tf.vsx) { + // REQUIRE(count == 3u); + // } #endif + } } TEST_CASE("vfabi float") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b, c, d] = make_vars("a", "b", "c", "d"); + auto [a, b, c, d] = make_vars("a", "b", "c", "d"); - add_cfunc(s, "cfunc", {tan(a), tan(b), tan(c), tan(d)}); + add_cfunc(s, "cfunc", {tan(a), tan(b), tan(c), tan(d)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr + = reinterpret_cast(s.jit_lookup("cfunc")); - const std::vector ins{1., 2., 3., 4.}; - std::vector outs(4u, 0.); + const std::vector ins{1., 2., 3., 4.}; + std::vector outs(4u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::tan(1.f))); - REQUIRE(outs[1] == approximately(std::tan(2.f))); - REQUIRE(outs[2] == approximately(std::tan(3.f))); - REQUIRE(outs[3] == approximately(std::tan(4.f))); + REQUIRE(outs[0] == approximately(std::tan(1.f))); + REQUIRE(outs[1] == approximately(std::tan(2.f))); + REQUIRE(outs[2] == approximately(std::tan(3.f))); + REQUIRE(outs[3] == approximately(std::tan(4.f))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. #if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@tanf", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@tanf", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 4 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 5u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 4 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 5u); + } - if (tf.aarch64) { - REQUIRE(count == 5u); - } + if (tf.aarch64) { + REQUIRE(count == 5u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. 
- // if (tf.vsx) { - // REQUIRE(count == 5u); - // } + // if (tf.vsx) { + // REQUIRE(count == 5u); + // } #endif + } } diff --git a/test/tanh.cpp b/test/tanh.cpp index 43b6f6b2b..b68f011a0 100644 --- a/test/tanh.cpp +++ b/test/tanh.cpp @@ -223,124 +223,128 @@ TEST_CASE("normalise") // Tests to check vectorisation via the vector-function-abi-variant machinery. TEST_CASE("vfabi double") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b] = make_vars("a", "b"); + auto [a, b] = make_vars("a", "b"); - add_cfunc(s, "cfunc", {tanh(a), tanh(b)}); + add_cfunc(s, "cfunc", {tanh(a), tanh(b)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr = reinterpret_cast( + s.jit_lookup("cfunc")); - const std::vector ins{.1, .2}; - std::vector outs(2u, 0.); + const std::vector ins{.1, .2}; + std::vector outs(2u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::tanh(.1))); - REQUIRE(outs[1] == approximately(std::tanh(.2))); + REQUIRE(outs[0] == approximately(std::tanh(.1))); + REQUIRE(outs[1] == approximately(std::tanh(.2))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. #if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@tanh", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@tanh", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 2 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 3u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 2 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 3u); + } - if (tf.aarch64) { - REQUIRE(count == 3u); - } + if (tf.aarch64) { + REQUIRE(count == 3u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. 
- // if (tf.vsx) { - // REQUIRE(count == 3u); - // } + // if (tf.vsx) { + // REQUIRE(count == 3u); + // } #endif + } } TEST_CASE("vfabi float") { - llvm_state s{kw::slp_vectorize = true}; + for (auto fast_math : {false, true}) { + llvm_state s{kw::slp_vectorize = true, kw::fast_math = fast_math}; - auto [a, b, c, d] = make_vars("a", "b", "c", "d"); + auto [a, b, c, d] = make_vars("a", "b", "c", "d"); - add_cfunc(s, "cfunc", {tanh(a), tanh(b), tanh(c), tanh(d)}); + add_cfunc(s, "cfunc", {tanh(a), tanh(b), tanh(c), tanh(d)}); - s.compile(); + s.compile(); - auto *cf_ptr - = reinterpret_cast(s.jit_lookup("cfunc")); + auto *cf_ptr + = reinterpret_cast(s.jit_lookup("cfunc")); - const std::vector ins{1., 2., 3., 4.}; - std::vector outs(4u, 0.); + const std::vector ins{1., 2., 3., 4.}; + std::vector outs(4u, 0.); - cf_ptr(outs.data(), ins.data(), nullptr, nullptr); + cf_ptr(outs.data(), ins.data(), nullptr, nullptr); - REQUIRE(outs[0] == approximately(std::tanh(1.f))); - REQUIRE(outs[1] == approximately(std::tanh(2.f))); - REQUIRE(outs[2] == approximately(std::tanh(3.f))); - REQUIRE(outs[3] == approximately(std::tanh(4.f))); + REQUIRE(outs[0] == approximately(std::tanh(1.f))); + REQUIRE(outs[1] == approximately(std::tanh(2.f))); + REQUIRE(outs[2] == approximately(std::tanh(3.f))); + REQUIRE(outs[3] == approximately(std::tanh(4.f))); - // NOTE: autovec with external scalar functions seems to work - // only since LLVM 16. + // NOTE: autovec with external scalar functions seems to work + // only since LLVM 16. #if defined(HEYOKA_WITH_SLEEF) && LLVM_VERSION_MAJOR >= 16 - const auto &tf = detail::get_target_features(); + const auto &tf = detail::get_target_features(); - auto ir = s.get_ir(); + auto ir = s.get_ir(); - using string_find_iterator = boost::find_iterator; + using string_find_iterator = boost::find_iterator; - auto count = 0u; - for (auto it = boost::make_find_iterator(ir, boost::first_finder("@tanhf", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; - } + auto count = 0u; + for (auto it = boost::make_find_iterator(ir, boost::first_finder("@tanhf", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - // NOTE: at the moment we have comprehensive coverage of LLVM versions - // in the CI only for x86_64. - if (tf.sse2) { - // NOTE: occurrences of the scalar version: - // - 4 calls in the strided cfunc, - // - 1 declaration. - REQUIRE(count == 5u); - } + // NOTE: at the moment we have comprehensive coverage of LLVM versions + // in the CI only for x86_64. + if (tf.sse2) { + // NOTE: occurrences of the scalar version: + // - 4 calls in the strided cfunc, + // - 1 declaration. + REQUIRE(count == 5u); + } - if (tf.aarch64) { - REQUIRE(count == 5u); - } + if (tf.aarch64) { + REQUIRE(count == 5u); + } - // NOTE: currently no auto-vectorization happens on ppc64 due apparently - // to the way the target machine is being set up by orc/lljit (it works - // fine with the opt tool). When this is resolved, we can test ppc64 too. + // NOTE: currently no auto-vectorization happens on ppc64 due apparently + // to the way the target machine is being set up by orc/lljit (it works + // fine with the opt tool). When this is resolved, we can test ppc64 too. 
-    // if (tf.vsx) {
-    //     REQUIRE(count == 5u);
-    // }
+        // if (tf.vsx) {
+        //     REQUIRE(count == 5u);
+        // }

 #endif
+    }
 }
diff --git a/test/taylor_sincos.cpp b/test/taylor_sincos.cpp
index 8efebd5fd..a8dc59bf0 100644
--- a/test/taylor_sincos.cpp
+++ b/test/taylor_sincos.cpp
@@ -47,10 +47,11 @@ const auto fp_types = std::tuple{};

 template <typename T, typename U>
-void compare_batch_scalar(std::initializer_list<U> sys, unsigned opt_level, bool high_accuracy, bool compact_mode)
+void compare_batch_scalar(std::initializer_list<U> sys, unsigned opt_level, bool high_accuracy, bool compact_mode,
+                          bool fast_math)
 {
     for (auto batch_size : {2u, 4u, 8u, 5u}) {
-        llvm_state s{kw::opt_level = opt_level};
+        llvm_state s{kw::opt_level = opt_level, kw::fast_math = fast_math};

         taylor_add_jet<T>(s, "jet_batch", sys, 3, batch_size, high_accuracy, compact_mode);
         taylor_add_jet<T>(s, "jet_scalar", sys, 3, 1, high_accuracy, compact_mode);
@@ -98,7 +99,7 @@ TEST_CASE("taylor sincos decompose bug 00")

 TEST_CASE("taylor sincos")
 {
-    auto tester = [](auto fp_x, unsigned opt_level, bool high_accuracy, bool compact_mode) {
+    auto tester = [](auto fp_x, unsigned opt_level, bool high_accuracy, bool compact_mode, bool fast_math) {
         using std::sin;
         using std::cos;

@@ -108,7 +109,7 @@ TEST_CASE("taylor sincos")

         // Number-number tests.
         {
-            llvm_state s{kw::opt_level = opt_level};
+            llvm_state s{kw::opt_level = opt_level, kw::fast_math = fast_math};

             taylor_add_jet<fp_t>(s, "jet", {sin(expression{number{fp_t{2}}}) + cos(expression{number{fp_t{3}}}), x + y},
                                  1, 1, high_accuracy, compact_mode);
@@ -129,7 +130,7 @@ TEST_CASE("taylor sincos")
         }

         {
-            llvm_state s{kw::opt_level = opt_level};
+            llvm_state s{kw::opt_level = opt_level, kw::fast_math = fast_math};

             taylor_add_jet<fp_t>(s, "jet", {sin(par[0]) + cos(par[1]), x + y}, 1, 1, high_accuracy, compact_mode);
@@ -151,7 +152,7 @@ TEST_CASE("taylor sincos")
         }

         {
-            llvm_state s{kw::opt_level = opt_level};
+            llvm_state s{kw::opt_level = opt_level, kw::fast_math = fast_math};

             taylor_add_jet<fp_t>(s, "jet", {sin(expression{number{fp_t{2}}}) + cos(expression{number{fp_t{3}}}), x + y},
                                  1, 2, high_accuracy, compact_mode);
@@ -179,7 +180,7 @@ TEST_CASE("taylor sincos")
         }

         {
-            llvm_state s{kw::opt_level = opt_level};
+            llvm_state s{kw::opt_level = opt_level, kw::fast_math = fast_math};

             taylor_add_jet<fp_t>(s, "jet", {sin(par[0]) + cos(par[1]), x + y}, 1, 2, high_accuracy, compact_mode);
@@ -208,7 +209,7 @@ TEST_CASE("taylor sincos")
         }

         {
-            llvm_state s{kw::opt_level = opt_level};
+            llvm_state s{kw::opt_level = opt_level, kw::fast_math = fast_math};

             taylor_add_jet<fp_t>(s, "jet", {sin(expression{number{fp_t{2}}}) + cos(expression{number{fp_t{3}}}), x + y},
                                  2, 1, high_accuracy, compact_mode);
@@ -231,7 +232,7 @@ TEST_CASE("taylor sincos")
         }

         {
-            llvm_state s{kw::opt_level = opt_level};
+            llvm_state s{kw::opt_level = opt_level, kw::fast_math = fast_math};

             taylor_add_jet<fp_t>(s, "jet", {sin(expression{number{fp_t{2}}}) + cos(expression{number{fp_t{3}}}), x + y},
                                  2, 2, high_accuracy, compact_mode);
@@ -265,7 +266,7 @@ TEST_CASE("taylor sincos")
         }

         {
-            llvm_state s{kw::opt_level = opt_level};
+            llvm_state s{kw::opt_level = opt_level, kw::fast_math = fast_math};

             taylor_add_jet<fp_t>(s, "jet", {sin(expression{number{fp_t{2}}}) + cos(expression{number{fp_t{3}}}), x + y},
                                  3, 3, high_accuracy, compact_mode);
@@ -313,7 +314,7 @@ TEST_CASE("taylor sincos")
         }

         {
-            llvm_state s{kw::opt_level = opt_level};
+            llvm_state s{kw::opt_level = opt_level, kw::fast_math = fast_math};

             taylor_add_jet<fp_t>(s, "jet", {sin(par[0]) + cos(par[1]), x + y}, 3, 3, high_accuracy, compact_mode);
@@ -363,11 +364,11 @@ TEST_CASE("taylor sincos")
@@ TEST_CASE("taylor sincos") // Do the batch/scalar comparison. compare_batch_scalar({sin(expression{number{fp_t{2}}}) + cos(expression{number{fp_t{3}}}), x + y}, - opt_level, high_accuracy, compact_mode); + opt_level, high_accuracy, compact_mode, fast_math); // Variable tests. { - llvm_state s{kw::opt_level = opt_level}; + llvm_state s{kw::opt_level = opt_level, kw::fast_math = fast_math}; taylor_add_jet(s, "jet", {sin(y + 1_dbl), cos(x + 1_dbl)}, 1, 1, high_accuracy, compact_mode); @@ -387,7 +388,7 @@ TEST_CASE("taylor sincos") } { - llvm_state s{kw::opt_level = opt_level}; + llvm_state s{kw::opt_level = opt_level, kw::fast_math = fast_math}; taylor_add_jet(s, "jet", {sin(y + 1_dbl), cos(x + 1_dbl)}, 1, 2, high_accuracy, compact_mode); @@ -414,7 +415,7 @@ TEST_CASE("taylor sincos") } { - llvm_state s{kw::opt_level = opt_level}; + llvm_state s{kw::opt_level = opt_level, kw::fast_math = fast_math}; taylor_add_jet(s, "jet", {sin(y + 1_dbl), cos(x + 1_dbl)}, 2, 1, high_accuracy, compact_mode); @@ -436,7 +437,7 @@ TEST_CASE("taylor sincos") } { - llvm_state s{kw::opt_level = opt_level}; + llvm_state s{kw::opt_level = opt_level, kw::fast_math = fast_math}; taylor_add_jet(s, "jet", {sin(y), cos(x)}, 2, 2, high_accuracy, compact_mode); @@ -469,7 +470,7 @@ TEST_CASE("taylor sincos") } { - llvm_state s{kw::opt_level = opt_level}; + llvm_state s{kw::opt_level = opt_level, kw::fast_math = fast_math}; taylor_add_jet(s, "jet", {sin(y), cos(x)}, 3, 3, high_accuracy, compact_mode); @@ -522,15 +523,17 @@ TEST_CASE("taylor sincos") } // Do the batch/scalar comparison. - compare_batch_scalar({sin(y), cos(x)}, opt_level, high_accuracy, compact_mode); + compare_batch_scalar({sin(y), cos(x)}, opt_level, high_accuracy, compact_mode, fast_math); }; for (auto cm : {false, true}) { for (auto f : {false, true}) { - tuple_for_each(fp_types, [&tester, f, cm](auto x) { tester(x, 0, f, cm); }); - tuple_for_each(fp_types, [&tester, f, cm](auto x) { tester(x, 1, f, cm); }); - tuple_for_each(fp_types, [&tester, f, cm](auto x) { tester(x, 2, f, cm); }); - tuple_for_each(fp_types, [&tester, f, cm](auto x) { tester(x, 3, f, cm); }); + for (auto fm : {false, true}) { + tuple_for_each(fp_types, [&tester, f, cm, fm](auto x) { tester(x, 0, f, cm, fm); }); + tuple_for_each(fp_types, [&tester, f, cm, fm](auto x) { tester(x, 1, f, cm, fm); }); + tuple_for_each(fp_types, [&tester, f, cm, fm](auto x) { tester(x, 2, f, cm, fm); }); + tuple_for_each(fp_types, [&tester, f, cm, fm](auto x) { tester(x, 3, f, cm, fm); }); + } } } } diff --git a/tutorial/CMakeLists.txt b/tutorial/CMakeLists.txt index b027f63b5..1c8b2fd95 100644 --- a/tutorial/CMakeLists.txt +++ b/tutorial/CMakeLists.txt @@ -29,6 +29,7 @@ ADD_HEYOKA_TUTORIAL(s11n_event) ADD_HEYOKA_TUTORIAL(ensemble) ADD_HEYOKA_TUTORIAL(par_mode) ADD_HEYOKA_TUTORIAL(extended_precision) +ADD_HEYOKA_TUTORIAL(single_precision) if(HEYOKA_WITH_MPPP AND mp++_WITH_MPFR) ADD_HEYOKA_TUTORIAL(arbitrary_precision) diff --git a/tutorial/single_precision.cpp b/tutorial/single_precision.cpp new file mode 100644 index 000000000..f3bb901be --- /dev/null +++ b/tutorial/single_precision.cpp @@ -0,0 +1,50 @@ +// Copyright 2020, 2021, 2022, 2023 Francesco Biscani (bluescarni@gmail.com), Dario Izzo (dario.izzo@gmail.com) +// +// This file is part of the heyoka library. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+
+#include <cmath>
+#include <iostream>
+
+#include <heyoka/heyoka.hpp>
+
+using namespace heyoka;
+
+int main()
+{
+    // Create the symbolic variables x and v.
+    auto [x, v] = make_vars("x", "v");
+
+    // Create the integrator object
+    // in single precision.
+    auto ta = taylor_adaptive<float>{// Definition of the ODE system:
+                                     // x' = v
+                                     // v' = -9.8 * sin(x)
+                                     {prime(x) = v, prime(v) = -9.8 * sin(x)},
+                                     // Initial conditions
+                                     // for x and v.
+                                     {-1.f, 0.f}};
+
+    // Create a small helper to compute the energy constant
+    // from the state vector.
+    auto compute_energy = [](const auto &sv) {
+        using std::cos;
+
+        return (sv[1] * sv[1]) / 2 + 9.8 * (1 - cos(sv[0]));
+    };
+
+    // Compute and store the initial energy.
+    const auto orig_E = compute_energy(ta.get_state());
+
+    // Integrate for a few timesteps.
+    for (auto i = 0; i < 20; ++i) {
+        using std::abs;
+
+        ta.step();
+
+        std::cout << "Relative energy error: " << abs((orig_E - compute_energy(ta.get_state())) / orig_E) << '\n';
+    }
+}
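
As a companion to the new tutorial and to the slp_vectorize/fast_math combinations exercised by the updated vfabi tests, here is a minimal standalone sketch (not part of the patch) of how the two options can be combined with a single-precision compiled function. The master header name and the cfunc pointer signature (outputs, inputs, parameters, time) are assumptions inferred from the usage shown above, so treat this strictly as an illustration of the tested pattern.

// Minimal sketch: single-precision cfunc with SLP vectorisation and fast math enabled.
#include <cmath>
#include <iostream>
#include <vector>

#include <heyoka/heyoka.hpp>

using namespace heyoka;

int main()
{
    auto [a, b, c, d] = make_vars("a", "b", "c", "d");

    // Enable both SLP vectorisation and fast math, as in the test loops above.
    llvm_state s{kw::slp_vectorize = true, kw::fast_math = true};

    // Compile a single-precision function returning the sine of each input.
    add_cfunc<float>(s, "cfunc", {sin(a), sin(b), sin(c), sin(d)});
    s.compile();

    // Fetch the compiled function: outputs, inputs, parameters, time.
    auto *cf_ptr
        = reinterpret_cast<void (*)(float *, const float *, const float *, const float *)>(s.jit_lookup("cfunc"));

    const std::vector<float> ins{1.f, 2.f, 3.f, 4.f};
    std::vector<float> outs(4u, 0.f);

    cf_ptr(outs.data(), ins.data(), nullptr, nullptr);

    // Fast math relaxes floating-point strictness, so the results may differ
    // slightly from the standard library values.
    for (auto i = 0u; i < 4u; ++i) {
        std::cout << outs[i] << " vs " << std::sin(ins[i]) << '\n';
    }
}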