Merge pull request #367 from bluescarni/pr/fast_math_vec

Low-precision vector math support
bluescarni · Nov 29, 2023 · f8c2b0e · f8c2b0e
2 parents 4ccf2ce + 4a15c19
commit f8c2b0e
Show file tree

Hide file tree

Showing 29 changed files with 2,060 additions and 1,642 deletions.
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
@@ -56,6 +56,7 @@ ADD_HEYOKA_BENCHMARK(event_overhead)
 ADD_HEYOKA_BENCHMARK(ss_event_overhead)
 ADD_HEYOKA_BENCHMARK(h_oscillator_lt)
 ADD_HEYOKA_BENCHMARK(mb)
+ADD_HEYOKA_BENCHMARK(kepE_bench)
 ADD_HEYOKA_BENCHMARK(vsop2013_elliptic)
 ADD_HEYOKA_BENCHMARK(vsop2013_cartesian)
 ADD_HEYOKA_BENCHMARK(elp2000_cartesian)

diff --git a/benchmark/kepE_bench.cpp b/benchmark/kepE_bench.cpp
@@ -0,0 +1,126 @@
+// Copyright 2020, 2021, 2022, 2023 Francesco Biscani ([email protected]), Dario Izzo ([email protected])
+//
+// This file is part of the heyoka library.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <algorithm>
+#include <cmath>
+#include <ios>
+#include <iostream>
+#include <random>
+#include <vector>
+
+#include <boost/math/constants/constants.hpp>
+#include <boost/program_options.hpp>
+
+#include <fmt/core.h>
+
+#include <spdlog/spdlog.h>
+#include <spdlog/stopwatch.h>
+
+#include <heyoka/expression.hpp>
+#include <heyoka/kw.hpp>
+#include <heyoka/llvm_state.hpp>
+#include <heyoka/logging.hpp>
+#include <heyoka/math/kepE.hpp>
+#include <stdexcept>
+
+using namespace heyoka;
+
+int main(int argc, char *argv[])
+{
+    namespace po = boost::program_options;
+
+    double ecc{};
+    unsigned seed{};
+    bool fast_math{};
+
+    po::options_description desc("Options");
+
+    desc.add_options()("help", "produce help message")("ecc", po::value<double>(&ecc)->default_value(0.1),
+                                                       "eccentricity")(
+        "seed", po::value<unsigned>(&seed)->default_value(42u),
+        "random seed")("fast-math", po::value<bool>(&fast_math)->default_value(true), "fast math mode");
+
+    po::variables_map vm;
+    po::store(po::parse_command_line(argc, argv, desc), vm);
+    po::notify(vm);
+
+    if (vm.count("help") != 0u) {
+        std::cout << desc << "\n";
+        return 0;
+    }
+
+    if (!std::isfinite(ecc) || ecc < 0 || ecc >= 1) {
+        throw std::invalid_argument(fmt::format("Invalid eccentricity value: {}", ecc));
+    }
+
+    constexpr auto N = 1'000'000ul;
+
+    std::cout << std::boolalpha;
+    std::cout << "Eccentricity: " << ecc << '\n';
+    std::cout << "fast_math   : " << fast_math << '\n';
+    std::cout << "N           : " << N << "\n\n";
+
+    // RNG setup.
+    std::mt19937 rng(seed);
+    std::uniform_real_distribution<double> Mdist(0, 2 * boost::math::constants::pi<double>());
+
+    // Data setup.
+    std::vector<double> e_vec, M_vec, out_vec, out_vec_batch;
+    e_vec.resize(N, ecc);
+    M_vec.resize(N);
+    out_vec.resize(N);
+    out_vec_batch.resize(N);
+    std::generate(M_vec.begin(), M_vec.end(), [&rng, &Mdist]() { return Mdist(rng); });
+
+    // cfunc setup.
+    auto [e, M] = make_vars("e", "M");
+
+    llvm_state s{kw::fast_math = fast_math};
+    const auto batch_size = recommended_simd_size<double>();
+    add_cfunc<double>(s, "f_scalar", {kepE(e, M)}, kw::vars = {e, M});
+    add_cfunc<double>(s, "f_batch", {kepE(e, M)}, kw::vars = {e, M}, kw::batch_size = batch_size);
+    s.compile();
+
+    auto *f_sc = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(
+        s.jit_lookup("f_scalar"));
+    auto *f_ba
+        = reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(s.jit_lookup("f_batch"));
+
+    // Fetch the logger.
+    create_logger();
+    set_logger_level_trace();
+    auto logger = spdlog::get("heyoka");
+
+    // Scalar runtime.
+    spdlog::stopwatch sw;
+
+    for (auto i = 0ull; i < N; ++i) {
+        double ins[] = {e_vec[i], M_vec[i]};
+        f_sc(out_vec.data() + i, ins, nullptr, nullptr);
+    }
+
+    logger->trace("Scalar run took: {}s", sw);
+
+    std::vector<double> batch_buffer(batch_size * 2ul);
+    auto *batch_b_ptr = batch_buffer.data();
+
+    sw.reset();
+
+    for (auto i = 0ull; i < N - N % batch_size; i += batch_size) {
+        std::copy(e_vec.data() + i, e_vec.data() + i + batch_size, batch_b_ptr);
+        std::copy(M_vec.data() + i, M_vec.data() + i + batch_size, batch_b_ptr + batch_size);
+        f_ba(out_vec_batch.data() + i, batch_b_ptr, nullptr, nullptr);
+    }
+
+    logger->trace("Batch run took: {}s", sw);
+
+    std::cout.precision(16);
+    for (auto i = 0u; i < 20u; ++i) {
+        std::cout << out_vec[i] << " vs " << out_vec_batch[i] << '\n';
+    }
+}
diff --git a/doc/advanced_tutorials.rst b/doc/advanced_tutorials.rst
@@ -27,6 +27,7 @@ the tutorials should not be hard to follow.
   tut_batch_mode
   tut_extended_precision
   tut_arbitrary_precision
+  tut_single_precision
   tut_s11n
   tut_ensemble
   tut_parallel_mode
diff --git a/doc/changelog.rst b/doc/changelog.rst
@@ -7,8 +7,7 @@ Changelog
 New
 ~~~
 
-- Add the step callback (batch) set classes to compose
-  step callbacks
+- Add step callback set classes to compose step callbacks
   (`#366 <https://github.com/bluescarni/heyoka/pull/366>`__).
 - Add support for single-precision computations
   (`#363 <https://github.com/bluescarni/heyoka/pull/363>`__).
@@ -18,6 +17,10 @@ New
 Changes
 ~~~~~~~
 
+- When the ``fast_math`` mode is active, the SIMD-vectorised
+  mathematical functions now use low-precision implementations.
+  This can lead to substantial performance increases in batch mode
+  (`#367 <https://github.com/bluescarni/heyoka/pull/367>`__).
 - Initialising a step callback or a callable from an empty
   function object (e.g., a null pointer, an empty ``std::function``, etc.)
   now results in an empty object

diff --git a/doc/tut_single_precision.rst b/doc/tut_single_precision.rst
@@ -0,0 +1,107 @@
+.. _tut_single_precision:
+
+Computations in single precision
+================================
+
+.. versionadded:: 3.2.0
+
+In previous tutorials we saw how heyoka, in addition to the standard
+`double precision <https://en.wikipedia.org/wiki/Double-precision_floating-point_format>`__,
+also supports computations in :ref:`extended precision <tut_extended_precision>` and
+:ref:`arbitrary precision <tut_arbitrary_precision>`. Starting with version 3.2.0, heyoka
+supports also computations in `single precision <https://en.wikipedia.org/wiki/Single-precision_floating-point_format>`__.
+
+Single-precision computations can lead to substantial performance benefits when high accuracy is not required.
+In particular, single-precision :ref:`batch mode <tut_batch_mode>` can use a SIMD width twice larger
+than double precision, leading to an increase by a factor of 2 of the computational throughput.
+In scalar computations, the use of single precision reduces by half the memory usage with respect to double precision,
+which can help alleviating performance issues in large ODE systems. This can be particularly noticeable in applications such as
+:external:ref:`neural ODEs <tut_neural_ode>`.
+
+In C++, single-precision values are usually represented via the standard floating-point type ``float``.
+Correspondingly, and similarly to what explained in the :ref:`extended precision <tut_extended_precision>`
+tutorial, single-precision computations are activated by passing the ``float`` template parameter to functions
+and classes in the heyoka API.
+
+A simple example
+----------------
+
+In order to verify that heyoka indeed is able to work in single precision, we will be monitoring the evolution of the energy constant
+in a low-precision numerical integration of the simple pendulum.
+
+Let us begin as usual with the definition of the dynamical equations and the creation of the integrator object:
+
+.. literalinclude:: ../tutorial/single_precision.cpp
+   :language: c++
+   :lines: 18-29
+
+In order to activate single precision, we created an integrator object of type ``taylor_adaptive<float>`` - that is,
+we specified ``float``, instead of the usual ``double``, as the (only) template parameter for the ``taylor_adaptive`` class template.
+Note that we specified a single-precision initial state via the use of the ``f`` suffix for the numerical constants.
+Note also that, when operating in single precision,
+*all* numerical values encapsulated in an integrator are represented in single precision - this includes not only the state vector,
+but also the time coordinate, the tolerance, the Taylor coefficients, etc. Similarly to double-precision integrators, the default value
+of the tolerance is the machine epsilon of ``float``.
+
+Next, we define a small helper function that will allow us to monitor the evolution of the energy constant
+throughout the integration:
+
+.. literalinclude:: ../tutorial/single_precision.cpp
+   :language: c++
+   :lines: 31-37
+
+Before starting the integration, we compute and store the initial energy for later use:
+
+.. literalinclude:: ../tutorial/single_precision.cpp
+   :language: c++
+   :lines: 39-40
+
+We can now begin a step-by-step integration. At the end of each step, we will be computing
+and printing to screen the relative energy error:
+
+.. literalinclude:: ../tutorial/single_precision.cpp
+   :language: c++
+   :lines: 42-49
+
+.. code-block:: console
+
+   Relative energy error: 1.48183e-07
+   Relative energy error: 5.29227e-08
+   Relative energy error: 6.08611e-08
+   Relative energy error: 1.79937e-07
+   Relative energy error: 1.74645e-07
+   Relative energy error: 2.24921e-07
+   Relative energy error: 2.4609e-07
+   Relative energy error: 1.1643e-07
+   Relative energy error: 1.79937e-07
+   Relative energy error: 1.40245e-07
+   Relative energy error: 2.54029e-07
+   Relative energy error: 1.84899e-07
+   Relative energy error: 1.83245e-07
+   Relative energy error: 1.56122e-07
+   Relative energy error: 2.22275e-07
+   Relative energy error: 1.61414e-07
+   Relative energy error: 2.11691e-07
+   Relative energy error: 2.88428e-07
+   Relative energy error: 2.93721e-07
+   Relative energy error: 1.82583e-07
+
+The console output indeed confirms that energy is conserved at the level of the epsilon of the
+single-precision format (that is, :math:`\sim 10^{-7}`).
+
+Other classes and functions
+---------------------------
+
+Besides the adaptive integrator, several other classes and functions in heyoka can be used in single precision.
+
+The :ref:`event classes <tut_events>`, for instance, can be constructed in single precision by passing ``float``
+as the template parameter (instead of ``double``). Note that the precision of an event
+must match the precision of the integrator object in which the event is used, otherwise an error will be produced
+at compilation time.
+
+Full code listing
+-----------------
+
+.. literalinclude:: ../tutorial/single_precision.cpp
+   :language: c++
+   :lines: 9-
diff --git a/include/heyoka/detail/vector_math.hpp b/include/heyoka/detail/vector_math.hpp
@@ -27,6 +27,12 @@ struct vf_info {
     // The vfabi attribute corresponding
     // to the vector function.
     std::string vf_abi_attr;
+    // The corresponding low-precision versions
+    // of the above. These will be empty if
+    // the low-precision counterpart is
+    // not available.
+    std::string lp_name;
+    std::string lp_vf_abi_attr;
     // Number of SIMD lanes.
     std::uint32_t width = 0;
     // Number of arguments.