From d2c80ead05ee91f2bb24afa6a523592e46c6e0ed Mon Sep 17 00:00:00 2001 From: Nara Date: Fri, 17 May 2024 23:41:58 +0200 Subject: [PATCH] fix(lfsr113,mrg,mtgp32,philox4x32,threefry,xorwow): force inline generate function (#495) A compiler change caused the generate function to no longer be inlined, which caused a performance regression. Force inlining resolves this. --- library/src/rng/lfsr113.hpp | 18 +++++++++--------- library/src/rng/mrg.hpp | 3 ++- library/src/rng/mtgp32.hpp | 3 ++- library/src/rng/philox4x32_10.hpp | 16 ++++++++-------- library/src/rng/threefry.hpp | 16 ++++++++-------- library/src/rng/xorwow.hpp | 3 ++- 6 files changed, 31 insertions(+), 28 deletions(-) diff --git a/library/src/rng/lfsr113.hpp b/library/src/rng/lfsr113.hpp index 41c5e58a..a3acd02e 100644 --- a/library/src/rng/lfsr113.hpp +++ b/library/src/rng/lfsr113.hpp @@ -61,15 +61,15 @@ __host__ __device__ inline void init_lfsr113_engines(dim3 block_idx, } template -__host__ __device__ void generate_lfsr113(dim3 block_idx, - dim3 thread_idx, - dim3 grid_dim, - dim3 /*block_dim*/, - lfsr113_device_engine* engines, - const unsigned int start_engine_id, - T* data, - const size_t n, - Distribution distribution) +__host__ __device__ __forceinline__ void generate_lfsr113(dim3 block_idx, + dim3 thread_idx, + dim3 grid_dim, + dim3 /*block_dim*/, + lfsr113_device_engine* engines, + const unsigned int start_engine_id, + T* data, + const size_t n, + Distribution distribution) { static_assert(is_single_tile_config(IsDynamic), "This kernel should only be used with single tile configs"); diff --git a/library/src/rng/mrg.hpp b/library/src/rng/mrg.hpp index 0e792509..a01ac23b 100644 --- a/library/src/rng/mrg.hpp +++ b/library/src/rng/mrg.hpp @@ -30,6 +30,7 @@ #include "generator_type.hpp" #include "system.hpp" +#include #include #include #include @@ -62,7 +63,7 @@ __host__ __device__ void init_engines_mrg(dim3 block_idx, } template -__host__ __device__ void generate_mrg(dim3 block_idx, +__host__ 
__device__ __forceinline__ void generate_mrg(dim3 block_idx, dim3 thread_idx, dim3 grid_dim, dim3 /*block_dim*/, diff --git a/library/src/rng/mtgp32.hpp b/library/src/rng/mtgp32.hpp index d16b3d6d..fa81defb 100644 --- a/library/src/rng/mtgp32.hpp +++ b/library/src/rng/mtgp32.hpp @@ -63,6 +63,7 @@ #include "generator_type.hpp" #include "system.hpp" +#include #include #include #include @@ -222,7 +223,7 @@ __device__ void save_head_tail(T (&output)[output_width], } template -__host__ __device__ void generate_mtgp(dim3 block_idx, +__host__ __device__ __forceinline__ void generate_mtgp(dim3 block_idx, dim3 thread_idx, dim3 grid_dim, dim3 /*block_dim*/, diff --git a/library/src/rng/philox4x32_10.hpp b/library/src/rng/philox4x32_10.hpp index c2aa578d..86a071a4 100644 --- a/library/src/rng/philox4x32_10.hpp +++ b/library/src/rng/philox4x32_10.hpp @@ -103,14 +103,14 @@ struct philox4x32_10_device_engine : public ::rocrand_device::philox4x32_10_engi }; template -__host__ __device__ void generate_philox(dim3 block_idx, - dim3 thread_idx, - dim3 grid_dim, - dim3 block_dim, - philox4x32_10_device_engine engine, - T* data, - const size_t n, - Distribution distribution) +__host__ __device__ __forceinline__ void generate_philox(dim3 block_idx, + dim3 thread_idx, + dim3 grid_dim, + dim3 block_dim, + philox4x32_10_device_engine engine, + T* data, + const size_t n, + Distribution distribution) { constexpr unsigned int input_width = Distribution::input_width; constexpr unsigned int output_width = Distribution::output_width; diff --git a/library/src/rng/threefry.hpp b/library/src/rng/threefry.hpp index 8a0ea7b1..8982c855 100644 --- a/library/src/rng/threefry.hpp +++ b/library/src/rng/threefry.hpp @@ -83,14 +83,14 @@ struct threefry_device_engine : public BaseType }; template -__host__ __device__ void generate_threefry(dim3 block_idx, - dim3 thread_idx, - dim3 grid_dim, - dim3 block_dim, - Engine engine, - T* data, - const size_t n, - Distribution distribution) +__host__ __device__ 
__forceinline__ void generate_threefry(dim3 block_idx, + dim3 thread_idx, + dim3 grid_dim, + dim3 block_dim, + Engine engine, + T* data, + const size_t n, + Distribution distribution) { using engine_scalar_type = typename Engine::scalar_type; diff --git a/library/src/rng/xorwow.hpp b/library/src/rng/xorwow.hpp index 4113c9bd..32f3b97c 100644 --- a/library/src/rng/xorwow.hpp +++ b/library/src/rng/xorwow.hpp @@ -29,6 +29,7 @@ #include "generator_type.hpp" #include "system.hpp" +#include #include #include @@ -60,7 +61,7 @@ __host__ __device__ inline void init_xorwow_engines(dim3 block_idx, } template -__host__ __device__ void generate_xorwow(dim3 block_idx, +__host__ __device__ __forceinline__ void generate_xorwow(dim3 block_idx, dim3 thread_idx, dim3 grid_dim, dim3 /*block_dim*/,