Skip to content

Commit

Permalink
fix(lfsr113,mrg,mtgp32,philox4x32,threefry,xorwow): force inline gen…
Browse files Browse the repository at this point in the history
…erate function (#495)

A compiler change caused the generate functions to no longer be inlined, which caused a performance regression. Marking them `__forceinline__` forces inlining and resolves this.
  • Loading branch information
Naraenda authored May 17, 2024
1 parent 824eadf commit d2c80ea
Show file tree
Hide file tree
Showing 6 changed files with 31 additions and 28 deletions.
18 changes: 9 additions & 9 deletions library/src/rng/lfsr113.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,15 +61,15 @@ __host__ __device__ inline void init_lfsr113_engines(dim3 block_idx,
}

template<class ConfigProvider, bool IsDynamic, class T, class Distribution>
__host__ __device__ void generate_lfsr113(dim3 block_idx,
dim3 thread_idx,
dim3 grid_dim,
dim3 /*block_dim*/,
lfsr113_device_engine* engines,
const unsigned int start_engine_id,
T* data,
const size_t n,
Distribution distribution)
__host__ __device__ __forceinline__ void generate_lfsr113(dim3 block_idx,
dim3 thread_idx,
dim3 grid_dim,
dim3 /*block_dim*/,
lfsr113_device_engine* engines,
const unsigned int start_engine_id,
T* data,
const size_t n,
Distribution distribution)
{
static_assert(is_single_tile_config<ConfigProvider, T>(IsDynamic),
"This kernel should only be used with single tile configs");
Expand Down
3 changes: 2 additions & 1 deletion library/src/rng/mrg.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include "generator_type.hpp"
#include "system.hpp"

#include <hip/amd_detail/host_defines.h>
#include <rocrand/rocrand.h>
#include <rocrand/rocrand_mrg31k3p.h>
#include <rocrand/rocrand_mrg32k3a.h>
Expand Down Expand Up @@ -62,7 +63,7 @@ __host__ __device__ void init_engines_mrg(dim3 block_idx,
}

template<class ConfigProvider, bool IsDynamic, class Engine, class T, class Distribution>
__host__ __device__ void generate_mrg(dim3 block_idx,
__host__ __device__ __forceinline__ void generate_mrg(dim3 block_idx,
dim3 thread_idx,
dim3 grid_dim,
dim3 /*block_dim*/,
Expand Down
3 changes: 2 additions & 1 deletion library/src/rng/mtgp32.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@
#include "generator_type.hpp"
#include "system.hpp"

#include <hip/amd_detail/host_defines.h>
#include <rocrand/rocrand.h>
#include <rocrand/rocrand_mtgp32.h>
#include <rocrand/rocrand_mtgp32_11213.h>
Expand Down Expand Up @@ -222,7 +223,7 @@ __device__ void save_head_tail(T (&output)[output_width],
}

template<class ConfigProvider, bool IsDynamic, class T, class Distribution>
__host__ __device__ void generate_mtgp(dim3 block_idx,
__host__ __device__ __forceinline__ void generate_mtgp(dim3 block_idx,
dim3 thread_idx,
dim3 grid_dim,
dim3 /*block_dim*/,
Expand Down
16 changes: 8 additions & 8 deletions library/src/rng/philox4x32_10.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,14 +103,14 @@ struct philox4x32_10_device_engine : public ::rocrand_device::philox4x32_10_engi
};

template<typename T, typename Distribution>
__host__ __device__ void generate_philox(dim3 block_idx,
dim3 thread_idx,
dim3 grid_dim,
dim3 block_dim,
philox4x32_10_device_engine engine,
T* data,
const size_t n,
Distribution distribution)
__host__ __device__ __forceinline__ void generate_philox(dim3 block_idx,
dim3 thread_idx,
dim3 grid_dim,
dim3 block_dim,
philox4x32_10_device_engine engine,
T* data,
const size_t n,
Distribution distribution)
{
constexpr unsigned int input_width = Distribution::input_width;
constexpr unsigned int output_width = Distribution::output_width;
Expand Down
16 changes: 8 additions & 8 deletions library/src/rng/threefry.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,14 +83,14 @@ struct threefry_device_engine : public BaseType
};

template<class Engine, class T, class Distribution>
__host__ __device__ void generate_threefry(dim3 block_idx,
dim3 thread_idx,
dim3 grid_dim,
dim3 block_dim,
Engine engine,
T* data,
const size_t n,
Distribution distribution)
__host__ __device__ __forceinline__ void generate_threefry(dim3 block_idx,
dim3 thread_idx,
dim3 grid_dim,
dim3 block_dim,
Engine engine,
T* data,
const size_t n,
Distribution distribution)
{
using engine_scalar_type = typename Engine::scalar_type;

Expand Down
3 changes: 2 additions & 1 deletion library/src/rng/xorwow.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include "generator_type.hpp"
#include "system.hpp"

#include <hip/amd_detail/host_defines.h>
#include <rocrand/rocrand.h>
#include <rocrand/rocrand_xorwow.h>

Expand Down Expand Up @@ -60,7 +61,7 @@ __host__ __device__ inline void init_xorwow_engines(dim3 block_idx,
}

template<class ConfigProvider, bool IsDynamic, class T, class Distribution>
__host__ __device__ void generate_xorwow(dim3 block_idx,
__host__ __device__ __forceinline__ void generate_xorwow(dim3 block_idx,
dim3 thread_idx,
dim3 grid_dim,
dim3 /*block_dim*/,
Expand Down

0 comments on commit d2c80ea

Please sign in to comment.