diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 8ddeba454d..e4ca0e1809 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -18,3 +18,7 @@ raja_add_benchmark( raja_add_benchmark( NAME ltimes SOURCES ltimes.cpp) + +raja_add_benchmark( + NAME raja_view_blur + SOURCES raja_view_blur.cpp) diff --git a/benchmark/raja_view_blur.cpp b/benchmark/raja_view_blur.cpp new file mode 100644 index 0000000000..c3db04b2dd --- /dev/null +++ b/benchmark/raja_view_blur.cpp @@ -0,0 +1,170 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include "RAJA/util/Timer.hpp" +#include + +/* + * RAJA view performance test + * Kernel performs a 2D Gaussian blur + * + */ + +#if defined(RAJA_ENABLE_HIP) +using device_pol = RAJA::hip_exec<256>; +using device_resources = RAJA::resources::Hip; + +using kernel_pol = RAJA::KernelPolicy< + RAJA::statement::HipKernelFixed<256, + RAJA::statement::For<1, RAJA::hip_global_size_y_direct<16>, + RAJA::statement::For<0, RAJA::hip_global_size_x_direct<16>, + RAJA::statement::Lambda<0> + > + > + > + >; +#elif defined(RAJA_ENABLE_CUDA) +using device_pol = RAJA::cuda_exec<256>; +using device_resources = RAJA::resources::Cuda; + +using kernel_pol = RAJA::KernelPolicy< + RAJA::statement::CudaKernelFixed<256, + RAJA::statement::For<1, RAJA::cuda_global_size_y_direct<16>, + RAJA::statement::For<0, RAJA::cuda_global_size_x_direct<16>, + RAJA::statement::Lambda<0> + > + > + > + >; +#else +using host_pol = RAJA::seq_exec; +using device_resources = RAJA::resources::Host; +#endif + +using host_resources = RAJA::resources::Host; + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + const int N = 
10000; + const int K = 17; + + device_resources def_device_res{device_resources::get_default()}; + host_resources def_host_res{host_resources::get_default()}; + + auto timer = RAJA::Timer(); + + //launch a trivial kernel to initialize the stream before the timed region + RAJA::forall + (RAJA::RangeSegment(0,1), [=] RAJA_DEVICE (int i) { + printf(" \n"); + }); + + int * array = def_host_res.allocate(N * N); + int * array_copy = def_host_res.allocate(N * N); + + //big array, or image + for (int i = 0; i < N * N; ++i) { + array[i] = 1; + array_copy[i] = 1; + } + + //small array that acts as the blur + //int* kernel = new int[K * K]; + int * kernel = def_host_res.allocate(K * K); + for (int i = 0; i < K * K; ++i) { + kernel[i] = 2; + } + + // copying host data to the device resource + int* d_array = def_device_res.allocate(N * N); + int* d_array_copy = def_device_res.allocate(N * N); + int* d_kernel = def_device_res.allocate(K * K); + + def_device_res.memcpy(d_array, array, N * N * sizeof(int)); + def_device_res.memcpy(d_array_copy, array_copy, N * N * sizeof(int)); + def_device_res.memcpy(d_kernel, kernel, K * K * sizeof(int)); + + constexpr int DIM = 2; + RAJA::View> array_view(d_array, N, N); + RAJA::View> array_view_copy(d_array_copy, N, N); + RAJA::View> kernel_view(d_kernel, K, K); + + RAJA::RangeSegment range_i(0, N); + RAJA::RangeSegment range_j(0, N); + + timer.start(); + + RAJA::kernel + (RAJA::make_tuple(range_i, range_j), + [=] RAJA_DEVICE (int i, int j) { + int sum = 0; + + //looping through the "blur" + for (int m = 0; m < K; ++m) { + for (int n = 0; n < K; ++n) { + int x = i + m; + int y = j + n; + + // adding the "blur" to the "image" wherever the blur is located on the image + if (x < N && y < N) { + sum += kernel_view(m, n) * array_view(x, y); + } + } + } + + array_view(i, j) += sum; + } + ); + + timer.stop(); + + std::cout<<"Elapsed time with RAJA view : "< + (RAJA::make_tuple(range_i, range_j), + [=] RAJA_DEVICE (int i, int j) { + int sum = 0; + + // looping through the "blur" + for (int m = 0; m < K; ++m) { + for 
(int n = 0; n < K; ++n) { + int x = i + m; + int y = j + n; + + // adding the "blur" to the "image" wherever the blur is located on the image + if (x < N && y < N) { + sum += d_kernel[m * K + n] * d_array_copy[x * N + y]; + } + } + } + + d_array_copy[i * N + j] += sum; + } + ); + timer.stop(); + + std::cout<<"Elapsed time with NO RAJA view : "< -#include "RAJA/util/Timer.hpp" -#include - -int main() { - - const int N = 10000; - const int K = 17; - - auto timer = RAJA::Timer(); - - //launch to intialize the stream - RAJA::forall> - (RAJA::RangeSegment(0,1), [=] __device__ (int i) { - printf("launch kernel\n"); - }); - - - int* array = new int[N * N]; - int* array_copy = new int[N * N]; - - //big array, or image - for (int i = 0; i < N * N; ++i) { - array[i] = 1; - array_copy[i] = 1; - } - - //small array that acts as the blur - int* kernel = new int[K * K]; - for (int i = 0; i < K * K; ++i) { - kernel[i] = 2; - } - - // copying to gpu - int* d_array; - int* d_array_copy; - int* d_kernel; - cudaMalloc((void**)&d_array, N * N * sizeof(int)); - cudaMalloc((void**)&d_array_copy, N * N * sizeof(int)); - cudaMalloc((void**)&d_kernel, K * K * sizeof(int)); - cudaMemcpy(d_array, array, N * N * sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(d_array_copy, array_copy, N * N * sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(d_kernel, kernel, K * K * sizeof(int), cudaMemcpyHostToDevice); - - - constexpr int DIM = 2; - RAJA::View> array_view(d_array, N, N); - RAJA::View> array_view_copy(d_array_copy, N, N); - RAJA::View> kernel_view(d_kernel, K, K); - - - using EXEC_POL5 = RAJA::KernelPolicy< - RAJA::statement::CudaKernelFixed<256, - RAJA::statement::For<1, RAJA::cuda_global_size_y_direct<16>, - RAJA::statement::For<0, RAJA::cuda_global_size_x_direct<16>, - RAJA::statement::Lambda<0> - > - > - > - >; - - RAJA::RangeSegment range_i(0, N); - RAJA::RangeSegment range_j(0, N); - - -timer.start(); - - RAJA::kernel - (RAJA::make_tuple(range_i, range_j), - [=] RAJA_DEVICE (int i, 
int j) { - int sum = 0; - - //looping through the "blur" - for (int m = 0; m < K; ++m) { - for (int n = 0; n < K; ++n) { - int x = i + m; - int y = j + n; - - // adding the "blur" to the "image" wherever the blur is located on the image - if (x < N && y < N) { - sum += kernel_view(m, n) * array_view(x, y); - } - } - } - - array_view(i, j) += sum; - } - ); - -timer.stop(); - -std::cout<<"Elapsed time with RAJA view : "< - (RAJA::make_tuple(range_i, range_j), - [=] RAJA_DEVICE (int i, int j) { - int sum = 0; - - // looping through the "blur" - for (int m = 0; m < K; ++m) { - for (int n = 0; n < K; ++n) { - int x = i + m; - int y = j + n; - - // adding the "blur" to the "image" wherever the blur is located on the image - if (x < N && y < N) { - sum += d_kernel[m * K + n] * d_array_copy[x * N + y]; - } - } - } - - d_array_copy[i * N + j] += sum; - } - ); - -timer.stop(); -std::cout<<"Elapsed time with NO RAJA view : "<