diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 143d6b5b98..c52e040a76 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -17,10 +17,9 @@ raja_add_benchmark( SOURCES benchmark-atomic.cpp) endif() -#TODO: Fix build issues -#raja_add_benchmark( -# NAME ltimes -# SOURCES ltimes.cpp) +raja_add_benchmark( + NAME ltimes + SOURCES ltimes.cpp) raja_add_benchmark( NAME raja_view_blur diff --git a/benchmark/ltimes.cpp b/benchmark/ltimes.cpp index b2fa413b8e..15a3e54eb8 100644 --- a/benchmark/ltimes.cpp +++ b/benchmark/ltimes.cpp @@ -88,18 +88,18 @@ extern "C" { * RAJA 'statement' concepts * * Note that calls to the checkResult() method after each variant is run - * are turned off so the example code runs much faster. If you want + * are turned off so the example code runs much faster. If you want * to verify the results are correct, define the 'DEBUG_LTIMES' macro * below or turn on checking for individual variants. */ - using namespace RAJA; +using namespace RAJA::expt; // -// Index value types for strongly-typed indices must be defined outside +// Index value types for strongly-typed indices must be defined outside // function scope for RAJA CUDA variants to work. // // These types provide strongly-typed index values so if something is wrong @@ -116,7 +116,7 @@ RAJA_INDEX_VALUE_T(IZ, int, "IZ"); // template void checkResult(PHIVIEW_T& phi, LVIEW_T& L, PSIVIEW_T& psi, - const int num_m, + const int num_m, const int num_d, const int num_g, const int num_z); @@ -142,7 +142,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const long num_z = 32 + (rand()/RAND_MAX); #else const int num_iter = 10 + (rand()/RAND_MAX); - const int num_z = 32*65536 + (rand()/RAND_MAX); + const int num_z = 32*657 + (rand()/RAND_MAX); #endif @@ -231,14 +231,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension + // + // L(m, d) : 1 -> d is stride-1 dimension using LView = TypedView, IM, ID>; - // psi(d, g, z) : 2 -> z is stride-1 dimension + // psi(d, g, z) : 2 -> z is stride-1 dimension using PsiView = TypedView, ID, IG, IZ>; - // phi(m, g, z) : 2 -> z is stride-1 dimension + // phi(m, g, z) : 2 -> z is stride-1 dimension using PhiView = TypedView, IM, IG, IZ>; std::array L_perm {{1, 0}}; @@ -255,7 +255,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::Timer timer; - timer.start(); + timer.start(); for (int iter = 0;iter < num_iter;++ iter) for (IG g(0); g < num_g; ++g) { @@ -268,7 +268,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } } - timer.stop(); + timer.stop(); double t = timer.elapsed(); double gflop_rate = total_flops / t / 1.0e9; std::cout << " C-version of LTimes run time (with Views) (sec.): " @@ -291,14 +291,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension + // + // L(m, d) : 1 -> d is stride-1 dimension using LView = TypedView, IM, ID>; - // psi(d, g, z) : 2 -> z is stride-1 dimension + // psi(d, g, z) : 2 -> z is stride-1 dimension using PsiView = TypedView, ID, IG, IZ>; - // phi(m, g, z) : 2 -> z is stride-1 dimension + // phi(m, g, z) : 2 -> z is stride-1 dimension using PhiView = TypedView, IM, IG, IZ>; std::array L_perm {{1, 0}}; @@ -313,11 +313,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) PhiView phi(phi_data, RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - using EXECPOL = + using EXECPOL = RAJA::KernelPolicy< - statement::For<2, loop_exec, // g - statement::For<3, loop_exec, // z - statement::For<0, loop_exec, // m + statement::For<2, seq_exec, // g + statement::For<3, seq_exec, // z + statement::For<0, seq_exec, // m statement::For<1, simd_exec, // d statement::Lambda<0> > @@ -388,9 +388,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using EXECPOL = RAJA::KernelPolicy< - statement::For<2, loop_exec, // g - statement::For<3, loop_exec, // z - statement::For<0, loop_exec, // m + statement::For<2, seq_exec, // g + statement::For<3, seq_exec, // z + statement::For<0, seq_exec, // m statement::For<1, simd_exec, // d statement::Lambda<0, Segs<0, 1, 2, 3>> > @@ -462,10 +462,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using pol_launch = RAJA::LaunchPolicy; - using pol_g = RAJA::LoopPolicy; - using pol_z = RAJA::LoopPolicy; - using pol_m = RAJA::LoopPolicy; - using pol_d = RAJA::LoopPolicy; + using pol_g = RAJA::LoopPolicy; + using pol_z = RAJA::LoopPolicy; + using pol_m = RAJA::LoopPolicy; + using pol_d = RAJA::LoopPolicy; @@ -535,14 +535,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) PhiView phi(phi_data, RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - using vector_t = RAJA::VectorRegister; - using VecIZ = RAJA::VectorIndex; + using vector_t = RAJA::expt::VectorRegister; + using VecIZ = RAJA::expt::VectorIndex; using EXECPOL = RAJA::KernelPolicy< - statement::For<2, loop_exec, // g - statement::For<0, loop_exec, // m - statement::For<1, loop_exec, // d + statement::For<2, seq_exec, // g + statement::For<0, seq_exec, // m + statement::For<1, seq_exec, // d statement::Lambda<0> > @@ -552,7 +552,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #ifdef RAJA_ENABLE_VECTOR_STATS - RAJA::tensor_stats::resetVectorStats(); + RAJA::expt::tensor_stats::resetVectorStats(); #endif @@ -622,9 +622,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) PhiView phi(phi_data, RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - using matrix_t = RAJA::SquareMatrixRegister; - //using matrix_t = RAJA::SquareMatrixRegister; -// using matrix_t = RAJA::RectMatrixRegister; + using matrix_t = RAJA::expt::SquareMatrixRegister; + //using matrix_t = RAJA::expt::SquareMatrixRegister; +// using matrix_t = RAJA::expt::RectMatrixRegister; @@ -633,9 +633,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) printf("Num registers/matrix = %d\n", (int)matrix_t::s_num_registers); - using RowM = RAJA::RowIndex; - using ColD = RAJA::ColIndex; - using ColZ = RAJA::ColIndex; + using RowM = RAJA::expt::RowIndex; + using ColD = RAJA::expt::ColIndex; + using ColZ = RAJA::expt::ColIndex; #ifdef RAJA_ENABLE_VECTOR_STATS @@ -648,7 +648,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) for (int iter = 0;iter < num_iter;++ iter){ - RAJA::forall(RAJA::TypedRangeSegment(0, num_g), + RAJA::forall(RAJA::TypedRangeSegment(0, num_g), [=](IG g) { @@ -718,19 +718,19 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) PhiView phi(phi_data, RAJA::make_permuted_layout({{num_m, num_g, num_z}}, phi_perm)); - using matrix_t = RAJA::SquareMatrixRegister; + using matrix_t = RAJA::expt::SquareMatrixRegister; std::cout << "matrix size: " << matrix_t::s_dim_elem(0) << "x" << matrix_t::s_dim_elem(1) << std::endl; - using RowM = RAJA::RowIndex; - using ColD = RAJA::ColIndex; - using ColZ = RAJA::ColIndex; + using RowM = RAJA::expt::RowIndex; + using ColD = RAJA::expt::ColIndex; + using ColZ = RAJA::expt::ColIndex; #ifdef RAJA_ENABLE_VECTOR_STATS - RAJA::tensor_stats::resetVectorStats(); + RAJA::expt::tensor_stats::resetVectorStats(); #endif RAJA::Timer timer; @@ -738,7 +738,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) for (int iter = 0;iter < num_iter;++ iter){ - RAJA::forall(RAJA::TypedRangeSegment(0, num_g), + RAJA::forall(RAJA::TypedRangeSegment(0, num_g), [=](IG g) { @@ -820,13 +820,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) statement::InitLocalMem, // Tile outer m,d loops - statement::Tile<0, tile_fixed, loop_exec, // m - statement::Tile<1, tile_fixed, loop_exec, // d + statement::Tile<0, tile_fixed, seq_exec, // m + statement::Tile<1, tile_fixed, seq_exec, // d // Load L(m,d) for m,d tile into shmem - statement::For<0, loop_exec, // m - statement::For<1, loop_exec, // d + statement::For<0, seq_exec, // m + statement::For<1, seq_exec, // d statement::Lambda<0, Segs<0, 1>, Params<0>, Offsets<0, 1>> @@ -834,13 +834,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) >, // Run inner g, z loops with z loop tiled - statement::For<2, loop_exec, // g - statement::Tile<3, tile_fixed, loop_exec, // z + statement::For<2, seq_exec, // g + statement::Tile<3, tile_fixed, seq_exec, // z // Load psi into shmem - statement::For<1, loop_exec, // d - statement::For<3, loop_exec, // z + statement::For<1, seq_exec, // d + statement::For<3, seq_exec, // z statement::Lambda<1, Segs<1, 2, 3>, Params<1>, Offsets<1, 2, 3>> @@ -848,25 +848,25 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) >, // Compute phi - statement::For<0, loop_exec, // m + statement::For<0, seq_exec, // m // Load phi into shmem - statement::For<3, loop_exec, // z + statement::For<3, seq_exec, // z statement::Lambda<2, Segs<0, 2, 3>, Params<2>, Offsets<0, 2, 3>> >, // Compute phi in shmem - statement::For<1, loop_exec, // d - statement::For<3, loop_exec, // z + statement::For<1, seq_exec, // d + statement::For<3, seq_exec, // z statement::Lambda<3, Params<0, 1, 2>, Offsets<0, 1, 2, 3>> > >, // Store phi - statement:: For<3, loop_exec, // z + statement:: For<3, seq_exec, // z statement::Lambda<4, Segs<0, 2, 3>, Params<2>, Offsets<0, 2, 3>> @@ -992,14 +992,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension + // + // L(m, d) : 1 -> d is stride-1 dimension using LView = TypedView, IM, ID>; - // psi(d, g, z) : 2 -> z is stride-1 dimension + // psi(d, g, z) : 2 -> z is stride-1 dimension using PsiView = TypedView, ID, IG, IZ>; - // phi(m, g, z) : 2 -> z is stride-1 dimension + // phi(m, g, z) : 2 -> z is stride-1 dimension using PhiView = TypedView, IM, IG, IZ>; std::array L_perm {{0, 1}}; @@ -1019,9 +1019,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using EXECPOL = RAJA::KernelPolicy< statement::For<0, omp_parallel_for_exec, // m - statement::For<1, loop_exec, // d - statement::For<2, loop_exec, // g - statement::For<3, simd_exec, // z + statement::For<1, seq_exec, // d + statement::For<2, seq_exec, // g + statement::For<3, simd_exec, // z statement::Lambda<0> > > @@ -1037,7 +1037,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::KernelPolicy< statement::Collapse, // m, g, z - statement::For<1, loop_exec, // d + statement::For<1, seq_exec, // d statement::Lambda<0> > > @@ -1096,14 +1096,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension + // + // L(m, d) : 1 -> d is stride-1 dimension using LView = TypedView, IM, ID>; - // psi(d, g, z) : 2 -> z is stride-1 dimension + // psi(d, g, z) : 2 -> z is stride-1 dimension using PsiView = TypedView, ID, IG, IZ>; - // phi(m, g, z) : 2 -> z is stride-1 dimension + // phi(m, g, z) : 2 -> z is stride-1 dimension using PhiView = TypedView, IM, IG, IZ>; std::array L_perm {{0, 1}}; @@ -1120,7 +1120,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using EXECPOL = RAJA::KernelPolicy< - statement::CudaKernelAsync< + statement::CudaKernelAsync< statement::For<0, cuda_block_x_loop, // m statement::For<2, cuda_block_y_loop, // g statement::For<3, cuda_thread_x_loop, // z @@ -1130,9 +1130,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > > > - > - >; - + > + >; + auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_m), RAJA::TypedRangeSegment(0, num_d), RAJA::TypedRangeSegment(0, num_g), @@ -1165,9 +1165,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) cudaErrchk( cudaFree( dphi_data ) ); // Reset data in Views to CPU data - L.set_data(L_data); - psi.set_data(psi_data); - phi.set_data(phi_data); + L.set_data(L_data); + psi.set_data(psi_data); + phi.set_data(phi_data); #if defined(DEBUG_LTIMES) checkResult(phi, L, psi, num_m, num_d, num_g, num_z); @@ -1201,10 +1201,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using pol_launch = RAJA::LaunchPolicy >; - using pol_g = RAJA::LoopPolicy; - using pol_z = RAJA::LoopPolicy; - using pol_m = RAJA::LoopPolicy; - using pol_d = RAJA::LoopPolicy; + using pol_g = RAJA::LoopPolicy; + using pol_z = RAJA::LoopPolicy; + using pol_m = RAJA::LoopPolicy; + using pol_d = RAJA::LoopPolicy; // @@ -1325,22 +1325,22 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using matrix_layout = RowMajorLayout; - using L_matrix_host_t = RAJA::SquareMatrixRegister; - using L_matrix_device_t = RAJA::RectMatrixRegister; + using L_matrix_host_t = RAJA::expt::SquareMatrixRegister; + using L_matrix_device_t = RAJA::expt::RectMatrixRegister; using L_matrix_hd_t = RAJA::LaunchPolicy; - using phi_matrix_host_t = RAJA::SquareMatrixRegister; - using phi_matrix_device_t = RAJA::RectMatrixRegister; + using phi_matrix_host_t = RAJA::expt::SquareMatrixRegister; + using phi_matrix_device_t = RAJA::expt::RectMatrixRegister; using phi_matrix_hd_t = RAJA::LaunchPolicy; - using psi_matrix_host_t = RAJA::SquareMatrixRegister; - using psi_matrix_device_t = RAJA::RectMatrixRegister; + using psi_matrix_host_t = RAJA::expt::SquareMatrixRegister; + using psi_matrix_device_t = RAJA::expt::RectMatrixRegister; using psi_matrix_hd_t = RAJA::LaunchPolicy; using pol_launch = RAJA::LaunchPolicy >; - using pol_g = RAJA::LoopPolicy; - using pol_z = RAJA::LoopPolicy; + using pol_g = RAJA::LoopPolicy; + using pol_z = RAJA::LoopPolicy; // @@ -1388,16 +1388,16 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) using L_matrix_t = RAJA_GET_POLICY(L_matrix_hd_t); - using L_RowM = RAJA::RowIndex; - using L_ColD = RAJA::ColIndex; + using L_RowM = RAJA::expt::RowIndex; + using L_ColD = RAJA::expt::ColIndex; using psi_matrix_t = RAJA_GET_POLICY(psi_matrix_hd_t); - using psi_RowD = RAJA::RowIndex; - using psi_ColZ = RAJA::ColIndex; + using psi_RowD = RAJA::expt::RowIndex; + using psi_ColZ = RAJA::expt::ColIndex; using phi_matrix_t = RAJA_GET_POLICY(phi_matrix_hd_t); - using phi_RowM = RAJA::RowIndex; - using phi_ColZ = RAJA::ColIndex; + using phi_RowM = RAJA::expt::RowIndex; + using phi_ColZ = RAJA::expt::ColIndex; RAJA::loop(ctx, RAJA::TypedRangeSegment(0, num_g), [&](IG g){ @@ -1468,8 +1468,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // View types and Views/Layouts for indexing into arrays - // - // L(m, d) : 1 -> d is stride-1 dimension + // + // L(m, d) : 1 -> d is stride-1 dimension using LView = TypedView, IM, ID>; // psi(d, g, z) : 2 -> z is stride-1 dimension @@ -1530,11 +1530,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::KernelPolicy< statement::CudaKernelAsync< statement::InitLocalMem, - // Tile outer m,d loops + // Tile outer m,d loops statement::Tile<0, tile_fixed, seq_exec, // m statement::Tile<1, tile_fixed, seq_exec, // d - // Load L for m,d tile into shmem + // Load L for m,d tile into shmem statement::For<1, cuda_thread_x_loop, // d statement::For<0, cuda_thread_y_direct, // m statement::Lambda<0, Segs<0,1>, Params<0>, Offsets<0,1>> @@ -1571,7 +1571,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > // d > // m >, // z - + // finish tile over directions statement::CudaSyncThreads, @@ -1582,7 +1582,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >, statement::CudaSyncThreads - + > // Tile z > // g @@ -1594,7 +1594,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) >; // KernelPolicy - + RAJA::Timer timer; @@ -1671,9 +1671,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) cudaMemcpyDeviceToHost ) ); // Reset data in Views to CPU data - L.set_data(L_data); - psi.set_data(psi_data); - phi.set_data(phi_data); + L.set_data(L_data); + psi.set_data(psi_data); + phi.set_data(phi_data); checkResult(phi, L, psi, num_m, num_d, num_g, num_z); #endif @@ -1874,11 +1874,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::KernelPolicy< statement::HipKernelAsync< statement::InitLocalMem, - // Tile outer m,d loops + // Tile outer m,d loops statement::Tile<0, tile_fixed, seq_exec, // m statement::Tile<1, tile_fixed, seq_exec, // d - // Load L for m,d tile into shmem + // Load L for m,d tile into shmem statement::For<1, hip_thread_x_loop, // d statement::For<0, hip_thread_y_direct, // m statement::Lambda<0, Segs<0,1>, Params<0>, Offsets<0,1>> @@ -1915,7 +1915,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > // d > // m >, // z - + // finish tile over directions statement::HipSyncThreads, @@ -1926,7 +1926,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >, statement::HipSyncThreads - + > // Tile z > // g @@ -1983,7 +1983,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }, // Lambda<3> Compute thread-local phi value - [=] RAJA_DEVICE (IM RAJA_UNUSED_ARG(m), ID RAJA_UNUSED_ARG(d), + [=] RAJA_DEVICE (IM RAJA_UNUSED_ARG(m), ID RAJA_UNUSED_ARG(d), IG RAJA_UNUSED_ARG(g), IZ RAJA_UNUSED_ARG(z), shmem_L_t& sh_L, shmem_psi_t& sh_psi, double& phi_local, IM tm, ID td, IG tg, IZ tz) { @@ -2040,7 +2040,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // template void checkResult(PHIVIEW_T& phi, LVIEW_T& L, PSIVIEW_T& psi, - const int num_m, + const int num_m, const int num_d, const int num_g, const int num_z) diff --git a/include/RAJA/pattern/tensor/TensorIndex.hpp b/include/RAJA/pattern/tensor/TensorIndex.hpp index f992649876..8f152d92ce 100644 --- a/include/RAJA/pattern/tensor/TensorIndex.hpp +++ b/include/RAJA/pattern/tensor/TensorIndex.hpp @@ -29,7 +29,7 @@ namespace expt { - template LENGTH_VALUE> + template INDEX_VALUE, strip_index_type_t LENGTH_VALUE> struct StaticTensorIndexInner; template @@ -56,8 +56,8 @@ namespace expt RAJA_HOST_DEVICE static constexpr - StaticTensorIndex> static_all(){ - return StaticTensorIndex>(); + StaticTensorIndex> static_all(){ + return StaticTensorIndex>(); } RAJA_INLINE @@ -103,7 +103,7 @@ namespace expt TensorIndex(TensorIndex const &c) : m_index(*c), m_length(c.size()) {} - template LEN_VAL> + template RAJA_INLINE RAJA_HOST_DEVICE constexpr @@ -156,7 +156,7 @@ namespace expt }; - template LENGTH_VALUE> + template INDEX_VALUE, strip_index_type_t LENGTH_VALUE> struct StaticTensorIndex> { using base_type = TensorIndex; diff --git a/scripts/lc-builds/blueos_clang.sh b/scripts/lc-builds/blueos_clang.sh index 7e0b6600d8..99954b3eb8 100755 --- a/scripts/lc-builds/blueos_clang.sh +++ b/scripts/lc-builds/blueos_clang.sh @@ -38,6 +38,7 @@ cmake \ -DBLT_CXX_STD=c++14 \ -C ../host-configs/lc-builds/blueos/clang_X.cmake \ -DENABLE_OPENMP=On \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/blueos_clang_omptarget.sh b/scripts/lc-builds/blueos_clang_omptarget.sh index 154a1f764a..977e0ec4a3 100755 --- a/scripts/lc-builds/blueos_clang_omptarget.sh +++ b/scripts/lc-builds/blueos_clang_omptarget.sh @@ -42,6 +42,7 @@ cmake \ -DRAJA_ENABLE_TARGET_OPENMP=On \ -DBLT_OPENMP_COMPILE_FLAGS="-fopenmp;-fopenmp-targets=nvptx64-nvidia-cuda" \ -DBLT_OPENMP_LINK_FLAGS="-fopenmp;-fopenmp-targets=nvptx64-nvidia-cuda" \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/blueos_clangcuda.sh b/scripts/lc-builds/blueos_clangcuda.sh index 16ac65b825..98d2619546 100755 --- a/scripts/lc-builds/blueos_clangcuda.sh +++ b/scripts/lc-builds/blueos_clangcuda.sh @@ -50,6 +50,7 @@ cmake \ -DBLT_CLANG_CUDA_ARCH=${CUDA_ARCH} \ -DENABLE_CUDA=On \ -DCUDA_ARCH=${CUDA_ARCH} \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/blueos_gcc.sh b/scripts/lc-builds/blueos_gcc.sh index 65bfb2c307..f89ad3e6bf 100755 --- a/scripts/lc-builds/blueos_gcc.sh +++ b/scripts/lc-builds/blueos_gcc.sh @@ -36,6 +36,7 @@ cmake \ -DBLT_CXX_STD=c++14 \ -C ../host-configs/lc-builds/blueos/gcc_X.cmake \ -DENABLE_OPENMP=On \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/blueos_nvcc_clang.sh b/scripts/lc-builds/blueos_nvcc_clang.sh index faf0c8938a..ad846fa101 100755 --- a/scripts/lc-builds/blueos_nvcc_clang.sh +++ b/scripts/lc-builds/blueos_nvcc_clang.sh @@ -48,6 +48,7 @@ cmake \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/blueos_nvcc_gcc.sh b/scripts/lc-builds/blueos_nvcc_gcc.sh index 7619c360d4..141936aad8 100755 --- a/scripts/lc-builds/blueos_nvcc_gcc.sh +++ b/scripts/lc-builds/blueos_nvcc_gcc.sh @@ -47,6 +47,7 @@ cmake \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/blueos_nvcc_xl.sh b/scripts/lc-builds/blueos_nvcc_xl.sh index d0507d0eb1..ffeea779ca 100755 --- a/scripts/lc-builds/blueos_nvcc_xl.sh +++ b/scripts/lc-builds/blueos_nvcc_xl.sh @@ -47,6 +47,7 @@ cmake \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/blueos_pgi.sh b/scripts/lc-builds/blueos_pgi.sh index 16c9816d19..d8a6b904ec 100755 --- a/scripts/lc-builds/blueos_pgi.sh +++ b/scripts/lc-builds/blueos_pgi.sh @@ -37,6 +37,7 @@ cmake \ -DBLT_CXX_STD=c++14 \ -C ../host-configs/lc-builds/blueos/pgi_X.cmake \ -DENABLE_OPENMP=On \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/blueos_xl.sh b/scripts/lc-builds/blueos_xl.sh index 58a13fc065..a39a0f4c93 100755 --- a/scripts/lc-builds/blueos_xl.sh +++ b/scripts/lc-builds/blueos_xl.sh @@ -36,6 +36,7 @@ cmake \ -DBLT_CXX_STD=c++14 \ -C ../host-configs/lc-builds/blueos/xl_X.cmake \ -DENABLE_OPENMP=On \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/blueos_xl_omptarget.sh b/scripts/lc-builds/blueos_xl_omptarget.sh index 3fd7e22ce8..d331044a99 100755 --- a/scripts/lc-builds/blueos_xl_omptarget.sh +++ b/scripts/lc-builds/blueos_xl_omptarget.sh @@ -39,6 +39,7 @@ cmake \ -DRAJA_ENABLE_TARGET_OPENMP=On \ -DBLT_OPENMP_COMPILE_FLAGS="-qoffload;-qsmp=omp;-qalias=noansi" \ -DBLT_OPENMP_LINK_FLAGS="-qoffload;-qsmp=omp;-qalias=noansi" \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/corona_sycl.sh b/scripts/lc-builds/corona_sycl.sh index 815928e434..4b636e4da0 100755 --- a/scripts/lc-builds/corona_sycl.sh +++ b/scripts/lc-builds/corona_sycl.sh @@ -55,6 +55,7 @@ cmake \ -DBLT_CXX_STD=c++17 \ -DENABLE_TESTS=On \ -DENABLE_EXAMPLES=On \ + -DENABLE_BENCHMARKS=On \ "$@" \ .. diff --git a/scripts/lc-builds/toss4_amdclang.sh b/scripts/lc-builds/toss4_amdclang.sh index 9e738fc781..ec7910e148 100755 --- a/scripts/lc-builds/toss4_amdclang.sh +++ b/scripts/lc-builds/toss4_amdclang.sh @@ -73,6 +73,7 @@ cmake \ -DENABLE_HIP=ON \ -DENABLE_OPENMP=ON \ -DENABLE_CUDA=OFF \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/toss4_amdclang_asan.sh b/scripts/lc-builds/toss4_amdclang_asan.sh index b22cdce3bd..dd9526a1d2 100755 --- a/scripts/lc-builds/toss4_amdclang_asan.sh +++ b/scripts/lc-builds/toss4_amdclang_asan.sh @@ -83,6 +83,7 @@ cmake \ -DENABLE_HIP=ON \ -DENABLE_OPENMP=ON \ -DENABLE_CUDA=OFF \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/toss4_cce_hip.sh b/scripts/lc-builds/toss4_cce_hip.sh index 80db5f37be..9d05fddca8 100755 --- a/scripts/lc-builds/toss4_cce_hip.sh +++ b/scripts/lc-builds/toss4_cce_hip.sh @@ -57,6 +57,7 @@ cmake \ -DENABLE_HIP=ON \ -DENABLE_OPENMP=ON \ -DENABLE_CUDA=OFF \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/toss4_clang.sh b/scripts/lc-builds/toss4_clang.sh index ab1cb69e81..a1bf63ff89 100755 --- a/scripts/lc-builds/toss4_clang.sh +++ b/scripts/lc-builds/toss4_clang.sh @@ -36,6 +36,7 @@ cmake \ -DBLT_CXX_STD=c++14 \ -C ../host-configs/lc-builds/toss4/clang_X.cmake \ -DENABLE_OPENMP=On \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/toss4_clang_san.sh b/scripts/lc-builds/toss4_clang_san.sh index e7501c09cd..7d917f4c96 100755 --- a/scripts/lc-builds/toss4_clang_san.sh +++ b/scripts/lc-builds/toss4_clang_san.sh @@ -45,6 +45,7 @@ cmake \ -DBLT_CXX_STD=c++14 \ -C ../host-configs/lc-builds/toss4/clang_X_${SAN_VER}.cmake \ -DENABLE_OPENMP=On \ + -DENABLE_BENCHMARKS=ON \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/toss4_gcc.sh b/scripts/lc-builds/toss4_gcc.sh index 07fa26cc3f..c73c5ba6e9 100755 --- a/scripts/lc-builds/toss4_gcc.sh +++ b/scripts/lc-builds/toss4_gcc.sh @@ -36,6 +36,7 @@ cmake \ -DBLT_CXX_STD=c++14 \ -C ../host-configs/lc-builds/toss4/gcc_X.cmake \ -DENABLE_OPENMP=On \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/toss4_hipcc.sh b/scripts/lc-builds/toss4_hipcc.sh index f7342e474c..5743692ba6 100755 --- a/scripts/lc-builds/toss4_hipcc.sh +++ b/scripts/lc-builds/toss4_hipcc.sh @@ -74,6 +74,7 @@ cmake \ -DENABLE_HIP=ON \ -DENABLE_OPENMP=ON \ -DENABLE_CUDA=OFF \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/toss4_icpc-classic.sh b/scripts/lc-builds/toss4_icpc-classic.sh index 447cf35ac8..a892d08ecf 100755 --- a/scripts/lc-builds/toss4_icpc-classic.sh +++ b/scripts/lc-builds/toss4_icpc-classic.sh @@ -43,6 +43,7 @@ cmake \ -C ../host-configs/lc-builds/toss4/icpc-classic_X.cmake \ -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off \ -DENABLE_OPENMP=On \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/toss4_icpc.sh b/scripts/lc-builds/toss4_icpc.sh index 93d4d62cad..82dab8e8df 100755 --- a/scripts/lc-builds/toss4_icpc.sh +++ b/scripts/lc-builds/toss4_icpc.sh @@ -43,6 +43,7 @@ cmake \ -C ../host-configs/lc-builds/toss4/icpc_X.cmake \ -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off \ -DENABLE_OPENMP=On \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. diff --git a/scripts/lc-builds/toss4_icpx.sh b/scripts/lc-builds/toss4_icpx.sh index d6fe0d867b..88cc43d824 100755 --- a/scripts/lc-builds/toss4_icpx.sh +++ b/scripts/lc-builds/toss4_icpx.sh @@ -45,6 +45,7 @@ cmake \ -C ../host-configs/lc-builds/toss4/icpx_X.cmake \ -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off \ -DENABLE_OPENMP=On \ + -DENABLE_BENCHMARKS=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ ..