diff --git a/dpbench/benchmarks/CMakeLists.txt b/dpbench/benchmarks/CMakeLists.txt index 6869220b..c1693bc1 100644 --- a/dpbench/benchmarks/CMakeLists.txt +++ b/dpbench/benchmarks/CMakeLists.txt @@ -10,6 +10,7 @@ add_subdirectory(kmeans) add_subdirectory(knn) add_subdirectory(gpairs) add_subdirectory(dbscan) +add_subdirectory(rodinia) # generate dpcpp version into config set(FILE ${CMAKE_SOURCE_DIR}/dpbench/configs/framework_info/dpcpp.toml) diff --git a/dpbench/benchmarks/rodinia/CMakeLists.txt b/dpbench/benchmarks/rodinia/CMakeLists.txt new file mode 100644 index 00000000..3c6af4a1 --- /dev/null +++ b/dpbench/benchmarks/rodinia/CMakeLists.txt @@ -0,0 +1,5 @@ +# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +add_subdirectory(pathfinder) diff --git a/dpbench/benchmarks/rodinia/pathfinder/CMakeLists.txt b/dpbench/benchmarks/rodinia/pathfinder/CMakeLists.txt new file mode 100644 index 00000000..5465348e --- /dev/null +++ b/dpbench/benchmarks/rodinia/pathfinder/CMakeLists.txt @@ -0,0 +1,5 @@ +# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +add_subdirectory(pathfinder_sycl_native_ext) diff --git a/dpbench/benchmarks/rodinia/pathfinder/__init__.py b/dpbench/benchmarks/rodinia/pathfinder/__init__.py new file mode 100644 index 00000000..8c072074 --- /dev/null +++ b/dpbench/benchmarks/rodinia/pathfinder/__init__.py @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +""" + +Pathfinder Computation + +This algorithm finds the shortest path from the first row to the last. + +Input +--------- +rows : Indicates the number of rows + +cols : Indicates the number of cols + +pyramid height : Indicates pyramid height + +Output + +-------- + +result> : Indicates the minimum distance from first row to last + +Method: + +The elements are fed to the kernel row-wise and the minimum distance is computed based +on the minimum weight of the neighbors above. +This is done for all rows until last and result is returned. + + +""" diff --git a/dpbench/benchmarks/rodinia/pathfinder/pathfinder_initialize.py b/dpbench/benchmarks/rodinia/pathfinder/pathfinder_initialize.py new file mode 100644 index 00000000..52023d28 --- /dev/null +++ b/dpbench/benchmarks/rodinia/pathfinder/pathfinder_initialize.py @@ -0,0 +1,20 @@ +# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +LOW = 0 +HIGH = 10.0 +SEED = 9 + + +def initialize(rows, cols, pyramid_height, types_dict=None): + import numpy as np + import numpy.random as rnd + + rnd.seed(SEED) + + data, result = rnd.randint( + LOW, HIGH, (rows * cols), dtype=np.int64 + ), np.empty(cols, dtype=np.int64) + + return (data, rows, cols, pyramid_height, result) diff --git a/dpbench/benchmarks/rodinia/pathfinder/pathfinder_numba_dpex_k.py b/dpbench/benchmarks/rodinia/pathfinder/pathfinder_numba_dpex_k.py new file mode 100644 index 00000000..10beb7cb --- /dev/null +++ b/dpbench/benchmarks/rodinia/pathfinder/pathfinder_numba_dpex_k.py @@ -0,0 +1,56 @@ +# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import dpnp +import numba_dpex + + +def MIN(a, b): + return (a) if (a) <= (b) else (b) + + +@numba_dpex.func +def min_dpex(a, b): + t = a if a <= b else b + return t + + +@numba_dpex.kernel(debug=True) +def _pathfinder_kernel(device_src, device_dest, cols): + current_element = numba_dpex.get_global_id(0) + + left_ind = current_element - 1 if current_element >= 1 else 0 + right_ind = current_element + 1 if current_element < cols - 1 else cols - 1 + up_ind = current_element + + left = device_src[left_ind] + up = device_src[up_ind] + right = device_src[right_ind] + shortest = min_dpex(left, up) + shortest = min_dpex(shortest, right) + + device_dest[current_element] += shortest + + +def pathfinder(data, rows, cols, pyramid_height, result): + # create a temp list that hold first row of data as first element and empty numpy array as second element + device_dest = dpnp.array(data[:cols], dtype=dpnp.int64) # first row + device_src = dpnp.array([0] * cols, dtype=dpnp.int64) + + t = 1 + + while True: + if t >= rows: + break + device_src = device_dest + device_dest = dpnp.array( + data[t * cols : (t + 1) * cols], dtype=dpnp.int64 + ) + _pathfinder_kernel[numba_dpex.Range(cols)]( + device_src, device_dest, cols + ) + t += 1 + + for i in range(cols): + result[i] = device_dest[i] diff --git a/dpbench/benchmarks/rodinia/pathfinder/pathfinder_python.py b/dpbench/benchmarks/rodinia/pathfinder/pathfinder_python.py new file mode 100644 index 00000000..d31f32ea --- /dev/null +++ b/dpbench/benchmarks/rodinia/pathfinder/pathfinder_python.py @@ -0,0 +1,45 @@ +# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import dpnp + + +def min_dpex(a, b): + return a if a <= b else b + + +def _pathfinder_kernel(device_src, device_dest, cols, current_element): + left_ind = current_element - 1 if current_element >= 1 else 0 + right_ind = current_element + 1 if current_element < cols - 1 else cols - 1 + up_ind = current_element + + left = device_src[left_ind] + up = device_src[up_ind] + right = device_src[right_ind] + shortest = min_dpex(left, up) + shortest = min_dpex(shortest, right) + + device_dest[current_element] += shortest + + +def pathfinder(data, rows, cols, pyramid_height, result): + # create a temp list that hold first row of data as first element and empty numpy array as second element + device_dest = dpnp.array(data[:cols], dtype=dpnp.int64) # first row + device_src = dpnp.array([0] * cols, dtype=dpnp.int64) + + t = 1 + while True: + if t >= rows: + break + device_src = device_dest + device_dest = dpnp.array( + data[t * cols : (t + 1) * cols], dtype=dpnp.int64 + ) + + for i in range(cols): + _pathfinder_kernel(device_src, device_dest, cols, i) + t += 1 + + for i in range(cols): + result[i] = device_dest[i] diff --git a/dpbench/benchmarks/rodinia/pathfinder/pathfinder_sycl_native_ext/CMakeLists.txt b/dpbench/benchmarks/rodinia/pathfinder/pathfinder_sycl_native_ext/CMakeLists.txt new file mode 100644 index 00000000..267cea30 --- /dev/null +++ b/dpbench/benchmarks/rodinia/pathfinder/pathfinder_sycl_native_ext/CMakeLists.txt @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +set(module_name pathfinder_sycl) +set(py_module_name _${module_name}) +python_add_library(${py_module_name} MODULE ${module_name}/${py_module_name}.cpp) +add_sycl_to_target(TARGET ${py_module_name} SOURCES ${module_name}/${py_module_name}.cpp) +target_include_directories(${py_module_name} PRIVATE ${Dpctl_INCLUDE_DIRS}) + +file(RELATIVE_PATH py_module_dest ${CMAKE_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}) +install(TARGETS ${py_module_name} + DESTINATION ${py_module_dest}/${module_name} +) diff --git a/dpbench/benchmarks/rodinia/pathfinder/pathfinder_sycl_native_ext/__init__.py b/dpbench/benchmarks/rodinia/pathfinder/pathfinder_sycl_native_ext/__init__.py new file mode 100644 index 00000000..85fa3a61 --- /dev/null +++ b/dpbench/benchmarks/rodinia/pathfinder/pathfinder_sycl_native_ext/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +from .pathfinder_sycl._pathfinder_sycl import pathfinder as pathfinder_sycl + +__all__ = ["pathfinder_sycl"] diff --git a/dpbench/benchmarks/rodinia/pathfinder/pathfinder_sycl_native_ext/pathfinder_sycl/_pathfinder_kernel.hpp b/dpbench/benchmarks/rodinia/pathfinder/pathfinder_sycl_native_ext/pathfinder_sycl/_pathfinder_kernel.hpp new file mode 100644 index 00000000..34bde14d --- /dev/null +++ b/dpbench/benchmarks/rodinia/pathfinder/pathfinder_sycl_native_ext/pathfinder_sycl/_pathfinder_kernel.hpp @@ -0,0 +1,97 @@ +// SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +using namespace sycl; + +#define BLOCK_SIZE 128 +#define STR_SIZE 128 +#define DEVICE 0 +#define HALO 1 + +#define IN_RANGE(x, min, max) ((x) >= (min) && (x) <= (max)) +#define CLAMP_RANGE(x, min, max) x = (x < (min)) ? min : ((x > (max)) ? max : x) +#define MIN(a, b) ((a) <= (b) ? (a) : (b)) + +void pathfinder_impl(int iteration, + int64_t *gpuWall, + int64_t *gpuSrc, + int64_t *gpuResults, + int cols, + int rows, + int startStep, + int border, + sycl::nd_item<3> item_ct1, + int64_t *prev, + int64_t *result) +{ + + int bx = item_ct1.get_group(2); + int tx = item_ct1.get_local_id(2); + + // each block finally computes result for a small block + // after N iterations. + // it is the non-overlapping small blocks that cover + // all the input data + + // calculate the small block size + int small_block_cols = BLOCK_SIZE - iteration * HALO * 2; + + // calculate the boundary for the block according to + // the boundary of its small block + int blkX = small_block_cols * bx - border; + int blkXmax = blkX + BLOCK_SIZE - 1; + + // calculate the global thread coordination + int xidx = blkX + tx; + + // effective range within this block that falls within + // the valid range of the input data + // used to rule out computation outside the boundary. + int validXmin = (blkX < 0) ? -blkX : 0; + int validXmax = (blkXmax > cols - 1) ? BLOCK_SIZE - 1 - (blkXmax - cols + 1) + : BLOCK_SIZE - 1; + + int W = tx - 1; + int E = tx + 1; + + W = (W < validXmin) ? validXmin : W; + E = (E > validXmax) ? validXmax : E; + + bool isValid = IN_RANGE(tx, validXmin, validXmax); + + if (IN_RANGE(xidx, 0, cols - 1)) { + prev[tx] = gpuSrc[xidx]; + } + item_ct1.barrier(); // [Ronny] Added sync to avoid race on prev Aug. 14 2012 + bool computed; + for (int i = 0; i < iteration; i++) { + computed = false; + if (IN_RANGE(tx, i + 1, BLOCK_SIZE - i - 2) && isValid) { + computed = true; + int64_t left = prev[W]; + int64_t up = prev[tx]; + int64_t right = prev[E]; + int64_t shortest = MIN(left, up); + shortest = MIN(shortest, right); + int index = cols * (startStep + i) + xidx; + result[tx] = shortest + gpuWall[index]; + } + item_ct1.barrier(); + if (i == iteration - 1) + break; + if (computed) // Assign the computation range + prev[tx] = result[tx]; + item_ct1 + .barrier(); // [Ronny] Added sync to avoid race on prev Aug. 14 2012 + } + + // update the global memory + // after the last iteration, only threads coordinated within the + // small block perform the calculation and switch on ``computed'' + if (computed) { + gpuResults[xidx] = result[tx]; + } +} diff --git a/dpbench/benchmarks/rodinia/pathfinder/pathfinder_sycl_native_ext/pathfinder_sycl/_pathfinder_sycl.cpp b/dpbench/benchmarks/rodinia/pathfinder/pathfinder_sycl_native_ext/pathfinder_sycl/_pathfinder_sycl.cpp new file mode 100644 index 00000000..dd513e28 --- /dev/null +++ b/dpbench/benchmarks/rodinia/pathfinder/pathfinder_sycl_native_ext/pathfinder_sycl/_pathfinder_sycl.cpp @@ -0,0 +1,133 @@ +// SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 + +#include "_pathfinder_kernel.hpp" +#include +#include + +#include +#include +#include +#include + +using namespace sycl; + +#define M_SEED 9 + +template bool ensure_compatibility(const Args &...args) +{ + std::vector arrays = {args...}; + + auto arr = arrays.at(0); + auto q = arr.get_queue(); + auto type_flag = arr.get_typenum(); + auto arr_size = arr.get_size(); + + for (auto &arr : arrays) { + if (!(arr.get_flags() & (USM_ARRAY_C_CONTIGUOUS))) { + std::cerr << "All arrays need to be C contiguous.\n"; + return false; + } + if (arr.get_typenum() != type_flag) { + std::cerr << "All arrays should be of same elemental type.\n"; + return false; + } + if (arr.get_ndim() > 1) { + std::cerr << "All arrays expected to be single-dimensional.\n"; + return false; + } + } + return true; +} + +void pathfinder_sync(dpctl::tensor::usm_ndarray data, + int rows, + int cols, + int pyramid_height, + dpctl::tensor::usm_ndarray result) +{ + + /* --------------- pyramid parameters --------------- */ + int borderCols = (pyramid_height)*HALO; + int smallBlockCol = BLOCK_SIZE - (pyramid_height)*HALO * 2; + int blockCols = + cols / smallBlockCol + ((cols % smallBlockCol == 0) ? 0 : 1); + + int64_t *gpuWall, *gpuResult[2]; + int size = rows * cols; + + auto defaultQueue = data.get_queue(); + + if (!ensure_compatibility(data, result)) + throw std::runtime_error("Input arrays are not acceptable."); + + gpuResult[0] = sycl::malloc_device(cols, defaultQueue); + gpuResult[1] = sycl::malloc_device(cols, defaultQueue); + gpuWall = sycl::malloc_device((size - cols), defaultQueue); + + // Extract value ptr from dpctl array + int64_t *data_value = data.get_data(); + + defaultQueue.memcpy(gpuResult[0], data_value, sizeof(int64_t) * cols) + .wait(); + + defaultQueue + .memcpy(gpuWall, data_value + cols, sizeof(int64_t) * (size - cols)) + .wait(); + + sycl::range<3> dimBlock(1, 1, BLOCK_SIZE); + sycl::range<3> dimGrid(1, 1, blockCols); + int src = 1, dst = 0; + for (int t = 0; t < rows - 1; t += pyramid_height) { + int temp = src; + src = dst; + dst = temp; + /* + DPCT1049:0: The workgroup size passed to the SYCL + * kernel may exceed the limit. To get the device limit, query + * info::device::max_work_group_size. Adjust the workgroup size if + * needed. + */ + defaultQueue + .submit([&](sycl::handler &cgh) { + sycl::local_accessor prev_acc_ct1( + sycl::range<1>(256 /*BLOCK_SIZE*/), cgh); + sycl::local_accessor result_acc_ct1( + sycl::range<1>(256 /*BLOCK_SIZE*/), cgh); + auto gpuResult_src_ct2 = gpuResult[src]; + auto gpuResult_dst_ct3 = gpuResult[dst]; + cgh.parallel_for( + sycl::nd_range<3>(dimGrid * dimBlock, dimBlock), + [=](sycl::nd_item<3> item_ct1) { + pathfinder_impl( + MIN(pyramid_height, rows - t - 1), gpuWall, + gpuResult_src_ct2, gpuResult_dst_ct3, cols, rows, t, + borderCols, item_ct1, prev_acc_ct1.get_pointer(), + result_acc_ct1.get_pointer()); + }); + }) + .wait(); + } + + // Extract value for result ptr + auto result_value = result.get_data(); + + defaultQueue.memcpy(result_value, gpuResult[dst], sizeof(int64_t) * cols) + .wait(); + + sycl::free(gpuWall, defaultQueue); + sycl::free(gpuResult[0], defaultQueue); + sycl::free(gpuResult[1], defaultQueue); +} + +PYBIND11_MODULE(_pathfinder_sycl, m) +{ + // Import the dpctl extensions + import_dpctl(); + + m.def("pathfinder", &pathfinder_sync, + "DPC++ implementation of the pathfinder", py::arg("data"), + py::arg("rows"), py::arg("cols"), py::arg("pyramid_height"), + py::arg("result")); +} diff --git a/dpbench/config/reader.py b/dpbench/config/reader.py index bc549653..5da1407e 100644 --- a/dpbench/config/reader.py +++ b/dpbench/config/reader.py @@ -28,6 +28,7 @@ def read_configs( # noqa: C901: TODO: move modules into config no_dpbench: bool = False, with_npbench: bool = False, with_polybench: bool = False, + with_rodinia: bool = False, load_implementations: bool = True, ) -> Config: """Read all configuration files and populate those settings into Config. @@ -84,7 +85,17 @@ def read_configs( # noqa: C901: TODO: move modules into config path=os.path.join(dirname, "../benchmarks/polybench"), ) ) - + if with_rodinia: + modules.append( + Module( + benchmark_configs_path=os.path.join( + dirname, "../configs/bench_info/rodinia" + ), + benchmark_configs_recursive=True, + benchmarks_module="dpbench.benchmarks.rodinia", + path=os.path.join(dirname, "../benchmarks/rodinia"), + ) + ) for mod in modules: if mod.benchmark_configs_path != "": read_benchmarks( diff --git a/dpbench/configs/bench_info/rodinia/pathfinder.toml b/dpbench/configs/bench_info/rodinia/pathfinder.toml new file mode 100644 index 00000000..153301dd --- /dev/null +++ b/dpbench/configs/bench_info/rodinia/pathfinder.toml @@ -0,0 +1,64 @@ +# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +[benchmark] +name = "Pathfinder Computation" +short_name = "pathfinder" +relative_path = "pathfinder" +module_name = "pathfinder" +func_name = "pathfinder" +kind = "microbenchmark" +domain = "Shortest Path" +input_args = [ + "data", + "rows", + "cols", + "pyramid_height", + "result", +] +array_args = [ + "data", + "result", +] +output_args = [ + "result", +] + +[benchmark.parameters.S] +rows=10 +cols=10 +pyramid_height=1 + +[benchmark.parameters.M16Gb] +rows=100 +cols=100 +pyramid_height=1 + +[benchmark.parameters.M] +rows=2048 +cols=64 +pyramid_height=1 + +[benchmark.parameters.L] +rows=4096 +cols=64 +pyramid_height=1 + +[benchmark.init] +func_name = "initialize" +types_dict_name="types_dict" +precision="double" +input_args = [ + "rows", + "cols", + "pyramid_height", + "types_dict" +] +output_args = [ + "data", + "rows", + "cols", + "pyramid_height", + "result" +] diff --git a/dpbench/console/_namespace.py b/dpbench/console/_namespace.py index 2cefe737..f6c5f63a 100644 --- a/dpbench/console/_namespace.py +++ b/dpbench/console/_namespace.py @@ -19,6 +19,7 @@ class Namespace(argparse.Namespace): dpbench: bool npbench: bool polybench: bool + rodinia: bool print_results: bool validate: bool run_id: Union[int, None] diff --git a/dpbench/console/config.py b/dpbench/console/config.py index fe962f79..f7144367 100644 --- a/dpbench/console/config.py +++ b/dpbench/console/config.py @@ -43,6 +43,7 @@ def execute_config(args: Namespace): implementations=args.implementations, with_npbench=True, with_polybench=True, + with_rodinia=True, ) color_output = args.color diff --git a/dpbench/console/run.py b/dpbench/console/run.py index e22433dd..9e44c8b8 100644 --- a/dpbench/console/run.py +++ b/dpbench/console/run.py @@ -58,6 +58,12 @@ def add_run_arguments(parser: argparse.ArgumentParser): default=False, help="Set if run polybench benchmarks.", ) + parser.add_argument( + "--rodinia", + action=argparse.BooleanOptionalAction, + default=False, + help="Set if run rodinia benchmarks.", + ) parser.add_argument( "-r", "--repeat", @@ -135,6 +141,7 @@ def execute_run(args: Namespace, conn: sqlalchemy.Engine): no_dpbench=not args.dpbench, with_npbench=args.npbench, with_polybench=args.polybench, + with_rodinia=args.rodinia, ) if args.all_implementations: diff --git a/setup.py b/setup.py index 6c55a714..37c39bd5 100644 --- a/setup.py +++ b/setup.py @@ -49,6 +49,7 @@ "bench_info/polybench/linear-algebra/blas/*.toml", "bench_info/polybench/medley/*.toml", "bench_info/npbench/*.toml", + "bench_info/rodinia/*.toml", "framework_info/*.toml", ], },