From c747805ca02869c81beea76024f3ae4028412d0c Mon Sep 17 00:00:00 2001 From: Yevhenii Havrylko Date: Thu, 7 Dec 2023 16:57:40 -0500 Subject: [PATCH 1/3] Fix typo in nrt_reserve_meminfo description --- numba_dpex/core/runtime/experimental/nrt_reserve_meminfo.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba_dpex/core/runtime/experimental/nrt_reserve_meminfo.h b/numba_dpex/core/runtime/experimental/nrt_reserve_meminfo.h index 853f31eeac..aedf0db8d3 100644 --- a/numba_dpex/core/runtime/experimental/nrt_reserve_meminfo.h +++ b/numba_dpex/core/runtime/experimental/nrt_reserve_meminfo.h @@ -5,7 +5,7 @@ //===----------------------------------------------------------------------===// /// /// \file -/// Defines dpctl style function(s) that interruct with nrt meminfo and sycl. +/// Defines dpctl style function(s) that interact with nrt meminfo and sycl. /// //===----------------------------------------------------------------------===// From 774d543fe7081f1f2264a6773499d15d54a47d99 Mon Sep 17 00:00:00 2001 From: Yevhenii Havrylko Date: Mon, 11 Dec 2023 16:28:59 -0500 Subject: [PATCH 2/3] Add boost hash_combine function --- LICENSES.third-party | 30 +++++++++++++++ .../runtime/experimental/tools/boost_hash.hpp | 38 +++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 LICENSES.third-party create mode 100644 numba_dpex/core/runtime/experimental/tools/boost_hash.hpp diff --git a/LICENSES.third-party b/LICENSES.third-party new file mode 100644 index 0000000000..5f03f58ac0 --- /dev/null +++ b/LICENSES.third-party @@ -0,0 +1,30 @@ +The numba-dpex source tree includes vendored libraries governed by the following +licenses. + + +boost hash.hpp header +--------------------- + +Boost Software License - Version 1.0 - August 17th, 2003 + +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/numba_dpex/core/runtime/experimental/tools/boost_hash.hpp b/numba_dpex/core/runtime/experimental/tools/boost_hash.hpp new file mode 100644 index 0000000000..7d96bdc31f --- /dev/null +++ b/numba_dpex/core/runtime/experimental/tools/boost_hash.hpp @@ -0,0 +1,38 @@ +// Copyright 2005-2014 Daniel James. +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +// Based on Peter Dimov's proposal +// http://www.open-std.org/JTC1/SC22/WG21/docs/papers/2005/n1756.pdf +// issue 6.18. +// +// This also contains public domain code from MurmurHash. From the +// MurmurHash header: + +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. + +// 2023 Intel Corporation +// Copied hash_combine and hash_combine_impl from boost +// (https://www.boost.org/doc/libs/1_76_0/boost/container_hash/hash.hpp) and +// changed hash_combine to use std::hash instead of boost::hash. + +#include + +namespace boost +{ +namespace hash_detail +{ +template +inline void hash_combine_impl(SizeT &seed, SizeT value) +{ + seed ^= value + 0x9e3779b9 + (seed << 6) + (seed >> 2); +} +} // namespace hash_detail + +template inline void hash_combine(std::size_t &seed, T const &v) +{ + std::hash hasher; + return boost::hash_detail::hash_combine_impl(seed, hasher(v)); +} +} // namespace boost From a8178c6a0eb05187e1b873bdd3028211af0c26eb Mon Sep 17 00:00:00 2001 From: Yevhenii Havrylko Date: Thu, 7 Dec 2023 16:23:54 -0500 Subject: [PATCH 3/3] Enable device caching for kernels --- numba_dpex/core/runtime/_dpexrt_python.c | 8 ++ numba_dpex/core/runtime/context.py | 53 +++++++++ .../runtime/experimental/kernel_caching.cpp | 111 ++++++++++++++++++ .../runtime/experimental/kernel_caching.h | 54 +++++++++ .../core/runtime/experimental/tools/dpctl.cpp | 24 ++++ .../core/runtime/experimental/tools/dpctl.hpp | 26 ++++ numba_dpex/experimental/launcher.py | 75 ++++-------- numba_dpex/experimental/testing.py | 31 +++++ .../tests/experimental/test_async_kernel.py | 10 ++ 9 files changed, 343 insertions(+), 49 deletions(-) create mode 100644 numba_dpex/core/runtime/experimental/kernel_caching.cpp create mode 100644 numba_dpex/core/runtime/experimental/kernel_caching.h create mode 100644 numba_dpex/core/runtime/experimental/tools/dpctl.cpp create mode 100644 numba_dpex/core/runtime/experimental/tools/dpctl.hpp create mode 100644 numba_dpex/experimental/testing.py diff --git a/numba_dpex/core/runtime/_dpexrt_python.c b/numba_dpex/core/runtime/_dpexrt_python.c index b10079d33f..d334f21523 100644 --- a/numba_dpex/core/runtime/_dpexrt_python.c +++ b/numba_dpex/core/runtime/_dpexrt_python.c @@ -24,6 +24,7 @@ #include "_queuestruct.h" #include "_usmarraystruct.h" +#include "experimental/kernel_caching.h" #include "experimental/nrt_reserve_meminfo.h" #include "numba/core/runtime/nrt_external.h" @@ -1493,6 +1494,8 @@ static PyObject *build_c_helpers_dict(void) _declpointer("DPEXRT_sycl_event_init", &DPEXRT_sycl_event_init); _declpointer("DPEXRT_nrt_acquire_meminfo_and_schedule_release", &DPEXRT_nrt_acquire_meminfo_and_schedule_release); + _declpointer("DPEXRT_build_or_get_kernel", &DPEXRT_build_or_get_kernel); + _declpointer("DPEXRT_kernel_cache_size", &DPEXRT_kernel_cache_size); #undef _declpointer return dct; @@ -1563,6 +1566,11 @@ MOD_INIT(_dpexrt_python) PyModule_AddObject( m, "DPEXRT_nrt_acquire_meminfo_and_schedule_release", PyLong_FromVoidPtr(&DPEXRT_nrt_acquire_meminfo_and_schedule_release)); + PyModule_AddObject(m, "DPEXRT_build_or_get_kernel", + PyLong_FromVoidPtr(&DPEXRT_build_or_get_kernel)); + PyModule_AddObject(m, "DPEXRT_kernel_cache_size", + PyLong_FromVoidPtr(&DPEXRT_kernel_cache_size)); + PyModule_AddObject(m, "c_helpers", build_c_helpers_dict()); return MOD_SUCCESS_VAL(m); } diff --git a/numba_dpex/core/runtime/context.py b/numba_dpex/core/runtime/context.py index 1d9f30dad7..f026caecc1 100644 --- a/numba_dpex/core/runtime/context.py +++ b/numba_dpex/core/runtime/context.py @@ -471,3 +471,56 @@ def acquire_meminfo_and_schedule_release( ret = builder.call(fn, args) return ret + + def build_or_get_kernel(self, builder: llvmir.IRBuilder, args): + """Inserts LLVM IR to call build_or_get_kernel. + + DPCTLSyclKernelRef + DPEXRT_build_or_get_kernel( + const DPCTLSyclContextRef ctx, + const DPCTLSyclDeviceRef dev, + size_t il_hash, + const char *il, + size_t il_length, + const char *compile_opts, + const char *kernel_name, + ); + + """ + mod = builder.module + + func_ty = llvmir.FunctionType( + cgutils.voidptr_t, + [ + cgutils.voidptr_t, + cgutils.voidptr_t, + llvmir.IntType(64), + cgutils.voidptr_t, + llvmir.IntType(64), + cgutils.voidptr_t, + cgutils.voidptr_t, + ], + ) + fn = cgutils.get_or_insert_function( + mod, func_ty, "DPEXRT_build_or_get_kernel" + ) + ret = builder.call(fn, args) + + return ret + + def kernel_cache_size(self, builder: llvmir.IRBuilder): + """Inserts LLVM IR to call kernel_cache_size. + + size_t DPEXRT_kernel_cache_size(); + + """ + fn = cgutils.get_or_insert_function( + builder.module, + llvmir.FunctionType( + llvmir.IntType(64), + [], + ), + "DPEXRT_kernel_cache_size", + ) + + return builder.call(fn, []) diff --git a/numba_dpex/core/runtime/experimental/kernel_caching.cpp b/numba_dpex/core/runtime/experimental/kernel_caching.cpp new file mode 100644 index 0000000000..9e35e333c0 --- /dev/null +++ b/numba_dpex/core/runtime/experimental/kernel_caching.cpp @@ -0,0 +1,111 @@ +// SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 + +#include "kernel_caching.h" +#include + +extern "C" +{ +#include "dpctl_capi.h" +#include "dpctl_sycl_interface.h" + +#include "_dbg_printer.h" + +#include "numba/core/runtime/nrt_external.h" +} + +#include "syclinterface/dpctl_sycl_type_casters.hpp" +#include "tools/boost_hash.hpp" +#include "tools/dpctl.hpp" + +using CacheKey = std::tuple; + +namespace std +{ +template <> struct hash +{ + size_t operator()(const CacheKey &ck) const + { + std::size_t seed = 0; + boost::hash_combine(seed, std::get(ck)); + boost::hash_combine(seed, std::get(ck)); + boost::hash_detail::hash_combine_impl(seed, std::get(ck)); + return seed; + } +}; +template <> struct equal_to +{ + constexpr bool operator()(const CacheKey &lhs, const CacheKey &rhs) const + { + return DPCTLDevice_AreEq(std::get(lhs), + std::get(rhs)) && + DPCTLContext_AreEq(std::get(lhs), + std::get(rhs)) && + std::get(lhs) == std::get(rhs); + } +}; +} // namespace std + +// TODO: add cache cleaning +// https://github.com/IntelPython/numba-dpex/issues/1240 +std::unordered_map sycl_kernel_cache = + std::unordered_map(); + +template +typename M::mapped_type &get_else_compute(M &m, Key const &k, F f) +{ + typedef typename M::mapped_type V; + std::pair r = + m.insert(typename M::value_type(k, V())); + V &v = r.first->second; + if (r.second) { + DPEXRT_DEBUG(drt_debug_print("DPEXRT-DEBUG: building kernel.\n");); + f(v); + } + else { + DPEXRT_DEBUG(drt_debug_print("DPEXRT-DEBUG: using cached kernel.\n");); + DPCTLDevice_Delete(std::get(k)); + DPCTLContext_Delete(std::get(k)); + } + return v; +} + +extern "C" +{ + DPCTLSyclKernelRef DPEXRT_build_or_get_kernel(const DPCTLSyclContextRef ctx, + const DPCTLSyclDeviceRef dev, + size_t il_hash, + const char *il, + size_t il_length, + const char *compile_opts, + const char *kernel_name) + { + DPEXRT_DEBUG( + drt_debug_print("DPEXRT-DEBUG: in build or get kernel.\n");); + + CacheKey key = std::make_tuple(ctx, dev, il_hash); + + DPEXRT_DEBUG(auto ctx_hash = std::hash{}(ctx); + auto dev_hash = std::hash{}(dev); + drt_debug_print("DPEXRT-DEBUG: key hashes: %d %d %d.\n", + ctx_hash, dev_hash, il_hash);); + + auto k_ref = get_else_compute( + sycl_kernel_cache, key, + [ctx, dev, il, il_length, compile_opts, + kernel_name](DPCTLSyclKernelRef &k_ref) { + auto kb_ref = DPCTLKernelBundle_CreateFromSpirv( + ctx, dev, il, il_length, compile_opts); + k_ref = DPCTLKernelBundle_GetKernel(kb_ref, kernel_name); + DPCTLKernelBundle_Delete(kb_ref); + }); + + DPEXRT_DEBUG(drt_debug_print("DPEXRT-DEBUG: kernel hash size: %d.\n", + sycl_kernel_cache.size());); + + return DPCTLKernel_Copy(k_ref); + } + + size_t DPEXRT_kernel_cache_size() { return sycl_kernel_cache.size(); } +} diff --git a/numba_dpex/core/runtime/experimental/kernel_caching.h b/numba_dpex/core/runtime/experimental/kernel_caching.h new file mode 100644 index 0000000000..e345fc135f --- /dev/null +++ b/numba_dpex/core/runtime/experimental/kernel_caching.h @@ -0,0 +1,54 @@ +// SPDX-FileCopyrightText: 2023 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 + +//===----------------------------------------------------------------------===// +/// +/// \file +/// Defines dpex run time function(s) that cache kernel on device. +/// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "dpctl_capi.h" +#include "dpctl_sycl_interface.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + /*! + * @brief returns dpctl kernel reference for the SPIRV file on particular + * device. Compiles only first time, all others will use cache for the same + * input. It steals reference to context and device because we need to keep + * it alive for cache keys. + * + * @param ctx Context reference, + * @param dev Device reference, + * @param il_hash Hash of the SPIRV binary data, + * @param il SPIRV binary data, + * @param il_length SPIRV binary data size, + * @param compile_opts compile options, + * @param kernel_name kernel name inside SPIRV binary data to return + * reference to. + * + * @return {return} Kernel reference to the compiled SPIR-V. + */ + DPCTLSyclKernelRef DPEXRT_build_or_get_kernel(const DPCTLSyclContextRef ctx, + const DPCTLSyclDeviceRef dev, + size_t il_hash, + const char *il, + size_t il_length, + const char *compile_opts, + const char *kernel_name); + + /*! + * @brief returns cache size. Intended for test purposes only + * + * @return {return} Kernel cache size. + */ + size_t DPEXRT_kernel_cache_size(); +#ifdef __cplusplus +} +#endif diff --git a/numba_dpex/core/runtime/experimental/tools/dpctl.cpp b/numba_dpex/core/runtime/experimental/tools/dpctl.cpp new file mode 100644 index 0000000000..f376fc0b19 --- /dev/null +++ b/numba_dpex/core/runtime/experimental/tools/dpctl.cpp @@ -0,0 +1,24 @@ +// SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 + +#include "dpctl.hpp" +#include + +namespace std +{ + +size_t +hash::operator()(const DPCTLSyclDeviceRef &DRef) const +{ + using dpctl::syclinterface::unwrap; + return hash()(*unwrap(DRef)); +} + +size_t +hash::operator()(const DPCTLSyclContextRef &CRef) const +{ + using dpctl::syclinterface::unwrap; + return hash()(*unwrap(CRef)); +} +} // namespace std diff --git a/numba_dpex/core/runtime/experimental/tools/dpctl.hpp b/numba_dpex/core/runtime/experimental/tools/dpctl.hpp new file mode 100644 index 0000000000..9fa03b9acb --- /dev/null +++ b/numba_dpex/core/runtime/experimental/tools/dpctl.hpp @@ -0,0 +1,26 @@ +// SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 + +//===----------------------------------------------------------------------===// +/// +/// \file +/// Defines overloads to dpctl library that eventually must be ported there. +/// +//===----------------------------------------------------------------------===// + +#pragma once +#include "syclinterface/dpctl_sycl_type_casters.hpp" + +namespace std +{ +template <> struct hash +{ + size_t operator()(const DPCTLSyclDeviceRef &DRef) const; +}; + +template <> struct hash +{ + size_t operator()(const DPCTLSyclContextRef &CRef) const; +}; +} // namespace std diff --git a/numba_dpex/experimental/launcher.py b/numba_dpex/experimental/launcher.py index 420d7f5758..63389fcfb5 100644 --- a/numba_dpex/experimental/launcher.py +++ b/numba_dpex/experimental/launcher.py @@ -54,6 +54,10 @@ def __init__( ): self.context = codegen_targetctx self.builder = builder + # TODO: get dpex RT from cached property once the PR is merged + # https://github.com/IntelPython/numba-dpex/pull/1027 + # and get rid of the global variable. Use self.context.dpexrt instead. + self.dpexrt = DpexRTContext(self.context) if config.DEBUG_KERNEL_LAUNCHER: cgutils.printf( @@ -139,7 +143,7 @@ def get_queue_ref_val( return ptr_to_queue_ref - def get_kernel(self, qref, kernel_module: _KernelModule): + def get_kernel(self, queue_ref, kernel_module: _KernelModule): """Returns the pointer to the sycl::kernel object in a passed in sycl::kernel_bundle wrapper object. """ @@ -150,23 +154,31 @@ def get_kernel(self, qref, kernel_module: _KernelModule): bytes=kernel_module.kernel_bitcode, ) - # Create a sycl::kernel_bundle object and return it as an opaque pointer - # using dpctl's libsyclinterface. - kbref = self.create_kernel_bundle_from_spirv( - queue_ref=qref, - kernel_bc=kernel_bc_byte_str, - kernel_bc_size_in_bytes=len(kernel_module.kernel_bitcode), - ) - kernel_name = self.context.insert_const_string( self.builder.module, kernel_module.kernel_name ) - kernel_ref = sycl.dpctl_kernel_bundle_get_kernel( - self.builder, kbref, kernel_name - ) + context_ref = sycl.dpctl_queue_get_context(self.builder, queue_ref) + device_ref = sycl.dpctl_queue_get_device(self.builder, queue_ref) - sycl.dpctl_kernel_bundle_delete(self.builder, kbref) + # build_or_get_kernel stills reference to context and device cause it + # needs to keep them alive for keys. + kernel_ref = self.dpexrt.build_or_get_kernel( + self.builder, + [ + context_ref, + device_ref, + llvmir.Constant( + llvmir.IntType(64), hash(kernel_module.kernel_bitcode) + ), + kernel_bc_byte_str, + llvmir.Constant( + llvmir.IntType(64), len(kernel_module.kernel_bitcode) + ), + self.builder.load(create_null_ptr(self.builder, self.context)), + kernel_name, + ], + ) return kernel_ref @@ -210,36 +222,6 @@ def create_llvm_values_for_index_space( return LLRange(global_range_extents, local_range_extents) - def create_kernel_bundle_from_spirv( - self, - queue_ref: llvmir.PointerType, - kernel_bc: llvmir.Constant, - kernel_bc_size_in_bytes: int, - ) -> llvmir.CallInstr: - """Calls DPCTLKernelBundle_CreateFromSpirv to create an opaque pointer - to a sycl::kernel_bundle from the SPIR-V generated for a kernel. - """ - device_ref = sycl.dpctl_queue_get_device(self.builder, queue_ref) - context_ref = sycl.dpctl_queue_get_context(self.builder, queue_ref) - args = [ - context_ref, - device_ref, - kernel_bc, - llvmir.Constant(llvmir.IntType(64), kernel_bc_size_in_bytes), - self.builder.load(create_null_ptr(self.builder, self.context)), - ] - kb_ref = sycl.dpctl_kernel_bundle_create_from_spirv(self.builder, *args) - sycl.dpctl_context_delete(self.builder, context_ref) - sycl.dpctl_device_delete(self.builder, device_ref) - - if config.DEBUG_KERNEL_LAUNCHER: - cgutils.printf( - self.builder, - "DPEX-DEBUG: Generated kernel_bundle from SPIR-V.\n", - ) - - return kb_ref - def acquire_meminfo_and_schedule_release( self, queue_ref, @@ -259,12 +241,7 @@ def acquire_meminfo_and_schedule_release( status_ptr = cgutils.alloca_once( self.builder, self.context.get_value_type(types.uint64) ) - # TODO: get dpex RT from cached property once the PR is merged - # https://github.com/IntelPython/numba-dpex/pull/1027 - # host_eref = ctx.dpexrt.acquire_meminfo_and_schedule_release( # noqa: W0621 - host_eref = DpexRTContext( - self.context - ).acquire_meminfo_and_schedule_release( + host_eref = self.dpexrt.acquire_meminfo_and_schedule_release( self.builder, [ self.context.nrt.get_nrt_api(self.builder), diff --git a/numba_dpex/experimental/testing.py b/numba_dpex/experimental/testing.py new file mode 100644 index 0000000000..b915626efb --- /dev/null +++ b/numba_dpex/experimental/testing.py @@ -0,0 +1,31 @@ +# SPDX-FileCopyrightText: 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +"""Tools for testing, not intended for regular use.""" + + +from numba.core import types +from numba.extending import intrinsic + +from numba_dpex import dpjit +from numba_dpex.core.runtime.context import DpexRTContext + + +@intrinsic(target="cpu") +def _kernel_cache_size( + typingctx, # pylint: disable=W0613 +): + sig = types.int64() + + def codegen(ctx, builder, sig, llargs): # pylint: disable=W0613 + dpexrt = DpexRTContext(ctx) + return dpexrt.kernel_cache_size(builder) + + return sig, codegen + + +@dpjit +def kernel_cache_size() -> int: + """Returns kernel cache size.""" + return _kernel_cache_size() # pylint: disable=E1120 diff --git a/numba_dpex/tests/experimental/test_async_kernel.py b/numba_dpex/tests/experimental/test_async_kernel.py index 53a962588b..71dec6066b 100644 --- a/numba_dpex/tests/experimental/test_async_kernel.py +++ b/numba_dpex/tests/experimental/test_async_kernel.py @@ -8,6 +8,7 @@ import numba_dpex as dpex import numba_dpex.experimental as exp_dpex from numba_dpex import Range +from numba_dpex.experimental import testing @exp_dpex.kernel( @@ -47,3 +48,12 @@ def test_async_add(): d = a + b assert dpnp.array_equal(c, d) + + +def test_async_add_from_cache(): + test_async_add() # compile + old_size = testing.kernel_cache_size() + test_async_add() # use from cache + new_size = testing.kernel_cache_size() + + assert new_size == old_size