From 54d5aab39fab03350c2b31190a9e9eba8707a4b5 Mon Sep 17 00:00:00 2001 From: Adarsh Yoga Date: Tue, 20 Feb 2024 22:46:33 +0000 Subject: [PATCH] Switch to experimental dpex kernel --- .../black_scholes_numba_dpex_k.py | 22 +++- .../benchmarks/dbscan/dbscan_numba_dpex_k.py | 13 ++- .../benchmarks/gpairs/gpairs_numba_dpex_k.py | 24 ++-- .../benchmarks/kmeans/kmeans_numba_dpex_k.py | 108 ++++++++++++------ dpbench/benchmarks/knn/knn_numba_dpex_k.py | 11 +- .../l2_norm/l2_norm_numba_dpex_k.py | 9 +- .../pairwise_distance_numba_dpex_k.py | 17 ++- .../benchmarks/rambo/rambo_numba_dpex_k.py | 15 +-- 8 files changed, 144 insertions(+), 75 deletions(-) diff --git a/dpbench/benchmarks/black_scholes/black_scholes_numba_dpex_k.py b/dpbench/benchmarks/black_scholes/black_scholes_numba_dpex_k.py index 502b6a2f..dc4ab77c 100644 --- a/dpbench/benchmarks/black_scholes/black_scholes_numba_dpex_k.py +++ b/dpbench/benchmarks/black_scholes/black_scholes_numba_dpex_k.py @@ -4,16 +4,19 @@ from math import erf, exp, log, sqrt -import numba_dpex as dpex +import numba_dpex.experimental as dpex +from numba_dpex import kernel_api as kapi @dpex.kernel -def _black_scholes_kernel(nopt, price, strike, t, rate, volatility, call, put): +def _black_scholes_kernel( + item: kapi.Item, nopt, price, strike, t, rate, volatility, call, put +): dtype = price.dtype mr = -rate sig_sig_two = volatility * volatility * dtype.type(2) - i = dpex.get_global_id(0) + i = item.get_id(0) P = price[i] S = strike[i] @@ -40,6 +43,15 @@ def _black_scholes_kernel(nopt, price, strike, t, rate, volatility, call, put): def black_scholes(nopt, price, strike, t, rate, volatility, call, put): - _black_scholes_kernel[dpex.Range(nopt)]( - nopt, price, strike, t, rate, volatility, call, put + dpex.call_kernel( + _black_scholes_kernel, + kapi.Range(nopt), + nopt, + price, + strike, + t, + rate, + volatility, + call, + put, ) diff --git a/dpbench/benchmarks/dbscan/dbscan_numba_dpex_k.py b/dpbench/benchmarks/dbscan/dbscan_numba_dpex_k.py index 66e7396c..730a94f2 100644 --- a/dpbench/benchmarks/dbscan/dbscan_numba_dpex_k.py +++ b/dpbench/benchmarks/dbscan/dbscan_numba_dpex_k.py @@ -4,8 +4,9 @@ import dpnp as np import numba as nb -import numba_dpex as dpex +import numba_dpex.experimental as dpex import numpy +from numba_dpex import kernel_api as kapi NOISE = -1 UNDEFINED = -2 @@ -50,8 +51,10 @@ def _queue_empty(head, tail): @dpex.kernel -def get_neighborhood(n, dim, data, eps, ind_lst, sz_lst, block_size, nblocks): - i = dpex.get_global_id(0) +def get_neighborhood( + item: kapi.Item, n, dim, data, eps, ind_lst, sz_lst, block_size, nblocks +): + i = item.get_id(0) start = i * block_size stop = n if i + 1 == nblocks else start + block_size @@ -130,7 +133,9 @@ def dbscan(n_samples, n_features, data, eps, min_pts): ) sizes = np.zeros_like(data, shape=n_samples, dtype=np.int64) - get_neighborhood[dpex.Range(n_samples)]( + dpex.call_kernel( + get_neighborhood, + kapi.Range(n_samples), n_samples, n_features, data, diff --git a/dpbench/benchmarks/gpairs/gpairs_numba_dpex_k.py b/dpbench/benchmarks/gpairs/gpairs_numba_dpex_k.py index 9635d224..65f088fa 100644 --- a/dpbench/benchmarks/gpairs/gpairs_numba_dpex_k.py +++ b/dpbench/benchmarks/gpairs/gpairs_numba_dpex_k.py @@ -3,12 +3,15 @@ # SPDX-License-Identifier: Apache-2.0 import numba_dpex as dpex +import numba_dpex.experimental as dpexexp +from numba_dpex import kernel_api as kapi # This implementation is numba dpex kernel version with atomics. 
-@dpex.kernel +@dpexexp.kernel def count_weighted_pairs_3d_intel_no_slm_ker( + nd_item: kapi.NdItem, n, nbins, slm_hist_size, @@ -25,14 +28,14 @@ def count_weighted_pairs_3d_intel_no_slm_ker( result, ): dtype = x0.dtype - lid0 = dpex.get_local_id(0) - gr0 = dpex.get_group_id(0) + lid0 = nd_item.get_local_id(0) + gr0 = nd_item.get_group().get_group_id(0) - lid1 = dpex.get_local_id(1) - gr1 = dpex.get_group_id(1) + lid1 = nd_item.get_local_id(1) + gr1 = nd_item.get_group().get_group_id(1) - lws0 = dpex.get_local_size(0) - lws1 = dpex.get_local_size(1) + lws0 = nd_item.get_local_range(0) + lws1 = nd_item.get_local_range(1) n_wi = 20 @@ -107,7 +110,8 @@ def count_weighted_pairs_3d_intel_no_slm_ker( pk = k for p in range(private_hist_size): - dpex.atomic.add(result, pk, private_hist[p]) + result_aref = kapi.AtomicRef(result, index=pk) + result_aref.fetch_add(private_hist[p]) pk += 1 @@ -147,7 +151,9 @@ def gpairs( ceiling_quotient(nbins, private_hist_size) * private_hist_size ) - count_weighted_pairs_3d_intel_no_slm_ker[dpex.NdRange(gwsRange, lwsRange)]( + dpexexp.call_kernel( + count_weighted_pairs_3d_intel_no_slm_ker, + kapi.NdRange(dpex.Range(*gwsRange), dpex.Range(*lwsRange)), nopt, nbins, slm_hist_size, diff --git a/dpbench/benchmarks/kmeans/kmeans_numba_dpex_k.py b/dpbench/benchmarks/kmeans/kmeans_numba_dpex_k.py index b2363256..a66193e8 100644 --- a/dpbench/benchmarks/kmeans/kmeans_numba_dpex_k.py +++ b/dpbench/benchmarks/kmeans/kmeans_numba_dpex_k.py @@ -6,9 +6,9 @@ from math import sqrt import numba_dpex as dpex -import numpy +import numba_dpex.experimental as dpexexp from dpctl import tensor as dpt -from numba_dpex import NdRange +from numba_dpex import kernel_api as kapi def DivUp(numerator, denominator): @@ -25,9 +25,15 @@ def getGroupByCluster( # noqa: C901 ): local_copies = min(4, max(1, DivUp(local_size_, num_centroids))) - @dpex.kernel + @dpexexp.kernel def groupByCluster( - arrayP, arrayPcluster, arrayC, NewCentroids, NewCount, last + nd_item: kapi.NdItem, + arrayP, + arrayPcluster, + arrayC, + NewCentroids, + NewCount, + last, ): numpoints = arrayP.shape[0] localCentroids = dpex.local.array((dims, num_centroids), dtype=dtyp) @@ -38,9 +44,9 @@ def groupByCluster( (local_copies, num_centroids), dtype=dpt.int32 ) - grid = dpex.get_group_id(0) - lid = dpex.get_local_id(0) - local_size = dpex.get_local_size(0) + grid = nd_item.get_group().get_group_id(0) + lid = nd_item.get_local_id(0) + local_size = nd_item.get_local_range(0) for i in range(lid, num_centroids * dims, local_size): localCentroids[i % dims, i // dims] = arrayC[i // dims, i % dims] @@ -51,7 +57,7 @@ def groupByCluster( for lc in range(local_copies): localNewCount[lc, c] = 0 - dpex.barrier(dpex.LOCAL_MEM_FENCE) + kapi.group_barrier(nd_item.get_group()) for i in range(WorkPI): point_id = grid * WorkPI * local_size + i * local_size + lid @@ -73,44 +79,59 @@ def groupByCluster( lc = lid % local_copies for d in range(dims): - dpex.atomic.add( - localNewCentroids, (lc, d, nearest_centroid), localP[d] + localNewCentroids_aref = kapi.AtomicRef( + localNewCentroids, + index=(lc, d, nearest_centroid), + address_space=kapi.AddressSpace.LOCAL, ) + localNewCentroids_aref.fetch_add(localP[d]) - dpex.atomic.add(localNewCount, (lc, nearest_centroid), 1) + localNewCount_aref = kapi.AtomicRef( + localNewCount, + index=(lc, nearest_centroid), + address_space=kapi.AddressSpace.LOCAL, + ) + localNewCount_aref.fetch_add(1) if last: arrayPcluster[point_id] = nearest_centroid - dpex.barrier(dpex.LOCAL_MEM_FENCE) + 
kapi.group_barrier(nd_item.get_group()) for i in range(lid, num_centroids * dims, local_size): local_centroid_d = dtyp.type(0) for lc in range(local_copies): local_centroid_d += localNewCentroids[lc, i % dims, i // dims] - dpex.atomic.add( - NewCentroids, - (i // dims, i % dims), - local_centroid_d, + NewCentroids_aref = kapi.AtomicRef( + NewCentroids, index=(i // dims, i % dims) ) + NewCentroids_aref.fetch_add(local_centroid_d) for c in range(lid, num_centroids, local_size): local_centroid_npoints = dpt.int32.type(0) for lc in range(local_copies): local_centroid_npoints += localNewCount[lc, c] - dpex.atomic.add(NewCount, c, local_centroid_npoints) + NewCount_aref = kapi.AtomicRef(NewCount, index=c) + NewCount_aref.fetch_add(local_centroid_npoints) return groupByCluster @lru_cache(maxsize=1) def getUpdateCentroids(dims, num_centroids, dtyp, local_size_): - @dpex.kernel - def updateCentroids(diff, arrayC, arrayCnumpoint, NewCentroids, NewCount): - lid = dpex.get_local_id(0) - local_size = dpex.get_local_size(0) + @dpexexp.kernel + def updateCentroids( + nd_item: kapi.NdItem, + diff, + arrayC, + arrayCnumpoint, + NewCentroids, + NewCount, + ): + lid = nd_item.get_local_id(0) + local_size = nd_item.get_local_range(0) local_distance = dpex.local.array(local_size_, dtype=dtyp) @@ -134,7 +155,7 @@ def updateCentroids(diff, arrayC, arrayCnumpoint, NewCentroids, NewCount): max_distance = max(max_distance, distance) local_distance[c] = max_distance - dpex.barrier(dpex.LOCAL_MEM_FENCE) + kapi.group_barrier(nd_item.get_group()) if lid == 0: for c in range(local_size): @@ -147,19 +168,19 @@ def updateCentroids(diff, arrayC, arrayCnumpoint, NewCentroids, NewCount): @lru_cache(maxsize=1) def getUpdateLabels(dims, num_centroids, dtyp, WorkPI): - @dpex.kernel - def updateLabels(arrayP, arrayPcluster, arrayC): + @dpexexp.kernel + def updateLabels(nd_item: kapi.NdItem, arrayP, arrayPcluster, arrayC): numpoints = arrayP.shape[0] localCentroids = dpex.local.array((dims, num_centroids), dtype=dtyp) - grid = dpex.get_group_id(0) - lid = dpex.get_local_id(0) - local_size = dpex.get_local_size(0) + grid = nd_item.get_group().get_group_id(0) + lid = nd_item.get_local_id(0) + local_size = nd_item.get_local_range(0) for i in range(lid, num_centroids * dims, local_size): localCentroids[i % dims, i // dims] = arrayC[i // dims, i % dims] - dpex.barrier(dpex.LOCAL_MEM_FENCE) + kapi.group_barrier(nd_item.get_group()) for i in range(WorkPI): point_id = grid * WorkPI * local_size + i * local_size + lid @@ -224,19 +245,36 @@ def kmeans_kernel( for i in range(niters): last = i == (niters - 1) if diff_host < tolerance: - updateLabels[NdRange((global_size,), (local_size,))]( - arrayP, arrayPcluster, arrayC + dpexexp.call_kernel( + updateLabels, + kapi.NdRange((global_size,), (local_size,)), + arrayP, + arrayPcluster, + arrayC, ) break - groupByCluster[NdRange((global_size,), (local_size,))]( - arrayP, arrayPcluster, arrayC, NewCentroids, NewCount, last + dpexexp.call_kernel( + groupByCluster, + kapi.NdRange((global_size,), (local_size,)), + arrayP, + arrayPcluster, + arrayC, + NewCentroids, + NewCount, + last, ) update_centroid_size = min(num_centroids, local_size) - updateCentroids[ - NdRange((update_centroid_size,), (update_centroid_size,)) - ](diff, arrayC, arrayCnumpoint, NewCentroids, NewCount) + dpexexp.call_kernel( + updateCentroids, + kapi.NdRange((update_centroid_size,), (update_centroid_size,)), + diff, + arrayC, + arrayCnumpoint, + NewCentroids, + NewCount, + ) diff_host = dpt.asnumpy(diff)[0] diff --git 
a/dpbench/benchmarks/knn/knn_numba_dpex_k.py b/dpbench/benchmarks/knn/knn_numba_dpex_k.py index f5e3935c..640a94cd 100644 --- a/dpbench/benchmarks/knn/knn_numba_dpex_k.py +++ b/dpbench/benchmarks/knn/knn_numba_dpex_k.py @@ -5,11 +5,14 @@ from math import sqrt import numba_dpex as dpex +import numba_dpex.experimental as dpexexp import numpy as np +from numba_dpex import kernel_api as kapi -@dpex.kernel +@dpexexp.kernel def _knn_kernel( # noqa: C901: TODO: can we simplify logic? + item: kapi.Item, train, train_labels, test, @@ -21,7 +24,7 @@ def _knn_kernel( # noqa: C901: TODO: can we simplify logic? data_dim, ): dtype = train.dtype - i = dpex.get_global_id(0) + i = item.get_id(0) # here k has to be 5 in order to match with numpy queue_neighbors = dpex.private.array(shape=(5, 2), dtype=dtype) @@ -106,7 +109,9 @@ def knn( votes_to_classes, data_dim, ): - _knn_kernel[dpex.Range(test_size)]( + dpexexp.call_kernel( + _knn_kernel, + kapi.Range(test_size), x_train, y_train, x_test, diff --git a/dpbench/benchmarks/l2_norm/l2_norm_numba_dpex_k.py b/dpbench/benchmarks/l2_norm/l2_norm_numba_dpex_k.py index cbc7fe37..eb95183b 100644 --- a/dpbench/benchmarks/l2_norm/l2_norm_numba_dpex_k.py +++ b/dpbench/benchmarks/l2_norm/l2_norm_numba_dpex_k.py @@ -4,12 +4,13 @@ import math -import numba_dpex as dpex +import numba_dpex.experimental as dpex +from numba_dpex import kernel_api as kapi @dpex.kernel -def l2_norm_kernel(a, d): - i = dpex.get_global_id(0) +def l2_norm_kernel(item: kapi.Item, a, d): + i = item.get_id(0) a_rows = a.shape[1] d[i] = 0.0 for k in range(a_rows): @@ -18,4 +19,4 @@ def l2_norm_kernel(a, d): def l2_norm(a, d): - l2_norm_kernel[dpex.Range(a.shape[0])](a, d) + dpex.call_kernel(l2_norm_kernel, kapi.Range(a.shape[0]), a, d) diff --git a/dpbench/benchmarks/pairwise_distance/pairwise_distance_numba_dpex_k.py b/dpbench/benchmarks/pairwise_distance/pairwise_distance_numba_dpex_k.py index e0fa9982..c3dea96b 100644 --- a/dpbench/benchmarks/pairwise_distance/pairwise_distance_numba_dpex_k.py +++ b/dpbench/benchmarks/pairwise_distance/pairwise_distance_numba_dpex_k.py @@ -3,13 +3,14 @@ # SPDX-License-Identifier: Apache-2.0 import dpnp as np -import numba_dpex as dpex +import numba_dpex.experimental as dpex +from numba_dpex import kernel_api as kapi @dpex.kernel -def _pairwise_distance_kernel(X1, X2, D): - i = dpex.get_global_id(1) - j = dpex.get_global_id(0) +def _pairwise_distance_kernel(item: kapi.Item, X1, X2, D): + i = item.get_id(1) + j = item.get_id(0) X1_cols = X1.shape[1] @@ -21,4 +22,10 @@ def _pairwise_distance_kernel(X1, X2, D): def pairwise_distance(X1, X2, D): - _pairwise_distance_kernel[dpex.Range(X2.shape[0], X1.shape[0])](X1, X2, D) + dpex.call_kernel( + _pairwise_distance_kernel, + kapi.Range(X2.shape[0], X1.shape[0]), + X1, + X2, + D, + ) diff --git a/dpbench/benchmarks/rambo/rambo_numba_dpex_k.py b/dpbench/benchmarks/rambo/rambo_numba_dpex_k.py index 3937a442..43644d06 100644 --- a/dpbench/benchmarks/rambo/rambo_numba_dpex_k.py +++ b/dpbench/benchmarks/rambo/rambo_numba_dpex_k.py @@ -4,13 +4,14 @@ from math import cos, log, pi, sin, sqrt -import numba_dpex as dpex +import numba_dpex.experimental as dpex +from numba_dpex import kernel_api as kapi @dpex.kernel -def _rambo(C1, F1, Q1, nout, output): +def _rambo(item: kapi.Item, C1, F1, Q1, nout, output): dtype = C1.dtype - i = dpex.get_global_id(0) + i = item.get_id(0) for j in range(nout): C = dtype.type(2.0) * C1[i, j] - dtype.type(1.0) S = sqrt(dtype.type(1) - C * C) @@ -24,10 +25,4 @@ def _rambo(C1, F1, Q1, nout, output): def 
rambo(nevts, nout, C1, F1, Q1, output): - _rambo[dpex.Range(nevts)]( - C1, - F1, - Q1, - nout, - output, - ) + dpex.call_kernel(_rambo, kapi.Range(nevts), C1, F1, Q1, nout, output)
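
Note: for context, the launch pattern adopted throughout this patch is sketched below. This is an illustrative, hypothetical vector-add example (not part of dpbench; dpnp array creation is assumed), showing the experimental decorator, the explicit kapi.Item argument, and dpex.call_kernel in place of the legacy kernel[dpex.Range(n)](...) subscription syntax.

    # Minimal sketch of the numba_dpex.experimental kernel API used in this patch.
    # The vector-add kernel and the array sizes are illustrative only.
    import dpnp
    import numba_dpex.experimental as dpex
    from numba_dpex import kernel_api as kapi


    @dpex.kernel
    def _vector_add(item: kapi.Item, a, b, c):
        # The kernel receives an explicit index-space argument instead of
        # calling dpex.get_global_id(0).
        i = item.get_id(0)
        c[i] = a[i] + b[i]


    a = dpnp.arange(1024, dtype=dpnp.float32)
    b = dpnp.arange(1024, dtype=dpnp.float32)
    c = dpnp.empty_like(a)

    # Kernels are launched through call_kernel with an explicit Range
    # rather than the legacy subscription syntax.
    dpex.call_kernel(_vector_add, kapi.Range(a.size), a, b, c)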