From ba7c7a62debcbef77e38c91580a221676a0971fb Mon Sep 17 00:00:00 2001
From: Adarsh Yoga
Date: Mon, 12 Feb 2024 19:32:21 +0000
Subject: [PATCH] adding synchronize to cupy implementations to capture execution times accurately

---

Notes (review; this section is ignored by `git am`):
    * Every hunk below renames the `cupy` alias from `np` to `cp` and ends
      the benchmark function with
      `cp.cuda.stream.get_current_stream().synchronize()`.
    * NOTE(review): black_scholes_cupy.py still imports `erf` from
      `scipy.special` (unchanged context line). If the inputs are CuPy
      arrays, `cupyx.scipy.special.erf` is presumably what is wanted --
      confirm against the caller.
    * NOTE(review): in pca_cupy.py the added blank line lands between the
      `# return ...` comment and the `return` statement; it looks like it
      was meant to go above the comment -- confirm.

 .../black_scholes/black_scholes_cupy.py         | 10 ++++++----
 dpbench/benchmarks/gpairs/gpairs_cupy.py        | 16 ++++++++++------
 dpbench/benchmarks/l2_norm/l2_norm_cupy.py      |  8 +++++---
 .../pairwise_distance/pairwise_distance_cupy.py | 16 +++++++++-------
 dpbench/benchmarks/pca/pca_cupy.py              | 13 ++++++++-----
 dpbench/benchmarks/rambo/rambo_cupy.py          | 14 ++++++++------
 6 files changed, 46 insertions(+), 31 deletions(-)

diff --git a/dpbench/benchmarks/black_scholes/black_scholes_cupy.py b/dpbench/benchmarks/black_scholes/black_scholes_cupy.py
index 927d71cb..8cf4650e 100644
--- a/dpbench/benchmarks/black_scholes/black_scholes_cupy.py
+++ b/dpbench/benchmarks/black_scholes/black_scholes_cupy.py
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-import cupy as np
+import cupy as cp
 from scipy.special import erf
 
 
@@ -14,12 +14,12 @@ def black_scholes(nopt, price, strike, t, rate, volatility, call, put):
     S = strike
     T = t
 
-    a = np.log(P / S)
+    a = cp.log(P / S)
     b = T * mr
 
     z = T * sig_sig_two
     c = 0.25 * z
-    y = np.true_divide(1.0, np.sqrt(z))
+    y = cp.true_divide(1.0, cp.sqrt(z))
 
     w1 = (a - b + c) * y
     w2 = (a - b - c) * y
@@ -27,7 +27,9 @@ def black_scholes(nopt, price, strike, t, rate, volatility, call, put):
     d1 = 0.5 + 0.5 * erf(w1)
     d2 = 0.5 + 0.5 * erf(w2)
 
-    Se = np.exp(b) * S
+    Se = cp.exp(b) * S
 
     call[:] = P * d1 - Se * d2
     put[:] = call - P + Se
+
+    cp.cuda.stream.get_current_stream().synchronize()
diff --git a/dpbench/benchmarks/gpairs/gpairs_cupy.py b/dpbench/benchmarks/gpairs/gpairs_cupy.py
index 8e26f2cb..f0a58074 100644
--- a/dpbench/benchmarks/gpairs/gpairs_cupy.py
+++ b/dpbench/benchmarks/gpairs/gpairs_cupy.py
@@ -2,19 +2,23 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-import cupy as np
+import cupy as cp
 
 
 def _gpairs_impl(x1, y1, z1, w1, x2, y2, z2, w2, rbins):
     dm = (
-        np.square(x2 - x1[:, None])
-        + np.square(y2 - y1[:, None])
-        + np.square(z2 - z1[:, None])
+        cp.square(x2 - x1[:, None])
+        + cp.square(y2 - y1[:, None])
+        + cp.square(z2 - z1[:, None])
     )
-    return np.array(
-        [np.outer(w1, w2)[dm <= rbins[k]].sum() for k in range(len(rbins))]
+    ret_arr = cp.array(
+        [cp.outer(w1, w2)[dm <= rbins[k]].sum() for k in range(len(rbins))]
     )
 
+    cp.cuda.stream.get_current_stream().synchronize()
+
+    return ret_arr
+
 
 def gpairs(nopt, nbins, x1, y1, z1, w1, x2, y2, z2, w2, rbins, results):
     results[:] = _gpairs_impl(x1, y1, z1, w1, x2, y2, z2, w2, rbins)
diff --git a/dpbench/benchmarks/l2_norm/l2_norm_cupy.py b/dpbench/benchmarks/l2_norm/l2_norm_cupy.py
index a0a979c5..95582772 100644
--- a/dpbench/benchmarks/l2_norm/l2_norm_cupy.py
+++ b/dpbench/benchmarks/l2_norm/l2_norm_cupy.py
@@ -2,10 +2,12 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-import cupy as np
+import cupy as cp
 
 
 def l2_norm(a, d):
-    sq = np.square(a)
+    sq = cp.square(a)
     sum = sq.sum(axis=1)
-    d[:] = np.sqrt(sum)
+    d[:] = cp.sqrt(sum)
+
+    cp.cuda.stream.get_current_stream().synchronize()
diff --git a/dpbench/benchmarks/pairwise_distance/pairwise_distance_cupy.py b/dpbench/benchmarks/pairwise_distance/pairwise_distance_cupy.py
index d4fc1110..0c534215 100644
--- a/dpbench/benchmarks/pairwise_distance/pairwise_distance_cupy.py
+++ b/dpbench/benchmarks/pairwise_distance/pairwise_distance_cupy.py
@@ -2,15 +2,17 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-import cupy as np
+import cupy as cp
 
 
 def pairwise_distance(X1, X2, D):
-    x1 = np.sum(np.square(X1), axis=1)
-    x2 = np.sum(np.square(X2), axis=1)
-    np.dot(X1, X2.T, D)
+    x1 = cp.sum(cp.square(X1), axis=1)
+    x2 = cp.sum(cp.square(X2), axis=1)
+    cp.dot(X1, X2.T, D)
     D *= -2
     x3 = x1.reshape(x1.size, 1)
-    np.add(D, x3, D)
-    np.add(D, x2, D)
-    np.sqrt(D, D)
+    cp.add(D, x3, D)
+    cp.add(D, x2, D)
+    cp.sqrt(D, D)
+
+    cp.cuda.stream.get_current_stream().synchronize()
diff --git a/dpbench/benchmarks/pca/pca_cupy.py b/dpbench/benchmarks/pca/pca_cupy.py
index 4bfb9631..f439117e 100644
--- a/dpbench/benchmarks/pca/pca_cupy.py
+++ b/dpbench/benchmarks/pca/pca_cupy.py
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-import cupy as np
+import cupy as cp
 
 
 def pca(data, dims_rescaled_data=2):
@@ -10,13 +10,13 @@ def pca(data, dims_rescaled_data=2):
     data -= data.mean(axis=0)
 
     # calculate the covariance matrix
-    v = np.cov(data, rowvar=False, dtype=data.dtype)
+    v = cp.cov(data, rowvar=False, dtype=data.dtype)
 
     # calculate eigenvectors & eigenvalues of the covariance matrix
-    evalues, evectors = np.linalg.eigh(v)
+    evalues, evectors = cp.linalg.eigh(v)
 
     # sort eigenvalues and eigenvectors in decreasing order
-    idx = np.argsort(evalues)[::-1]
+    idx = cp.argsort(evalues)[::-1]
     evectors = evectors[:, idx]
     evalues = evalues[idx]
 
@@ -25,7 +25,10 @@ def pca(data, dims_rescaled_data=2):
     evectors = evectors[:, :dims_rescaled_data]
 
     # carry out the transformation on the data using eigenvectors
-    tdata = np.dot(evectors.T, data.T).T
+    tdata = cp.dot(evectors.T, data.T).T
+
+    cp.cuda.stream.get_current_stream().synchronize()
 
     # return the transformed data, eigenvalues, and eigenvectors
+
     return tdata, evalues, evectors
diff --git a/dpbench/benchmarks/rambo/rambo_cupy.py b/dpbench/benchmarks/rambo/rambo_cupy.py
index 1ffb1b96..87b7b4b0 100644
--- a/dpbench/benchmarks/rambo/rambo_cupy.py
+++ b/dpbench/benchmarks/rambo/rambo_cupy.py
@@ -2,16 +2,18 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-import cupy as np
+import cupy as cp
 
 
 def rambo(nevts, nout, C1, F1, Q1, output):
     C = 2.0 * C1 - 1.0
-    S = np.sqrt(1 - np.square(C))
-    F = 2.0 * np.pi * F1
-    Q = -np.log(Q1)
+    S = cp.sqrt(1 - cp.square(C))
+    F = 2.0 * cp.pi * F1
+    Q = -cp.log(Q1)
 
     output[:, :, 0] = Q
-    output[:, :, 1] = Q * S * np.sin(F)
-    output[:, :, 2] = Q * S * np.cos(F)
+    output[:, :, 1] = Q * S * cp.sin(F)
+    output[:, :, 2] = Q * S * cp.cos(F)
     output[:, :, 3] = Q * C
+
+    cp.cuda.stream.get_current_stream().synchronize()