From ba7c7a62debcbef77e38c91580a221676a0971fb Mon Sep 17 00:00:00 2001
From: Adarsh Yoga <adarsh.yoga@intel.com>
Date: Mon, 12 Feb 2024 19:32:21 +0000
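Subject: [PATCH] Add stream synchronization to cupy implementations to
 capture execution times accurately

Each touched implementation now ends its computation with
cp.cuda.stream.get_current_stream().synchronize(), and the cupy import
alias is renamed from `np` to `cp`.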

---
 .../black_scholes/black_scholes_cupy.py          | 10 ++++++----
 dpbench/benchmarks/gpairs/gpairs_cupy.py         | 16 ++++++++++------
 dpbench/benchmarks/l2_norm/l2_norm_cupy.py       |  8 +++++---
 .../pairwise_distance/pairwise_distance_cupy.py  | 16 +++++++++-------
 dpbench/benchmarks/pca/pca_cupy.py               | 13 ++++++++-----
 dpbench/benchmarks/rambo/rambo_cupy.py           | 14 ++++++++------
 6 files changed, 46 insertions(+), 31 deletions(-)

diff --git a/dpbench/benchmarks/black_scholes/black_scholes_cupy.py b/dpbench/benchmarks/black_scholes/black_scholes_cupy.py
index 927d71cb..8cf4650e 100644
--- a/dpbench/benchmarks/black_scholes/black_scholes_cupy.py
+++ b/dpbench/benchmarks/black_scholes/black_scholes_cupy.py
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-import cupy as np
+import cupy as cp
 from scipy.special import erf
 
 
@@ -14,12 +14,12 @@ def black_scholes(nopt, price, strike, t, rate, volatility, call, put):
     S = strike
     T = t
 
-    a = np.log(P / S)
+    a = cp.log(P / S)
     b = T * mr
 
     z = T * sig_sig_two
     c = 0.25 * z
-    y = np.true_divide(1.0, np.sqrt(z))
+    y = cp.true_divide(1.0, cp.sqrt(z))
 
     w1 = (a - b + c) * y
     w2 = (a - b - c) * y
@@ -27,7 +27,9 @@ def black_scholes(nopt, price, strike, t, rate, volatility, call, put):
     d1 = 0.5 + 0.5 * erf(w1)
     d2 = 0.5 + 0.5 * erf(w2)
 
-    Se = np.exp(b) * S
+    Se = cp.exp(b) * S
 
     call[:] = P * d1 - Se * d2
     put[:] = call - P + Se
+
+    cp.cuda.stream.get_current_stream().synchronize()
diff --git a/dpbench/benchmarks/gpairs/gpairs_cupy.py b/dpbench/benchmarks/gpairs/gpairs_cupy.py
index 8e26f2cb..f0a58074 100644
--- a/dpbench/benchmarks/gpairs/gpairs_cupy.py
+++ b/dpbench/benchmarks/gpairs/gpairs_cupy.py
@@ -2,19 +2,23 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-import cupy as np
+import cupy as cp
 
 
 def _gpairs_impl(x1, y1, z1, w1, x2, y2, z2, w2, rbins):
     dm = (
-        np.square(x2 - x1[:, None])
-        + np.square(y2 - y1[:, None])
-        + np.square(z2 - z1[:, None])
+        cp.square(x2 - x1[:, None])
+        + cp.square(y2 - y1[:, None])
+        + cp.square(z2 - z1[:, None])
     )
-    return np.array(
-        [np.outer(w1, w2)[dm <= rbins[k]].sum() for k in range(len(rbins))]
+    ret_arr = cp.array(
+        [cp.outer(w1, w2)[dm <= rbins[k]].sum() for k in range(len(rbins))]
     )
 
+    cp.cuda.stream.get_current_stream().synchronize()
+
+    return ret_arr
+
 
 def gpairs(nopt, nbins, x1, y1, z1, w1, x2, y2, z2, w2, rbins, results):
     results[:] = _gpairs_impl(x1, y1, z1, w1, x2, y2, z2, w2, rbins)
diff --git a/dpbench/benchmarks/l2_norm/l2_norm_cupy.py b/dpbench/benchmarks/l2_norm/l2_norm_cupy.py
index a0a979c5..95582772 100644
--- a/dpbench/benchmarks/l2_norm/l2_norm_cupy.py
+++ b/dpbench/benchmarks/l2_norm/l2_norm_cupy.py
@@ -2,10 +2,12 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-import cupy as np
+import cupy as cp
 
 
 def l2_norm(a, d):
-    sq = np.square(a)
+    sq = cp.square(a)
     sum = sq.sum(axis=1)
-    d[:] = np.sqrt(sum)
+    d[:] = cp.sqrt(sum)
+
+    cp.cuda.stream.get_current_stream().synchronize()
diff --git a/dpbench/benchmarks/pairwise_distance/pairwise_distance_cupy.py b/dpbench/benchmarks/pairwise_distance/pairwise_distance_cupy.py
index d4fc1110..0c534215 100644
--- a/dpbench/benchmarks/pairwise_distance/pairwise_distance_cupy.py
+++ b/dpbench/benchmarks/pairwise_distance/pairwise_distance_cupy.py
@@ -2,15 +2,17 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-import cupy as np
+import cupy as cp
 
 
 def pairwise_distance(X1, X2, D):
-    x1 = np.sum(np.square(X1), axis=1)
-    x2 = np.sum(np.square(X2), axis=1)
-    np.dot(X1, X2.T, D)
+    x1 = cp.sum(cp.square(X1), axis=1)
+    x2 = cp.sum(cp.square(X2), axis=1)
+    cp.dot(X1, X2.T, D)
     D *= -2
     x3 = x1.reshape(x1.size, 1)
-    np.add(D, x3, D)
-    np.add(D, x2, D)
-    np.sqrt(D, D)
+    cp.add(D, x3, D)
+    cp.add(D, x2, D)
+    cp.sqrt(D, D)
+
+    cp.cuda.stream.get_current_stream().synchronize()
diff --git a/dpbench/benchmarks/pca/pca_cupy.py b/dpbench/benchmarks/pca/pca_cupy.py
index 4bfb9631..f439117e 100644
--- a/dpbench/benchmarks/pca/pca_cupy.py
+++ b/dpbench/benchmarks/pca/pca_cupy.py
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-import cupy as np
+import cupy as cp
 
 
 def pca(data, dims_rescaled_data=2):
@@ -10,13 +10,13 @@ def pca(data, dims_rescaled_data=2):
     data -= data.mean(axis=0)
 
     # calculate the covariance matrix
-    v = np.cov(data, rowvar=False, dtype=data.dtype)
+    v = cp.cov(data, rowvar=False, dtype=data.dtype)
 
     # calculate eigenvectors & eigenvalues of the covariance matrix
-    evalues, evectors = np.linalg.eigh(v)
+    evalues, evectors = cp.linalg.eigh(v)
 
     # sort eigenvalues and eigenvectors in decreasing order
-    idx = np.argsort(evalues)[::-1]
+    idx = cp.argsort(evalues)[::-1]
     evectors = evectors[:, idx]
     evalues = evalues[idx]
 
@@ -25,7 +25,10 @@ def pca(data, dims_rescaled_data=2):
     evectors = evectors[:, :dims_rescaled_data]
 
     # carry out the transformation on the data using eigenvectors
-    tdata = np.dot(evectors.T, data.T).T
+    tdata = cp.dot(evectors.T, data.T).T
+
+    cp.cuda.stream.get_current_stream().synchronize()
 
     # return the transformed data, eigenvalues, and eigenvectors
+
     return tdata, evalues, evectors
diff --git a/dpbench/benchmarks/rambo/rambo_cupy.py b/dpbench/benchmarks/rambo/rambo_cupy.py
index 1ffb1b96..87b7b4b0 100644
--- a/dpbench/benchmarks/rambo/rambo_cupy.py
+++ b/dpbench/benchmarks/rambo/rambo_cupy.py
@@ -2,16 +2,18 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-import cupy as np
+import cupy as cp
 
 
 def rambo(nevts, nout, C1, F1, Q1, output):
     C = 2.0 * C1 - 1.0
-    S = np.sqrt(1 - np.square(C))
-    F = 2.0 * np.pi * F1
-    Q = -np.log(Q1)
+    S = cp.sqrt(1 - cp.square(C))
+    F = 2.0 * cp.pi * F1
+    Q = -cp.log(Q1)
 
     output[:, :, 0] = Q
-    output[:, :, 1] = Q * S * np.sin(F)
-    output[:, :, 2] = Q * S * np.cos(F)
+    output[:, :, 1] = Q * S * cp.sin(F)
+    output[:, :, 2] = Q * S * cp.cos(F)
     output[:, :, 3] = Q * C
+
+    cp.cuda.stream.get_current_stream().synchronize()