From 7b9afbfa28e9441fd3bfbe4e3c24eb62816cb1a7 Mon Sep 17 00:00:00 2001
From: ayasar70 <64045900+ayasar70@users.noreply.github.com>
Date: Fri, 18 Feb 2022 14:52:18 -0800
Subject: [PATCH] [Performance][GPU] Improving _SegmentMaskColKernel (#3745)

* Based on issue #3436. Improving _SegmentCopyKernel s GPU utilization by switching to nonzero based thread assignment

* fixing lint issues

* Update cub for cuda 11.5 compatibility (#3468)

* fixing type mismatch

* tx guaranteed to be smaller than nnz. Hence removing last check

* minor: updating comment

* adding three unit tests for csr slice method to cover some corner cases

* timing repeatkernel

* clean

* clean

* clean

* updating _SegmentMaskColKernel

* Working on requests: removing sorted array check and adding comments to utility functions

* fixing lint issue

Co-authored-by: Abdurrahman Yasar <ayasar@nvidia.com>
Co-authored-by: nv-dlasalle <63612878+nv-dlasalle@users.noreply.github.com>
Co-authored-by: Jinjing Zhou <VoVAllen@users.noreply.github.com>
---
 src/array/cuda/csr2coo.cu           | 14 +----
 src/array/cuda/spmat_op_impl_csr.cu | 85 +++++++++++++++--------------
 src/array/cuda/utils.h              | 52 ++++++++++++++++++
 3 files changed, 100 insertions(+), 51 deletions(-)
diff --git a/src/array/cuda/csr2coo.cu b/src/array/cuda/csr2coo.cu
index b883fa56cdbd..001c532c3c2b 100644
--- a/src/array/cuda/csr2coo.cu
+++ b/src/array/cuda/csr2coo.cu
@@ -66,19 +66,11 @@ template <typename DType, typename IdType>
 __global__ void _RepeatKernel(
     const DType* val, const IdType* pos,
     DType* out, int64_t n_row, int64_t length) {
-  int tx = blockIdx.x * blockDim.x + threadIdx.x;
+  IdType tx = static_cast<IdType>(blockIdx.x) * blockDim.x + threadIdx.x;
   const int stride_x = gridDim.x * blockDim.x;
   while (tx < length) {
-    IdType l = 0, r = n_row, m = 0;
-    while (l < r) {
-      m = l + (r-l)/2;
-      if (tx >= pos[m]) {
-        l = m+1;
-      } else {
-        r = m;
-      }
-    }
-    out[tx] = val[l-1];
+    IdType i = dgl::cuda::_UpperBound(pos, n_row, tx) - 1;
+    out[tx] = val[i];
     tx += stride_x;
   }
 }
diff --git a/src/array/cuda/spmat_op_impl_csr.cu b/src/array/cuda/spmat_op_impl_csr.cu
index a7a9788d4c71..dd8e28c53b0e 100644
--- a/src/array/cuda/spmat_op_impl_csr.cu
+++ b/src/array/cuda/spmat_op_impl_csr.cu
@@ -9,6 +9,8 @@
 #include <numeric>
 #include "../../runtime/cuda/cuda_common.h"
 #include "./utils.h"
+#include "./atomic.cuh"
+#include "./dgl_cub.cuh"
 
 namespace dgl {
 
@@ -30,7 +32,7 @@ bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) {
   IdArray out = aten::NewIdArray(1, ctx, sizeof(IdType) * 8);
   const IdType* data = nullptr;
   // TODO(minjie): use binary search for sorted csr
-  CUDA_KERNEL_CALL(cuda::_LinearSearchKernel,
+  CUDA_KERNEL_CALL(dgl::cuda::_LinearSearchKernel,
       1, 1, 0, thr_entry->stream,
       csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(), data,
       rows.Ptr<IdType>(), cols.Ptr<IdType>(),
@@ -54,11 +56,11 @@ NDArray CSRIsNonZero(CSRMatrix csr, NDArray row, NDArray col) {
   const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
   const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
   auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
-  const int nt = cuda::FindNumThreads(rstlen);
+  const int nt = dgl::cuda::FindNumThreads(rstlen);
   const int nb = (rstlen + nt - 1) / nt;
   const IdType* data = nullptr;
   // TODO(minjie): use binary search for sorted csr
-  CUDA_KERNEL_CALL(cuda::_LinearSearchKernel,
+  CUDA_KERNEL_CALL(dgl::cuda::_LinearSearchKernel,
       nb, nt, 0, thr_entry->stream,
       csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(), data,
       row.Ptr<IdType>(), col.Ptr<IdType>(),
@@ -103,13 +105,13 @@ bool CSRHasDuplicate(CSRMatrix csr) {
   // We allocate a workspace of num_rows bytes. It wastes a little bit memory but should
   // be fine.
   int8_t* flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, csr.num_rows));
-  const int nt = cuda::FindNumThreads(csr.num_rows);
+  const int nt = dgl::cuda::FindNumThreads(csr.num_rows);
   const int nb = (csr.num_rows + nt - 1) / nt;
   CUDA_KERNEL_CALL(_SegmentHasNoDuplicate,
       nb, nt, 0, thr_entry->stream,
       csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
       csr.num_rows, flags);
-  bool ret = cuda::AllTrue(flags, csr.num_rows, ctx);
+  bool ret = dgl::cuda::AllTrue(flags, csr.num_rows, ctx);
   device->FreeWorkspace(ctx, flags);
   return !ret;
 }
@@ -152,7 +154,7 @@ NDArray CSRGetRowNNZ(CSRMatrix csr, NDArray rows) {
   const IdType* indptr_data = static_cast<IdType*>(csr.indptr->data);
   NDArray rst = NDArray::Empty({len}, rows->dtype, rows->ctx);
   IdType* rst_data = static_cast<IdType*>(rst->data);
-  const int nt = cuda::FindNumThreads(len);
+  const int nt = dgl::cuda::FindNumThreads(len);
   const int nb = (len + nt - 1) / nt;
   CUDA_KERNEL_CALL(_CSRGetRowNNZKernel,
       nb, nt, 0, thr_entry->stream,
@@ -233,19 +235,7 @@ __global__ void _SegmentCopyKernel(
   IdType tx = static_cast<IdType>(blockIdx.x) * blockDim.x + threadIdx.x;
   const int stride_x = gridDim.x * blockDim.x;
   while (tx < length) {
-    // find upper bound for tx using binary search.
-    // out_indptr has already a prefix sum. n_row = size(out_indptr)-1
-    IdType l = 0, r = n_row, m = 0;
-    while (l < r) {
-      m = l + (r-l)/2;
-      if (tx >= out_indptr[m]) {
-        l = m+1;
-      } else {
-        r = m;
-      }
-    }
-
-    IdType rpos = l-1;
+    IdType rpos = dgl::cuda::_UpperBound(out_indptr, n_row, tx) - 1;
     IdType rofs = tx - out_indptr[rpos];
     const IdType u = row[rpos];
     out_data[tx] = data? data[indptr[u]+rofs] : indptr[u]+rofs;
@@ -372,7 +362,7 @@ std::vector<NDArray> CSRGetDataAndIndices(CSRMatrix csr, NDArray row, NDArray co
 
   // Generate a 0-1 mask for matched (row, col) positions.
   IdArray mask = Full(0, nnz, nbits, ctx);
-  const int nt = cuda::FindNumThreads(len);
+  const int nt = dgl::cuda::FindNumThreads(len);
   const int nb = (len + nt - 1) / nt;
   CUDA_KERNEL_CALL(_SegmentMaskKernel,
       nb, nt, 0, thr_entry->stream,
@@ -388,7 +378,7 @@ std::vector<NDArray> CSRGetDataAndIndices(CSRMatrix csr, NDArray row, NDArray co
 
   // Search for row index
   IdArray ret_row = NewIdArray(idx->shape[0], ctx, nbits);
-  const int nt2 = cuda::FindNumThreads(idx->shape[0]);
+  const int nt2 = dgl::cuda::FindNumThreads(idx->shape[0]);
   const int nb2 = (idx->shape[0] + nt - 1) / nt;
   CUDA_KERNEL_CALL(_SortedSearchKernel,
       nb2, nt2, 0, thr_entry->stream,
@@ -415,25 +405,19 @@ template std::vector<NDArray> CSRGetDataAndIndices<kDLGPU, int64_t>(
  */
 template <typename IdType>
 __global__ void _SegmentMaskColKernel(
-    const IdType* indptr, const IdType* indices, int64_t num_rows,
+    const IdType* indptr, const IdType* indices, int64_t num_rows, int64_t num_nnz,
     const IdType* col, int64_t col_len,
     IdType* mask, IdType* count) {
-  int tx = blockIdx.x * blockDim.x + threadIdx.x;
+  IdType tx = static_cast<IdType>(blockIdx.x) * blockDim.x + threadIdx.x;
   const int stride_x = gridDim.x * blockDim.x;
-  // TODO(minjie): consider putting the col array in shared memory.
-  while (tx < num_rows) {
-    IdType cnt = 0;
-    for (IdType i = indptr[tx]; i < indptr[tx + 1]; ++i) {
-      const IdType cur_c = indices[i];
-      for (int64_t j = 0; j < col_len; ++j) {
-        if (cur_c == col[j]) {
-          mask[i] = 1;
-          ++cnt;
-          break;
-        }
-      }
+  while (tx < num_nnz) {
+    IdType rpos = dgl::cuda::_UpperBound(indptr, num_rows, tx) - 1;
+    IdType cur_c = indices[tx];
+    IdType i = dgl::cuda::_BinarySearch(col, col_len, cur_c);
+    if (i < col_len) {
+      mask[tx] = 1;
+      cuda::AtomicAdd(count+rpos, IdType(1));
     }
-    count[tx] = cnt;
     tx += stride_x;
   }
 }
@@ -464,12 +448,33 @@ CSRMatrix CSRSliceMatrix(CSRMatrix csr, runtime::NDArray rows, runtime::NDArray
   IdArray mask = Full(0, csr.indices->shape[0], nbits, ctx);
   // A count for how many masked values per row.
   IdArray count = NewIdArray(csr.num_rows, ctx, nbits);
-  const int nt = cuda::FindNumThreads(csr.num_rows);
-  const int nb = (csr.num_rows + nt - 1) / nt;
+  CUDA_CALL(cudaMemset(count.Ptr<IdType>(), 0, sizeof(IdType) * (csr.num_rows)));
+
+  const int64_t nnz_csr = csr.indices->shape[0];
+  const int nt = 256;
+
+  // In general ``cols'' array is sorted. But it is not guaranteed.
+  // Hence checking and sorting array first. Sorting is not in place.
+  auto device = runtime::DeviceAPI::Get(ctx);
+  auto cols_size = cols->shape[0];
+
+  IdArray sorted_array = NewIdArray(cols->shape[0], ctx, cols->dtype.bits);
+  auto ptr_sorted_cols = sorted_array.Ptr<IdType>();
+  auto ptr_cols = cols.Ptr<IdType>();
+  size_t workspace_size = 0;
+  cub::DeviceRadixSort::SortKeys(
+    nullptr, workspace_size, ptr_cols, ptr_sorted_cols, cols->shape[0]);
+  void *workspace = device->AllocWorkspace(ctx, workspace_size);
+  cub::DeviceRadixSort::SortKeys(
+    workspace, workspace_size, ptr_cols, ptr_sorted_cols, cols->shape[0]);
+  device->FreeWorkspace(ctx, workspace);
+
+  // Execute SegmentMaskColKernel
+  int nb = (nnz_csr + nt - 1) / nt;
   CUDA_KERNEL_CALL(_SegmentMaskColKernel,
       nb, nt, 0, thr_entry->stream,
-      csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(), csr.num_rows,
-      cols.Ptr<IdType>(), cols->shape[0],
+      csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(), csr.num_rows, nnz_csr,
+      ptr_sorted_cols, cols_size,
       mask.Ptr<IdType>(), count.Ptr<IdType>());
 
   IdArray idx = AsNumBits(NonZero(mask), nbits);
diff --git a/src/array/cuda/utils.h b/src/array/cuda/utils.h
index 5b2297eb465f..cee2961889b6 100644
--- a/src/array/cuda/utils.h
+++ b/src/array/cuda/utils.h
@@ -192,6 +192,58 @@ inline DType GetCUDAScalar(
   return result;
 }
 
+/*!
+ * \brief Given a sorted array and a value this function returns the index
+ * of the first element which compares greater than value.
+ *
+ * This function assumes 0-based index
+ * @param A: ascending sorted array
+ * @param n: size of the A
+ * @param x: value to search in A
+ * @return index, i, of the first element st. A[i]>x. If x>=A[n-1] returns n.
+ * if x<A[0] then it returns 0.
+ */
+template <typename IdType>
+__device__ IdType _UpperBound(const IdType *A, int64_t n, IdType x) {
+  IdType l = 0, r = n, m = 0;
+  while (l < r) {
+    m = l + (r-l)/2;
+    if (x >= A[m]) {
+      l = m+1;
+    } else {
+      r = m;
+    }
+  }
+  return l;
+}
+
+/*!
+ * \brief Given a sorted array and a value this function returns the index
+ * of the element who is equal to val. If not exist returns n+1
+ *
+ * This function assumes 0-based index
+ * @param A: ascending sorted array
+ * @param n: size of the A
+ * @param x: value to search in A
+ * @return index, i, st. A[i]==x. If such an index not exists returns 'n'.
+ */
+template <typename IdType>
+__device__ IdType _BinarySearch(const IdType *A, int64_t n, IdType x) {
+  IdType l = 0, r = n-1, m = 0;
+  while (l <= r) {
+    m = l + (r-l)/2;
+    if (A[m] == x) {
+      return m;
+    }
+    if (A[m] < x) {
+      l = m+1;
+    } else {
+      r = m-1;
+    }
+  }
+  return n;  // not found
+}
+
 }  // namespace cuda
 }  // namespace dgl