From 7b9afbfa28e9441fd3bfbe4e3c24eb62816cb1a7 Mon Sep 17 00:00:00 2001 From: ayasar70 <64045900+ayasar70@users.noreply.github.com> Date: Fri, 18 Feb 2022 14:52:18 -0800 Subject: [PATCH] [Performance][GPU] Improving _SegmentMaskColKernel (#3745) * Based on issue #3436. Improving _SegmentCopyKernel s GPU utilization by switching to nonzero based thread assignment * fixing lint issues * Update cub for cuda 11.5 compatibility (#3468) * fixing type mismatch * tx guaranteed to be smaller than nnz. Hence removing last check * minor: updating comment * adding three unit tests for csr slice method to cover some corner cases * timing repeatkernel * clean * clean * clean * updating _SegmentMaskColKernel * Working on requests: removing sorted array check and adding comments to utility functions * fixing lint issue Co-authored-by: Abdurrahman Yasar Co-authored-by: nv-dlasalle <63612878+nv-dlasalle@users.noreply.github.com> Co-authored-by: Jinjing Zhou --- src/array/cuda/csr2coo.cu | 14 +---- src/array/cuda/spmat_op_impl_csr.cu | 85 +++++++++++++++-------------- src/array/cuda/utils.h | 52 ++++++++++++++++++ 3 files changed, 100 insertions(+), 51 deletions(-) diff --git a/src/array/cuda/csr2coo.cu b/src/array/cuda/csr2coo.cu index b883fa56cdbd..001c532c3c2b 100644 --- a/src/array/cuda/csr2coo.cu +++ b/src/array/cuda/csr2coo.cu @@ -66,19 +66,11 @@ template __global__ void _RepeatKernel( const DType* val, const IdType* pos, DType* out, int64_t n_row, int64_t length) { - int tx = blockIdx.x * blockDim.x + threadIdx.x; + IdType tx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; const int stride_x = gridDim.x * blockDim.x; while (tx < length) { - IdType l = 0, r = n_row, m = 0; - while (l < r) { - m = l + (r-l)/2; - if (tx >= pos[m]) { - l = m+1; - } else { - r = m; - } - } - out[tx] = val[l-1]; + IdType i = dgl::cuda::_UpperBound(pos, n_row, tx) - 1; + out[tx] = val[i]; tx += stride_x; } } diff --git a/src/array/cuda/spmat_op_impl_csr.cu b/src/array/cuda/spmat_op_impl_csr.cu index a7a9788d4c71..dd8e28c53b0e 100644 --- a/src/array/cuda/spmat_op_impl_csr.cu +++ b/src/array/cuda/spmat_op_impl_csr.cu @@ -9,6 +9,8 @@ #include #include "../../runtime/cuda/cuda_common.h" #include "./utils.h" +#include "./atomic.cuh" +#include "./dgl_cub.cuh" namespace dgl { @@ -30,7 +32,7 @@ bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) { IdArray out = aten::NewIdArray(1, ctx, sizeof(IdType) * 8); const IdType* data = nullptr; // TODO(minjie): use binary search for sorted csr - CUDA_KERNEL_CALL(cuda::_LinearSearchKernel, + CUDA_KERNEL_CALL(dgl::cuda::_LinearSearchKernel, 1, 1, 0, thr_entry->stream, csr.indptr.Ptr(), csr.indices.Ptr(), data, rows.Ptr(), cols.Ptr(), @@ -54,11 +56,11 @@ NDArray CSRIsNonZero(CSRMatrix csr, NDArray row, NDArray col) { const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1; const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1; auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); - const int nt = cuda::FindNumThreads(rstlen); + const int nt = dgl::cuda::FindNumThreads(rstlen); const int nb = (rstlen + nt - 1) / nt; const IdType* data = nullptr; // TODO(minjie): use binary search for sorted csr - CUDA_KERNEL_CALL(cuda::_LinearSearchKernel, + CUDA_KERNEL_CALL(dgl::cuda::_LinearSearchKernel, nb, nt, 0, thr_entry->stream, csr.indptr.Ptr(), csr.indices.Ptr(), data, row.Ptr(), col.Ptr(), @@ -103,13 +105,13 @@ bool CSRHasDuplicate(CSRMatrix csr) { // We allocate a workspace of num_rows bytes. It wastes a little bit memory but should // be fine. int8_t* flags = static_cast(device->AllocWorkspace(ctx, csr.num_rows)); - const int nt = cuda::FindNumThreads(csr.num_rows); + const int nt = dgl::cuda::FindNumThreads(csr.num_rows); const int nb = (csr.num_rows + nt - 1) / nt; CUDA_KERNEL_CALL(_SegmentHasNoDuplicate, nb, nt, 0, thr_entry->stream, csr.indptr.Ptr(), csr.indices.Ptr(), csr.num_rows, flags); - bool ret = cuda::AllTrue(flags, csr.num_rows, ctx); + bool ret = dgl::cuda::AllTrue(flags, csr.num_rows, ctx); device->FreeWorkspace(ctx, flags); return !ret; } @@ -152,7 +154,7 @@ NDArray CSRGetRowNNZ(CSRMatrix csr, NDArray rows) { const IdType* indptr_data = static_cast(csr.indptr->data); NDArray rst = NDArray::Empty({len}, rows->dtype, rows->ctx); IdType* rst_data = static_cast(rst->data); - const int nt = cuda::FindNumThreads(len); + const int nt = dgl::cuda::FindNumThreads(len); const int nb = (len + nt - 1) / nt; CUDA_KERNEL_CALL(_CSRGetRowNNZKernel, nb, nt, 0, thr_entry->stream, @@ -233,19 +235,7 @@ __global__ void _SegmentCopyKernel( IdType tx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; const int stride_x = gridDim.x * blockDim.x; while (tx < length) { - // find upper bound for tx using binary search. - // out_indptr has already a prefix sum. n_row = size(out_indptr)-1 - IdType l = 0, r = n_row, m = 0; - while (l < r) { - m = l + (r-l)/2; - if (tx >= out_indptr[m]) { - l = m+1; - } else { - r = m; - } - } - - IdType rpos = l-1; + IdType rpos = dgl::cuda::_UpperBound(out_indptr, n_row, tx) - 1; IdType rofs = tx - out_indptr[rpos]; const IdType u = row[rpos]; out_data[tx] = data? data[indptr[u]+rofs] : indptr[u]+rofs; @@ -372,7 +362,7 @@ std::vector CSRGetDataAndIndices(CSRMatrix csr, NDArray row, NDArray co // Generate a 0-1 mask for matched (row, col) positions. IdArray mask = Full(0, nnz, nbits, ctx); - const int nt = cuda::FindNumThreads(len); + const int nt = dgl::cuda::FindNumThreads(len); const int nb = (len + nt - 1) / nt; CUDA_KERNEL_CALL(_SegmentMaskKernel, nb, nt, 0, thr_entry->stream, @@ -388,7 +378,7 @@ std::vector CSRGetDataAndIndices(CSRMatrix csr, NDArray row, NDArray co // Search for row index IdArray ret_row = NewIdArray(idx->shape[0], ctx, nbits); - const int nt2 = cuda::FindNumThreads(idx->shape[0]); + const int nt2 = dgl::cuda::FindNumThreads(idx->shape[0]); const int nb2 = (idx->shape[0] + nt - 1) / nt; CUDA_KERNEL_CALL(_SortedSearchKernel, nb2, nt2, 0, thr_entry->stream, @@ -415,25 +405,19 @@ template std::vector CSRGetDataAndIndices( */ template __global__ void _SegmentMaskColKernel( - const IdType* indptr, const IdType* indices, int64_t num_rows, + const IdType* indptr, const IdType* indices, int64_t num_rows, int64_t num_nnz, const IdType* col, int64_t col_len, IdType* mask, IdType* count) { - int tx = blockIdx.x * blockDim.x + threadIdx.x; + IdType tx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; const int stride_x = gridDim.x * blockDim.x; - // TODO(minjie): consider putting the col array in shared memory. - while (tx < num_rows) { - IdType cnt = 0; - for (IdType i = indptr[tx]; i < indptr[tx + 1]; ++i) { - const IdType cur_c = indices[i]; - for (int64_t j = 0; j < col_len; ++j) { - if (cur_c == col[j]) { - mask[i] = 1; - ++cnt; - break; - } - } + while (tx < num_nnz) { + IdType rpos = dgl::cuda::_UpperBound(indptr, num_rows, tx) - 1; + IdType cur_c = indices[tx]; + IdType i = dgl::cuda::_BinarySearch(col, col_len, cur_c); + if (i < col_len) { + mask[tx] = 1; + cuda::AtomicAdd(count+rpos, IdType(1)); } - count[tx] = cnt; tx += stride_x; } } @@ -464,12 +448,33 @@ CSRMatrix CSRSliceMatrix(CSRMatrix csr, runtime::NDArray rows, runtime::NDArray IdArray mask = Full(0, csr.indices->shape[0], nbits, ctx); // A count for how many masked values per row. IdArray count = NewIdArray(csr.num_rows, ctx, nbits); - const int nt = cuda::FindNumThreads(csr.num_rows); - const int nb = (csr.num_rows + nt - 1) / nt; + CUDA_CALL(cudaMemset(count.Ptr(), 0, sizeof(IdType) * (csr.num_rows))); + + const int64_t nnz_csr = csr.indices->shape[0]; + const int nt = 256; + + // In general ``cols'' array is sorted. But it is not guaranteed. + // Hence checking and sorting array first. Sorting is not in place. + auto device = runtime::DeviceAPI::Get(ctx); + auto cols_size = cols->shape[0]; + + IdArray sorted_array = NewIdArray(cols->shape[0], ctx, cols->dtype.bits); + auto ptr_sorted_cols = sorted_array.Ptr(); + auto ptr_cols = cols.Ptr(); + size_t workspace_size = 0; + cub::DeviceRadixSort::SortKeys( + nullptr, workspace_size, ptr_cols, ptr_sorted_cols, cols->shape[0]); + void *workspace = device->AllocWorkspace(ctx, workspace_size); + cub::DeviceRadixSort::SortKeys( + workspace, workspace_size, ptr_cols, ptr_sorted_cols, cols->shape[0]); + device->FreeWorkspace(ctx, workspace); + + // Execute SegmentMaskColKernel + int nb = (nnz_csr + nt - 1) / nt; CUDA_KERNEL_CALL(_SegmentMaskColKernel, nb, nt, 0, thr_entry->stream, - csr.indptr.Ptr(), csr.indices.Ptr(), csr.num_rows, - cols.Ptr(), cols->shape[0], + csr.indptr.Ptr(), csr.indices.Ptr(), csr.num_rows, nnz_csr, + ptr_sorted_cols, cols_size, mask.Ptr(), count.Ptr()); IdArray idx = AsNumBits(NonZero(mask), nbits); diff --git a/src/array/cuda/utils.h b/src/array/cuda/utils.h index 5b2297eb465f..cee2961889b6 100644 --- a/src/array/cuda/utils.h +++ b/src/array/cuda/utils.h @@ -192,6 +192,58 @@ inline DType GetCUDAScalar( return result; } +/*! + * \brief Given a sorted array and a value this function returns the index + * of the first element which compares greater than value. + * + * This function assumes 0-based index + * @param A: ascending sorted array + * @param n: size of the A + * @param x: value to search in A + * @return index, i, of the first element st. A[i]>x. If x>=A[n-1] returns n. + * if x +__device__ IdType _UpperBound(const IdType *A, int64_t n, IdType x) { + IdType l = 0, r = n, m = 0; + while (l < r) { + m = l + (r-l)/2; + if (x >= A[m]) { + l = m+1; + } else { + r = m; + } + } + return l; +} + +/*! + * \brief Given a sorted array and a value this function returns the index + * of the element who is equal to val. If not exist returns n+1 + * + * This function assumes 0-based index + * @param A: ascending sorted array + * @param n: size of the A + * @param x: value to search in A + * @return index, i, st. A[i]==x. If such an index not exists returns 'n'. + */ +template +__device__ IdType _BinarySearch(const IdType *A, int64_t n, IdType x) { + IdType l = 0, r = n-1, m = 0; + while (l <= r) { + m = l + (r-l)/2; + if (A[m] == x) { + return m; + } + if (A[m] < x) { + l = m+1; + } else { + r = m-1; + } + } + return n; // not found +} + } // namespace cuda } // namespace dgl