diff --git a/resolve/cuda/CMakeLists.txt b/resolve/cuda/CMakeLists.txt
index 225ea3c6..a22fb2f1 100644
--- a/resolve/cuda/CMakeLists.txt
+++ b/resolve/cuda/CMakeLists.txt
@@ -8,13 +8,12 @@ set(ReSolve_CUDA_SRC
   cudaKernels.cu
-  cudaVectorKernels.cu
+  VectorKernels.cu
   MemoryUtils.cu
 )
 
 set(ReSolve_CUDA_HEADER_INSTALL
   cudaKernels.h
-  cudaVectorKernels.h
   CudaMemory.hpp
   cuda_check_errors.hpp
 )
diff --git a/resolve/cuda/VectorKernels.cu b/resolve/cuda/VectorKernels.cu
new file mode 100644
index 00000000..c10a4a7a
--- /dev/null
+++ b/resolve/cuda/VectorKernels.cu
@@ -0,0 +1,47 @@
+/**
+ * @file VectorKernels.cu
+ * @author Slaven Peles (peless@ornl.gov)
+ * @brief Contains implementation of CUDA vector kernels and their wrappers.
+ * @date 2023-12-08
+ *
+ * @note Kernel wrappers implemented here are intended for use in hardware
+ * agnostic code.
+ */
+
+#include
+
+
+namespace ReSolve { namespace vector {
+
+  namespace kernels {
+
+    /**
+     * @brief CUDA kernel that sets values of an array to a constant.
+     *
+     * @param[in] n - length of the array
+     * @param[in] val - the value the array is set to
+     * @param[out] arr - a pointer to the array
+     *
+     * @pre `arr` is allocated to size `n`
+     * @post `arr` elements are set to `val`
+     */
+    __global__ void set_const(index_type n, real_type val, real_type* arr)
+    {
+      index_type i = blockIdx.x * blockDim.x + threadIdx.x;
+      if(i < n)
+      {
+        arr[i] = val;
+      }
+    }
+
+  } // namespace kernels
+
+  void set_array_const(index_type n, real_type val, real_type* arr)
+  {
+    index_type num_blocks;
+    index_type block_size = 512;
+    num_blocks = (n + block_size - 1) / block_size;
+    kernels::set_const<<<num_blocks, block_size>>>(n, val, arr);
+  }
+
+}} // namespace ReSolve::vector
\ No newline at end of file
diff --git a/resolve/cuda/cudaKernels.cu b/resolve/cuda/cudaKernels.cu
index 4773464c..fe88bb91 100644
--- a/resolve/cuda/cudaKernels.cu
+++ b/resolve/cuda/cudaKernels.cu
@@ -1,372 +1,528 @@
+/**
+ * @file cudaKernels.cu
+ * @author Kasia Swirydowicz (kasia.swirydowicz@pnnl.gov)
+ * @brief
+ * @date 2023-12-08
+ *
+ *
+ */
+
 #include "cudaKernels.h"
 #include
-#define maxk 1024
-#define Tv5 1024
-//computes V^T[u1 u2] where v is n x k and u1 and u2 are nx1
-__global__ void MassIPTwoVec_kernel(const double* __restrict__ u1,
-                                    const double* __restrict__ u2,
-                                    const double* __restrict__ v,
-                                    double* result,
-                                    const int k,
-                                    const int N)
-{
-  int t = threadIdx.x;
-  int bsize = blockDim.x;
-
-  // assume T threads per thread block (and k reductions to be performed)
-  volatile __shared__ double s_tmp1[Tv5];
-
-  volatile __shared__ double s_tmp2[Tv5];
-  // map between thread index space and the problem index space
-  int j = blockIdx.x;
-  s_tmp1[t] = 0.0f;
-  s_tmp2[t] = 0.0f;
-  int nn = t;
-  double can1, can2, cbn;
-
-  while(nn < N) {
-    can1 = u1[nn];
-    can2 = u2[nn];
-
-    cbn = v[N * j + nn];
-    s_tmp1[t] += can1 * cbn;
-    s_tmp2[t] += can2 * cbn;
-
-    nn += bsize;
-  }
-  __syncthreads();
-  if(Tv5 >= 1024) {
-    if(t < 512) {
-      s_tmp1[t] += s_tmp1[t + 512];
-      s_tmp2[t] += s_tmp2[t + 512];
-    }
-    __syncthreads();
-  }
-  if(Tv5 >= 512) {
-    if(t < 256) {
-      s_tmp1[t] += s_tmp1[t + 256];
-      s_tmp2[t] += s_tmp2[t + 256];
+namespace ReSolve {
+  namespace kernels {
+
+    /**
+     * @brief Computes v^T * [u1 u2] where v is n x k multivector
+     * and u1 and u2 are n x 1 vectors.
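// Reference reading of the kernel that follows (an annotation, assuming v is
// stored column-major with leading dimension N): thread block j computes the
// two dot products against column j of v,
//   result[j]     = sum_{i=0..N-1} v[j*N + i] * u1[i],   j = 0, ..., k-1
//   result[j + k] = sum_{i=0..N-1} v[j*N + i] * u2[i],
// with each block's partial sums combined by the shared-memory tree reduction below.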
+ * + * @tparam Tv5 - Size of shared memory + * + * @param[in] u1 - (n x 1) vector + * @param[in] u2 - (n x 1) vector + * @param[in] v - (n x k) multivector + * @param[out] result - (k x 2) multivector + * @param[in] k - dimension of the subspace + * @param[in] N - size of vectors u1, u2 + */ + template + __global__ void MassIPTwoVec(const real_type* __restrict__ u1, + const real_type* __restrict__ u2, + const real_type* __restrict__ v, + real_type* result, + const index_type k, + const index_type N) + { + index_type t = threadIdx.x; + index_type bsize = blockDim.x; + + // assume T threads per thread block (and k reductions to be performed) + volatile __shared__ real_type s_tmp1[Tv5]; + volatile __shared__ real_type s_tmp2[Tv5]; + + // map between thread index space and the problem index space + index_type j = blockIdx.x; + s_tmp1[t] = 0.0; + s_tmp2[t] = 0.0; + index_type nn = t; + real_type can1, can2, cbn; + + while(nn < N) { + can1 = u1[nn]; + can2 = u2[nn]; + + cbn = v[N * j + nn]; + s_tmp1[t] += can1 * cbn; + s_tmp2[t] += can2 * cbn; + + nn += bsize; + } + + __syncthreads(); + + if(Tv5 >= 1024) { + if(t < 512) { + s_tmp1[t] += s_tmp1[t + 512]; + s_tmp2[t] += s_tmp2[t + 512]; + } + __syncthreads(); + } + if(Tv5 >= 512) { + if(t < 256) { + s_tmp1[t] += s_tmp1[t + 256]; + s_tmp2[t] += s_tmp2[t + 256]; + } + __syncthreads(); + } + { + if(t < 128) { + s_tmp1[t] += s_tmp1[t + 128]; + s_tmp2[t] += s_tmp2[t + 128]; + } + __syncthreads(); + } + { + if(t < 64) { + s_tmp1[t] += s_tmp1[t + 64]; + s_tmp2[t] += s_tmp2[t + 64]; + } + __syncthreads(); + } + + if(t < 32) { + s_tmp1[t] += s_tmp1[t + 32]; + s_tmp2[t] += s_tmp2[t + 32]; + + s_tmp1[t] += s_tmp1[t + 16]; + s_tmp2[t] += s_tmp2[t + 16]; + + s_tmp1[t] += s_tmp1[t + 8]; + s_tmp2[t] += s_tmp2[t + 8]; + + s_tmp1[t] += s_tmp1[t + 4]; + s_tmp2[t] += s_tmp2[t + 4]; + + s_tmp1[t] += s_tmp1[t + 2]; + s_tmp2[t] += s_tmp2[t + 2]; + + s_tmp1[t] += s_tmp1[t + 1]; + s_tmp2[t] += s_tmp2[t + 1]; + } + if(t == 0) { + result[blockIdx.x] = s_tmp1[0]; + result[blockIdx.x + k] = s_tmp2[0]; + } } - __syncthreads(); - } - { - if(t < 128) { - s_tmp1[t] += s_tmp1[t + 128]; - s_tmp2[t] += s_tmp2[t + 128]; - } - __syncthreads(); - } - { - if(t < 64) { - s_tmp1[t] += s_tmp1[t + 64]; - s_tmp2[t] += s_tmp2[t + 64]; - } - __syncthreads(); - } - if(t < 32) { - s_tmp1[t] += s_tmp1[t + 32]; - s_tmp2[t] += s_tmp2[t + 32]; - s_tmp1[t] += s_tmp1[t + 16]; - s_tmp2[t] += s_tmp2[t + 16]; + /** + * @brief AXPY y = y - x*alpha where alpha is [k x 1], and x is [N x k] needed in 1 and 2 synch GMRES + * + * @tparam Tmaxk + * + * @param[in] N - number of rows in x + * @param[in] k - number of columns in x + * @param[in] x_data - double array, size [N x k] + * @param[out] y_data - double array, size [N x 1] + * @param[in] alpha - doble array, size [k x 1] + */ + template + __global__ void massAxpy3(index_type N, + index_type k, + const real_type* x_data, + real_type* y_data, + const real_type* alpha) + { + index_type i = blockIdx.x * blockDim.x + threadIdx.x; + index_type t = threadIdx.x; + __shared__ real_type s_alpha[Tmaxk]; + + if(t < k) { + s_alpha[t] = alpha[t]; + } + __syncthreads(); + + if(i < N) { + real_type temp = 0.0; + for(index_type j = 0; j < k; ++j) { + temp += x_data[j * N + i] * s_alpha[j]; + } + y_data[i] -= temp; + } + } - s_tmp1[t] += s_tmp1[t + 8]; - s_tmp2[t] += s_tmp2[t + 8]; + /** + * @brief Pass through matrix rows and sum each as \sum_{j=1}^m abs(a_{ij}) + * + * @param[in] n - number of rows in the matrix. 
+ * @param[in] nnz - number of non-zero values in the matrix + * @param[in] a_ia - row pointers (CSR storage) + * @param[in] a_val - values (CSR storage) + * @param[out] result - array size [n x 1] containing sums of values in each row. + */ + __global__ void matrixInfNormPart1(const index_type n, + const index_type nnz, + const index_type* a_ia, + const real_type* a_val, + real_type* result) + { + index_type idx = blockIdx.x * blockDim.x + threadIdx.x; + while (idx < n) { + real_type sum = 0.0; + for (index_type i = a_ia[idx]; i < a_ia[idx + 1]; ++i) { + sum = sum + fabs(a_val[i]); + } + result[idx] = sum; + idx += (blockDim.x * gridDim.x); + } + } - s_tmp1[t] += s_tmp1[t + 4]; - s_tmp2[t] += s_tmp2[t + 4]; + /** + * @brief For count sketch in sketching random method + * + * @param[in] n - number of entries in input vector + * @param[in] k - number of entries in output vector (k < n) + * @param[in] labels - array size [n x 1] containing integers from 0 to k-1, assigned randomly. + * @param[in] flip - array size [n x 1] containing values `1` and `-1` + * @param[in] input - input vector, size [n x 1] + * @param[out] output - output vector, size [k x 1] (this vector must be allocated and initialized with `0`s prior to calling the kernel). + */ + __global__ void count_sketch(const index_type n, + const index_type k, + const index_type* labels, + const index_type* flip, + const real_type* input, + real_type* output) + { + index_type idx = blockIdx.x * blockDim.x + threadIdx.x; + while (idx < n) { + real_type val = input[idx]; + if (flip[idx] != 1) { + val *= -1.0; + } + atomicAdd(&output[labels[idx]], val); + idx += blockDim.x * gridDim.x; + } + } - s_tmp1[t] += s_tmp1[t + 2]; - s_tmp2[t] += s_tmp2[t + 2]; + /** + * @brief Walsh-Hadamard transform (select) + * + * @param[in] k - + * @param[in] perm - + * @param[in] input - + * @param[out] output - + */ + __global__ void select(const index_type k, + const index_type* perm, + const real_type* input, + real_type* output){ + + index_type idx = blockIdx.x * blockDim.x + threadIdx.x; + while (idx < k) { + output[idx] = input[perm[idx]]; + idx += blockDim.x * gridDim.x; + } + } - s_tmp1[t] += s_tmp1[t + 1]; - s_tmp2[t] += s_tmp2[t + 1]; - } - if(t == 0) { - result[blockIdx.x] = s_tmp1[0]; - result[blockIdx.x + k] = s_tmp2[0]; - } -} + /** + * @brief Walsh-Hadamard transform (scale) + * + * @param[in] n - + * @param[in] D - + * @param[in] x - + * @param[out] y - + */ + __global__ void scaleByD(const index_type n, + const index_type* D, + const real_type* x, + real_type* y) + { + index_type idx = blockIdx.x * blockDim.x + threadIdx.x; + + while (idx < n) { + + if (D[idx] == 1) { + y[idx] = x[idx]; + } else { + y[idx] = (-1.0) * x[idx]; + } + + idx += blockDim.x * gridDim.x; + } + } + /** + * @brief Single in-global memory radix-4 Fast Walsh Transform pass + * (for strides exceeding elementary vector size). 
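// Worked example of the index pattern used below (illustrative): with stride = 4
// and pos = 5, lo = 5 & 3 = 1 and i0 = ((5 - 1) << 2) + 1 = 17, so one thread
// updates elements 17, 21, 25, 29. The update is the 4-point Hadamard butterfly:
//   out[i0] = D0 + D1 + D2 + D3,   out[i1] = D0 - D1 + D2 - D3,
//   out[i2] = D0 + D1 - D2 - D3,   out[i3] = D0 - D1 - D2 + D3.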
+ * + * @param d_Output - + * @param d_Input - + * @param stride - + */ + __global__ void fwtBatch2Kernel(real_type* d_Output, real_type* d_Input, index_type stride) + { + const index_type pos = blockIdx.x * blockDim.x + threadIdx.x; + const index_type N = blockDim.x * gridDim.x * 4; + + real_type* d_Src = d_Input + blockIdx.y * N; + real_type* d_Dst = d_Output + blockIdx.y * N; + + index_type lo = pos & (stride - 1); + index_type i0 = ((pos - lo) << 2) + lo; + index_type i1 = i0 + stride; + index_type i2 = i1 + stride; + index_type i3 = i2 + stride; + + real_type D0 = d_Src[i0]; + real_type D1 = d_Src[i1]; + real_type D2 = d_Src[i2]; + real_type D3 = d_Src[i3]; + + real_type T; + T = D0; + D0 = D0 + D2; + D2 = T - D2; + T = D1; + D1 = D1 + D3; + D3 = T - D3; + T = D0; + d_Dst[i0] = D0 + D1; + d_Dst[i1] = T - D1; + T = D2; + d_Dst[i2] = D2 + D3; + d_Dst[i3] = T - D3; + } -//mass AXPY i.e y = y - x*alpha where alpha is [k x 1], needed in 1 and 2 synch GMRES -__global__ void massAxpy3_kernel(int N, - int k, - const double* x_data, - double* y_data, - const double* alpha) { + /** + * @brief + * + * @param d_Output - + * @param d_Input - + * @param log2N - + * + * @todo `d_Input` should be `const` parameter. + * + */ + __global__ void fwtBatch1Kernel(real_type* d_Output, real_type* d_Input, index_type log2N) + { + // Handle to thread block group + + cooperative_groups::thread_block cta = cooperative_groups::this_thread_block(); + const index_type N = 1 << log2N; + const index_type base = blockIdx.x << log2N; + + //(2 ** 11) * 4 bytes == 8KB -- maximum s_data[] size for G80 + extern __shared__ real_type s_data[]; + real_type* d_Src = d_Input + base; + real_type* d_Dst = d_Output + base; + + for (index_type pos = threadIdx.x; pos < N; pos += blockDim.x) { + s_data[pos] = d_Src[pos]; + } + + // Main radix-4 stages + const index_type pos = threadIdx.x; + + for (index_type stride = N >> 2; stride > 0; stride >>= 2) { + index_type lo = pos & (stride - 1); + index_type i0 = ((pos - lo) << 2) + lo; + index_type i1 = i0 + stride; + index_type i2 = i1 + stride; + index_type i3 = i2 + stride; + + cooperative_groups::sync(cta); + real_type D0 = s_data[i0]; + real_type D1 = s_data[i1]; + real_type D2 = s_data[i2]; + real_type D3 = s_data[i3]; + + real_type T; + T = D0; + D0 = D0 + D2; + D2 = T - D2; + T = D1; + D1 = D1 + D3; + D3 = T - D3; + T = D0; + s_data[i0] = D0 + D1; + s_data[i1] = T - D1; + T = D2; + s_data[i2] = D2 + D3; + s_data[i3] = T - D3; + } + + // Do single radix-2 stage for odd power of two + if (log2N & 1) { + + cooperative_groups::sync(cta); + + for (index_type pos = threadIdx.x; pos < N / 2; pos += blockDim.x) { + index_type i0 = pos << 1; + index_type i1 = i0 + 1; + + real_type D0 = s_data[i0]; + real_type D1 = s_data[i1]; + s_data[i0] = D0 + D1; + s_data[i1] = D0 - D1; + } + } + + cooperative_groups::sync(cta); + + for (index_type pos = threadIdx.x; pos < N; pos += blockDim.x) { + d_Dst[pos] = s_data[pos]; + } + } - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int t = threadIdx.x; - __shared__ double s_alpha[maxk]; - if(t < k) { - s_alpha[t] = alpha[t]; + } // namespace kernels + + + // + // Kernel wrappers + // + + /** + * @brief Computes result = mvec^T * [vec1 vec2] + * + * @param n - size of vectors vec1, vec2 + * @param i - + * @param vec1 - (n x 1) vector + * @param vec2 - (n x 1) vector + * @param mvec - (n x (i+1)) multivector + * @param result - ((i+1) x 2) multivector + * + * @todo Input data should be `const`. 
+ * @todo Is it coincidence that the block size is equal to the default + * value of Tv5? + * @todo Should we use dynamic shared memory here instead? + */ + void mass_inner_product_two_vectors(index_type n, + index_type i, + const real_type* vec1, + const real_type* vec2, + const real_type* mvec, + real_type* result) + { + kernels::MassIPTwoVec<<>>(vec1, vec2, mvec, result, i + 1, n); } - __syncthreads(); - if(i < N) { - double temp = 0.0f; - for(int j = 0; j < k; ++j) { - temp += x_data[j * N + i] * s_alpha[j]; - } - y_data[i] -= temp; - } -} - -__global__ void matrixInfNormPart1(const int n, - const int nnz, - const int* a_ia, - const double* a_val, - double* result) { - - // one thread per row, pass through rows - // and sum - // can be done through atomics - //\sum_{j=1}^m abs(a_{ij}) - - int idx = blockIdx.x*blockDim.x + threadIdx.x; - while (idx < n){ - double sum = 0.0f; - for (int i = a_ia[idx]; i < a_ia[idx+1]; ++i) { - sum = sum + fabs(a_val[i]); - } - result[idx] = sum; - idx += (blockDim.x*gridDim.x); - } -} - -// for count sketch sketching random method -__global__ void count_sketch_kernel(const int n, - const int k, - const int* labels, - const int* flip, - const double* input, - double* output){ - - int idx = blockIdx.x * blockDim.x + threadIdx.x; - while (idx < n){ - //printf("in place %d, I am putting input[perm[%d]] = input[%d] = %f \n", idx,idx, perm[idx], input[perm[idx]] ); - double val = input[idx]; - if (flip[idx] != 1){ - val *= -1.0; - } - atomicAdd(&output[labels[idx]], val); - idx += blockDim.x * gridDim.x; + /** + * @brief Computes y := y - x*alpha + * + * @param[in] n - vector size + * @param[in] i - number of vectors in the multivector + * @param[in] x - (n x (i+1)) multivector + * @param[out] y - (n x (i+1)) multivector + * @param[in] alpha - ((i+1) x 1) vector + */ + void mass_axpy(index_type n, index_type i, const real_type* x, real_type* y, const real_type* alpha) + { + kernels::massAxpy3<<<(n + 384 - 1) / 384, 384>>>(n, i + 1, x, y, alpha); } -} - -// for Walsh-Hadamard transform -//kernel 1 -__global__ void select_kernel(const int k, - const int* perm, - const double* input, - double* output){ - - int idx = blockIdx.x * blockDim.x + threadIdx.x; - while (idx < k){ - //printf("in place %d, I am putting input[perm[%d]] = input[%d] = %f \n", idx,idx, perm[idx], input[perm[idx]] ); - output[idx] = input[perm[idx]]; - idx += blockDim.x * gridDim.x; + /** + * @brief + * + * @param[in] n - + * @param[in] nnz - + * @param[in] a_ia - + * @param[in] a_val - + * @param[out] result - + * + * @todo Decide how to allow user to configure grid and block sizes. 
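// One way the launch could be sized from n instead of the fixed 1000 blocks used
// below; a sketch toward the @todo above, not an API change (the grid-stride loop
// inside the kernel keeps either launch correct):
//   index_type block_size = 1024;
//   index_type num_blocks = (n + block_size - 1) / block_size;
//   kernels::matrixInfNormPart1<<<num_blocks, block_size>>>(n, nnz, a_ia, a_val, result);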
+ */ + void matrix_row_sums(index_type n, + index_type nnz, + const index_type* a_ia, + const real_type* a_val, + real_type* result) + { + kernels::matrixInfNormPart1<<<1000, 1024>>>(n, nnz, a_ia, a_val, result); } -} - -//kernel 2 -__global__ void scaleByD_kernel(const int n, - const int* D, - const double* x, - double* y){ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - - while (idx < n){ - - if (D[idx] == 1) y[idx]=x[idx]; - else y[idx]= (-1.0)*x[idx]; - idx += blockDim.x * gridDim.x; - } -} - -//kernels 3 and 4 - - -#define ELEMENTARY_LOG2SIZE 11 -namespace cg = cooperative_groups; -//////////////////////////////////////////////////////////////////////////////// -// Single in-global memory radix-4 Fast Walsh Transform pass -// (for strides exceeding elementary vector size) -//////////////////////////////////////////////////////////////////////////////// - -__global__ void fwtBatch2Kernel(double* d_Output, double* d_Input, int stride) -{ - const int pos = blockIdx.x * blockDim.x + threadIdx.x; - const int N = blockDim.x * gridDim.x * 4; - - double* d_Src = d_Input + blockIdx.y * N; - double* d_Dst = d_Output + blockIdx.y * N; - - int lo = pos & (stride - 1); - int i0 = ((pos - lo) << 2) + lo; - int i1 = i0 + stride; - int i2 = i1 + stride; - int i3 = i2 + stride; - - double D0 = d_Src[i0]; - double D1 = d_Src[i1]; - double D2 = d_Src[i2]; - double D3 = d_Src[i3]; - - double T; - T = D0; - D0 = D0 + D2; - D2 = T - D2; - T = D1; - D1 = D1 + D3; - D3 = T - D3; - T = D0; - d_Dst[i0] = D0 + D1; - d_Dst[i1] = T - D1; - T = D2; - d_Dst[i2] = D2 + D3; - d_Dst[i3] = T - D3; -} - - -__global__ void fwtBatch1Kernel(double* d_Output, double* d_Input, int log2N) -{ - // Handle to thread block group - - cg::thread_block cta = cg::this_thread_block(); - const int N = 1 << log2N; - const int base = blockIdx.x << log2N; - - //(2 ** 11) * 4 bytes == 8KB -- maximum s_data[] size for G80 - extern __shared__ double s_data[]; - double* d_Src = d_Input + base; - double* d_Dst = d_Output + base; - - for (int pos = threadIdx.x; pos < N; pos += blockDim.x) { - s_data[pos] = d_Src[pos]; + /** + * @brief Kernel wrapper for + * + * @param[in] n - + * @param[in] k - + * @param[in] labels - + * @param[in] flip - + * @param[in] input - + * @param[out] output - + * + * @todo Decide how to allow user to configure grid and block sizes. + */ + void count_sketch_theta(index_type n, + index_type k, + const index_type* labels, + const index_type* flip, + const real_type* input, + real_type* output) + { + kernels::count_sketch<<<10000, 1024>>>(n, k, labels, flip, input, output); } - // Main radix-4 stages - const int pos = threadIdx.x; - - for (int stride = N >> 2; stride > 0; stride >>= 2) { - int lo = pos & (stride - 1); - int i0 = ((pos - lo) << 2) + lo; - int i1 = i0 + stride; - int i2 = i1 + stride; - int i3 = i2 + stride; - - cg::sync(cta); - double D0 = s_data[i0]; - double D1 = s_data[i1]; - double D2 = s_data[i2]; - double D3 = s_data[i3]; - - double T; - T = D0; - D0 = D0 + D2; - D2 = T - D2; - T = D1; - D1 = D1 + D3; - D3 = T - D3; - T = D0; - s_data[i0] = D0 + D1; - s_data[i1] = T - D1; - T = D2; - s_data[i2] = D2 + D3; - s_data[i3] = T - D3; + /** + * @brief Wrapper for `select` kernel, part of Walsh-Hadamard transform + * + * @param[in] k - + * @param[in] perm - + * @param[in] input - + * @param[out] output - + * + * @todo Decide how to allow user to configure grid and block sizes. 
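// Typical composition of the three FWHT_* wrappers into a subsampled randomized
// Hadamard sketch (an assumption about intended use; D, perm, log2n, d_x, d_y
// and d_out are placeholders prepared by the calling sketching class):
//   FWHT_scaleByD(n, D, d_x, d_y);      // flip signs using the +/-1 vector D
//   FWHT(1, log2n, d_y);                // in-place Walsh-Hadamard transform
//   FWHT_select(k, perm, d_y, d_out);   // keep k randomly selected entries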
+ */ + void FWHT_select(index_type k, + const index_type* perm, + const real_type* input, + real_type* output) + { + kernels::select<<<1000, 1024>>>(k, perm, input, output); } - // Do single radix-2 stage for odd power of two - if (log2N & 1) { - - cg::sync(cta); + /** + * @brief Wrapper for `scale` kernel, part of Walsh-Hadamard transform + * + * @param[in] n - + * @param[in] D - + * @param[in] x - + * @param[out] y - + * + * @todo Decide how to allow user to configure grid and block sizes. + */ + void FWHT_scaleByD(index_type n, + const index_type* D, + const real_type* x, + real_type* y) + { + kernels::scaleByD<<<1000, 1024>>>(n, D, x, y); + } - for (int pos = threadIdx.x; pos < N / 2; pos += blockDim.x) { - int i0 = pos << 1; - int i1 = i0 + 1; + /** + * @brief + * + * @param[in] M - + * @param[in] log2N - + * @param[out] d_Data - + * + * @todo Decide if and how user should configure log2size, thread_n, etc. + */ + void FWHT(index_type M, index_type log2N, real_type* d_Data) + { + const index_type ELEMENTARY_LOG2SIZE = 11; + const index_type THREAD_N = 1024; + index_type N = 1 << log2N; + dim3 grid((1 << log2N) / (4 * THREAD_N), M, 1); - double D0 = s_data[i0]; - double D1 = s_data[i1]; - s_data[i0] = D0 + D1; - s_data[i1] = D0 - D1; + for (; log2N > ELEMENTARY_LOG2SIZE; log2N -= 2, N >>= 2, M <<= 2) { + kernels::fwtBatch2Kernel<<>>(d_Data, d_Data, N / 4); } - } - cg::sync(cta); - - for (int pos = threadIdx.x; pos < N; pos += blockDim.x) { - d_Dst[pos] = s_data[pos]; - } -} - -void mass_inner_product_two_vectors(int n, - int i, - double* vec1, - double* vec2, - double* mvec, - double* result) -{ - MassIPTwoVec_kernel<<>>(vec1, vec2, mvec, result, i + 1, n); -} - -void mass_axpy(int n, int i, double* x, double* y, double* alpha) -{ - massAxpy3_kernel<<<(n + 384 - 1) / 384, 384>>>(n, i + 1, x, y, alpha); -} - -void matrix_row_sums(int n, - int nnz, - int* a_ia, - double* a_val, - double* result) -{ - matrixInfNormPart1<<<1000,1024>>>(n, nnz, a_ia, a_val, result); -} - -void count_sketch_theta(int n, - int k, - int* labels, - int* flip, - double* input, - double* output) -{ - - count_sketch_kernel<<<10000, 1024>>>(n, k, labels, flip, input, output); -} - -void FWHT_select(int k, - int* perm, - double* input, - double* output) -{ - select_kernel<<<1000,1024>>>(k, perm, input, output); -} - -void FWHT_scaleByD(int n, - int* D, - double* x, - double* y) -{ - scaleByD_kernel<<<1000,1024>>>(n, D, x, y); -} - -void FWHT(int M, int log2N, double* d_Data) { - - const int THREAD_N = 1024; - int N = 1 << log2N; - dim3 grid((1 << log2N) / (4 * THREAD_N), M, 1); - - for (; log2N > ELEMENTARY_LOG2SIZE; log2N -= 2, N >>= 2, M <<= 2) { - fwtBatch2Kernel<<>>(d_Data, d_Data, N / 4); + kernels::fwtBatch1Kernel<<>>(d_Data, d_Data, log2N); } - fwtBatch1Kernel<<>>(d_Data, d_Data, log2N); -} +} // namespace ReSolve diff --git a/resolve/cuda/cudaKernels.h b/resolve/cuda/cudaKernels.h index 460118f9..7b27b9ce 100644 --- a/resolve/cuda/cudaKernels.h +++ b/resolve/cuda/cudaKernels.h @@ -1,34 +1,52 @@ -void mass_inner_product_two_vectors(int n, - int i, - double* vec1, - double* vec2, - double* mvec, - double* result); -void mass_axpy(int n, int i, double* x, double* y, double* alpha); - -//needed for matrix inf nrm -void matrix_row_sums(int n, - int nnz, - int* a_ia, - double* a_val, - double* result); - -// needed for rand solver -void count_sketch_theta(int n, - int k, - int* labels, - int* flip, - double* input, - double* output); - -void FWHT_select(int k, - int* perm, - double* input, - double* output); - 
-void FWHT_scaleByD(int n, - int* D, - double* x, - double* y); - -void FWHT(int M, int log2N, double* d_Data); +/** + * @file cudaKernels.h + * @author Kasia Swirydowicz (kasia.swirydowicz@pnnl.gov) + * + * @brief Contains prototypes of CUDA kernels. + * + * @note These kernels will be used in CUDA specific code, only. + * + */ +#pragma once + +#include + +namespace ReSolve { + + void mass_inner_product_two_vectors(index_type n, + index_type i, + const real_type* vec1, + const real_type* vec2, + const real_type* mvec, + real_type* result); + + void mass_axpy(index_type n, index_type i, const real_type* x, real_type* y, const real_type* alpha); + + //needed for matrix inf nrm + void matrix_row_sums(index_type n, + index_type nnz, + const index_type* a_ia, + const real_type* a_val, + real_type* result); + + // needed for rand solver + void count_sketch_theta(index_type n, + index_type k, + const index_type* labels, + const index_type* flip, + const real_type* input, + real_type* output); + + void FWHT_select(index_type k, + const index_type* perm, + const real_type* input, + real_type* output); + + void FWHT_scaleByD(index_type n, + const index_type* D, + const real_type* x, + real_type* y); + + void FWHT(index_type M, index_type log2N, real_type* d_Data); + +} // namespace ReSolve diff --git a/resolve/cuda/cudaVectorKernels.cu b/resolve/cuda/cudaVectorKernels.cu deleted file mode 100644 index a1c53198..00000000 --- a/resolve/cuda/cudaVectorKernels.cu +++ /dev/null @@ -1,28 +0,0 @@ -#include "cudaVectorKernels.h" -#include - - -namespace ReSolve { namespace vector { - -namespace kernels { - -__global__ void set_const(index_type n, real_type val, real_type* arr) -{ - index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if(i < n) - { - arr[i] = val; - } -} - -} // namespace kernels - -void set_array_const(index_type n, real_type val, real_type* arr) -{ - index_type num_blocks; - index_type block_size = 512; - num_blocks = (n + block_size - 1) / block_size; - kernels::set_const<<>>(n, val, arr); -} - -}} // namespace ReSolve::vector \ No newline at end of file diff --git a/resolve/cuda/cudaVectorKernels.h b/resolve/cuda/cudaVectorKernels.h deleted file mode 100644 index e85016e3..00000000 --- a/resolve/cuda/cudaVectorKernels.h +++ /dev/null @@ -1,60 +0,0 @@ -#pragma once - -#include -// #include -// #include -// #include - -#include - -//***************************************************************************// -//**** See VectorKernels.hpp for kernel wrapper functions documentation ****// -//***************************************************************************// - -namespace ReSolve { namespace vector { - -namespace kernels { - // __global__ void adapt_diag_scale(index_type, index_type, real_type*, index_type*, index_type*, real_type*, index_type*, - // index_type*, real_type*, index_type*, index_type*, real_type*, real_type*, real_type*, real_type*); - - // __global__ void adapt_row_max(index_type, index_type, real_type*, index_type*, index_type*, real_type*, index_type*, - // index_type*, real_type*, index_type*, index_type*, real_type*); - - // __global__ void add_const(index_type, index_type, index_type*); - - /** - * @brief CUDA kernel that sets values of an array to a constant. 
- * - * @param[in] n - length of the array - * @param[in] val - the value the array is set to - * @param[out] arr - a pointer to the array - * - * @pre `arr` is allocated to size `n` - * @post `arr` elements are set to `val` - */ - __global__ void set_const(index_type n, real_type val, real_type* arr); - - // __global__ void add_vecs(index_type, real_type*, real_type, real_type*); - - // __global__ void mult_const(index_type, real_type, real_type*); - - // __global__ void add_diag(index_type, real_type, index_type*, index_type*, real_type*); - - // __global__ void inv_vec_scale(index_type, real_type*, real_type*); - - // __global__ void vec_scale(index_type, real_type*, real_type*); - - // __global__ void concatenate(index_type, index_type, index_type, index_type, real_type*, index_type*, index_type*, - // real_type*, index_type*, index_type*, real_type*, index_type*, index_type*); - - // __global__ void row_scale(index_type, real_type*, index_type*, index_type*, real_type*, real_type*, - // real_type*, real_type*); - - // __global__ void diag_scale(index_type, index_type, real_type*, index_type*, index_type*, real_type*, index_type*, - // index_type*, real_type*, real_type*, real_type*, index_type); - - // __global__ void row_max(index_type, index_type, real_type*, index_type*, index_type*, real_type*, index_type*, index_type*, - // real_type* scale); -} // namespace kernels - -}} // namespace ReSolve::vector \ No newline at end of file diff --git a/resolve/hip/CMakeLists.txt b/resolve/hip/CMakeLists.txt index fb71a3bd..86755c0f 100644 --- a/resolve/hip/CMakeLists.txt +++ b/resolve/hip/CMakeLists.txt @@ -8,13 +8,12 @@ set(ReSolve_HIP_SRC hipKernels.hip - hipVectorKernels.hip + VectorKernels.hip MemoryUtils.hip ) set(ReSolve_HIP_HEADER_INSTALL hipKernels.h - hipVectorKernels.h HipMemory.hpp hip_check_errors.hpp ) diff --git a/resolve/hip/hipVectorKernels.hip b/resolve/hip/VectorKernels.hip similarity index 72% rename from resolve/hip/hipVectorKernels.hip rename to resolve/hip/VectorKernels.hip index 5b3ace30..30b5e982 100644 --- a/resolve/hip/hipVectorKernels.hip +++ b/resolve/hip/VectorKernels.hip @@ -1,3 +1,13 @@ +/** + * @file hipVectorKernels.hip + * @author Slaven Peles (peless@ornl.gov) + * @brief Contains implementation of CUDA vector kernels and their wrappers. + * @date 2023-12-08 + * + * @note Kernel wrappers implemented here are intended for use in hardware + * agnostic code. 
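// Example of the hardware-agnostic use this note refers to (illustrative only;
// n_ and d_data_ are placeholder names, not code from this repository):
//   ReSolve::vector::set_array_const(n_, 0.0, d_data_);  // zero a device array
// The calling code is identical whether this HIP file or the CUDA
// VectorKernels.cu provides the implementation; the build selects one backend.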
+ */ + #include #include #include diff --git a/resolve/hip/hipKernels.h b/resolve/hip/hipKernels.h index f7416b17..e6f547b6 100644 --- a/resolve/hip/hipKernels.h +++ b/resolve/hip/hipKernels.h @@ -1,51 +1,68 @@ -void mass_inner_product_two_vectors(int n, - int i, - double* vec1, - double* vec2, - double* mvec, - double* result); -void mass_axpy(int n, int i, double* x, double* y, double* alpha); - -//needed for matrix inf nrm -void matrix_row_sums(int n, - int nnz, - int* a_ia, - double* a_val, - double* result); - -// needed for triangular solve - -void permuteVectorP(int n, - int* perm_vector, - double* vec_in, - double* vec_out); - -void permuteVectorQ(int n, - int* perm_vector, - double* vec_in, - double* vec_out); - -// needed for rand solver -void count_sketch_theta(int n, - int k, - int* labels, - int* flip, - double* input, - double* output); - -void FWHT_select(int k, - int* perm, - double* input, - double* output); - -void FWHT_scaleByD(int n, - int* D, - double* x, - double* y); - -void FWHT(int M, int log2N, double* d_Data); - -void vector_inf_norm(int n, - double* input, - double * buffer, - double* result); +/** + * @file hipKernels.h + * @author Kasia Swirydowicz (kasia.swirydowicz@pnnl.gov) + * @brief Contains prototypes of HIP kernels. + * @date 2023-12-08 + * + * @note These kernels will be used in HIP specific code, only. + * + */ + +#pragma once + +#include + +namespace ReSolve { + + void mass_inner_product_two_vectors(index_type n, + index_type i, + real_type* vec1, + real_type* vec2, + real_type* mvec, + real_type* result); + void mass_axpy(index_type n, index_type i, real_type* x, real_type* y, real_type* alpha); + + //needed for matrix inf nrm + void matrix_row_sums(index_type n, + index_type nnz, + index_type* a_ia, + real_type* a_val, + real_type* result); + + // needed for triangular solve + + void permuteVectorP(index_type n, + index_type* perm_vector, + real_type* vec_in, + real_type* vec_out); + + void permuteVectorQ(index_type n, + index_type* perm_vector, + real_type* vec_in, + real_type* vec_out); + + // needed for rand solver + void count_sketch_theta(index_type n, + index_type k, + index_type* labels, + index_type* flip, + real_type* input, + real_type* output); + + void FWHT_select(index_type k, + index_type* perm, + real_type* input, + real_type* output); + + void FWHT_scaleByD(index_type n, + index_type* D, + real_type* x, + real_type* y); + + void FWHT(index_type M, index_type log2N, real_type* d_Data); + + void vector_inf_norm(index_type n, + real_type* input, + real_type * buffer, + real_type* result); +} // namespace ReSolve diff --git a/resolve/hip/hipKernels.hip b/resolve/hip/hipKernels.hip index 671311d4..71c8171c 100644 --- a/resolve/hip/hipKernels.hip +++ b/resolve/hip/hipKernels.hip @@ -1,504 +1,743 @@ +/** + * @file hipKernels.hip + * @author Kasia Swirydowicz (kasia.swirydowicz@pnnl.gov) + * @brief + * @date 2023-12-08 + * + * + */ + #include "hipKernels.h" #include #include -//computes V^T[u1 u2] where v is n x k and u1 and u2 are nx1 -template -__global__ void MassIPTwoVec_kernel(const double* __restrict__ u1, - const double* __restrict__ u2, - const double* __restrict__ v, - double* result, - const int k, - const int N) -{ - int t = threadIdx.x; - int bsize = blockDim.x; - - // assume T threads per thread block (and k reductions to be performed) - volatile __shared__ double s_tmp1[Tv5]; - - volatile __shared__ double s_tmp2[Tv5]; - // map between thread index space and the problem index space - int j = blockIdx.x; - s_tmp1[t] = 0.0; - 
s_tmp2[t] = 0.0; - int nn = t; - double can1, can2, cbn; - - while (nn < N) { - can1 = u1[nn]; - can2 = u2[nn]; - - cbn = v[N * j + nn]; - s_tmp1[t] += can1 * cbn; - s_tmp2[t] += can2 * cbn; - - nn += bsize; - } - - __syncthreads(); - - if (Tv5 >= 1024) { - if(t < 512) { - s_tmp1[t] += s_tmp1[t + 512]; - s_tmp2[t] += s_tmp2[t + 512]; +namespace ReSolve { + namespace kernels { + + /** + * @brief Computes v^T * [u1 u2] where v is n x k multivector + * and u1 and u2 are n x 1 vectors. + * + * @tparam Tv5 - Size of shared memory + * + * @param[in] u1 - (n x 1) vector + * @param[in] u2 - (n x 1) vector + * @param[in] v - (n x k) multivector + * @param[out] result - (k x 2) multivector + * @param[in] k - dimension of the subspace + * @param[in] N - size of vectors u1, u2 + */ + template + __global__ void MassIPTwoVec_kernel(const real_type* __restrict__ u1, + const real_type* __restrict__ u2, + const real_type* __restrict__ v, + real_type* result, + const index_type k, + const index_type N) + { + index_type t = threadIdx.x; + index_type bsize = blockDim.x; + + // assume T threads per thread block (and k reductions to be performed) + volatile __shared__ real_type s_tmp1[Tv5]; + + volatile __shared__ real_type s_tmp2[Tv5]; + // map between thread index space and the problem index space + index_type j = blockIdx.x; + s_tmp1[t] = 0.0; + s_tmp2[t] = 0.0; + index_type nn = t; + real_type can1, can2, cbn; + + while (nn < N) { + can1 = u1[nn]; + can2 = u2[nn]; + + cbn = v[N * j + nn]; + s_tmp1[t] += can1 * cbn; + s_tmp2[t] += can2 * cbn; + + nn += bsize; + } + + __syncthreads(); + + if (Tv5 >= 1024) { + if(t < 512) { + s_tmp1[t] += s_tmp1[t + 512]; + s_tmp2[t] += s_tmp2[t + 512]; + } + __syncthreads(); + } + if (Tv5 >= 512) { + if(t < 256) { + s_tmp1[t] += s_tmp1[t + 256]; + s_tmp2[t] += s_tmp2[t + 256]; + } + __syncthreads(); + } + { + if (t < 128) { + s_tmp1[t] += s_tmp1[t + 128]; + s_tmp2[t] += s_tmp2[t + 128]; + } + __syncthreads(); + } + { + if (t < 64) { + s_tmp1[t] += s_tmp1[t + 64]; + s_tmp2[t] += s_tmp2[t + 64]; + } + __syncthreads(); + } + + if (t < 32) { + s_tmp1[t] += s_tmp1[t + 32]; + s_tmp2[t] += s_tmp2[t + 32]; + + s_tmp1[t] += s_tmp1[t + 16]; + s_tmp2[t] += s_tmp2[t + 16]; + + s_tmp1[t] += s_tmp1[t + 8]; + s_tmp2[t] += s_tmp2[t + 8]; + + s_tmp1[t] += s_tmp1[t + 4]; + s_tmp2[t] += s_tmp2[t + 4]; + + s_tmp1[t] += s_tmp1[t + 2]; + s_tmp2[t] += s_tmp2[t + 2]; + + s_tmp1[t] += s_tmp1[t + 1]; + s_tmp2[t] += s_tmp2[t + 1]; + } + if (t == 0) { + result[blockIdx.x] = s_tmp1[0]; + result[blockIdx.x + k] = s_tmp2[0]; + } } - __syncthreads(); - } - if (Tv5 >= 512) { - if(t < 256) { - s_tmp1[t] += s_tmp1[t + 256]; - s_tmp2[t] += s_tmp2[t + 256]; - } - __syncthreads(); - } - { - if (t < 128) { - s_tmp1[t] += s_tmp1[t + 128]; - s_tmp2[t] += s_tmp2[t + 128]; - } - __syncthreads(); - } - { - if (t < 64) { - s_tmp1[t] += s_tmp1[t + 64]; - s_tmp2[t] += s_tmp2[t + 64]; - } - __syncthreads(); - } - - if (t < 32) { - s_tmp1[t] += s_tmp1[t + 32]; - s_tmp2[t] += s_tmp2[t + 32]; - s_tmp1[t] += s_tmp1[t + 16]; - s_tmp2[t] += s_tmp2[t + 16]; - s_tmp1[t] += s_tmp1[t + 8]; - s_tmp2[t] += s_tmp2[t + 8]; + /** + * @brief AXPY y = y - x*alpha where alpha is [k x 1], needed in 1 and 2 synch GMRES + * + * @tparam Tmaxk + * + * @param[in] N - + * @param[in] k - + * @param[in] x_data - + * @param[out] y_data - + * @param[in] alpha - + */ + template + __global__ void massAxpy3_kernel(index_type N, + index_type k, + const real_type* x_data, + real_type* y_data, + const real_type* alpha) + { + index_type i = blockIdx.x * 
blockDim.x + threadIdx.x; + index_type t = threadIdx.x; + + __shared__ real_type s_alpha[Tmaxk]; + if (t < k) { + s_alpha[t] = alpha[t]; + } + __syncthreads(); + while (i < N) { + real_type temp = 0.0; + for(index_type j = 0; j < k; ++j) { + temp += x_data[j * N + i] * s_alpha[j]; + } + y_data[i] -= temp; + i += (blockDim.x*gridDim.x); + } + } - s_tmp1[t] += s_tmp1[t + 4]; - s_tmp2[t] += s_tmp2[t + 4]; + /** + * @brief Pass through matrix rows and sum each as \sum_{j=1}^m abs(a_{ij}) + * + * @param[in] n - + * @param[in] nnz - + * @param[in] a_ia - + * @param[in] a_val - + * @param[out] result - + */ + __global__ void matrixInfNormPart1(const index_type n, + const index_type nnz, + const index_type* a_ia, + const real_type* a_val, + real_type* result) + { + index_type idx = blockIdx.x*blockDim.x + threadIdx.x; + while (idx < n) { + real_type sum = 0.0; + for (index_type i = a_ia[idx]; i < a_ia[idx+1]; ++i) { + sum = sum + fabs(a_val[i]); + } + result[idx] = sum; + idx += (blockDim.x * gridDim.x); + } + } - s_tmp1[t] += s_tmp1[t + 2]; - s_tmp2[t] += s_tmp2[t + 2]; + /** + * @brief + * + * @param[in] n - vector size + * @param[in] input - + * @param[out] result - + */ + __global__ void vectorInfNorm(const index_type n, + const real_type* input, + real_type* result) + { + + index_type idx = blockIdx.x * blockDim.x + threadIdx.x; + + volatile __shared__ real_type s_max[1024]; + index_type t = threadIdx.x; + index_type bsize = blockDim.x; + real_type local_max = 0.0; + if (idx < n) { + local_max = fabs(input[idx]); + } + + idx += (blockDim.x*gridDim.x); + + while (idx < n) { + local_max = fmax(fabs(input[idx]), local_max); + idx += (blockDim.x * gridDim.x); + } + s_max[t] = local_max; + __syncthreads(); + + // reduction + + if (bsize >= 1024) { + if(t < 512) { + s_max[t] = fmax(s_max[t], s_max[t + 512]); + } + __syncthreads(); + } + if (bsize >= 512) { + if(t < 256) { + s_max[t] = fmax(s_max[t], s_max[t + 256]); + } + __syncthreads(); + } + + if (bsize >= 256) { + if(t < 128) { + s_max[t] = fmax(s_max[t], s_max[t + 128]); + } + __syncthreads(); + } + + if (bsize >= 128) { + if(t < 64) { + s_max[t] = fmax(s_max[t], s_max[t + 64]); + } + __syncthreads(); + } + //unroll for last warp + if (t < 64) { + s_max[t] = fmax(s_max[t], s_max[t + 64]); + s_max[t] = fmax(s_max[t], s_max[t + 32]); + s_max[t] = fmax(s_max[t], s_max[t + 16]); + s_max[t] = fmax(s_max[t], s_max[t + 8]); + s_max[t] = fmax(s_max[t], s_max[t + 4]); + s_max[t] = fmax(s_max[t], s_max[t + 2]); + s_max[t] = fmax(s_max[t], s_max[t + 1]); + + } + if (t == 0) { + index_type bid = blockIdx.x; + index_type gid = gridDim.x; + result[blockIdx.x] = s_max[0]; + } + } - s_tmp1[t] += s_tmp1[t + 1]; - s_tmp2[t] += s_tmp2[t + 1]; - } - if (t == 0) { - result[blockIdx.x] = s_tmp1[0]; - result[blockIdx.x + k] = s_tmp2[0]; - } -} + /** + * @brief + * + * @param n - vector size + * @param perm_vector - permutation map + * @param vec_in - input vector + * @param vec_out - permuted vector + */ + __global__ void permuteVectorP_kernel(const index_type n, + const index_type* perm_vector, + const real_type* vec_in, + real_type* vec_out) + { + //one thread per vector entry, pass through rows + index_type idx = blockIdx.x*blockDim.x + threadIdx.x; + while (idx < n) { + vec_out[idx] = vec_in[perm_vector[idx]]; + idx += (blockDim.x * gridDim.x); + } + } -/// mass AXPY i.e y = y - x*alpha where alpha is [k x 1], needed in 1 and 2 synch GMRES -template -__global__ void massAxpy3_kernel(int N, - int k, - const double* x_data, - double* y_data, - const double* alpha) 
-{ + /** + * @brief + * + * @param n - vector size + * @param perm_vector - permutation map + * @param vec_in - input vector + * @param vec_out - permuted vector + */ + __global__ void permuteVectorQ_kernel(const index_type n, + const index_type* perm_vector, + const real_type* vec_in, + real_type* vec_out) + { + //one thread per vector entry, pass through rows + index_type idx = blockIdx.x*blockDim.x + threadIdx.x; + while (idx < n) { + vec_out[perm_vector[idx]] = vec_in[idx]; + idx += (blockDim.x * gridDim.x); + } + } - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + /** + * @brief For count sketch in sketching random method + * + * @param[in] n - + * @param[in] k - + * @param[in] labels - + * @param[in] flip - + * @param[in] input - + * @param[out] output - + */ + __global__ void count_sketch_kernel(const index_type n, + const index_type k, + const index_type* labels, + const index_type* flip, + const real_type* input, + real_type* output){ + + index_type idx = blockIdx.x * blockDim.x + threadIdx.x; + while (idx < n) { + real_type val = input[idx]; + if (flip[idx] != 1){ + val *= -1.0; + } + atomicAdd(&output[labels[idx]], val); + idx += blockDim.x * gridDim.x; + } + } - unsigned int t = threadIdx.x; + /** + * @brief Walsh-Hadamard transform (select kernel) + * + * @param[in] k - + * @param[in] perm - + * @param[in] input - + * @param[out] output - + */ + __global__ void select_kernel(const index_type k, + const index_type* perm, + const real_type* input, + real_type* output){ + + index_type idx = blockIdx.x * blockDim.x + threadIdx.x; + while (idx < k) { + output[idx] = input[perm[idx]]; + idx += blockDim.x * gridDim.x; + } + } - __shared__ double s_alpha[Tmaxk]; - if (t < k) { - s_alpha[t] = alpha[t]; - } - __syncthreads(); - while (i < N) { - double temp = 0.0; - for(int j = 0; j < k; ++j) { - temp += x_data[j * N + i] * s_alpha[j]; + /** + * @brief Walsh-Hadamard transform (scale) + * + * @param[in] n - + * @param[in] D - + * @param[in] x - + * @param[out] y - + */ + __global__ void scaleByD_kernel(const index_type n, + const index_type* D, + const real_type* x, + real_type* y){ + index_type idx = blockIdx.x * blockDim.x + threadIdx.x; + + while (idx < n){ + + if (D[idx] == 1) y[idx]=x[idx]; + else y[idx]= (-1.0)*x[idx]; + + idx += blockDim.x * gridDim.x; + } } - y_data[i] -= temp; - i += (blockDim.x*gridDim.x); - } -} - -__global__ void matrixInfNormPart1(const int n, - const int nnz, - const int* a_ia, - const double* a_val, - double* result) -{ - // one thread per row, pass through rows - // and sum - // can be done through atomics - //\sum_{j=1}^m fabs(a_{ij}) - - int idx = blockIdx.x*blockDim.x + threadIdx.x; - while (idx < n) { - double sum = 0.0; - for (int i = a_ia[idx]; i < a_ia[idx+1]; ++i) { - sum = sum + fabs(a_val[i]); + + /** + * @brief Single in-global memory radix-4 Fast Walsh Transform pass + * (for strides exceeding elementary vector size). 
+ * + * @param d_Output - + * @param d_Input - + * @param stride - + */ + __global__ void fwtBatch2Kernel(real_type* d_Output, real_type* d_Input, index_type stride) + { + const index_type pos = blockIdx.x * blockDim.x + threadIdx.x; + const index_type N = blockDim.x * gridDim.x * 4; + + real_type* d_Src = d_Input + blockIdx.y * N; + real_type* d_Dst = d_Output + blockIdx.y * N; + + index_type lo = pos & (stride - 1); + index_type i0 = ((pos - lo) << 2) + lo; + index_type i1 = i0 + stride; + index_type i2 = i1 + stride; + index_type i3 = i2 + stride; + + real_type D0 = d_Src[i0]; + real_type D1 = d_Src[i1]; + real_type D2 = d_Src[i2]; + real_type D3 = d_Src[i3]; + + real_type T; + T = D0; + D0 = D0 + D2; + D2 = T - D2; + T = D1; + D1 = D1 + D3; + D3 = T - D3; + T = D0; + d_Dst[i0] = D0 + D1; + d_Dst[i1] = T - D1; + T = D2; + d_Dst[i2] = D2 + D3; + d_Dst[i3] = T - D3; } - result[idx] = sum; - idx += (blockDim.x*gridDim.x); - } -} -__global__ void vectorInfNorm(const int n, - const double* input, - double* result) -{ - int idx = blockIdx.x * blockDim.x + threadIdx.x; + /** + * @brief + * + * @param d_Output - + * @param d_Input - + * @param log2N - + * + * @todo `d_Input` should be `const` parameter. + * + */ + __global__ void fwtBatch1Kernel(real_type* d_Output, real_type* d_Input, index_type log2N) + { + // Handle to thread block group + + cooperative_groups::thread_block cta = cooperative_groups::this_thread_block(); + const index_type N = 1 << log2N; + const index_type base = blockIdx.x << log2N; + + //(2 ** 11) * 4 bytes == 8KB -- maximum s_data[] size for G80 + extern __shared__ real_type s_data[]; + real_type* d_Src = d_Input + base; + real_type* d_Dst = d_Output + base; + + for (index_type pos = threadIdx.x; pos < N; pos += blockDim.x) { + s_data[pos] = d_Src[pos]; + } + + // Main radix-4 stages + const index_type pos = threadIdx.x; + + for (index_type stride = N >> 2; stride > 0; stride >>= 2) { + index_type lo = pos & (stride - 1); + index_type i0 = ((pos - lo) << 2) + lo; + index_type i1 = i0 + stride; + index_type i2 = i1 + stride; + index_type i3 = i2 + stride; + + cooperative_groups::sync(cta); + real_type D0 = s_data[i0]; + real_type D1 = s_data[i1]; + real_type D2 = s_data[i2]; + real_type D3 = s_data[i3]; + + real_type T; + T = D0; + D0 = D0 + D2; + D2 = T - D2; + T = D1; + D1 = D1 + D3; + D3 = T - D3; + T = D0; + s_data[i0] = D0 + D1; + s_data[i1] = T - D1; + T = D2; + s_data[i2] = D2 + D3; + s_data[i3] = T - D3; + } + + // Do single radix-2 stage for odd power of two + if (log2N & 1) { + + cooperative_groups::sync(cta); + + for (index_type pos = threadIdx.x; pos < N / 2; pos += blockDim.x) { + index_type i0 = pos << 1; + index_type i1 = i0 + 1; + + real_type D0 = s_data[i0]; + real_type D1 = s_data[i1]; + s_data[i0] = D0 + D1; + s_data[i1] = D0 - D1; + } + } + + cooperative_groups::sync(cta); + + for (index_type pos = threadIdx.x; pos < N; pos += blockDim.x) { + d_Dst[pos] = s_data[pos]; + } + } - volatile __shared__ double s_max[1024]; - int t = threadIdx.x; - int bsize = blockDim.x; - double local_max = 0.0; - if (idx < n) { - local_max = fabs(input[idx]); + } // namespace kernels + + // + // Kernel wrappers + // + + /** + * @brief Computes result = mvec^T * [vec1 vec2] + * + * @param n - size of vectors vec1, vec2 + * @param i - + * @param vec1 - (n x 1) vector + * @param vec2 - (n x 1) vector + * @param mvec - (n x (i+1)) multivector + * @param result - ((i+1) x 2) multivector + * + * @todo Input data should be `const`. 
+ * @todo Is it coincidence that the block size is equal to the default + * value of Tv5? + * @todo Should we use dynamic shared memory here instead? + */ + void mass_inner_product_two_vectors(index_type n, + index_type i, + real_type* vec1, + real_type* vec2, + real_type* mvec, + real_type* result) + { + hipLaunchKernelGGL(kernels::MassIPTwoVec_kernel, + dim3(i + 1), + dim3(1024), + 0, + 0, + vec1, + vec2, + mvec, + result, + i + 1, + n); } - idx += (blockDim.x*gridDim.x); - - while (idx < n) { - local_max = fmax(fabs(input[idx]), local_max); - idx += (blockDim.x*gridDim.x); + /** + * @brief Computes y := y - x*alpha + * + * @param[in] n - vector size + * @param[in] i - number of vectors in the multivector + * @param[in] x - (n x (i+1)) multivector + * @param[out] y - (n x (i+1)) multivector + * @param[in] alpha - ((i+1) x 1) vector + */ + void mass_axpy(index_type n, index_type i, real_type* x, real_type* y, real_type* alpha) + { + hipLaunchKernelGGL(kernels::massAxpy3_kernel, + dim3((n + 384 - 1) / 384), + dim3(384), + 0, + 0, + n, + i, + x, + y, + alpha); } - s_max[t] = local_max; - __syncthreads(); - - // reduction - if (bsize >= 1024) { - if(t < 512) { - s_max[t] = fmax(s_max[t], s_max[t + 512]); - } - __syncthreads(); - } - if (bsize >= 512) { - if(t < 256) { - s_max[t] = fmax(s_max[t], s_max[t + 256]); - } - __syncthreads(); + /** + * @brief + * + * @param[in] n - + * @param[in] nnz - + * @param[in] a_ia - + * @param[in] a_val - + * @param[out] result - + * + * @todo Decide how to allow user to configure grid and block sizes. + */ + void matrix_row_sums(index_type n, + index_type nnz, + index_type* a_ia, + real_type* a_val, + real_type* result) + { + hipLaunchKernelGGL(kernels::matrixInfNormPart1, dim3(1000), dim3(1024), 0, 0, n, nnz, a_ia, a_val, result); } - if (bsize >= 256) { - if(t < 128) { - s_max[t] = fmax(s_max[t], s_max[t + 128]); - } - __syncthreads(); + /** + * @brief + * + * @param n - + * @param input - + * @param buffer - + * @param result - + * + * @todo Decide how to allow user to configure grid and block sizes. 
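// Reading of the two-stage launch below (descriptive note): the first launch
// reduces the n inputs to 1024 per-block maxima stored in `buffer`; the second
// launch reduces those 1024 partial maxima with a single block; the final value
// is then copied to `result` on the host. `buffer` must hold at least 1024 reals.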
+ */ + void vector_inf_norm(index_type n, + real_type* input, + real_type* buffer, + real_type* result) + { + hipLaunchKernelGGL(kernels::vectorInfNorm, dim3(1024), dim3(1024), 0, 0, n, input, buffer); + hipDeviceSynchronize(); + hipLaunchKernelGGL(kernels::vectorInfNorm, dim3(1), dim3(1024), 0, 0, 1024, buffer, buffer); + hipMemcpy(result, buffer, sizeof(real_type) * 1, hipMemcpyDeviceToHost); } - if (bsize >= 128) { - if(t < 64) { - s_max[t] = fmax(s_max[t], s_max[t + 64]); - } - __syncthreads(); + /** + * @brief + * + * @param n - vector size + * @param perm_vector - permutation map + * @param vec_in - input vector + * @param vec_out - permuted vector + */ + void permuteVectorP(index_type n, + index_type* perm_vector, + real_type* vec_in, + real_type* vec_out) + { + hipLaunchKernelGGL(kernels::permuteVectorP_kernel, + dim3(1000), + dim3(1024), + 0, + 0, + n, + perm_vector, + vec_in, + vec_out); } - //unroll for last warp - if (t < 64) { - s_max[t] = fmax(s_max[t], s_max[t + 64]); - s_max[t] = fmax(s_max[t], s_max[t + 32]); - s_max[t] = fmax(s_max[t], s_max[t + 16]); - s_max[t] = fmax(s_max[t], s_max[t + 8]); - s_max[t] = fmax(s_max[t], s_max[t + 4]); - s_max[t] = fmax(s_max[t], s_max[t + 2]); - s_max[t] = fmax(s_max[t], s_max[t + 1]); + /** + * @brief + * + * @param n - vector size + * @param perm_vector - permutation map + * @param vec_in - input vector + * @param vec_out - permuted vector + */ + void permuteVectorQ(index_type n, + index_type* perm_vector, + real_type* vec_in, + real_type* vec_out) + { + hipLaunchKernelGGL(kernels::permuteVectorQ_kernel, + dim3(1000), + dim3(1024), + 0, + 0, + n, + perm_vector, + vec_in, + vec_out); } - if (t == 0) { - int bid = blockIdx.x; - int gid = gridDim.x; - result[blockIdx.x] = s_max[0]; - } -} - - -__global__ void permuteVectorP_kernel(const int n, - const int* perm_vector, - const double* vec_in, - double* vec_out) -{ - //one thread per vector entry, pass through rows - int idx = blockIdx.x*blockDim.x + threadIdx.x; - while (idx> 2; stride > 0; stride >>= 2) { - int lo = pos & (stride - 1); - int i0 = ((pos - lo) << 2) + lo; - int i1 = i0 + stride; - int i2 = i1 + stride; - int i3 = i2 + stride; - - cg::sync(cta); - double D0 = s_data[i0]; - double D1 = s_data[i1]; - double D2 = s_data[i2]; - double D3 = s_data[i3]; - - double T; - T = D0; - D0 = D0 + D2; - D2 = T - D2; - T = D1; - D1 = D1 + D3; - D3 = T - D3; - T = D0; - s_data[i0] = D0 + D1; - s_data[i1] = T - D1; - T = D2; - s_data[i2] = D2 + D3; - s_data[i3] = T - D3; + /** + * @brief Wrapper for `scale` kernel, part of Walsh-Hadamard transform + * + * @param[in] n - + * @param[in] D - + * @param[in] x - + * @param[out] y - + * + * @todo Decide how to allow user to configure grid and block sizes. + */ + void FWHT_scaleByD(index_type n, + index_type* D, + real_type* x, + real_type* y) + { + hipLaunchKernelGGL(kernels::scaleByD_kernel, dim3(1000), dim3(1024), 0, 0, n, D, x, y); } - // Do single radix-2 stage for odd power of two - if (log2N & 1) { - - cg::sync(cta); - - for (int pos = threadIdx.x; pos < N / 2; pos += blockDim.x) { - int i0 = pos << 1; - int i1 = i0 + 1; + /** + * @brief + * + * @param[in] M - + * @param[in] log2N - + * @param[out] d_Data - + * + * @todo Decide if and how user should configure log2size, thread_n, etc. 
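// Worked launch-shape example for the wrapper below (illustrative): with M = 1
// and log2N = 14 (a 16384-entry signal), THREAD_N = 1024 gives an initial grid of
// 2^14 / (4 * 1024) = 4 blocks; each fwtBatch2Kernel pass then shrinks N by 4x and
// grows M by 4x until log2N <= ELEMENTARY_LOG2SIZE (2^11 = 2048 entries), after
// which fwtBatch1Kernel finishes each batch in shared memory.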
+ */ + void FWHT(index_type M, index_type log2N, real_type* d_Data) + { + const index_type ELEMENTARY_LOG2SIZE = 11; + const index_type THREAD_N = 1024; + index_type N = 1 << log2N; + dim3 grid((1 << log2N) / (4 * THREAD_N), M, 1); - double D0 = s_data[i0]; - double D1 = s_data[i1]; - s_data[i0] = D0 + D1; - s_data[i1] = D0 - D1; + for (; log2N > ELEMENTARY_LOG2SIZE; log2N -= 2, N >>= 2, M <<= 2) { + hipLaunchKernelGGL(kernels::fwtBatch2Kernel, grid, dim3(THREAD_N), 0, 0, d_Data, d_Data, N / 4); } - } - - cg::sync(cta); - for (int pos = threadIdx.x; pos < N; pos += blockDim.x) { - d_Dst[pos] = s_data[pos]; - } -} - - - -void mass_inner_product_two_vectors(int n, - int i, - double* vec1, - double* vec2, - double* mvec, - double* result) -{ - hipLaunchKernelGGL(MassIPTwoVec_kernel, dim3(i + 1), dim3(1024), 0, 0, vec1, vec2, mvec, result, i + 1, n); -} - -void mass_axpy(int n, int i, double* x, double* y, double* alpha) -{ - hipLaunchKernelGGL(massAxpy3_kernel, dim3((n + 384 - 1) / 384), dim3(384), 0, 0, n, i, x, y, alpha); -} - -void matrix_row_sums(int n, - int nnz, - int* a_ia, - double* a_val, - double* result) -{ - hipLaunchKernelGGL(matrixInfNormPart1, dim3(1000), dim3(1024), 0, 0, n, nnz, a_ia, a_val, result); -} - - -void vector_inf_norm(int n, - double* input, - double* buffer, - double* result) -{ - hipLaunchKernelGGL(vectorInfNorm, dim3(1024), dim3(1024), 0, 0, n, input, buffer); - hipDeviceSynchronize(); - hipLaunchKernelGGL(vectorInfNorm, dim3(1), dim3(1024), 0, 0, 1024, buffer, buffer); - hipMemcpy(result, buffer, sizeof(double) * 1, hipMemcpyDeviceToHost); -} - -void permuteVectorP(int n, - int* perm_vector, - double* vec_in, - double* vec_out) -{ - hipLaunchKernelGGL(permuteVectorP_kernel, dim3(1000), dim3(1024), 0, 0, n, perm_vector, vec_in, vec_out); -} - -void permuteVectorQ(int n, - int* perm_vector, - double* vec_in, - double* vec_out) -{ - hipLaunchKernelGGL(permuteVectorQ_kernel, dim3(1000), dim3(1024), 0, 0, n, perm_vector, vec_in, vec_out); -} - -void count_sketch_theta(int n, - int k, - int* labels, - int* flip, - double* input, - double* output) -{ - - hipLaunchKernelGGL(count_sketch_kernel, dim3(10000), dim3(1024), 0, 0, n, k, labels, flip, input, output); -} - -void FWHT_select(int k, - int* perm, - double* input, - double* output) -{ - hipLaunchKernelGGL(select_kernel, dim3(1000), dim3(1024), 0, 0,k, perm, input, output); -} - -void FWHT_scaleByD(int n, - int* D, - double* x, - double* y) -{ - hipLaunchKernelGGL(scaleByD_kernel, dim3(1000), dim3(1024), 0, 0, n, D, x, y); -} - -void FWHT(int M, int log2N, double* d_Data) { - - const int THREAD_N = 1024; - int N = 1 << log2N; - dim3 grid((1 << log2N) / (4 * THREAD_N), M, 1); - - for (; log2N > ELEMENTARY_LOG2SIZE; log2N -= 2, N >>= 2, M <<= 2) { - hipLaunchKernelGGL(fwtBatch2Kernel, grid, dim3(THREAD_N), 0, 0, d_Data, d_Data, N / 4); + hipLaunchKernelGGL(kernels::fwtBatch1Kernel, dim3(M), dim3(N / 4), N * sizeof(real_type), 0, d_Data, d_Data, log2N); } - hipLaunchKernelGGL(fwtBatch1Kernel, dim3(M), dim3(N / 4), N * sizeof(double), 0, d_Data, d_Data, log2N); -} +} // namespace ReSolve diff --git a/resolve/hip/hipVectorKernels.h b/resolve/hip/hipVectorKernels.h deleted file mode 100644 index cd23f822..00000000 --- a/resolve/hip/hipVectorKernels.h +++ /dev/null @@ -1,57 +0,0 @@ -#pragma once - -#include - -#include - -//***************************************************************************// -//**** See VectorKernels.hpp for kernel wrapper functions documentation ****// 
-//***************************************************************************// - -namespace ReSolve { namespace vector { - -namespace kernels { - // __global__ void adapt_diag_scale(index_type, index_type, real_type*, index_type*, index_type*, real_type*, index_type*, - // index_type*, real_type*, index_type*, index_type*, real_type*, real_type*, real_type*, real_type*); - - // __global__ void adapt_row_max(index_type, index_type, real_type*, index_type*, index_type*, real_type*, index_type*, - // index_type*, real_type*, index_type*, index_type*, real_type*); - - // __global__ void add_const(index_type, index_type, index_type*); - - /** - * @brief CUDA kernel that sets values of an array to a constant. - * - * @param[in] n - length of the array - * @param[in] val - the value the array is set to - * @param[out] arr - a pointer to the array - * - * @pre `arr` is allocated to size `n` - * @post `arr` elements are set to `val` - */ - __global__ void set_const(index_type n, real_type val, real_type* arr); - - // __global__ void add_vecs(index_type, real_type*, real_type, real_type*); - - // __global__ void mult_const(index_type, real_type, real_type*); - - // __global__ void add_diag(index_type, real_type, index_type*, index_type*, real_type*); - - // __global__ void inv_vec_scale(index_type, real_type*, real_type*); - - // __global__ void vec_scale(index_type, real_type*, real_type*); - - // __global__ void concatenate(index_type, index_type, index_type, index_type, real_type*, index_type*, index_type*, - // real_type*, index_type*, index_type*, real_type*, index_type*, index_type*); - - // __global__ void row_scale(index_type, real_type*, index_type*, index_type*, real_type*, real_type*, - // real_type*, real_type*); - - // __global__ void diag_scale(index_type, index_type, real_type*, index_type*, index_type*, real_type*, index_type*, - // index_type*, real_type*, real_type*, real_type*, index_type); - - // __global__ void row_max(index_type, index_type, real_type*, index_type*, index_type*, real_type*, index_type*, index_type*, - // real_type* scale); -} // namespace kernels - -}} // namespace ReSolve::vector \ No newline at end of file diff --git a/resolve/vector/VectorKernels.hpp b/resolve/vector/VectorKernels.hpp index 9f7d1bca..8e69ff38 100644 --- a/resolve/vector/VectorKernels.hpp +++ b/resolve/vector/VectorKernels.hpp @@ -1,4 +1,14 @@ +/** + * @file VectorKernels.hpp + * @author Slaven Peles (peless@ornl.gov) + * @brief Hardware agnostic prototypes of GPU vector kernels + * @date 2023-12-08 + * + * + */ + #pragma once +#include namespace ReSolve { namespace vector {
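// Caller-side sketch of the hardware-agnostic vector-kernel API introduced above.
// Illustrative only: the include path and the namespace-level index_type/real_type
// aliases are assumptions based on this patch, not verified against the tree.
#include <resolve/vector/VectorKernels.hpp>

void zero_device_array(ReSolve::index_type n, ReSolve::real_type* d_arr)
{
  // Backend-neutral call; links against either the CUDA or the HIP implementation.
  ReSolve::vector::set_array_const(n, 0.0, d_arr);
}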