From 6ad87aad21a25ecacede4f42c501ac1bdc031a9f Mon Sep 17 00:00:00 2001 From: Slaven Peles Date: Mon, 16 Oct 2023 21:45:48 -0400 Subject: [PATCH] Separate CPU and CUDA mmethods for csc2csr conversion. --- examples/r_KLU_KLU.cpp | 2 +- resolve/matrix/CMakeLists.txt | 1 - resolve/matrix/MatrixHandler.cpp | 62 ++----------- resolve/matrix/MatrixHandler.hpp | 3 +- resolve/matrix/MatrixHandlerCpu.cpp | 130 ++++++++++++++++----------- resolve/matrix/MatrixHandlerCpu.hpp | 3 +- resolve/matrix/MatrixHandlerCuda.cpp | 123 ++++++++++++------------- resolve/matrix/MatrixHandlerCuda.hpp | 3 +- resolve/matrix/MatrixHandlerImpl.hpp | 2 +- 9 files changed, 146 insertions(+), 183 deletions(-) diff --git a/examples/r_KLU_KLU.cpp b/examples/r_KLU_KLU.cpp index b1e87203..85c84921 100644 --- a/examples/r_KLU_KLU.cpp +++ b/examples/r_KLU_KLU.cpp @@ -132,7 +132,7 @@ int main(int argc, char *argv[]) status = KLU->solve(vec_rhs, vec_x); std::cout<<"KLU solve status: "<update(rhs, "cpu", "cuda"); + vec_r->update(rhs, "cpu", "cpu"); matrix_handler->setValuesChanged(true); diff --git a/resolve/matrix/CMakeLists.txt b/resolve/matrix/CMakeLists.txt index 8c74ad09..5d05df1b 100644 --- a/resolve/matrix/CMakeLists.txt +++ b/resolve/matrix/CMakeLists.txt @@ -26,7 +26,6 @@ set(Matrix_HEADER_INSTALL Csr.hpp Csc.hpp MatrixHandler.hpp - MatrixHandlerImpl.hpp ) # Build shared library ReSolve::matrix diff --git a/resolve/matrix/MatrixHandler.cpp b/resolve/matrix/MatrixHandler.cpp index 30c08991..7005046d 100644 --- a/resolve/matrix/MatrixHandler.cpp +++ b/resolve/matrix/MatrixHandler.cpp @@ -66,12 +66,6 @@ namespace ReSolve { cudaImpl_ = new MatrixHandlerCuda(new_workspace); } - bool MatrixHandler::getValuesChanged() - { - values_changed_ = cpuImpl_->isValuesChanged(); - return this->values_changed_; - } - void MatrixHandler::setValuesChanged(bool toWhat) { this->values_changed_ = toWhat; @@ -269,58 +263,16 @@ namespace ReSolve { int MatrixHandler::csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr, std::string memspace) { - //it ONLY WORKS WITH CUDA index_type error_sum = 0; if (memspace == "cuda") { - LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_; - - A_csr->allocateMatrixData("cuda"); - index_type n = A_csc->getNumRows(); - index_type m = A_csc->getNumRows(); - index_type nnz = A_csc->getNnz(); - size_t bufferSize; - void* d_work; - cusparseStatus_t status = cusparseCsr2cscEx2_bufferSize(workspaceCUDA->getCusparseHandle(), - n, - m, - nnz, - A_csc->getValues("cuda"), - A_csc->getColData("cuda"), - A_csc->getRowData("cuda"), - A_csr->getValues("cuda"), - A_csr->getRowData("cuda"), - A_csr->getColData("cuda"), - CUDA_R_64F, - CUSPARSE_ACTION_NUMERIC, - CUSPARSE_INDEX_BASE_ZERO, - CUSPARSE_CSR2CSC_ALG1, - &bufferSize); - error_sum += status; - mem_.allocateBufferOnDevice(&d_work, bufferSize); - status = cusparseCsr2cscEx2(workspaceCUDA->getCusparseHandle(), - n, - m, - nnz, - A_csc->getValues("cuda"), - A_csc->getColData("cuda"), - A_csc->getRowData("cuda"), - A_csr->getValues("cuda"), - A_csr->getRowData("cuda"), - A_csr->getColData("cuda"), - CUDA_R_64F, - CUSPARSE_ACTION_NUMERIC, - CUSPARSE_INDEX_BASE_ZERO, - CUSPARSE_CSR2CSC_ALG1, - d_work); - error_sum += status; - return error_sum; - mem_.deleteOnDevice(d_work); - } else { - out::error() << "Not implemented (yet)" << std::endl; + return cudaImpl_->csc2csr(A_csc, A_csr); + } else if (memspace == "cpu") { + out::warning() << "Using untested csc2csr on CPU ..." << std::endl; + return cpuImpl_->csc2csr(A_csc, A_csr); + } else { + out::error() << "csc2csr not implemented for " << memspace << " device." << std::endl; return -1; - } - - + } } } // namespace ReSolve diff --git a/resolve/matrix/MatrixHandler.hpp b/resolve/matrix/MatrixHandler.hpp index ea6abbcb..af444abd 100644 --- a/resolve/matrix/MatrixHandler.hpp +++ b/resolve/matrix/MatrixHandler.hpp @@ -7,7 +7,6 @@ #include #include -#include namespace ReSolve { @@ -23,6 +22,7 @@ namespace ReSolve class Csr; } class LinAlgWorkspace; + class MatrixHandlerImpl; } @@ -71,7 +71,6 @@ namespace ReSolve { std::string matrix_type, std::string memspace); int Matrix1Norm(matrix::Sparse *A, real_type* norm); - bool getValuesChanged(); void setValuesChanged(bool toWhat); private: diff --git a/resolve/matrix/MatrixHandlerCpu.cpp b/resolve/matrix/MatrixHandlerCpu.cpp index 5d61501d..d06a4ce5 100644 --- a/resolve/matrix/MatrixHandlerCpu.cpp +++ b/resolve/matrix/MatrixHandlerCpu.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -279,60 +280,83 @@ namespace ReSolve { return -1; } - // int MatrixHandlerCpu::csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr, std::string memspace) - // { - // //it ONLY WORKS WITH CUDA - // index_type error_sum = 0; - // if (memspace == "cuda") { - // LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_; - - // A_csr->allocateMatrixData("cuda"); - // index_type n = A_csc->getNumRows(); - // index_type m = A_csc->getNumRows(); - // index_type nnz = A_csc->getNnz(); - // size_t bufferSize; - // void* d_work; - // cusparseStatus_t status = cusparseCsr2cscEx2_bufferSize(workspaceCUDA->getCusparseHandle(), - // n, - // m, - // nnz, - // A_csc->getValues("cuda"), - // A_csc->getColData("cuda"), - // A_csc->getRowData("cuda"), - // A_csr->getValues("cuda"), - // A_csr->getRowData("cuda"), - // A_csr->getColData("cuda"), - // CUDA_R_64F, - // CUSPARSE_ACTION_NUMERIC, - // CUSPARSE_INDEX_BASE_ZERO, - // CUSPARSE_CSR2CSC_ALG1, - // &bufferSize); - // error_sum += status; - // mem_.allocateBufferOnDevice(&d_work, bufferSize); - // status = cusparseCsr2cscEx2(workspaceCUDA->getCusparseHandle(), - // n, - // m, - // nnz, - // A_csc->getValues("cuda"), - // A_csc->getColData("cuda"), - // A_csc->getRowData("cuda"), - // A_csr->getValues("cuda"), - // A_csr->getRowData("cuda"), - // A_csr->getColData("cuda"), - // CUDA_R_64F, - // CUSPARSE_ACTION_NUMERIC, - // CUSPARSE_INDEX_BASE_ZERO, - // CUSPARSE_CSR2CSC_ALG1, - // d_work); - // error_sum += status; - // return error_sum; - // mem_.deleteOnDevice(d_work); - // } else { - // out::error() << "Not implemented (yet)" << std::endl; - // return -1; - // } + /** + * @authors Slaven Peles , Daniel Reynolds (SMU), and + * David Gardner and Carol Woodward (LLNL) + */ + int MatrixHandlerCpu::csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr) + { + int error_sum = 0; + assert(A_csc->getNnz() == A_csr->getNnz()); + assert(A_csc->getNumRows() == A_csr->getNumColumns()); + assert(A_csr->getNumRows() == A_csc->getNumColumns()); + index_type nnz = A_csc->getNnz(); + index_type n = A_csc->getNumColumns(); - // } + index_type* rowIdxCsc = A_csc->getRowData("cpu"); + index_type* colPtrCsc = A_csc->getColData("cpu"); + real_type* valuesCsc = A_csc->getValues("cpu"); + + index_type* rowPtrCsr = A_csr->getRowData("cpu"); + index_type* colIdxCsr = A_csr->getColData("cpu"); + real_type* valuesCsr = A_csr->getValues("cpu"); + + // Set all CSR row pointers to zero + for (index_type i = 0; i <= n; ++i) { + rowPtrCsr[i] = 0; + } + + // Set all CSR values and column indices to zero + for (index_type i = 0; i < nnz; ++i) { + colIdxCsr[i] = 0; + valuesCsr[i] = 0.0; + } + + // Compute number of entries per row + for (index_type i = 0; i < nnz; ++i) { + rowPtrCsr[rowIdxCsc[i]]++; + } + + // Compute cumualtive sum of nnz per row + for (index_type row = 0, rowsum = 0; row < n; ++row) + { + // Store value in row pointer to temp + index_type temp = rowPtrCsr[row]; + + // Copy cumulative sum to the row pointer + rowPtrCsr[row] = rowsum; + + // Update row sum + rowsum += temp; + } + rowPtrCsr[n] = nnz; + + for (index_type col = 0; col < n; ++col) + { + // Compute positions of column indices and values in CSR matrix and store them there + // Overwrites CSR row pointers in the process + for (index_type jj = colPtrCsc[col]; jj < colPtrCsc[col+1]; jj++) + { + index_type row = rowIdxCsc[jj]; + index_type dest = rowPtrCsr[row]; + + colIdxCsr[dest] = col; + valuesCsr[dest] = valuesCsc[jj]; + + rowPtrCsr[row]++; + } + } + + // Restore CSR row pointer values + for (index_type row = 0, last = 0; row <= n; row++) + { + index_type temp = rowPtrCsr[row]; + rowPtrCsr[row] = last; + last = temp; + } + + return 0; + } } // namespace ReSolve diff --git a/resolve/matrix/MatrixHandlerCpu.hpp b/resolve/matrix/MatrixHandlerCpu.hpp index e4de02bf..60079a89 100644 --- a/resolve/matrix/MatrixHandlerCpu.hpp +++ b/resolve/matrix/MatrixHandlerCpu.hpp @@ -40,7 +40,7 @@ namespace ReSolve { MatrixHandlerCpu(LinAlgWorkspace* workspace); virtual ~MatrixHandlerCpu(); - // int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr, std::string memspace); //memspace decides on what is returned (cpu or cuda pointer) + int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr); //memspace decides on what is returned (cpu or cuda pointer) // int coo2csr(matrix::Coo* A_coo, matrix::Csr* A_csr, std::string memspace); /// Should compute vec_result := alpha*A*vec_x + beta*vec_result, but at least on cpu alpha and beta are flipped @@ -51,7 +51,6 @@ namespace ReSolve { const real_type* beta, std::string matrix_type); virtual int Matrix1Norm(matrix::Sparse *A, real_type* norm); - // bool getValuesChanged(); // void setValuesChanged(bool toWhat); private: diff --git a/resolve/matrix/MatrixHandlerCuda.cpp b/resolve/matrix/MatrixHandlerCuda.cpp index bd151cbf..31515398 100644 --- a/resolve/matrix/MatrixHandlerCuda.cpp +++ b/resolve/matrix/MatrixHandlerCuda.cpp @@ -241,7 +241,6 @@ namespace ReSolve { if (matrixFormat == "csr") { matrix::Csr* A = dynamic_cast(Ageneric); //result = alpha *A*x + beta * result - // if (memspace == "cuda" ) { cusparseStatus_t status; // std::cout << "Matvec on NVIDIA GPU ...\n"; LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_; @@ -260,16 +259,16 @@ namespace ReSolve { cusparseHandle_t handle_cusparse = workspaceCUDA->getCusparseHandle(); if (values_changed_) { status = cusparseCreateCsr(&matA, - A->getNumRows(), - A->getNumColumns(), - A->getNnzExpanded(), - A->getRowData("cuda"), - A->getColData("cuda"), - A->getValues("cuda"), - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, - CUDA_R_64F); + A->getNumRows(), + A->getNumColumns(), + A->getNnzExpanded(), + A->getRowData("cuda"), + A->getColData("cuda"), + A->getValues("cuda"), + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_64F); error_sum += status; values_changed_ = false; } @@ -328,60 +327,52 @@ namespace ReSolve { return -1; } - // int MatrixHandlerCuda::csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr, std::string memspace) - // { - // //it ONLY WORKS WITH CUDA - // index_type error_sum = 0; - // if (memspace == "cuda") { - // LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_; - - // A_csr->allocateMatrixData("cuda"); - // index_type n = A_csc->getNumRows(); - // index_type m = A_csc->getNumRows(); - // index_type nnz = A_csc->getNnz(); - // size_t bufferSize; - // void* d_work; - // cusparseStatus_t status = cusparseCsr2cscEx2_bufferSize(workspaceCUDA->getCusparseHandle(), - // n, - // m, - // nnz, - // A_csc->getValues("cuda"), - // A_csc->getColData("cuda"), - // A_csc->getRowData("cuda"), - // A_csr->getValues("cuda"), - // A_csr->getRowData("cuda"), - // A_csr->getColData("cuda"), - // CUDA_R_64F, - // CUSPARSE_ACTION_NUMERIC, - // CUSPARSE_INDEX_BASE_ZERO, - // CUSPARSE_CSR2CSC_ALG1, - // &bufferSize); - // error_sum += status; - // mem_.allocateBufferOnDevice(&d_work, bufferSize); - // status = cusparseCsr2cscEx2(workspaceCUDA->getCusparseHandle(), - // n, - // m, - // nnz, - // A_csc->getValues("cuda"), - // A_csc->getColData("cuda"), - // A_csc->getRowData("cuda"), - // A_csr->getValues("cuda"), - // A_csr->getRowData("cuda"), - // A_csr->getColData("cuda"), - // CUDA_R_64F, - // CUSPARSE_ACTION_NUMERIC, - // CUSPARSE_INDEX_BASE_ZERO, - // CUSPARSE_CSR2CSC_ALG1, - // d_work); - // error_sum += status; - // return error_sum; - // mem_.deleteOnDevice(d_work); - // } else { - // out::error() << "Not implemented (yet)" << std::endl; - // return -1; - // } - - - // } + int MatrixHandlerCuda::csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr) + { + index_type error_sum = 0; + LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_; + + A_csr->allocateMatrixData("cuda"); + index_type n = A_csc->getNumRows(); + index_type m = A_csc->getNumRows(); + index_type nnz = A_csc->getNnz(); + size_t bufferSize; + void* d_work; + cusparseStatus_t status = cusparseCsr2cscEx2_bufferSize(workspaceCUDA->getCusparseHandle(), + n, + m, + nnz, + A_csc->getValues("cuda"), + A_csc->getColData("cuda"), + A_csc->getRowData("cuda"), + A_csr->getValues("cuda"), + A_csr->getRowData("cuda"), + A_csr->getColData("cuda"), + CUDA_R_64F, + CUSPARSE_ACTION_NUMERIC, + CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG1, + &bufferSize); + error_sum += status; + mem_.allocateBufferOnDevice(&d_work, bufferSize); + status = cusparseCsr2cscEx2(workspaceCUDA->getCusparseHandle(), + n, + m, + nnz, + A_csc->getValues("cuda"), + A_csc->getColData("cuda"), + A_csc->getRowData("cuda"), + A_csr->getValues("cuda"), + A_csr->getRowData("cuda"), + A_csr->getColData("cuda"), + CUDA_R_64F, + CUSPARSE_ACTION_NUMERIC, + CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG1, + d_work); + error_sum += status; + return error_sum; + mem_.deleteOnDevice(d_work); + } } // namespace ReSolve diff --git a/resolve/matrix/MatrixHandlerCuda.hpp b/resolve/matrix/MatrixHandlerCuda.hpp index 1e411ad1..ae0c3709 100644 --- a/resolve/matrix/MatrixHandlerCuda.hpp +++ b/resolve/matrix/MatrixHandlerCuda.hpp @@ -40,7 +40,7 @@ namespace ReSolve { MatrixHandlerCuda(LinAlgWorkspace* workspace); virtual ~MatrixHandlerCuda(); - // int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr, std::string memspace); //memspace decides on what is returned (cpu or cuda pointer) + int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr); //memspace decides on what is returned (cpu or cuda pointer) // int coo2csr(matrix::Coo* A_coo, matrix::Csr* A_csr, std::string memspace); /// Should compute vec_result := alpha*A*vec_x + beta*vec_result, but at least on cpu alpha and beta are flipped @@ -51,7 +51,6 @@ namespace ReSolve { const real_type* beta, std::string matrix_type); virtual int Matrix1Norm(matrix::Sparse *A, real_type* norm); - // bool getValuesChanged(); // void setValuesChanged(bool toWhat); private: diff --git a/resolve/matrix/MatrixHandlerImpl.hpp b/resolve/matrix/MatrixHandlerImpl.hpp index 5a1d6b6f..ba24d498 100644 --- a/resolve/matrix/MatrixHandlerImpl.hpp +++ b/resolve/matrix/MatrixHandlerImpl.hpp @@ -41,7 +41,7 @@ namespace ReSolve { virtual ~MatrixHandlerImpl() {} - // int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr, std::string memspace); //memspace decides on what is returned (cpu or cuda pointer) + virtual int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr) = 0; // int coo2csr(matrix::Coo* A_coo, matrix::Csr* A_csr, std::string memspace); /// Should compute vec_result := alpha*A*vec_x + beta*vec_result, but at least on cpu alpha and beta are flipped