diff --git a/CMakeLists.txt b/CMakeLists.txt index f846585d1..843d9b773 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -229,6 +229,7 @@ set(BML_ROCSPARSE FALSE CACHE BOOL "Whether to compile with rocSPARSE support") set(BML_CUDA FALSE CACHE BOOL "Whether to compile with CUDA support") set(BML_CUSOLVER FALSE CACHE BOOL "Whether to compile with cuSolver support") set(BML_MAGMA FALSE CACHE BOOL "Whether to use MAGMA library") +set(BML_HYPRE FALSE CACHE BOOL "Whether to use HYPRE library") set(BML_OMP_OFFLOAD OFF CACHE BOOL "Compile with OpenMP GPU Offload support") set(BML_OFFLOAD_ARCH "NVIDIA" CACHE STRING "Offload architecture") set_property(CACHE BML_OFFLOAD_ARCH PROPERTY STRINGS "NVIDIA" "AMD") @@ -306,7 +307,7 @@ if(BML_OMP_OFFLOAD) endif() endif() -set(BML_USE_DEVICE (BML_CUDA OR BML_MAGMA OR BML_CUSOLVER OR BML_ROCSOLVER OR BML_CUSPARSE OR BML_ROCSPARSE)) +set(BML_USE_DEVICE (BML_CUDA OR BML_MAGMA OR BML_CUSOLVER OR BML_ROCSOLVER OR BML_CUSPARSE OR BML_ROCSPARSE OR BML_HYPRE)) if(BML_USE_DEVICE) message(STATUS "Will use device libraries for some solvers") if (CUDAToolkit_FOUND) @@ -390,6 +391,53 @@ if(BML_MAGMA) endif() endif() +if(BML_HYPRE) + message(STATUS "Search for HYPRE...") + find_package(HYPRE REQUIRED) + + if (${HYPRE_FOUND}) + add_definitions(-DBML_USE_HYPRE) + message(STATUS + "HYPRE was found:\n" + " HYPRE_INCLUDE_DIRS: ${HYPRE_INCLUDE_DIRS}\n" + " HYPRE_LIBRARY_DIRS: ${HYPRE_LIBRARY_DIRS}\n" + " HYPRE_LIBRARIES: ${HYPRE_LIBRARIES}" + ) + include_directories(${HYPRE_INCLUDE_DIRS}) + + list(APPEND LINK_LIBRARIES "-L${HYPRE_LIBRARY_DIRS}") + list(APPEND LINK_LIBRARIES ${HYPRE_LIBRARIES}) + + if (CUDAToolkit_FOUND) + list(APPEND LINK_LIBRARIES "-L${CUDAToolkit_LIBRARY_DIR}") + list(APPEND LINK_LIBRARIES ${CUDA_cublas_LIBRARY}) + list(APPEND LINK_LIBRARIES ${CUDA_cudart_LIBRARY}) + include_directories(${CUDAToolkit_INCLUDE_DIR}) + +# add_definitions(-DBML_USE_CUSPARSE) + list(APPEND LINK_LIBRARIES ${CUDA_cusparse_LIBRARY}) + list(APPEND LINK_LIBRARIES ${CUDA_curand_LIBRARY}) + elseif(hip_FOUND) + find_package(rocblas REQUIRED) + list(APPEND LINK_LIBRARIES ${rocblas_LIBRARIES}) + include_directories(${rocblas_INCLUDE_DIRS}) + +# add_definitions(-DBML_USE_rocSPARSE) + find_package(rocsparse REQUIRED) + include_directories(${rocsparse_INCLUDE_DIRS}) + list(APPEND LINK_LIBRARIES ${rocsparse_LIBRARIES}) + find_package(rocrand REQUIRED) + list(APPEND LINK_LIBRARIES ${rocrand_LIBRARIES}) + + include_directories(${hip_INCLUDE_DIRS}) + endif() + list(APPEND LINK_LIBRARIES -lstdc++) + message(STATUS "LINK_LIBRARIES: ${LINK_LIBRARIES}") + else() + message(FATAL_ERROR "HYPRE was not found") + endif() +endif() + set(BML_ELPA FALSE CACHE BOOL "Whether to use ELPA library") if(BML_ELPA) message(STATUS "Search for ELPA in directory ${ELPA_DIR}\n") diff --git a/build.sh b/build.sh index 762712714..fa3c16af8 100755 --- a/build.sh +++ b/build.sh @@ -89,6 +89,7 @@ EOF echo "BML_MAGMA Build with MAGMA (default is ${BML_MAGMA})" echo "BML_CUSOLVER Build with cuSOLVER (default is ${BML_CUSOLVER})" echo "BML_CUSPARSE Build with cuSPARSE (default is ${BML_CUSPARSE})" + echo "BML_HYPRE Build with HYPRE (default is ${BML_HYPRE})" echo "BML_ROCSOLVER Build with rocSOLVER (default is ${BML_ROCSOLVER})" echo "BML_ROCSPARSE Build with rocSPARSE (default is ${BML_ROCSPARSE})" echo "BML_SYEVD Build with SYEVD (default is ${BML_SYEVD})" @@ -144,6 +145,7 @@ set_defaults() { : ${BML_MAGMA:=no} : ${BML_CUSOLVER:=no} : ${BML_CUSPARSE:=no} + : ${BML_HYPRE:=no} : ${BML_ROCSOLVER:=no} : ${BML_ROCSPARSE:=no} : ${BML_SYEVD:=yes} @@ -205,8 +207,8 @@ is_enabled() { } sanity_check() { - if (is_enabled ${BML_CUSPARSE} || is_enabled ${BML_ROCSPARSE}) && ! is_enabled ${BML_OMP_OFFLOAD}; then - die "In order to enable BML_CUSPARSE or BML_ROCSPARSE, BML_OMP_OFFLOAD needs to be enabled as well" + if (is_enabled ${BML_CUSPARSE} || is_enabled ${BML_ROCSPARSE} || is_enabled ${BML_HYPRE}) && ! is_enabled ${BML_OMP_OFFLOAD}; then + die "In order to enable BML_CUSPARSE or BML_ROCSPARSE or BML_HYPRE, BML_OMP_OFFLOAD needs to be enabled as well" fi } @@ -256,6 +258,7 @@ configure() { -DBML_MAGMA="${BML_MAGMA}" \ -DBML_CUSOLVER="${BML_CUSOLVER}" \ -DBML_CUSPARSE="${BML_CUSPARSE}" \ + -DBML_HYPRE="${BML_HYPRE}" \ -DBML_ROCSOLVER="${BML_ROCSOLVER}" \ -DBML_ROCSPARSE="${BML_ROCSPARSE}" \ -DBML_SYEVD="${BML_SYEVD}" \ diff --git a/cmake/FindHYPRE.cmake b/cmake/FindHYPRE.cmake new file mode 100644 index 000000000..7a0b53441 --- /dev/null +++ b/cmake/FindHYPRE.cmake @@ -0,0 +1,36 @@ +# - Find the HYPRE library +# +# Usage: +# find_package(HYPRE [REQUIRED] [QUIET] ) +# +# It sets the following variables: +# HYPRE_FOUND ... true if HYPRE is found on the system +# HYPRE_LIBRARY_DIRS ... full path to HYPRE library +# HYPRE_INCLUDE_DIRS ... HYPRE include directory +# HYPRE_LIBRARIES ... HYPRE libraries +# +# The following variables will be checked by the function +# HYPRE_USE_STATIC_LIBS ... if true, only static libraries are found +# HYPRE_ROOT ... if set, the libraries are exclusively searched +# under this path + +#If environment variable HYPRE_ROOT is specified, it has same effect as HYPRE_ROOT +if( NOT HYPRE_ROOT AND NOT $ENV{HYPRE_ROOT} STREQUAL "" ) + set( HYPRE_ROOT $ENV{HYPRE_ROOT} ) + # set library directories + set(HYPRE_LIBRARY_DIRS ${HYPRE_ROOT}/lib) + # set include directories + set(HYPRE_INCLUDE_DIRS ${HYPRE_ROOT}/include) + # set libraries + find_library( + HYPRE_LIBRARIES + NAMES "HYPRE" + PATHS ${HYPRE_ROOT} + PATH_SUFFIXES "lib" + NO_DEFAULT_PATH + ) + set(HYPRE_FOUND TRUE) +else() + set(HYPRE_FOUND FALSE) +endif() + diff --git a/scripts/build_lassen_xl_offload.sh b/scripts/build_lassen_xl_offload_whypre.sh similarity index 57% rename from scripts/build_lassen_xl_offload.sh rename to scripts/build_lassen_xl_offload_whypre.sh index 370d5caf0..460f1aab4 100755 --- a/scripts/build_lassen_xl_offload.sh +++ b/scripts/build_lassen_xl_offload_whypre.sh @@ -2,20 +2,25 @@ # Make sure all the paths are correct -source setenv_lassen_offload.sh +source scripts/setenv_lassen_offload.sh rm -r build rm -r install MY_PATH=$(pwd) -export CC=${CC:=xlc-gpu} +HYPRE_INSTALL_PATH="/usr/WS1/osei/soft/CoPA/lassen/gpu/fork/bml-hypre/hypre/src/hypre" + +export CC=${CC:=xlc++-gpu} export FC=${FC:=xlf2003-gpu} export CXX=${CXX:=xlc++-gpu} export BLAS_VENDOR=${BLAS_VENDOR:=Auto} export BML_OPENMP=${BML_OPENMP:=yes} export BML_OMP_OFFLOAD=${BML_OMP_OFFLOAD:=yes} -export BML_CUSPARSE=${BML_CUSPARSE:=yes} +export BML_OFFLOAD_ARCH=${BML_OFFLOAD_ARCH:=NVIDIA} +export BML_CUSPARSE=${BML_CUSPARSE:=no} +export BML_HYPRE=${BML_HYPRE:=yes} +export HYPRE_ROOT=${HYPRE_INSTALL_PATH} export BML_COMPLEX=${BML_COMPLEX:=no} export INSTALL_DIR=${INSTALL_DIR:="${MY_PATH}/install"} export BML_TESTING=${BML_TESTING:=yes} @@ -23,7 +28,10 @@ export CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:=Release} export EXTRA_CFLAGS=${EXTRA_CFLAGS:=""} export EXTRA_LINK_FLAGS=${EXTRA_LINK_FLAGS:=""} #export EXTRA_LINK_FLAGS=${EXTRA_LINK_FLAGS:="-lm -L/usr/tce/packages/xl/xl-2021.03.11/xlC/16.1.1/lib -libmc++"} -export BLAS_LIBRARIES=${BLAS_LIBRARIES:="-L${LAPACK_DIR} -llapack -lblas"} +export BLAS_LIBRARIES=${BLAS_LIBRARIES:="-L${ESSLLIBDIR64} -lesslsmp"} +export LAPACK_LIBRARIES=${LAPACK_LIBRARIES:="-L${LAPACK_DIR} -llapack"} + +export CUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR=$CUDA_HOME} ./build.sh configure diff --git a/scripts/build_olcf_summit_gnu_offload_openblas_whypre.sh b/scripts/build_olcf_summit_gnu_offload_openblas_whypre.sh new file mode 100755 index 000000000..0162a9b3a --- /dev/null +++ b/scripts/build_olcf_summit_gnu_offload_openblas_whypre.sh @@ -0,0 +1,44 @@ +#!/bin/bash +module load cmake +module load cuda +module load gcc/11.2.0 +module load openblas + +export CUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR="/sw/summit/cuda/11.0.3"} + +rm -r build +rm -r install + +MY_PATH=$(pwd) + +# change this to path of hypre installation. +# build hypre with: ./configure --with-cuda --without-MPI CUCC=nvcc +# using gcc-9 compilers. +HYPRE_INSTALL_PATH="/ccs/home/osei/soft/CoPA/with-hypre/hypre/src/hypre" + +#get jsrun with full path +JSRUN=$(which jsrun) +echo ${JSRUN} + +export CC=${CC:=gcc} +export FC=${FC:=gfortran} +export CXX=${CXX:=g++} +export BLAS_VENDOR=${BLAS_VENDOR:=OpenBLAS} +export BML_OPENMP=${BML_OPENMP:=yes} +export BML_OMP_OFFLOAD=${BML_OMP_OFFLOAD:=yes} +export BML_HYPRE=${BML_HYPRE:=yes} +export HYPRE_ROOT=${HYPRE_INSTALL_PATH} +export INSTALL_DIR=${INSTALL_DIR:="${MY_PATH}/install"} +export BML_TESTING=${BML_TESTING:=yes} +export BML_COMPLEX=${BML_COMPLEX:=no} +export CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:=Release} +export EXTRA_LINK_FLAGS=${EXTRA_LINK_FLAGS:="-fopenmp -latomic -lm"} +export BML_CUSPARSE=${BML_CUSPARSE:=no} +export BML_COMPLEX=${BML_COMPLEX:=no} +export BML_SYEVD=${BML_SYEVD:=no} + +#use jsrun to run tests on a compute node +export BML_NONMPI_PRECOMMAND=${BML_NONMPI_PRECOMMAND:=${JSRUN}} +export BML_NONMPI_PRECOMMAND_ARGS=${BML_NONMPI_PRECOMMAND_ARGS:="-n1;-a1;-g1;-c7"} + +./build.sh install diff --git a/scripts/setenv_lassen_offload.sh b/scripts/setenv_lassen_offload.sh index 012370189..3b1a56a1d 100644 --- a/scripts/setenv_lassen_offload.sh +++ b/scripts/setenv_lassen_offload.sh @@ -1,10 +1,8 @@ #!/bin/bash #module purge -module load cmake -module load xl/2021.03.11-cuda-11.2.0 -module load cuda/11.2.0 +module load cmake/3.23.1 +module load xl/2022.03.10-cuda-11.8.0 +module load cuda/11.8.0 module load lapack/3.9.0-xl-2020.11.12 -#module load essl -export CUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR="/usr/tce/packages/cuda/cuda-11.2.0"} - +module load essl diff --git a/src/C-interface/bml_logger.c b/src/C-interface/bml_logger.c index 38a8d10f5..41485a1c4 100644 --- a/src/C-interface/bml_logger.c +++ b/src/C-interface/bml_logger.c @@ -192,4 +192,7 @@ bml_print_version( #ifdef BML_USE_ROCSPARSE fprintf(stdout, "BML uses AMD rocSparse\n"); #endif +#ifdef BML_USE_HYPRE + fprintf(stdout, "BML uses hypre library\n"); +#endif } diff --git a/src/C-interface/ellpack/bml_add_ellpack.h b/src/C-interface/ellpack/bml_add_ellpack.h index 23b7c6d64..958ef57f4 100644 --- a/src/C-interface/ellpack/bml_add_ellpack.h +++ b/src/C-interface/ellpack/bml_add_ellpack.h @@ -200,4 +200,40 @@ void bml_add_rocsparse_ellpack_double_complex( double beta, double threshold); #endif +#if defined(BML_USE_HYPRE) +void bml_add_hypre_ellpack( + bml_matrix_ellpack_t * A, + bml_matrix_ellpack_t * B, + double alpha, + double beta, + double threshold); + +void bml_add_hypre_ellpack_single_real( + bml_matrix_ellpack_t * A, + bml_matrix_ellpack_t * B, + double alpha, + double beta, + double threshold); + +void bml_add_hypre_ellpack_double_real( + bml_matrix_ellpack_t * A, + bml_matrix_ellpack_t * B, + double alpha, + double beta, + double threshold); + +void bml_add_hypre_ellpack_single_complex( + bml_matrix_ellpack_t * A, + bml_matrix_ellpack_t * B, + double alpha, + double beta, + double threshold); + +void bml_add_hypre_ellpack_double_complex( + bml_matrix_ellpack_t * A, + bml_matrix_ellpack_t * B, + double alpha, + double beta, + double threshold); +#endif #endif diff --git a/src/C-interface/ellpack/bml_add_ellpack_typed.c b/src/C-interface/ellpack/bml_add_ellpack_typed.c index 949135f67..7bed4edb0 100644 --- a/src/C-interface/ellpack/bml_add_ellpack_typed.c +++ b/src/C-interface/ellpack/bml_add_ellpack_typed.c @@ -29,6 +29,12 @@ */ #endif +#if defined(BML_USE_HYPRE) +#include "_hypre_utilities.h" +#include "HYPRE.h" +#include "seq_mv.h" +#endif + /** Matrix addition. * * \f$ A = \alpha A + \beta B \f$ @@ -72,6 +78,8 @@ void TYPED_FUNC( TYPED_FUNC(bml_add_cusparse_ellpack) (A, B, alpha, beta, threshold); #elif defined(BML_USE_ROCSPARSE) TYPED_FUNC(bml_add_rocsparse_ellpack) (A, B, alpha, beta, threshold); +#elif defined(BML_USE_HYPRE) + TYPED_FUNC(bml_add_hypre_ellpack) (A, B, alpha, beta, threshold); #else //Should be safe to use BML_OFFLOAD_CHUNKS here but preserving old version @@ -965,4 +973,150 @@ void TYPED_FUNC( BML_CHECK_ROCSPARSE(rocsparse_destroy_mat_descr(matC_tmp)); BML_CHECK_ROCSPARSE(rocsparse_destroy_handle(handle)); } + +#elif defined(BML_USE_HYPRE) +void TYPED_FUNC( + bml_add_hypre_ellpack) ( + bml_matrix_ellpack_t * A, + bml_matrix_ellpack_t * B, + double alpha1, + double beta1, + double threshold1) +{ + /* Need "in-place" implementation of addition + * First do cuSPARSE addition: C = aA + bB, then do a copy from C to A. + */ + int A_N = A->N; + int A_M = A->M; + + int B_N = B->N; + int B_M = B->M; + + int *csrColIndA = A->csrColInd; + int *csrColIndB = B->csrColInd; + int *csrColIndC = NULL; + int *csrRowPtrA = A->csrRowPtr; + int *csrRowPtrB = B->csrRowPtr; + int *csrRowPtrC = NULL; + REAL_T *csrValA = (REAL_T *) A->csrVal; + REAL_T *csrValB = (REAL_T *) B->csrVal; + REAL_T *csrValC = NULL; + + /* hypre CSR matrix objects */ + hypre_CSRMatrix *matA; + hypre_CSRMatrix *matB; + hypre_CSRMatrix *matC; + + REAL_T alpha = (REAL_T) alpha1; + REAL_T beta = (REAL_T) beta1; + REAL_T threshold = (REAL_T) threshold1; + + // convert ellpack to cucsr + TYPED_FUNC(bml_ellpack2cucsr_ellpack) (A); + TYPED_FUNC(bml_ellpack2cucsr_ellpack) (B); + + // Create hypre csr matrices A and B + // Note: The following update is not necessary since the ellpack2cucsr + // routine updates the csr rowpointers on host and device +//#pragma omp target update from(csrRowPtrA[:A_N+1]) +//#pragma omp target update from(csrRowPtrB[:B_N+1]) + int nnzA = csrRowPtrA[A_N]; + int nnzB = csrRowPtrB[B_N]; + +// HYPRE_Init(); +// HYPRE_SetExecutionPolicy(HYPRE_EXEC_DEVICE); + + int use_vendor=0; + HYPRE_SetSpAddUseVendor(use_vendor); + hypre_SetSpAddAlgorithm(1); + /* create hypre csr matrix */ + matA = hypre_CSRMatrixCreate( A_N,A_N,nnzA ); + matB = hypre_CSRMatrixCreate( B_N,B_N,nnzB ); + +#pragma omp target data use_device_ptr(csrRowPtrA,csrColIndA,csrValA, \ + csrRowPtrB,csrColIndB,csrValB) + { + hypre_CSRMatrixI(matA) = csrRowPtrA; + hypre_CSRMatrixJ(matA) = csrColIndA; + hypre_CSRMatrixData(matA) = csrValA; + + hypre_CSRMatrixI(matB) = csrRowPtrB; + hypre_CSRMatrixJ(matB) = csrColIndB; + hypre_CSRMatrixData(matB) = csrValB; + } + + /* Add matrices */ + matC = hypre_CSRMatrixAddDevice(alpha, matA, beta, matB); + + /* threshold - drop small entries */ + if (is_above_threshold(threshold, BML_REAL_MIN)) + { + int nnzC = hypre_CSRMatrixNumNonzeros(matC); + REAL_T *elmt_tol = + (REAL_T *) malloc(sizeof(REAL_T) * nnzC); + // Allocate the working arrays on the device +#pragma omp target enter data map(alloc:elmt_tol[:nnzC]) + +#pragma omp target teams distribute parallel for + for(int i = 0; icsrColInd; int *csrRowPtr = A->csrRowPtr; REAL_T *csrVal = A->csrVal; @@ -57,7 +57,7 @@ void TYPED_FUNC( bml_free_memory(A->index); bml_free_memory(A->nnz); -#if defined(BML_USE_CUSPARSE) || defined(BML_USE_ROCSPARSE) +#if defined(BML_USE_CUSPARSE) || defined(BML_USE_ROCSPARSE) || defined(BML_USE_HYPRE) bml_free_memory(A->csrRowPtr); bml_free_memory(A->csrColInd); bml_free_memory(A->csrVal); @@ -171,7 +171,7 @@ bml_matrix_ellpack_t #pragma omp target enter data map(alloc:A_value[:N*M], A_index[:N*M], A_nnz[:N]) #pragma omp target update to(A_value[:N*M], A_index[:N*M], A_nnz[:N]) -#if defined(BML_USE_CUSPARSE) || defined(BML_USE_ROCSPARSE) +#if defined(BML_USE_CUSPARSE) || defined(BML_USE_ROCSPARSE) || defined(BML_USE_HYPRE) A->csrColInd = bml_noinit_allocate_memory(sizeof(int) * N * M); A->csrRowPtr = bml_allocate_memory(sizeof(int) * (N + 1)); A->csrVal = bml_noinit_allocate_memory(sizeof(REAL_T) * N * M); @@ -221,7 +221,7 @@ bml_matrix_ellpack_t *TYPED_FUNC( A->index = bml_allocate_memory(sizeof(int) * N * M); A->nnz = bml_allocate_memory(sizeof(int) * N); A->value = bml_allocate_memory(sizeof(REAL_T) * N * M); -#if defined(BML_USE_CUSPARSE) || defined(BML_USE_ROCSPARSE) +#if defined(BML_USE_CUSPARSE) || defined(BML_USE_ROCSPARSE) || defined(BML_USE_HYPRE) A->csrColInd = bml_allocate_memory(sizeof(int) * N * M); A->csrRowPtr = bml_allocate_memory(sizeof(int) * (N + 1)); A->csrVal = bml_allocate_memory(sizeof(REAL_T) * N * M); @@ -235,7 +235,7 @@ bml_matrix_ellpack_t *TYPED_FUNC( int *A_nnz = A->nnz; int *A_index = A->index; int NM = N * M; -#if defined(BML_USE_CUSPARSE) || defined(BML_USE_ROCSPARSE) +#if defined(BML_USE_CUSPARSE) || defined(BML_USE_ROCSPARSE) || defined(BML_USE_HYPRE) int *csrColInd = A->csrColInd; int *csrRowPtr = A->csrRowPtr; REAL_T *csrVal = A->csrVal; @@ -258,7 +258,7 @@ bml_matrix_ellpack_t *TYPED_FUNC( } } -#if defined(BML_USE_CUSPARSE) || defined(BML_USE_ROCSPARSE) +#if defined(BML_USE_CUSPARSE) || defined(BML_USE_ROCSPARSE) || defined(BML_USE_HYPRE) #pragma omp target enter data map(to:csrVal[:N*M], csrColInd[:N*M], csrRowPtr[:N+1]) #endif #endif @@ -375,7 +375,7 @@ bml_matrix_ellpack_t *TYPED_FUNC( col_marker[col] = 1; nnz_row++; } - /* generate random column index 0 >= col < N */ + /* generate random column index 0 <= col < N */ col = rand() % N; } /* update nnz of row */ @@ -656,7 +656,7 @@ void TYPED_FUNC( BML_CHECK_ROCSPARSE(rocsparse_destroy_mat_descr(mat_tmp)); } #endif -#if defined(BML_USE_CUSPARSE) || defined(BML_USE_ROCSPARSE) +#if defined(BML_USE_CUSPARSE) || defined(BML_USE_ROCSPARSE) || defined(BML_USE_HYPRE) /** Ellpack to cuCSR conversion. * * Convert from Ellpack format to cusparse csr format. diff --git a/src/C-interface/ellpack/bml_multiply_ellpack.h b/src/C-interface/ellpack/bml_multiply_ellpack.h index 400e0c377..da56bf2f3 100644 --- a/src/C-interface/ellpack/bml_multiply_ellpack.h +++ b/src/C-interface/ellpack/bml_multiply_ellpack.h @@ -203,4 +203,41 @@ void bml_multiply_rocsparse_ellpack_double_complex( double beta1, double threshold); #endif +#if defined(BML_USE_HYPRE) +void bml_multiply_hypre_ellpack( + bml_matrix_ellpack_t * A, + bml_matrix_ellpack_t * B, + bml_matrix_ellpack_t * C, + double alpha1, + double beta1, + double threshold); +void bml_multiply_hypre_ellpack_single_real( + bml_matrix_ellpack_t * A, + bml_matrix_ellpack_t * B, + bml_matrix_ellpack_t * C, + double alpha1, + double beta1, + double threshold); +void bml_multiply_hypre_ellpack_double_real( + bml_matrix_ellpack_t * A, + bml_matrix_ellpack_t * B, + bml_matrix_ellpack_t * C, + double alpha1, + double beta1, + double threshold); +void bml_multiply_hypre_ellpack_single_complex( + bml_matrix_ellpack_t * A, + bml_matrix_ellpack_t * B, + bml_matrix_ellpack_t * C, + double alpha1, + double beta1, + double threshold); +void bml_multiply_hypre_ellpack_double_complex( + bml_matrix_ellpack_t * A, + bml_matrix_ellpack_t * B, + bml_matrix_ellpack_t * C, + double alpha1, + double beta1, + double threshold); +#endif #endif diff --git a/src/C-interface/ellpack/bml_multiply_ellpack_typed.c b/src/C-interface/ellpack/bml_multiply_ellpack_typed.c index 268553f7a..7001670db 100644 --- a/src/C-interface/ellpack/bml_multiply_ellpack_typed.c +++ b/src/C-interface/ellpack/bml_multiply_ellpack_typed.c @@ -33,6 +33,12 @@ //#include // needed for hipDeviceSynchronize() #endif +#if defined(BML_USE_HYPRE) +#include "_hypre_utilities.h" +#include "HYPRE.h" +#include "seq_mv.h" +#endif + /** Matrix multiply. * * \f$ C \leftarrow \alpha A \, B + \beta C \f$ @@ -71,6 +77,10 @@ void TYPED_FUNC( #if defined(BML_USE_ROCSPARSE) TYPED_FUNC(bml_multiply_rocsparse_ellpack) (A, B, C, alpha, beta, threshold); +#elif defined(BML_USE_HYPRE) + TYPED_FUNC(bml_multiply_hypre_ellpack) (A, B, C, alpha, beta, + threshold); + #else if (A == B && alpha == ONE && beta == ZERO) { @@ -147,7 +157,7 @@ void *TYPED_FUNC( int rowMin = X_localRowMin[myRank]; int rowMax = X_localRowMax[myRank]; -#if defined(BML_USE_CUSPARSE) || defined(BML_USE_ROCSPARSE) +#if defined(BML_USE_CUSPARSE) || defined(BML_USE_ROCSPARSE) || defined(BML_USE_HYPRE) double alpha = 1.0; double beta = 0.0; @@ -159,6 +169,9 @@ void *TYPED_FUNC( TYPED_FUNC(bml_multiply_rocsparse_ellpack) (X, X, X2, alpha, beta, threshold); +#elif defined(BML_USE_HYPRE) + TYPED_FUNC(bml_multiply_hypre_ellpack) (X, X, X2, alpha, beta, + threshold); #endif traceX = TYPED_FUNC(bml_trace_ellpack) (X); @@ -378,7 +391,7 @@ void TYPED_FUNC( int rowMin = A_localRowMin[myRank]; int rowMax = A_localRowMax[myRank]; -#if defined(BML_USE_CUSPARSE) || defined(BML_USE_ROCSPARSE) +#if defined(BML_USE_CUSPARSE) || defined(BML_USE_ROCSPARSE) || defined(BML_USE_HYPRE) double alpha = 1.0; double beta = 0.0; @@ -388,6 +401,9 @@ void TYPED_FUNC( #elif defined(BML_USE_ROCSPARSE) TYPED_FUNC(bml_multiply_rocsparse_ellpack) (A, B, C, alpha, beta, threshold); +#elif defined(BML_USE_HYPRE) + TYPED_FUNC(bml_multiply_hypre_ellpack) (A, B, C, alpha, beta, + threshold); #endif #else @@ -1272,4 +1288,181 @@ void TYPED_FUNC( } BML_CHECK_ROCSPARSE(rocsparse_destroy_handle(handle)); } + +#elif defined(BML_USE_HYPRE) +void TYPED_FUNC( + bml_multiply_hypre_ellpack) ( + bml_matrix_ellpack_t * A, + bml_matrix_ellpack_t * B, + bml_matrix_ellpack_t * C, + double alpha1, + double beta1, + double threshold1) +{ + int A_N = A->N; + int A_M = A->M; + + int B_N = B->N; + int B_M = B->M; + + int C_N = C->N; + int C_M = C->M; + + int *csrColIndA = A->csrColInd; + int *csrColIndB = B->csrColInd; + int *csrColIndC = C->csrColInd; + int *csrRowPtrA = A->csrRowPtr; + int *csrRowPtrB = B->csrRowPtr; + int *csrRowPtrC = C->csrRowPtr; + REAL_T *csrValA = (REAL_T *) A->csrVal; + REAL_T *csrValB = (REAL_T *) B->csrVal; + REAL_T *csrValC = (REAL_T *) C->csrVal; + + /* hypre CSR matrix objects */ + hypre_CSRMatrix *matA; + hypre_CSRMatrix *matB; + hypre_CSRMatrix *matC; + + REAL_T alpha = (REAL_T) alpha1; + REAL_T beta = (REAL_T) beta1; + + REAL_T threshold = (REAL_T) threshold1; + + // convert ellpack to cucsr + TYPED_FUNC(bml_ellpack2cucsr_ellpack) (A); + TYPED_FUNC(bml_ellpack2cucsr_ellpack) (B); + TYPED_FUNC(bml_ellpack2cucsr_ellpack) (C); + + int nnzA = csrRowPtrA[A_N]; + int nnzB = csrRowPtrB[B_N]; + int nnzC_in = csrRowPtrC[C_N]; + +// HYPRE_Init(); +// HYPRE_SetExecutionPolicy(HYPRE_EXEC_DEVICE); + int use_vendor = 0; + int spgemm_alg = 1; + int spgemm_binned = 0; + HYPRE_SetSpGemmUseVendor(use_vendor); + hypre_SetSpGemmAlgorithm(spgemm_alg); + hypre_SetSpGemmBinned(spgemm_binned); + /* create hypre csr matrix */ + matA = hypre_CSRMatrixCreate( A_N,A_N,nnzA ); + matB = hypre_CSRMatrixCreate( B_N,B_N,nnzB ); + matC = hypre_CSRMatrixCreate( C_N,C_N,nnzC_in ); + +#pragma omp target data use_device_ptr(csrRowPtrA,csrColIndA,csrValA, \ + csrRowPtrB,csrColIndB,csrValB, \ + csrRowPtrC,csrColIndC,csrValC) + { + hypre_CSRMatrixI(matA) = csrRowPtrA; + hypre_CSRMatrixJ(matA) = csrColIndA; + hypre_CSRMatrixData(matA) = csrValA; + + hypre_CSRMatrixI(matB) = csrRowPtrB; + hypre_CSRMatrixJ(matB) = csrColIndB; + hypre_CSRMatrixData(matB) = csrValB; + + hypre_CSRMatrixI(matC) = csrRowPtrC; + hypre_CSRMatrixJ(matC) = csrColIndC; + hypre_CSRMatrixData(matC) = csrValC; + } + + hypre_CSRMatrix *matD = hypre_CSRMatrixMultiplyDevice(matA, matB); + + /* add matrices */ + int spadd_use_vendor=0; + HYPRE_SetSpAddUseVendor(spadd_use_vendor); + hypre_SetSpAddAlgorithm(1); + hypre_CSRMatrix *matE = hypre_CSRMatrixAddDevice(alpha, matD, beta, matC); + + // Place the resulting matrix in C + if (is_above_threshold(threshold, BML_REAL_MIN)) + { + int nnzE = hypre_CSRMatrixNumNonzeros(matE); + REAL_T *elmt_tol = + (REAL_T *) malloc(sizeof(REAL_T) * nnzE); + // Allocate the working arrays on the device +#pragma omp target enter data map(alloc:elmt_tol[:nnzE]) + +#pragma omp target teams distribute parallel for + for(int i = 0; i