From b9c8adf3fda6980f5da9ee999a633f4659344596 Mon Sep 17 00:00:00 2001
From: Jamal Mohd-Yusof
Date: Tue, 12 Sep 2023 19:08:00 -0600
Subject: [PATCH] Separate real offload diagonalize routines. (#733)

---
 src/C-interface/dense/bml_diagonalize_dense.c | 322 ++++++++++++++++--
 1 file changed, 297 insertions(+), 25 deletions(-)

diff --git a/src/C-interface/dense/bml_diagonalize_dense.c b/src/C-interface/dense/bml_diagonalize_dense.c
index 241a1084..ee5d480f 100644
--- a/src/C-interface/dense/bml_diagonalize_dense.c
+++ b/src/C-interface/dense/bml_diagonalize_dense.c
@@ -21,7 +21,14 @@
 #include
 #endif
 #endif
+
+#ifdef MKL_GPU
+#include <stdio.h>
+#include "mkl.h"
+#include "mkl_omp_offload.h"
+#else
 #include "../lapack.h"
+#endif
 
 #include
 #include
@@ -85,6 +92,142 @@ bml_diagonalize_dense_magma_single_real(
 }
 #endif
 
+#ifdef MKL_GPU
+void
+bml_diagonalize_dense_gpu_single_real(
+    bml_matrix_dense_t * A,
+    void *eigenvalues,
+    bml_matrix_dense_t * eigenvectors)
+{
+#ifdef NOBLAS
+    LOG_ERROR("No BLAS library");
+#else
+
+    int dnum = 0;
+    MKL_INT info;
+    MKL_INT N = A->N;
+    float *A_matrix = (float *) A->matrix;
+    float *typed_eigenvalues = (float *) eigenvalues;
+
+    float *evecs = (float *) malloc(A->N * A->N * sizeof(float));
+    float *evals = (float *) malloc(A->N * sizeof(float));
+
+#pragma omp target enter data map(alloc:evecs[0:N*N])
+#pragma omp target enter data map(alloc:evals[0:N])
+
+#ifdef MKL_GPU_DEBUG
+// pull from GPU
+#pragma omp target update from(A_matrix[0:N*N])
+    printf("Checking A matrix values \n");
+    for (int i = 0; i < N; i++)
+    {
+        for (int j = 0; j < N; j++)
+        {
+            printf(" %6.3f", A_matrix[i * N + j]);
+        }
+        printf("\n");
+    }
+#endif // DEBUG
+    // copy A to evecs on GPU
+#pragma omp target teams distribute parallel for
+    for (int i = 0; i < N * N; i++)
+    {
+        evecs[i] = A_matrix[i];
+    }
+#pragma omp target teams distribute parallel for
+    for (int i = 0; i < N; i++)
+    {
+        evals[i] = 1.0;
+    }
+
+#ifdef MKL_GPU_DEBUG
+// pull from GPU
+#pragma omp target update from(evecs[0:N*N])
+#pragma omp target update from(evals[0:N])
+    // Check values before syev
+    printf("Checking evecs input values \n");
+    for (int i = 0; i < N; i++)
+    {
+        for (int j = 0; j < N; j++)
+        {
+            printf(" %6.3f,", evecs[i * N + j]);
+        }
+        printf("\n");
+    }
+    for (int i = 0; i < N; i++)
+    {
+        printf("%d, %f \n", i, evals[i]);
+    }
+#endif // DEBUG
+#ifdef BML_SYEVD
+    // Divide and conquer solver
+    MKL_INT lwork = 1 + 6 * N + 2 * N * N;
+    float *work = (float *) malloc(lwork * sizeof(float));
+    MKL_INT liwork = 3 + 5 * N;
+    MKL_INT *iwork = (MKL_INT *) malloc(liwork * sizeof(MKL_INT));
+#pragma omp target enter data map(alloc:work[0:lwork])
+#pragma omp target enter data map(alloc:iwork[0:liwork])
+
+#pragma omp target variant dispatch device(dnum) use_device_ptr(evecs, evals, work, iwork)
+    ssyevd("V", "U", &N, evecs, &N, evals, work, &lwork, iwork, &liwork,
+           &info);
+#pragma omp target exit data map(release:iwork)
+    free(iwork);
+#else
+    MKL_INT lwork = 3 * N;
+    float *work = (float *) malloc(lwork * sizeof(float));
+#pragma omp target enter data map(alloc:work[0:lwork])
+#pragma omp target variant dispatch device(dnum) use_device_ptr(evecs, evals, work)
+    ssyev("V", "U", &N, evecs, &N, evals, work, &lwork, &info);
+#endif // BML_SYEVD
+
+#ifdef MKL_GPU_DEBUG
+#pragma omp target update from(evecs[0:N*N])
+#pragma omp target update from(evals[0:N])
+
+    printf("After SYEV \n");
+    for (int i = 0; i < N; i++)
+    {
+        for (int j = 0; j < N; j++)
+        {
+            printf(" %6.3f", evecs[i * N + j]);
+        }
+        printf("\n");
+    }
+    for (int i = 0; i < N; i++)
+    {
+        printf("%d, %f \n", i, evals[i]);
+    }
+#endif
+
+// need typed_eigenvalues on CPU
+#pragma omp target update from(evals[0:N])
+    for (int i = 0; i < N; i++)
+    {
+        typed_eigenvalues[i] = (float) evals[i];
+    }
+
+// leave eigenvectors on GPU
+    float *e_matrix = (float *) eigenvectors->matrix;
+#pragma omp target teams distribute parallel for
+    for (int i = 0; i < N; i++)
+    {
+        for (int j = 0; j < N; j++)
+        {
+            e_matrix[ROWMAJOR(i, j, N, N)] = evecs[COLMAJOR(i, j, N, N)];
+        }
+    }
+
+#pragma omp target exit data map(release:evecs)
+#pragma omp target exit data map(release:evals)
+#pragma omp target exit data map(release:work)
+    free(evecs);
+    free(evals);
+    free(work);
+#endif // NOBLAS
+}
+#endif // MKL_GPU
+
 void
 bml_diagonalize_dense_cpu_single_real(
     bml_matrix_dense_t * A,
@@ -95,32 +238,30 @@ bml_diagonalize_dense_cpu_single_real(
     LOG_ERROR("No BLAS library");
 #else
     int info;
+    int N = A->N;
     float *A_matrix;
     float *typed_eigenvalues = (float *) eigenvalues;
 
     float *evecs = calloc(A->N * A->N, sizeof(float));
     float *evals = calloc(A->N, sizeof(float));
-#ifdef MKL_GPU
-// pull from GPU
-#pragma omp target update from(A_matrix[0:N*N])
-#endif
-
     memcpy(evecs, A->matrix, A->N * A->N * sizeof(float));
 
 #ifdef BML_SYEVD
     // Divide and conquer solver
-    int lwork = 1 + 6 * A->N + 2 * A->N * A->N;
+    const int lwork = 1 + 6 * N + 2 * N * N;
     float *work = malloc(lwork * sizeof(float));
-    int liwork = 3 + 5 * A->N;
+    const int liwork = 3 + 5 * N;
     int *iwork = malloc(liwork * sizeof(int));
-    C_SSYEVD("V", "U", &A->N, evecs, &A->N, evals, work, &lwork,
+    C_SSYEVD("V", "U", &N, evecs, &N, evals, work, &lwork,
              iwork, &liwork, &info);
     free(iwork);
 #else
-    int lwork = 3 * A->N;
+    const int lwork = 3 * N;
     float *work = calloc(lwork, sizeof(float));
-    C_SSYEV("V", "U", &A->N, evecs, &A->N, evals, work, &lwork, &info);
+    C_SSYEV("V", "U", &N, evecs, &N, evals, work, &lwork, &info);
 #endif // BML_SYEVD
 
     A_matrix = (float *) eigenvectors->matrix;
@@ -133,10 +274,6 @@ bml_diagonalize_dense_cpu_single_real(
             evecs[COLMAJOR(i, j, A->N, A->N)];
         }
     }
-#ifdef MKL_GPU
-// push back to GPU
-#pragma omp target update to(A_matrix[0:N*N])
-#endif
 
     free(evecs);
     free(evals);
@@ -152,6 +289,8 @@ bml_diagonalize_dense_single_real(
 {
 #ifdef BML_USE_MAGMA
     bml_diagonalize_dense_magma_single_real(A, eigenvalues, eigenvectors);
+#elif defined(MKL_GPU)
+    bml_diagonalize_dense_gpu_single_real(A, eigenvalues, eigenvectors);
 #else
     bml_diagonalize_dense_cpu_single_real(A, eigenvalues, eigenvectors);
 #endif
@@ -303,7 +442,7 @@ bml_diagonalize_dense_magma_double_real(
     free(evals);
 
     //verify norm eigenvectors
-    //for(int i=0;i<A->N;i++)
+    //for(int i= 0;i<A->N;i++)
     //{
     //    double norm = magma_dnrm2(A->N, evecs+A->ld*i, 1, bml_queue());
    //    printf("norm = %le\n", norm);
    //}
@@ -317,7 +456,7 @@ bml_diagonalize_dense_magma_double_real(
     magma_queue_sync(bml_queue());
 
     //verify norm eigenvectors transposed
-    //for(int i=0;i<A->N;i++)
+    //for(int i= 0;i<A->N;i++)
     //{
     //    double norm = magma_dnrm2(A->N, evecs+i, A->ld, bml_queue());
     //    printf("norm transposed vector = %le\n", norm);
     //}
@@ -326,6 +465,140 @@ bml_diagonalize_dense_magma_double_real(
 }
 #endif
 
+#ifdef MKL_GPU
+void
+bml_diagonalize_dense_gpu_double_real(
+    bml_matrix_dense_t * A,
+    void *eigenvalues,
+    bml_matrix_dense_t * eigenvectors)
+{
+#ifdef NOBLAS
+    LOG_ERROR("No BLAS library");
+#else
+
+    int dnum = 0;
+    MKL_INT info;
+    const MKL_INT N = A->N;
+    double *A_matrix = A->matrix;
+    double *typed_eigenvalues = (double *) eigenvalues;
+
+    double *evecs = (double *) calloc(A->N * A->N, sizeof(double));
+    double *evals = (double *) calloc(A->N, sizeof(double));
+
+#pragma omp target enter data map(alloc:evecs[0:N*N])
+#pragma omp target enter data map(alloc:evals[0:N])
+
+#ifdef MKL_GPU_DEBUG
+// pull from GPU
+#pragma omp target update from(A_matrix[0:N*N])
+    printf("Checking A matrix values \n");
+    for (int i = 0; i < N; i++)
+    {
+        for (int j = 0; j < N; j++)
+        {
+            printf(" %6.3f,", A_matrix[i * N + j]);
+        }
+        printf("\n");
+    }
+#endif // DEBUG
+    // copy A to evecs on GPU
+#pragma omp target teams distribute parallel for
+    for (int i = 0; i < N * N; i++)
+    {
+        evecs[i] = A_matrix[i];
+    }
+
+#ifdef MKL_GPU_DEBUG
+
+#pragma omp target update from(evecs[0:N*N])
+#pragma omp target update from(evals[0:N])
+    // Check values before syev
+    printf("Checking evecs input values \n");
+    for (int i = 0; i < N; i++)
+    {
+        for (int j = 0; j < N; j++)
+        {
+            printf(" %6.3f,", evecs[i * N + j]);
+        }
+        printf("\n");
+    }
+    for (int i = 0; i < N; i++)
+    {
+        printf("%d, %f \n", i, evals[i]);
+    }
+#endif // DEBUG
+
+#ifdef BML_SYEVD
+    // Divide and conquer solver
+    const MKL_INT lwork = 1 + 6 * N + 2 * N * N;
+    double *work = malloc(lwork * sizeof(double));
+    const MKL_INT liwork = 3 + 5 * N;
+    MKL_INT *iwork = malloc(liwork * sizeof(MKL_INT));
+#pragma omp target enter data map(alloc:work[0:lwork])
+#pragma omp target enter data map(alloc:iwork[0:liwork])
+
+#pragma omp target variant dispatch device(dnum) use_device_ptr(evecs, evals, work, iwork)
+    dsyevd("V", "U", &N, evecs, &N, evals, work, &lwork,
+           iwork, &liwork, &info);
+#pragma omp target exit data map(release:iwork)
+    free(iwork);
+#else
+    const MKL_INT lwork = 3 * N;
+    double *work = calloc(lwork, sizeof(double));
+#pragma omp target enter data map(alloc:work[0:lwork])
+#pragma omp target variant dispatch device(dnum) use_device_ptr(evecs, evals, work)
+    dsyev("V", "U", &N, evecs, &N, evals, work, &lwork, &info);
+#endif // BML_SYEVD
+
+#ifdef MKL_GPU_DEBUG
+    // pull evecs, evals back from GPU
+#pragma omp target update from(evecs[0:N*N])
+#pragma omp target update from(evals[0:N])
+
+    printf("After SYEV \n");
+    for (int i = 0; i < N; i++)
+    {
+        for (int j = 0; j < N; j++)
+        {
+            printf(" %6.3f,", evecs[i * N + j]);
+        }
+        printf("\n");
+    }
+    for (int i = 0; i < N; i++)
+    {
+        printf("%d, %f \n", i, evals[i]);
+    }
+#endif // DEBUG
+
+// need typed_eigenvalues on CPU
+#pragma omp target update from(evals[0:N])
+    for (int i = 0; i < N; i++)
+    {
+        typed_eigenvalues[i] = (double) evals[i];
+    }
+
+// leave eigenvectors on GPU
+    double *e_matrix = (double *) eigenvectors->matrix;
+#pragma omp target teams distribute parallel for
+    for (int i = 0; i < N; i++)
+    {
+        for (int j = 0; j < N; j++)
+        {
+            e_matrix[ROWMAJOR(i, j, N, N)] = evecs[COLMAJOR(i, j, N, N)];
+        }
+    }
+
+#pragma omp target exit data map(release:evecs)
+#pragma omp target exit data map(release:evals)
+#pragma omp target exit data map(release:work)
+
+    free(evecs);
+    free(evals);
+    free(work);
+#endif // NOBLAS
+}
+#endif // MKL_GPU
+
 void
 bml_diagonalize_dense_cpu_double_real(
     bml_matrix_dense_t * A,
@@ -336,15 +609,13 @@ bml_diagonalize_dense_cpu_double_real(
     LOG_ERROR("No BLAS library");
 #else
     int info;
-    double *A_matrix;
+    int N = A->N;
+    double *A_matrix = A->matrix;
     double *typed_eigenvalues = (double *) eigenvalues;
 
     double *evecs = calloc(A->N * A->N, sizeof(double));
     double *evals = calloc(A->N, sizeof(double));
-#ifdef MKL_GPU
-// pull from GPU
-#pragma omp target update from(A_matrix[0:N*N])
-#endif
+
     memcpy(evecs, A->matrix, A->N * A->N * sizeof(double));
 
 #ifdef BML_SYEVD
@@ -372,10 +643,6 @@ bml_diagonalize_dense_cpu_double_real(
             evecs[COLMAJOR(i, j, A->N, A->N)];
         }
     }
-#ifdef MKL_GPU
-// push back to GPU
-#pragma omp target update to(A_matrix[0:N*N])
-#endif
     free(evecs);
     free(evals);
     free(work);
@@ -390,6 +657,8 @@ bml_diagonalize_dense_double_real(
 {
 #ifdef BML_USE_MAGMA
     bml_diagonalize_dense_magma_double_real(A, eigenvalues, eigenvectors);
+#elif defined(MKL_GPU)
+    bml_diagonalize_dense_gpu_double_real(A, eigenvalues, eigenvectors);
 #else // CPU code
     bml_diagonalize_dense_cpu_double_real(A, eigenvalues, eigenvectors);
 #endif
@@ -402,6 +671,7 @@ bml_diagonalize_dense_single_complex(
     bml_matrix_dense_t * eigenvectors)
 {
     int info;
+    int N = A->N;
     float complex *typed_eigenvalues = (float complex *) eigenvalues;
 
 #ifdef BML_USE_MAGMA
@@ -460,6 +730,7 @@ bml_diagonalize_dense_single_complex(
 
 #ifdef MKL_GPU
 // pull from GPU
+
 #pragma omp target update from(A_matrix[0:N*N])
 #endif
     memcpy(A_copy, A->matrix, A->N * A->N * sizeof(float complex));
@@ -509,6 +780,7 @@ bml_diagonalize_dense_double_complex(
     bml_matrix_dense_t * eigenvectors)
 {
     int info;
+    int N = A->N;
    double complex *typed_eigenvalues = (double complex *) eigenvalues;

#ifdef BML_USE_MAGMA
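
Note on the offload pattern used above: both new routines follow the same
recipe -- map the work buffers onto the device with "omp target enter data",
hand the LAPACK call to the GPU through "omp target variant dispatch ...
use_device_ptr", and pull back only the eigenvalues. The standalone sketch
below shows that pattern in isolation for dsyev. It is illustrative only: it
assumes an MKL build with OpenMP offload support (compiled with, e.g.,
icx -fiopenmp -fopenmp-targets=spir64 -qmkl), and the 4x4 test matrix is
made up for the example.

    #include <stdio.h>
    #include <stdlib.h>
    #include "mkl.h"
    #include "mkl_omp_offload.h"

    int main(void)
    {
        const MKL_INT N = 4;
        MKL_INT lwork = 3 * N;
        MKL_INT info;
        double *a = malloc(N * N * sizeof(double));
        double *w = malloc(N * sizeof(double));
        double *work = malloc(lwork * sizeof(double));

        /* Symmetric test matrix: 2 on the diagonal, 1 elsewhere. */
        for (MKL_INT i = 0; i < N; i++)
            for (MKL_INT j = 0; j < N; j++)
                a[i * N + j] = (i == j) ? 2.0 : 1.0;

        /* Map the buffers onto the device for the duration of the solve. */
    #pragma omp target enter data map(to:a[0:N*N]) map(alloc:w[0:N], work[0:lwork])

        /* Dispatch the MKL eigensolver to the GPU; without a device the
         * host version runs instead. */
    #pragma omp target variant dispatch use_device_ptr(a, w, work)
        dsyev("V", "U", &N, a, &N, w, work, &lwork, &info);

        /* Only the eigenvalues are needed on the host here; the
         * eigenvectors stay on the device, as in the patch. */
    #pragma omp target update from(w[0:N])
    #pragma omp target exit data map(release:a, w, work)

        if (info == 0)
            for (MKL_INT i = 0; i < N; i++)
                printf("lambda[%d] = %f\n", (int) i, w[i]);

        free(a);
        free(w);
        free(work);
        return (int) info;
    }

For this matrix the printed eigenvalues should be 1, 1, 1, 5. Leaving the
eigenvectors resident on the device is the point of the design: the caller
is expected to consume them in subsequent offloaded kernels, so a host
round trip would only add traffic.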
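
A second note, on the copy-out loop that ends each routine: it is a
storage-order conversion, not a transpose. LAPACK leaves the eigenvectors
column-major in evecs; the loop re-addresses each element so that
eigenvectors->matrix holds the same logical matrix row-major, with
eigenvector k still in column k. A minimal CPU-only illustration follows;
the two macros here are simplified stand-ins for the definitions in bml's
macros.h.

    #include <stdio.h>

    /* Simplified stand-ins for the index macros in bml's macros.h. */
    #define ROWMAJOR(i, j, M, N) ((i) * (N) + (j))
    #define COLMAJOR(i, j, M, N) ((i) + (j) * (M))

    int main(void)
    {
        enum { N = 2 };
        /* Column-major storage, as LAPACK returns it:
         * logical matrix  [1 3]
         *                 [2 4]  -> columns stored contiguously. */
        double evecs[N * N] = { 1, 2, 3, 4 };
        double e_matrix[N * N];

        /* Same loop as the patch: element (i,j) is re-addressed from
         * column-major to row-major storage; the logical matrix (and
         * therefore each eigenvector, stored as column j) is unchanged. */
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++)
                e_matrix[ROWMAJOR(i, j, N, N)] = evecs[COLMAJOR(i, j, N, N)];

        /* Prints "1 3 2 4": row-major storage of the same matrix. */
        for (int k = 0; k < N * N; k++)
            printf("%g ", e_matrix[k]);
        printf("\n");
        return 0;
    }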