From 38bab8b52db704d9b50ef0eeba11672086cb4a57 Mon Sep 17 00:00:00 2001 From: David Cortes Date: Tue, 1 Oct 2024 17:33:51 +0200 Subject: [PATCH 01/16] Use spectral decomposition as more accurate fallback from Cholesky --- cpp/daal/src/algorithms/service_kernel_math.h | 147 ++++++++++++++---- cpp/daal/src/externals/service_lapack.h | 33 ++++ .../src/externals/service_lapack_declar_ref.h | 6 + cpp/daal/src/externals/service_lapack_mkl.h | 52 +++++++ cpp/daal/src/externals/service_lapack_ref.h | 42 +++++ 5 files changed, 252 insertions(+), 28 deletions(-) diff --git a/cpp/daal/src/algorithms/service_kernel_math.h b/cpp/daal/src/algorithms/service_kernel_math.h index e23104ea1c5..dea50a1b8e4 100644 --- a/cpp/daal/src/algorithms/service_kernel_math.h +++ b/cpp/daal/src/algorithms/service_kernel_math.h @@ -24,6 +24,8 @@ #ifndef __SERVICE_KERNEL_MATH_H__ #define __SERVICE_KERNEL_MATH_H__ +#include + #include "services/daal_defines.h" #include "services/env_detect.h" #include "src/algorithms/service_error_handling.h" @@ -660,6 +662,15 @@ bool solveEquationsSystemWithCholesky(FPType * a, FPType * b, size_t n, size_t n } if (info != 0) return false; + /* Note: there can be cases in which the matrix is singular / rank-deficient, but due to numerical + inaccuracies, Cholesky still succeeds. In such cases, it might produce a solution successfully, but + it will not be the minimum-norm solution, and might be prone towards having too large numbers. Thus + it's preferrable to fall back to a different type of solver that can work correctly with those. */ + for (size_t ix = 0; ix < n; ix++) + { + if (a[ix * (ix + 1)] < 1e-6) return false; + } + /* Solve L*L' * x = b */ if (sequential) { @@ -673,72 +684,152 @@ bool solveEquationsSystemWithCholesky(FPType * a, FPType * b, size_t n, size_t n } template -bool solveEquationsSystemWithPLU(FPType * a, FPType * b, size_t n, size_t nX, bool sequential, bool extendFromSymmetric) +bool solveEquationsSystemWithSpectralDecomposition(FPType * a, FPType * b, size_t n, size_t nX, bool sequential) { - if (extendFromSymmetric) + /* Storage for the eigenvalues. + Note: this allocates more size than they might require when nX > 1, because the same + buffer will get reused later on and needs the extra size. Those additional entries + will not be filled with eigenvalues. */ + TArrayScalable eigenvalues(n * nX); + DAAL_CHECK_MALLOC(eigenvalues.get()); + + /* SYEV parameters */ + char jobz = 'V'; + char uplo = 'U'; + DAAL_INT info; + + /* Query the procedure for size of required buffer */ + DAAL_INT lwork = -1; + FPType buffer_size; + if (sequential) { - /* Extend symmetric matrix to generic through filling of upper triangle */ - for (size_t i = 0; i < n; ++i) - { - for (size_t j = 0; j < i; ++j) - { - a[j * n + i] = a[i * n + j]; - } - } + LapackInst::xxsyev(&jobz, &uplo, (DAAL_INT *)&n, a, (DAAL_INT *)&n, eigenvalues.get(), &buffer_size, &lwork, &info); } - /* GETRF and GETRS parameters */ - char trans = 'N'; - DAAL_INT info = 0; + else + { + LapackInst::xsyev(&jobz, &uplo, (DAAL_INT *)&n, a, (DAAL_INT *)&n, eigenvalues.get(), &buffer_size, &lwork, &info); + } - TArrayScalable ipiv(n); - DAAL_CHECK_MALLOC(ipiv.get()); + if (info) return false; - /* Perform P*L*U factorization of A */ + /* Check that buffer size will not overflow when passed to LAPACK */ + if (static_cast(buffer_size) > std::numeric_limits::max()) return false; + + /* Allocate work buffer as needed */ + DAAL_INT work_buffer_size = static_cast(buffer_size); + TArrayScalable work_buffer(work_buffer_size); + DAAL_CHECK_MALLOC(work_buffer.get()); + + /* Perform Q*diag(l)*Q' factorization of A */ if (sequential) { - LapackInst::xxgetrf((DAAL_INT *)&n, (DAAL_INT *)&n, a, (DAAL_INT *)&n, ipiv.get(), &info); + LapackInst::xxsyev(&jobz, &uplo, (DAAL_INT *)&n, a, (DAAL_INT *)&n, eigenvalues.get(), work_buffer.get(), &work_buffer_size, + &info); } else { - LapackInst::xgetrf((DAAL_INT *)&n, (DAAL_INT *)&n, a, (DAAL_INT *)&n, ipiv.get(), &info); + LapackInst::xsyev(&jobz, &uplo, (DAAL_INT *)&n, a, (DAAL_INT *)&n, eigenvalues.get(), work_buffer.get(), &work_buffer_size, + &info); + } + if (info) return false; + + /* Components with small singular values get eliminated using the exact same logic as 'gelsd' with default parameters */ + constexpr const FPType eps = std::numeric_limits::epsilon(); + if (eigenvalues[n - 1] <= eps) return false; + const double component_threshold = eps * eigenvalues[n - 1]; + DAAL_INT num_discarded; + for (num_discarded = 0; num_discarded < static_cast(n) - 1; num_discarded++) + { + if (eigenvalues[num_discarded] > component_threshold) + { + break; + } } - if (info != 0) return false; - /* Solve P*L*U * x = b */ + /* Create the square root of the inverse: Qis = Q * diag(1 / sqrt(l)) */ + DAAL_INT one = 1; + for (size_t col = num_discarded; col < n; col++) + { + const FPType scale = std::sqrt(eigenvalues[col]); + if (sequential) + { + LapackInst::xxrscl((DAAL_INT *)&n, &scale, a + col * n, &one); + } + + else + { + LapackInst::xrscl((DAAL_INT *)&n, &scale, a + col * n, &one); + } + } + + /* Now calculate the actual solution: Qis * Qis' * B */ + char trans_yes = 'T'; + char trans_no = 'N'; + FPType one_fp = 1; + FPType zero = 0; + DAAL_INT num_taken = static_cast(n) - num_discarded; + a += static_cast(num_discarded) * n; if (sequential) { - LapackInst::xxgetrs(&trans, (DAAL_INT *)&n, (DAAL_INT *)&nX, a, (DAAL_INT *)&n, ipiv.get(), b, (DAAL_INT *)&n, &info); + if (nX == 1) + { + BlasInst::xxgemv(&trans_yes, (DAAL_INT *)&n, &num_taken, &one_fp, a, (DAAL_INT *)&n, b, &one, &zero, eigenvalues.get(), + &one); + BlasInst::xxgemv(&trans_no, (DAAL_INT *)&n, &num_taken, &one_fp, a, (DAAL_INT *)&n, eigenvalues.get(), &one, &zero, b, &one); + } + + else + { + BlasInst::xxgemm(&trans_yes, &trans_no, &num_taken, (DAAL_INT *)&nX, (DAAL_INT *)&n, &one_fp, a, (DAAL_INT *)&n, b, + (DAAL_INT *)&n, &zero, eigenvalues.get(), &num_taken); + BlasInst::xxgemm(&trans_no, &trans_no, (DAAL_INT *)&n, (DAAL_INT *)&nX, &num_taken, &one_fp, a, (DAAL_INT *)&n, + eigenvalues.get(), &num_taken, &zero, b, (DAAL_INT *)&n); + } } + else { - LapackInst::xgetrs(&trans, (DAAL_INT *)&n, (DAAL_INT *)&nX, a, (DAAL_INT *)&n, ipiv.get(), b, (DAAL_INT *)&n, &info); + if (nX == 1) + { + BlasInst::xgemv(&trans_yes, (DAAL_INT *)&n, &num_taken, &one_fp, a, (DAAL_INT *)&n, b, &one, &zero, eigenvalues.get(), &one); + BlasInst::xgemv(&trans_no, (DAAL_INT *)&n, &num_taken, &one_fp, a, (DAAL_INT *)&n, eigenvalues.get(), &one, &zero, b, &one); + } + + else + { + BlasInst::xgemm(&trans_yes, &trans_no, &num_taken, (DAAL_INT *)&nX, (DAAL_INT *)&n, &one_fp, a, (DAAL_INT *)&n, b, + (DAAL_INT *)&n, &zero, eigenvalues.get(), &num_taken); + BlasInst::xgemm(&trans_no, &trans_no, (DAAL_INT *)&n, (DAAL_INT *)&nX, &num_taken, &one_fp, a, (DAAL_INT *)&n, + eigenvalues.get(), &num_taken, &zero, b, (DAAL_INT *)&n); + } } - return (info == 0); + + return true; } template bool solveSymmetricEquationsSystem(FPType * a, FPType * b, size_t n, size_t nX, bool sequential) { - /* Copy data for fallback from Cholesky to PLU factorization */ + /* Copy data for fallback from Cholesky to spectral decomposition */ TArrayScalable aCopy(n * n); TArrayScalable bCopy(n); DAAL_CHECK_MALLOC(aCopy.get()); DAAL_CHECK_MALLOC(bCopy.get()); int copy_status = services::internal::daal_memcpy_s(aCopy.get(), n * n * sizeof(FPType), a, n * n * sizeof(FPType)); - copy_status += services::internal::daal_memcpy_s(bCopy.get(), n * sizeof(FPType), b, n * sizeof(FPType)); + copy_status += services::internal::daal_memcpy_s(bCopy.get(), n * nX * sizeof(FPType), b, n * nX * sizeof(FPType)); if (copy_status != 0) return false; /* Try to solve with Cholesky factorization */ if (!solveEquationsSystemWithCholesky(a, b, n, nX, sequential)) { - /* Fallback to PLU factorization */ - bool status = solveEquationsSystemWithPLU(aCopy.get(), bCopy.get(), n, nX, sequential, true); + /* Fall back to spectral decomposition */ + bool status = solveEquationsSystemWithSpectralDecomposition(aCopy.get(), bCopy.get(), n, nX, sequential); if (status) { - status = status && (services::internal::daal_memcpy_s(b, n * sizeof(FPType), bCopy.get(), n * sizeof(FPType)) == 0); + status = status && (services::internal::daal_memcpy_s(b, n * nX * sizeof(FPType), bCopy.get(), n * nX * sizeof(FPType)) == 0); } return status; } diff --git a/cpp/daal/src/externals/service_lapack.h b/cpp/daal/src/externals/service_lapack.h index fbf2ebd44f7..ec7e9ac25cf 100644 --- a/cpp/daal/src/externals/service_lapack.h +++ b/cpp/daal/src/externals/service_lapack.h @@ -21,6 +21,11 @@ //-- */ +/* Note: this file is not auto-generated. These 'x'/'xx' functions are manually added here on an +as-needed basis, and are only used internally within the library so their signatures might not +match LAPACK's to every minutiae like passing pointers to scalars or passing them by value, or +having 'const' qualifiers or not. */ + #ifndef __SERVICE_LAPACK_H__ #define __SERVICE_LAPACK_H__ @@ -193,6 +198,18 @@ struct Lapack _impl::xxsyevd(jobz, uplo, n, a, lda, w, work, lwork, iwork, liwork, info); } + static void xsyev(const char * jobz, const char * uplo, const SizeType * n, fpType * a, const SizeType * lda, fpType * w, fpType * work, + SizeType * lwork, SizeType * info) + { + _impl::xsyev(jobz, uplo, n, a, lda, w, work, lwork, info); + } + + static void xxsyev(const char * jobz, const char * uplo, const SizeType * n, fpType * a, const SizeType * lda, fpType * w, fpType * work, + SizeType * lwork, SizeType * info) + { + _impl::xxsyev(jobz, uplo, n, a, lda, w, work, lwork, info); + } + static void xormqr(char * side, char * trans, SizeType * m, SizeType * n, SizeType * k, fpType * a, SizeType * lda, fpType * tau, fpType * c, SizeType * ldc, fpType * work, SizeType * lwork, SizeType * info) { @@ -204,6 +221,10 @@ struct Lapack { _impl::xxormqr(side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, info); } + + static void xrscl(const SizeType * n, const fpType * sa, fpType * sx, const SizeType * incx) { _impl::xrscl(n, sa, sx, incx); } + + static void xxrscl(const SizeType * n, const fpType * sa, fpType * sx, const SizeType * incx) { _impl::xxrscl(n, sa, sx, incx); } }; template @@ -361,6 +382,18 @@ struct LapackAutoDispatch DAAL_DISPATCH_LAPACK_BY_CPU(fpType, xxsyevd, jobz, uplo, n, a, lda, w, work, lwork, iwork, liwork, info); } + static void xsyev(char * jobz, char * uplo, SizeType * n, fpType * a, SizeType * lda, fpType * w, fpType * work, SizeType * lwork, + SizeType * info) + { + DAAL_DISPATCH_LAPACK_BY_CPU(fpType, xsyev, jobz, uplo, n, a, lda, w, work, lwork, info); + } + + static void xxsyev(char * jobz, char * uplo, SizeType * n, fpType * a, SizeType * lda, fpType * w, fpType * work, SizeType * lwork, + SizeType * info) + { + DAAL_DISPATCH_LAPACK_BY_CPU(fpType, xxsyev, jobz, uplo, n, a, lda, w, work, lwork, info); + } + static void xormqr(char * side, char * trans, SizeType * m, SizeType * n, SizeType * k, fpType * a, SizeType * lda, fpType * tau, fpType * c, SizeType * ldc, fpType * work, SizeType * lwork, SizeType * info) { diff --git a/cpp/daal/src/externals/service_lapack_declar_ref.h b/cpp/daal/src/externals/service_lapack_declar_ref.h index 7e6c9c195d1..064d4848d3d 100644 --- a/cpp/daal/src/externals/service_lapack_declar_ref.h +++ b/cpp/daal/src/externals/service_lapack_declar_ref.h @@ -82,10 +82,16 @@ extern "C" extern void ssyevd_(char *, char *, DAAL_INT *, float *, DAAL_INT *, float *, float *, DAAL_INT *, DAAL_INT *, DAAL_INT *, DAAL_INT *); extern void dsyevd_(char *, char *, DAAL_INT *, double *, DAAL_INT *, double *, double *, DAAL_INT *, DAAL_INT *, DAAL_INT *, DAAL_INT *); + extern void ssyev_(const char *, const char *, const DAAL_INT *, float *, const DAAL_INT *, float *, float *, DAAL_INT *, DAAL_INT *); + extern void dsyev_(const char *, const char *, const DAAL_INT *, double *, const DAAL_INT *, double *, double *, DAAL_INT *, DAAL_INT *); + extern void sormqr_(char *, char *, DAAL_INT *, DAAL_INT *, DAAL_INT *, float *, DAAL_INT *, float *, float *, DAAL_INT *, float *, DAAL_INT *, DAAL_INT *); extern void dormqr_(char *, char *, DAAL_INT *, DAAL_INT *, DAAL_INT *, double *, DAAL_INT *, double *, double *, DAAL_INT *, double *, DAAL_INT *, DAAL_INT *); + + extern void drscl_(const DAAL_INT *, const double *, double *, const DAAL_INT *); + extern void srscl_(const DAAL_INT *, const float *, float *, const DAAL_INT *); } } // namespace ref diff --git a/cpp/daal/src/externals/service_lapack_mkl.h b/cpp/daal/src/externals/service_lapack_mkl.h index 37a81c3262f..b0d1cf81e9a 100644 --- a/cpp/daal/src/externals/service_lapack_mkl.h +++ b/cpp/daal/src/externals/service_lapack_mkl.h @@ -243,6 +243,20 @@ struct MklLapack mkl_set_num_threads_local(old_nthr); } + static void xsyev(const char * jobz, const char * uplo, const DAAL_INT * n, double * a, const DAAL_INT * lda, double * w, double * work, + DAAL_INT * lwork, DAAL_INT * info) + { + __DAAL_MKLFN_CALL_LAPACK(dsyev, (jobz, uplo, (MKL_INT *)n, a, (MKL_INT *)lda, w, work, (MKL_INT *)lwork, (MKL_INT *)info)); + } + + static void xxsyev(const char * jobz, const char * uplo, const DAAL_INT * n, double * a, const DAAL_INT * lda, double * w, double * work, + DAAL_INT * lwork, DAAL_INT * info) + { + int old_nthr = mkl_set_num_threads_local(1); + __DAAL_MKLFN_CALL_LAPACK(dsyev, (jobz, uplo, (MKL_INT *)n, a, (MKL_INT *)lda, w, work, (MKL_INT *)lwork, (MKL_INT *)info)); + mkl_set_num_threads_local(old_nthr); + } + static void xormqr(char * side, char * trans, DAAL_INT * m, DAAL_INT * n, DAAL_INT * k, double * a, DAAL_INT * lda, double * tau, double * c, DAAL_INT * ldc, double * work, DAAL_INT * lwork, DAAL_INT * info) { @@ -258,6 +272,18 @@ struct MklLapack (MKL_INT *)lwork, (MKL_INT *)info)); mkl_set_num_threads_local(old_nthr); } + + static void xrscl(const DAAL_INT * n, const double * sa, double * sx, const DAAL_INT * incx) + { + __DAAL_MKLFN_CALL_LAPACK(drscl, ((MKL_INT *)n, sa, sx, (MKL_INT *)incx)); + } + + static void xxrscl(const DAAL_INT * n, const double * sa, double * sx, const DAAL_INT * incx) + { + int old_nthr = mkl_set_num_threads_local(1); + __DAAL_MKLFN_CALL_LAPACK(drscl, ((MKL_INT *)n, sa, sx, (MKL_INT *)incx)); + mkl_set_num_threads_local(old_nthr); + } }; /* @@ -461,6 +487,20 @@ struct MklLapack mkl_set_num_threads_local(old_nthr); } + static void xsyev(const char * jobz, const char * uplo, const DAAL_INT * n, float * a, const DAAL_INT * lda, float * w, float * work, + DAAL_INT * lwork, DAAL_INT * info) + { + __DAAL_MKLFN_CALL_LAPACK(ssyev, (jobz, uplo, (MKL_INT *)n, a, (MKL_INT *)lda, w, work, (MKL_INT *)lwork, (MKL_INT *)info)); + } + + static void xxsyev(const char * jobz, const char * uplo, const DAAL_INT * n, float * a, const DAAL_INT * lda, float * w, float * work, + DAAL_INT * lwork, DAAL_INT * info) + { + int old_nthr = mkl_set_num_threads_local(1); + __DAAL_MKLFN_CALL_LAPACK(ssyev, (jobz, uplo, (MKL_INT *)n, a, (MKL_INT *)lda, w, work, (MKL_INT *)lwork, (MKL_INT *)info)); + mkl_set_num_threads_local(old_nthr); + } + static void xormqr(char * side, char * trans, DAAL_INT * m, DAAL_INT * n, DAAL_INT * k, float * a, DAAL_INT * lda, float * tau, float * c, DAAL_INT * ldc, float * work, DAAL_INT * lwork, DAAL_INT * info) { @@ -476,6 +516,18 @@ struct MklLapack (MKL_INT *)lwork, (MKL_INT *)info)); mkl_set_num_threads_local(old_nthr); } + + static void xrscl(const DAAL_INT * n, const float * sa, float * sx, const DAAL_INT * incx) + { + __DAAL_MKLFN_CALL_LAPACK(srscl, ((MKL_INT *)n, sa, sx, (MKL_INT *)incx)); + } + + static void xxrscl(const DAAL_INT * n, const float * sa, float * sx, const DAAL_INT * incx) + { + int old_nthr = mkl_set_num_threads_local(1); + __DAAL_MKLFN_CALL_LAPACK(srscl, ((MKL_INT *)n, sa, sx, (MKL_INT *)incx)); + mkl_set_num_threads_local(old_nthr); + } }; } // namespace mkl diff --git a/cpp/daal/src/externals/service_lapack_ref.h b/cpp/daal/src/externals/service_lapack_ref.h index 4b87d88cac8..c8967469c6f 100644 --- a/cpp/daal/src/externals/service_lapack_ref.h +++ b/cpp/daal/src/externals/service_lapack_ref.h @@ -204,6 +204,19 @@ struct OpenBlasLapack dsyevd_(jobz, uplo, n, a, lda, w, work, lwork, iwork, liwork, info); } + static void xsyev(const char * jobz, const char * uplo, const DAAL_INT * n, double * a, const DAAL_INT * lda, double * w, double * work, + DAAL_INT * lwork, DAAL_INT * info) + { + dsyev_(jobz, uplo, n, a, lda, w, work, lwork, info); + } + + static void xxsyev(const char * jobz, const char * uplo, const DAAL_INT * n, double * a, const DAAL_INT * lda, double * w, double * work, + DAAL_INT * lwork, DAAL_INT * info) + { + openblas_thread_setter ots(1); + dsyev_(jobz, uplo, n, a, lda, w, work, lwork, info); + } + static void xormqr(char * side, char * trans, DAAL_INT * m, DAAL_INT * n, DAAL_INT * k, double * a, DAAL_INT * lda, double * tau, double * c, DAAL_INT * ldc, double * work, DAAL_INT * lwork, DAAL_INT * info) { @@ -216,6 +229,14 @@ struct OpenBlasLapack openblas_thread_setter ots(1); dormqr_(side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, info); } + + static void xrscl(const DAAL_INT * n, const double * sa, double * sx, const DAAL_INT * incx) { drscl_(n, sa, sx, incx); } + + static void xxrscl(const DAAL_INT * n, const double * sa, double * sx, const DAAL_INT * incx) + { + openblas_thread_setter ots(1); + drscl_(n, sa, sx, incx); + } }; /* @@ -381,6 +402,19 @@ struct OpenBlasLapack ssyevd_(jobz, uplo, n, a, lda, w, work, lwork, iwork, liwork, info); } + static void xsyev(const char * jobz, const char * uplo, const DAAL_INT * n, float * a, const DAAL_INT * lda, float * w, float * work, + DAAL_INT * lwork, DAAL_INT * info) + { + ssyev_(jobz, uplo, n, a, lda, w, work, lwork, info); + } + + static void xxsyev(const char * jobz, const char * uplo, const DAAL_INT * n, float * a, const DAAL_INT * lda, float * w, float * work, + DAAL_INT * lwork, DAAL_INT * info) + { + openblas_thread_setter ots(1); + ssyev_(jobz, uplo, n, a, lda, w, work, lwork, info); + } + static void xormqr(char * side, char * trans, DAAL_INT * m, DAAL_INT * n, DAAL_INT * k, float * a, DAAL_INT * lda, float * tau, float * c, DAAL_INT * ldc, float * work, DAAL_INT * lwork, DAAL_INT * info) { @@ -393,6 +427,14 @@ struct OpenBlasLapack openblas_thread_setter ots(1); sormqr_(side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, info); } + + static void xrscl(const DAAL_INT * n, const float * sa, float * sx, const DAAL_INT * incx) { srscl_(n, sa, sx, incx); } + + static void xxrscl(const DAAL_INT * n, const float * sa, float * sx, const DAAL_INT * incx) + { + openblas_thread_setter ots(1); + srscl_(n, sa, sx, incx); + } }; } // namespace ref From 9deb8256e4ad165b54e67f2268e631ab1dc60eae Mon Sep 17 00:00:00 2001 From: David Cortes Date: Wed, 2 Oct 2024 10:30:20 +0200 Subject: [PATCH 02/16] correct buffer size --- cpp/daal/src/algorithms/service_kernel_math.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/daal/src/algorithms/service_kernel_math.h b/cpp/daal/src/algorithms/service_kernel_math.h index dea50a1b8e4..20c627f6b5a 100644 --- a/cpp/daal/src/algorithms/service_kernel_math.h +++ b/cpp/daal/src/algorithms/service_kernel_math.h @@ -813,7 +813,7 @@ bool solveSymmetricEquationsSystem(FPType * a, FPType * b, size_t n, size_t nX, { /* Copy data for fallback from Cholesky to spectral decomposition */ TArrayScalable aCopy(n * n); - TArrayScalable bCopy(n); + TArrayScalable bCopy(n * nX); DAAL_CHECK_MALLOC(aCopy.get()); DAAL_CHECK_MALLOC(bCopy.get()); From 53dd07743953c16c1e5a065adc140c8db56a2a02 Mon Sep 17 00:00:00 2001 From: David Cortes Date: Wed, 2 Oct 2024 10:36:36 +0200 Subject: [PATCH 03/16] add optional pragma (has no effect though) --- cpp/daal/src/algorithms/service_kernel_math.h | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/daal/src/algorithms/service_kernel_math.h b/cpp/daal/src/algorithms/service_kernel_math.h index 20c627f6b5a..0d2214ee26d 100644 --- a/cpp/daal/src/algorithms/service_kernel_math.h +++ b/cpp/daal/src/algorithms/service_kernel_math.h @@ -749,6 +749,7 @@ bool solveEquationsSystemWithSpectralDecomposition(FPType * a, FPType * b, size_ /* Create the square root of the inverse: Qis = Q * diag(1 / sqrt(l)) */ DAAL_INT one = 1; + PRAGMA_IVDEP for (size_t col = num_discarded; col < n; col++) { const FPType scale = std::sqrt(eigenvalues[col]); From 12cbd13a7bcf232b591e608c4d9fdfc92d9de414 Mon Sep 17 00:00:00 2001 From: David Cortes Date: Wed, 2 Oct 2024 12:06:54 +0200 Subject: [PATCH 04/16] add test with an overdetermined system --- .../dal/algo/linear_regression/test/batch.cpp | 1 + .../algo/linear_regression/test/fixture.hpp | 73 +++++++++++++++++++ 2 files changed, 74 insertions(+) diff --git a/cpp/oneapi/dal/algo/linear_regression/test/batch.cpp b/cpp/oneapi/dal/algo/linear_regression/test/batch.cpp index 00ec7babbb9..ae37e8cac15 100644 --- a/cpp/oneapi/dal/algo/linear_regression/test/batch.cpp +++ b/cpp/oneapi/dal/algo/linear_regression/test/batch.cpp @@ -48,6 +48,7 @@ TEMPLATE_LIST_TEST_M(lr_batch_test, "LR common flow", "[lr][batch]", lr_types) { this->generate(777); this->run_and_check_linear(); + this->run_and_check_linear_indefinite(); } TEMPLATE_LIST_TEST_M(lr_batch_test, "RR common flow", "[rr][batch]", lr_types) { diff --git a/cpp/oneapi/dal/algo/linear_regression/test/fixture.hpp b/cpp/oneapi/dal/algo/linear_regression/test/fixture.hpp index fb935174cfe..20eca90b939 100644 --- a/cpp/oneapi/dal/algo/linear_regression/test/fixture.hpp +++ b/cpp/oneapi/dal/algo/linear_regression/test/fixture.hpp @@ -291,6 +291,79 @@ class lr_test : public te::crtp_algo_fixture { } } + /* Note: difference between this test and the above, is that the linear system to solve + here is not positive-definite, thus it has an infinite number of possible solutions. The + solution here is the one with minimum norm, which is typically more desirable. */ + void run_and_check_linear_indefinite(double tol = 1e-3) { + const double X[] = { -0.98912135, -0.36778665, 1.28792526, 0.19397442, 0.9202309, + 0.57710379, -0.63646365, 0.54195222, -0.31659545, -0.32238912, + 0.09716732, -1.52593041, 1.1921661, -0.67108968, 1.00026942 }; + const double y[] = { 0.13632112, 1.53203308, -0.65996941 }; + auto X_tbl = oneapi::dal::detail::homogen_table_builder() + .set_data_type(data_type::float64) + .set_layout(data_layout::row_major) + .allocate(3, 5) + .copy_data(X, 3, 5) + .build(); + auto y_tbl = oneapi::dal::detail::homogen_table_builder() + .set_data_type(data_type::float64) + .set_layout(data_layout::row_major) + .allocate(3, 1) + .copy_data(y, 3, 1) + .build(); + + auto desc = this->get_descriptor(); + auto train_res = this->train(desc, X_tbl, y_tbl); + const auto coefs = train_res.get_coefficients(); + + if (desc.get_result_options().test(result_options::intercept)) { + const double expected_beta[] = { 0.27785494, + 0.53011669, + 0.34352259, + 0.40506216, + -1.26026447 }; + const double expected_intercept[] = { 1.24485441 }; + const auto expected_beta_tbl = oneapi::dal::detail::homogen_table_builder() + .set_data_type(data_type::float64) + .set_layout(data_layout::row_major) + .allocate(1, 5) + .copy_data(expected_beta, 1, 5) + .build(); + const auto expected_intercept_tbl = oneapi::dal::detail::homogen_table_builder() + .set_data_type(data_type::float64) + .set_layout(data_layout::row_major) + .allocate(1, 1) + .copy_data(expected_intercept, 1, 1) + .build(); + + const auto intercept = train_res.get_intercept(); + + SECTION("Checking intercept values") { + check_if_close(intercept, expected_intercept_tbl, tol); + } + SECTION("Checking coefficient values") { + check_if_close(coefs, expected_beta_tbl, tol); + } + } + + else { + const double expected_beta[] = { 0.38217445, + 0.2732197, + 1.87135517, + 0.63458468, + -2.08473134 }; + const auto expected_beta_tbl = oneapi::dal::detail::homogen_table_builder() + .set_data_type(data_type::float64) + .set_layout(data_layout::row_major) + .allocate(1, 5) + .copy_data(expected_beta, 1, 5) + .build(); + SECTION("Checking coefficient values") { + check_if_close(coefs, expected_beta_tbl, tol); + } + } + } + template std::vector split_table_by_rows(const dal::table& t, std::int64_t split_count) { ONEDAL_ASSERT(0l < split_count); From eec237ef3f3a488c5d971370ca46cdaa68395cf0 Mon Sep 17 00:00:00 2001 From: david-cortes-intel Date: Wed, 2 Oct 2024 13:26:25 +0200 Subject: [PATCH 05/16] Update cpp/daal/src/algorithms/service_kernel_math.h Co-authored-by: Victoriya Fedotova --- cpp/daal/src/algorithms/service_kernel_math.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/daal/src/algorithms/service_kernel_math.h b/cpp/daal/src/algorithms/service_kernel_math.h index 0d2214ee26d..2f27a3905db 100644 --- a/cpp/daal/src/algorithms/service_kernel_math.h +++ b/cpp/daal/src/algorithms/service_kernel_math.h @@ -752,7 +752,7 @@ bool solveEquationsSystemWithSpectralDecomposition(FPType * a, FPType * b, size_ PRAGMA_IVDEP for (size_t col = num_discarded; col < n; col++) { - const FPType scale = std::sqrt(eigenvalues[col]); + const FPType scale = daal::internal::MathInst::sSqrt(eigenvalues[col]); if (sequential) { LapackInst::xxrscl((DAAL_INT *)&n, &scale, a + col * n, &one); From c8fadb7ea917f9bd9c8723c35f4a5198e2ddfd7f Mon Sep 17 00:00:00 2001 From: David Cortes Date: Wed, 2 Oct 2024 13:38:04 +0200 Subject: [PATCH 06/16] PR comments --- cpp/daal/src/algorithms/service_kernel_math.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/cpp/daal/src/algorithms/service_kernel_math.h b/cpp/daal/src/algorithms/service_kernel_math.h index 2f27a3905db..ab70d56fea4 100644 --- a/cpp/daal/src/algorithms/service_kernel_math.h +++ b/cpp/daal/src/algorithms/service_kernel_math.h @@ -24,7 +24,7 @@ #ifndef __SERVICE_KERNEL_MATH_H__ #define __SERVICE_KERNEL_MATH_H__ -#include +#include #include "services/daal_defines.h" #include "services/env_detect.h" @@ -665,10 +665,13 @@ bool solveEquationsSystemWithCholesky(FPType * a, FPType * b, size_t n, size_t n /* Note: there can be cases in which the matrix is singular / rank-deficient, but due to numerical inaccuracies, Cholesky still succeeds. In such cases, it might produce a solution successfully, but it will not be the minimum-norm solution, and might be prone towards having too large numbers. Thus - it's preferrable to fall back to a different type of solver that can work correctly with those. */ + it's preferrable to fall back to a different type of solver that can work correctly with those. + Note that the thresholds chosen there are just a guess and not based on any properties of floating + points or academic research. */ + const FPType threshold_chol_diag = std::is_same::value? 1e-4 : 1e-6; for (size_t ix = 0; ix < n; ix++) { - if (a[ix * (ix + 1)] < 1e-6) return false; + if (a[ix * (ix + 1)] < threshold_chol_diag) return false; } /* Solve L*L' * x = b */ @@ -735,7 +738,7 @@ bool solveEquationsSystemWithSpectralDecomposition(FPType * a, FPType * b, size_ if (info) return false; /* Components with small singular values get eliminated using the exact same logic as 'gelsd' with default parameters */ - constexpr const FPType eps = std::numeric_limits::epsilon(); + const FPType eps = std::is_same::value? 1.1920929e-07 : 2.220446049250313e-16; if (eigenvalues[n - 1] <= eps) return false; const double component_threshold = eps * eigenvalues[n - 1]; DAAL_INT num_discarded; @@ -752,7 +755,7 @@ bool solveEquationsSystemWithSpectralDecomposition(FPType * a, FPType * b, size_ PRAGMA_IVDEP for (size_t col = num_discarded; col < n; col++) { - const FPType scale = daal::internal::MathInst::sSqrt(eigenvalues[col]); + const FPType scale = daal::internal::MathInst::sSqrt(eigenvalues[col]); if (sequential) { LapackInst::xxrscl((DAAL_INT *)&n, &scale, a + col * n, &one); From b062696220de6488440af66c9765425e1ada20c4 Mon Sep 17 00:00:00 2001 From: David Cortes Date: Thu, 3 Oct 2024 07:52:54 +0200 Subject: [PATCH 07/16] use syevr which is faster than syev --- cpp/daal/src/algorithms/service_kernel_math.h | 107 +++++++++++------- cpp/daal/src/externals/service_lapack.h | 37 ++++-- .../src/externals/service_lapack_declar_ref.h | 6 +- cpp/daal/src/externals/service_lapack_mkl.h | 40 +++++-- cpp/daal/src/externals/service_lapack_ref.h | 32 ++++-- 5 files changed, 142 insertions(+), 80 deletions(-) diff --git a/cpp/daal/src/algorithms/service_kernel_math.h b/cpp/daal/src/algorithms/service_kernel_math.h index ab70d56fea4..590af4b17c1 100644 --- a/cpp/daal/src/algorithms/service_kernel_math.h +++ b/cpp/daal/src/algorithms/service_kernel_math.h @@ -668,7 +668,7 @@ bool solveEquationsSystemWithCholesky(FPType * a, FPType * b, size_t n, size_t n it's preferrable to fall back to a different type of solver that can work correctly with those. Note that the thresholds chosen there are just a guess and not based on any properties of floating points or academic research. */ - const FPType threshold_chol_diag = std::is_same::value? 1e-4 : 1e-6; + const FPType threshold_chol_diag = std::is_same::value ? 1e-4 : 1e-6; for (size_t ix = 0; ix < n; ix++) { if (a[ix * (ix + 1)] < threshold_chol_diag) return false; @@ -696,99 +696,118 @@ bool solveEquationsSystemWithSpectralDecomposition(FPType * a, FPType * b, size_ TArrayScalable eigenvalues(n * nX); DAAL_CHECK_MALLOC(eigenvalues.get()); + TArrayScalable eigenvectors(n * n); + DAAL_CHECK_MALLOC(eigenvectors.get()); + + TArrayScalable buffer_isuppz(2 * n); + DAAL_CHECK_MALLOC(buffer_isuppz.get()); + /* SYEV parameters */ - char jobz = 'V'; - char uplo = 'U'; + const char jobz = 'V'; + const char range = 'A'; + const char uplo = 'U'; + FPType zero = 0; DAAL_INT info; + DAAL_INT num_eigenvalues; /* Query the procedure for size of required buffer */ - DAAL_INT lwork = -1; - FPType buffer_size; + DAAL_INT lwork_query_indicator = -1; + FPType buffer_size_work = 0; + DAAL_INT buffer_size_iwork = 0; if (sequential) { - LapackInst::xxsyev(&jobz, &uplo, (DAAL_INT *)&n, a, (DAAL_INT *)&n, eigenvalues.get(), &buffer_size, &lwork, &info); + LapackInst::xxsyevr(&jobz, &range, &uplo, (DAAL_INT *)&n, a, (DAAL_INT *)&n, nullptr, nullptr, nullptr, nullptr, &zero, + &num_eigenvalues, eigenvalues.get(), eigenvectors.get(), (DAAL_INT *)&n, buffer_isuppz.get(), + &buffer_size_work, &lwork_query_indicator, &buffer_size_iwork, &lwork_query_indicator, &info); } else { - LapackInst::xsyev(&jobz, &uplo, (DAAL_INT *)&n, a, (DAAL_INT *)&n, eigenvalues.get(), &buffer_size, &lwork, &info); + LapackInst::xsyevr(&jobz, &range, &uplo, (DAAL_INT *)&n, a, (DAAL_INT *)&n, nullptr, nullptr, nullptr, nullptr, &zero, + &num_eigenvalues, eigenvalues.get(), eigenvectors.get(), (DAAL_INT *)&n, buffer_isuppz.get(), + &buffer_size_work, &lwork_query_indicator, &buffer_size_iwork, &lwork_query_indicator, &info); } if (info) return false; - /* Check that buffer size will not overflow when passed to LAPACK */ - if (static_cast(buffer_size) > std::numeric_limits::max()) return false; + /* Check that buffer sizes will not overflow when passed to LAPACK */ + if (static_cast(buffer_size_work) > std::numeric_limits::max()) return false; + if (buffer_size_iwork < 0) return false; - /* Allocate work buffer as needed */ - DAAL_INT work_buffer_size = static_cast(buffer_size); + /* Allocate work buffers as needed */ + DAAL_INT work_buffer_size = static_cast(buffer_size_work); TArrayScalable work_buffer(work_buffer_size); DAAL_CHECK_MALLOC(work_buffer.get()); + TArrayScalable iwork_buffer(buffer_size_iwork); + DAAL_CHECK_MALLOC(iwork_buffer.get()); /* Perform Q*diag(l)*Q' factorization of A */ if (sequential) { - LapackInst::xxsyev(&jobz, &uplo, (DAAL_INT *)&n, a, (DAAL_INT *)&n, eigenvalues.get(), work_buffer.get(), &work_buffer_size, - &info); + LapackInst::xxsyevr(&jobz, &range, &uplo, (DAAL_INT *)&n, a, (DAAL_INT *)&n, nullptr, nullptr, nullptr, nullptr, &zero, + &num_eigenvalues, eigenvalues.get(), eigenvectors.get(), (DAAL_INT *)&n, buffer_isuppz.get(), + work_buffer.get(), &work_buffer_size, iwork_buffer.get(), &buffer_size_iwork, &info); } else { - LapackInst::xsyev(&jobz, &uplo, (DAAL_INT *)&n, a, (DAAL_INT *)&n, eigenvalues.get(), work_buffer.get(), &work_buffer_size, - &info); + LapackInst::xsyevr(&jobz, &range, &uplo, (DAAL_INT *)&n, a, (DAAL_INT *)&n, nullptr, nullptr, nullptr, nullptr, &zero, + &num_eigenvalues, eigenvalues.get(), eigenvectors.get(), (DAAL_INT *)&n, buffer_isuppz.get(), + work_buffer.get(), &work_buffer_size, iwork_buffer.get(), &buffer_size_iwork, &info); } if (info) return false; /* Components with small singular values get eliminated using the exact same logic as 'gelsd' with default parameters */ - const FPType eps = std::is_same::value? 1.1920929e-07 : 2.220446049250313e-16; + const FPType eps = std::is_same::value ? 1.1920929e-07 : 2.220446049250313e-16; if (eigenvalues[n - 1] <= eps) return false; const double component_threshold = eps * eigenvalues[n - 1]; DAAL_INT num_discarded; for (num_discarded = 0; num_discarded < static_cast(n) - 1; num_discarded++) { - if (eigenvalues[num_discarded] > component_threshold) - { - break; - } + if (eigenvalues[num_discarded] > component_threshold) break; } /* Create the square root of the inverse: Qis = Q * diag(1 / sqrt(l)) */ + DAAL_INT num_taken = static_cast(n) - num_discarded; + daal::internal::MathInst::vSqrt(num_taken, eigenvalues.get() + num_discarded, eigenvalues.get() + num_discarded); DAAL_INT one = 1; PRAGMA_IVDEP for (size_t col = num_discarded; col < n; col++) { - const FPType scale = daal::internal::MathInst::sSqrt(eigenvalues[col]); + const FPType scale = eigenvalues[col]; if (sequential) { - LapackInst::xxrscl((DAAL_INT *)&n, &scale, a + col * n, &one); + LapackInst::xxrscl((DAAL_INT *)&n, &scale, eigenvectors.get() + col * n, &one); } else { - LapackInst::xrscl((DAAL_INT *)&n, &scale, a + col * n, &one); + LapackInst::xrscl((DAAL_INT *)&n, &scale, eigenvectors.get() + col * n, &one); } } /* Now calculate the actual solution: Qis * Qis' * B */ - char trans_yes = 'T'; - char trans_no = 'N'; - FPType one_fp = 1; - FPType zero = 0; - DAAL_INT num_taken = static_cast(n) - num_discarded; - a += static_cast(num_discarded) * n; + char trans_yes = 'T'; + char trans_no = 'N'; + FPType one_fp = 1; + const size_t eigenvalues_offset = static_cast(num_discarded) * n; if (sequential) { if (nX == 1) { - BlasInst::xxgemv(&trans_yes, (DAAL_INT *)&n, &num_taken, &one_fp, a, (DAAL_INT *)&n, b, &one, &zero, eigenvalues.get(), - &one); - BlasInst::xxgemv(&trans_no, (DAAL_INT *)&n, &num_taken, &one_fp, a, (DAAL_INT *)&n, eigenvalues.get(), &one, &zero, b, &one); + BlasInst::xxgemv(&trans_yes, (DAAL_INT *)&n, &num_taken, &one_fp, eigenvectors.get() + eigenvalues_offset, (DAAL_INT *)&n, b, + &one, &zero, eigenvalues.get(), &one); + BlasInst::xxgemv(&trans_no, (DAAL_INT *)&n, &num_taken, &one_fp, eigenvectors.get() + eigenvalues_offset, (DAAL_INT *)&n, + eigenvalues.get(), &one, &zero, b, &one); } else { - BlasInst::xxgemm(&trans_yes, &trans_no, &num_taken, (DAAL_INT *)&nX, (DAAL_INT *)&n, &one_fp, a, (DAAL_INT *)&n, b, - (DAAL_INT *)&n, &zero, eigenvalues.get(), &num_taken); - BlasInst::xxgemm(&trans_no, &trans_no, (DAAL_INT *)&n, (DAAL_INT *)&nX, &num_taken, &one_fp, a, (DAAL_INT *)&n, - eigenvalues.get(), &num_taken, &zero, b, (DAAL_INT *)&n); + BlasInst::xxgemm(&trans_yes, &trans_no, &num_taken, (DAAL_INT *)&nX, (DAAL_INT *)&n, &one_fp, + eigenvectors.get() + eigenvalues_offset, (DAAL_INT *)&n, b, (DAAL_INT *)&n, &zero, eigenvalues.get(), + &num_taken); + BlasInst::xxgemm(&trans_no, &trans_no, (DAAL_INT *)&n, (DAAL_INT *)&nX, &num_taken, &one_fp, + eigenvectors.get() + eigenvalues_offset, (DAAL_INT *)&n, eigenvalues.get(), &num_taken, &zero, b, + (DAAL_INT *)&n); } } @@ -796,16 +815,20 @@ bool solveEquationsSystemWithSpectralDecomposition(FPType * a, FPType * b, size_ { if (nX == 1) { - BlasInst::xgemv(&trans_yes, (DAAL_INT *)&n, &num_taken, &one_fp, a, (DAAL_INT *)&n, b, &one, &zero, eigenvalues.get(), &one); - BlasInst::xgemv(&trans_no, (DAAL_INT *)&n, &num_taken, &one_fp, a, (DAAL_INT *)&n, eigenvalues.get(), &one, &zero, b, &one); + BlasInst::xgemv(&trans_yes, (DAAL_INT *)&n, &num_taken, &one_fp, eigenvectors.get() + eigenvalues_offset, (DAAL_INT *)&n, b, + &one, &zero, eigenvalues.get(), &one); + BlasInst::xgemv(&trans_no, (DAAL_INT *)&n, &num_taken, &one_fp, eigenvectors.get() + eigenvalues_offset, (DAAL_INT *)&n, + eigenvalues.get(), &one, &zero, b, &one); } else { - BlasInst::xgemm(&trans_yes, &trans_no, &num_taken, (DAAL_INT *)&nX, (DAAL_INT *)&n, &one_fp, a, (DAAL_INT *)&n, b, - (DAAL_INT *)&n, &zero, eigenvalues.get(), &num_taken); - BlasInst::xgemm(&trans_no, &trans_no, (DAAL_INT *)&n, (DAAL_INT *)&nX, &num_taken, &one_fp, a, (DAAL_INT *)&n, - eigenvalues.get(), &num_taken, &zero, b, (DAAL_INT *)&n); + BlasInst::xgemm(&trans_yes, &trans_no, &num_taken, (DAAL_INT *)&nX, (DAAL_INT *)&n, &one_fp, + eigenvectors.get() + eigenvalues_offset, (DAAL_INT *)&n, b, (DAAL_INT *)&n, &zero, eigenvalues.get(), + &num_taken); + BlasInst::xgemm(&trans_no, &trans_no, (DAAL_INT *)&n, (DAAL_INT *)&nX, &num_taken, &one_fp, + eigenvectors.get() + eigenvalues_offset, (DAAL_INT *)&n, eigenvalues.get(), &num_taken, &zero, b, + (DAAL_INT *)&n); } } diff --git a/cpp/daal/src/externals/service_lapack.h b/cpp/daal/src/externals/service_lapack.h index ec7e9ac25cf..68a5b0f555c 100644 --- a/cpp/daal/src/externals/service_lapack.h +++ b/cpp/daal/src/externals/service_lapack.h @@ -186,6 +186,9 @@ struct Lapack _impl::xxgesvd(jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, work, lwork, info); } + /* Note: 'syevd' and 'syevr' both compute symmetric eigenvalues, but they use different routines. 'syevd' + is slower but more precise and should thus be preferred for situations in which small numerical inaccuracies + have adverse side effects, whereas 'syevr' is faster and more suitable for general usage. */ static void xsyevd(char * jobz, char * uplo, SizeType * n, fpType * a, SizeType * lda, fpType * w, fpType * work, SizeType * lwork, SizeType * iwork, SizeType * liwork, SizeType * info) { @@ -198,16 +201,20 @@ struct Lapack _impl::xxsyevd(jobz, uplo, n, a, lda, w, work, lwork, iwork, liwork, info); } - static void xsyev(const char * jobz, const char * uplo, const SizeType * n, fpType * a, const SizeType * lda, fpType * w, fpType * work, - SizeType * lwork, SizeType * info) + static void xsyevr(const char * jobz, const char * range, const char * uplo, const SizeType * n, fpType * a, const SizeType * lda, + const fpType * vl, const fpType * vu, const SizeType * il, const SizeType * iu, const fpType * abstol, SizeType * m, + fpType * w, fpType * z, const SizeType * ldz, SizeType * isuppz, fpType * work, const SizeType * lwork, SizeType * iwork, + const SizeType * liwork, SizeType * info) { - _impl::xsyev(jobz, uplo, n, a, lda, w, work, lwork, info); + _impl::xsyevr(jobz, range, uplo, n, a, lda, vl, vu, il, iu, abstol, m, w, z, ldz, isuppz, work, lwork, iwork, liwork, info); } - static void xxsyev(const char * jobz, const char * uplo, const SizeType * n, fpType * a, const SizeType * lda, fpType * w, fpType * work, - SizeType * lwork, SizeType * info) + static void xxsyevr(const char * jobz, const char * range, const char * uplo, const SizeType * n, fpType * a, const SizeType * lda, + const fpType * vl, const fpType * vu, const SizeType * il, const SizeType * iu, const fpType * abstol, SizeType * m, + fpType * w, fpType * z, const SizeType * ldz, SizeType * isuppz, fpType * work, const SizeType * lwork, SizeType * iwork, + const SizeType * liwork, SizeType * info) { - _impl::xxsyev(jobz, uplo, n, a, lda, w, work, lwork, info); + _impl::xxsyevr(jobz, range, uplo, n, a, lda, vl, vu, il, iu, abstol, m, w, z, ldz, isuppz, work, lwork, iwork, liwork, info); } static void xormqr(char * side, char * trans, SizeType * m, SizeType * n, SizeType * k, fpType * a, SizeType * lda, fpType * tau, fpType * c, @@ -382,16 +389,22 @@ struct LapackAutoDispatch DAAL_DISPATCH_LAPACK_BY_CPU(fpType, xxsyevd, jobz, uplo, n, a, lda, w, work, lwork, iwork, liwork, info); } - static void xsyev(char * jobz, char * uplo, SizeType * n, fpType * a, SizeType * lda, fpType * w, fpType * work, SizeType * lwork, - SizeType * info) + static void xsyevr(const char * jobz, const char * range, const char * uplo, const SizeType * n, fpType * a, const SizeType * lda, + const fpType * vl, const fpType * vu, const SizeType * il, const SizeType * iu, const fpType * abstol, SizeType * m, + fpType * w, fpType * z, const SizeType * ldz, SizeType * isuppz, fpType * work, const SizeType * lwork, SizeType * iwork, + const SizeType * liwork, SizeType * info) { - DAAL_DISPATCH_LAPACK_BY_CPU(fpType, xsyev, jobz, uplo, n, a, lda, w, work, lwork, info); + DAAL_DISPATCH_LAPACK_BY_CPU(fpType, xsyevr, jobz, range, uplo, n, a, lda, vl, vu, il, iu, abstol, m, w, z, ldz, isuppz, work, lwork, iwork, + liwork, info); } - static void xxsyev(char * jobz, char * uplo, SizeType * n, fpType * a, SizeType * lda, fpType * w, fpType * work, SizeType * lwork, - SizeType * info) + static void xxsyevr(const char * jobz, const char * range, const char * uplo, const SizeType * n, fpType * a, const SizeType * lda, + const fpType * vl, const fpType * vu, const SizeType * il, const SizeType * iu, const fpType * abstol, SizeType * m, + fpType * w, fpType * z, const SizeType * ldz, SizeType * isuppz, fpType * work, const SizeType * lwork, SizeType * iwork, + const SizeType * liwork, SizeType * info) { - DAAL_DISPATCH_LAPACK_BY_CPU(fpType, xxsyev, jobz, uplo, n, a, lda, w, work, lwork, info); + DAAL_DISPATCH_LAPACK_BY_CPU(fpType, xxsyevr, jobz, range, uplo, n, a, lda, vl, vu, il, iu, abstol, m, w, z, ldz, isuppz, work, lwork, iwork, + liwork, info); } static void xormqr(char * side, char * trans, SizeType * m, SizeType * n, SizeType * k, fpType * a, SizeType * lda, fpType * tau, fpType * c, diff --git a/cpp/daal/src/externals/service_lapack_declar_ref.h b/cpp/daal/src/externals/service_lapack_declar_ref.h index 064d4848d3d..2df4ff5d0a7 100644 --- a/cpp/daal/src/externals/service_lapack_declar_ref.h +++ b/cpp/daal/src/externals/service_lapack_declar_ref.h @@ -82,8 +82,10 @@ extern "C" extern void ssyevd_(char *, char *, DAAL_INT *, float *, DAAL_INT *, float *, float *, DAAL_INT *, DAAL_INT *, DAAL_INT *, DAAL_INT *); extern void dsyevd_(char *, char *, DAAL_INT *, double *, DAAL_INT *, double *, double *, DAAL_INT *, DAAL_INT *, DAAL_INT *, DAAL_INT *); - extern void ssyev_(const char *, const char *, const DAAL_INT *, float *, const DAAL_INT *, float *, float *, DAAL_INT *, DAAL_INT *); - extern void dsyev_(const char *, const char *, const DAAL_INT *, double *, const DAAL_INT *, double *, double *, DAAL_INT *, DAAL_INT *); + extern void ssyevr_(const char *, const char *, const char *, const int *, float *, const int *, const float *, const float *, const int *, + const int *, const float *, int *, float *, float *, const int *, int *, float *, const int *, int *, const int *, int *); + extern void dsyevr_(const char *, const char *, const char *, const int *, double *, const int *, const double *, const double *, const int *, + const int *, const double *, int *, double *, double *, const int *, int *, double *, const int *, int *, const int *, int *); extern void sormqr_(char *, char *, DAAL_INT *, DAAL_INT *, DAAL_INT *, float *, DAAL_INT *, float *, float *, DAAL_INT *, float *, DAAL_INT *, DAAL_INT *); diff --git a/cpp/daal/src/externals/service_lapack_mkl.h b/cpp/daal/src/externals/service_lapack_mkl.h index b0d1cf81e9a..f75bb1fcf7a 100644 --- a/cpp/daal/src/externals/service_lapack_mkl.h +++ b/cpp/daal/src/externals/service_lapack_mkl.h @@ -243,17 +243,25 @@ struct MklLapack mkl_set_num_threads_local(old_nthr); } - static void xsyev(const char * jobz, const char * uplo, const DAAL_INT * n, double * a, const DAAL_INT * lda, double * w, double * work, - DAAL_INT * lwork, DAAL_INT * info) + static void xsyevr(const char * jobz, const char * range, const char * uplo, const DAAL_INT * n, double * a, const DAAL_INT * lda, + const double * vl, const double * vu, const DAAL_INT * il, const DAAL_INT * iu, const double * abstol, DAAL_INT * m, + double * w, double * z, const DAAL_INT * ldz, DAAL_INT * isuppz, double * work, const DAAL_INT * lwork, DAAL_INT * iwork, + const DAAL_INT * liwork, DAAL_INT * info) { - __DAAL_MKLFN_CALL_LAPACK(dsyev, (jobz, uplo, (MKL_INT *)n, a, (MKL_INT *)lda, w, work, (MKL_INT *)lwork, (MKL_INT *)info)); + __DAAL_MKLFN_CALL_LAPACK(dsyevr, (jobz, range, uplo, (const MKL_INT *)n, a, (const MKL_INT *)lda, vl, vu, (const MKL_INT *)il, + (const MKL_INT *)iu, abstol, (MKL_INT *)m, w, z, (const MKL_INT *)ldz, (MKL_INT *)isuppz, work, + (const MKL_INT *)lwork, (MKL_INT *)iwork, (const MKL_INT *)liwork, (MKL_INT *)info)); } - static void xxsyev(const char * jobz, const char * uplo, const DAAL_INT * n, double * a, const DAAL_INT * lda, double * w, double * work, - DAAL_INT * lwork, DAAL_INT * info) + static void xxsyevr(const char * jobz, const char * range, const char * uplo, const DAAL_INT * n, double * a, const DAAL_INT * lda, + const double * vl, const double * vu, const DAAL_INT * il, const DAAL_INT * iu, const double * abstol, DAAL_INT * m, + double * w, double * z, const DAAL_INT * ldz, DAAL_INT * isuppz, double * work, const DAAL_INT * lwork, DAAL_INT * iwork, + const DAAL_INT * liwork, DAAL_INT * info) { int old_nthr = mkl_set_num_threads_local(1); - __DAAL_MKLFN_CALL_LAPACK(dsyev, (jobz, uplo, (MKL_INT *)n, a, (MKL_INT *)lda, w, work, (MKL_INT *)lwork, (MKL_INT *)info)); + __DAAL_MKLFN_CALL_LAPACK(dsyevr, (jobz, range, uplo, (const MKL_INT *)n, a, (const MKL_INT *)lda, vl, vu, (const MKL_INT *)il, + (const MKL_INT *)iu, abstol, (MKL_INT *)m, w, z, (const MKL_INT *)ldz, (MKL_INT *)isuppz, work, + (const MKL_INT *)lwork, (MKL_INT *)iwork, (const MKL_INT *)liwork, (MKL_INT *)info)); mkl_set_num_threads_local(old_nthr); } @@ -487,17 +495,25 @@ struct MklLapack mkl_set_num_threads_local(old_nthr); } - static void xsyev(const char * jobz, const char * uplo, const DAAL_INT * n, float * a, const DAAL_INT * lda, float * w, float * work, - DAAL_INT * lwork, DAAL_INT * info) + static void xsyevr(const char * jobz, const char * range, const char * uplo, const DAAL_INT * n, float * a, const DAAL_INT * lda, + const float * vl, const float * vu, const DAAL_INT * il, const DAAL_INT * iu, const float * abstol, DAAL_INT * m, float * w, + float * z, const DAAL_INT * ldz, DAAL_INT * isuppz, float * work, const DAAL_INT * lwork, DAAL_INT * iwork, + const DAAL_INT * liwork, DAAL_INT * info) { - __DAAL_MKLFN_CALL_LAPACK(ssyev, (jobz, uplo, (MKL_INT *)n, a, (MKL_INT *)lda, w, work, (MKL_INT *)lwork, (MKL_INT *)info)); + __DAAL_MKLFN_CALL_LAPACK(ssyevr, (jobz, range, uplo, (const MKL_INT *)n, a, (const MKL_INT *)lda, vl, vu, (const MKL_INT *)il, + (const MKL_INT *)iu, abstol, (MKL_INT *)m, w, z, (const MKL_INT *)ldz, (MKL_INT *)isuppz, work, + (const MKL_INT *)lwork, (MKL_INT *)iwork, (const MKL_INT *)liwork, (MKL_INT *)info)); } - static void xxsyev(const char * jobz, const char * uplo, const DAAL_INT * n, float * a, const DAAL_INT * lda, float * w, float * work, - DAAL_INT * lwork, DAAL_INT * info) + static void xxsyevr(const char * jobz, const char * range, const char * uplo, const DAAL_INT * n, float * a, const DAAL_INT * lda, + const float * vl, const float * vu, const DAAL_INT * il, const DAAL_INT * iu, const float * abstol, DAAL_INT * m, float * w, + float * z, const DAAL_INT * ldz, DAAL_INT * isuppz, float * work, const DAAL_INT * lwork, DAAL_INT * iwork, + const DAAL_INT * liwork, DAAL_INT * info) { int old_nthr = mkl_set_num_threads_local(1); - __DAAL_MKLFN_CALL_LAPACK(ssyev, (jobz, uplo, (MKL_INT *)n, a, (MKL_INT *)lda, w, work, (MKL_INT *)lwork, (MKL_INT *)info)); + __DAAL_MKLFN_CALL_LAPACK(ssyevr, (jobz, range, uplo, (const MKL_INT *)n, a, (const MKL_INT *)lda, vl, vu, (const MKL_INT *)il, + (const MKL_INT *)iu, abstol, (MKL_INT *)m, w, z, (const MKL_INT *)ldz, (MKL_INT *)isuppz, work, + (const MKL_INT *)lwork, (MKL_INT *)iwork, (const MKL_INT *)liwork, (MKL_INT *)info)); mkl_set_num_threads_local(old_nthr); } diff --git a/cpp/daal/src/externals/service_lapack_ref.h b/cpp/daal/src/externals/service_lapack_ref.h index c8967469c6f..c68462caf35 100644 --- a/cpp/daal/src/externals/service_lapack_ref.h +++ b/cpp/daal/src/externals/service_lapack_ref.h @@ -204,17 +204,21 @@ struct OpenBlasLapack dsyevd_(jobz, uplo, n, a, lda, w, work, lwork, iwork, liwork, info); } - static void xsyev(const char * jobz, const char * uplo, const DAAL_INT * n, double * a, const DAAL_INT * lda, double * w, double * work, - DAAL_INT * lwork, DAAL_INT * info) + static void xsyevr(const char * jobz, const char * range, const char * uplo, const DAAL_INT * n, double * a, const DAAL_INT * lda, + const double * vl, const double * vu, const DAAL_INT * il, const DAAL_INT * iu, const double * abstol, DAAL_INT * m, + double * w, double * z, const DAAL_INT * ldz, DAAL_INT * isuppz, double * work, const DAAL_INT * lwork, DAAL_INT * iwork, + const DAAL_INT * liwork, DAAL_INT * info) { - dsyev_(jobz, uplo, n, a, lda, w, work, lwork, info); + dsyevr_(jobz, range, uplo, n, a, lda, vl, vu, il, iu, abstol, m, w, z ldz, isuppz, work, lwork, iwork, liwork, info); } - static void xxsyev(const char * jobz, const char * uplo, const DAAL_INT * n, double * a, const DAAL_INT * lda, double * w, double * work, - DAAL_INT * lwork, DAAL_INT * info) + static void xxsyevr(const char * jobz, const char * range, const char * uplo, const DAAL_INT * n, double * a, const DAAL_INT * lda, + const double * vl, const double * vu, const DAAL_INT * il, const DAAL_INT * iu, const double * abstol, DAAL_INT * m, + double * w, double * z, const DAAL_INT * ldz, DAAL_INT * isuppz, double * work, const DAAL_INT * lwork, DAAL_INT * iwork, + const DAAL_INT * liwork, DAAL_INT * info) { openblas_thread_setter ots(1); - dsyev_(jobz, uplo, n, a, lda, w, work, lwork, info); + dsyevr_(jobz, range, uplo, n, a, lda, vl, vu, il, iu, abstol, m, w, z ldz, isuppz, work, lwork, iwork, liwork, info); } static void xormqr(char * side, char * trans, DAAL_INT * m, DAAL_INT * n, DAAL_INT * k, double * a, DAAL_INT * lda, double * tau, double * c, @@ -402,17 +406,21 @@ struct OpenBlasLapack ssyevd_(jobz, uplo, n, a, lda, w, work, lwork, iwork, liwork, info); } - static void xsyev(const char * jobz, const char * uplo, const DAAL_INT * n, float * a, const DAAL_INT * lda, float * w, float * work, - DAAL_INT * lwork, DAAL_INT * info) + static void xsyevr(const char * jobz, const char * range, const char * uplo, const DAAL_INT * n, float * a, const DAAL_INT * lda, + const float * vl, const float * vu, const DAAL_INT * il, const DAAL_INT * iu, const float * abstol, DAAL_INT * m, float * w, + float * z, const DAAL_INT * ldz, DAAL_INT * isuppz, float * work, const DAAL_INT * lwork, DAAL_INT * iwork, + const DAAL_INT * liwork, DAAL_INT * info) { - ssyev_(jobz, uplo, n, a, lda, w, work, lwork, info); + ssyevr_(jobz, range, uplo, n, a, lda, vl, vu, il, iu, abstol, m, w, z ldz, isuppz, work, lwork, iwork, liwork, info); } - static void xxsyev(const char * jobz, const char * uplo, const DAAL_INT * n, float * a, const DAAL_INT * lda, float * w, float * work, - DAAL_INT * lwork, DAAL_INT * info) + static void xxsyevr(const char * jobz, const char * range, const char * uplo, const DAAL_INT * n, float * a, const DAAL_INT * lda, + const float * vl, const float * vu, const DAAL_INT * il, const DAAL_INT * iu, const float * abstol, DAAL_INT * m, float * w, + float * z, const DAAL_INT * ldz, DAAL_INT * isuppz, float * work, const DAAL_INT * lwork, DAAL_INT * iwork, + const DAAL_INT * liwork, DAAL_INT * info) { openblas_thread_setter ots(1); - ssyev_(jobz, uplo, n, a, lda, w, work, lwork, info); + ssyevr_(jobz, range, uplo, n, a, lda, vl, vu, il, iu, abstol, m, w, z ldz, isuppz, work, lwork, iwork, liwork, info); } static void xormqr(char * side, char * trans, DAAL_INT * m, DAAL_INT * n, DAAL_INT * k, float * a, DAAL_INT * lda, float * tau, float * c, From c2deb3cbb576a503c995c1c6db64a891b19458fd Mon Sep 17 00:00:00 2001 From: David Cortes Date: Thu, 3 Oct 2024 07:55:07 +0200 Subject: [PATCH 08/16] avoid too eager threshold for aborting fp32 cholesky --- cpp/daal/src/algorithms/service_kernel_math.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/daal/src/algorithms/service_kernel_math.h b/cpp/daal/src/algorithms/service_kernel_math.h index 590af4b17c1..33ce31052ec 100644 --- a/cpp/daal/src/algorithms/service_kernel_math.h +++ b/cpp/daal/src/algorithms/service_kernel_math.h @@ -668,7 +668,7 @@ bool solveEquationsSystemWithCholesky(FPType * a, FPType * b, size_t n, size_t n it's preferrable to fall back to a different type of solver that can work correctly with those. Note that the thresholds chosen there are just a guess and not based on any properties of floating points or academic research. */ - const FPType threshold_chol_diag = std::is_same::value ? 1e-4 : 1e-6; + const FPType threshold_chol_diag = 1e-6; for (size_t ix = 0; ix < n; ix++) { if (a[ix * (ix + 1)] < threshold_chol_diag) return false; @@ -756,7 +756,9 @@ bool solveEquationsSystemWithSpectralDecomposition(FPType * a, FPType * b, size_ } if (info) return false; - /* Components with small singular values get eliminated using the exact same logic as 'gelsd' with default parameters */ + /* Components with small singular values get eliminated using the exact same logic as 'gelsd' with default parameters + Note: these are hard-coded versions of machine epsilon for single and double precision. They aren't obtained through + 'std::numeric_limits' in order to avoid potential template instantiation errors with some types. */ const FPType eps = std::is_same::value ? 1.1920929e-07 : 2.220446049250313e-16; if (eigenvalues[n - 1] <= eps) return false; const double component_threshold = eps * eigenvalues[n - 1]; From 97f6f8a34c162f52ca103d3d541ca029f9873e26 Mon Sep 17 00:00:00 2001 From: David Cortes Date: Thu, 3 Oct 2024 09:13:56 +0200 Subject: [PATCH 09/16] missing commas --- cpp/daal/src/externals/service_lapack_ref.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/daal/src/externals/service_lapack_ref.h b/cpp/daal/src/externals/service_lapack_ref.h index c68462caf35..6d52326d3e1 100644 --- a/cpp/daal/src/externals/service_lapack_ref.h +++ b/cpp/daal/src/externals/service_lapack_ref.h @@ -209,7 +209,7 @@ struct OpenBlasLapack double * w, double * z, const DAAL_INT * ldz, DAAL_INT * isuppz, double * work, const DAAL_INT * lwork, DAAL_INT * iwork, const DAAL_INT * liwork, DAAL_INT * info) { - dsyevr_(jobz, range, uplo, n, a, lda, vl, vu, il, iu, abstol, m, w, z ldz, isuppz, work, lwork, iwork, liwork, info); + dsyevr_(jobz, range, uplo, n, a, lda, vl, vu, il, iu, abstol, m, w, z, ldz, isuppz, work, lwork, iwork, liwork, info); } static void xxsyevr(const char * jobz, const char * range, const char * uplo, const DAAL_INT * n, double * a, const DAAL_INT * lda, @@ -218,7 +218,7 @@ struct OpenBlasLapack const DAAL_INT * liwork, DAAL_INT * info) { openblas_thread_setter ots(1); - dsyevr_(jobz, range, uplo, n, a, lda, vl, vu, il, iu, abstol, m, w, z ldz, isuppz, work, lwork, iwork, liwork, info); + dsyevr_(jobz, range, uplo, n, a, lda, vl, vu, il, iu, abstol, m, w, z, ldz, isuppz, work, lwork, iwork, liwork, info); } static void xormqr(char * side, char * trans, DAAL_INT * m, DAAL_INT * n, DAAL_INT * k, double * a, DAAL_INT * lda, double * tau, double * c, @@ -411,7 +411,7 @@ struct OpenBlasLapack float * z, const DAAL_INT * ldz, DAAL_INT * isuppz, float * work, const DAAL_INT * lwork, DAAL_INT * iwork, const DAAL_INT * liwork, DAAL_INT * info) { - ssyevr_(jobz, range, uplo, n, a, lda, vl, vu, il, iu, abstol, m, w, z ldz, isuppz, work, lwork, iwork, liwork, info); + ssyevr_(jobz, range, uplo, n, a, lda, vl, vu, il, iu, abstol, m, w, z, ldz, isuppz, work, lwork, iwork, liwork, info); } static void xxsyevr(const char * jobz, const char * range, const char * uplo, const DAAL_INT * n, float * a, const DAAL_INT * lda, @@ -420,7 +420,7 @@ struct OpenBlasLapack const DAAL_INT * liwork, DAAL_INT * info) { openblas_thread_setter ots(1); - ssyevr_(jobz, range, uplo, n, a, lda, vl, vu, il, iu, abstol, m, w, z ldz, isuppz, work, lwork, iwork, liwork, info); + ssyevr_(jobz, range, uplo, n, a, lda, vl, vu, il, iu, abstol, m, w, z, ldz, isuppz, work, lwork, iwork, liwork, info); } static void xormqr(char * side, char * trans, DAAL_INT * m, DAAL_INT * n, DAAL_INT * k, float * a, DAAL_INT * lda, float * tau, float * c, From 7d49a1490bc00aad71e08a0f6b73acd36d0920ea Mon Sep 17 00:00:00 2001 From: David Cortes Date: Thu, 3 Oct 2024 09:18:53 +0200 Subject: [PATCH 10/16] fix declaration types --- cpp/daal/src/externals/service_lapack_declar_ref.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cpp/daal/src/externals/service_lapack_declar_ref.h b/cpp/daal/src/externals/service_lapack_declar_ref.h index 2df4ff5d0a7..418c9f05b02 100644 --- a/cpp/daal/src/externals/service_lapack_declar_ref.h +++ b/cpp/daal/src/externals/service_lapack_declar_ref.h @@ -82,10 +82,12 @@ extern "C" extern void ssyevd_(char *, char *, DAAL_INT *, float *, DAAL_INT *, float *, float *, DAAL_INT *, DAAL_INT *, DAAL_INT *, DAAL_INT *); extern void dsyevd_(char *, char *, DAAL_INT *, double *, DAAL_INT *, double *, double *, DAAL_INT *, DAAL_INT *, DAAL_INT *, DAAL_INT *); - extern void ssyevr_(const char *, const char *, const char *, const int *, float *, const int *, const float *, const float *, const int *, - const int *, const float *, int *, float *, float *, const int *, int *, float *, const int *, int *, const int *, int *); - extern void dsyevr_(const char *, const char *, const char *, const int *, double *, const int *, const double *, const double *, const int *, - const int *, const double *, int *, double *, double *, const int *, int *, double *, const int *, int *, const int *, int *); + extern void ssyevr_(const char *, const char *, const char *, const DAAL_INT *, float *, const DAAL_INT *, const float *, const float *, + const DAAL_INT *, const DAAL_INT *, const float *, DAAL_INT *, float *, float *, const DAAL_INT *, DAAL_INT *, float *, + const DAAL_INT *, DAAL_INT *, const DAAL_INT *, DAAL_INT *); + extern void dsyevr_(const char *, const char *, const char *, const DAAL_INT *, double *, const DAAL_INT *, const double *, const double *, + const DAAL_INT *, const DAAL_INT *, const double *, DAAL_INT *, double *, double *, const DAAL_INT *, DAAL_INT *, double *, + const DAAL_INT *, DAAL_INT *, const DAAL_INT *, DAAL_INT *); extern void sormqr_(char *, char *, DAAL_INT *, DAAL_INT *, DAAL_INT *, float *, DAAL_INT *, float *, float *, DAAL_INT *, float *, DAAL_INT *, DAAL_INT *); From c859340d16ce49cda8e2f9978e5bdb72507bc1bf Mon Sep 17 00:00:00 2001 From: David Cortes Date: Thu, 3 Oct 2024 10:00:31 +0200 Subject: [PATCH 11/16] fix incorrect diagonal indexing --- cpp/daal/src/algorithms/service_kernel_math.h | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/cpp/daal/src/algorithms/service_kernel_math.h b/cpp/daal/src/algorithms/service_kernel_math.h index 33ce31052ec..6f8fd06b6b4 100644 --- a/cpp/daal/src/algorithms/service_kernel_math.h +++ b/cpp/daal/src/algorithms/service_kernel_math.h @@ -671,7 +671,7 @@ bool solveEquationsSystemWithCholesky(FPType * a, FPType * b, size_t n, size_t n const FPType threshold_chol_diag = 1e-6; for (size_t ix = 0; ix < n; ix++) { - if (a[ix * (ix + 1)] < threshold_chol_diag) return false; + if (a[ix * (n + 1)] < threshold_chol_diag) return false; } /* Solve L*L' * x = b */ @@ -788,27 +788,27 @@ bool solveEquationsSystemWithSpectralDecomposition(FPType * a, FPType * b, size_ } /* Now calculate the actual solution: Qis * Qis' * B */ - char trans_yes = 'T'; - char trans_no = 'N'; - FPType one_fp = 1; - const size_t eigenvalues_offset = static_cast(num_discarded) * n; + char trans_yes = 'T'; + char trans_no = 'N'; + FPType one_fp = 1; + const size_t eigenvectors_offset = static_cast(num_discarded) * n; if (sequential) { if (nX == 1) { - BlasInst::xxgemv(&trans_yes, (DAAL_INT *)&n, &num_taken, &one_fp, eigenvectors.get() + eigenvalues_offset, (DAAL_INT *)&n, b, - &one, &zero, eigenvalues.get(), &one); - BlasInst::xxgemv(&trans_no, (DAAL_INT *)&n, &num_taken, &one_fp, eigenvectors.get() + eigenvalues_offset, (DAAL_INT *)&n, + BlasInst::xxgemv(&trans_yes, (DAAL_INT *)&n, &num_taken, &one_fp, eigenvectors.get() + eigenvectors_offset, (DAAL_INT *)&n, + b, &one, &zero, eigenvalues.get(), &one); + BlasInst::xxgemv(&trans_no, (DAAL_INT *)&n, &num_taken, &one_fp, eigenvectors.get() + eigenvectors_offset, (DAAL_INT *)&n, eigenvalues.get(), &one, &zero, b, &one); } else { BlasInst::xxgemm(&trans_yes, &trans_no, &num_taken, (DAAL_INT *)&nX, (DAAL_INT *)&n, &one_fp, - eigenvectors.get() + eigenvalues_offset, (DAAL_INT *)&n, b, (DAAL_INT *)&n, &zero, eigenvalues.get(), + eigenvectors.get() + eigenvectors_offset, (DAAL_INT *)&n, b, (DAAL_INT *)&n, &zero, eigenvalues.get(), &num_taken); BlasInst::xxgemm(&trans_no, &trans_no, (DAAL_INT *)&n, (DAAL_INT *)&nX, &num_taken, &one_fp, - eigenvectors.get() + eigenvalues_offset, (DAAL_INT *)&n, eigenvalues.get(), &num_taken, &zero, b, + eigenvectors.get() + eigenvectors_offset, (DAAL_INT *)&n, eigenvalues.get(), &num_taken, &zero, b, (DAAL_INT *)&n); } } @@ -817,20 +817,20 @@ bool solveEquationsSystemWithSpectralDecomposition(FPType * a, FPType * b, size_ { if (nX == 1) { - BlasInst::xgemv(&trans_yes, (DAAL_INT *)&n, &num_taken, &one_fp, eigenvectors.get() + eigenvalues_offset, (DAAL_INT *)&n, b, + BlasInst::xgemv(&trans_yes, (DAAL_INT *)&n, &num_taken, &one_fp, eigenvectors.get() + eigenvectors_offset, (DAAL_INT *)&n, b, &one, &zero, eigenvalues.get(), &one); - BlasInst::xgemv(&trans_no, (DAAL_INT *)&n, &num_taken, &one_fp, eigenvectors.get() + eigenvalues_offset, (DAAL_INT *)&n, + BlasInst::xgemv(&trans_no, (DAAL_INT *)&n, &num_taken, &one_fp, eigenvectors.get() + eigenvectors_offset, (DAAL_INT *)&n, eigenvalues.get(), &one, &zero, b, &one); } else { BlasInst::xgemm(&trans_yes, &trans_no, &num_taken, (DAAL_INT *)&nX, (DAAL_INT *)&n, &one_fp, - eigenvectors.get() + eigenvalues_offset, (DAAL_INT *)&n, b, (DAAL_INT *)&n, &zero, eigenvalues.get(), + eigenvectors.get() + eigenvectors_offset, (DAAL_INT *)&n, b, (DAAL_INT *)&n, &zero, eigenvalues.get(), &num_taken); - BlasInst::xgemm(&trans_no, &trans_no, (DAAL_INT *)&n, (DAAL_INT *)&nX, &num_taken, &one_fp, - eigenvectors.get() + eigenvalues_offset, (DAAL_INT *)&n, eigenvalues.get(), &num_taken, &zero, b, - (DAAL_INT *)&n); + BlasInst::xxgemm(&trans_no, &trans_no, (DAAL_INT *)&n, (DAAL_INT *)&nX, &num_taken, &one_fp, + eigenvectors.get() + eigenvectors_offset, (DAAL_INT *)&n, eigenvalues.get(), &num_taken, &zero, b, + (DAAL_INT *)&n); } } From 119edcb272400ede88d58dfe3fdfc4f67be99043 Mon Sep 17 00:00:00 2001 From: David Cortes Date: Thu, 3 Oct 2024 10:00:50 +0200 Subject: [PATCH 12/16] add test with multi-output and overdetermined --- .../dal/algo/linear_regression/test/batch.cpp | 1 + .../algo/linear_regression/test/fixture.hpp | 68 +++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/cpp/oneapi/dal/algo/linear_regression/test/batch.cpp b/cpp/oneapi/dal/algo/linear_regression/test/batch.cpp index ae37e8cac15..39910131f89 100644 --- a/cpp/oneapi/dal/algo/linear_regression/test/batch.cpp +++ b/cpp/oneapi/dal/algo/linear_regression/test/batch.cpp @@ -49,6 +49,7 @@ TEMPLATE_LIST_TEST_M(lr_batch_test, "LR common flow", "[lr][batch]", lr_types) { this->run_and_check_linear(); this->run_and_check_linear_indefinite(); + this->run_and_check_linear_indefinite_multioutput(); } TEMPLATE_LIST_TEST_M(lr_batch_test, "RR common flow", "[rr][batch]", lr_types) { diff --git a/cpp/oneapi/dal/algo/linear_regression/test/fixture.hpp b/cpp/oneapi/dal/algo/linear_regression/test/fixture.hpp index 20eca90b939..bf8c22abe37 100644 --- a/cpp/oneapi/dal/algo/linear_regression/test/fixture.hpp +++ b/cpp/oneapi/dal/algo/linear_regression/test/fixture.hpp @@ -364,6 +364,74 @@ class lr_test : public te::crtp_algo_fixture { } } + void run_and_check_linear_indefinite_multioutput(double tol = 1e-3) { + const double X[] = { -0.98912135, -0.36778665, 1.28792526, 0.19397442, 0.9202309, + 0.57710379, -0.63646365, 0.54195222, -0.31659545, -0.32238912, + 0.09716732, -1.52593041, 1.1921661, -0.67108968, 1.00026942 }; + const double y[] = { 0.13632112, 1.53203308, -0.65996941, + -0.31179486, 0.33776913, -2.2074711 }; + auto X_tbl = oneapi::dal::detail::homogen_table_builder() + .set_data_type(data_type::float64) + .set_layout(data_layout::row_major) + .allocate(3, 5) + .copy_data(X, 3, 5) + .build(); + auto y_tbl = oneapi::dal::detail::homogen_table_builder() + .set_data_type(data_type::float64) + .set_layout(data_layout::row_major) + .allocate(3, 2) + .copy_data(y, 3, 2) + .build(); + + auto desc = this->get_descriptor(); + auto train_res = this->train(desc, X_tbl, y_tbl); + const auto coefs = train_res.get_coefficients(); + + if (desc.get_result_options().test(result_options::intercept)) { + const double expected_beta[] = { + -0.18692112, -0.20034801, -0.09590892, -0.13672683, 0.56229012, + -0.97006008, 1.39413595, 0.49238012, 1.11041239, -0.79213452, + }; + const double expected_intercept[] = { -0.48964358, 0.96467681 }; + const auto expected_beta_tbl = oneapi::dal::detail::homogen_table_builder() + .set_data_type(data_type::float64) + .set_layout(data_layout::row_major) + .allocate(2, 5) + .copy_data(expected_beta, 2, 5) + .build(); + const auto expected_intercept_tbl = oneapi::dal::detail::homogen_table_builder() + .set_data_type(data_type::float64) + .set_layout(data_layout::row_major) + .allocate(1, 2) + .copy_data(expected_intercept, 1, 2) + .build(); + + const auto intercept = train_res.get_intercept(); + + SECTION("Checking intercept values") { + check_if_close(intercept, expected_intercept_tbl, tol); + } + SECTION("Checking coefficient values") { + check_if_close(coefs, expected_beta_tbl, tol); + } + } + + else { + const double expected_beta[] = { -0.22795353, -0.09930168, -0.69685744, -0.22700585, + 0.88658098, -0.88921961, 1.19505839, 1.67634561, + 1.2882766, -1.43103981 }; + const auto expected_beta_tbl = oneapi::dal::detail::homogen_table_builder() + .set_data_type(data_type::float64) + .set_layout(data_layout::row_major) + .allocate(2, 5) + .copy_data(expected_beta, 2, 5) + .build(); + SECTION("Checking coefficient values") { + check_if_close(coefs, expected_beta_tbl, tol); + } + } + } + template std::vector split_table_by_rows(const dal::table& t, std::int64_t split_count) { ONEDAL_ASSERT(0l < split_count); From 9a971618f30e50bed8a81798a1b3984c0c43b7e5 Mon Sep 17 00:00:00 2001 From: David Cortes Date: Thu, 3 Oct 2024 10:18:52 +0200 Subject: [PATCH 13/16] missing dispatch --- cpp/daal/src/externals/service_lapack.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cpp/daal/src/externals/service_lapack.h b/cpp/daal/src/externals/service_lapack.h index 68a5b0f555c..130dceb8248 100644 --- a/cpp/daal/src/externals/service_lapack.h +++ b/cpp/daal/src/externals/service_lapack.h @@ -418,6 +418,16 @@ struct LapackAutoDispatch { DAAL_DISPATCH_LAPACK_BY_CPU(fpType, xxormqr, side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, info); } + + static void xrscl(SizeType * n, const fpType * sa, fpType * sx, const SizeType * incx) + { + DAAL_DISPATCH_LAPACK_BY_CPU(fpType, xrscl, n, sa, sx, incx); + } + + static void xxrscl(SizeType * n, const fpType * sa, fpType * sx, const SizeType * incx) + { + DAAL_DISPATCH_LAPACK_BY_CPU(fpType, xxrscl, n, sa, sx, incx); + } }; } // namespace internal From 81fca31562f58fb9cb0b00f938492eb9317e93cb Mon Sep 17 00:00:00 2001 From: David Cortes Date: Thu, 10 Oct 2024 17:22:49 +0200 Subject: [PATCH 14/16] skip recently introduced test on GPU --- cpp/oneapi/dal/test/engine/fixtures.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/oneapi/dal/test/engine/fixtures.hpp b/cpp/oneapi/dal/test/engine/fixtures.hpp index d64cda0440a..ff3202d1e7f 100644 --- a/cpp/oneapi/dal/test/engine/fixtures.hpp +++ b/cpp/oneapi/dal/test/engine/fixtures.hpp @@ -89,6 +89,10 @@ class float_algo_fixture : public algo_fixture { return is_double && !this->get_policy().has_native_float64(); } + bool running_on_gpu() { + return this->get_policy().is_gpu(); + } + table_id get_homogen_table_id() const { return table_id::homogen(); } From 74dce1c17d710d81243c9011871dbdd6687d7f32 Mon Sep 17 00:00:00 2001 From: David Cortes Date: Fri, 11 Oct 2024 08:03:34 +0200 Subject: [PATCH 15/16] missing file --- cpp/oneapi/dal/algo/linear_regression/test/batch.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cpp/oneapi/dal/algo/linear_regression/test/batch.cpp b/cpp/oneapi/dal/algo/linear_regression/test/batch.cpp index 39910131f89..269067d05ff 100644 --- a/cpp/oneapi/dal/algo/linear_regression/test/batch.cpp +++ b/cpp/oneapi/dal/algo/linear_regression/test/batch.cpp @@ -48,6 +48,12 @@ TEMPLATE_LIST_TEST_M(lr_batch_test, "LR common flow", "[lr][batch]", lr_types) { this->generate(777); this->run_and_check_linear(); +} + +TEMPLATE_LIST_TEST_M(lr_batch_test, "LR with non-PSD matrix", "[lr][batch-nonpsd]", lr_types) { + SKIP_IF(this->running_on_gpu()); + + this->generate(777); this->run_and_check_linear_indefinite(); this->run_and_check_linear_indefinite_multioutput(); } From 4c1271e32b86acc8bdebb2452f9fb0442dd190cf Mon Sep 17 00:00:00 2001 From: David Cortes Date: Fri, 11 Oct 2024 10:23:30 +0200 Subject: [PATCH 16/16] rename GPU check and move to correct file --- cpp/oneapi/dal/algo/linear_regression/test/batch.cpp | 2 +- cpp/oneapi/dal/algo/linear_regression/test/fixture.hpp | 4 ++++ cpp/oneapi/dal/test/engine/fixtures.hpp | 4 ---- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/oneapi/dal/algo/linear_regression/test/batch.cpp b/cpp/oneapi/dal/algo/linear_regression/test/batch.cpp index 269067d05ff..a5585202c31 100644 --- a/cpp/oneapi/dal/algo/linear_regression/test/batch.cpp +++ b/cpp/oneapi/dal/algo/linear_regression/test/batch.cpp @@ -51,7 +51,7 @@ TEMPLATE_LIST_TEST_M(lr_batch_test, "LR common flow", "[lr][batch]", lr_types) { } TEMPLATE_LIST_TEST_M(lr_batch_test, "LR with non-PSD matrix", "[lr][batch-nonpsd]", lr_types) { - SKIP_IF(this->running_on_gpu()); + SKIP_IF(this->non_psd_system_not_supported_on_device()); this->generate(777); this->run_and_check_linear_indefinite(); diff --git a/cpp/oneapi/dal/algo/linear_regression/test/fixture.hpp b/cpp/oneapi/dal/algo/linear_regression/test/fixture.hpp index bf8c22abe37..e1e54092552 100644 --- a/cpp/oneapi/dal/algo/linear_regression/test/fixture.hpp +++ b/cpp/oneapi/dal/algo/linear_regression/test/fixture.hpp @@ -65,6 +65,10 @@ class lr_test : public te::crtp_algo_fixture { return static_cast(this); } + bool non_psd_system_not_supported_on_device() { + return this->get_policy().is_gpu(); + } + table compute_responses(const table& beta, const table& bias, const table& data) const { const auto s_count = data.get_row_count(); diff --git a/cpp/oneapi/dal/test/engine/fixtures.hpp b/cpp/oneapi/dal/test/engine/fixtures.hpp index ff3202d1e7f..d64cda0440a 100644 --- a/cpp/oneapi/dal/test/engine/fixtures.hpp +++ b/cpp/oneapi/dal/test/engine/fixtures.hpp @@ -89,10 +89,6 @@ class float_algo_fixture : public algo_fixture { return is_double && !this->get_policy().has_native_float64(); } - bool running_on_gpu() { - return this->get_policy().is_gpu(); - } - table_id get_homogen_table_id() const { return table_id::homogen(); }