Use spectral decomposition as more accurate fallback from Cholesky (#…

…2930) * Use spectral decomposition as more accurate fallback from Cholesky * correct buffer size * add optional pragma (has no effect though) * add test with an overdetermined system * Update cpp/daal/src/algorithms/service_kernel_math.h Co-authored-by: Victoriya Fedotova <[email protected]> * PR comments * use syevr which is faster than syev * avoid too eager threshold for aborting fp32 cholesky * missing commas * fix declaration types * fix incorrect diagonal indexing * add test with multi-output and overdetermined * missing dispatch * skip recently introduced test on GPU * missing file * rename GPU check and move to correct file --------- Co-authored-by: Victoriya Fedotova <[email protected]>
uxlfoundation · Oct 16, 2024 · e93063f · e93063f
1 parent 38bbbd9
commit e93063f
Show file tree

Hide file tree

Showing 7 changed files with 486 additions and 29 deletions.
diff --git a/cpp/daal/src/algorithms/service_kernel_math.h b/cpp/daal/src/algorithms/service_kernel_math.h
@@ -24,6 +24,8 @@
 #ifndef __SERVICE_KERNEL_MATH_H__
 #define __SERVICE_KERNEL_MATH_H__
 
+#include <type_traits>
+
 #include "services/daal_defines.h"
 #include "services/env_detect.h"
 #include "src/algorithms/service_error_handling.h"
@@ -660,6 +662,18 @@ bool solveEquationsSystemWithCholesky(FPType * a, FPType * b, size_t n, size_t n
     }
     if (info != 0) return false;
 
+    /* Note: there can be cases in which the matrix is singular / rank-deficient, but due to numerical
+    inaccuracies, Cholesky still succeeds. In such cases, it might produce a solution successfully, but
+    it will not be the minimum-norm solution, and might be prone towards having too large numbers. Thus
+    it's preferrable to fall back to a different type of solver that can work correctly with those.
+    Note that the thresholds chosen there are just a guess and not based on any properties of floating
+    points or academic research. */
+    const FPType threshold_chol_diag = 1e-6;
+    for (size_t ix = 0; ix < n; ix++)
+    {
+        if (a[ix * (n + 1)] < threshold_chol_diag) return false;
+    }
+
     /* Solve L*L' * x = b */
     if (sequential)
     {
@@ -673,72 +687,178 @@ bool solveEquationsSystemWithCholesky(FPType * a, FPType * b, size_t n, size_t n
 }
 
 template <typename FPType, CpuType cpu>
-bool solveEquationsSystemWithPLU(FPType * a, FPType * b, size_t n, size_t nX, bool sequential, bool extendFromSymmetric)
+bool solveEquationsSystemWithSpectralDecomposition(FPType * a, FPType * b, size_t n, size_t nX, bool sequential)
 {
-    if (extendFromSymmetric)
+    /* Storage for the eigenvalues.
+    Note: this allocates more size than they might require when nX > 1, because the same
+    buffer will get reused later on and needs the extra size. Those additional entries
+    will not be filled with eigenvalues. */
+    TArrayScalable<FPType, cpu> eigenvalues(n * nX);
+    DAAL_CHECK_MALLOC(eigenvalues.get());
+
+    TArrayScalable<FPType, cpu> eigenvectors(n * n);
+    DAAL_CHECK_MALLOC(eigenvectors.get());
+
+    TArrayScalable<DAAL_INT, cpu> buffer_isuppz(2 * n);
+    DAAL_CHECK_MALLOC(buffer_isuppz.get());
+
+    /* SYEV parameters */
+    const char jobz  = 'V';
+    const char range = 'A';
+    const char uplo  = 'U';
+    FPType zero      = 0;
+    DAAL_INT info;
+    DAAL_INT num_eigenvalues;
+
+    /* Query the procedure for size of required buffer */
+    DAAL_INT lwork_query_indicator = -1;
+    FPType buffer_size_work        = 0;
+    DAAL_INT buffer_size_iwork     = 0;
+    if (sequential)
     {
-        /* Extend symmetric matrix to generic through filling of upper triangle */
-        for (size_t i = 0; i < n; ++i)
-        {
-            for (size_t j = 0; j < i; ++j)
-            {
-                a[j * n + i] = a[i * n + j];
-            }
-        }
+        LapackInst<FPType, cpu>::xxsyevr(&jobz, &range, &uplo, (DAAL_INT *)&n, a, (DAAL_INT *)&n, nullptr, nullptr, nullptr, nullptr, &zero,
+                                         &num_eigenvalues, eigenvalues.get(), eigenvectors.get(), (DAAL_INT *)&n, buffer_isuppz.get(),
+                                         &buffer_size_work, &lwork_query_indicator, &buffer_size_iwork, &lwork_query_indicator, &info);
     }
 
-    /* GETRF and GETRS parameters */
-    char trans    = 'N';
-    DAAL_INT info = 0;
+    else
+    {
+        LapackInst<FPType, cpu>::xsyevr(&jobz, &range, &uplo, (DAAL_INT *)&n, a, (DAAL_INT *)&n, nullptr, nullptr, nullptr, nullptr, &zero,
+                                        &num_eigenvalues, eigenvalues.get(), eigenvectors.get(), (DAAL_INT *)&n, buffer_isuppz.get(),
+                                        &buffer_size_work, &lwork_query_indicator, &buffer_size_iwork, &lwork_query_indicator, &info);
+    }
+
+    if (info) return false;
+
+    /* Check that buffer sizes will not overflow when passed to LAPACK */
+    if (static_cast<size_t>(buffer_size_work) > std::numeric_limits<DAAL_INT>::max()) return false;
+    if (buffer_size_iwork < 0) return false;
 
-    TArrayScalable<DAAL_INT, cpu> ipiv(n);
-    DAAL_CHECK_MALLOC(ipiv.get());
+    /* Allocate work buffers as needed */
+    DAAL_INT work_buffer_size = static_cast<DAAL_INT>(buffer_size_work);
+    TArrayScalable<FPType, cpu> work_buffer(work_buffer_size);
+    DAAL_CHECK_MALLOC(work_buffer.get());
+    TArrayScalable<DAAL_INT, cpu> iwork_buffer(buffer_size_iwork);
+    DAAL_CHECK_MALLOC(iwork_buffer.get());
 
-    /* Perform P*L*U factorization of A */
+    /* Perform Q*diag(l)*Q' factorization of A */
     if (sequential)
     {
-        LapackInst<FPType, cpu>::xxgetrf((DAAL_INT *)&n, (DAAL_INT *)&n, a, (DAAL_INT *)&n, ipiv.get(), &info);
+        LapackInst<FPType, cpu>::xxsyevr(&jobz, &range, &uplo, (DAAL_INT *)&n, a, (DAAL_INT *)&n, nullptr, nullptr, nullptr, nullptr, &zero,
+                                         &num_eigenvalues, eigenvalues.get(), eigenvectors.get(), (DAAL_INT *)&n, buffer_isuppz.get(),
+                                         work_buffer.get(), &work_buffer_size, iwork_buffer.get(), &buffer_size_iwork, &info);
     }
     else
     {
-        LapackInst<FPType, cpu>::xgetrf((DAAL_INT *)&n, (DAAL_INT *)&n, a, (DAAL_INT *)&n, ipiv.get(), &info);
+        LapackInst<FPType, cpu>::xsyevr(&jobz, &range, &uplo, (DAAL_INT *)&n, a, (DAAL_INT *)&n, nullptr, nullptr, nullptr, nullptr, &zero,
+                                        &num_eigenvalues, eigenvalues.get(), eigenvectors.get(), (DAAL_INT *)&n, buffer_isuppz.get(),
+                                        work_buffer.get(), &work_buffer_size, iwork_buffer.get(), &buffer_size_iwork, &info);
     }
-    if (info != 0) return false;
+    if (info) return false;
+
+    /* Components with small singular values get eliminated using the exact same logic as 'gelsd' with default parameters
+    Note: these are hard-coded versions of machine epsilon for single and double precision. They aren't obtained through
+    'std::numeric_limits' in order to avoid potential template instantiation errors with some types. */
+    const FPType eps = std::is_same<FPType, float>::value ? 1.1920929e-07 : 2.220446049250313e-16;
+    if (eigenvalues[n - 1] <= eps) return false;
+    const double component_threshold = eps * eigenvalues[n - 1];
+    DAAL_INT num_discarded;
+    for (num_discarded = 0; num_discarded < static_cast<DAAL_INT>(n) - 1; num_discarded++)
+    {
+        if (eigenvalues[num_discarded] > component_threshold) break;
+    }
+
+    /* Create the square root of the inverse: Qis = Q * diag(1 / sqrt(l)) */
+    DAAL_INT num_taken = static_cast<DAAL_INT>(n) - num_discarded;
+    daal::internal::MathInst<FPType, cpu>::vSqrt(num_taken, eigenvalues.get() + num_discarded, eigenvalues.get() + num_discarded);
+    DAAL_INT one = 1;
+    PRAGMA_IVDEP
+    for (size_t col = num_discarded; col < n; col++)
+    {
+        const FPType scale = eigenvalues[col];
+        if (sequential)
+        {
+            LapackInst<FPType, cpu>::xxrscl((DAAL_INT *)&n, &scale, eigenvectors.get() + col * n, &one);
+        }
 
-    /* Solve P*L*U * x = b */
+        else
+        {
+            LapackInst<FPType, cpu>::xrscl((DAAL_INT *)&n, &scale, eigenvectors.get() + col * n, &one);
+        }
+    }
+
+    /* Now calculate the actual solution: Qis * Qis' * B */
+    char trans_yes                   = 'T';
+    char trans_no                    = 'N';
+    FPType one_fp                    = 1;
+    const size_t eigenvectors_offset = static_cast<size_t>(num_discarded) * n;
     if (sequential)
     {
-        LapackInst<FPType, cpu>::xxgetrs(&trans, (DAAL_INT *)&n, (DAAL_INT *)&nX, a, (DAAL_INT *)&n, ipiv.get(), b, (DAAL_INT *)&n, &info);
+        if (nX == 1)
+        {
+            BlasInst<FPType, cpu>::xxgemv(&trans_yes, (DAAL_INT *)&n, &num_taken, &one_fp, eigenvectors.get() + eigenvectors_offset, (DAAL_INT *)&n,
+                                          b, &one, &zero, eigenvalues.get(), &one);
+            BlasInst<FPType, cpu>::xxgemv(&trans_no, (DAAL_INT *)&n, &num_taken, &one_fp, eigenvectors.get() + eigenvectors_offset, (DAAL_INT *)&n,
+                                          eigenvalues.get(), &one, &zero, b, &one);
+        }
+
+        else
+        {
+            BlasInst<FPType, cpu>::xxgemm(&trans_yes, &trans_no, &num_taken, (DAAL_INT *)&nX, (DAAL_INT *)&n, &one_fp,
+                                          eigenvectors.get() + eigenvectors_offset, (DAAL_INT *)&n, b, (DAAL_INT *)&n, &zero, eigenvalues.get(),
+                                          &num_taken);
+            BlasInst<FPType, cpu>::xxgemm(&trans_no, &trans_no, (DAAL_INT *)&n, (DAAL_INT *)&nX, &num_taken, &one_fp,
+                                          eigenvectors.get() + eigenvectors_offset, (DAAL_INT *)&n, eigenvalues.get(), &num_taken, &zero, b,
+                                          (DAAL_INT *)&n);
+        }
     }
+
     else
     {
-        LapackInst<FPType, cpu>::xgetrs(&trans, (DAAL_INT *)&n, (DAAL_INT *)&nX, a, (DAAL_INT *)&n, ipiv.get(), b, (DAAL_INT *)&n, &info);
+        if (nX == 1)
+        {
+            BlasInst<FPType, cpu>::xgemv(&trans_yes, (DAAL_INT *)&n, &num_taken, &one_fp, eigenvectors.get() + eigenvectors_offset, (DAAL_INT *)&n, b,
+                                         &one, &zero, eigenvalues.get(), &one);
+            BlasInst<FPType, cpu>::xgemv(&trans_no, (DAAL_INT *)&n, &num_taken, &one_fp, eigenvectors.get() + eigenvectors_offset, (DAAL_INT *)&n,
+                                         eigenvalues.get(), &one, &zero, b, &one);
+        }
+
+        else
+        {
+            BlasInst<FPType, cpu>::xgemm(&trans_yes, &trans_no, &num_taken, (DAAL_INT *)&nX, (DAAL_INT *)&n, &one_fp,
+                                         eigenvectors.get() + eigenvectors_offset, (DAAL_INT *)&n, b, (DAAL_INT *)&n, &zero, eigenvalues.get(),
+                                         &num_taken);
+            BlasInst<FPType, cpu>::xxgemm(&trans_no, &trans_no, (DAAL_INT *)&n, (DAAL_INT *)&nX, &num_taken, &one_fp,
+                                          eigenvectors.get() + eigenvectors_offset, (DAAL_INT *)&n, eigenvalues.get(), &num_taken, &zero, b,
+                                          (DAAL_INT *)&n);
+        }
     }
-    return (info == 0);
+
+    return true;
 }
 
 template <typename FPType, CpuType cpu>
 bool solveSymmetricEquationsSystem(FPType * a, FPType * b, size_t n, size_t nX, bool sequential)
 {
-    /* Copy data for fallback from Cholesky to PLU factorization */
+    /* Copy data for fallback from Cholesky to spectral decomposition */
     TArrayScalable<FPType, cpu> aCopy(n * n);
-    TArrayScalable<FPType, cpu> bCopy(n);
+    TArrayScalable<FPType, cpu> bCopy(n * nX);
     DAAL_CHECK_MALLOC(aCopy.get());
     DAAL_CHECK_MALLOC(bCopy.get());
 
     int copy_status = services::internal::daal_memcpy_s(aCopy.get(), n * n * sizeof(FPType), a, n * n * sizeof(FPType));
-    copy_status += services::internal::daal_memcpy_s(bCopy.get(), n * sizeof(FPType), b, n * sizeof(FPType));
+    copy_status += services::internal::daal_memcpy_s(bCopy.get(), n * nX * sizeof(FPType), b, n * nX * sizeof(FPType));
 
     if (copy_status != 0) return false;
 
     /* Try to solve with Cholesky factorization */
     if (!solveEquationsSystemWithCholesky<FPType, cpu>(a, b, n, nX, sequential))
     {
-        /* Fallback to PLU factorization */
-        bool status = solveEquationsSystemWithPLU<FPType, cpu>(aCopy.get(), bCopy.get(), n, nX, sequential, true);
+        /* Fall back to spectral decomposition */
+        bool status = solveEquationsSystemWithSpectralDecomposition<FPType, cpu>(aCopy.get(), bCopy.get(), n, nX, sequential);
         if (status)
         {
-            status = status && (services::internal::daal_memcpy_s(b, n * sizeof(FPType), bCopy.get(), n * sizeof(FPType)) == 0);
+            status = status && (services::internal::daal_memcpy_s(b, n * nX * sizeof(FPType), bCopy.get(), n * nX * sizeof(FPType)) == 0);
         }
         return status;
     }

diff --git a/cpp/daal/src/externals/service_lapack.h b/cpp/daal/src/externals/service_lapack.h
@@ -21,6 +21,11 @@
 //--
 */
 
+/* Note: this file is not auto-generated. These 'x'/'xx' functions are manually added here on an
+as-needed basis, and are only used internally within the library so their signatures might not
+match LAPACK's to every minutiae like passing pointers to scalars or passing them by value, or
+having 'const' qualifiers or not. */
+
 #ifndef __SERVICE_LAPACK_H__
 #define __SERVICE_LAPACK_H__
 
@@ -181,6 +186,9 @@ struct Lapack
         _impl<fpType, cpu>::xxgesvd(jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, work, lwork, info);
     }
 
+    /* Note: 'syevd' and 'syevr' both compute symmetric eigenvalues, but they use different routines. 'syevd'
+    is slower but more precise and should thus be preferred for situations in which small numerical inaccuracies
+    have adverse side effects, whereas 'syevr' is faster and more suitable for general usage. */
     static void xsyevd(char * jobz, char * uplo, SizeType * n, fpType * a, SizeType * lda, fpType * w, fpType * work, SizeType * lwork,
                        SizeType * iwork, SizeType * liwork, SizeType * info)
     {
@@ -193,6 +201,22 @@ struct Lapack
         _impl<fpType, cpu>::xxsyevd(jobz, uplo, n, a, lda, w, work, lwork, iwork, liwork, info);
     }
 
+    static void xsyevr(const char * jobz, const char * range, const char * uplo, const SizeType * n, fpType * a, const SizeType * lda,
+                       const fpType * vl, const fpType * vu, const SizeType * il, const SizeType * iu, const fpType * abstol, SizeType * m,
+                       fpType * w, fpType * z, const SizeType * ldz, SizeType * isuppz, fpType * work, const SizeType * lwork, SizeType * iwork,
+                       const SizeType * liwork, SizeType * info)
+    {
+        _impl<fpType, cpu>::xsyevr(jobz, range, uplo, n, a, lda, vl, vu, il, iu, abstol, m, w, z, ldz, isuppz, work, lwork, iwork, liwork, info);
+    }
+
+    static void xxsyevr(const char * jobz, const char * range, const char * uplo, const SizeType * n, fpType * a, const SizeType * lda,
+                        const fpType * vl, const fpType * vu, const SizeType * il, const SizeType * iu, const fpType * abstol, SizeType * m,
+                        fpType * w, fpType * z, const SizeType * ldz, SizeType * isuppz, fpType * work, const SizeType * lwork, SizeType * iwork,
+                        const SizeType * liwork, SizeType * info)
+    {
+        _impl<fpType, cpu>::xxsyevr(jobz, range, uplo, n, a, lda, vl, vu, il, iu, abstol, m, w, z, ldz, isuppz, work, lwork, iwork, liwork, info);
+    }
+
     static void xormqr(char * side, char * trans, SizeType * m, SizeType * n, SizeType * k, fpType * a, SizeType * lda, fpType * tau, fpType * c,
                        SizeType * ldc, fpType * work, SizeType * lwork, SizeType * info)
     {
@@ -204,6 +228,10 @@ struct Lapack
     {
         _impl<fpType, cpu>::xxormqr(side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, info);
     }
+
+    static void xrscl(const SizeType * n, const fpType * sa, fpType * sx, const SizeType * incx) { _impl<fpType, cpu>::xrscl(n, sa, sx, incx); }
+
+    static void xxrscl(const SizeType * n, const fpType * sa, fpType * sx, const SizeType * incx) { _impl<fpType, cpu>::xxrscl(n, sa, sx, incx); }
 };
 
 template <typename fpType, CpuType cpu>
@@ -361,6 +389,24 @@ struct LapackAutoDispatch
         DAAL_DISPATCH_LAPACK_BY_CPU(fpType, xxsyevd, jobz, uplo, n, a, lda, w, work, lwork, iwork, liwork, info);
     }
 
+    static void xsyevr(const char * jobz, const char * range, const char * uplo, const SizeType * n, fpType * a, const SizeType * lda,
+                       const fpType * vl, const fpType * vu, const SizeType * il, const SizeType * iu, const fpType * abstol, SizeType * m,
+                       fpType * w, fpType * z, const SizeType * ldz, SizeType * isuppz, fpType * work, const SizeType * lwork, SizeType * iwork,
+                       const SizeType * liwork, SizeType * info)
+    {
+        DAAL_DISPATCH_LAPACK_BY_CPU(fpType, xsyevr, jobz, range, uplo, n, a, lda, vl, vu, il, iu, abstol, m, w, z, ldz, isuppz, work, lwork, iwork,
+                                    liwork, info);
+    }
+
+    static void xxsyevr(const char * jobz, const char * range, const char * uplo, const SizeType * n, fpType * a, const SizeType * lda,
+                        const fpType * vl, const fpType * vu, const SizeType * il, const SizeType * iu, const fpType * abstol, SizeType * m,
+                        fpType * w, fpType * z, const SizeType * ldz, SizeType * isuppz, fpType * work, const SizeType * lwork, SizeType * iwork,
+                        const SizeType * liwork, SizeType * info)
+    {
+        DAAL_DISPATCH_LAPACK_BY_CPU(fpType, xxsyevr, jobz, range, uplo, n, a, lda, vl, vu, il, iu, abstol, m, w, z, ldz, isuppz, work, lwork, iwork,
+                                    liwork, info);
+    }
+
     static void xormqr(char * side, char * trans, SizeType * m, SizeType * n, SizeType * k, fpType * a, SizeType * lda, fpType * tau, fpType * c,
                        SizeType * ldc, fpType * work, SizeType * lwork, SizeType * info)
     {
@@ -372,6 +418,16 @@ struct LapackAutoDispatch
     {
         DAAL_DISPATCH_LAPACK_BY_CPU(fpType, xxormqr, side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, info);
     }
+
+    static void xrscl(SizeType * n, const fpType * sa, fpType * sx, const SizeType * incx)
+    {
+        DAAL_DISPATCH_LAPACK_BY_CPU(fpType, xrscl, n, sa, sx, incx);
+    }
+
+    static void xxrscl(SizeType * n, const fpType * sa, fpType * sx, const SizeType * incx)
+    {
+        DAAL_DISPATCH_LAPACK_BY_CPU(fpType, xxrscl, n, sa, sx, incx);
+    }
 };
 
 } // namespace internal

diff --git a/cpp/daal/src/externals/service_lapack_declar_ref.h b/cpp/daal/src/externals/service_lapack_declar_ref.h
@@ -82,10 +82,20 @@ extern "C"
     extern void ssyevd_(char *, char *, DAAL_INT *, float *, DAAL_INT *, float *, float *, DAAL_INT *, DAAL_INT *, DAAL_INT *, DAAL_INT *);
     extern void dsyevd_(char *, char *, DAAL_INT *, double *, DAAL_INT *, double *, double *, DAAL_INT *, DAAL_INT *, DAAL_INT *, DAAL_INT *);
 
+    extern void ssyevr_(const char *, const char *, const char *, const DAAL_INT *, float *, const DAAL_INT *, const float *, const float *,
+                        const DAAL_INT *, const DAAL_INT *, const float *, DAAL_INT *, float *, float *, const DAAL_INT *, DAAL_INT *, float *,
+                        const DAAL_INT *, DAAL_INT *, const DAAL_INT *, DAAL_INT *);
+    extern void dsyevr_(const char *, const char *, const char *, const DAAL_INT *, double *, const DAAL_INT *, const double *, const double *,
+                        const DAAL_INT *, const DAAL_INT *, const double *, DAAL_INT *, double *, double *, const DAAL_INT *, DAAL_INT *, double *,
+                        const DAAL_INT *, DAAL_INT *, const DAAL_INT *, DAAL_INT *);
+
     extern void sormqr_(char *, char *, DAAL_INT *, DAAL_INT *, DAAL_INT *, float *, DAAL_INT *, float *, float *, DAAL_INT *, float *, DAAL_INT *,
                         DAAL_INT *);
     extern void dormqr_(char *, char *, DAAL_INT *, DAAL_INT *, DAAL_INT *, double *, DAAL_INT *, double *, double *, DAAL_INT *, double *,
                         DAAL_INT *, DAAL_INT *);
+
+    extern void drscl_(const DAAL_INT *, const double *, double *, const DAAL_INT *);
+    extern void srscl_(const DAAL_INT *, const float *, float *, const DAAL_INT *);
 }
 
 } // namespace ref