Skip to content

Commit

Permalink
Use spectral decomposition as more accurate fallback from Cholesky (#…
Browse files Browse the repository at this point in the history
…2930)

* Use spectral decomposition as more accurate fallback from Cholesky

* correct buffer size

* add optional pragma (has no effect though)

* add test with an overdetermined system

* Update cpp/daal/src/algorithms/service_kernel_math.h

Co-authored-by: Victoriya Fedotova <[email protected]>

* PR comments

* use syevr which is faster than syev

* avoid too eager threshold for aborting fp32 cholesky

* missing commas

* fix declaration types

* fix incorrect diagonal indexing

* add test with multi-output and overdetermined

* missing dispatch

* skip recently introduced test on GPU

* missing file

* rename GPU check and move to correct file

---------

Co-authored-by: Victoriya Fedotova <[email protected]>
  • Loading branch information
david-cortes-intel and Vika-F authored Oct 16, 2024
1 parent 38bbbd9 commit e93063f
Show file tree
Hide file tree
Showing 7 changed files with 486 additions and 29 deletions.
178 changes: 149 additions & 29 deletions cpp/daal/src/algorithms/service_kernel_math.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
#ifndef __SERVICE_KERNEL_MATH_H__
#define __SERVICE_KERNEL_MATH_H__

#include <type_traits>

#include "services/daal_defines.h"
#include "services/env_detect.h"
#include "src/algorithms/service_error_handling.h"
Expand Down Expand Up @@ -660,6 +662,18 @@ bool solveEquationsSystemWithCholesky(FPType * a, FPType * b, size_t n, size_t n
}
if (info != 0) return false;

/* Note: there can be cases in which the matrix is singular / rank-deficient, but due to numerical
inaccuracies, Cholesky still succeeds. In such cases, it might produce a solution successfully, but
it will not be the minimum-norm solution, and might be prone towards having too large numbers. Thus
it's preferrable to fall back to a different type of solver that can work correctly with those.
Note that the thresholds chosen there are just a guess and not based on any properties of floating
points or academic research. */
const FPType threshold_chol_diag = 1e-6;
for (size_t ix = 0; ix < n; ix++)
{
if (a[ix * (n + 1)] < threshold_chol_diag) return false;
}

/* Solve L*L' * x = b */
if (sequential)
{
Expand All @@ -673,72 +687,178 @@ bool solveEquationsSystemWithCholesky(FPType * a, FPType * b, size_t n, size_t n
}

template <typename FPType, CpuType cpu>
bool solveEquationsSystemWithPLU(FPType * a, FPType * b, size_t n, size_t nX, bool sequential, bool extendFromSymmetric)
bool solveEquationsSystemWithSpectralDecomposition(FPType * a, FPType * b, size_t n, size_t nX, bool sequential)
{
if (extendFromSymmetric)
/* Storage for the eigenvalues.
Note: this allocates more size than they might require when nX > 1, because the same
buffer will get reused later on and needs the extra size. Those additional entries
will not be filled with eigenvalues. */
TArrayScalable<FPType, cpu> eigenvalues(n * nX);
DAAL_CHECK_MALLOC(eigenvalues.get());

TArrayScalable<FPType, cpu> eigenvectors(n * n);
DAAL_CHECK_MALLOC(eigenvectors.get());

TArrayScalable<DAAL_INT, cpu> buffer_isuppz(2 * n);
DAAL_CHECK_MALLOC(buffer_isuppz.get());

/* SYEV parameters */
const char jobz = 'V';
const char range = 'A';
const char uplo = 'U';
FPType zero = 0;
DAAL_INT info;
DAAL_INT num_eigenvalues;

/* Query the procedure for size of required buffer */
DAAL_INT lwork_query_indicator = -1;
FPType buffer_size_work = 0;
DAAL_INT buffer_size_iwork = 0;
if (sequential)
{
/* Extend symmetric matrix to generic through filling of upper triangle */
for (size_t i = 0; i < n; ++i)
{
for (size_t j = 0; j < i; ++j)
{
a[j * n + i] = a[i * n + j];
}
}
LapackInst<FPType, cpu>::xxsyevr(&jobz, &range, &uplo, (DAAL_INT *)&n, a, (DAAL_INT *)&n, nullptr, nullptr, nullptr, nullptr, &zero,
&num_eigenvalues, eigenvalues.get(), eigenvectors.get(), (DAAL_INT *)&n, buffer_isuppz.get(),
&buffer_size_work, &lwork_query_indicator, &buffer_size_iwork, &lwork_query_indicator, &info);
}

/* GETRF and GETRS parameters */
char trans = 'N';
DAAL_INT info = 0;
else
{
LapackInst<FPType, cpu>::xsyevr(&jobz, &range, &uplo, (DAAL_INT *)&n, a, (DAAL_INT *)&n, nullptr, nullptr, nullptr, nullptr, &zero,
&num_eigenvalues, eigenvalues.get(), eigenvectors.get(), (DAAL_INT *)&n, buffer_isuppz.get(),
&buffer_size_work, &lwork_query_indicator, &buffer_size_iwork, &lwork_query_indicator, &info);
}

if (info) return false;

/* Check that buffer sizes will not overflow when passed to LAPACK */
if (static_cast<size_t>(buffer_size_work) > std::numeric_limits<DAAL_INT>::max()) return false;
if (buffer_size_iwork < 0) return false;

TArrayScalable<DAAL_INT, cpu> ipiv(n);
DAAL_CHECK_MALLOC(ipiv.get());
/* Allocate work buffers as needed */
DAAL_INT work_buffer_size = static_cast<DAAL_INT>(buffer_size_work);
TArrayScalable<FPType, cpu> work_buffer(work_buffer_size);
DAAL_CHECK_MALLOC(work_buffer.get());
TArrayScalable<DAAL_INT, cpu> iwork_buffer(buffer_size_iwork);
DAAL_CHECK_MALLOC(iwork_buffer.get());

/* Perform P*L*U factorization of A */
/* Perform Q*diag(l)*Q' factorization of A */
if (sequential)
{
LapackInst<FPType, cpu>::xxgetrf((DAAL_INT *)&n, (DAAL_INT *)&n, a, (DAAL_INT *)&n, ipiv.get(), &info);
LapackInst<FPType, cpu>::xxsyevr(&jobz, &range, &uplo, (DAAL_INT *)&n, a, (DAAL_INT *)&n, nullptr, nullptr, nullptr, nullptr, &zero,
&num_eigenvalues, eigenvalues.get(), eigenvectors.get(), (DAAL_INT *)&n, buffer_isuppz.get(),
work_buffer.get(), &work_buffer_size, iwork_buffer.get(), &buffer_size_iwork, &info);
}
else
{
LapackInst<FPType, cpu>::xgetrf((DAAL_INT *)&n, (DAAL_INT *)&n, a, (DAAL_INT *)&n, ipiv.get(), &info);
LapackInst<FPType, cpu>::xsyevr(&jobz, &range, &uplo, (DAAL_INT *)&n, a, (DAAL_INT *)&n, nullptr, nullptr, nullptr, nullptr, &zero,
&num_eigenvalues, eigenvalues.get(), eigenvectors.get(), (DAAL_INT *)&n, buffer_isuppz.get(),
work_buffer.get(), &work_buffer_size, iwork_buffer.get(), &buffer_size_iwork, &info);
}
if (info != 0) return false;
if (info) return false;

/* Components with small singular values get eliminated using the exact same logic as 'gelsd' with default parameters
Note: these are hard-coded versions of machine epsilon for single and double precision. They aren't obtained through
'std::numeric_limits' in order to avoid potential template instantiation errors with some types. */
const FPType eps = std::is_same<FPType, float>::value ? 1.1920929e-07 : 2.220446049250313e-16;
if (eigenvalues[n - 1] <= eps) return false;
const double component_threshold = eps * eigenvalues[n - 1];
DAAL_INT num_discarded;
for (num_discarded = 0; num_discarded < static_cast<DAAL_INT>(n) - 1; num_discarded++)
{
if (eigenvalues[num_discarded] > component_threshold) break;
}

/* Create the square root of the inverse: Qis = Q * diag(1 / sqrt(l)) */
DAAL_INT num_taken = static_cast<DAAL_INT>(n) - num_discarded;
daal::internal::MathInst<FPType, cpu>::vSqrt(num_taken, eigenvalues.get() + num_discarded, eigenvalues.get() + num_discarded);
DAAL_INT one = 1;
PRAGMA_IVDEP
for (size_t col = num_discarded; col < n; col++)
{
const FPType scale = eigenvalues[col];
if (sequential)
{
LapackInst<FPType, cpu>::xxrscl((DAAL_INT *)&n, &scale, eigenvectors.get() + col * n, &one);
}

/* Solve P*L*U * x = b */
else
{
LapackInst<FPType, cpu>::xrscl((DAAL_INT *)&n, &scale, eigenvectors.get() + col * n, &one);
}
}

/* Now calculate the actual solution: Qis * Qis' * B */
char trans_yes = 'T';
char trans_no = 'N';
FPType one_fp = 1;
const size_t eigenvectors_offset = static_cast<size_t>(num_discarded) * n;
if (sequential)
{
LapackInst<FPType, cpu>::xxgetrs(&trans, (DAAL_INT *)&n, (DAAL_INT *)&nX, a, (DAAL_INT *)&n, ipiv.get(), b, (DAAL_INT *)&n, &info);
if (nX == 1)
{
BlasInst<FPType, cpu>::xxgemv(&trans_yes, (DAAL_INT *)&n, &num_taken, &one_fp, eigenvectors.get() + eigenvectors_offset, (DAAL_INT *)&n,
b, &one, &zero, eigenvalues.get(), &one);
BlasInst<FPType, cpu>::xxgemv(&trans_no, (DAAL_INT *)&n, &num_taken, &one_fp, eigenvectors.get() + eigenvectors_offset, (DAAL_INT *)&n,
eigenvalues.get(), &one, &zero, b, &one);
}

else
{
BlasInst<FPType, cpu>::xxgemm(&trans_yes, &trans_no, &num_taken, (DAAL_INT *)&nX, (DAAL_INT *)&n, &one_fp,
eigenvectors.get() + eigenvectors_offset, (DAAL_INT *)&n, b, (DAAL_INT *)&n, &zero, eigenvalues.get(),
&num_taken);
BlasInst<FPType, cpu>::xxgemm(&trans_no, &trans_no, (DAAL_INT *)&n, (DAAL_INT *)&nX, &num_taken, &one_fp,
eigenvectors.get() + eigenvectors_offset, (DAAL_INT *)&n, eigenvalues.get(), &num_taken, &zero, b,
(DAAL_INT *)&n);
}
}

else
{
LapackInst<FPType, cpu>::xgetrs(&trans, (DAAL_INT *)&n, (DAAL_INT *)&nX, a, (DAAL_INT *)&n, ipiv.get(), b, (DAAL_INT *)&n, &info);
if (nX == 1)
{
BlasInst<FPType, cpu>::xgemv(&trans_yes, (DAAL_INT *)&n, &num_taken, &one_fp, eigenvectors.get() + eigenvectors_offset, (DAAL_INT *)&n, b,
&one, &zero, eigenvalues.get(), &one);
BlasInst<FPType, cpu>::xgemv(&trans_no, (DAAL_INT *)&n, &num_taken, &one_fp, eigenvectors.get() + eigenvectors_offset, (DAAL_INT *)&n,
eigenvalues.get(), &one, &zero, b, &one);
}

else
{
BlasInst<FPType, cpu>::xgemm(&trans_yes, &trans_no, &num_taken, (DAAL_INT *)&nX, (DAAL_INT *)&n, &one_fp,
eigenvectors.get() + eigenvectors_offset, (DAAL_INT *)&n, b, (DAAL_INT *)&n, &zero, eigenvalues.get(),
&num_taken);
BlasInst<FPType, cpu>::xxgemm(&trans_no, &trans_no, (DAAL_INT *)&n, (DAAL_INT *)&nX, &num_taken, &one_fp,
eigenvectors.get() + eigenvectors_offset, (DAAL_INT *)&n, eigenvalues.get(), &num_taken, &zero, b,
(DAAL_INT *)&n);
}
}
return (info == 0);

return true;
}

template <typename FPType, CpuType cpu>
bool solveSymmetricEquationsSystem(FPType * a, FPType * b, size_t n, size_t nX, bool sequential)
{
/* Copy data for fallback from Cholesky to PLU factorization */
/* Copy data for fallback from Cholesky to spectral decomposition */
TArrayScalable<FPType, cpu> aCopy(n * n);
TArrayScalable<FPType, cpu> bCopy(n);
TArrayScalable<FPType, cpu> bCopy(n * nX);
DAAL_CHECK_MALLOC(aCopy.get());
DAAL_CHECK_MALLOC(bCopy.get());

int copy_status = services::internal::daal_memcpy_s(aCopy.get(), n * n * sizeof(FPType), a, n * n * sizeof(FPType));
copy_status += services::internal::daal_memcpy_s(bCopy.get(), n * sizeof(FPType), b, n * sizeof(FPType));
copy_status += services::internal::daal_memcpy_s(bCopy.get(), n * nX * sizeof(FPType), b, n * nX * sizeof(FPType));

if (copy_status != 0) return false;

/* Try to solve with Cholesky factorization */
if (!solveEquationsSystemWithCholesky<FPType, cpu>(a, b, n, nX, sequential))
{
/* Fallback to PLU factorization */
bool status = solveEquationsSystemWithPLU<FPType, cpu>(aCopy.get(), bCopy.get(), n, nX, sequential, true);
/* Fall back to spectral decomposition */
bool status = solveEquationsSystemWithSpectralDecomposition<FPType, cpu>(aCopy.get(), bCopy.get(), n, nX, sequential);
if (status)
{
status = status && (services::internal::daal_memcpy_s(b, n * sizeof(FPType), bCopy.get(), n * sizeof(FPType)) == 0);
status = status && (services::internal::daal_memcpy_s(b, n * nX * sizeof(FPType), bCopy.get(), n * nX * sizeof(FPType)) == 0);
}
return status;
}
Expand Down
56 changes: 56 additions & 0 deletions cpp/daal/src/externals/service_lapack.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@
//--
*/

/* Note: this file is not auto-generated. These 'x'/'xx' functions are manually added here on an
as-needed basis, and are only used internally within the library so their signatures might not
match LAPACK's to every minutiae like passing pointers to scalars or passing them by value, or
having 'const' qualifiers or not. */

#ifndef __SERVICE_LAPACK_H__
#define __SERVICE_LAPACK_H__

Expand Down Expand Up @@ -181,6 +186,9 @@ struct Lapack
_impl<fpType, cpu>::xxgesvd(jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, work, lwork, info);
}

/* Note: 'syevd' and 'syevr' both compute symmetric eigenvalues, but they use different routines. 'syevd'
is slower but more precise and should thus be preferred for situations in which small numerical inaccuracies
have adverse side effects, whereas 'syevr' is faster and more suitable for general usage. */
static void xsyevd(char * jobz, char * uplo, SizeType * n, fpType * a, SizeType * lda, fpType * w, fpType * work, SizeType * lwork,
SizeType * iwork, SizeType * liwork, SizeType * info)
{
Expand All @@ -193,6 +201,22 @@ struct Lapack
_impl<fpType, cpu>::xxsyevd(jobz, uplo, n, a, lda, w, work, lwork, iwork, liwork, info);
}

static void xsyevr(const char * jobz, const char * range, const char * uplo, const SizeType * n, fpType * a, const SizeType * lda,
const fpType * vl, const fpType * vu, const SizeType * il, const SizeType * iu, const fpType * abstol, SizeType * m,
fpType * w, fpType * z, const SizeType * ldz, SizeType * isuppz, fpType * work, const SizeType * lwork, SizeType * iwork,
const SizeType * liwork, SizeType * info)
{
_impl<fpType, cpu>::xsyevr(jobz, range, uplo, n, a, lda, vl, vu, il, iu, abstol, m, w, z, ldz, isuppz, work, lwork, iwork, liwork, info);
}

static void xxsyevr(const char * jobz, const char * range, const char * uplo, const SizeType * n, fpType * a, const SizeType * lda,
const fpType * vl, const fpType * vu, const SizeType * il, const SizeType * iu, const fpType * abstol, SizeType * m,
fpType * w, fpType * z, const SizeType * ldz, SizeType * isuppz, fpType * work, const SizeType * lwork, SizeType * iwork,
const SizeType * liwork, SizeType * info)
{
_impl<fpType, cpu>::xxsyevr(jobz, range, uplo, n, a, lda, vl, vu, il, iu, abstol, m, w, z, ldz, isuppz, work, lwork, iwork, liwork, info);
}

static void xormqr(char * side, char * trans, SizeType * m, SizeType * n, SizeType * k, fpType * a, SizeType * lda, fpType * tau, fpType * c,
SizeType * ldc, fpType * work, SizeType * lwork, SizeType * info)
{
Expand All @@ -204,6 +228,10 @@ struct Lapack
{
_impl<fpType, cpu>::xxormqr(side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, info);
}

static void xrscl(const SizeType * n, const fpType * sa, fpType * sx, const SizeType * incx) { _impl<fpType, cpu>::xrscl(n, sa, sx, incx); }

static void xxrscl(const SizeType * n, const fpType * sa, fpType * sx, const SizeType * incx) { _impl<fpType, cpu>::xxrscl(n, sa, sx, incx); }
};

template <typename fpType, CpuType cpu>
Expand Down Expand Up @@ -361,6 +389,24 @@ struct LapackAutoDispatch
DAAL_DISPATCH_LAPACK_BY_CPU(fpType, xxsyevd, jobz, uplo, n, a, lda, w, work, lwork, iwork, liwork, info);
}

static void xsyevr(const char * jobz, const char * range, const char * uplo, const SizeType * n, fpType * a, const SizeType * lda,
const fpType * vl, const fpType * vu, const SizeType * il, const SizeType * iu, const fpType * abstol, SizeType * m,
fpType * w, fpType * z, const SizeType * ldz, SizeType * isuppz, fpType * work, const SizeType * lwork, SizeType * iwork,
const SizeType * liwork, SizeType * info)
{
DAAL_DISPATCH_LAPACK_BY_CPU(fpType, xsyevr, jobz, range, uplo, n, a, lda, vl, vu, il, iu, abstol, m, w, z, ldz, isuppz, work, lwork, iwork,
liwork, info);
}

static void xxsyevr(const char * jobz, const char * range, const char * uplo, const SizeType * n, fpType * a, const SizeType * lda,
const fpType * vl, const fpType * vu, const SizeType * il, const SizeType * iu, const fpType * abstol, SizeType * m,
fpType * w, fpType * z, const SizeType * ldz, SizeType * isuppz, fpType * work, const SizeType * lwork, SizeType * iwork,
const SizeType * liwork, SizeType * info)
{
DAAL_DISPATCH_LAPACK_BY_CPU(fpType, xxsyevr, jobz, range, uplo, n, a, lda, vl, vu, il, iu, abstol, m, w, z, ldz, isuppz, work, lwork, iwork,
liwork, info);
}

static void xormqr(char * side, char * trans, SizeType * m, SizeType * n, SizeType * k, fpType * a, SizeType * lda, fpType * tau, fpType * c,
SizeType * ldc, fpType * work, SizeType * lwork, SizeType * info)
{
Expand All @@ -372,6 +418,16 @@ struct LapackAutoDispatch
{
DAAL_DISPATCH_LAPACK_BY_CPU(fpType, xxormqr, side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, info);
}

static void xrscl(SizeType * n, const fpType * sa, fpType * sx, const SizeType * incx)
{
DAAL_DISPATCH_LAPACK_BY_CPU(fpType, xrscl, n, sa, sx, incx);
}

static void xxrscl(SizeType * n, const fpType * sa, fpType * sx, const SizeType * incx)
{
DAAL_DISPATCH_LAPACK_BY_CPU(fpType, xxrscl, n, sa, sx, incx);
}
};

} // namespace internal
Expand Down
10 changes: 10 additions & 0 deletions cpp/daal/src/externals/service_lapack_declar_ref.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,10 +82,20 @@ extern "C"
extern void ssyevd_(char *, char *, DAAL_INT *, float *, DAAL_INT *, float *, float *, DAAL_INT *, DAAL_INT *, DAAL_INT *, DAAL_INT *);
extern void dsyevd_(char *, char *, DAAL_INT *, double *, DAAL_INT *, double *, double *, DAAL_INT *, DAAL_INT *, DAAL_INT *, DAAL_INT *);

extern void ssyevr_(const char *, const char *, const char *, const DAAL_INT *, float *, const DAAL_INT *, const float *, const float *,
const DAAL_INT *, const DAAL_INT *, const float *, DAAL_INT *, float *, float *, const DAAL_INT *, DAAL_INT *, float *,
const DAAL_INT *, DAAL_INT *, const DAAL_INT *, DAAL_INT *);
extern void dsyevr_(const char *, const char *, const char *, const DAAL_INT *, double *, const DAAL_INT *, const double *, const double *,
const DAAL_INT *, const DAAL_INT *, const double *, DAAL_INT *, double *, double *, const DAAL_INT *, DAAL_INT *, double *,
const DAAL_INT *, DAAL_INT *, const DAAL_INT *, DAAL_INT *);

extern void sormqr_(char *, char *, DAAL_INT *, DAAL_INT *, DAAL_INT *, float *, DAAL_INT *, float *, float *, DAAL_INT *, float *, DAAL_INT *,
DAAL_INT *);
extern void dormqr_(char *, char *, DAAL_INT *, DAAL_INT *, DAAL_INT *, double *, DAAL_INT *, double *, double *, DAAL_INT *, double *,
DAAL_INT *, DAAL_INT *);

extern void drscl_(const DAAL_INT *, const double *, double *, const DAAL_INT *);
extern void srscl_(const DAAL_INT *, const float *, float *, const DAAL_INT *);
}

} // namespace ref
Expand Down
Loading

0 comments on commit e93063f

Please sign in to comment.