Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Adapt EigenDecomposition implementation #2038

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
342 changes: 342 additions & 0 deletions src/main/java/org/apache/sysds/runtime/matrix/data/LibCommonsMath.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,12 @@
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Arrays;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.math3.exception.MaxCountExceededException;
import org.apache.commons.math3.exception.util.LocalizedFormats;
import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.BlockRealMatrix;
import org.apache.commons.math3.linear.CholeskyDecomposition;
Expand All @@ -40,11 +42,13 @@
import org.apache.commons.math3.linear.QRDecomposition;
import org.apache.commons.math3.linear.RealMatrix;
import org.apache.commons.math3.linear.SingularValueDecomposition;
import org.apache.commons.math3.util.FastMath;
import org.apache.sysds.runtime.DMLRuntimeException;
import org.apache.sysds.runtime.codegen.CodegenUtils;
import org.apache.sysds.runtime.codegen.SpoofOperator.SideInput;
import org.apache.sysds.runtime.compress.utils.IntArrayList;
import org.apache.sysds.runtime.data.DenseBlock;
import org.apache.sysds.runtime.data.DenseBlockFactory;
import org.apache.sysds.runtime.functionobjects.Builtin;
import org.apache.sysds.runtime.functionobjects.Divide;
import org.apache.sysds.runtime.functionobjects.MinusMultiply;
Expand All @@ -61,6 +65,7 @@
import org.apache.sysds.runtime.matrix.operators.UnaryOperator;
import org.apache.sysds.runtime.util.DataConverter;
import org.apache.sysds.runtime.util.UtilFunctions;
import org.apache.commons.math3.util.Precision;

/**
* Library for matrix operations that need invocation of
Expand All @@ -74,6 +79,8 @@ public class LibCommonsMath
private static final Log LOG = LogFactory.getLog(LibCommonsMath.class.getName());
private static final double RELATIVE_SYMMETRY_THRESHOLD = 1e-6;
private static final double EIGEN_LAMBDA = 1e-8;
private static final double EIGEN_EPS = Precision.EPSILON;
Aast12 marked this conversation as resolved.
Show resolved Hide resolved
private static final int EIGEN_MAX_ITER = 30;

private LibCommonsMath() {
//prevent instantiation via private constructor
Expand Down Expand Up @@ -146,6 +153,8 @@ else if (opcode.equals("eigen_lanczos"))
return computeEigenLanczos(in, threads, seed);
else if (opcode.equals("eigen_qr"))
return computeEigenQR(in, threads);
else if (opcode.equals("eigen_symm"))
return computeEigenDecompositionSymm(in, threads);
else if (opcode.equals("svd"))
return computeSvd(in);
else if (opcode.equals("fft"))
Expand Down Expand Up @@ -209,6 +218,339 @@ private static MatrixBlock computeSolve(MatrixBlock in1, MatrixBlock in2) {

return DataConverter.convertToMatrixBlock(solutionMatrix);
}

/**
* Computes the eigen decomposition of a symmetric matrix.
*
* @param in The input matrix to compute the eigen decomposition on.
* @param threads The number of threads to use for computation.
* @return An array of MatrixBlock objects containing the real eigen values and eigen vectors.
*/
public static MatrixBlock[] computeEigenDecompositionSymm(MatrixBlock in, int threads) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i would like to know where the time is spend inside this function call.

is it in transformToTridiagonal, getQ or findEigenVectors?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

image

findEigenVectors seems to take ~60-70% of the time.
That performEigenDecomposition is a wrapper for the do-while loop and most of its execution is taken by the matrix updates at the end of each iteration

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

great this is something that is easy to fix, simply do not use the get and set method for the dense block, but directly on the underlying linearized array assign the cells.

This should immediately reduce the execution time by at least ~50% of that part.


// TODO: Verify matrix is symmetric
final double[] mainDiag = new double[in.rlen];
final double[] secDiag = new double[in.rlen - 1];

final MatrixBlock householderVectors = transformToTridiagonal(in, mainDiag, secDiag, threads);

// TODO: Consider using sparse blocks
final double[] hv = householderVectors.getDenseBlockValues();
MatrixBlock houseHolderMatrix = getQ(hv, mainDiag, secDiag);

MatrixBlock[] evResult = findEigenVectors(mainDiag, secDiag, houseHolderMatrix, EIGEN_MAX_ITER, threads);

MatrixBlock realEigenValues = evResult[0];
MatrixBlock eigenVectors = evResult[1];

realEigenValues.setNonZeros(realEigenValues.rlen * realEigenValues.rlen);
eigenVectors.setNonZeros(eigenVectors.rlen * eigenVectors.clen);

return new MatrixBlock[] { realEigenValues, eigenVectors };
}

private static MatrixBlock transformToTridiagonal(MatrixBlock matrix, double[] main, double[] secondary, int threads) {
final int m = matrix.rlen;

// MatrixBlock householderVectors = ;
MatrixBlock householderVectors = matrix.extractTriangular(new MatrixBlock(m, m, false), false, true, true);
if (householderVectors.isInSparseFormat()) {
householderVectors.sparseToDense(threads);
}

final double[] z = new double[m];
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This temporary allocation seems like the only thing that limits you from parallelizing the entire for loop.


// TODO: Consider sparse block case
final double[] hv = householderVectors.getDenseBlockValues();

for (int k = 0; k < m - 1; k++) {
final int k_kp1 = k * m + k + 1;
final int km = k * m;

// zero-out a row and a column simultaneously
main[k] = hv[km + k];

// TODO: may or may not improve, seems it does not
Aast12 marked this conversation as resolved.
Show resolved Hide resolved
// double xNormSqr = householderVectors.slice(k, k, k + 1, m - 1).sumSq();
// double xNormSqr = LibMatrixMult.dotProduct(hv, hv, k_kp1, k_kp1, m - (k + 1));
double xNormSqr = 0;
for (int j = k + 1; j < m; ++j) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this looks like a square row sum,
this is an operation you can perform directly.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We tried these other approaches to try to get a performance boost but didn't change if not made it slower

double xNormSqr = householderVectors.slice(k, k, k + 1, m - 1).sumSq();
double xNormSqr = LibMatrixMult.dotProduct(hv, hv, k * m + k + 1, k * m + k + 1, m - (k + 1)); 

are you referring to other way to do it directly?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

okay, then we do no change it.

final double c = hv[k * m + j];
Aast12 marked this conversation as resolved.
Show resolved Hide resolved
xNormSqr += c * c;
}

final double a = (hv[k_kp1] > 0) ? -FastMath.sqrt(xNormSqr) : FastMath.sqrt(xNormSqr);

secondary[k] = a;

if (a != 0.0) {
// apply Householder transform from left and right simultaneously
hv[k_kp1] -= a;

final double beta = -1 / (a * hv[k_kp1]);
double gamma = 0;

// compute a = beta A v, where v is the Householder vector
// this loop is written in such a way
// 1) only the upper triangular part of the matrix is accessed
// 2) access is cache-friendly for a matrix stored in rows
Arrays.fill(z, k + 1, m, 0);
for (int i = k + 1; i < m; ++i) {
final double hKI = hv[km + i];
double zI = hv[i * m + i] * hKI;
for (int j = i + 1; j < m; ++j) {
final double hIJ = hv[i * m + j];
zI += hIJ * hv[k * m + j];
z[j] += hIJ * hKI;
}
z[i] = beta * (z[i] + zI);

gamma += z[i] * hv[km + i];
}

gamma *= beta / 2;

// compute z = z - gamma v
// LibMatrixMult.vectMultiplyAdd(-gamma, hv, z, k_kp1, k + 1, m - (k + 1));
for (int i = k + 1; i < m; ++i) {
z[i] -= gamma * hv[km + i];
}

// update matrix: A = A - v zT - z vT
// only the upper triangular part of the matrix is updated
for (int i = k + 1; i < m; ++i) {
final double hki = hv[km + i];
for (int j = i; j < m; ++j) {
final double hkj = hv[km + j];

hv[i * m + j] -= (hki * z[j] + z[i] * hkj);
}
}
}
}

main[m - 1] = hv[(m - 1) * m + m - 1];

return householderVectors;
}


/**
* Computes the orthogonal matrix Q using Householder transforms.
* The matrix Q is built by applying Householder transforms to the input vectors.
*
* @param hv The input vector containing the Householder vectors.
* @param main The main diagonal of the matrix.
* @param secondary The secondary diagonal of the matrix.
* @return The orthogonal matrix Q.
*/
public static MatrixBlock getQ(final double[] hv, double[] main, double[] secondary) {
final int m = main.length;

// orthogonal matrix, most operations are column major (TODO)
DenseBlock qaB = DenseBlockFactory.createDenseBlock(m, m);
double[] qaV = qaB.valuesAt(0);

// build up first part of the matrix by applying Householder transforms
for (int k = m - 1; k >= 1; --k) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there a reason why it should be done in opposite order?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes there is a reason. From page 263 of "Matrix Computations" by Gene H. Golub and Charles F. Van Loan: "Recall that the leading (j − 1)-by-(j − 1) portion of Qj is the identity. Thus, at the beginning of backward accumulation, Q is “mostly the identity” and it gradually becomes full as the iteration progresses. This pattern can be exploited to reduce the number of required flops. In contrast, Q is full in forward accumulation after the first step. For this reason, backward accumulation is cheaper and the strategy of choice."

final int km = k * m;
final int km1m = (k - 1) * m;

qaV[km + k] = 1.0;
if (hv[km1m + k] != 0.0) {
final double inv = 1.0 / (secondary[k - 1] * hv[km1m + k]);

double beta = 1.0 / secondary[k - 1];

qaV[km + k] = 1 + beta * hv[km1m + k];

// TODO: may speedup vector operations
for (int i = k + 1; i < m; ++i) {
qaV[i * m + k] = beta * hv[km1m + i];
}

for (int j = k + 1; j < m; ++j) {
beta = 0;
for (int i = k + 1; i < m; ++i) {
beta += qaV[m * i + j] * hv[km1m + i];
}
beta *= inv;
qaV[m * k + j] = hv[km1m + k] * beta;

for (int i = k + 1; i < m; ++i) {
qaV[m * i + j] += beta * hv[km1m + i];
}
}
}
}

qaV[0] = 1.0;
MatrixBlock res = new MatrixBlock(m, m, qaB);

return res;
}


/**
* Finds the eigen vectors corresponding to the given eigen values using the Householder transformation.
* (Dubrulle et al., 1971).
*
* @param main The main diagonal of the tridiagonal matrix.
* @param secondary The secondary diagonal of the tridiagonal matrix.
* @param hhMatrix The Householder matrix.
* @param maxIter The maximum number of iterations for convergence.
* @param threads The number of threads to use for computation.
* @return An array of two MatrixBlock objects: eigen values and eigen vectors.
* @throws MaxCountExceededException If the maximum number of iterations is exceeded and convergence fails.
*/
private static MatrixBlock[] findEigenVectors(double[] main, double[] secondary, final MatrixBlock hhMatrix,
final int maxIter, int threads) {

DenseBlock hhDense = hhMatrix.getDenseBlock();

final int n = hhMatrix.rlen;
MatrixBlock eigenValues = new MatrixBlock(n, 1, main);

double[] ev = eigenValues.denseBlock.valuesAt(0);
final double[] e = new double[n];

System.arraycopy(secondary, 0, e, 0, n - 1);
e[n - 1] = 0;

// Determine the largest main and secondary value in absolute term.
double maxAbsoluteValue = 0;
for (int i = 0; i < n; i++) {
maxAbsoluteValue = FastMath.max(maxAbsoluteValue, FastMath.abs(ev[i]));
maxAbsoluteValue = FastMath.max(maxAbsoluteValue, FastMath.abs(e[i]));
}
// Make null any main and secondary value too small to be significant
if (maxAbsoluteValue != 0) {
for (int i = 0; i < n; i++) {
if (FastMath.abs(ev[i]) <= EIGEN_EPS * maxAbsoluteValue && ev[i] != 0.0) {
ev[i] = 0;
}
if (FastMath.abs(e[i]) <= EIGEN_EPS * maxAbsoluteValue && e[i] != 0.0) {
e[i] = 0;
}
}
}

for (int j = 0; j < n; j++) {
int its = 0;
int m;
do {
Aast12 marked this conversation as resolved.
Show resolved Hide resolved
for (m = j; m < n - 1; m++) {
final double delta = FastMath.abs(ev[m]) +
FastMath.abs(ev[m + 1]);
if (FastMath.abs(e[m]) + delta == delta) {
break;
}
}
if (m != j) {
if (its == maxIter) {
throw new MaxCountExceededException(LocalizedFormats.CONVERGENCE_FAILED, maxIter);
}

its++;
double q = (ev[j + 1] - ev[j]) / (2 * e[j]);
double t = FastMath.sqrt(1 + q * q);
if (q < 0.0) {
q = ev[m] - ev[j] + e[j] / (q - t);
} else {
q = ev[m] - ev[j] + e[j] / (q + t);
}
double u = 0.0;
double s = 1.0;
double c = 1.0;
int i;
for (i = m - 1; i >= j; i--) {
double p = s * e[i];
double h = c * e[i];
if (FastMath.abs(p) >= FastMath.abs(q)) {
c = q / p;
t = FastMath.sqrt(c * c + 1.0);
e[i + 1] = p * t;
s = 1.0 / t;
c *= s;
} else {
s = p / q;
t = FastMath.sqrt(s * s + 1.0);
e[i + 1] = q * t;
c = 1.0 / t;
s *= c;
}
if (e[i + 1] == 0.0) {
ev[i + 1] -= u;
e[m] = 0.0;
break;
}
q = ev[i + 1] - u;
t = (ev[i] - q) * s + 2.0 * c * h;
u = s * t;
ev[i + 1] = q + u;
q = c * t - h;

for (int ia = 0; ia < n; ++ia) {
p = hhDense.get(ia, i + 1);
hhDense.set(ia, i + 1, s * hhDense.get(ia, i) + c * p);
hhDense.set(ia, i, c * hhDense.get(ia, i) - s * p);
}
}

if (t == 0.0 && i >= j) {
continue;
}
ev[j] -= u;
e[j] = q;
e[m] = 0.0;

}
} while (m != j);
}

// Sort the eigen values (and vectors) in increase order
for (int i = 0; i < n; i++) {
int k = i;
double p = ev[i];
for (int j = i + 1; j < n; j++) {
// reversed order from original implementation
if (ev[j] < p) {
k = j;
p = ev[j];
}
}
if (k != i) {
ev[k] = ev[i];
ev[i] = p;
for (int j = 0; j < n; j++) {
// TODO: an operation like SwapIndex be faster?
p = hhDense.get(j, i);
hhDense.set(j, i, hhDense.get(j, k));
hhDense.set(j, k, p);

}
}
}

// Determine the largest eigen value in absolute term.
maxAbsoluteValue = 0;
for (int i = 0; i < n; i++) {
maxAbsoluteValue = FastMath.max(maxAbsoluteValue, FastMath.abs(ev[i]));
}
// Make null any eigen value too small to be significant
if (maxAbsoluteValue != 0.0) {
for (int i = 0; i < n; i++) {
if (FastMath.abs(ev[i]) < EIGEN_EPS * maxAbsoluteValue) {
ev[i] = 0;
}
}
}

// MatrixBlock realEigenValues = new MatrixBlock(z.rlen, 1, ev);

return new MatrixBlock[] { eigenValues, hhMatrix };
}


/**
* Function to perform QR decomposition on a given matrix.
Expand Down
Loading
Loading