Patches: tasklogger log_x, randomized_svd arguments, deprecated graph_shortest_path #62

Open: wants to merge 9 commits into master
4 changes: 2 additions & 2 deletions graphtools/api.py
@@ -255,7 +255,7 @@ def Graph(
else:
msg = msg + " and PyGSP inheritance"

_logger.debug(msg)
_logger.log_debug(msg)

class_names = [p.__name__.replace("Graph", "") for p in parent_classes]
try:
@@ -273,7 +273,7 @@ def Graph(
pass

# build graph and return
_logger.debug(
_logger.log_debug(
"Initializing {} with arguments {}".format(
parent_classes,
", ".join(
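For context, the log_* renames throughout this diff follow the tasklogger logger object already created at module level in graphtools; a minimal sketch, assuming a tasklogger release that exposes log_debug, log_info and the log_task context manager (the calls shown are the ones used in this diff):

import tasklogger

_logger = tasklogger.get_tasklogger("graphtools")
_logger.log_debug("building graph")    # replaces _logger.debug(...)
_logger.log_info("estimating n_pca")   # replaces _logger.info(...)
with _logger.log_task("PCA"):          # replaces _logger.task(...)
    pass  # timed block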
202 changes: 171 additions & 31 deletions graphtools/base.py
@@ -1,13 +1,17 @@
from future.utils import with_metaclass
from builtins import super
from copy import copy as shallow_copy
from dataclasses import dataclass
from mock import patch
from functools import partial
import numpy as np
import abc
import pygsp
from inspect import signature
import sklearn
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import normalize
from sklearn.utils.graph import graph_shortest_path
from scipy.sparse.csgraph import shortest_path as graph_shortest_path
from scipy import sparse
import warnings
import numbers
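The import swap above replaces sklearn.utils.graph.graph_shortest_path, which was deprecated and later removed from scikit-learn, with scipy's equivalent. A minimal sketch of the replacement call (illustrative only; the adjacency matrix is made up):

import numpy as np
from scipy.sparse.csgraph import shortest_path as graph_shortest_path

# dense or scipy.sparse adjacency matrix; zero entries mean "no edge"
adjacency = np.array(
    [[0.0, 1.0, 0.0],
     [1.0, 0.0, 2.0],
     [0.0, 2.0, 0.0]]
)
# method="D" selects Dijkstra, mirroring typical use of the removed sklearn helper
distances = graph_shortest_path(adjacency, method="D", directed=False)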
@@ -20,6 +24,115 @@
_logger = tasklogger.get_tasklogger("graphtools")


@dataclass
class PCAParameters(object):
"""Data class that stores PCA parameters.
Parameters
----------
n_oversamples : int, default=10
Additional number of random vectors to sample the range of M so as
to ensure proper conditioning. The total number of random vectors
used to find the range of M is n_components + n_oversamples. A smaller
number can improve speed but can negatively impact the quality of the
approximation of the singular vectors and singular values. Users might wish
to increase this parameter up to `2*k - n_components` where k is the
effective rank, for large matrices, noisy problems, matrices with
slowly decaying spectrums, or to increase precision accuracy.
n_iter : int or 'auto', default='auto'
Number of power iterations. It can be used to deal with very noisy
problems. When 'auto', it is set to 4, unless `n_components` is small
(< .1 * min(X.shape)) in which case `n_iter` is set to 7.
This improves precision with few components. Note that in general
users should rather increase `n_oversamples` before increasing `n_iter`
as the principle of the randomized method is to avoid usage of these
more costly power iteration steps. When `n_components` is equal
to or greater than the effective matrix rank and the spectrum does not
present a slow decay, `n_iter=0` or `1` should even work fine in theory.
power_iteration_normalizer : {'auto', 'QR', 'LU', 'none'}, default='auto'
Whether the power iterations are normalized with step-by-step
QR factorization (the slowest but most accurate), 'none'
(the fastest but numerically unstable when `n_iter` is large, e.g.
typically 5 or larger), or 'LU' factorization (numerically stable
but can lose slightly in accuracy). The 'auto' mode applies no
normalization if `n_iter` <= 2 and switches to LU otherwise.
See the documentation for sklearn.utils.extmath.randomized_svd.
"""

_valid = {}
_valid["n_oversamples"] = {int: lambda x: x > 0}
_valid["n_iter"] = {str: lambda x: x in ["auto"], int: lambda x: x >= 0}
_valid["power_iteration_normalizer"] = {
str: lambda x: x.lower() in ["auto", "qr", "lu", "none"]
}
_valid_str = {}
_valid_str["n_oversamples"] = ["int > 0"]
_valid_str["n_iter"] = ["auto", "int >= 0"]
_valid_str["power_iteration_normalizer"] = ["auto", "QR", "LU", "none"]

n_oversamples: int = 10
n_iter: "int or 'auto'" = "auto"
power_iteration_normalizer: str = "auto"

def validate(self):
validated = []
errs = []
valids = []
fields = list(self.__dataclass_fields__.items())
fields.sort(key=lambda x: x[0])
for field_name, field_def in fields:
attr = getattr(self, field_name)
validated.append(False)
for typ, typfun in self._valid[field_name].items():
if isinstance(attr, typ):
validated[-1] = typfun(attr)
if not validated[-1]:
errs.append(field_name)
return all(validated), errs

def __post_init__(self):
validated, errs = self.validate()
if not validated:
errorstring = f"{errs} had invalid types or values. Valid values are "
for err in errs:
errorstring += f"{self._valid_str[err]}, "
errorstring += "respectively."
raise ValueError(errorstring)
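As a usage illustration (values chosen arbitrarily), field validation runs in __post_init__, so an invalid value fails at construction time:

params = PCAParameters(n_oversamples=20, n_iter=5)      # valid
PCAParameters(power_iteration_normalizer="cholesky")    # raises ValueError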


# Monkey patch of randomized_svd to forward parameters from PCAParameters.
def randomized_svd_monkey(
M,
n_components,
*,
pca_params=PCAParameters(),
n_oversamples=10,
n_iter="auto",
power_iteration_normalizer="auto",
transpose="auto",
flip_sign=True,
random_state="warn",
):
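# NOTE: the version check below is a lexicographic string comparison; it
# behaves correctly for the 1.0.x releases this patch targets, but can
# misorder versions such as "1.10" vs "1.9".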
if sklearn.__version__ > "1.0.1":
warnings.warn(
"Graphtools is using a patched version of randomized_svd "
"designed for sklearn version 1.0.1. The current version "
"of sklearn is {}. Please alert the graphtools authors to "
"update the patch.".format(sklearn.__version__),
RuntimeWarning,
)
return sklearn.utils.extmath.randomized_svd(
M,
n_components=n_components,
n_oversamples=pca_params.n_oversamples,
n_iter=pca_params.n_iter,
power_iteration_normalizer=pca_params.power_iteration_normalizer,
transpose=transpose,
flip_sign=flip_sign,
random_state=random_state,
)
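The wrapper deliberately ignores the n_oversamples, n_iter and power_iteration_normalizer keyword arguments that sklearn passes internally and takes those values from pca_params instead. A rough usage sketch, with made-up data:

import numpy as np

X = np.random.default_rng(0).normal(size=(100, 20))
params = PCAParameters(n_oversamples=15, n_iter=4, power_iteration_normalizer="LU")
U, S, Vt = randomized_svd_monkey(X, n_components=5, pca_params=params, random_state=0)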


class Base(object):
"""Class that deals with key-word arguments but is otherwise
just an object.
@@ -90,7 +203,9 @@ class Data(Base):
s_max * eps * max(n_samples, n_features)
where s_max is the maximum singular value of the data matrix
and eps is numerical precision. [press2007]_.

pca_params : `PCAParameters`, optional (default: `PCAParameters()`)
Parameters to use for randomized SVD and PCA. See documentation
for graphtools.base.PCAParameters.
random_state : `int` or `None`, optional (default: `None`)
Random state for random PCA

@@ -109,11 +224,19 @@ class Data(Base):
"""

def __init__(
self, data, n_pca=None, rank_threshold=None, random_state=None, **kwargs
self,
data,
n_pca=None,
rank_threshold=None,
pca_params=PCAParameters(),
random_state=None,
**kwargs,
):

self._check_data(data)
n_pca, rank_threshold = self._parse_n_pca_threshold(data, n_pca, rank_threshold)
n_pca, rank_threshold, pca_params = self._parse_pca_parameters(
data, n_pca, rank_threshold, pca_params
)

if utils.is_SparseDataFrame(data):
data = data.to_coo()
@@ -130,11 +253,12 @@ def __init__(
self.data = data
self.n_pca = n_pca
self.rank_threshold = rank_threshold
self.pca_params = pca_params
self.random_state = random_state
self.data_nu = self._reduce_data()
super().__init__(**kwargs)

def _parse_n_pca_threshold(self, data, n_pca, rank_threshold):
def _parse_pca_parameters(self, data, n_pca, rank_threshold, pca_params):
if isinstance(n_pca, str):
n_pca = n_pca.lower()
if n_pca != "auto":
@@ -173,7 +297,7 @@ def _parse_n_pca_threshold(self, data, n_pca, rank_threshold):
n_pca = None
elif n_pca is True: # notify that we're going to estimate rank.
n_pca = "auto"
_logger.info(
_logger.log_info(
"Estimating n_pca from matrix rank. "
"Supply an integer n_pca "
"for fixed amount."
@@ -207,7 +331,12 @@ def _parse_n_pca_threshold(self, data, n_pca, rank_threshold):
raise ValueError(
"rank_threshold must be positive float or 'auto'. "
)
return n_pca, rank_threshold
if pca_params is None:
pca_params = PCAParameters()
elif not isinstance(pca_params, PCAParameters):
raise ValueError("pca_params must be an instance of PCAParameters.")
return n_pca, rank_threshold, pca_params

def _check_data(self, data):
if len(data.shape) != 2:
@@ -237,7 +366,10 @@ def _reduce_data(self):
if self.n_pca is not None and (
self.n_pca == "auto" or self.n_pca < self.data.shape[1]
):
with _logger.task("PCA"):
with _logger.log_task("PCA"):
randomized_pca = partial(
randomized_svd_monkey, pca_params=self.pca_params
)
n_pca = self.data.shape[1] - 1 if self.n_pca == "auto" else self.n_pca
if sparse.issparse(self.data):
if (
@@ -251,7 +383,14 @@ def _reduce_data(self):
self.data_pca = PCA(
n_pca, svd_solver="randomized", random_state=self.random_state
)
self.data_pca.fit(self.data)
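# Both sklearn.decomposition._pca and sklearn.decomposition._truncated_svd
# import randomized_svd into their own namespaces, so each module-level
# reference is patched separately (module paths correspond to sklearn 1.0.x
# internals and may move in later releases).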
with patch(
"sklearn.decomposition._pca.randomized_svd", new=randomized_pca
), patch(
"sklearn.decomposition._truncated_svd.randomized_svd",
new=randomized_pca,
):
self.data_pca.fit(self.data)

if self.n_pca == "auto":
s = self.data_pca.singular_values_
smax = s.max()
@@ -269,7 +408,7 @@ def _reduce_data(self):
"maximum singular value {} "
"for the data matrix".format(threshold, smax)
)
_logger.info(
_logger.log_info(
"Using rank estimate of {} as n_pca".format(self.n_pca)
)
# reset the sklearn operator
@@ -292,8 +431,7 @@ def _reduce_data(self):
return data_nu

def get_params(self):
"""Get parameters from this object
"""
"""Get parameters from this object"""
return {"n_pca": self.n_pca, "random_state": self.random_state}

def set_params(self, **params):
@@ -469,7 +607,7 @@ def __init__(
anisotropy=0,
gamma=None,
initialize=True,
**kwargs
**kwargs,
):
if gamma is not None:
warnings.warn(
@@ -498,10 +636,10 @@ def __init__(
self.anisotropy = anisotropy

if initialize:
_logger.debug("Initializing kernel...")
_logger.log_debug("Initializing kernel...")
self.K
else:
_logger.debug("Not initializing kernel.")
_logger.log_debug("Not initializing kernel.")
super().__init__(**kwargs)

def _check_symmetrization(self, kernel_symm, theta):
@@ -556,18 +694,20 @@ def _build_kernel(self):
def symmetrize_kernel(self, K):
# symmetrize
if self.kernel_symm == "+":
_logger.debug("Using addition symmetrization.")
_logger.log_debug("Using addition symmetrization.")
K = (K + K.T) / 2
elif self.kernel_symm == "*":
_logger.debug("Using multiplication symmetrization.")
_logger.log_debug("Using multiplication symmetrization.")
K = K.multiply(K.T)
elif self.kernel_symm == "mnn":
_logger.debug("Using mnn symmetrization (theta = {}).".format(self.theta))
_logger.log_debug(
"Using mnn symmetrization (theta = {}).".format(self.theta)
)
K = self.theta * matrix.elementwise_minimum(K, K.T) + (
1 - self.theta
) * matrix.elementwise_maximum(K, K.T)
elif self.kernel_symm is None:
_logger.debug("Using no symmetrization.")
_logger.log_debug("Using no symmetrization.")
pass
else:
raise NotImplementedError
Expand All @@ -589,8 +729,7 @@ def apply_anisotropy(self, K):
return K

def get_params(self):
"""Get parameters from this object
"""
"""Get parameters from this object"""
return {
"kernel_symm": self.kernel_symm,
"theta": self.theta,
@@ -697,8 +836,7 @@ def diff_aff(self):

@property
def diff_op(self):
"""Synonym for P
"""
"""Synonym for P"""
return self.P

@property
Expand All @@ -719,8 +857,7 @@ def K(self):

@property
def kernel(self):
"""Synonym for K
"""
"""Synonym for K"""
return self.K

@property
@@ -850,10 +987,10 @@ def _check_shortest_path_distance(self, distance):
def _default_shortest_path_distance(self):
if not self.weighted:
distance = "data"
_logger.info("Using ambient data distances.")
_logger.log_info("Using ambient data distances.")
else:
distance = "affinity"
_logger.info("Using negative log affinity distances.")
_logger.log_info("Using negative log affinity distances.")
return distance

def shortest_path(self, method="auto", distance=None):
@@ -895,8 +1032,12 @@ def shortest_path(self, method="auto", distance=None):
np.sum((self.data_nu[D.row] - self.data_nu[D.col]) ** 2, axis=1)
)
elif distance == "affinity":
D = sparse.csr_matrix(self.K)
D.data = -1 * np.log(D.data)
D = -1 * np.where(
self.K != 0, np.log(np.where(self.K != 0, self.K, np.nan)), 0
)
else:
raise ValueError(
"Expected `distance` in ['constant', 'data', 'affinity']. "
@@ -1019,8 +1160,7 @@ def __init__(self, data, verbose=True, n_jobs=1, **kwargs):
super().__init__(data, **kwargs)

def get_params(self):
"""Get parameters from this object
"""
"""Get parameters from this object"""
params = Data.get_params(self)
params.update(BaseGraph.get_params(self))
return params