From c8e4a7cb9913ce588c2da2a903ed49ca116f6a7f Mon Sep 17 00:00:00 2001
From: Jay Stanley
Date: Mon, 3 Jan 2022 11:05:16 -0600
Subject: [PATCH 1/9] base.py logger fix

---
 graphtools/base.py | 39 ++++++++++++++++++---------------------
 1 file changed, 18 insertions(+), 21 deletions(-)

diff --git a/graphtools/base.py b/graphtools/base.py
index 4789b8a..8707c21 100644
--- a/graphtools/base.py
+++ b/graphtools/base.py
@@ -173,7 +173,7 @@ def _parse_n_pca_threshold(self, data, n_pca, rank_threshold):
             n_pca = None
         elif n_pca is True:  # notify that we're going to estimate rank.
             n_pca = "auto"
-            _logger.info(
+            _logger.log_info(
                 "Estimating n_pca from matrix rank. "
                 "Supply an integer n_pca "
                 "for fixed amount."
@@ -237,7 +237,7 @@ def _reduce_data(self):
         if self.n_pca is not None and (
             self.n_pca == "auto" or self.n_pca < self.data.shape[1]
         ):
-            with _logger.task("PCA"):
+            with _logger.log_task("PCA"):
                 n_pca = self.data.shape[1] - 1 if self.n_pca == "auto" else self.n_pca
                 if sparse.issparse(self.data):
                     if (
@@ -269,7 +269,7 @@ def _reduce_data(self):
                             "maximum singular value {} "
                             "for the data matrix".format(threshold, smax)
                         )
-                    _logger.info(
+                    _logger.log_info(
                         "Using rank estimate of {} as n_pca".format(self.n_pca)
                     )
                     # reset the sklearn operator
@@ -292,8 +292,7 @@ def _reduce_data(self):
         return data_nu
 
     def get_params(self):
-        """Get parameters from this object
-        """
+        """Get parameters from this object"""
         return {"n_pca": self.n_pca, "random_state": self.random_state}
 
     def set_params(self, **params):
@@ -498,10 +497,10 @@ def __init__(
         self.anisotropy = anisotropy
 
         if initialize:
-            _logger.debug("Initializing kernel...")
+            _logger.log_debug("Initializing kernel...")
             self.K
         else:
-            _logger.debug("Not initializing kernel.")
+            _logger.log_debug("Not initializing kernel.")
         super().__init__(**kwargs)
 
     def _check_symmetrization(self, kernel_symm, theta):
@@ -556,18 +555,20 @@ def _build_kernel(self):
     def symmetrize_kernel(self, K):
         # symmetrize
         if self.kernel_symm == "+":
-            _logger.debug("Using addition symmetrization.")
+            _logger.log_debug("Using addition symmetrization.")
             K = (K + K.T) / 2
         elif self.kernel_symm == "*":
-            _logger.debug("Using multiplication symmetrization.")
+            _logger.log_debug("Using multiplication symmetrization.")
             K = K.multiply(K.T)
         elif self.kernel_symm == "mnn":
-            _logger.debug("Using mnn symmetrization (theta = {}).".format(self.theta))
+            _logger.log_debug(
+                "Using mnn symmetrization (theta = {}).".format(self.theta)
+            )
             K = self.theta * matrix.elementwise_minimum(K, K.T) + (
                 1 - self.theta
             ) * matrix.elementwise_maximum(K, K.T)
         elif self.kernel_symm is None:
-            _logger.debug("Using no symmetrization.")
+            _logger.log_debug("Using no symmetrization.")
             pass
         else:
             raise NotImplementedError
@@ -589,8 +590,7 @@ def apply_anisotropy(self, K):
         return K
 
     def get_params(self):
-        """Get parameters from this object
-        """
+        """Get parameters from this object"""
         return {
             "kernel_symm": self.kernel_symm,
             "theta": self.theta,
@@ -697,8 +697,7 @@ def diff_aff(self):
 
     @property
     def diff_op(self):
-        """Synonym for P
-        """
+        """Synonym for P"""
         return self.P
 
     @property
@@ -719,8 +718,7 @@ def K(self):
 
     @property
     def kernel(self):
-        """Synonym for K
-        """
+        """Synonym for K"""
         return self.K
 
     @property
@@ -850,10 +848,10 @@ def _check_shortest_path_distance(self, distance):
     def _default_shortest_path_distance(self):
         if not self.weighted:
             distance = "data"
-            _logger.info("Using ambient data distances.")
+            _logger.log_info("Using ambient data distances.")
         else:
             distance = "affinity"
-            _logger.info("Using negative log affinity distances.")
+            _logger.log_info("Using negative log affinity distances.")
         return distance
 
     def shortest_path(self, method="auto", distance=None):
@@ -1019,8 +1017,7 @@ def __init__(self, data, verbose=True, n_jobs=1, **kwargs):
         super().__init__(data, **kwargs)
 
     def get_params(self):
-        """Get parameters from this object
-        """
+        """Get parameters from this object"""
         params = Data.get_params(self)
         params.update(BaseGraph.get_params(self))
         return params
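The rename above tracks tasklogger's 1.0 API, in which the logging helpers gained a log_ prefix. A minimal sketch of the renamed calls as this patch series uses them (the task name "PCA" is just illustrative):

    import tasklogger

    _logger = tasklogger.get_tasklogger("graphtools")

    _logger.log_info("status message")   # was _logger.info(...)
    _logger.log_debug("verbose detail")  # was _logger.debug(...)
    with _logger.log_task("PCA"):        # was _logger.task(...)
        pass  # timed block: logs start and completion of the task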
From f02e7b60f917067203b0ad4165f6d0dd06531a94 Mon Sep 17 00:00:00 2001
From: Jay Stanley
Date: Mon, 3 Jan 2022 11:14:00 -0600
Subject: [PATCH 2/9] fix tasklogger.debug -> log_debug, .task -> log_task,
 .info -> log_info

---
 graphtools/api.py       |  4 ++--
 graphtools/estimator.py | 22 ++++++++++----------
 graphtools/graphs.py    | 46 +++++++++++++++++++----------------------
 3 files changed, 34 insertions(+), 38 deletions(-)

diff --git a/graphtools/api.py b/graphtools/api.py
index b9a4b1b..27b9a79 100644
--- a/graphtools/api.py
+++ b/graphtools/api.py
@@ -255,7 +255,7 @@ def Graph(
     else:
         msg = msg + " and PyGSP inheritance"
 
-    _logger.debug(msg)
+    _logger.log_debug(msg)
 
     class_names = [p.__name__.replace("Graph", "") for p in parent_classes]
     try:
@@ -273,7 +273,7 @@ def Graph(
         pass
 
     # build graph and return
-    _logger.debug(
+    _logger.log_debug(
         "Initializing {} with arguments {}".format(
             parent_classes,
             ", ".join(
diff --git a/graphtools/estimator.py b/graphtools/estimator.py
index 5cb130f..3244fbb 100644
--- a/graphtools/estimator.py
+++ b/graphtools/estimator.py
@@ -81,18 +81,18 @@ class GraphEstimator(object, metaclass=abc.ABCMeta):
 
     verbose : `int` or `boolean`, optional (default: 1)
         If `True` or `> 0`, print status messages
-    
+
     n_svd : int, optional (default: 100)
         number of singular vectors to compute for landmarking
-    
+
     thresh : float, optional (default: 1e-4)
         threshold below which to truncate kernel
-    
+
     kwargs : additional arguments for graphtools.Graph
-    
+
     Attributes
     ----------
-    
+
     graph : graphtools.Graph
     """
@@ -248,13 +248,13 @@ def _set_graph_params(self, **params):
                 )
                 self.graph.set_params(**params)
             except ValueError as e:
-                _logger.debug("Reset graph due to {}".format(str(e)))
+                _logger.log_debug("Reset graph due to {}".format(str(e)))
                 self.graph = None
 
     @abc.abstractmethod
     def _reset_graph(self):
         """Trigger a reset of self.graph
-        
+
         Any downstream effects of resetting the graph should override this function
         """
         raise NotImplementedError
@@ -361,7 +361,7 @@ def _update_graph(self, X, precomputed, n_pca, n_landmark, **kwargs):
             **(self.kwargs)
         )
         if self.graph is not None:
-            _logger.info("Using precomputed graph and diffusion operator...")
+            _logger.log_info("Using precomputed graph and diffusion operator...")
 
     def fit(self, X, **kwargs):
         """Computes the graph
@@ -384,13 +384,13 @@ def fit(self, X, **kwargs):
         X, n_pca, n_landmark, precomputed, update_graph = self._parse_input(X)
 
         if precomputed is None:
-            _logger.info(
+            _logger.log_info(
                 "Building graph on {} samples and {} features.".format(
                     X.shape[0], X.shape[1]
                 )
             )
         else:
-            _logger.info(
+            _logger.log_info(
                 "Building graph on precomputed {} matrix with {} samples.".format(
                     precomputed, X.shape[0]
                 )
             )
@@ -402,7 +402,7 @@ def fit(self, X, **kwargs):
         self.X = X
 
         if self.graph is None:
-            with _logger.task("graph and diffusion operator"):
+            with _logger.log_task("graph and diffusion operator"):
                 self.graph = api.Graph(
                     X,
                     n_pca=n_pca,
diff --git a/graphtools/graphs.py b/graphtools/graphs.py
index 2caa431..092d73b 100644
--- a/graphtools/graphs.py
+++ b/graphtools/graphs.py
@@ -132,8 +132,7 @@ def __init__(
         super().__init__(data, n_pca=n_pca, **kwargs)
 
     def get_params(self):
-        """Get parameters from this object
-        """
+        """Get parameters from this object"""
         params = super().get_params()
         params.update(
             {
@@ -347,19 +346,19 @@ def build_kernel_to_data(
         Y = self._check_extension_shape(Y)
 
         if self.decay is None or self.thresh == 1:
-            with _logger.task("KNN search"):
+            with _logger.log_task("KNN search"):
                 # binary connectivity matrix
                 K = self.knn_tree.kneighbors_graph(
                     Y, n_neighbors=knn, mode="connectivity"
                 )
         else:
-            with _logger.task("KNN search"):
+            with _logger.log_task("KNN search"):
                 # sparse fast alpha decay
                 knn_tree = self.knn_tree
                 search_knn = min(knn * self.search_multiplier, knn_max)
                 distances, indices = knn_tree.kneighbors(Y, n_neighbors=search_knn)
                 self._check_duplicates(distances, indices)
-            with _logger.task("affinities"):
+            with _logger.log_task("affinities"):
                 if bandwidth is None:
                     bandwidth = distances[:, knn - 1]
@@ -370,7 +369,7 @@ def build_kernel_to_data(
 
                 radius = bandwidth * np.power(-1 * np.log(self.thresh), 1 / self.decay)
                 update_idx = np.argwhere(np.max(distances, axis=1) < radius).reshape(-1)
-                _logger.debug(
+                _logger.log_debug(
                     "search_knn = {}; {} remaining".format(search_knn, len(update_idx))
                 )
                 if len(update_idx) > 0:
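The truncation radius in the last hunk above comes from inverting the alpha-decay kernel K = exp(-(d / bandwidth) ** decay): K drops below thresh exactly when d exceeds bandwidth * (-log(thresh)) ** (1 / decay), so neighbors outside that radius can safely be dropped from the sparse kernel. A small numeric check of that identity (values illustrative):

    import numpy as np

    bandwidth, decay, thresh = 2.0, 10.0, 1e-4
    radius = bandwidth * np.power(-1 * np.log(thresh), 1 / decay)

    d = np.array([1.0, radius, 3.0])
    K = np.exp(-1 * (d / bandwidth) ** decay)
    assert np.isclose(K[1], thresh)  # K equals thresh exactly at the radius
    assert K[2] < thresh             # beyond the radius, K is negligible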
@@ -399,7 +398,7 @@ def build_kernel_to_data(
                             else radius[i]
                         )
                     ]
-                    _logger.debug(
+                    _logger.log_debug(
                         "search_knn = {}; {} remaining".format(
                             search_knn, len(update_idx)
                         )
                     )
@@ -412,7 +411,7 @@ def build_kernel_to_data(
                 ).fit(self.data_nu)
                 if len(update_idx) > 0:
                     if search_knn == knn_max:
-                        _logger.debug(
+                        _logger.log_debug(
                             "knn search to knn_max ({}) on {}".format(
                                 knn_max, len(update_idx)
                             )
@@ -425,7 +424,7 @@ def build_kernel_to_data(
                             distances[idx] = dist_new[i]
                             indices[idx] = ind_new[i]
                     else:
-                        _logger.debug("radius search on {}".format(len(update_idx)))
+                        _logger.log_debug("radius search on {}".format(len(update_idx)))
                         # give up - radius search
                         dist_new, ind_new = knn_tree.radius_neighbors(
                             Y[update_idx, :],
@@ -524,8 +523,7 @@ def __init__(self, data, n_landmark=2000, n_svd=100, **kwargs):
         super().__init__(data, **kwargs)
 
     def get_params(self):
-        """Get parameters from this object
-        """
+        """Get parameters from this object"""
         params = super().get_params()
         params.update({"n_landmark": self.n_landmark, "n_pca": self.n_pca})
         return params
@@ -653,16 +651,16 @@ def build_landmark_op(self):
         probabilities between cluster centers by using transition probabilities
         between samples assigned to each cluster.
         """
-        with _logger.task("landmark operator"):
+        with _logger.log_task("landmark operator"):
             is_sparse = sparse.issparse(self.kernel)
             # spectral clustering
-            with _logger.task("SVD"):
+            with _logger.log_task("SVD"):
                 _, _, VT = randomized_svd(
                     self.diff_aff,
                     n_components=self.n_svd,
                     random_state=self.random_state,
                 )
-            with _logger.task("KMeans"):
+            with _logger.log_task("KMeans"):
                 kmeans = MiniBatchKMeans(
                     self.n_landmark,
                     init_size=3 * self.n_landmark,
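The build_landmark_op docstring above describes compressing the n x n diffusion operator to an m x m operator over cluster centers by aggregating transition probabilities over cluster assignments. A rough sketch of that aggregation, assuming a dense operator P and integer cluster labels 0..m-1 (the real implementation is graphtools' build_landmark_op and differs in details):

    import numpy as np
    from scipy import sparse
    from sklearn.preprocessing import normalize

    def landmark_operator(P, clusters):
        n = P.shape[0]
        m = clusters.max() + 1
        # one-hot indicator of each sample's cluster, shape (n, m)
        C = sparse.csr_matrix((np.ones(n), (np.arange(n), clusters)), shape=(n, m))
        pmn = C.T @ P  # summed transitions from each cluster to each sample
        pnm = P @ C    # summed transitions from each sample to each cluster
        # row-normalize so the m x m operator is again row-stochastic
        return normalize(pmn @ pnm, norm="l1", axis=1)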
""" - with _logger.task("landmark operator"): + with _logger.log_task("landmark operator"): is_sparse = sparse.issparse(self.kernel) # spectral clustering - with _logger.task("SVD"): + with _logger.log_task("SVD"): _, _, VT = randomized_svd( self.diff_aff, n_components=self.n_svd, random_state=self.random_state, ) - with _logger.task("KMeans"): + with _logger.log_task("KMeans"): kmeans = MiniBatchKMeans( self.n_landmark, init_size=3 * self.n_landmark, @@ -886,8 +884,7 @@ def __init__( super().__init__(data, n_pca=n_pca, **kwargs) def get_params(self): - """Get parameters from this object - """ + """Get parameters from this object""" params = super().get_params() params.update( { @@ -985,7 +982,7 @@ def build_kernel(self): K = K.tolil() K = matrix.set_diagonal(K, 1) else: - with _logger.task("affinities"): + with _logger.log_task("affinities"): if sparse.issparse(self.data_nu): self.data_nu = self.data_nu.toarray() if self.precomputed == "distance": @@ -1091,7 +1088,7 @@ def build_kernel_to_data(self, Y, knn=None, bandwidth=None, bandwidth_scale=None if self.precomputed is not None: raise ValueError("Cannot extend kernel on precomputed graph") else: - with _logger.task("affinities"): + with _logger.log_task("affinities"): Y = self._check_extension_shape(Y) pdx = cdist(Y, self.data_nu, metric=self.distance) if bandwidth is None: @@ -1128,7 +1125,7 @@ def _check_shortest_path_distance(self, distance): def _default_shortest_path_distance(self): if self.precomputed is not None and not self.weighted: distance = "constant" - _logger.info("Using constant distances.") + _logger.log_info("Using constant distances.") else: distance = super()._default_shortest_path_distance() return distance @@ -1222,8 +1219,7 @@ def _check_symmetrization(self, kernel_symm, theta): super()._check_symmetrization(kernel_symm, theta) def get_params(self): - """Get parameters from this object - """ + """Get parameters from this object""" params = super().get_params() params.update( { @@ -1296,13 +1292,13 @@ def build_kernel(self): symmetric matrix with ones down the diagonal with no non-negative entries. 
""" - with _logger.task("subgraphs"): + with _logger.log_task("subgraphs"): self.subgraphs = [] from .api import Graph # iterate through sample ids for i, idx in enumerate(self.samples): - _logger.debug( + _logger.log_debug( "subgraph {}: sample {}, " "n = {}, knn = {}".format( i, idx, np.sum(self.sample_idx == idx), self.knn @@ -1327,7 +1323,7 @@ def build_kernel(self): ) self.subgraphs.append(graph) # append to list of subgraphs - with _logger.task("MNN kernel"): + with _logger.log_task("MNN kernel"): if self.thresh > 0 or self.decay is None: K = sparse.lil_matrix((self.data_nu.shape[0], self.data_nu.shape[0])) else: @@ -1343,7 +1339,7 @@ def build_kernel(self): for j, Y in enumerate(self.subgraphs): if i == j: continue - with _logger.task( + with _logger.log_task( "kernel from sample {} to {}".format( self.samples[i], self.samples[j] ) From 2f2fbfb47fb20446e4a73abdb38103083658e43c Mon Sep 17 00:00:00 2001 From: Jay Stanley Date: Mon, 3 Jan 2022 11:18:17 -0600 Subject: [PATCH 3/9] fixed deprecated sklearn.utils.graph.graph_shortest_path->scipy.sparse.csgraph.shortest_path --- graphtools/base.py | 2 +- test/test_exact.py | 2 +- test/test_knn.py | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/graphtools/base.py b/graphtools/base.py index 8707c21..e0a985f 100644 --- a/graphtools/base.py +++ b/graphtools/base.py @@ -7,7 +7,7 @@ from inspect import signature from sklearn.decomposition import PCA, TruncatedSVD from sklearn.preprocessing import normalize -from sklearn.utils.graph import graph_shortest_path +from scipy.sparse.csgraph import shortest_path as graph_shortest_path from scipy import sparse import warnings import numbers diff --git a/test/test_exact.py b/test/test_exact.py index 07faab0..f149ecf 100644 --- a/test/test_exact.py +++ b/test/test_exact.py @@ -1,5 +1,5 @@ from __future__ import print_function -from sklearn.utils.graph import graph_shortest_path +from scipy.sparse.csgraph import shortest_path as graph_shortest_path from load_tests import ( graphtools, np, diff --git a/test/test_knn.py b/test/test_knn.py index fe47c07..fe4e74d 100644 --- a/test/test_knn.py +++ b/test/test_knn.py @@ -1,5 +1,5 @@ from __future__ import print_function, division -from sklearn.utils.graph import graph_shortest_path +from scipy.sparse.csgraph import shortest_path as graph_shortest_path from scipy.spatial.distance import pdist, squareform from load_tests import assert_raises_message, assert_warns_message from nose.tools import assert_raises_regex, assert_warns_regex @@ -156,7 +156,8 @@ def test_knn_graph(): ), ): G2.build_kernel_to_data( - Y=G2.data_nu, knn=data.shape[0] + 1, + Y=G2.data_nu, + knn=data.shape[0] + 1, ) From b1a2a8a9f712b987ed6c5d4ac9dbf48e39a86e86 Mon Sep 17 00:00:00 2001 From: Jay Stanley Date: Mon, 3 Jan 2022 11:34:14 -0600 Subject: [PATCH 4/9] whitespace --- graphtools/graphs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/graphtools/graphs.py b/graphtools/graphs.py index 092d73b..e6eda49 100644 --- a/graphtools/graphs.py +++ b/graphtools/graphs.py @@ -668,7 +668,6 @@ def build_landmark_op(self): random_state=self.random_state, ) self._clusters = kmeans.fit_predict(self.diff_op.dot(VT.T)) - # transition matrices pmn = self._landmarks_to_data() From c35ba08690aabac01edf21bf6a70540c6f840d29 Mon Sep 17 00:00:00 2001 From: Jay Stanley Date: Mon, 3 Jan 2022 12:13:15 -0600 Subject: [PATCH 5/9] pca parameters --- graphtools/base.py | 113 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 105 insertions(+), 8 deletions(-) diff --git 
From b1a2a8a9f712b987ed6c5d4ac9dbf48e39a86e86 Mon Sep 17 00:00:00 2001
From: Jay Stanley
Date: Mon, 3 Jan 2022 11:34:14 -0600
Subject: [PATCH 4/9] whitespace

---
 graphtools/graphs.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/graphtools/graphs.py b/graphtools/graphs.py
index 092d73b..e6eda49 100644
--- a/graphtools/graphs.py
+++ b/graphtools/graphs.py
@@ -668,7 +668,6 @@ def build_landmark_op(self):
                     random_state=self.random_state,
                 )
                 self._clusters = kmeans.fit_predict(self.diff_op.dot(VT.T))
-
             # transition matrices
             pmn = self._landmarks_to_data()

From c35ba08690aabac01edf21bf6a70540c6f840d29 Mon Sep 17 00:00:00 2001
From: Jay Stanley
Date: Mon, 3 Jan 2022 12:13:15 -0600
Subject: [PATCH 5/9] pca parameters

---
 graphtools/base.py | 113 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 105 insertions(+), 8 deletions(-)

diff --git a/graphtools/base.py b/graphtools/base.py
index e0a985f..66aa043 100644
--- a/graphtools/base.py
+++ b/graphtools/base.py
@@ -1,6 +1,7 @@
 from future.utils import with_metaclass
 from builtins import super
 from copy import copy as shallow_copy
+from dataclasses import dataclass
 import numpy as np
 import abc
 import pygsp
@@ -20,6 +21,82 @@
 _logger = tasklogger.get_tasklogger("graphtools")
 
 
+@dataclass
+class PCAParameters(object):
+    """Data class that stores PCA parameters.
+
+    Parameters
+    ----------
+    n_oversamples : int, default=10
+        Additional number of random vectors to sample the range of M so as
+        to ensure proper conditioning. The total number of random vectors
+        used to find the range of M is n_components + n_oversamples. Smaller
+        number can improve speed but can negatively impact the quality of
+        approximation of singular vectors and singular values. Users might wish
+        to increase this parameter up to `2*k - n_components` where k is the
+        effective rank, for large matrices, noisy problems, matrices with
+        slowly decaying spectrums, or to increase precision accuracy.
+    n_iter : int or 'auto', default='auto'
+        Number of power iterations. It can be used to deal with very noisy
+        problems. When 'auto', it is set to 4, unless `n_components` is small
+        (< .1 * min(X.shape)) in which case `n_iter` is set to 7.
+        This improves precision with few components. Note that in general
+        users should rather increase `n_oversamples` before increasing `n_iter`
+        as the principle of the randomized method is to avoid usage of these
+        more costly power iterations steps. When `n_components` is equal
+        or greater to the effective matrix rank and the spectrum does not
+        present a slow decay, `n_iter=0` or `1` should even work fine in theory
+    power_iteration_normalizer : {'auto', 'QR', 'LU', 'none'}, default='auto'
+        Whether the power iterations are normalized with step-by-step
+        QR factorization (the slowest but most accurate), 'none'
+        (the fastest but numerically unstable when `n_iter` is large, e.g.
+        typically 5 or larger), or 'LU' factorization (numerically stable
+        but can lose slightly in accuracy). The 'auto' mode applies no
+        normalization if `n_iter` <= 2 and switches to LU otherwise.
+
+    See documentation for sklearn.utils.extmath.randomized_svd
+    """
+
+    _valid = {}
+    _valid["n_oversamples"] = {int: lambda x: x > 0}
+    _valid["n_iter"] = {str: lambda x: x in ["auto"], int: lambda x: x >= 0}
+    _valid["power_iteration_normalizer"] = {
+        str: lambda x: x.lower() in ["auto", "qr", "lu", "none"]
+    }
+    _valid_str = {}
+    _valid_str["n_oversamples"] = ["int > 0"]
+    _valid_str["n_iter"] = ["auto", "int >= 0"]
+    _valid_str["power_iteration_normalizer"] = ["auto", "QR", "LU", "none"]
+
+    n_oversamples: int = 10
+    n_iter: int = "auto"
+    power_iteration_normalizer: str = "auto"
+
+    def validate(self):
+        validated = []
+        errs = []
+        valids = []
+        fields = list(self.__dataclass_fields__.items())
+        fields.sort(key=lambda x: x[0])
+        for field_name, field_def in fields:
+            attr = getattr(self, field_name)
+            validated.append(False)
+            for typ, typfun in self._valid[field_name].items():
+                if isinstance(attr, typ):
+                    validated[-1] = typfun(attr)
+            if not validated[-1]:
+                errs.append(field_name)
+        return all(validated), errs
+
+    def __post_init__(self):
+        validated, errs = self.validate()
+        errs = errs
+        if not validated:
+            errorstring = f"{errs} were invalid type or value. " f"Valid values are "
+            for err in errs:
+                errorstring += f"{self._valid_str[err]}, "
+            errorstring += "respectively."
+            raise ValueError(errorstring)
+
+
 class Base(object):
     """Class that deals with key-word arguments but is otherwise
     just an object.
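Taken together, the dataclass above validates every field on construction and raises a single ValueError naming all offending fields at once. Typical usage, as the tests in patch 7 exercise it (values illustrative):

    from graphtools.base import PCAParameters

    params = PCAParameters()                   # defaults mirror randomized_svd
    params = PCAParameters(n_oversamples=100)  # trade speed for accuracy

    try:
        PCAParameters(n_oversamples=0)         # must be an int > 0
    except ValueError as err:
        print(err)  # lists the invalid field(s) and their valid values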
@@ -90,7 +167,9 @@ class Data(Base):
         s_max * eps * max(n_samples, n_features)
         where s_max is the maximum singular value of the data matrix
         and eps is numerical precision. [press2007]_.
-
+    pca_params : `PCAParameters`, optional (default: `PCAParameters()`)
+        Parameters to use for randomized SVD and PCA. See documentation
+        for graphtools.base.PCAParameters.
     random_state : `int` or `None`, optional (default: `None`)
         Random state for random PCA
 
@@ -109,11 +188,19 @@ class Data(Base):
     """
 
     def __init__(
-        self, data, n_pca=None, rank_threshold=None, random_state=None, **kwargs
+        self,
+        data,
+        n_pca=None,
+        rank_threshold=None,
+        pca_params=PCAParameters(),
+        random_state=None,
+        **kwargs,
     ):
         self._check_data(data)
-        n_pca, rank_threshold = self._parse_n_pca_threshold(data, n_pca, rank_threshold)
+        n_pca, rank_threshold = self._parse_pca_parameters(
+            data, n_pca, rank_threshold, pca_params
+        )
 
         if utils.is_SparseDataFrame(data):
             data = data.to_coo()
@@ -130,11 +217,12 @@ def __init__(
         self.data = data
         self.n_pca = n_pca
         self.rank_threshold = rank_threshold
+        self.pca_params = pca_params
         self.random_state = random_state
 
         self.data_nu = self._reduce_data()
         super().__init__(**kwargs)
 
-    def _parse_n_pca_threshold(self, data, n_pca, rank_threshold):
+    def _parse_pca_parameters(self, data, n_pca, rank_threshold, pca_params):
         if isinstance(n_pca, str):
             n_pca = n_pca.lower()
             if n_pca != "auto":
@@ -207,7 +295,12 @@ def _parse_n_pca_threshold(self, data, n_pca, rank_threshold):
                     raise ValueError(
                         "rank_threshold must be positive float or 'auto'. "
                     )
-        return n_pca, rank_threshold
+        if pca_params is None:
+            pca_params = PCAParameters()
+        else:
+            if not isinstance(pca_params, PCAParameters):
+                raise ValueError("pca_params must be an instance of PCAParameters.")
+        return n_pca, rank_threshold, pca_params
 
     def _check_data(self, data):
         if len(data.shape) != 2:
@@ -468,7 +561,7 @@ def __init__(
         anisotropy=0,
         gamma=None,
         initialize=True,
-        **kwargs
+        **kwargs,
     ):
         if gamma is not None:
             warnings.warn(
@@ -893,8 +986,12 @@ def shortest_path(self, method="auto", distance=None):
                 np.sum((self.data_nu[D.row] - self.data_nu[D.col]) ** 2, axis=1)
             )
         elif distance == "affinity":
-            D = sparse.csr_matrix(self.K)
-            D.data = -1 * np.log(D.data)
+            # D = sparse.csr_matrix(self.K)
+            # D.data = -1 * np.log(D.data)
+            D = -1 * np.where(
+                self.K != 0, np.log(np.where(self.K != 0, self.K, np.nan)), 0
+            )
+            # D = sparse.csr_matrix(D)
         else:
             raise ValueError(
                 "Expected `distance` in ['constant', 'data', 'affinity']. "
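The dense replacement in the last hunk takes -log only where the kernel is nonzero; the inner np.where keeps log from ever seeing a zero, and exact zeros (non-edges) stay 0, which scipy.sparse.csgraph reads as missing edges rather than zero-length ones. Standalone form of the same trick (matrix illustrative):

    import numpy as np

    K = np.array([[1.0, 0.5, 0.0], [0.5, 1.0, 0.5], [0.0, 0.5, 1.0]])
    D = -1 * np.where(K != 0, np.log(np.where(K != 0, K, np.nan)), 0)
    assert np.isclose(D[0, 1], np.log(2))  # affinity 0.5 -> distance log 2
    assert D[0, 2] == 0                    # non-edge stays 0, i.e. "no edge"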
From 6c64951c33f01afc31bb9eb6e7130ea12226ea6c Mon Sep 17 00:00:00 2001
From: Jay Stanley
Date: Mon, 3 Jan 2022 12:16:48 -0600
Subject: [PATCH 6/9] monkey patch

---
 graphtools/base.py | 47 +++++++++++++++++++++++++++++++++++++++++++++-
 requirements.txt   |  1 +
 setup.py           | 12 ++++++++++--
 3 files changed, 57 insertions(+), 3 deletions(-)

diff --git a/graphtools/base.py b/graphtools/base.py
index 66aa043..6659420 100644
--- a/graphtools/base.py
+++ b/graphtools/base.py
@@ -2,6 +2,8 @@
 from builtins import super
 from copy import copy as shallow_copy
 from dataclasses import dataclass
+from mock import patch
+from functools import partial
 import numpy as np
 import abc
 import pygsp
@@ -97,6 +99,39 @@ def __post_init__(self):
             raise ValueError(errorstring)
 
 
+# some monkey patching of randomized_svd...
+def randomized_svd_monkey(
+    M,
+    n_components,
+    *,
+    pca_params=PCAParameters(),
+    n_oversamples=10,
+    n_iter="auto",
+    power_iteration_normalizer="auto",
+    transpose="auto",
+    flip_sign=True,
+    random_state="warn",
+):
+    if sklearn.__version__ > "1.0.1":
+        warnings.warn(
+            "Graphtools is using a patched version of randomized_svd "
+            "designed for sklearn version 1.0.1. The current version "
+            "of sklearn is {}. Please alert the graphtools authors to "
+            "update the patch.".format(sklearn.__version__),
+            RuntimeWarning,
+        )
+    return sklearn.utils.extmath.randomized_svd(
+        M,
+        n_components=n_components,
+        n_oversamples=pca_params.n_oversamples,
+        n_iter=pca_params.n_iter,
+        power_iteration_normalizer=pca_params.power_iteration_normalizer,
+        transpose=transpose,
+        flip_sign=flip_sign,
+        random_state=random_state,
+    )
+
+
 class Base(object):
     """Class that deals with key-word arguments but is otherwise
     just an object.
@@ -331,6 +366,9 @@ def _reduce_data(self):
             self.n_pca == "auto" or self.n_pca < self.data.shape[1]
         ):
             with _logger.log_task("PCA"):
+                randomized_pca = partial(
+                    randomized_svd_monkey, pca_params=self.pca_params
+                )
                 n_pca = self.data.shape[1] - 1 if self.n_pca == "auto" else self.n_pca
                 if sparse.issparse(self.data):
                     if (
@@ -344,7 +382,14 @@ def _reduce_data(self):
                     self.data_pca = PCA(
                         n_pca, svd_solver="randomized", random_state=self.random_state
                     )
-                self.data_pca.fit(self.data)
+                with patch(
+                    "sklearn.decomposition._pca.randomized_svd", new=randomized_pca
+                ) as foo, patch(
+                    "sklearn.decomposition._truncated_svd.randomized_svd",
+                    new=randomized_pca,
+                ) as bar:
+                    self.data_pca.fit(self.data)
+
                 if self.n_pca == "auto":
                     s = self.data_pca.singular_values_
                     smax = s.max()
diff --git a/requirements.txt b/requirements.txt
index c31163d..96f623d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,4 @@ scikit-learn>=0.20.0
 future
 tasklogger>=1.0
 Deprecated
+mock
diff --git a/setup.py b/setup.py
index 0586273..792709a 100644
--- a/setup.py
+++ b/setup.py
@@ -10,6 +10,7 @@
     "future",
     "tasklogger>=1.0",
     "Deprecated",
+    "mock",
 ]
 
 test_requires = [
@@ -44,7 +45,9 @@
     description="graphtools",
     author="Scott Gigante, Daniel Burkhardt, and Jay Stanley, Yale University",
     author_email="scott.gigante@yale.edu",
-    packages=["graphtools",],
+    packages=[
+        "graphtools",
+    ],
     license="GNU General Public License Version 2",
     install_requires=install_requires,
     extras_require={"test": test_requires, "doc": doc_requires},
@@ -54,7 +57,12 @@
     download_url="https://github.com/KrishnaswamyLab/graphtools/archive/v{}.tar.gz".format(
         version
     ),
-    keywords=["graphs", "big-data", "signal processing", "manifold-learning",],
+    keywords=[
+        "graphs",
+        "big-data",
+        "signal processing",
+        "manifold-learning",
+    ],
     classifiers=[
         "Development Status :: 4 - Beta",
         "Environment :: Console",
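The core of patch 6 above: functools.partial freezes pca_params into the wrapper, and mock.patch swaps the wrapper in at the two module attributes through which sklearn's PCA and TruncatedSVD look up randomized_svd. A sketch of the same mechanism used directly (my_params is illustrative; the patch targets are the private sklearn paths named in the diff, hence the version warning above):

    from functools import partial
    from mock import patch

    from graphtools.base import PCAParameters, randomized_svd_monkey

    my_params = PCAParameters(n_oversamples=100)
    randomized_pca = partial(randomized_svd_monkey, pca_params=my_params)

    # inside the context, PCA.fit resolves randomized_svd to the wrapper
    with patch("sklearn.decomposition._pca.randomized_svd", new=randomized_pca):
        pass  # e.g. PCA(25, svd_solver="randomized").fit(X)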
From 3f88eb2bebe6dd619ec22f9ed5033294edd852e6 Mon Sep 17 00:00:00 2001
From: Jay Stanley
Date: Mon, 3 Jan 2022 12:18:35 -0600
Subject: [PATCH 7/9] testing for monkey patch

---
 test/test_data.py | 78 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)

diff --git a/test/test_data.py b/test/test_data.py
index 24f6dd2..8d465c8 100644
--- a/test/test_data.py
+++ b/test/test_data.py
@@ -528,6 +528,84 @@ def test_transform_sparse_adaptive_pca():
     assert np.allclose(G3.data_nu, G2.transform(G2.data))
 
 
+#####################################################
+# Check PCAParameters
+#####################################################
+
+
+def test_pca_parameters():
+    params = PCAParameters()
+    assert params.n_oversamples == 10
+    assert params.n_iter == "auto"
+    assert params.power_iteration_normalizer == "auto"
+
+    with assert_raises_message(
+        ValueError,
+        "['n_oversamples'] were invalid type or value. Valid values are ['int > 0'], respectively.",
+    ):
+        params = PCAParameters(n_oversamples=0)
+    try:
+        params = PCAParameters(
+            n_oversamples=0, n_iter="foo", power_iteration_normalizer="bar"
+        )
+    except ValueError as e:
+        assert (
+            str(e)
+            == "['n_iter', 'n_oversamples', 'power_iteration_normalizer'] were invalid type or value. Valid values are ['auto', 'int >= 0'], ['int > 0'], ['auto', 'QR', 'LU', 'none'], respectively."
+        )
+    params = PCAParameters(11, 2, "QR")
+
+
+#####################################################
+# Check randomized_svd monkey patch
+#####################################################
+
+
+def test_warns_sklearn_version():
+    import sklearn
+
+    sklbak = sklearn.__version__
+    sklearn.__version__ = "1.0.2"
+    x = np.random.randn(100, 100)
+    with assert_warns_message(
+        RuntimeWarning,
+        "Graphtools is using a patched version of randomized_svd designed for sklearn version 1.0.1. The current version of sklearn is 1.0.2. Please alert the graphtools authors to update the patch.",
+    ):
+        Data(x, n_pca=2)
+    sklearn.__version__ = sklbak
+
+
+def test_gets_good_svs():
+    x = np.random.randn(1000, 500)
+    u, s, vt = np.linalg.svd(x, full_matrices=False)
+    sy = np.r_[
+        np.arange(50),
+        np.zeros(
+            450,
+        ),
+    ]
+    y = (u * sy) @ vt
+    # test the sparse case (truncated SVD, no mean centering)
+    y = sp.csr_matrix(y)
+    obj = Data(y, n_pca=25)
+    assert np.any(
+        np.logical_not(obj.data_pca.singular_values_ == np.arange(50)[::-1][:25])
+    )
+    params = PCAParameters(n_oversamples=100)
+    obj = Data(y, n_pca=25, pca_params=params)
+    assert np.allclose(obj.data_pca.singular_values_, np.arange(50)[::-1][:25])
+    # test the dense case, has mean centering
+    y = y.toarray()
+    y = y - np.mean(y, axis=0)
+    u, s, vt = np.linalg.svd(y, full_matrices=False)
+    params = PCAParameters(n_oversamples=1)
+    obj = Data(y, n_pca=25, pca_params=params)
+    assert not (np.allclose(obj.data_pca.singular_values_, s[:25]))
+    params = PCAParameters(n_oversamples=1000)
+    obj = Data(y, n_pca=25, pca_params=params)
+    assert np.allclose(obj.data_pca.singular_values_, s[:25])
+
+
 #############
 # Test API
 #############
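test_gets_good_svs above is essentially a direct check of randomized-SVD oversampling: on a rank-50 matrix, the default n_oversamples projects onto too few random directions and misses the smaller of the top-25 singular values, while oversampling past the true rank recovers them. The same effect can be seen with sklearn's randomized_svd directly (sizes illustrative):

    import numpy as np
    from sklearn.utils.extmath import randomized_svd

    u, s, vt = np.linalg.svd(np.random.randn(1000, 500), full_matrices=False)
    y = (u * np.r_[np.arange(50, 0, -1), np.zeros(450)]) @ vt  # rank 50

    _, s_few, _ = randomized_svd(y, n_components=25, n_oversamples=10, random_state=0)
    _, s_many, _ = randomized_svd(y, n_components=25, n_oversamples=100, random_state=0)
    true = np.arange(50, 0, -1)[:25].astype(float)
    print(np.abs(s_few - true).max(), np.abs(s_many - true).max())  # latter ~ 0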
From f921a63f05b1cf74406f3c52cd3f4ed82007b262 Mon Sep 17 00:00:00 2001
From: Jay Stanley
Date: Mon, 3 Jan 2022 12:21:14 -0600
Subject: [PATCH 8/9] add sklearn, import requirements to tests

---
 graphtools/base.py          | 3 ++-
 test/load_tests/__init__.py | 3 ++-
 test/test_data.py           | 3 +++
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/graphtools/base.py b/graphtools/base.py
index 6659420..59f166a 100644
--- a/graphtools/base.py
+++ b/graphtools/base.py
@@ -8,6 +8,7 @@
 import abc
 import pygsp
 from inspect import signature
+import sklearn
 from sklearn.decomposition import PCA, TruncatedSVD
 from sklearn.preprocessing import normalize
 from scipy.sparse.csgraph import shortest_path as graph_shortest_path
@@ -233,7 +234,7 @@ def __init__(
     ):
         self._check_data(data)
-        n_pca, rank_threshold = self._parse_pca_parameters(
+        n_pca, rank_threshold, pca_params = self._parse_pca_parameters(
             data, n_pca, rank_threshold, pca_params
         )
 
diff --git a/test/load_tests/__init__.py b/test/load_tests/__init__.py
index 1c6213f..d17be6e 100644
--- a/test/load_tests/__init__.py
+++ b/test/load_tests/__init__.py
@@ -1,4 +1,5 @@
 from sklearn.decomposition import PCA, TruncatedSVD
+from graphtools.base import PCAParameters, Data
 from sklearn import datasets
 from scipy.spatial.distance import pdist, cdist, squareform
 import pygsp
@@ -7,7 +8,7 @@
 import scipy.sparse as sp
 import warnings
 import pandas as pd
-
+import sklearn
 import nose2
 from nose.tools import assert_raises_regex, assert_warns_regex
 import re
diff --git a/test/test_data.py b/test/test_data.py
index 8d465c8..3b24a03 100644
--- a/test/test_data.py
+++ b/test/test_data.py
@@ -3,7 +3,10 @@
     np,
     sp,
     pd,
+    sklearn,
     graphtools,
+    PCAParameters,
+    Data,
     nose2,
     data,
     build_graph,

From f79c867e5484ac70f6ae1fb610d04cb594f45894 Mon Sep 17 00:00:00 2001
From: Jay Stanley
Date: Mon, 3 Jan 2022 12:41:46 -0600
Subject: [PATCH 9/9] make tests follow monkey patch

---
 test/load_tests/__init__.py | 1 +
 test/test_exact.py          | 9 +++++----
 test/test_knn.py            | 5 +++--
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/test/load_tests/__init__.py b/test/load_tests/__init__.py
index d17be6e..8b0d25b 100644
--- a/test/load_tests/__init__.py
+++ b/test/load_tests/__init__.py
@@ -1,4 +1,5 @@
 from sklearn.decomposition import PCA, TruncatedSVD
+from sklearn.utils.extmath import randomized_svd
 from graphtools.base import PCAParameters, Data
 from sklearn import datasets
 from scipy.spatial.distance import pdist, cdist, squareform
diff --git a/test/test_exact.py b/test/test_exact.py
index f149ecf..684807a 100644
--- a/test/test_exact.py
+++ b/test/test_exact.py
@@ -12,6 +12,7 @@
     pdist,
     PCA,
     TruncatedSVD,
+    Data,
     assert_raises_message,
     assert_warns_message,
 )
@@ -212,8 +213,8 @@ def test_truncated_exact_graph():
     n_pca = 20
     thresh = 1e-4
     data_small = data[np.random.choice(len(data), len(data) // 2, replace=False)]
-    pca = PCA(n_pca, svd_solver="randomized", random_state=42).fit(data_small)
-    data_small_nu = pca.transform(data_small)
+    pca = Data(data_small, n_pca, random_state=42)
+    data_small_nu = pca.data_pca.transform(data_small)
     pdx = squareform(pdist(data_small_nu, metric="euclidean"))
     knn_dist = np.partition(pdx, k, axis=1)[:, :k]
     epsilon = np.max(knn_dist, axis=1)
@@ -283,8 +284,8 @@ def test_truncated_exact_graph_sparse():
     n_pca = 20
     thresh = 1e-4
     data_small = data[np.random.choice(len(data), len(data) // 2, replace=False)]
-    pca = TruncatedSVD(n_pca, random_state=42).fit(data_small)
-    data_small_nu = pca.transform(data_small)
+    pca = Data(sp.coo_matrix(data_small), n_pca, random_state=42)
+    data_small_nu = pca.data_pca.transform(data_small)
     pdx = squareform(pdist(data_small_nu, metric="euclidean"))
     knn_dist = np.partition(pdx, k, axis=1)[:, :k]
     epsilon = np.max(knn_dist, axis=1)
diff --git a/test/test_knn.py b/test/test_knn.py
index fe4e74d..9706240 100644
--- a/test/test_knn.py
+++ b/test/test_knn.py
@@ -14,6 +14,7 @@
     build_graph,
     PCA,
     TruncatedSVD,
+    Data,
 )
 
 
@@ -196,8 +197,8 @@ def test_knn_graph_multiplication_symm():
 def test_knn_graph_sparse():
     k = 3
     n_pca = 20
-    pca = TruncatedSVD(n_pca, random_state=42).fit(data)
-    data_nu = pca.transform(data)
+    pca = Data(sp.coo_matrix(data), n_pca, random_state=42)
+    data_nu = pca.data_pca.transform(data)
     pdx = squareform(pdist(data_nu, metric="euclidean"))
     knn_dist = np.partition(pdx, k, axis=1)[:, :k]
    epsilon = np.max(knn_dist, axis=1)