From c8e4a7cb9913ce588c2da2a903ed49ca116f6a7f Mon Sep 17 00:00:00 2001
From: Jay Stanley
Date: Mon, 3 Jan 2022 11:05:16 -0600
Subject: [PATCH 1/9] base.py logger fix

---
 graphtools/base.py | 39 ++++++++++++++++++---------------------
 1 file changed, 18 insertions(+), 21 deletions(-)

diff --git a/graphtools/base.py b/graphtools/base.py
index 4789b8a..8707c21 100644
--- a/graphtools/base.py
+++ b/graphtools/base.py
@@ -173,7 +173,7 @@ def _parse_n_pca_threshold(self, data, n_pca, rank_threshold):
             n_pca = None
         elif n_pca is True:  # notify that we're going to estimate rank.
             n_pca = "auto"
-            _logger.info(
+            _logger.log_info(
                 "Estimating n_pca from matrix rank. "
                 "Supply an integer n_pca "
                 "for fixed amount."
@@ -237,7 +237,7 @@ def _reduce_data(self):
         if self.n_pca is not None and (
             self.n_pca == "auto" or self.n_pca < self.data.shape[1]
         ):
-            with _logger.task("PCA"):
+            with _logger.log_task("PCA"):
                 n_pca = self.data.shape[1] - 1 if self.n_pca == "auto" else self.n_pca
                 if sparse.issparse(self.data):
                     if (
@@ -269,7 +269,7 @@ def _reduce_data(self):
                             "maximum singular value {} "
                             "for the data matrix".format(threshold, smax)
                         )
-                    _logger.info(
+                    _logger.log_info(
                         "Using rank estimate of {} as n_pca".format(self.n_pca)
                     )
                     # reset the sklearn operator
@@ -292,8 +292,7 @@ def _reduce_data(self):
         return data_nu
 
     def get_params(self):
-        """Get parameters from this object
-        """
+        """Get parameters from this object"""
         return {"n_pca": self.n_pca, "random_state": self.random_state}
 
     def set_params(self, **params):
@@ -498,10 +497,10 @@ def __init__(
         self.anisotropy = anisotropy
 
         if initialize:
-            _logger.debug("Initializing kernel...")
+            _logger.log_debug("Initializing kernel...")
             self.K
         else:
-            _logger.debug("Not initializing kernel.")
+            _logger.log_debug("Not initializing kernel.")
         super().__init__(**kwargs)
 
     def _check_symmetrization(self, kernel_symm, theta):
@@ -556,18 +555,20 @@ def _build_kernel(self):
     def symmetrize_kernel(self, K):
         # symmetrize
         if self.kernel_symm == "+":
-            _logger.debug("Using addition symmetrization.")
+            _logger.log_debug("Using addition symmetrization.")
             K = (K + K.T) / 2
         elif self.kernel_symm == "*":
-            _logger.debug("Using multiplication symmetrization.")
+            _logger.log_debug("Using multiplication symmetrization.")
             K = K.multiply(K.T)
         elif self.kernel_symm == "mnn":
-            _logger.debug("Using mnn symmetrization (theta = {}).".format(self.theta))
+            _logger.log_debug(
+                "Using mnn symmetrization (theta = {}).".format(self.theta)
+            )
             K = self.theta * matrix.elementwise_minimum(K, K.T) + (
                 1 - self.theta
             ) * matrix.elementwise_maximum(K, K.T)
         elif self.kernel_symm is None:
-            _logger.debug("Using no symmetrization.")
+            _logger.log_debug("Using no symmetrization.")
             pass
         else:
             raise NotImplementedError
@@ -589,8 +590,7 @@ def apply_anisotropy(self, K):
         return K
 
     def get_params(self):
-        """Get parameters from this object
-        """
+        """Get parameters from this object"""
         return {
             "kernel_symm": self.kernel_symm,
             "theta": self.theta,
@@ -697,8 +697,7 @@ def diff_aff(self):
 
     @property
     def diff_op(self):
-        """Synonym for P
-        """
+        """Synonym for P"""
         return self.P
 
     @property
@@ -719,8 +718,7 @@ def K(self):
 
     @property
     def kernel(self):
-        """Synonym for K
-        """
+        """Synonym for K"""
         return self.K
 
     @property
@@ -850,10 +848,10 @@ def _check_shortest_path_distance(self, distance):
     def _default_shortest_path_distance(self):
         if not self.weighted:
             distance = "data"
-            _logger.info("Using ambient data distances.")
+            _logger.log_info("Using ambient data distances.")
         else:
             distance = "affinity"
-            _logger.info("Using negative log affinity distances.")
+            _logger.log_info("Using negative log affinity distances.")
         return distance
 
     def shortest_path(self, method="auto", distance=None):
@@ -1019,8 +1017,7 @@ def __init__(self, data, verbose=True, n_jobs=1, **kwargs):
         super().__init__(data, **kwargs)
 
     def get_params(self):
-        """Get parameters from this object
-        """
+        """Get parameters from this object"""
         params = Data.get_params(self)
         params.update(BaseGraph.get_params(self))
         return params
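The rename above tracks tasklogger's 1.0 API, in which the logging helpers gained a log_ prefix. A minimal sketch of the renamed calls as this patch series uses them (the task name "PCA" is just illustrative):

    import tasklogger

    _logger = tasklogger.get_tasklogger("graphtools")

    _logger.log_info("status message")   # was _logger.info(...)
    _logger.log_debug("verbose detail")  # was _logger.debug(...)
    with _logger.log_task("PCA"):        # was _logger.task(...)
        pass  # timed block: logs start and completion of the task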
From f02e7b60f917067203b0ad4165f6d0dd06531a94 Mon Sep 17 00:00:00 2001
From: Jay Stanley
Date: Mon, 3 Jan 2022 11:14:00 -0600
Subject: [PATCH 2/9] fix tasklogger.debug -> log_debug, .task -> log_task,
 .info -> log_info

---
 graphtools/api.py       |  4 ++--
 graphtools/estimator.py | 22 ++++++++++----------
 graphtools/graphs.py    | 46 +++++++++++++++++++----------------------
 3 files changed, 34 insertions(+), 38 deletions(-)

diff --git a/graphtools/api.py b/graphtools/api.py
index b9a4b1b..27b9a79 100644
--- a/graphtools/api.py
+++ b/graphtools/api.py
@@ -255,7 +255,7 @@ def Graph(
     else:
         msg = msg + " and PyGSP inheritance"
 
-    _logger.debug(msg)
+    _logger.log_debug(msg)
 
     class_names = [p.__name__.replace("Graph", "") for p in parent_classes]
     try:
@@ -273,7 +273,7 @@ def Graph(
         pass
 
     # build graph and return
-    _logger.debug(
+    _logger.log_debug(
         "Initializing {} with arguments {}".format(
             parent_classes,
             ", ".join(
diff --git a/graphtools/estimator.py b/graphtools/estimator.py
index 5cb130f..3244fbb 100644
--- a/graphtools/estimator.py
+++ b/graphtools/estimator.py
@@ -81,18 +81,18 @@ class GraphEstimator(object, metaclass=abc.ABCMeta):
 
     verbose : `int` or `boolean`, optional (default: 1)
         If `True` or `> 0`, print status messages
-    
+
     n_svd : int, optional (default: 100)
         number of singular vectors to compute for landmarking
-    
+
     thresh : float, optional (default: 1e-4)
         threshold below which to truncate kernel
-    
+
     kwargs : additional arguments for graphtools.Graph
-    
+
     Attributes
     ----------
-    
+
     graph : graphtools.Graph
     """
@@ -248,13 +248,13 @@ def _set_graph_params(self, **params):
                 )
                 self.graph.set_params(**params)
             except ValueError as e:
-                _logger.debug("Reset graph due to {}".format(str(e)))
+                _logger.log_debug("Reset graph due to {}".format(str(e)))
                 self.graph = None
 
     @abc.abstractmethod
     def _reset_graph(self):
         """Trigger a reset of self.graph
-        
+
         Any downstream effects of resetting the graph should override this function
         """
         raise NotImplementedError
@@ -361,7 +361,7 @@ def _update_graph(self, X, precomputed, n_pca, n_landmark, **kwargs):
             **(self.kwargs)
         )
         if self.graph is not None:
-            _logger.info("Using precomputed graph and diffusion operator...")
+            _logger.log_info("Using precomputed graph and diffusion operator...")
 
     def fit(self, X, **kwargs):
         """Computes the graph
@@ -384,13 +384,13 @@ def fit(self, X, **kwargs):
         X, n_pca, n_landmark, precomputed, update_graph = self._parse_input(X)
 
         if precomputed is None:
-            _logger.info(
+            _logger.log_info(
                 "Building graph on {} samples and {} features.".format(
                     X.shape[0], X.shape[1]
                 )
             )
         else:
-            _logger.info(
+            _logger.log_info(
                 "Building graph on precomputed {} matrix with {} samples.".format(
                     precomputed, X.shape[0]
                 )
             )
@@ -402,7 +402,7 @@ def fit(self, X, **kwargs):
         self.X = X
 
         if self.graph is None:
-            with _logger.task("graph and diffusion operator"):
+            with _logger.log_task("graph and diffusion operator"):
                 self.graph = api.Graph(
                     X,
                     n_pca=n_pca,
diff --git a/graphtools/graphs.py b/graphtools/graphs.py
index 2caa431..092d73b 100644
--- a/graphtools/graphs.py
+++ b/graphtools/graphs.py
@@ -132,8 +132,7 @@ def __init__(
         super().__init__(data, n_pca=n_pca, **kwargs)
 
     def get_params(self):
-        """Get parameters from this object
-        """
+        """Get parameters from this object"""
         params = super().get_params()
         params.update(
             {
@@ -347,19 +346,19 @@ def build_kernel_to_data(
         Y = self._check_extension_shape(Y)
 
         if self.decay is None or self.thresh == 1:
-            with _logger.task("KNN search"):
+            with _logger.log_task("KNN search"):
                 # binary connectivity matrix
                 K = self.knn_tree.kneighbors_graph(
                     Y, n_neighbors=knn, mode="connectivity"
                 )
         else:
-            with _logger.task("KNN search"):
+            with _logger.log_task("KNN search"):
                 # sparse fast alpha decay
                 knn_tree = self.knn_tree
                 search_knn = min(knn * self.search_multiplier, knn_max)
                 distances, indices = knn_tree.kneighbors(Y, n_neighbors=search_knn)
                 self._check_duplicates(distances, indices)
-            with _logger.task("affinities"):
+            with _logger.log_task("affinities"):
                 if bandwidth is None:
                     bandwidth = distances[:, knn - 1]
@@ -370,7 +369,7 @@ def build_kernel_to_data(
 
                 radius = bandwidth * np.power(-1 * np.log(self.thresh), 1 / self.decay)
                 update_idx = np.argwhere(np.max(distances, axis=1) < radius).reshape(-1)
-                _logger.debug(
+                _logger.log_debug(
                     "search_knn = {}; {} remaining".format(search_knn, len(update_idx))
                 )
                 if len(update_idx) > 0:
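The truncation radius in the last hunk above comes from inverting the alpha-decay kernel K = exp(-(d / bandwidth) ** decay): K drops below thresh exactly when d exceeds bandwidth * (-log(thresh)) ** (1 / decay), so neighbors outside that radius can safely be dropped from the sparse kernel. A small numeric check of that identity (values illustrative):

    import numpy as np

    bandwidth, decay, thresh = 2.0, 10.0, 1e-4
    radius = bandwidth * np.power(-1 * np.log(thresh), 1 / decay)

    d = np.array([1.0, radius, 3.0])
    K = np.exp(-1 * (d / bandwidth) ** decay)
    assert np.isclose(K[1], thresh)  # K equals thresh exactly at the radius
    assert K[2] < thresh             # beyond the radius, K is negligible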
@@ -399,7 +398,7 @@ def build_kernel_to_data(
                             else radius[i]
                         )
                     ]
-                    _logger.debug(
+                    _logger.log_debug(
                         "search_knn = {}; {} remaining".format(
                             search_knn, len(update_idx)
                         )
                     )
@@ -412,7 +411,7 @@ def build_kernel_to_data(
                 ).fit(self.data_nu)
                 if len(update_idx) > 0:
                     if search_knn == knn_max:
-                        _logger.debug(
+                        _logger.log_debug(
                             "knn search to knn_max ({}) on {}".format(
                                 knn_max, len(update_idx)
                             )
@@ -425,7 +424,7 @@ def build_kernel_to_data(
                             distances[idx] = dist_new[i]
                             indices[idx] = ind_new[i]
                     else:
-                        _logger.debug("radius search on {}".format(len(update_idx)))
+                        _logger.log_debug("radius search on {}".format(len(update_idx)))
                         # give up - radius search
                         dist_new, ind_new = knn_tree.radius_neighbors(
                             Y[update_idx, :],
@@ -524,8 +523,7 @@ def __init__(self, data, n_landmark=2000, n_svd=100, **kwargs):
         super().__init__(data, **kwargs)
 
     def get_params(self):
-        """Get parameters from this object
-        """
+        """Get parameters from this object"""
         params = super().get_params()
         params.update({"n_landmark": self.n_landmark, "n_pca": self.n_pca})
         return params
@@ -653,16 +651,16 @@ def build_landmark_op(self):
         probabilities between cluster centers by using transition probabilities
         between samples assigned to each cluster.
         """
-        with _logger.task("landmark operator"):
+        with _logger.log_task("landmark operator"):
             is_sparse = sparse.issparse(self.kernel)
             # spectral clustering
-            with _logger.task("SVD"):
+            with _logger.log_task("SVD"):
                 _, _, VT = randomized_svd(
                     self.diff_aff,
                     n_components=self.n_svd,
                     random_state=self.random_state,
                 )
-            with _logger.task("KMeans"):
+            with _logger.log_task("KMeans"):
                 kmeans = MiniBatchKMeans(
                     self.n_landmark,
                     init_size=3 * self.n_landmark,
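The build_landmark_op docstring above describes compressing the n x n diffusion operator to an m x m operator over cluster centers by aggregating transition probabilities over cluster assignments. A rough sketch of that aggregation, assuming a dense operator P and integer cluster labels 0..m-1 (the real implementation is graphtools' build_landmark_op and differs in details):

    import numpy as np
    from scipy import sparse
    from sklearn.preprocessing import normalize

    def landmark_operator(P, clusters):
        n = P.shape[0]
        m = clusters.max() + 1
        # one-hot indicator of each sample's cluster, shape (n, m)
        C = sparse.csr_matrix((np.ones(n), (np.arange(n), clusters)), shape=(n, m))
        pmn = C.T @ P  # summed transitions from each cluster to each sample
        pnm = P @ C    # summed transitions from each sample to each cluster
        # row-normalize so the m x m operator is again row-stochastic
        return normalize(pmn @ pnm, norm="l1", axis=1)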
""" - with _logger.task("landmark operator"): + with _logger.log_task("landmark operator"): is_sparse = sparse.issparse(self.kernel) # spectral clustering - with _logger.task("SVD"): + with _logger.log_task("SVD"): _, _, VT = randomized_svd( self.diff_aff, n_components=self.n_svd, random_state=self.random_state, ) - with _logger.task("KMeans"): + with _logger.log_task("KMeans"): kmeans = MiniBatchKMeans( self.n_landmark, init_size=3 * self.n_landmark, @@ -886,8 +884,7 @@ def __init__( super().__init__(data, n_pca=n_pca, **kwargs) def get_params(self): - """Get parameters from this object - """ + """Get parameters from this object""" params = super().get_params() params.update( { @@ -985,7 +982,7 @@ def build_kernel(self): K = K.tolil() K = matrix.set_diagonal(K, 1) else: - with _logger.task("affinities"): + with _logger.log_task("affinities"): if sparse.issparse(self.data_nu): self.data_nu = self.data_nu.toarray() if self.precomputed == "distance": @@ -1091,7 +1088,7 @@ def build_kernel_to_data(self, Y, knn=None, bandwidth=None, bandwidth_scale=None if self.precomputed is not None: raise ValueError("Cannot extend kernel on precomputed graph") else: - with _logger.task("affinities"): + with _logger.log_task("affinities"): Y = self._check_extension_shape(Y) pdx = cdist(Y, self.data_nu, metric=self.distance) if bandwidth is None: @@ -1128,7 +1125,7 @@ def _check_shortest_path_distance(self, distance): def _default_shortest_path_distance(self): if self.precomputed is not None and not self.weighted: distance = "constant" - _logger.info("Using constant distances.") + _logger.log_info("Using constant distances.") else: distance = super()._default_shortest_path_distance() return distance @@ -1222,8 +1219,7 @@ def _check_symmetrization(self, kernel_symm, theta): super()._check_symmetrization(kernel_symm, theta) def get_params(self): - """Get parameters from this object - """ + """Get parameters from this object""" params = super().get_params() params.update( { @@ -1296,13 +1292,13 @@ def build_kernel(self): symmetric matrix with ones down the diagonal with no non-negative entries. 
""" - with _logger.task("subgraphs"): + with _logger.log_task("subgraphs"): self.subgraphs = [] from .api import Graph # iterate through sample ids for i, idx in enumerate(self.samples): - _logger.debug( + _logger.log_debug( "subgraph {}: sample {}, " "n = {}, knn = {}".format( i, idx, np.sum(self.sample_idx == idx), self.knn @@ -1327,7 +1323,7 @@ def build_kernel(self): ) self.subgraphs.append(graph) # append to list of subgraphs - with _logger.task("MNN kernel"): + with _logger.log_task("MNN kernel"): if self.thresh > 0 or self.decay is None: K = sparse.lil_matrix((self.data_nu.shape[0], self.data_nu.shape[0])) else: @@ -1343,7 +1339,7 @@ def build_kernel(self): for j, Y in enumerate(self.subgraphs): if i == j: continue - with _logger.task( + with _logger.log_task( "kernel from sample {} to {}".format( self.samples[i], self.samples[j] ) From 2f2fbfb47fb20446e4a73abdb38103083658e43c Mon Sep 17 00:00:00 2001 From: Jay Stanley Date: Mon, 3 Jan 2022 11:18:17 -0600 Subject: [PATCH 3/9] fixed deprecated sklearn.utils.graph.graph_shortest_path->scipy.sparse.csgraph.shortest_path --- graphtools/base.py | 2 +- test/test_exact.py | 2 +- test/test_knn.py | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/graphtools/base.py b/graphtools/base.py index 8707c21..e0a985f 100644 --- a/graphtools/base.py +++ b/graphtools/base.py @@ -7,7 +7,7 @@ from inspect import signature from sklearn.decomposition import PCA, TruncatedSVD from sklearn.preprocessing import normalize -from sklearn.utils.graph import graph_shortest_path +from scipy.sparse.csgraph import shortest_path as graph_shortest_path from scipy import sparse import warnings import numbers diff --git a/test/test_exact.py b/test/test_exact.py index 07faab0..f149ecf 100644 --- a/test/test_exact.py +++ b/test/test_exact.py @@ -1,5 +1,5 @@ from __future__ import print_function -from sklearn.utils.graph import graph_shortest_path +from scipy.sparse.csgraph import shortest_path as graph_shortest_path from load_tests import ( graphtools, np, diff --git a/test/test_knn.py b/test/test_knn.py index fe47c07..fe4e74d 100644 --- a/test/test_knn.py +++ b/test/test_knn.py @@ -1,5 +1,5 @@ from __future__ import print_function, division -from sklearn.utils.graph import graph_shortest_path +from scipy.sparse.csgraph import shortest_path as graph_shortest_path from scipy.spatial.distance import pdist, squareform from load_tests import assert_raises_message, assert_warns_message from nose.tools import assert_raises_regex, assert_warns_regex @@ -156,7 +156,8 @@ def test_knn_graph(): ), ): G2.build_kernel_to_data( - Y=G2.data_nu, knn=data.shape[0] + 1, + Y=G2.data_nu, + knn=data.shape[0] + 1, ) From b1a2a8a9f712b987ed6c5d4ac9dbf48e39a86e86 Mon Sep 17 00:00:00 2001 From: Jay Stanley Date: Mon, 3 Jan 2022 11:34:14 -0600 Subject: [PATCH 4/9] whitespace --- graphtools/graphs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/graphtools/graphs.py b/graphtools/graphs.py index 092d73b..e6eda49 100644 --- a/graphtools/graphs.py +++ b/graphtools/graphs.py @@ -668,7 +668,6 @@ def build_landmark_op(self): random_state=self.random_state, ) self._clusters = kmeans.fit_predict(self.diff_op.dot(VT.T)) - # transition matrices pmn = self._landmarks_to_data() From c35ba08690aabac01edf21bf6a70540c6f840d29 Mon Sep 17 00:00:00 2001 From: Jay Stanley Date: Mon, 3 Jan 2022 12:13:15 -0600 Subject: [PATCH 5/9] pca parameters --- graphtools/base.py | 113 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 105 insertions(+), 8 deletions(-) diff --git 
From b1a2a8a9f712b987ed6c5d4ac9dbf48e39a86e86 Mon Sep 17 00:00:00 2001
From: Jay Stanley
Date: Mon, 3 Jan 2022 11:34:14 -0600
Subject: [PATCH 4/9] whitespace

---
 graphtools/graphs.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/graphtools/graphs.py b/graphtools/graphs.py
index 092d73b..e6eda49 100644
--- a/graphtools/graphs.py
+++ b/graphtools/graphs.py
@@ -668,7 +668,6 @@ def build_landmark_op(self):
                     random_state=self.random_state,
                 )
                 self._clusters = kmeans.fit_predict(self.diff_op.dot(VT.T))
-
             # transition matrices
             pmn = self._landmarks_to_data()

From c35ba08690aabac01edf21bf6a70540c6f840d29 Mon Sep 17 00:00:00 2001
From: Jay Stanley
Date: Mon, 3 Jan 2022 12:13:15 -0600
Subject: [PATCH 5/9] pca parameters

---
 graphtools/base.py | 113 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 105 insertions(+), 8 deletions(-)

diff --git a/graphtools/base.py b/graphtools/base.py
index e0a985f..66aa043 100644
--- a/graphtools/base.py
+++ b/graphtools/base.py
@@ -1,6 +1,7 @@
 from future.utils import with_metaclass
 from builtins import super
 from copy import copy as shallow_copy
+from dataclasses import dataclass
 import numpy as np
 import abc
 import pygsp
@@ -20,6 +21,82 @@
 _logger = tasklogger.get_tasklogger("graphtools")
 
 
+@dataclass
+class PCAParameters(object):
+    """Data class that stores PCA parameters.
+
+    Parameters
+    ----------
+    n_oversamples : int, default=10
+        Additional number of random vectors to sample the range of M so as
+        to ensure proper conditioning. The total number of random vectors
+        used to find the range of M is n_components + n_oversamples. Smaller
+        number can improve speed but can negatively impact the quality of
+        approximation of singular vectors and singular values. Users might wish
+        to increase this parameter up to `2*k - n_components` where k is the
+        effective rank, for large matrices, noisy problems, matrices with
+        slowly decaying spectrums, or to increase precision accuracy.
+    n_iter : int or 'auto', default='auto'
+        Number of power iterations. It can be used to deal with very noisy
+        problems. When 'auto', it is set to 4, unless `n_components` is small
+        (< .1 * min(X.shape)) in which case `n_iter` is set to 7.
+        This improves precision with few components. Note that in general
+        users should rather increase `n_oversamples` before increasing `n_iter`
+        as the principle of the randomized method is to avoid usage of these
+        more costly power iterations steps. When `n_components` is equal
+        or greater to the effective matrix rank and the spectrum does not
+        present a slow decay, `n_iter=0` or `1` should even work fine in theory
+    power_iteration_normalizer : {'auto', 'QR', 'LU', 'none'}, default='auto'
+        Whether the power iterations are normalized with step-by-step
+        QR factorization (the slowest but most accurate), 'none'
+        (the fastest but numerically unstable when `n_iter` is large, e.g.
+        typically 5 or larger), or 'LU' factorization (numerically stable
+        but can lose slightly in accuracy). The 'auto' mode applies no
+        normalization if `n_iter` <= 2 and switches to LU otherwise.
+
+    See documentation for sklearn.utils.extmath.randomized_svd
+    """
+
+    _valid = {}
+    _valid["n_oversamples"] = {int: lambda x: x > 0}
+    _valid["n_iter"] = {str: lambda x: x in ["auto"], int: lambda x: x >= 0}
+    _valid["power_iteration_normalizer"] = {
+        str: lambda x: x.lower() in ["auto", "qr", "lu", "none"]
+    }
+    _valid_str = {}
+    _valid_str["n_oversamples"] = ["int > 0"]
+    _valid_str["n_iter"] = ["auto", "int >= 0"]
+    _valid_str["power_iteration_normalizer"] = ["auto", "QR", "LU", "none"]
+
+    n_oversamples: int = 10
+    n_iter: int = "auto"
+    power_iteration_normalizer: str = "auto"
+
+    def validate(self):
+        validated = []
+        errs = []
+        valids = []
+        fields = list(self.__dataclass_fields__.items())
+        fields.sort(key=lambda x: x[0])
+        for field_name, field_def in fields:
+            attr = getattr(self, field_name)
+            validated.append(False)
+            for typ, typfun in self._valid[field_name].items():
+                if isinstance(attr, typ):
+                    validated[-1] = typfun(attr)
+            if not validated[-1]:
+                errs.append(field_name)
+        return all(validated), errs
+
+    def __post_init__(self):
+        validated, errs = self.validate()
+        errs = errs
+        if not validated:
+            errorstring = f"{errs} were invalid type or value. " f"Valid values are "
+            for err in errs:
+                errorstring += f"{self._valid_str[err]}, "
+            errorstring += "respectively."
+            raise ValueError(errorstring)
+
+
 class Base(object):
     """Class that deals with key-word arguments but is otherwise
     just an object.
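Taken together, the dataclass above validates every field on construction and raises a single ValueError naming all offending fields at once. Typical usage, as the tests in patch 7 exercise it (values illustrative):

    from graphtools.base import PCAParameters

    params = PCAParameters()                   # defaults mirror randomized_svd
    params = PCAParameters(n_oversamples=100)  # trade speed for accuracy

    try:
        PCAParameters(n_oversamples=0)         # must be an int > 0
    except ValueError as err:
        print(err)  # lists the invalid field(s) and their valid values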
@@ -90,7 +167,9 @@ class Data(Base):
         s_max * eps * max(n_samples, n_features)
         where s_max is the maximum singular value of the data matrix
         and eps is numerical precision. [press2007]_.
-
+    pca_params : `PCAParameters`, optional (default: `PCAParameters()`)
+        Parameters to use for randomized SVD and PCA. See documentation
+        for graphtools.base.PCAParameters.
     random_state : `int` or `None`, optional (default: `None`)
         Random state for random PCA
 
@@ -109,11 +188,19 @@ class Data(Base):
     """
 
     def __init__(
-        self, data, n_pca=None, rank_threshold=None, random_state=None, **kwargs
+        self,
+        data,
+        n_pca=None,
+        rank_threshold=None,
+        pca_params=PCAParameters(),
+        random_state=None,
+        **kwargs,
     ):
         self._check_data(data)
-        n_pca, rank_threshold = self._parse_n_pca_threshold(data, n_pca, rank_threshold)
+        n_pca, rank_threshold = self._parse_pca_parameters(
+            data, n_pca, rank_threshold, pca_params
+        )
 
         if utils.is_SparseDataFrame(data):
             data = data.to_coo()
@@ -130,11 +217,12 @@ def __init__(
         self.data = data
         self.n_pca = n_pca
         self.rank_threshold = rank_threshold
+        self.pca_params = pca_params
         self.random_state = random_state
 
         self.data_nu = self._reduce_data()
         super().__init__(**kwargs)
 
-    def _parse_n_pca_threshold(self, data, n_pca, rank_threshold):
+    def _parse_pca_parameters(self, data, n_pca, rank_threshold, pca_params):
         if isinstance(n_pca, str):
             n_pca = n_pca.lower()
             if n_pca != "auto":
@@ -207,7 +295,12 @@ def _parse_n_pca_threshold(self, data, n_pca, rank_threshold):
                     raise ValueError(
                         "rank_threshold must be positive float or 'auto'. "
                     )
-        return n_pca, rank_threshold
+        if pca_params is None:
+            pca_params = PCAParameters()
+        else:
+            if not isinstance(pca_params, PCAParameters):
+                raise ValueError("pca_params must be an instance of PCAParameters.")
+        return n_pca, rank_threshold, pca_params
 
     def _check_data(self, data):
         if len(data.shape) != 2:
@@ -468,7 +561,7 @@ def __init__(
         anisotropy=0,
         gamma=None,
         initialize=True,
-        **kwargs
+        **kwargs,
     ):
         if gamma is not None:
             warnings.warn(
@@ -893,8 +986,12 @@ def shortest_path(self, method="auto", distance=None):
                 np.sum((self.data_nu[D.row] - self.data_nu[D.col]) ** 2, axis=1)
             )
         elif distance == "affinity":
-            D = sparse.csr_matrix(self.K)
-            D.data = -1 * np.log(D.data)
+            # D = sparse.csr_matrix(self.K)
+            # D.data = -1 * np.log(D.data)
+            D = -1 * np.where(
+                self.K != 0, np.log(np.where(self.K != 0, self.K, np.nan)), 0
+            )
+            # D = sparse.csr_matrix(D)
         else:
             raise ValueError(
                 "Expected `distance` in ['constant', 'data', 'affinity']. "
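The dense replacement in the last hunk takes -log only where the kernel is nonzero; the inner np.where keeps log from ever seeing a zero, and exact zeros (non-edges) stay 0, which scipy.sparse.csgraph reads as missing edges rather than zero-length ones. Standalone form of the same trick (matrix illustrative):

    import numpy as np

    K = np.array([[1.0, 0.5, 0.0], [0.5, 1.0, 0.5], [0.0, 0.5, 1.0]])
    D = -1 * np.where(K != 0, np.log(np.where(K != 0, K, np.nan)), 0)
    assert np.isclose(D[0, 1], np.log(2))  # affinity 0.5 -> distance log 2
    assert D[0, 2] == 0                    # non-edge stays 0, i.e. "no edge"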
From 6c64951c33f01afc31bb9eb6e7130ea12226ea6c Mon Sep 17 00:00:00 2001
From: Jay Stanley
Date: Mon, 3 Jan 2022 12:16:48 -0600
Subject: [PATCH 6/9] monkey patch

---
 graphtools/base.py | 47 +++++++++++++++++++++++++++++++++++++++++++++-
 requirements.txt   |  1 +
 setup.py           | 12 ++++++++++--
 3 files changed, 57 insertions(+), 3 deletions(-)

diff --git a/graphtools/base.py b/graphtools/base.py
index 66aa043..6659420 100644
--- a/graphtools/base.py
+++ b/graphtools/base.py
@@ -2,6 +2,8 @@
 from builtins import super
 from copy import copy as shallow_copy
 from dataclasses import dataclass
+from mock import patch
+from functools import partial
 import numpy as np
 import abc
 import pygsp
@@ -97,6 +99,39 @@ def __post_init__(self):
             raise ValueError(errorstring)
 
 
+# some monkey patching of randomized_svd...
+def randomized_svd_monkey(
+    M,
+    n_components,
+    *,
+    pca_params=PCAParameters(),
+    n_oversamples=10,
+    n_iter="auto",
+    power_iteration_normalizer="auto",
+    transpose="auto",
+    flip_sign=True,
+    random_state="warn",
+):
+    if sklearn.__version__ > "1.0.1":
+        warnings.warn(
+            "Graphtools is using a patched version of randomized_svd "
+            "designed for sklearn version 1.0.1. The current version "
+            "of sklearn is {}. Please alert the graphtools authors to "
+            "update the patch.".format(sklearn.__version__),
+            RuntimeWarning,
+        )
+    return sklearn.utils.extmath.randomized_svd(
+        M,
+        n_components=n_components,
+        n_oversamples=pca_params.n_oversamples,
+        n_iter=pca_params.n_iter,
+        power_iteration_normalizer=pca_params.power_iteration_normalizer,
+        transpose=transpose,
+        flip_sign=flip_sign,
+        random_state=random_state,
+    )
+
+
 class Base(object):
     """Class that deals with key-word arguments but is otherwise
     just an object.
@@ -331,6 +366,9 @@ def _reduce_data(self):
             self.n_pca == "auto" or self.n_pca < self.data.shape[1]
         ):
             with _logger.log_task("PCA"):
+                randomized_pca = partial(
+                    randomized_svd_monkey, pca_params=self.pca_params
+                )
                 n_pca = self.data.shape[1] - 1 if self.n_pca == "auto" else self.n_pca
                 if sparse.issparse(self.data):
                     if (
@@ -344,7 +382,14 @@ def _reduce_data(self):
                     self.data_pca = PCA(
                         n_pca, svd_solver="randomized", random_state=self.random_state
                     )
-                self.data_pca.fit(self.data)
+                with patch(
+                    "sklearn.decomposition._pca.randomized_svd", new=randomized_pca
+                ) as foo, patch(
+                    "sklearn.decomposition._truncated_svd.randomized_svd",
+                    new=randomized_pca,
+                ) as bar:
+                    self.data_pca.fit(self.data)
+
                 if self.n_pca == "auto":
                     s = self.data_pca.singular_values_
                     smax = s.max()
diff --git a/requirements.txt b/requirements.txt
index c31163d..96f623d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,4 @@ scikit-learn>=0.20.0
 future
 tasklogger>=1.0
 Deprecated
+mock
diff --git a/setup.py b/setup.py
index 0586273..792709a 100644
--- a/setup.py
+++ b/setup.py
@@ -10,6 +10,7 @@
     "future",
     "tasklogger>=1.0",
     "Deprecated",
+    "mock",
 ]
 
 test_requires = [
@@ -44,7 +45,9 @@
     description="graphtools",
     author="Scott Gigante, Daniel Burkhardt, and Jay Stanley, Yale University",
     author_email="scott.gigante@yale.edu",
-    packages=["graphtools",],
+    packages=[
+        "graphtools",
+    ],
     license="GNU General Public License Version 2",
     install_requires=install_requires,
     extras_require={"test": test_requires, "doc": doc_requires},
@@ -54,7 +57,12 @@
     download_url="https://github.com/KrishnaswamyLab/graphtools/archive/v{}.tar.gz".format(
         version
     ),
-    keywords=["graphs", "big-data", "signal processing", "manifold-learning",],
+    keywords=[
+        "graphs",
+        "big-data",
+        "signal processing",
+        "manifold-learning",
+    ],
     classifiers=[
         "Development Status :: 4 - Beta",
         "Environment :: Console",
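The core of patch 6 above: functools.partial freezes pca_params into the wrapper, and mock.patch swaps the wrapper in at the two module attributes through which sklearn's PCA and TruncatedSVD look up randomized_svd. A sketch of the same mechanism used directly (my_params is illustrative; the patch targets are the private sklearn paths named in the diff, hence the version warning above):

    from functools import partial
    from mock import patch

    from graphtools.base import PCAParameters, randomized_svd_monkey

    my_params = PCAParameters(n_oversamples=100)
    randomized_pca = partial(randomized_svd_monkey, pca_params=my_params)

    # inside the context, PCA.fit resolves randomized_svd to the wrapper
    with patch("sklearn.decomposition._pca.randomized_svd", new=randomized_pca):
        pass  # e.g. PCA(25, svd_solver="randomized").fit(X)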
From 3f88eb2bebe6dd619ec22f9ed5033294edd852e6 Mon Sep 17 00:00:00 2001
From: Jay Stanley
Date: Mon, 3 Jan 2022 12:18:35 -0600
Subject: [PATCH 7/9] testing for monkey patch

---
 test/test_data.py | 78 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)

diff --git a/test/test_data.py b/test/test_data.py
index 24f6dd2..8d465c8 100644
--- a/test/test_data.py
+++ b/test/test_data.py
@@ -528,6 +528,84 @@ def test_transform_sparse_adaptive_pca():
     assert np.allclose(G3.data_nu, G2.transform(G2.data))
 
 
+#####################################################
+# Check PCAParameters
+#####################################################
+
+
+def test_pca_parameters():
+    params = PCAParameters()
+    assert params.n_oversamples == 10
+    assert params.n_iter == "auto"
+    assert params.power_iteration_normalizer == "auto"
+
+    with assert_raises_message(
+        ValueError,
+        "['n_oversamples'] were invalid type or value. Valid values are ['int > 0'], respectively.",
+    ):
+        params = PCAParameters(n_oversamples=0)
+    try:
+        params = PCAParameters(
+            n_oversamples=0, n_iter="foo", power_iteration_normalizer="bar"
+        )
+    except ValueError as e:
+        assert (
+            str(e)
+            == "['n_iter', 'n_oversamples', 'power_iteration_normalizer'] were invalid type or value. Valid values are ['auto', 'int >= 0'], ['int > 0'], ['auto', 'QR', 'LU', 'none'], respectively."
+        )
+    params = PCAParameters(11, 2, "QR")
+
+
+#####################################################
+# Check randomized_svd monkey patch
+#####################################################
+
+
+def test_warns_sklearn_version():
+    import sklearn
+
+    sklbak = sklearn.__version__
+    sklearn.__version__ = "1.0.2"
+    x = np.random.randn(100, 100)
+    with assert_warns_message(
+        RuntimeWarning,
+        "Graphtools is using a patched version of randomized_svd designed for sklearn version 1.0.1. The current version of sklearn is 1.0.2. Please alert the graphtools authors to update the patch.",
+    ):
+        Data(x, n_pca=2)
+    sklearn.__version__ = sklbak
+
+
+def test_gets_good_svs():
+    x = np.random.randn(1000, 500)
+    u, s, vt = np.linalg.svd(x, full_matrices=False)
+    sy = np.r_[
+        np.arange(50),
+        np.zeros(
+            450,
+        ),
+    ]
+    y = (u * sy) @ vt
+    # test the sparse case (truncated SVD, no mean centering)
+    y = sp.csr_matrix(y)
+    obj = Data(y, n_pca=25)
+    assert np.any(
+        np.logical_not(obj.data_pca.singular_values_ == np.arange(50)[::-1][:25])
+    )
+    params = PCAParameters(n_oversamples=100)
+    obj = Data(y, n_pca=25, pca_params=params)
+    assert np.allclose(obj.data_pca.singular_values_, np.arange(50)[::-1][:25])
+    # test the dense case, has mean centering
+    y = y.toarray()
+    y = y - np.mean(y, axis=0)
+    u, s, vt = np.linalg.svd(y, full_matrices=False)
+    params = PCAParameters(n_oversamples=1)
+    obj = Data(y, n_pca=25, pca_params=params)
+    assert not (np.allclose(obj.data_pca.singular_values_, s[:25]))
+    params = PCAParameters(n_oversamples=1000)
+    obj = Data(y, n_pca=25, pca_params=params)
+    assert np.allclose(obj.data_pca.singular_values_, s[:25])
+
+
 #############
 # Test API
 #############
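test_gets_good_svs above is essentially a direct check of randomized-SVD oversampling: on a rank-50 matrix, the default n_oversamples projects onto too few random directions and misses the smaller of the top-25 singular values, while oversampling past the true rank recovers them. The same effect can be seen with sklearn's randomized_svd directly (sizes illustrative):

    import numpy as np
    from sklearn.utils.extmath import randomized_svd

    u, s, vt = np.linalg.svd(np.random.randn(1000, 500), full_matrices=False)
    y = (u * np.r_[np.arange(50, 0, -1), np.zeros(450)]) @ vt  # rank 50

    _, s_few, _ = randomized_svd(y, n_components=25, n_oversamples=10, random_state=0)
    _, s_many, _ = randomized_svd(y, n_components=25, n_oversamples=100, random_state=0)
    true = np.arange(50, 0, -1)[:25].astype(float)
    print(np.abs(s_few - true).max(), np.abs(s_many - true).max())  # latter ~ 0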
From f921a63f05b1cf74406f3c52cd3f4ed82007b262 Mon Sep 17 00:00:00 2001
From: Jay Stanley
Date: Mon, 3 Jan 2022 12:21:14 -0600
Subject: [PATCH 8/9] add sklearn, import requirements to tests

---
 graphtools/base.py          | 3 ++-
 test/load_tests/__init__.py | 3 ++-
 test/test_data.py           | 3 +++
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/graphtools/base.py b/graphtools/base.py
index 6659420..59f166a 100644
--- a/graphtools/base.py
+++ b/graphtools/base.py
@@ -8,6 +8,7 @@
 import abc
 import pygsp
 from inspect import signature
+import sklearn
 from sklearn.decomposition import PCA, TruncatedSVD
 from sklearn.preprocessing import normalize
 from scipy.sparse.csgraph import shortest_path as graph_shortest_path
@@ -233,7 +234,7 @@ def __init__(
     ):
         self._check_data(data)
-        n_pca, rank_threshold = self._parse_pca_parameters(
+        n_pca, rank_threshold, pca_params = self._parse_pca_parameters(
             data, n_pca, rank_threshold, pca_params
         )
 
diff --git a/test/load_tests/__init__.py b/test/load_tests/__init__.py
index 1c6213f..d17be6e 100644
--- a/test/load_tests/__init__.py
+++ b/test/load_tests/__init__.py
@@ -1,4 +1,5 @@
 from sklearn.decomposition import PCA, TruncatedSVD
+from graphtools.base import PCAParameters, Data
 from sklearn import datasets
 from scipy.spatial.distance import pdist, cdist, squareform
 import pygsp
@@ -7,7 +8,7 @@
 import scipy.sparse as sp
 import warnings
 import pandas as pd
-
+import sklearn
 import nose2
 from nose.tools import assert_raises_regex, assert_warns_regex
 import re
diff --git a/test/test_data.py b/test/test_data.py
index 8d465c8..3b24a03 100644
--- a/test/test_data.py
+++ b/test/test_data.py
@@ -3,7 +3,10 @@
     np,
     sp,
     pd,
+    sklearn,
     graphtools,
+    PCAParameters,
+    Data,
     nose2,
     data,
     build_graph,

From f79c867e5484ac70f6ae1fb610d04cb594f45894 Mon Sep 17 00:00:00 2001
From: Jay Stanley
Date: Mon, 3 Jan 2022 12:41:46 -0600
Subject: [PATCH 9/9] make tests follow monkey patch

---
 test/load_tests/__init__.py | 1 +
 test/test_exact.py          | 9 +++++----
 test/test_knn.py            | 5 +++--
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/test/load_tests/__init__.py b/test/load_tests/__init__.py
index d17be6e..8b0d25b 100644
--- a/test/load_tests/__init__.py
+++ b/test/load_tests/__init__.py
@@ -1,4 +1,5 @@
 from sklearn.decomposition import PCA, TruncatedSVD
+from sklearn.utils.extmath import randomized_svd
 from graphtools.base import PCAParameters, Data
 from sklearn import datasets
 from scipy.spatial.distance import pdist, cdist, squareform
diff --git a/test/test_exact.py b/test/test_exact.py
index f149ecf..684807a 100644
--- a/test/test_exact.py
+++ b/test/test_exact.py
@@ -12,6 +12,7 @@
     pdist,
     PCA,
     TruncatedSVD,
+    Data,
     assert_raises_message,
     assert_warns_message,
 )
@@ -212,8 +213,8 @@ def test_truncated_exact_graph():
     n_pca = 20
     thresh = 1e-4
     data_small = data[np.random.choice(len(data), len(data) // 2, replace=False)]
-    pca = PCA(n_pca, svd_solver="randomized", random_state=42).fit(data_small)
-    data_small_nu = pca.transform(data_small)
+    pca = Data(data_small, n_pca, random_state=42)
+    data_small_nu = pca.data_pca.transform(data_small)
     pdx = squareform(pdist(data_small_nu, metric="euclidean"))
     knn_dist = np.partition(pdx, k, axis=1)[:, :k]
     epsilon = np.max(knn_dist, axis=1)
@@ -283,8 +284,8 @@ def test_truncated_exact_graph_sparse():
     n_pca = 20
     thresh = 1e-4
     data_small = data[np.random.choice(len(data), len(data) // 2, replace=False)]
-    pca = TruncatedSVD(n_pca, random_state=42).fit(data_small)
-    data_small_nu = pca.transform(data_small)
+    pca = Data(sp.coo_matrix(data_small), n_pca, random_state=42)
+    data_small_nu = pca.data_pca.transform(data_small)
     pdx = squareform(pdist(data_small_nu, metric="euclidean"))
     knn_dist = np.partition(pdx, k, axis=1)[:, :k]
     epsilon = np.max(knn_dist, axis=1)
diff --git a/test/test_knn.py b/test/test_knn.py
index fe4e74d..9706240 100644
--- a/test/test_knn.py
+++ b/test/test_knn.py
@@ -14,6 +14,7 @@
     build_graph,
     PCA,
     TruncatedSVD,
+    Data,
 )
 
 
@@ -196,8 +197,8 @@ def test_knn_graph_multiplication_symm():
 def test_knn_graph_sparse():
     k = 3
     n_pca = 20
-    pca = TruncatedSVD(n_pca, random_state=42).fit(data)
-    data_nu = pca.transform(data)
+    pca = Data(sp.coo_matrix(data), n_pca, random_state=42)
+    data_nu = pca.data_pca.transform(data)
     pdx = squareform(pdist(data_nu, metric="euclidean"))
     knn_dist = np.partition(pdx, k, axis=1)[:, :k]
    epsilon = np.max(knn_dist, axis=1)