Patches: tasklogger log_x, randomized_svd arguments, deprecated graph_shortest_path #62

Open: wants to merge 9 commits into master
4 changes: 2 additions & 2 deletions graphtools/api.py
@@ -255,7 +255,7 @@ def Graph(
else:
msg = msg + " and PyGSP inheritance"

_logger.debug(msg)
_logger.log_debug(msg)

class_names = [p.__name__.replace("Graph", "") for p in parent_classes]
try:
@@ -273,7 +273,7 @@ def Graph(
pass

# build graph and return
_logger.debug(
_logger.log_debug(
"Initializing {} with arguments {}".format(
parent_classes,
", ".join(
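For context, the log_* renames throughout this diff follow the tasklogger logger object already created at module level in graphtools; a minimal sketch, assuming a tasklogger release that exposes log_debug, log_info and the log_task context manager (the calls shown are the ones used in this diff):

import tasklogger

_logger = tasklogger.get_tasklogger("graphtools")
_logger.log_debug("building graph")    # replaces _logger.debug(...)
_logger.log_info("estimating n_pca")   # replaces _logger.info(...)
with _logger.log_task("PCA"):          # replaces _logger.task(...)
    pass  # timed block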
202 changes: 171 additions & 31 deletions graphtools/base.py
@@ -1,13 +1,17 @@
from future.utils import with_metaclass
from builtins import super
from copy import copy as shallow_copy
from dataclasses import dataclass
from mock import patch
from functools import partial
import numpy as np
import abc
import pygsp
from inspect import signature
import sklearn
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import normalize
from sklearn.utils.graph import graph_shortest_path
from scipy.sparse.csgraph import shortest_path as graph_shortest_path
from scipy import sparse
import warnings
import numbers
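The import swap above replaces sklearn.utils.graph.graph_shortest_path, which was deprecated and later removed from scikit-learn, with scipy's equivalent. A minimal sketch of the replacement call (illustrative only; the adjacency matrix is made up):

import numpy as np
from scipy.sparse.csgraph import shortest_path as graph_shortest_path

# dense or scipy.sparse adjacency matrix; zero entries mean "no edge"
adjacency = np.array(
    [[0.0, 1.0, 0.0],
     [1.0, 0.0, 2.0],
     [0.0, 2.0, 0.0]]
)
# method="D" selects Dijkstra, mirroring typical use of the removed sklearn helper
distances = graph_shortest_path(adjacency, method="D", directed=False)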
@@ -20,6 +24,115 @@
_logger = tasklogger.get_tasklogger("graphtools")


@dataclass
class PCAParameters(object):
"""Data class that stores PCA parameters.
Parameters
----------
n_oversamples : int, default=10
Additional number of random vectors to sample the range of M so as
to ensure proper conditioning. The total number of random vectors
used to find the range of M is n_components + n_oversamples. A smaller
number can improve speed but can negatively impact the quality of the
approximation of the singular vectors and singular values. Users might wish
to increase this parameter up to `2*k - n_components` where k is the
effective rank, for large matrices, noisy problems, matrices with
slowly decaying spectrums, or to increase precision accuracy.
n_iter : int or 'auto', default='auto'
Number of power iterations. It can be used to deal with very noisy
problems. When 'auto', it is set to 4, unless `n_components` is small
(< .1 * min(X.shape)) in which case `n_iter` is set to 7.
This improves precision with few components. Note that in general
users should rather increase `n_oversamples` before increasing `n_iter`
as the principle of the randomized method is to avoid usage of these
more costly power iteration steps. When `n_components` is equal
to or greater than the effective matrix rank and the spectrum does not
present a slow decay, `n_iter=0` or `1` should even work fine in theory.
power_iteration_normalizer : {'auto', 'QR', 'LU', 'none'}, default='auto'
Whether the power iterations are normalized with step-by-step
QR factorization (the slowest but most accurate), 'none'
(the fastest but numerically unstable when `n_iter` is large, e.g.
typically 5 or larger), or 'LU' factorization (numerically stable
but can lose slightly in accuracy). The 'auto' mode applies no
normalization if `n_iter` <= 2 and switches to LU otherwise.
See the documentation for sklearn.utils.extmath.randomized_svd.
"""

_valid = {}
_valid["n_oversamples"] = {int: lambda x: x > 0}
_valid["n_iter"] = {str: lambda x: x in ["auto"], int: lambda x: x >= 0}
_valid["power_iteration_normalizer"] = {
str: lambda x: x.lower() in ["auto", "qr", "lu", "none"]
}
_valid_str = {}
_valid_str["n_oversamples"] = ["int > 0"]
_valid_str["n_iter"] = ["auto", "int >= 0"]
_valid_str["power_iteration_normalizer"] = ["auto", "QR", "LU", "none"]

n_oversamples: int = 10
n_iter: "int or 'auto'" = "auto"
power_iteration_normalizer: str = "auto"

def validate(self):
validated = []
errs = []
valids = []
fields = list(self.__dataclass_fields__.items())
fields.sort(key=lambda x: x[0])
for field_name, field_def in fields:
attr = getattr(self, field_name)
validated.append(False)
for typ, typfun in self._valid[field_name].items():
if isinstance(attr, typ):
validated[-1] = typfun(attr)
if not validated[-1]:
errs.append(field_name)
return all(validated), errs

def __post_init__(self):
validated, errs = self.validate()
if not validated:
errorstring = f"{errs} had invalid types or values. Valid values are "
for err in errs:
errorstring += f"{self._valid_str[err]}, "
errorstring += "respectively."
raise ValueError(errorstring)
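As a usage illustration (values chosen arbitrarily), field validation runs in __post_init__, so an invalid value fails at construction time:

params = PCAParameters(n_oversamples=20, n_iter=5)      # valid
PCAParameters(power_iteration_normalizer="cholesky")    # raises ValueError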


# Monkey patch of randomized_svd to forward parameters from PCAParameters.
def randomized_svd_monkey(
M,
n_components,
*,
pca_params=PCAParameters(),
n_oversamples=10,
n_iter="auto",
power_iteration_normalizer="auto",
transpose="auto",
flip_sign=True,
random_state="warn",
):
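# NOTE: the version check below is a lexicographic string comparison; it
# behaves correctly for the 1.0.x releases this patch targets, but can
# misorder versions such as "1.10" vs "1.9".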
if sklearn.__version__ > "1.0.1":
warnings.warn(
"Graphtools is using a patched version of randomized_svd "
"designed for sklearn version 1.0.1. The current version "
"of sklearn is {}. Please alert the graphtools authors to "
"update the patch.".format(sklearn.__version__),
RuntimeWarning,
)
return sklearn.utils.extmath.randomized_svd(
M,
n_components=n_components,
n_oversamples=pca_params.n_oversamples,
n_iter=pca_params.n_iter,
power_iteration_normalizer=pca_params.power_iteration_normalizer,
transpose=transpose,
flip_sign=flip_sign,
random_state=random_state,
)
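The wrapper deliberately ignores the n_oversamples, n_iter and power_iteration_normalizer keyword arguments that sklearn passes internally and takes those values from pca_params instead. A rough usage sketch, with made-up data:

import numpy as np

X = np.random.default_rng(0).normal(size=(100, 20))
params = PCAParameters(n_oversamples=15, n_iter=4, power_iteration_normalizer="LU")
U, S, Vt = randomized_svd_monkey(X, n_components=5, pca_params=params, random_state=0)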


class Base(object):
"""Class that deals with key-word arguments but is otherwise
just an object.
@@ -90,7 +203,9 @@ class Data(Base):
s_max * eps * max(n_samples, n_features)
where s_max is the maximum singular value of the data matrix
and eps is numerical precision. [press2007]_.

pca_params : `PCAParameters`, optional (default: `PCAParameters()`)
Parameters to use for randomized SVD and PCA. See documentation
for graphtools.base.PCAParameters.
random_state : `int` or `None`, optional (default: `None`)
Random state for random PCA

@@ -109,11 +224,19 @@ class Data(Base):
"""

def __init__(
self, data, n_pca=None, rank_threshold=None, random_state=None, **kwargs
self,
data,
n_pca=None,
rank_threshold=None,
pca_params=PCAParameters(),
random_state=None,
**kwargs,
):

self._check_data(data)
n_pca, rank_threshold = self._parse_n_pca_threshold(data, n_pca, rank_threshold)
n_pca, rank_threshold, pca_params = self._parse_pca_parameters(
data, n_pca, rank_threshold, pca_params
)

if utils.is_SparseDataFrame(data):
data = data.to_coo()
@@ -130,11 +253,12 @@ def __init__(
self.data = data
self.n_pca = n_pca
self.rank_threshold = rank_threshold
self.pca_params = pca_params
self.random_state = random_state
self.data_nu = self._reduce_data()
super().__init__(**kwargs)

def _parse_n_pca_threshold(self, data, n_pca, rank_threshold):
def _parse_pca_parameters(self, data, n_pca, rank_threshold, pca_params):
if isinstance(n_pca, str):
n_pca = n_pca.lower()
if n_pca != "auto":
@@ -173,7 +297,7 @@ def _parse_n_pca_threshold(self, data, n_pca, rank_threshold):
n_pca = None
elif n_pca is True: # notify that we're going to estimate rank.
n_pca = "auto"
_logger.info(
_logger.log_info(
"Estimating n_pca from matrix rank. "
"Supply an integer n_pca "
"for fixed amount."
@@ -207,7 +331,12 @@ def _parse_n_pca_threshold(self, data, n_pca, rank_threshold):
raise ValueError(
"rank_threshold must be positive float or 'auto'. "
)
return n_pca, rank_threshold
if pca_params is None:
pca_params = PCAParameters()
elif not isinstance(pca_params, PCAParameters):
raise ValueError("pca_params must be an instance of PCAParameters.")
return n_pca, rank_threshold, pca_params

def _check_data(self, data):
if len(data.shape) != 2:
@@ -237,7 +366,10 @@ def _reduce_data(self):
if self.n_pca is not None and (
self.n_pca == "auto" or self.n_pca < self.data.shape[1]
):
with _logger.task("PCA"):
with _logger.log_task("PCA"):
randomized_pca = partial(
randomized_svd_monkey, pca_params=self.pca_params
)
n_pca = self.data.shape[1] - 1 if self.n_pca == "auto" else self.n_pca
if sparse.issparse(self.data):
if (
@@ -251,7 +383,14 @@ def _reduce_data(self):
self.data_pca = PCA(
n_pca, svd_solver="randomized", random_state=self.random_state
)
self.data_pca.fit(self.data)
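# Both sklearn.decomposition._pca and sklearn.decomposition._truncated_svd
# import randomized_svd into their own namespaces, so each module-level
# reference is patched separately (module paths correspond to sklearn 1.0.x
# internals and may move in later releases).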
with patch(
"sklearn.decomposition._pca.randomized_svd", new=randomized_pca
), patch(
"sklearn.decomposition._truncated_svd.randomized_svd",
new=randomized_pca,
):
self.data_pca.fit(self.data)

if self.n_pca == "auto":
s = self.data_pca.singular_values_
smax = s.max()
@@ -269,7 +408,7 @@ def _reduce_data(self):
"maximum singular value {} "
"for the data matrix".format(threshold, smax)
)
_logger.info(
_logger.log_info(
"Using rank estimate of {} as n_pca".format(self.n_pca)
)
# reset the sklearn operator
@@ -292,8 +431,7 @@ def _reduce_data(self):
return data_nu

def get_params(self):
"""Get parameters from this object
"""
"""Get parameters from this object"""
return {"n_pca": self.n_pca, "random_state": self.random_state}

def set_params(self, **params):
@@ -469,7 +607,7 @@ def __init__(
anisotropy=0,
gamma=None,
initialize=True,
**kwargs
**kwargs,
):
if gamma is not None:
warnings.warn(
@@ -498,10 +636,10 @@ def __init__(
self.anisotropy = anisotropy

if initialize:
_logger.debug("Initializing kernel...")
_logger.log_debug("Initializing kernel...")
self.K
else:
_logger.debug("Not initializing kernel.")
_logger.log_debug("Not initializing kernel.")
super().__init__(**kwargs)

def _check_symmetrization(self, kernel_symm, theta):
@@ -556,18 +694,20 @@ def _build_kernel(self):
def symmetrize_kernel(self, K):
# symmetrize
if self.kernel_symm == "+":
_logger.debug("Using addition symmetrization.")
_logger.log_debug("Using addition symmetrization.")
K = (K + K.T) / 2
elif self.kernel_symm == "*":
_logger.debug("Using multiplication symmetrization.")
_logger.log_debug("Using multiplication symmetrization.")
K = K.multiply(K.T)
elif self.kernel_symm == "mnn":
_logger.debug("Using mnn symmetrization (theta = {}).".format(self.theta))
_logger.log_debug(
"Using mnn symmetrization (theta = {}).".format(self.theta)
)
K = self.theta * matrix.elementwise_minimum(K, K.T) + (
1 - self.theta
) * matrix.elementwise_maximum(K, K.T)
elif self.kernel_symm is None:
_logger.debug("Using no symmetrization.")
_logger.log_debug("Using no symmetrization.")
pass
else:
raise NotImplementedError
Expand All @@ -589,8 +729,7 @@ def apply_anisotropy(self, K):
return K

def get_params(self):
"""Get parameters from this object
"""
"""Get parameters from this object"""
return {
"kernel_symm": self.kernel_symm,
"theta": self.theta,
@@ -697,8 +836,7 @@ def diff_aff(self):

@property
def diff_op(self):
"""Synonym for P
"""
"""Synonym for P"""
return self.P

@property
Expand All @@ -719,8 +857,7 @@ def K(self):

@property
def kernel(self):
"""Synonym for K
"""
"""Synonym for K"""
return self.K

@property
@@ -850,10 +987,10 @@ def _check_shortest_path_distance(self, distance):
def _default_shortest_path_distance(self):
if not self.weighted:
distance = "data"
_logger.info("Using ambient data distances.")
_logger.log_info("Using ambient data distances.")
else:
distance = "affinity"
_logger.info("Using negative log affinity distances.")
_logger.log_info("Using negative log affinity distances.")
return distance

def shortest_path(self, method="auto", distance=None):
@@ -895,8 +1032,12 @@ def shortest_path(self, method="auto", distance=None):
np.sum((self.data_nu[D.row] - self.data_nu[D.col]) ** 2, axis=1)
)
elif distance == "affinity":
D = sparse.csr_matrix(self.K)
D.data = -1 * np.log(D.data)
D = -1 * np.where(
self.K != 0, np.log(np.where(self.K != 0, self.K, np.nan)), 0
)
else:
raise ValueError(
"Expected `distance` in ['constant', 'data', 'affinity']. "
@@ -1019,8 +1160,7 @@ def __init__(self, data, verbose=True, n_jobs=1, **kwargs):
super().__init__(data, **kwargs)

def get_params(self):
"""Get parameters from this object
"""
"""Get parameters from this object"""
params = Data.get_params(self)
params.update(BaseGraph.get_params(self))
return params