Skip to content

Commit

Permalink
distances: pylint
Browse files Browse the repository at this point in the history
  • Loading branch information
janezd committed Mar 15, 2019
1 parent 180a041 commit c12ec4a
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 22 deletions.
47 changes: 37 additions & 10 deletions Orange/distance/base.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# This module defines abstract base classes; derived classes are abstract, too
# pylint: disable=abstract-method

import numpy as np
import sklearn.metrics as skl_metrics

Expand All @@ -13,10 +16,11 @@
# TODO this *private* function is called from several widgets to prepare
# data for calling the below classes. After we (mostly) stopped relying
# on sklearn.metrics, this is (mostly) unnecessary

# Afterwards, also remove the following line:
# pylint: disable=redefined-outer-name
def _preprocess(table, impute=True):
"""Remove categorical attributes and impute missing values."""
if not len(table):
if not len(table): # this can be an array, pylint: disable=len-as-condition
return table
new_domain = Domain(
[a for a in table.domain.attributes if a.is_continuous],
Expand Down Expand Up @@ -120,6 +124,9 @@ class Distance:
impute (bool):
if `True` (default is `False`), nans in the computed distances
are replaced with zeros, and infs with very large numbers.
normalize (bool):
if `True`, columns are normalized before computation. This attribute
applies only if the distance supports normalization.
The capabilities of the metrics are described with class attributes.
Expand Down Expand Up @@ -150,6 +157,11 @@ class Distance:
supports_normalization = False
supports_missing = True

# Predefined here to silence pylint, which doesn't look into __new__
normalize = False
axis = 1
impute = False

def __new__(cls, e1=None, e2=None, axis=1, impute=False, **kwargs):
self = super().__new__(cls)
self.axis = axis
Expand All @@ -168,16 +180,25 @@ def __new__(cls, e1=None, e2=None, axis=1, impute=False, **kwargs):
or hasattr(e1, "is_sparse") and e1.is_sparse()):
fallback = getattr(self, "fallback", None)
if fallback is not None:
# pylint disable=not-callable
# pylint: disable=not-callable
return fallback(e1, e2, axis, impute)

# Magic constructor
model = self.fit(e1)
return model(e1, e2)

def fit(self, e1):
"""Abstract method returning :obj:`DistanceModel` fit to the data"""
pass
def fit(self, data):
"""
Abstract method returning :obj:`DistanceModel` fit to the data
Args:
data (Orange.data.Table, Orange.data.Instance, np.ndarray):
data for fitting the distance model
Returns:
model (DistanceModel)
"""
raise NotImplementedError

@staticmethod
def check_no_discrete(n_vals):
Expand Down Expand Up @@ -256,7 +277,7 @@ def compute_distances(self, x1, x2):
"""
Abstract method for computation of distances between rows or columns of
`x1`, or between rows of `x1` and `x2`. Do not call directly."""
pass
raise NotImplementedError


class FittedDistanceModel(DistanceModel):
Expand All @@ -268,10 +289,15 @@ class FittedDistanceModel(DistanceModel):
attributes (list of `Variable`): attributes on which the model was fit
discrete (np.ndarray): bool array indicating discrete attributes
continuous (np.ndarray): bool array indicating continuous attributes
normalize (bool):
if `True` (default is `False`) continuous columns are normalized
"""
def __init__(self, attributes, axis=1, impute=False):
super().__init__(axis, impute)
self.attributes = attributes
self.discrete = None
self.continuous = None
self.normalize = False

def __call__(self, e1, e2=None):
if self.attributes is not None and (
Expand Down Expand Up @@ -373,7 +399,7 @@ def fit_cols(self, attributes, x, n_vals):
x (np.ndarray): data
n_vals (np.ndarray): number of attribute values, 0 for continuous
"""
pass
raise NotImplementedError

def fit_rows(self, attributes, x, n_vals):
"""
Expand Down Expand Up @@ -440,7 +466,8 @@ def fit_rows(self, attributes, x, n_vals):
dist_missing2_cont[:curr_cont],
dist_missing_disc, dist_missing2_disc)

def get_discrete_stats(self, column, n_bins):
@staticmethod
def get_discrete_stats(column, n_bins):
"""
Return tables used for computing distance between missing discrete values.
Expand Down Expand Up @@ -474,7 +501,7 @@ def get_continuous_stats(self, column):
dist_missing2_cont (float): the value used for distance between two
missing values in column
"""
pass
raise NotImplementedError


# Fallbacks for distances in sparse data
Expand Down
37 changes: 25 additions & 12 deletions Orange/distance/distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from .base import (Distance, DistanceModel, FittedDistance, FittedDistanceModel,
SklDistance, _orange_to_numpy)


class EuclideanRowsModel(FittedDistanceModel):
"""
Model for computation of Euclidean distances between rows.
Expand All @@ -23,14 +24,14 @@ class EuclideanRowsModel(FittedDistanceModel):
"""
def __init__(self, attributes, impute, normalize,
continuous, discrete,
means, vars, dist_missing2_cont,
means, stdvars, dist_missing2_cont,
dist_missing_disc, dist_missing2_disc):
super().__init__(attributes, 1, impute)
self.normalize = normalize
self.continuous = continuous
self.discrete = discrete
self.means = means
self.vars = vars
self.vars = stdvars
self.dist_missing2_cont = dist_missing2_cont
self.dist_missing_disc = dist_missing_disc
self.dist_missing2_disc = dist_missing2_disc
Expand Down Expand Up @@ -91,11 +92,11 @@ class EuclideanColumnsModel(FittedDistanceModel):
Means are used as offsets for normalization, and two deviations are
used for scaling.
"""
def __init__(self, attributes, impute, normalize, means, vars):
def __init__(self, attributes, impute, normalize, means, stdvars):
super().__init__(attributes, 0, impute)
self.normalize = normalize
self.means = means
self.vars = vars
self.vars = stdvars

def compute_distances(self, x1, x2=None):
"""
Expand Down Expand Up @@ -134,6 +135,7 @@ class Euclidean(FittedDistance):
rows_model_type = EuclideanRowsModel

def __new__(cls, e1=None, e2=None, axis=1, impute=False, normalize=False):
# pylint: disable=arguments-differ
return super().__new__(cls, e1, e2, axis, impute, normalize=normalize)

def get_continuous_stats(self, column):
Expand All @@ -160,9 +162,8 @@ def fit_cols(self, attributes, x, n_vals):
for normalization and imputation.
"""
def nowarn(msg, cat, *args, **kwargs):
if cat is RuntimeWarning and (
msg == "Mean of empty slice"
or msg == "Degrees of freedom <= 0 for slice"):
if cat is RuntimeWarning and msg in (
"Mean of empty slice", "Degrees of freedom <= 0 for slice"):
if self.normalize:
raise ValueError("some columns have no defined values")
else:
Expand All @@ -174,11 +175,11 @@ def nowarn(msg, cat, *args, **kwargs):
orig_warn = warnings.warn
with patch("warnings.warn", new=nowarn):
means = np.nanmean(x, axis=0)
vars = np.nanvar(x, axis=0)
if self.normalize and not vars.all():
stdvars = np.nanvar(x, axis=0)
if self.normalize and not stdvars.all():
raise ValueError("some columns are constant")
return EuclideanColumnsModel(
attributes, self.impute, self.normalize, means, vars)
attributes, self.impute, self.normalize, means, stdvars)


class ManhattanRowsModel(FittedDistanceModel):
Expand Down Expand Up @@ -270,6 +271,7 @@ class Manhattan(FittedDistance):
rows_model_type = ManhattanRowsModel

def __new__(cls, e1=None, e2=None, axis=1, impute=False, normalize=False):
# pylint: disable=arguments-differ
return super().__new__(cls, e1, e2, axis, impute, normalize=normalize)

def get_continuous_stats(self, column):
Expand Down Expand Up @@ -337,6 +339,10 @@ def fit_rows(self, attributes, x, n_vals):

fit_cols = fit_rows

def get_continuous_stats(self, column):
# Implement an unneeded abstract method to silence pylint
return None

class CosineModel(FittedDistanceModel):
"""Model for computation of cosine distances across rows and columns.
All non-zero discrete values are treated as 1."""
Expand Down Expand Up @@ -402,6 +408,7 @@ def _compute_dense(self, x1, x2):
compute distances between rows without missing values, and a slower
loop for those with missing values.
"""
# view is false positive, pylint: disable=no-member
nonzeros1 = np.not_equal(x1, 0).view(np.int8)
if self.axis == 1:
nans1 = _distance.any_nan_row(x1)
Expand All @@ -421,7 +428,8 @@ def _compute_dense(self, x1, x2):
return _distance.jaccard_cols(
nonzeros1, x1, nans1, self.ps)

def _compute_sparse(self, x1, x2=None):
@staticmethod
def _compute_sparse(x1, x2=None):
symmetric = x2 is None
if symmetric:
x2 = x1
Expand Down Expand Up @@ -462,6 +470,10 @@ def fit_rows(self, attributes, x, n_vals):

fit_cols = fit_rows

def get_continuous_stats(self, column):
# Implement an unneeded abstract method to silence pylint
return None


class CorrelationDistanceModel(DistanceModel):
"""Helper class for normal and absolute Pearson and Spearman correlation"""
Expand Down Expand Up @@ -561,7 +573,7 @@ def _corrcoef2(a, b, axis=0):
numpy.corrcoef
"""
a, b = np.atleast_2d(a, b)
if not (axis == 0 or axis == 1):
if axis not in (0, 1):
raise ValueError("Invalid axis {} (only 0 or 1 accepted)".format(axis))

mean_a = np.mean(a, axis=axis, keepdims=True)
Expand Down Expand Up @@ -597,6 +609,7 @@ def _corrcoef2(a, b, axis=0):


class CorrelationDistance(Distance):
# pylint: disable=abstract-method
supports_missing = False


Expand Down

0 comments on commit c12ec4a

Please sign in to comment.