Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pylint distances #3674

Merged
merged 1 commit into from
Mar 15, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 37 additions & 10 deletions Orange/distance/base.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# This module defines abstract base classes; derived classes are abstract, too
# pylint: disable=abstract-method

import numpy as np
import sklearn.metrics as skl_metrics

Expand All @@ -13,10 +16,11 @@
# TODO this *private* function is called from several widgets to prepare
# data for calling the below classes. After we (mostly) stopped relying
# on sklearn.metrics, this is (mostly) unnecessary

# Afterwards, also remove the following line:
# pylint: disable=redefined-outer-name
def _preprocess(table, impute=True):
"""Remove categorical attributes and impute missing values."""
if not len(table):
if not len(table): # this can be an array, pylint: disable=len-as-condition
return table
new_domain = Domain(
[a for a in table.domain.attributes if a.is_continuous],
Expand Down Expand Up @@ -120,6 +124,9 @@ class Distance:
impute (bool):
if `True` (default is `False`), nans in the computed distances
are replaced with zeros, and infs with very large numbers.
normalize (bool):
if `True`, columns are normalized before computation. This attribute
applies only if the distance supports normalization.

The capabilities of the metrics are described with class attributes.

Expand Down Expand Up @@ -150,6 +157,11 @@ class Distance:
supports_normalization = False
supports_missing = True

# Predefined here to silence pylint, which doesn't look into __new__
normalize = False
axis = 1
impute = False

def __new__(cls, e1=None, e2=None, axis=1, impute=False, **kwargs):
self = super().__new__(cls)
self.axis = axis
Expand All @@ -168,16 +180,25 @@ def __new__(cls, e1=None, e2=None, axis=1, impute=False, **kwargs):
or hasattr(e1, "is_sparse") and e1.is_sparse()):
fallback = getattr(self, "fallback", None)
if fallback is not None:
# pylint disable=not-callable
# pylint: disable=not-callable
return fallback(e1, e2, axis, impute)

# Magic constructor
model = self.fit(e1)
return model(e1, e2)

def fit(self, e1):
"""Abstract method returning :obj:`DistanceModel` fit to the data"""
pass
def fit(self, data):
"""
Abstract method returning :obj:`DistanceModel` fit to the data

Args:
data (Orange.data.Table, Orange.data.Instance, np.ndarray):
data for fitting the distance model

Returns:
model (DistanceModel)
"""
raise NotImplementedError

@staticmethod
def check_no_discrete(n_vals):
Expand Down Expand Up @@ -256,7 +277,7 @@ def compute_distances(self, x1, x2):
"""
Abstract method for computation of distances between rows or columns of
`x1`, or between rows of `x1` and `x2`. Do not call directly."""
pass
raise NotImplementedError


class FittedDistanceModel(DistanceModel):
Expand All @@ -268,10 +289,15 @@ class FittedDistanceModel(DistanceModel):
attributes (list of `Variable`): attributes on which the model was fit
discrete (np.ndarray): bool array indicating discrete attributes
continuous (np.ndarray): bool array indicating continuous attributes
normalize (bool):
if `True` (default is `False`) continuous columns are normalized
"""
def __init__(self, attributes, axis=1, impute=False):
super().__init__(axis, impute)
self.attributes = attributes
self.discrete = None
self.continuous = None
self.normalize = False

def __call__(self, e1, e2=None):
if self.attributes is not None and (
Expand Down Expand Up @@ -373,7 +399,7 @@ def fit_cols(self, attributes, x, n_vals):
x (np.ndarray): data
n_vals (np.ndarray): number of attribute values, 0 for continuous
"""
pass
raise NotImplementedError

def fit_rows(self, attributes, x, n_vals):
"""
Expand Down Expand Up @@ -440,7 +466,8 @@ def fit_rows(self, attributes, x, n_vals):
dist_missing2_cont[:curr_cont],
dist_missing_disc, dist_missing2_disc)

def get_discrete_stats(self, column, n_bins):
@staticmethod
def get_discrete_stats(column, n_bins):
"""
Return tables used for computing distances between missing discrete values.

Expand Down Expand Up @@ -474,7 +501,7 @@ def get_continuous_stats(self, column):
dist_missing2_cont (float): the value used for distance between two
missing values in column
"""
pass
raise NotImplementedError


# Fallbacks for distances in sparse data
Expand Down
37 changes: 25 additions & 12 deletions Orange/distance/distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from .base import (Distance, DistanceModel, FittedDistance, FittedDistanceModel,
SklDistance, _orange_to_numpy)


class EuclideanRowsModel(FittedDistanceModel):
"""
Model for computation of Euclidean distances between rows.
Expand All @@ -23,14 +24,14 @@ class EuclideanRowsModel(FittedDistanceModel):
"""
def __init__(self, attributes, impute, normalize,
continuous, discrete,
means, vars, dist_missing2_cont,
means, stdvars, dist_missing2_cont,
dist_missing_disc, dist_missing2_disc):
super().__init__(attributes, 1, impute)
self.normalize = normalize
self.continuous = continuous
self.discrete = discrete
self.means = means
self.vars = vars
self.vars = stdvars
self.dist_missing2_cont = dist_missing2_cont
self.dist_missing_disc = dist_missing_disc
self.dist_missing2_disc = dist_missing2_disc
Expand Down Expand Up @@ -91,11 +92,11 @@ class EuclideanColumnsModel(FittedDistanceModel):
Means are used as offsets for normalization, and two deviations are
used for scaling.
"""
def __init__(self, attributes, impute, normalize, means, vars):
def __init__(self, attributes, impute, normalize, means, stdvars):
super().__init__(attributes, 0, impute)
self.normalize = normalize
self.means = means
self.vars = vars
self.vars = stdvars

def compute_distances(self, x1, x2=None):
"""
Expand Down Expand Up @@ -134,6 +135,7 @@ class Euclidean(FittedDistance):
rows_model_type = EuclideanRowsModel

def __new__(cls, e1=None, e2=None, axis=1, impute=False, normalize=False):
# pylint: disable=arguments-differ
return super().__new__(cls, e1, e2, axis, impute, normalize=normalize)

def get_continuous_stats(self, column):
Expand All @@ -160,9 +162,8 @@ def fit_cols(self, attributes, x, n_vals):
for normalization and imputation.
"""
def nowarn(msg, cat, *args, **kwargs):
if cat is RuntimeWarning and (
msg == "Mean of empty slice"
or msg == "Degrees of freedom <= 0 for slice"):
if cat is RuntimeWarning and msg in (
"Mean of empty slice", "Degrees of freedom <= 0 for slice"):
if self.normalize:
raise ValueError("some columns have no defined values")
else:
Expand All @@ -174,11 +175,11 @@ def nowarn(msg, cat, *args, **kwargs):
orig_warn = warnings.warn
with patch("warnings.warn", new=nowarn):
means = np.nanmean(x, axis=0)
vars = np.nanvar(x, axis=0)
if self.normalize and not vars.all():
stdvars = np.nanvar(x, axis=0)
if self.normalize and not stdvars.all():
raise ValueError("some columns are constant")
return EuclideanColumnsModel(
attributes, self.impute, self.normalize, means, vars)
attributes, self.impute, self.normalize, means, stdvars)


class ManhattanRowsModel(FittedDistanceModel):
Expand Down Expand Up @@ -270,6 +271,7 @@ class Manhattan(FittedDistance):
rows_model_type = ManhattanRowsModel

def __new__(cls, e1=None, e2=None, axis=1, impute=False, normalize=False):
# pylint: disable=arguments-differ
return super().__new__(cls, e1, e2, axis, impute, normalize=normalize)

def get_continuous_stats(self, column):
Expand Down Expand Up @@ -337,6 +339,10 @@ def fit_rows(self, attributes, x, n_vals):

fit_cols = fit_rows

def get_continuous_stats(self, column):
# Implement an unneeded abstract method to silence pylint
return None

class CosineModel(FittedDistanceModel):
"""Model for computation of cosine distances across rows and columns.
All non-zero discrete values are treated as 1."""
Expand Down Expand Up @@ -402,6 +408,7 @@ def _compute_dense(self, x1, x2):
compute distances between rows without missing values, and a slower
loop for those with missing values.
"""
# view is false positive, pylint: disable=no-member
nonzeros1 = np.not_equal(x1, 0).view(np.int8)
if self.axis == 1:
nans1 = _distance.any_nan_row(x1)
Expand All @@ -421,7 +428,8 @@ def _compute_dense(self, x1, x2):
return _distance.jaccard_cols(
nonzeros1, x1, nans1, self.ps)

def _compute_sparse(self, x1, x2=None):
@staticmethod
def _compute_sparse(x1, x2=None):
symmetric = x2 is None
if symmetric:
x2 = x1
Expand Down Expand Up @@ -462,6 +470,10 @@ def fit_rows(self, attributes, x, n_vals):

fit_cols = fit_rows

def get_continuous_stats(self, column):
# Implement an unneeded abstract method to silence pylint
return None


class CorrelationDistanceModel(DistanceModel):
"""Helper class for normal and absolute Pearson and Spearman correlation"""
Expand Down Expand Up @@ -561,7 +573,7 @@ def _corrcoef2(a, b, axis=0):
numpy.corrcoef
"""
a, b = np.atleast_2d(a, b)
if not (axis == 0 or axis == 1):
if axis not in (0, 1):
raise ValueError("Invalid axis {} (only 0 or 1 accepted)".format(axis))

mean_a = np.mean(a, axis=axis, keepdims=True)
Expand Down Expand Up @@ -597,6 +609,7 @@ def _corrcoef2(a, b, axis=0):


class CorrelationDistance(Distance):
# pylint: disable=abstract-method
supports_missing = False


Expand Down