Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FIX] Continuize: prevent crashing - column with equal and NaN values #2144

Merged
merged 3 commits into from
Apr 21, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 21 additions & 14 deletions Orange/widgets/data/owcontinuize.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
from functools import reduce

import numpy as np

from AnyQt import QtWidgets
from AnyQt.QtCore import Qt

import Orange.data
from Orange.util import Reprable
from Orange.statistics import distribution
from Orange.preprocess import Continuize, Normalize
from Orange.preprocess.transformation import \
Identity, Indicator, Indicator1, Normalizer
from Orange.data.table import Table
from Orange.widgets import gui, widget
from Orange.widgets.settings import Setting
Expand Down Expand Up @@ -138,12 +144,6 @@ def send_report(self):
("Value range", self.value_ranges[self.zero_based])])


from Orange.preprocess.transformation import \
Identity, Indicator, Indicator1, Normalizer

from functools import reduce


class WeightedIndicator(Indicator):
def __init__(self, variable, value, weight=1.0):
super().__init__(variable, value)
Expand All @@ -156,7 +156,7 @@ def transform(self, c):
return t


class WeightedIndicator_1(Indicator1):
class WeightedIndicator1(Indicator1):
def __init__(self, variable, value, weight=1.0):
super().__init__(variable, value)
self.weight = weight
Expand All @@ -176,7 +176,7 @@ def make_indicator_var(source, value_ind, weight=None, zero_based=True):
elif weight is None:
indicator = Indicator1(source, value=value_ind)
else:
indicator = WeightedIndicator_1(source, value=value_ind, weight=weight)
indicator = WeightedIndicator1(source, value=value_ind, weight=weight)
return Orange.data.ContinuousVariable(
"{}={}".format(source.name, source.values[value_ind]),
compute_value=indicator
Expand Down Expand Up @@ -279,7 +279,7 @@ def continuize_var(var,
elif multinomial_treatment == Continuize.AsOrdinal:
return [ordinal_to_continuous(var)]
elif multinomial_treatment == Continuize.AsNormalizedOrdinal:
return [ordinal_to_normalized_continuous(var, zero_based)]
return [ordinal_to_norm_continuous(var, zero_based)]
elif multinomial_treatment == Continuize.Indicators:
return one_hot_coding(var, zero_based)
elif multinomial_treatment == Continuize.FirstAsBase or \
Expand Down Expand Up @@ -320,7 +320,7 @@ def ordinal_to_continuous(var):
compute_value=Identity(var))


def ordinal_to_normalized_continuous(var, zero_based=True):
def ordinal_to_norm_continuous(var, zero_based=True):
n_values = len(var.values)
if zero_based:
return normalized_var(var, 0, 1 / (n_values - 1))
Expand All @@ -330,8 +330,11 @@ def ordinal_to_normalized_continuous(var, zero_based=True):

def normalize_by_span(var, data_or_dist, zero_based=True):
dist = _ensure_dist(var, data_or_dist)
v_max, v_min = dist.max(), dist.min()
span = v_max - v_min
if dist.shape[1] > 0:
v_max, v_min = dist.max(), dist.min()
else:
v_max, v_min = 0, 0
span = (v_max - v_min)
if span < 1e-15:
span = 1

Expand All @@ -343,7 +346,11 @@ def normalize_by_span(var, data_or_dist, zero_based=True):

def normalize_by_sd(var, data_or_dist):
    """
    Return `var` normalized by the mean and standard deviation of its
    distribution.

    The distribution is obtained with `_ensure_dist(var, data_or_dist)`.
    Its mean and standard deviation are read only when the distribution has
    at least one column; otherwise a mean of 0 and a deviation of 1 are
    used, so that an empty distribution does not crash the computation.
    The result is whatever `normalized_var(var, mean, 1 / sd)` returns.
    """
    dist = _ensure_dist(var, data_or_dist)
    if dist.shape[1] > 0:
        mean, sd = dist.mean(), dist.standard_deviation()
    else:
        # Nothing to compute statistics from (presumably a column in which
        # every value is missing -- confirm against _ensure_dist).
        mean, sd = 0, 1
    # A deviation that is (almost) zero would blow up the 1 / sd factor; the
    # comparison is also False for NaN, so that case falls back to 1 as well.
    sd = sd if sd > 1e-10 else 1
    return normalized_var(var, mean, 1 / sd)


Expand All @@ -365,7 +372,7 @@ def __call__(self, data):
domain = data.domain

if (treat == Continuize.ReportError and
any(var.is_discrete and len(var.values) > 2 for var in domain)):
any(var.is_discrete and len(var.values) > 2 for var in domain)):
raise ValueError("Domain has multinomial attributes")

newdomain = continuize_domain(
Expand Down
50 changes: 50 additions & 0 deletions Orange/widgets/data/tests/test_owcontinuize.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,53 @@ def test_empty_data(self):
widget.unconditional_commit()
imp_data = self.get_output("Data")
self.assertIsNone(imp_data)

def test_one_column_equal_values(self):
    """
    The widget must not crash when its only column holds a single
    repeated value and normalization by standard deviation is selected.
    GH-2144
    """
    constant_column = Table("iris")[:, 1]
    constant_column[:] = 42.0
    self.send_signal("Data", constant_column)
    # Normalize.NormalizeBySD
    self.widget.continuous_treatment = 2
    self.widget.unconditional_commit()

def test_one_column_nan_values_normalize_sd(self):
    """
    No crash on a column with NaN values when normalization by standard
    deviation is selected (a different failure than the equal-values
    case).

    Two inputs are committed: one where every value of column 2 is NaN
    and one where only the value at row 1 of column 2 is NaN.
    GH-2144
    """
    # np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
    table = Table("iris")
    table[:, 2] = np.nan
    self.send_signal("Data", table)
    # Normalize.NormalizeBySD
    self.widget.continuous_treatment = 2
    self.widget.unconditional_commit()

    # A single missing value must be handled as well.
    table = Table("iris")
    table[1, 2] = np.nan
    self.send_signal("Data", table)
    self.widget.unconditional_commit()


def test_one_column_nan_values_normalize_span(self):
    """
    No crash on a column with NaN values when normalization by span is
    selected.

    Every value of column 2 is set to NaN before the data is sent to the
    widget.
    GH-2144
    """
    # np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
    table = Table("iris")
    table[:, 2] = np.nan
    self.send_signal("Data", table)
    # Normalize.NormalizeBySpan
    self.widget.continuous_treatment = 1
    self.widget.unconditional_commit()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also here: try it with a single NaN as well, which should not do any harm.

# Single missing value at row 1 of column 2; np.nan is used instead of the
# np.NaN alias, which NumPy 2.0 removed.
table = Table("iris")
table[1, 2] = np.nan
self.send_signal("Data", table)
self.widget.unconditional_commit()