Skip to content

Commit

Permalink
Merge pull request #4431 from AndrejaKovacic/unique-names
Browse files Browse the repository at this point in the history
[FIX] Ensure unique var names in file
  • Loading branch information
AndrejaKovacic authored Feb 28, 2020
2 parents 34063e6 + e48a32f commit 1045e72
Show file tree
Hide file tree
Showing 6 changed files with 258 additions and 53 deletions.
31 changes: 30 additions & 1 deletion Orange/data/tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,39 @@ def test_get_unique_names_from_duplicates(self):
["x (2)", "x (3)", "x (1)"])
self.assertEqual(
get_unique_names_duplicates(["x (2)", "x", "x", "x (2)", "x (3)"]),
["x (2) (1)", "x (1)", "x (4)", "x (2) (2)", "x (3)"])
["x (2) (1)", "x (4)", "x (5)", "x (2) (2)", "x (3)"])
self.assertEqual(
get_unique_names_duplicates(["iris", "iris", "iris (1)"]),
["iris (2)", "iris (3)", "iris (1)"])

self.assertEqual(
get_unique_names_duplicates(["foo", "bar", "baz"], return_duplicated=True),
(["foo", "bar", "baz"], []))
self.assertEqual(
get_unique_names_duplicates(["foo", "bar", "baz", "bar"], return_duplicated=True),
(["foo", "bar (1)", "baz", "bar (2)"], ["bar"]))
self.assertEqual(
get_unique_names_duplicates(["x", "x", "x (1)"], return_duplicated=True),
(["x (2)", "x (3)", "x (1)"], ["x"]))
self.assertEqual(
get_unique_names_duplicates(["x (2)", "x", "x", "x (2)", "x (3)"], return_duplicated=True),
(["x (2) (1)", "x (4)", "x (5)", "x (2) (2)", "x (3)"], ["x (2)", "x"]))
self.assertEqual(
get_unique_names_duplicates(["x", "", "", None, None, "x"]),
["x (1)", "", "", None, None, "x (2)"])
self.assertEqual(
get_unique_names_duplicates(["iris", "iris", "iris (1)", "iris (2)"], return_duplicated=True),
(["iris (3)", "iris (4)", "iris (1)", "iris (2)"], ["iris"]))

self.assertEqual(
get_unique_names_duplicates(["iris (1) (1)", "iris (1)", "iris (1)"]),
["iris (1) (1)", "iris (1) (2)", "iris (1) (3)"]
)

self.assertEqual(
get_unique_names_duplicates(["iris (1) (1)", "iris (1)", "iris (1)", "iris", "iris"]),
["iris (1) (1)", "iris (1) (2)", "iris (1) (3)", "iris (2)", "iris (3)"]
)

def test_get_unique_names_domain(self):
(attrs, classes, metas), renamed = \
Expand Down
34 changes: 15 additions & 19 deletions Orange/data/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
Data-manipulation utilities.
"""
import re
from collections import Counter, defaultdict
from itertools import chain
from collections import Counter
from itertools import chain, count
from typing import Callable

import numpy as np
Expand Down Expand Up @@ -155,8 +155,8 @@ def get_indices(names, name):
:param name: str
:return: list of indices
"""
return [int(a.group(2)) for x in names
for a in re.finditer(RE_FIND_INDEX.format(name), x)]
return [int(a.group(2)) for x in filter(None, names)
for a in re.finditer(RE_FIND_INDEX.format(re.escape(name)), x)]


def get_unique_names(names, proposed):
Expand Down Expand Up @@ -203,26 +203,22 @@ def get_unique_names(names, proposed):
return [f"{name} ({max_index})" for name in proposed]


def get_unique_names_duplicates(proposed: list) -> list:
def get_unique_names_duplicates(proposed: list, return_duplicated=False) -> list:
"""
Returns list of unique names. If a name is duplicated, the
function appends the smallest available index in parentheses.
function appends the next available index in parentheses.
For example, a proposed list of names `x`, `x` and `x (2)`
results in `x (1)`, `x (3)`, `x (2)`.
results in `x (3)`, `x (4)`, `x (2)`.
"""
counter = Counter(proposed)
index = defaultdict(int)
names = []
for name in proposed:
if name and counter[name] > 1:
unique_name = name
while unique_name in counter:
index[name] += 1
unique_name = f"{name} ({index[name]})"
name = unique_name
names.append(name)
return names
indices = {name: count(max(get_indices(proposed, name), default=0) + 1)
for name, cnt in Counter(proposed).items()
if name and cnt > 1}
new_names = [f"{name} ({next(indices[name])})" if name in indices else name
for name in proposed]
if return_duplicated:
return new_names, list(indices)
return new_names


def get_unique_names_domain(attributes, class_vars=(), metas=()):
Expand Down
27 changes: 17 additions & 10 deletions Orange/widgets/data/owfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from Orange.widgets.utils.filedialogs import RecentPathsWComboMixin, \
open_filename_dialog
from Orange.widgets.utils.widgetpreview import WidgetPreview
from Orange.widgets.widget import Output
from Orange.widgets.widget import Output, Msg

# Backward compatibility: class RecentPath used to be defined in this module,
# and it is used in saved (pickled) settings. It must be imported into the
Expand Down Expand Up @@ -121,17 +121,19 @@ class Outputs:
domain_editor = SettingProvider(DomainEditor)

class Warning(widget.OWWidget.Warning):
file_too_big = widget.Msg("The file is too large to load automatically."
" Press Reload to load.")
load_warning = widget.Msg("Read warning:\n{}")
performance_warning = widget.Msg(
file_too_big = Msg("The file is too large to load automatically."
" Press Reload to load.")
load_warning = Msg("Read warning:\n{}")
performance_warning = Msg(
"Categorical variables with >100 values may decrease performance.")
renamed_vars = Msg("Some variables have been renamed "
"to avoid duplicates.\n{}")

class Error(widget.OWWidget.Error):
file_not_found = widget.Msg("File not found.")
missing_reader = widget.Msg("Missing reader.")
sheet_error = widget.Msg("Error listing available sheets.")
unknown = widget.Msg("Read error:\n{}")
file_not_found = Msg("File not found.")
missing_reader = Msg("Missing reader.")
sheet_error = Msg("Error listing available sheets.")
unknown = Msg("Read error:\n{}")

class NoFileSelected:
pass
Expand Down Expand Up @@ -478,10 +480,13 @@ def _inspect_discrete_variables(self, domain):

def apply_domain_edit(self):
self.Warning.performance_warning.clear()
self.Warning.renamed_vars.clear()
if self.data is None:
table = None
else:
domain, cols = self.domain_editor.get_domain(self.data.domain, self.data)
domain, cols, renamed = \
self.domain_editor.get_domain(self.data.domain, self.data,
deduplicate=True)
if not (domain.variables or domain.metas):
table = None
elif domain is self.data.domain:
Expand All @@ -493,6 +498,8 @@ def apply_domain_edit(self):
table.ids = np.array(self.data.ids)
table.attributes = getattr(self.data, 'attributes', {})
self._inspect_discrete_variables(domain)
if renamed:
self.Warning.renamed_vars(f"Renamed: {', '.join(renamed)}")

self.Outputs.data.send(table)
self.apply_button.setEnabled(False)
Expand Down
29 changes: 29 additions & 0 deletions Orange/widgets/data/tests/test_owfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,22 @@ def test_domain_changes_are_stored(self):
data = self.get_output(self.widget.Outputs.data)
self.assertIsInstance(data.domain["iris"], StringVariable)

def test_rename_duplicates(self):
self.open_dataset("iris")

idx = self.widget.domain_editor.model().createIndex(3, 0)
self.assertFalse(self.widget.Warning.renamed_vars.is_shown())
self.widget.domain_editor.model().setData(idx, "iris", Qt.EditRole)
self.widget.apply_button.click()
data = self.get_output(self.widget.Outputs.data)
self.assertIn("iris (1)", data.domain)
self.assertIn("iris (2)", data.domain)
self.assertTrue(self.widget.Warning.renamed_vars.is_shown())

self.widget.domain_editor.model().setData(idx, "different iris", Qt.EditRole)
self.widget.apply_button.click()
self.assertFalse(self.widget.Warning.renamed_vars.is_shown())

def test_variable_name_change(self):
"""
Test whether the name of the variable is changed correctly by
Expand All @@ -155,6 +171,12 @@ def test_variable_name_change(self):
data = self.get_output(self.widget.Outputs.data)
self.assertIn("a", data.domain)

idx = self.widget.domain_editor.model().createIndex(3, 0)
self.widget.domain_editor.model().setData(idx, "d", Qt.EditRole)
self.widget.apply_button.click()
data = self.get_output(self.widget.Outputs.data)
self.assertIn("d", data.domain)

# rename and change to text
idx = self.widget.domain_editor.model().createIndex(4, 0)
self.widget.domain_editor.model().setData(idx, "b", Qt.EditRole)
Expand Down Expand Up @@ -250,6 +272,13 @@ def test_check_column_noname(self):
self.widget.domain_editor.model().setData(idx, "", Qt.EditRole)
self.assertEqual(self.widget.domain_editor.model().data(idx, Qt.DisplayRole), temp)

def test_invalid_role_mode(self):
self.open_dataset("iris")
model = self.widget.domain_editor.model()
idx = model.createIndex(1, 0)
self.assertFalse(model.setData(idx, Qt.StatusTipRole, ""))
self.assertIsNone(model.data(idx, Qt.StatusTipRole))

def test_context_match_includes_variable_values(self):
file1 = """\
var
Expand Down
75 changes: 52 additions & 23 deletions Orange/widgets/utils/domaineditor.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from Orange.data import DiscreteVariable, ContinuousVariable, StringVariable, \
TimeVariable, Domain
from Orange.data.util import get_unique_names_duplicates
from Orange.statistics.util import unique
from Orange.widgets import gui
from Orange.widgets.gui import HorizontalGridDelegate
Expand Down Expand Up @@ -61,13 +62,14 @@ def set_variables(self, variables):
def rowCount(self, parent):
return 0 if parent.isValid() else len(self.variables)

def columnCount(self, parent):
@staticmethod
def columnCount(parent):
return 0 if parent.isValid() else Column.not_valid

def data(self, index, role):
row, col = index.row(), index.column()
val = self.variables[row][col]
if role == Qt.DisplayRole or role == Qt.EditRole:
if role in (Qt.DisplayRole, Qt.EditRole):
if col == Column.tpe:
return self.type2name[val]
if col == Column.place:
Expand All @@ -90,8 +92,9 @@ def data(self, index, role):
font = QFont()
font.setBold(True)
return font
return None

def setData(self, index, value, role):
def setData(self, index, value, role=Qt.EditRole):
row, col = index.row(), index.column()
row_data = self.variables[row]
if role == Qt.EditRole:
Expand All @@ -110,6 +113,7 @@ def setData(self, index, value, role):
# Settings may change background colors
self.dataChanged.emit(index.sibling(row, 0), index.sibling(row, 3))
return True
return False

def headerData(self, i, orientation, role=Qt.DisplayRole):
if orientation == Qt.Horizontal and role == Qt.DisplayRole and i < 4:
Expand All @@ -130,7 +134,7 @@ def __init__(self, view, items):
self.view = view
self.items = items

def createEditor(self, parent, option, index):
def createEditor(self, parent, _option, index):
# This ugly hack closes the combo when the user selects an item
class Combo(QComboBox):
def __init__(self, *args):
Expand All @@ -145,6 +149,8 @@ def showPopup(self, *args):
super().showPopup(*args)
self.popup_shown = True

# Here, we need `self` from the closure
# pylint: disable=no-self-argument,attribute-defined-outside-init
def hidePopup(me):
if me.popup_shown:
self.view.model().setData(
Expand Down Expand Up @@ -250,21 +256,27 @@ def _merge(cols, force_dense=False):
sparse_cols = [c if sp.issparse(c) else sp.csc_matrix(c) for c in cols]
return sp.hstack(sparse_cols).tocsr()

def get_domain(self, domain, data):
"""Create domain (and dataset) from changes made in the widget.
Parameters
----------
domain : old domain
data : source data
def get_domain(self, domain, data, deduplicate=False):
"""
Create domain (and dataset) from changes made in the widget.
Returns
-------
(new_domain, [attribute_columns, class_var_columns, meta_columns])
Args:
domain (Domain): original domain
data (Table): original data
deduplicate (bool): if True, variable names are deduplicated and
the result contains an additional list with names of renamed
variables
Returns:
(new_domain, [attribute_columns, class_var_columns, meta_columns])
or
(new_domain, [attribute_columns, class_var_columns, meta_columns], renamed)
"""
# Allow type-checking with type() instead of isinstance() for exact comparison
# pylint: disable=unidiomatic-typecheck

variables = self.model().variables
places = [[], [], []] # attributes, class_vars, metas
cols = [[], [], []] # Xcols, Ycols, Mcols
Expand All @@ -283,8 +295,17 @@ def numbers_are_round(var, col_data):
chain(((at, Place.feature) for at in domain.attributes),
((cl, Place.class_var) for cl in domain.class_vars),
((mt, Place.meta) for mt in domain.metas)))):
return domain, [data.X, data.Y, data.metas]
if deduplicate:
return domain, [data.X, data.Y, data.metas], []
else:
return domain, [data.X, data.Y, data.metas]

relevant_names = [var[0] for var in variables if var[2] != Place.skip]
if deduplicate:
renamed_iter = iter(get_unique_names_duplicates(relevant_names))
else:
renamed_iter = iter(relevant_names)
renamed = []
for (name, tpe, place, _, may_be_numeric), (orig_var, orig_plc) in \
zip(variables,
chain([(at, Place.feature) for at in domain.attributes],
Expand All @@ -293,24 +314,28 @@ def numbers_are_round(var, col_data):
if place == Place.skip:
continue

new_name = next(renamed_iter)
if new_name != name and name not in renamed:
renamed.append(name)

col_data = self._get_column(data, orig_var, orig_plc)
is_sparse = sp.issparse(col_data)

if name == orig_var.name and tpe == type(orig_var):
if new_name == orig_var.name and tpe == type(orig_var):
var = orig_var
elif tpe == type(orig_var):
var = orig_var.copy(name=name)
var = orig_var.copy(name=new_name)
elif tpe == DiscreteVariable:
values = list(str(i) for i in unique(col_data) if not self._is_missing(i))
round_numbers = numbers_are_round(orig_var, col_data)
col_data = [np.nan if self._is_missing(x) else values.index(str(x))
for x in self._iter_vals(col_data)]
if round_numbers:
values = [str(int(float(v))) for v in values]
var = tpe(name, values)
var = tpe(new_name, values)
col_data = self._to_column(col_data, is_sparse)
elif tpe == StringVariable:
var = tpe.make(name)
var = tpe.make(new_name)
if type(orig_var) in [DiscreteVariable, TimeVariable]:
col_data = [orig_var.repr_val(x) if not np.isnan(x) else ""
for x in self._iter_vals(col_data)]
Expand All @@ -324,25 +349,29 @@ def numbers_are_round(var, col_data):
# in metas which are transformed to dense below
col_data = self._to_column(col_data, False, dtype=object)
elif tpe == ContinuousVariable and type(orig_var) == DiscreteVariable:
var = tpe.make(name)
var = tpe.make(new_name)
if may_be_numeric:
col_data = [np.nan if self._is_missing(x) else float(orig_var.values[int(x)])
for x in self._iter_vals(col_data)]
col_data = self._to_column(col_data, is_sparse)
else:
var = tpe(name)
var = tpe(new_name)
places[place].append(var)
cols[place].append(col_data)

# merge columns for X, Y and metas
feats = cols[Place.feature]
X = self._merge(feats) if len(feats) else np.empty((len(data), 0))
X = self._merge(feats) if feats else np.empty((len(data), 0))
Y = self._merge(cols[Place.class_var], force_dense=True)
m = self._merge(cols[Place.meta], force_dense=True)
domain = Domain(*places)
return domain, [X, Y, m]
if deduplicate:
return domain, [X, Y, m], renamed
else:
return domain, [X, Y, m]

def _get_column(self, data, source_var, source_place):
@staticmethod
def _get_column(data, source_var, source_place):
""" Extract column from data and preserve sparsity. """
if source_place == Place.meta:
col_data = data[:, source_var].metas
Expand Down
Loading

0 comments on commit 1045e72

Please sign in to comment.