diff --git a/Orange/preprocess/discretize.py b/Orange/preprocess/discretize.py index ba84b1ec424..65ed508da13 100644 --- a/Orange/preprocess/discretize.py +++ b/Orange/preprocess/discretize.py @@ -8,7 +8,7 @@ import numpy as np import scipy.sparse as sp -from Orange.data import DiscreteVariable, Domain +from Orange.data import DiscreteVariable, Domain, TimeVariable, Table from Orange.data.sql.table import SqlTable from Orange.statistics import distribution, contingency, util as ut from Orange.statistics.basic_stats import BasicStats @@ -58,13 +58,17 @@ def _fmt_interval(low, high, formatter): return f"{formatter(low)} - {formatter(high)}" @classmethod - def create_discretized_var(cls, var, points): - def fmt(val): - sval = var.str_val(val) - # For decimal numbers, remove trailing 0's and . if no decimals left - if re.match(r"^\d+\.\d+", sval): - return sval.rstrip("0").rstrip(".") - return sval + def create_discretized_var(cls, var, points, ndigits=None): + if ndigits is None: + def fmt(val): + sval = var.str_val(val) + # For decimal numbers, remove trailing 0's and . 
if no decimals left + if re.match(r"^\d+\.\d+", sval): + return sval.rstrip("0").rstrip(".") + return sval + else: + def fmt(val): + return f"{val:.{ndigits}f}" lpoints = list(points) if lpoints: @@ -96,8 +100,8 @@ def __init__(self, var, points): self.points = points def __call__(self): - return 'width_bucket(%s, ARRAY%s::double precision[])' % ( - self.var.to_sql(), str(self.points)) + return f'width_bucket({self.var.to_sql()}, ' \ + f'ARRAY{str(self.points)}::double precision[])' class SingleValueSql: @@ -163,30 +167,174 @@ def __init__(self, n=4): self.n = n # noinspection PyProtectedMember - def __call__(self, data, attribute, fixed=None): + def __call__(self, data: Table, attribute, fixed=None): if fixed: - min, max = fixed[attribute.name] - points = self._split_eq_width(min, max) + mn, mx = fixed[attribute.name] + points = self._split_eq_width(mn, mx) else: if type(data) == SqlTable: stats = BasicStats(data, attribute) points = self._split_eq_width(stats.min, stats.max) else: - values = data[:, attribute] - values = values.X if values.X.size else values.Y + values, _ = data.get_column_view(attribute) if values.size: - min, max = ut.nanmin(values), ut.nanmax(values) - points = self._split_eq_width(min, max) + mn, mx = ut.nanmin(values), ut.nanmax(values) + points = self._split_eq_width(mn, mx) else: points = [] return Discretizer.create_discretized_var( data.domain[attribute], points) - def _split_eq_width(self, min, max): - if np.isnan(min) or np.isnan(max) or min == max: + def _split_eq_width(self, mn, mx): + if np.isnan(mn) or np.isnan(mx) or mn == mx: return [] - dif = (max - min) / self.n - return [min + (i + 1) * dif for i in range(self.n - 1)] + dif = (mx - mn) / self.n + return [mn + i * dif for i in range(1, self.n)] + + +class TooManyIntervals(ValueError): + pass + + +class FixedWidth(Discretization): + def __init__(self, width, digits=None): + super().__init__() + self.width = width + self.digits = digits + + def __call__(self, data: Table, 
attribute): + values, _ = data.get_column_view(attribute) + points = [] + if values.size: + mn, mx = ut.nanmin(values), ut.nanmax(values) + if not np.isnan(mn): + minf = int(1 + np.floor(mn / self.width)) + maxf = int(1 + np.floor(mx / self.width)) + if maxf - minf - 1 >= 100: + raise TooManyIntervals + points = [i * self.width for i in range(minf, maxf)] + return Discretizer.create_discretized_var( + data.domain[attribute], points, ndigits=self.digits) + + +class FixedTimeWidth(Discretization): + def __init__(self, width, unit): + # unit: 0=year, 1=month, 2=day, 3=hour, 4=minute, 5=second + # for week, use day with a width of 7 + super().__init__() + self.width = width + self.unit = unit + + def __call__(self, data: Table, attribute): + fmt = ["%Y", "%y %b", "%y %b %d", "%y %b %d %H:%M", "%y %b %d %H:%M", + "%H:%M:%S"][self.unit] + values, _ = data.get_column_view(attribute) + times = [] + if values.size: + mn, mx = ut.nanmin(values), ut.nanmax(values) + if not np.isnan(mn): + mn = utc_from_timestamp(mn).timetuple() + mx = utc_from_timestamp(mx).timetuple() + times = _time_range(mn, mx, self.unit, self.width, 0, 100) + if times is None: + raise TooManyIntervals + times = [time.struct_time(t + (0, 0, 0)) for t in times][1:-1] + points = np.array([calendar.timegm(t) for t in times]) + values = [time.strftime(fmt, t) for t in times] + values = _simplified_time_intervals(values) + var = data.domain[attribute] + return DiscreteVariable(name=var.name, values=values, + compute_value=Discretizer(var, points), + sparse=var.sparse) + + +def _simplified_time_intervals(labels): + def no_common(a, b): + for i, pa, pb in zip(count(), a, b): + if pa != pb: + if common + i == 2: + i -= 1 + return b[i:] + # can't come here (unless a == b?!) 
+ return b # pragma: no cover + + + if not labels: + return [] + common = 100 + labels = [label.split() for label in labels] + for common, parts in enumerate(map(set, zip(*labels))): + if len(parts) > 1: + break + if common == 2: # If we keep days, we must also keep months + common = 1 + labels = [label[common:] for label in labels] + join = " ".join + return [f"< {join(labels[0])}"] + [ + f"{join(low)} - {join(no_common(low, high))}" + for low, high in zip(labels, labels[1:]) + ] + [f"≥ {join(labels[-1])}"] + + + +class Binning(Discretization): + """Discretization with nice thresholds + + This class creates different decimal or time binnings and picks the one + in which the number of intervals is closest to the desired number. + The difference is measured as proportion; e.g. having 30 % fewer intervals + is the same difference as having 30 % too many. + + .. attribute:: n + + Desired number of bins (default: 4). + """ + def __init__(self, n=4): + self.n = n + + def __call__(self, data: Table, attribute): + attribute = data.domain[attribute] + values, _ = data.get_column_view(attribute) + values = values.astype(float) + if not values.size: + return self._create_binned_var(None, attribute) + + var = data.domain[attribute] + if isinstance(var, TimeVariable): + binnings = time_binnings(values) + else: + binnings = decimal_binnings(values) + return self._create_binned_var(binnings, attribute) + + def _create_binned_var(self, binnings, variable): + if not binnings: + return Discretizer.create_discretized_var(variable, []) + + # If self.n is 2, require two intervals (one threshold, excluding top + # and bottom), else require at least three intervals + # ... 
unless this is the only option, in which case we use it + # Break ties in favour of more bins + binning = min( + (binning for binning in binnings + if len(binning.thresholds) - 2 >= 1 + (self.n != 2)), + key=lambda binning: (abs(self.n - (len(binning.short_labels) - 1)), + -len(binning.short_labels)), + default=binnings[-1]) + + if len(binning.thresholds) == 2: + return Discretizer.create_discretized_var(variable, []) + + blabels = binning.labels[1:-1] + labels = [f"< {blabels[0]}"] + [ + f"{lab1} - {lab2}" for lab1, lab2 in zip(blabels, blabels[1:]) + ] + [f"≥ {blabels[-1]}"] + + discretizer = Discretizer(variable, list(binning.thresholds[1:-1])) + dvar = DiscreteVariable(name=variable.name, values=labels, + compute_value=discretizer, + sparse=variable.sparse) + dvar.source_variable = variable + return dvar class BinDefinition(NamedTuple): @@ -234,7 +382,7 @@ def decimal_binnings( data, *, min_width=0, min_bins=2, max_bins=50, min_unique=5, add_unique=0, factors=(0.01, 0.02, 0.025, 0.05, 0.1, 0.2, 0.25, 0.5, 1, 2, 5, 10, 20), - label_fmt="%g"): + label_fmt="%g") -> List[BinDefinition]: """ Find a set of nice splits of data into bins @@ -283,13 +431,6 @@ def decimal_binnings( or a function for formatting thresholds (e.g. var.str_val) Returns: - bin_boundaries (list of np.ndarray): a list of bin boundaries, - including the top boundary of the last interval, hence the list - size equals the number bins + 1. These array match the `bin` - argument of `numpy.histogram`. - - This is returned if `return_defs` is left `True`. - bin_definition (list of BinDefinition): `BinDefinition` is a named tuple containing the beginning of the first bin (`start`), number of bins (`nbins`) and their widths @@ -297,8 +438,6 @@ def decimal_binnings( elements, which describes bins of unequal width and is used for binnings that match the unique values in the data (see `min_unique` and `add_unique`). - - This is returned if `return_defs` is `False`. 
""" bins = [] @@ -329,7 +468,8 @@ def decimal_binnings( return bins -def time_binnings(data, *, min_bins=2, max_bins=50, min_unique=5, add_unique=0): +def time_binnings(data, *, min_bins=2, max_bins=50, min_unique=5, add_unique=0 + ) -> List[BinDefinition]: """ Find a set of nice splits of time variable data into bins @@ -355,7 +495,7 @@ def time_binnings(data, *, min_bins=2, max_bins=50, min_unique=5, add_unique=0): number of unique values Returns: - bin_boundaries (list): a list of possible binning. + bin_boundaries (list of BinDefinition): a list of possible binning. Each element of `bin_boundaries` is a tuple consisting of a label describing the bin size (e.g. `2 weeks`) and a list of thresholds. Thresholds are given as pairs @@ -448,7 +588,7 @@ def _simplified_labels(labels): to_remove = "42" while True: firsts = {f for f, *_ in (lab.split() for lab in labels)} - if len(firsts) > 1: + if len(firsts) != 1: # can be 0 if there are no labels break to_remove = firsts.pop() flen = len(to_remove) diff --git a/Orange/preprocess/tests/test_discretize.py b/Orange/preprocess/tests/test_discretize.py index e292d66242f..6ae4a6f4ac6 100644 --- a/Orange/preprocess/tests/test_discretize.py +++ b/Orange/preprocess/tests/test_discretize.py @@ -1,14 +1,332 @@ # File contains some long lines; breaking them would decrease readability -# pylint: disable=line-too-long +# pylint: disable=line-too-long,too-many-lines,protected-access import calendar import unittest +from unittest.mock import patch from time import struct_time, mktime import numpy as np -from Orange.data import ContinuousVariable +from Orange.data import ContinuousVariable, TimeVariable, Table, Domain from Orange.preprocess.discretize import \ - _time_binnings, time_binnings, BinDefinition, Discretizer + _time_binnings, time_binnings, BinDefinition, Discretizer, FixedWidth, \ + FixedTimeWidth , Binning, \ + TooManyIntervals + + +class TestFixedWidth(unittest.TestCase): + def test_discretization(self): + x = 
np.array([[0.21, 0.335, 0, 0.26, np.nan], + [0] * 5, + [np.nan] * 5]).T + domain = Domain([ContinuousVariable(f"c{i}") for i in range(x.shape[1])]) + data = Table.from_numpy(domain, x, None) + + dvar = FixedWidth(0.1, 2)(data, 0) + np.testing.assert_almost_equal(dvar.compute_value.points, + (0.1, 0.2, 0.3)) + self.assertEqual(dvar.values, + ('< 0.10', '0.10 - 0.20', '0.20 - 0.30', '≥ 0.30')) + + dvar = FixedWidth(0.2, 1)(data, 0) + np.testing.assert_almost_equal(dvar.compute_value.points, (0.2, )) + self.assertEqual(dvar.values, ('< 0.2', '≥ 0.2')) + + dvar = FixedWidth(1, 2)(data, 0) + np.testing.assert_almost_equal(dvar.compute_value.points, []) + + dvar = FixedWidth(0.11, 2)(data, 1) + np.testing.assert_almost_equal(dvar.compute_value.points, []) + + dvar = FixedWidth(0.11, 2)(data, 2) + np.testing.assert_almost_equal(dvar.compute_value.points, []) + + self.assertRaises(TooManyIntervals, FixedWidth(0.0001, 1), data, 0) + + +class TestFixedTimeWidth(unittest.TestCase): + def test_discretization(self): + t = TimeVariable("t") + x = np.array([[t.to_val("1914"), t.to_val("1945"), np.nan], + [t.to_val("1914"), t.to_val("1914"), np.nan], + [np.nan, np.nan, np.nan], + ]).T + domain = Domain([t, TimeVariable("t2"), TimeVariable("t3")]) + data = Table.from_numpy(domain, x, None) + + dvar = FixedTimeWidth(10, 1)(data, 1) + np.testing.assert_almost_equal(dvar.compute_value.points, []) + + dvar = FixedTimeWidth(10, 2)(data, 2) + np.testing.assert_almost_equal(dvar.compute_value.points, []) + + self.assertRaises(TooManyIntervals, FixedWidth(0.0001, 1), data, 0) + + dvar = FixedTimeWidth(10, 0)(data, 0) + np.testing.assert_almost_equal( + dvar.compute_value.points, + [int(t.to_val(str(y))) for y in (1920, 1930, 1940)]) + self.assertEqual(dvar.values, + ('< 1920', '1920 - 1930', '1930 - 1940', '≥ 1940')) + + dvar = FixedTimeWidth(5, 0)(data, 0) + np.testing.assert_almost_equal( + dvar.compute_value.points, + [int(t.to_val(str(y))) for y in (1915, 1920, 1925, 1930, 1935, + 
1940, 1945)]) + self.assertEqual(dvar.values, + ('< 1915', '1915 - 1920', '1920 - 1925', '1925 - 1930', + '1930 - 1935', '1935 - 1940', '1940 - 1945', '≥ 1945') + ) + + data = Table.from_numpy( + Domain([t]), + np.array([[t.to_val("1914-07-28"), t.to_val("1918-11-11")]]).T) + dvar = FixedTimeWidth(6, 1)(data, 0) + np.testing.assert_almost_equal( + dvar.compute_value.points, + [int(t.to_val(y)) for y in ("1915-01-01", "1915-07-01", + "1916-01-01", "1916-07-01", + "1917-01-01", "1917-07-01", + "1918-01-01", "1918-07-01")]) + self.assertEqual(dvar.values, + ('< 15 Jan', '15 Jan - Jul', '15 Jul - 16 Jan', + '16 Jan - Jul', '16 Jul - 17 Jan', '17 Jan - Jul', + '17 Jul - 18 Jan', '18 Jan - Jul', '≥ 18 Jul')) + + data = Table.from_numpy( + Domain([t]), + np.array([[t.to_val("1914-07-28"), t.to_val("1914-11-11")]]).T) + dvar = FixedTimeWidth(6, 1)(data, 0) + np.testing.assert_almost_equal(dvar.compute_value.points, []) + + dvar = FixedTimeWidth(2, 1)(data, 0) + np.testing.assert_almost_equal( + dvar.compute_value.points, + [int(t.to_val(y)) for y in ("1914-09-01", "1914-11-01")]) + self.assertEqual(dvar.values, ('< Sep', 'Sep - Nov', '≥ Nov')) + + dvar = FixedTimeWidth(1, 1)(data, 0) + np.testing.assert_almost_equal( + dvar.compute_value.points, + [int(t.to_val(y)) for y in ("1914-08-01", "1914-09-01", + "1914-10-01", "1914-11-01")]) + self.assertEqual(dvar.values, ('< Aug', 'Aug - Sep', 'Sep - Oct', + 'Oct - Nov', '≥ Nov')) + + data = Table.from_numpy( + Domain([t]), + np.array([[t.to_val("1914-06-28 10:45"), + t.to_val("1914-07-04 15:25")]]).T) + dvar = FixedTimeWidth(2, 2)(data, 0) + np.testing.assert_almost_equal( + dvar.compute_value.points, + [int(t.to_val(y)) for y in ("1914-06-29", "1914-07-01", + "1914-07-03")]) + self.assertEqual(dvar.values, ('< Jun 29', 'Jun 29 - Jul 01', + 'Jul 01 - Jul 03', '≥ Jul 03')) + + dvar = FixedTimeWidth(1, 2)(data, 0) + np.testing.assert_almost_equal( + dvar.compute_value.points, + [int(t.to_val(y)) for y in ("1914-06-29", 
"1914-06-30", + "1914-07-01", "1914-07-02", + "1914-07-03", "1914-07-04")]) + self.assertEqual(dvar.values, ('< Jun 29', 'Jun 29 - Jun 30', + 'Jun 30 - Jul 01', 'Jul 01 - Jul 02', + 'Jul 02 - Jul 03', 'Jul 03 - Jul 04', + '≥ Jul 04')) + + data = Table.from_numpy( + Domain([t]), + np.array([[t.to_val("1914-12-30 22:45"), + t.to_val("1915-01-02 15:25")]]).T) + dvar = FixedTimeWidth(1, 2)(data, 0) + np.testing.assert_almost_equal( + dvar.compute_value.points, + [int(t.to_val(y)) for y in ("1914-12-31", "1915-01-01", + "1915-01-02")]) + self.assertEqual(dvar.values, ('< 14 Dec 31', + '14 Dec 31 - 15 Jan 01', + '15 Jan 01 - Jan 02', '≥ 15 Jan 02')) + + data = Table.from_numpy( + Domain([t]), + np.array([[t.to_val("1914-06-28 10:45"), + t.to_val("1914-06-28 15:25")]]).T) + dvar = FixedTimeWidth(2, 3)(data, 0) + np.testing.assert_almost_equal( + dvar.compute_value.points, + [int(t.to_val(y)) for y in ("1914-06-28 12:00", "1914-06-28 14:00")]) + self.assertEqual(dvar.values, ('< 12:00', '12:00 - 14:00', '≥ 14:00')) + + data = Table.from_numpy( + Domain([t]), + np.array([[t.to_val("1914-06-28 10:45"), + t.to_val("1914-06-28 15:25")]]).T) + dvar = FixedTimeWidth(1, 3)(data, 0) + np.testing.assert_almost_equal( + dvar.compute_value.points, + [int(t.to_val(y)) for y in ("1914-06-28 11:00", "1914-06-28 12:00", + "1914-06-28 13:00", "1914-06-28 14:00", + "1914-06-28 15:00")]) + self.assertEqual(dvar.values, ('< 11:00', '11:00 - 12:00', + '12:00 - 13:00', '13:00 - 14:00', + '14:00 - 15:00', '≥ 15:00')) + + data = Table.from_numpy( + Domain([t]), + np.array([[t.to_val("1914-06-28 22:45"), + t.to_val("1914-06-29 03:25")]]).T) + dvar = FixedTimeWidth(1, 3)(data, 0) + np.testing.assert_almost_equal( + dvar.compute_value.points, + [int(t.to_val(y)) for y in ("1914-06-28 23:00", "1914-06-29 00:00", + "1914-06-29 01:00", "1914-06-29 02:00", + "1914-06-29 03:00")]) + self.assertEqual(dvar.values, ('< Jun 28 23:00', + 'Jun 28 23:00 - Jun 29 00:00', + 'Jun 29 00:00 - 01:00', + 'Jun 29 
01:00 - 02:00', + 'Jun 29 02:00 - 03:00', + '≥ Jun 29 03:00')) + + data = Table.from_numpy( + Domain([t]), + np.array([[t.to_val("1914-06-28 22:43"), + t.to_val("1914-06-28 23:01")]]).T) + dvar = FixedTimeWidth(5, 4)(data, 0) + np.testing.assert_almost_equal( + dvar.compute_value.points, + [int(t.to_val(y)) for y in ("1914-06-28 22:45", "1914-06-28 22:50", + "1914-06-28 22:55", "1914-06-28 23:00")]) + self.assertEqual(dvar.values, ('< 22:45', "22:45 - 22:50", + "22:50 - 22:55", "22:55 - 23:00", + '≥ 23:00')) + + data = Table.from_numpy( + Domain([t]), + np.array([[t.to_val("1914-06-30 23:48"), + t.to_val("1914-07-01 00:06")]]).T) + dvar = FixedTimeWidth(5, 4)(data, 0) + np.testing.assert_almost_equal( + dvar.compute_value.points, + [int(t.to_val(y)) for y in ("1914-06-30 23:50", "1914-06-30 23:55", + "1914-07-01 00:00", "1914-07-01 00:05")]) + self.assertEqual(dvar.values, ('< Jun 30 23:50', "Jun 30 23:50 - 23:55", + "Jun 30 23:55 - Jul 01 00:00", + "Jul 01 00:00 - 00:05", '≥ Jul 01 00:05')) + + data = Table.from_numpy( + Domain([t]), + np.array([[t.to_val("1914-06-29 23:48"), + t.to_val("1914-06-30 00:06")]]).T) + dvar = FixedTimeWidth(5, 4)(data, 0) + np.testing.assert_almost_equal( + dvar.compute_value.points, + [int(t.to_val(y)) for y in ("1914-06-29 23:50", "1914-06-29 23:55", + "1914-06-30 00:00", "1914-06-30 00:05")]) + self.assertEqual(dvar.values, ('< Jun 29 23:50', "Jun 29 23:50 - 23:55", + "Jun 29 23:55 - Jun 30 00:00", + "Jun 30 00:00 - 00:05", '≥ Jun 30 00:05')) + + data = Table.from_numpy( + Domain([t]), + np.array([[t.to_val("1914-06-29 23:48:05"), + t.to_val("1914-06-29 23:51:59")]]).T) + dvar = FixedTimeWidth(1, 4)(data, 0) + np.testing.assert_almost_equal( + dvar.compute_value.points, + [int(t.to_val(y)) for y in ("1914-06-29 23:49", "1914-06-29 23:50", + "1914-06-29 23:51")]) + self.assertEqual(dvar.values, ('< 23:49', "23:49 - 23:50", + "23:50 - 23:51", '≥ 23:51')) + + data = Table.from_numpy( + Domain([t]), + np.array([[t.to_val("1914-06-29 
23:48:05.123"), + t.to_val("1914-06-29 23:48:33.684")]]).T) + dvar = FixedTimeWidth(10, 5)(data, 0) + np.testing.assert_almost_equal( + dvar.compute_value.points, + [int(t.to_val(y)) for y in ("1914-06-29 23:48:10", + "1914-06-29 23:48:20", + "1914-06-29 23:48:30")]) + self.assertEqual(dvar.values, ('< 23:48:10', "23:48:10 - 23:48:20", + "23:48:20 - 23:48:30", '≥ 23:48:30')) + + data = Table.from_numpy( + Domain([t]), + np.array([[t.to_val("1914-12-31 23:59:58.1"), + t.to_val("1915-01-01 00:00:01.8")]]).T) + dvar = FixedTimeWidth(1, 5)(data, 0) + np.testing.assert_almost_equal( + dvar.compute_value.points, + [int(t.to_val(y)) for y in ("1914-12-31 23:59:59", + "1915-01-01 00:00:00", + "1915-01-01 00:00:01")]) + self.assertEqual(dvar.values, ('< 23:59:59', "23:59:59 - 00:00:00", + "00:00:00 - 00:00:01", '≥ 00:00:01')) + + self.assertRaises(TooManyIntervals, FixedTimeWidth(0.0001, 5), data, 0) + + +class TestBinningDiscretizer(unittest.TestCase): + def test_no_data(self): + no_data = Table(Domain([ContinuousVariable("y")]), np.zeros((0, 1))) + dvar = Binning()(no_data, 0) + self.assertEqual(dvar.compute_value.points, []) + + @patch("Orange.preprocess.discretize.time_binnings") + @patch("Orange.preprocess.discretize.decimal_binnings") + @patch("Orange.preprocess.discretize.Binning._create_binned_var") + def test_call(self, _, decbin, timebin): + data = Table(Domain([ContinuousVariable("y"), TimeVariable("t")]), + np.array([[1, 2], [3, 4]])) + + Binning(5)(data, 0) + timebin.assert_not_called() + self.assertEqual(list(decbin.call_args[0][0]), [1, 3]) + decbin.reset_mock() + + Binning(5)(data, 1) + decbin.assert_not_called() + self.assertEqual(list(timebin.call_args[0][0]), [2, 4]) + + def test_binning_selection(self): + var = ContinuousVariable("y") + discretize = Binning(2) + # pylint: disable=redefined-outer-name + create = discretize._create_binned_var + + binnings = [] + self.assertEqual(create(binnings, var).compute_value.points, []) + + binnings = None + 
self.assertEqual(create(binnings, var).compute_value.points, []) + + binnings = [ + BinDefinition(np.arange(i + 1), + [f"t{x}" for x in range(i + 1)], + [f"t{x}" for x in range(i + 1)], + 1 / i, str(i) + ) + for i in (3, 5, 10, 20) + ] + + for discretize.n in (2, 3): + self.assertEqual(create(binnings, var).values, + ('< t1', "t1 - t2", "≥ t2")) + + for discretize.n in (4, 5, 6, 7): + self.assertEqual(create(binnings, var).values, + ('< t1', "t1 - t2", "t2 - t3", "t3 - t4", "≥ t4")) + + for discretize.n in range(8, 15): + self.assertEqual(len(create(binnings, var).values), 10) + + for discretize.n in range(16, 25): + self.assertEqual(len(create(binnings, var).values), 20) # pylint: disable=redefined-builtin @@ -34,12 +352,12 @@ def tr1(s): s = s.replace(localname, engname) return s - def tr(ss): + def tr2(ss): return list(map(tr1, ss)) def testbin(start, end): bins = _time_binnings(create(*start), create(*end), 3, 51) - return [(bin.width_label, tr(bin.short_labels), + return [(bin.width_label, tr2(bin.short_labels), list(bin.thresholds)) for bin in reversed(bins)] diff --git a/Orange/widgets/data/owdiscretize.py b/Orange/widgets/data/owdiscretize.py index 157d5c4f5d2..d9b5de8fc79 100644 --- a/Orange/widgets/data/owdiscretize.py +++ b/Orange/widgets/data/owdiscretize.py @@ -1,359 +1,597 @@ import re +import html from enum import IntEnum -from collections import namedtuple -from typing import Optional, Tuple, Iterable, Union, Callable, Any +from typing import Optional, Tuple, Union, Callable, NamedTuple, Dict, List +from AnyQt.QtCore import ( + Qt, QTimer, QPoint, QItemSelectionModel, QSize, QAbstractListModel, + pyqtSignal as Signal) +from AnyQt.QtGui import ( + QValidator, QPalette, QDoubleValidator, QIntValidator, QColor) from AnyQt.QtWidgets import ( QListView, QHBoxLayout, QStyledItemDelegate, QButtonGroup, QWidget, - QLineEdit, QToolTip, QLabel, QApplication -) -from AnyQt.QtGui import QValidator, QPalette -from AnyQt.QtCore import Qt, QTimer, QPoint -from 
orangewidget.utils.listview import ListViewSearch + QLineEdit, QToolTip, QLabel, QApplication, + QSpinBox, QSizePolicy, QRadioButton, QComboBox) + +from orangewidget.settings import Setting +from orangewidget.utils import listview -import Orange.data +from Orange.data import ( + Variable, ContinuousVariable, DiscreteVariable, TimeVariable, Domain, Table) import Orange.preprocess.discretize as disc -from Orange.data import Variable -from Orange.widgets import widget, gui, settings -from Orange.widgets.utils import itemmodels, vartype, unique_everseen +from Orange.widgets import widget, gui +from Orange.widgets.utils import unique_everseen +from Orange.widgets.utils.itemmodels import DomainModel from Orange.widgets.utils.widgetpreview import WidgetPreview from Orange.widgets.widget import Input, Output from Orange.widgets.data.oweditdomain import FixedSizeButton -__all__ = ["OWDiscretize"] - -# 'Default' method delegates to 'method' -Default = namedtuple("Default", ["method"]) -Leave = namedtuple("Leave", []) -MDL = namedtuple("MDL", []) -EqualFreq = namedtuple("EqualFreq", ["k"]) -EqualWidth = namedtuple("EqualWidth", ["k"]) -Remove = namedtuple("Remove", []) -Custom = namedtuple("Custom", ["points"]) - - -MethodType = Union[ - Default, - Leave, - MDL, - EqualFreq, - EqualWidth, - Remove, - Custom, -] - -_dispatch = { - Default: - lambda m, data, var: _dispatch[type(m.method)](m.method, data, var), - Leave: lambda m, data, var: var, - MDL: lambda m, data, var: disc.EntropyMDL()(data, var), - EqualFreq: lambda m, data, var: disc.EqualFreq(m.k)(data, var), - EqualWidth: lambda m, data, var: disc.EqualWidth(m.k)(data, var), - Remove: lambda m, data, var: None, - Custom: - lambda m, data, var: - disc.Discretizer.create_discretized_var(var, m.points) + +# Remove this when we require PyQt 5.15 +if not hasattr(QButtonGroup, "idClicked"): + class QButtonGroup(QButtonGroup): # pylint: disable=function-redefined + idClicked = Signal(int) + + def __init__(self, *args, 
**kwargs): + super().__init__(*args, **kwargs) + self.buttonClicked.connect( + lambda button: self.idClicked.emit(self.id(button))) + + +re_custom_sep = re.compile(r"\s*,\s*") +time_units = ["year", "month", "day", "week", "hour", "minute", "second"] +INVALID_WIDTH = "invalid width" +TOO_MANY_INTERVALS = "too many intervals" + + +def _fixed_width_discretization( + data: Table, + var: Union[ContinuousVariable, str, int], + width: str) -> Union[DiscreteVariable, str]: + """ + Discretize numeric variable with fixed bin width. Used in method definition. + + Width is given as string (coming from line edit). The labels for the new + variable will have the same number of digits; this is more appropriate + than the number of digits in the original variable, which may be too large. + + Args: + data: data used to deduce the interval of values + var: variable to discretize + width: interval width + + Returns: + Discrete variable, if successful; a string with error otherwise + """ + digits = len(width) - width.index(".") - 1 if "." in width else 0 + try: + width = float(width) + except ValueError: + return INVALID_WIDTH + if width <= 0: + return INVALID_WIDTH + try: + return disc.FixedWidth(width, digits)(data, var) + except disc.TooManyIntervals: + return TOO_MANY_INTERVALS + + +# pylint: disable=invalid-name +def _fixed_time_width_discretization( + data: Table, + var: Union[TimeVariable, str, int], + width: str, unit: int) -> Union[DiscreteVariable]: + """ + Discretize time variable with fixed bin width. Used in method definition. + + Width is given as string (coming from line edit). 
+ + Args: + data: data used to deduce the interval of values + var: variable to discretize + width: interval width + unit: 0 = year, 1 = month, 2 = day, 3 = week, 4 = hour, 5 = min, 6 = sec + + Returns: + Discrete variable, if successful; a string with error otherwise + """ + try: + width = int(width) + except ValueError: + return INVALID_WIDTH + if width <= 0: + return INVALID_WIDTH + if unit == 3: # week + width *= 7 + unit -= unit >= 3 + try: + return disc.FixedTimeWidth(width, unit)(data, var) + except disc.TooManyIntervals: + return TOO_MANY_INTERVALS + + +def _mdl_discretization( + data: Table, + var: Union[ContinuousVariable, str, int]) -> Union[DiscreteVariable, str]: + if not data.domain.has_discrete_class: + return "no discrete class" + return disc.EntropyMDL()(data, var) + + +def _custom_discretization( + _, + var: Union[ContinuousVariable, str, int], + points: str) -> Union[DiscreteVariable, str]: + """ + Discretize variable using custom thresholds. Used in method definition. + + Thresholds are given as string (coming from line edit). 
+ + Args: + _: data table; not used, because thresholds are given explicitly + var: variable to discretize + points: thresholds + + Returns: + Discrete variable, if successful; a string with error otherwise + """ + try: + cuts = [float(x) for x in re_custom_sep.split(points.strip())] + except ValueError: + cuts = [] + if any(x >= y for x, y in zip(cuts, cuts[1:])): + cuts = [] + if not cuts: + return "invalid cuts" + return disc.Discretizer.create_discretized_var(var, cuts) + + +class Methods(IntEnum): + # pylint: disable=invalid-name + Default, Keep, MDL, EqualFreq, EqualWidth, Remove, Custom, Binning, \ + FixedWidth, FixedWidthTime = range(10) + + +class MethodDesc(NamedTuple): + """ + Definitions of all methods; used for creation of interface and calling + """ + id_: Methods # Method id + label: str # Label used for radio button + short_desc: str # Short descriptions for list views + tooltip: str # Tooltip for radio button + # Discretization function, see, e.g. _fixed_width_discretization + function: Optional[Callable[..., Union[DiscreteVariable, str]]] + controls: Tuple[str, ...] = () # Widget attributes with related ux controls + + +Options: Dict[Methods, MethodDesc] = { + method.id_: method + for method in ( + MethodDesc(Methods.Default, + "Use default setting", "default", + "Treat the variable as defined in 'default setting'", + None, + ()), + MethodDesc(Methods.Keep, + "Keep numeric", "keep", + "Keep the variable as is", + lambda data, var: var, + ()), + MethodDesc(Methods.MDL, + "Entropy vs. 
MDL", "entropy", + "Split values until MDL exceeds the entropy (Fayyad-Irani)\n" + "(requires discrete class variable)", + _mdl_discretization, + ()), + MethodDesc(Methods.EqualFreq, + "Equal frequency, intervals: ", "equal freq, k={}", + "Create bins with same number of instances", + lambda data, var, k: disc.EqualFreq(k)(data, var), + ("freq_spin", )), + MethodDesc(Methods.EqualWidth, + "Equal width, intervals: ", "equal width, k={}", + "Create bins of the same width", + lambda data, var, k: disc.EqualWidth(k)(data, var), + ("width_spin", )), + MethodDesc(Methods.Remove, + "Remove", "remove", + "Remove variable", + lambda *_: None, + ()), + MethodDesc(Methods.Binning, + "Natural binning, desired bins: ", "binning, desired={}", + "Create bins with nice thresholds; " + "try matching desired number of bins", + lambda data, var, nbins: disc.Binning(nbins)(data, var), + ("binning_spin", )), + MethodDesc(Methods.FixedWidth, + "Fixed width: ", "fixed width {}", + "Create bins with the given width (not for time variables)", + _fixed_width_discretization, + ("width_line", )), + MethodDesc(Methods.FixedWidthTime, + "Time interval: ", "time interval, {} {}", + "Create bins with the give width (for time variables)", + _fixed_time_width_discretization, + ("width_time_line", "width_time_unit")), + MethodDesc(Methods.Custom, + "Custom: ", "custom: {}", + "Use manually specified thresholds", + _custom_discretization, + ("threshold_line", )) + ) } -# Variable discretization state (back compat for deserialization) -DState = namedtuple( - "DState", - ["method", # discretization method - "points", # induced cut points - "disc_var"] # induced discretized variable -) +class VarHint(NamedTuple): + """Description for settings""" + method_id: Methods + args: Tuple[Union[str, float, int]] -def is_discretized(var): - return isinstance(var.compute_value, disc.Discretizer) +class DiscDesc(NamedTuple): + """Data for list view model""" + hint: VarHint + points: str + values: Tuple[str] -def 
variable_key(var): - return vartype(var), var.name +KeyType = Optional[Tuple[str, bool]] +DefaultHint = VarHint(Methods.Keep, ()) +DefaultKey = None -def button_group_reset(group): - button = group.checkedButton() - if button is not None: - group.setExclusive(False) - button.setChecked(False) - group.setExclusive(True) +def variable_key(var: ContinuousVariable) -> KeyType: + """Key for that variable in var_hints and discretized_vars""" + return var.name, isinstance(var, TimeVariable) -class DiscDelegate(QStyledItemDelegate): - def initStyleOption(self, option, index): - super().initStyleOption(option, index) - state = index.data(Qt.UserRole) - var = index.data(Qt.EditRole) - if state is not None: - if isinstance(var, Variable): - fmt = var.repr_val - else: - fmt = str - extra = self.cutsText(state, fmt) - option.text = option.text + ": " + extra +class ListViewSearch(listview.ListViewSearch): + """ + A list view with two components shown above it: + - a listview containing a single item representing default settings + - a filter for search - @staticmethod - def cutsText(state: DState, fmt: Callable[[Any], str] = str): - # This function has many branches, but they don't hurt readabability - # pylint: disable=too-many-branches - method = state.method - # Need a better way to distinguish discretization states - # i.e. between 'induced no points v.s. 'removed by choice' - if state.points is None and state.disc_var is not None: - points = "" - elif state.points is None: - points = "..." - elif state.points == []: - points = "" + The class is based on listview.ListViewSearch and needs to have the same + name in order to override its private method __layout. + + Inherited __init__ calls __layout, so `default_view` must be constructed + there. Construction before calling super().__init__ doesn't work because + PyQt does not allow it. 
+ """ + class DiscDelegate(QStyledItemDelegate): + """ + A delegate that shows items (variables) with specific settings in bold + """ + def initStyleOption(self, option, index): + super().initStyleOption(option, index) + option.font.setBold(index.data(Qt.UserRole).hint is not None) + + def __init__(self, *args, **kwargs): + self.default_view = None + super().__init__(preferred_size=QSize(350, -1), *args, **kwargs) + self.setItemDelegate(self.DiscDelegate(self)) + + def select_default(self): + """Select the item representing default settings""" + index = self.default_view.model().index(0) + self.default_view.selectionModel().select( + index, QItemSelectionModel.Select) + + # pylint: disable=unused-private-member + def __layout(self): + if self.default_view is None: # __layout was called from __init__ + view = self.default_view = QListView(self) + view.setModel(DefaultDiscModel()) + view.verticalScrollBar().setDisabled(True) + view.horizontalScrollBar().setDisabled(True) + view.setHorizontalScrollBarPolicy( + Qt.ScrollBarPolicy.ScrollBarAlwaysOff) + view.setVerticalScrollBarPolicy( + Qt.ScrollBarPolicy.ScrollBarAlwaysOff) + font = view.font() + font.setBold(True) + view.setFont(font) else: - points = ", ".join(map(fmt, state.points)) - - if isinstance(method, Default): - name = None - elif isinstance(method, Leave): - name = "(leave)" - elif isinstance(method, MDL): - name = "(entropy)" - elif isinstance(method, EqualFreq): - name = "(equal frequency k={})".format(method.k) - elif isinstance(method, EqualWidth): - name = "(equal width k={})".format(method.k) - elif isinstance(method, Remove): - name = "(removed)" - elif isinstance(method, Custom): - name = "(custom)" + view = self.default_view + + # Put the list view with default on top + margins = self.viewportMargins() + def_height = view.sizeHintForRow(0) + 2 * view.spacing() + 2 + view.setGeometry(0, 0, self.geometry().width(), def_height) + view.setFixedHeight(def_height) + + # Then search + search = 
def format_desc(hint: VarHint) -> str:
    """
    Describe the method and its parameters; used in list views and report.

    `hint` may be None, in which case the short description of the
    `Methods.Default` option is returned. Otherwise the option's
    `short_desc` template is filled with the arguments stored in the hint.
    """
    if hint is None:
        return Options[Methods.Default].short_desc

    template = Options[hint.method_id].short_desc
    if hint.method_id != Methods.FixedWidthTime:
        return template.format(*hint.args)

    # Time intervals: the second argument selects an entry of `time_units`;
    # the unit name is pluralized according to the width.
    width, unit_ref = hint.args
    unit_name = time_units[unit_ref]
    try:
        width = int(width)
    except ValueError:
        # Width is not a valid integer, so singular/plural is undecided
        unit_name += "(s)"
    else:
        if width != 1:
            unit_name += "s"
    return template.format(width, unit_name)

{tip}' \ + f'{",  ".join(values)}

class DefaultDiscModel(QAbstractListModel):
    """
    A one-row model used for showing "Default setting" above the list view
    with variables.

    The model stores a single `VarHint` in `self.hint`. The hint is replaced
    through `setData(..., role=Qt.UserRole)` and rendered as text with
    `format_desc`.
    """
    # Pixmap shared by all instances. It is created on first instantiation
    # rather than at class-definition time (presumably because pixmaps
    # require a running application -- confirm).
    icon = None

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        if DefaultDiscModel.icon is None:
            DefaultDiscModel.icon = gui.createAttributePixmap(
                "★", QColor(0, 0, 0, 0), Qt.black)
        self.hint: VarHint = DefaultHint

    @staticmethod
    def rowCount(parent=None):
        """Return 1 for the (invalid) root index and 0 for any valid parent.

        `parent` is optional so that the method can be called without
        arguments, as Qt's own `rowCount()` allows.
        """
        return 0 if parent is not None and parent.isValid() else 1

    @staticmethod
    def columnCount(parent=None):
        """Return 1 for the (invalid) root index and 0 for any valid parent."""
        return 0 if parent is not None and parent.isValid() else 1

    def data(self, _, role=Qt.DisplayRole):
        """Return text, icon or tooltip for the single item; None otherwise."""
        if role == Qt.DisplayRole:
            return "Default setting: " + format_desc(self.hint)
        elif role == Qt.DecorationRole:
            return DefaultDiscModel.icon
        elif role == Qt.ToolTipRole:
            return "Default setting for variables without specific settings"
        return None

    def setData(self, index, value, role=Qt.DisplayRole):
        """
        Store `value` as the new hint when `role` is `Qt.UserRole`.

        Returns True if the hint was stored (and `dataChanged` emitted), and
        False for any other role, as required by Qt's `setData` contract.
        """
        if role != Qt.UserRole:
            return False
        self.hint = value
        self.dataChanged.emit(index, index)
        return True
def validate(string: str, pos: int) -> Tuple[QValidator.State, str, int]:
    """
    Validate a string of increasing, separator-delimited numbers.

    - Any character outside digits, sign, dot, comma and space makes the
      input Invalid; the returned position is just past that character.
    - If the cursor is at the end and the text ends with a digit followed
      by a space, the space is replaced by ", " and the cursor advanced.
    - A part that is not a number, or a number that is not strictly
      greater than the previous one, makes the input Intermediate.
    """
    allowed = "+-., 0123456789"
    for position, char in enumerate(string, start=1):
        if char not in allowed:
            return QValidator.Invalid, string, position

    space_after_number = (
        pos == len(string) >= 2
        and string[-1] == " "
        and string[-2].isdigit())
    if space_after_number:
        string = string[:-1] + ", "
        pos += 1

    previous = None
    for chunk in re_custom_sep.split(string.strip()):
        try:
            number = float(chunk)
        except ValueError:
            return QValidator.Intermediate, string, pos
        if previous is not None and number <= previous:
            return QValidator.Intermediate, string, pos
        previous = number
    return QValidator.Acceptable, string, pos
- """ - parts = [string[start: end] for start, end in self.itersplit(string)] - parts = [part for part in parts if part.strip()] - return ", ".join(parts) - - -def show_tip( - widget: QWidget, pos: QPoint, text: str, timeout=-1, - textFormat=Qt.AutoText, wordWrap=None -): - propname = __name__ + "::show_tip_qlabel" - if timeout < 0: - timeout = widget.toolTipDuration() - if timeout < 0: - timeout = 5000 + 40 * max(0, len(text) - 100) - tip = widget.property(propname) - if not text and tip is None: - return - - def hide(): - w = tip.parent() - w.setProperty(propname, None) - tip.timer.stop() - tip.close() - tip.deleteLater() - - if not isinstance(tip, QLabel): - tip = QLabel(objectName="tip-label", focusPolicy=Qt.NoFocus) - tip.setBackgroundRole(QPalette.ToolTipBase) - tip.setForegroundRole(QPalette.ToolTipText) - tip.setPalette(QToolTip.palette()) - tip.setFont(QApplication.font("QTipLabel")) - tip.timer = QTimer(tip, singleShot=True, objectName="hide-timer") - tip.timer.timeout.connect(hide) - widget.setProperty(propname, tip) - tip.setParent(widget, Qt.ToolTip) - - tip.setText(text) - tip.setTextFormat(textFormat) - if wordWrap is None: - wordWrap = textFormat != Qt.PlainText - tip.setWordWrap(wordWrap) - - if not text: - hide() - else: - tip.timer.start(timeout) - tip.show() - tip.move(pos) + @staticmethod + def show_tip( + widget: QWidget, pos: QPoint, text: str, timeout=-1, + textFormat=Qt.AutoText, wordWrap=None): + """Show a tooltip; used for invalid custom thresholds""" + propname = __name__ + "::show_tip_qlabel" + if timeout < 0: + timeout = widget.toolTipDuration() + if timeout < 0: + timeout = 5000 + 40 * max(0, len(text) - 100) + tip = widget.property(propname) + if not text and tip is None: + return + + def hide(): + w = tip.parent() + w.setProperty(propname, None) + tip.timer.stop() + tip.close() + tip.deleteLater() + + if not isinstance(tip, QLabel): + tip = QLabel(objectName="tip-label", focusPolicy=Qt.NoFocus) + 
tip.setBackgroundRole(QPalette.ToolTipBase) + tip.setForegroundRole(QPalette.ToolTipText) + tip.setPalette(QToolTip.palette()) + tip.setFont(QApplication.font("QTipLabel")) + tip.setContentsMargins(2, 2, 2, 2) + tip.timer = QTimer(tip, singleShot=True, objectName="hide-timer") + tip.timer.timeout.connect(hide) + widget.setProperty(propname, tip) + tip.setParent(widget, Qt.ToolTip) + + tip.setText(text) + tip.setTextFormat(textFormat) + if wordWrap is None: + wordWrap = textFormat != Qt.PlainText + tip.setWordWrap(wordWrap) + + if not text: + hide() + else: + tip.timer.start(timeout) + tip.show() + tip.move(pos) + + +# These are no longer used, but needed for loading and migrating old pickles. +# We insert them into namespace instead of normally defining them, in order +# to hide it from IDE's and avoid mistakenly using them. +# pylint: disable=wrong-import-position,wrong-import-order +from collections import namedtuple +globals().update(dict( + DState=namedtuple( + "DState", + ["method", # discretization method + "points", # induced cut points + "disc_var"], + defaults=(None, None) # induced discretized variable + ), + Default=namedtuple("Default", ["method"]), + Leave=namedtuple("Leave", []), + MDL=namedtuple("MDL", []), + EqualFreq=namedtuple("EqualFreq", ["k"]), + EqualWidth=namedtuple("EqualWidth", ["k"]), + Remove=namedtuple("Remove", []), + Custom=namedtuple("Custom", ["points"]) +)) class OWDiscretize(widget.OWWidget): # pylint: disable=too-many-instance-attributes name = "Discretize" - description = "Discretize the numeric data features." 
+ description = "Discretize numeric variables" category = "Transform" icon = "icons/Discretize.svg" keywords = ["bin", "categorical", "nominal", "ordinal"] priority = 2130 class Inputs: - data = Input("Data", Orange.data.Table, doc="Input data table") + data = Input("Data", Table, doc="Input data table") class Outputs: - data = Output("Data", Orange.data.Table, doc="Table with discretized features") - - settingsHandler = settings.DomainContextHandler() - settings_version = 2 - saved_var_states = settings.ContextSetting({}) + data = Output("Data", Table, doc="Table with categorical features") - #: The default method name - default_method_name = settings.Setting(Methods.EqualFreq.name) - #: The k for Equal{Freq,Width} - default_k = settings.Setting(3) - #: The default cut points for custom entry - default_cutpoints: Tuple[float, ...] = settings.Setting(()) - autosend = settings.Setting(True) + settings_version = 3 - #: Discretization methods - Default, Leave, MDL, EqualFreq, EqualWidth, Remove, Custom = list(Methods) + #: Default setting (key DefaultKey) and specific settings for variables; + # if variable is not in the dict, it uses default + var_hints: Dict[KeyType, VarHint] = Setting( + {DefaultKey: DefaultHint}, schema_only=True) + autosend = Setting(True) want_main_area = False - resizing_enabled = False def __init__(self): super().__init__() #: input data self.data = None - self.class_var = None - #: Current variable discretization state - self.var_state = {} - #: Saved variable discretization settings (context setting) - self.saved_var_states = {} - - self.method = Methods.Default - self.k = 5 - self.cutpoints = () - - box = gui.vBox(self.controlArea, self.tr("Default Discretization")) - self._default_method_ = 0 - self.default_bbox = rbox = gui.radioButtons( - box, self, "_default_method_", callback=self._default_disc_changed) - self.default_button_group = bg = rbox.findChild(QButtonGroup) - bg.buttonClicked[int].connect(self.set_default_method) - - rb = 
gui.hBox(rbox) - self.left = gui.vBox(rb) - right = gui.vBox(rb) - rb.layout().setStretch(0, 1) - rb.layout().setStretch(1, 1) - self.options = [ - (Methods.Default, self.tr("Default")), - (Methods.Leave, self.tr("Leave numeric")), - (Methods.MDL, self.tr("Entropy-MDL discretization")), - (Methods.EqualFreq, self.tr("Equal-frequency discretization")), - (Methods.EqualWidth, self.tr("Equal-width discretization")), - (Methods.Remove, self.tr("Remove numeric variables")), - (Methods.Custom, self.tr("Manual")), - ] - - for id_, opt in self.options[1:]: - t = gui.appendRadioButton(rbox, opt) - bg.setId(t, id_) - t.setChecked(id_ == self.default_method) - [right, self.left][opt.startswith("Equal")].layout().addWidget(t) - - def _intbox(parent, attr, callback): - box = gui.indentedBox(parent) - s = gui.spin( - box, self, attr, minv=2, maxv=10, label="Num. of intervals:", - callback=callback) - s.setMaximumWidth(60) + #: Cached discretized variables + self.discretized_vars: Dict[KeyType, DiscreteVariable] = {} + + # Indicates that buttons, spins, edit and combos are being changed + # programmatically (when interface is changed due to selection change), + # so this should not trigger update of hints and invalidation of + # discretization in `self.discretized_vars`. 
def _create_var_list(self, box):
    """
    Create the list view with variables and put it into `box`.

    The view is stored in `self.varview`; selection changes in the list and
    in its `default_view` are connected to `_var_selection_changed` and
    `_default_selected`, respectively.
    """
    # If we decide to not elide, remove the `uniformItemSize` argument
    view = ListViewSearch(
        selectionMode=QListView.ExtendedSelection, uniformItemSizes=True)
    self.varview = view

    model = DiscDomainModel(
        valid_types=(ContinuousVariable, TimeVariable),
        order=DiscDomainModel.MIXED
    )
    view.setModel(model)

    variables_selection = view.selectionModel()
    variables_selection.selectionChanged.connect(self._var_selection_changed)
    default_selection = view.default_view.selectionModel()
    default_selection.selectionChanged.connect(self._default_selected)

    self._update_default_model()
    box.layout().addWidget(view)

' + + 'Enter cut points as a comma-separate list of \n' + 'strictly increasing numbers e.g. 0.0, 0.5, 1.0).

', enabled=enabled, ) + edit.setValidator(IncreasingNumbersListValidator()) + edit.setSizePolicy(QSizePolicy.MinimumExpanding, QSizePolicy.Fixed) + @edit.textChanged.connect def update(): validator = edit.validator() - if validator is not None: + if validator is not None and edit.text().strip(): state, _, _ = validator.validate(edit.text(), 0) else: state = QValidator.Acceptable @@ -372,432 +610,424 @@ def update(): p = edit.mapToGlobal(cr.bottomRight()) edit.setPalette(palette) if state != QValidator.Acceptable and edit.isVisible(): - show_tip(edit, p, edit.toolTip(), textFormat=Qt.RichText) + validator.show_tip(edit, p, edit.toolTip(), + textFormat=Qt.RichText) else: - show_tip(edit, p, "") - return edit - - self.manual_cuts_edit = manual_cut_editline( - text=", ".join(map(str, self.default_cutpoints)), - enabled=self.default_method == Methods.Custom, - ) - - def set_manual_default_cuts(): - text = self.manual_cuts_edit.text() - self.default_cutpoints = tuple( - float(s.strip()) for s in text.split(",") if s.strip()) - self._default_disc_changed() - self.manual_cuts_edit.editingFinished.connect(set_manual_default_cuts) - - validator = IncreasingNumbersListValidator() - self.manual_cuts_edit.setValidator(validator) - ibox = gui.indentedBox(right, orientation=Qt.Horizontal) - ibox.layout().addWidget(self.manual_cuts_edit) - - right.layout().addStretch(10) - self.left.layout().addStretch(10) - - self.connect_control( - "default_cutpoints", - lambda values: self.manual_cuts_edit.setText(", ".join(map(str, values))) - ) - vlayout = QHBoxLayout() - box = gui.widgetBox( - self.controlArea, "Individual Attribute Settings", - orientation=vlayout, spacing=8 - ) - - # List view with all attributes - self.varview = ListViewSearch( - selectionMode=QListView.ExtendedSelection, - uniformItemSizes=True, - ) - self.varview.setItemDelegate(DiscDelegate()) - self.varmodel = itemmodels.VariableListModel() - self.varview.setModel(self.varmodel) - 
self.varview.selectionModel().selectionChanged.connect( - self._var_selection_changed - ) - - vlayout.addWidget(self.varview) - # Controls for individual attr settings - self.bbox = controlbox = gui.radioButtons( - box, self, "method", callback=self._disc_method_changed - ) - vlayout.addWidget(controlbox) - self.variable_button_group = bg = controlbox.findChild(QButtonGroup) - for id_, opt in self.options[:5]: - b = gui.appendRadioButton(controlbox, opt) - bg.setId(b, id_) - - self.k_specific = _intbox(controlbox, "k", self._disc_method_changed) - - gui.appendRadioButton(controlbox, "Remove attribute", id=Methods.Remove) - b = gui.appendRadioButton(controlbox, "Manual", id=Methods.Custom) - - self.manual_cuts_specific = manual_cut_editline( - text=", ".join(map(str, self.cutpoints)), - enabled=self.method == Methods.Custom - ) - self.manual_cuts_specific.setValidator(validator) - b.toggled[bool].connect(self.manual_cuts_specific.setEnabled) - - def set_manual_cuts(): - text = self.manual_cuts_specific.text() - points = [t for t in text.split(",") if t.split()] - self.cutpoints = tuple(float(t) for t in points) - self._disc_method_changed() - self.manual_cuts_specific.editingFinished.connect(set_manual_cuts) - - self.connect_control( - "cutpoints", - lambda values: self.manual_cuts_specific.setText(", ".join(map(str, values))) - ) - ibox = gui.indentedBox(controlbox, orientation=Qt.Horizontal) - self.copy_current_to_manual_button = b = FixedSizeButton( - text="CC", toolTip="Copy the current cut points to manual mode", - enabled=False - ) - b.clicked.connect(self._copy_to_manual) - ibox.layout().addWidget(self.manual_cuts_specific) - ibox.layout().addWidget(b) - - gui.rubber(controlbox) - controlbox.setEnabled(False) - bg.button(self.method) - self.controlbox = controlbox + validator.show_tip(edit, p, "") + return edit, edit.textChanged + + children = [] + + def button(id_, *controls, stretch=True): + layout = QHBoxLayout() + desc = Options[id_] + button = 
QRadioButton(desc.label) + button.setToolTip(desc.tooltip) + self.button_group.addButton(button, id_) + layout.addWidget(button) + if controls: + if stretch: + layout.addStretch(1) + for c, signal in controls: + layout.addWidget(c) + if signal is not None: + @signal.connect + def arg_changed(): + self.button_group.button(id_).setChecked(True) + self.update_hints(id_) + + children.append(layout) + button_box.layout().addLayout(layout) + return (*controls, (None, ))[0][0] + + button_box = gui.vBox(box) + button_box.layout().setSpacing(0) + button_box.setSizePolicy(QSizePolicy(QSizePolicy.Fixed, QSizePolicy.Preferred)) + self.button_group = QButtonGroup(self) + self.button_group.idClicked.connect(self.update_hints) + + button(Methods.Keep) + button(Methods.Remove) + + self.binning_spin = button(Methods.Binning, intspin()) + validator = QDoubleValidator() + validator.setBottom(0) + self.width_line = button(Methods.FixedWidth, widthline(validator)) + + self.width_time_unit = u = QComboBox(self) + u.setContentsMargins(0, 0, 0, 0) + u.addItems([unit + "(s)" for unit in time_units]) + validator = QIntValidator() + validator.setBottom(1) + self.width_time_line = button(Methods.FixedWidthTime, + widthline(validator), + (u, u.currentTextChanged)) + + self.freq_spin = button(Methods.EqualFreq, intspin()) + self.width_spin = button(Methods.EqualWidth, intspin()) + button(Methods.MDL) + + self.copy_to_custom = FixedSizeButton( + text="CC", toolTip="Copy the current cut points to manual mode") + self.copy_to_custom.clicked.connect(self._copy_to_manual) + self.threshold_line = button(Methods.Custom, + manual_cut_editline(), + (self.copy_to_custom, None), + stretch=False) + button(Methods.Default) + maxheight = max(w.sizeHint().height() for w in children) + for w in children: + w.itemAt(0).widget().setFixedHeight(maxheight) + button_box.layout().addStretch(1) + + def _update_default_model(self): + """Update data in the model showing default settings""" + model = 
def _get_values(self, method_id: Methods) -> Tuple[Union[int, float, str]]:
    """
    Collect the current parameters of the given method from its controls.

    Controls are looked up by the attribute names listed in
    `Options[method_id].controls`: spin boxes contribute their value,
    combo boxes their current index, and anything else its text.
    """
    def read(control):
        if isinstance(control, QSpinBox):
            return control.value()
        if isinstance(control, QComboBox):
            return control.currentIndex()
        return control.text()

    return tuple(read(getattr(self, control_name))
                 for control_name in Options[method_id].controls)
def update_hints(self, method_id: Methods):
    """
    Callback for radio buttons and for controls regulating parameters

    This function:
    - updates `var_hints` for all selected variables (or for the default
      setting, if that is what is selected)
    - invalidates (removes) `discretized_vars` for affected variables
    - calls _update_discretizations to compute new discretizations
    - calls deferred commit

    Data for list view models is updated in _update_discretizations.

    Nothing happens while the interface is being updated programmatically
    (`__interface_update` is set).
    """
    if self.__interface_update:
        return

    method_id = Methods(method_id)
    args = self._get_values(method_id)
    keys = self.varkeys_for_selection()
    if method_id == Methods.Default:
        # Variables without a specific hint fall back to the default one.
        # The default hint itself must never be dropped: it is indexed
        # unconditionally when the default model and discretizations are
        # refreshed below.
        for key in keys:
            if key != DefaultKey:
                self.var_hints.pop(key, None)
    else:
        self.var_hints.update(dict.fromkeys(keys, VarHint(method_id, args)))

    default_changed = keys == [DefaultKey]
    if default_changed:
        # Changing the default affects every variable without its own hint
        invalidate = set(self.discretized_vars) - set(self.var_hints)
    else:
        invalidate = keys
    for key in invalidate:
        # pop: an entry that is already invalidated must not raise
        self.discretized_vars.pop(key, None)

    if default_changed:
        self._update_default_model()
    self._update_discretizations()
    self.commit.deferred()
def_method = Custom(self.default_cutpoints) - else: - assert False - return def_method - - def _current_method(self): - if self.method == Methods.Default: - method = Default(self._current_default_method()) - elif self.method == Methods.Leave: - method = Leave() - elif self.method == Methods.MDL: - method = MDL() - elif self.method == Methods.EqualFreq: - method = EqualFreq(self.k) - elif self.method == Methods.EqualWidth: - method = EqualWidth(self.k) - elif self.method == Methods.Remove: - method = Remove() - elif self.method == Methods.Custom: - method = Custom(self.cutpoints) - else: - assert False - return method - - def _update_spin_positions(self): - kmethods = [Methods.EqualFreq, Methods.EqualWidth] - self.k_general.setDisabled(self.default_method not in kmethods) - if self.default_method == Methods.EqualFreq: - self.left.layout().insertWidget(1, self.k_general) - elif self.default_method == Methods.EqualWidth: - self.left.layout().insertWidget(2, self.k_general) - - self.k_specific.setDisabled(self.method not in kmethods) - if self.method == Methods.EqualFreq: - self.bbox.layout().insertWidget(4, self.k_specific) - elif self.method == Methods.EqualWidth: - self.bbox.layout().insertWidget(5, self.k_specific) - - def _default_disc_changed(self): - self._update_spin_positions() - method = self._current_default_method() - state = DState(Default(method), None, None) - for i, _ in enumerate(self.varmodel): - if isinstance(self.var_state[i].method, Default): - self._set_var_state(i, state) - self._update_points() - self.commit.deferred() + default_hint = self.var_hints[DefaultKey] + model = self.varview.model() + for index, var in enumerate(model): + key = variable_key(var) + if key in self.discretized_vars: + continue # still valid + var_hint = self.var_hints.get(key) + points, dvar = self._discretize_var(var, var_hint or default_hint) + self.discretized_vars[key] = dvar + values = getattr(dvar, "values", ()) + model.setData(model.index(index), + 
DiscDesc(var_hint, points, values), + Qt.UserRole) + + def _discretize_var(self, var: ContinuousVariable, hint: VarHint) \ + -> Tuple[str, Optional[Variable]]: + """ + Discretize using method and data in the hint. - def _disc_method_changed(self): - self._update_spin_positions() - indices = self.selected_indices() - method = self._current_method() - state = DState(method, None, None) - for idx in indices: - self._set_var_state(idx, state) - self._update_points() - self._copy_to_manual_update_enabled() - self.commit.deferred() + Returns a description (list of points or error/warning) and a + - discrete variable + - same variable (if kept numeric) + - None (if removed or errored) + """ + if isinstance(var, TimeVariable): + if hint.method_id in (Methods.FixedWidth, Methods.Custom): + return ": ", var + else: + if hint.method_id == Methods.FixedWidthTime: + return ": ", var + + function = Options[hint.method_id].function + dvar = function(self.data, var, *hint.args) + if isinstance(dvar, str): + return f" <{dvar}>", None # error + if dvar is None: + return "", None # removed + elif dvar is var: + return "", var # no transformation + thresholds = dvar.compute_value.points + if len(thresholds) == 0: + return " ", None + return ": " + ", ".join(map(var.repr_val, thresholds)), dvar def _copy_to_manual(self): - indices = self.selected_indices() - # set of all methods for the current selection - if len(indices) != 1: - return - index = indices[0] - state = self.var_state[index] - var = self.varmodel[index] - fmt = var.repr_val - points = state.points - if points is None: - points = () - else: - points = tuple(state.points) - state = state._replace(method=Custom(points), points=None, disc_var=None) - self._set_var_state(index, state) - self.method = Methods.Custom - self.cutpoints = points - self.manual_cuts_specific.setText(", ".join(map(fmt, points))) - self._update_points() - self.commit.deferred() + """ + Callback for 'CC' button - def 
_copy_to_manual_update_enabled(self): - indices = self.selected_indices() - methods = [self.var_state[i].method for i in indices] - self.copy_current_to_manual_button.setEnabled( - len(indices) == 1 and not isinstance(methods[0], Custom)) - - def _var_selection_changed(self, *_): - self._copy_to_manual_update_enabled() - indices = self.selected_indices() - # set of all methods for the current selection - methods = [self.var_state[i].method for i in indices] - - def key(method): - if isinstance(method, Default): - return Default, (None, ) - return type(method), tuple(method) - - mset = list(unique_everseen(methods, key=key)) - - self.controlbox.setEnabled(len(mset) > 0) - if len(mset) == 1: - method = mset.pop() - self.method = Methods.from_method(method) - if isinstance(method, (EqualFreq, EqualWidth)): - self.k = method.k - elif isinstance(method, Custom): - self.cutpoints = method.points - else: - # deselect the current button - self.method = -1 - bg = self.controlbox.group - button_group_reset(bg) - self._update_spin_positions() - - def selected_indices(self): - rows = self.varview.selectionModel().selectedRows() - return [index.row() for index in rows] - - def method_for_index(self, index): - state = self.var_state[index] - return state.method - - def discretized_var(self, index): - # type: (int) -> Optional[Orange.data.DiscreteVariable] - state = self.var_state[index] - if state.disc_var is not None and state.points == []: - # Removed by MDL Entropy - return None - else: - return state.disc_var + Sets selected variables' method to "Custom" and copies thresholds + to their VarHints. Variables that are not discretized (for any reason) + are skipped. - def discretized_domain(self): - """ - Return the current effective discretized domain. + Discretizations are invalidated and then updated + (`_update_discretizations`). + + If all selected variables have the same thresholds, it copies it to + the line edit. 
Otherwise it unchecks all radio buttons to keep the + interface consistent. """ - if self.data is None: - return None + varkeys = self.varkeys_for_selection() + texts = set() + for key in varkeys: + dvar = self.discretized_vars.get(key) + fmt = self.data.domain[key[0]].repr_val + if isinstance(dvar, DiscreteVariable): + text = ", ".join(map(fmt, dvar.compute_value.points)) + texts.add(text) + self.var_hints[key] = VarHint(Methods.Custom, (text, )) + del self.discretized_vars[key] + try: + self.__interface_update = True + if len(texts) == 1: + self.threshold_line.setText(texts.pop()) + else: + self._uncheck_all_buttons() + finally: + self.__interface_update = False + self._update_discretizations() + self.commit.deferred() - # a mapping of all applied changes for variables in `varmodel` - mapping = {var: self.discretized_var(i) - for i, var in enumerate(self.varmodel)} + def _default_selected(self, selected): + """Callback for selecting 'Default setting'""" + if not selected: + # Prevent infinite recursion (with _var_selection_changed) + return + self.varview.selectionModel().clearSelection() + self._update_interface() + + set_enabled = self._set_radio_enabled + set_enabled(Methods.Default, False) + set_enabled(Methods.FixedWidth, True) + set_enabled(Methods.FixedWidthTime, True) + set_enabled(Methods.Custom, True) + self.copy_to_custom.setEnabled(False) + + def _var_selection_changed(self, _): + """Callback for changed selection in listview with variables""" + selected = self.varview.selectionModel().selectedIndexes() + if not selected: + # Prevent infinite recursion (with _default_selected) + return + self.varview.default_view.selectionModel().clearSelection() + self._update_interface() + + set_enabled = self._set_radio_enabled + vars_ = [self.data.domain[name] + for name, _ in self.varkeys_for_selection()] + no_time = not any(isinstance(var, TimeVariable) for var in vars_) + all_time = all(isinstance(var, TimeVariable) for var in vars_) + 
set_enabled(Methods.Default, True) + set_enabled(Methods.FixedWidth, no_time) + set_enabled(Methods.Custom, no_time) + self.copy_to_custom.setEnabled(no_time) + set_enabled(Methods.FixedWidthTime, all_time) + + def _update_interface(self): + """ + Update the user interface according to selection - def disc_var(source): - return mapping.get(source, source) + - If VarHints for all selected variables are the same, check the + corresponding radio button and fill the corresponding controls; + - otherwise, uncheck all radios. + """ + if self.__interface_update: + return - # map the full input domain to the new variables (where applicable) - attributes = [disc_var(v) for v in self.data.domain.attributes] - attributes = [v for v in attributes if v is not None] + try: + self.__interface_update = True + keys = self.varkeys_for_selection() + mset = list(unique_everseen(map(self.var_hints.get, keys))) + if len(mset) != 1: + self._uncheck_all_buttons() + return - class_vars = [disc_var(v) for v in self.data.domain.class_vars] - class_vars = [v for v in class_vars if v is not None] + if mset == [None]: + method_id, args = Methods.Default, () + else: + method_id, args = mset.pop() + self._check_button(method_id, True) + self._set_values(method_id, args) + finally: + self.__interface_update = False - domain = Orange.data.Domain( - attributes, class_vars, metas=self.data.domain.metas - ) - return domain + @Inputs.data + def set_data(self, data: Optional[Table]): + self.discretized_vars = {} + self.data = data + self.varview.model().set_domain(None if data is None else data.domain) + self._update_discretizations() + self._update_default_model() + self.varview.select_default() + self._set_mdl_button() + self.commit.now() @gui.deferred def commit(self): - output = None - if self.data is not None: - domain = self.discretized_domain() - output = self.data.transform(domain) - self.Outputs.data.send(output) + if self.data is None: + self.Outputs.data.send(None) + return - def 
storeSpecificSettings(self): - super().storeSpecificSettings() - self.saved_var_states = { - variable_key(var): - self.var_state[i]._replace(points=None, disc_var=None) - for i, var in enumerate(self.varmodel) - } + def part(variables: List[Variable]) -> List[Variable]: + return [dvar + for dvar in (self.discretized_vars.get(variable_key(v), v) + for v in variables) + if dvar] + + d = self.data.domain + domain = Domain(part(d.attributes), part(d.class_vars), part(d.metas)) + output = self.data.transform(domain) + self.Outputs.data.send(output) def send_report(self): - self.report_items(( - ("Default method", self.options[self.default_method][1]),)) - if self.varmodel: - self.report_items("Thresholds", [ - (var.name, - DiscDelegate.cutsText(self.var_state[i], var.repr_val) or "leave numeric") - for i, var in enumerate(self.varmodel)]) + dmodel = self.varview.default_view.model() + desc = dmodel.data(dmodel.index(0)) + self.report_items((tuple(desc.split(": ", maxsplit=1)), )) + model = self.varview.model() + reported = [] + for row in range(model.rowCount()): + name = model[row].name + desc = model.data(model.index(row), Qt.UserRole) + if desc.hint is not None: + name = f"{name} ({format_desc(desc.hint)})" + reported.append((name, ', '.join(desc.values))) + self.report_items("Variables", reported) @classmethod - def migrate_settings(cls, settings, version): # pylint: disable=redefined-outer-name + def migrate_settings(cls, settings, version): if version is None or version < 2: # was stored as int indexing Methods (but offset by 1) default = settings.pop("default_method", 0) default = Methods(default + 1) settings["default_method_name"] = default.name + if version is None or version < 3: + method_name = settings.pop("default_method_name", + DefaultHint.method_id.name) + k = settings.pop("default_k", 3) + cut_points = settings.pop("default_cutpoints", ()) + + method_id = getattr(Methods, method_name) + if method_id in (Methods.EqualFreq, Methods.EqualWidth): + args = 
(k, ) + elif method_id == Methods.Custom: + args = (cut_points, ) + else: + args = () + default_hint = VarHint(method_id, args) + var_hints = {DefaultKey: default_hint} + for context in settings.pop("context_settings", []): + values = context.values + if "saved_var_states" not in values: + continue + var_states, _ = values.pop("saved_var_states") + for (tpe, name), dstate in var_states.items(): + key = (name, tpe == 4) # time variable == 4 + method = dstate.method + method_name = type(method).__name__.replace("Leave", "Keep") + if method_name == "Default": + continue + if method_name == "Custom": + args = (", ".join(f"{x:g}" for x in method.points), ) + else: + args = tuple(method) + var_hints[key] = VarHint(getattr(Methods, method_name), args) + settings["var_hints"] = var_hints + if __name__ == "__main__": # pragma: no cover - WidgetPreview(OWDiscretize).run(Orange.data.Table("brown-selected")) + #WidgetPreview(OWDiscretize).run(Table("/Users/janez/Downloads/banking-crises.tab")) + WidgetPreview(OWDiscretize).run(Table("heart_disease")) diff --git a/Orange/widgets/data/tests/test_owdiscretize.py b/Orange/widgets/data/tests/test_owdiscretize.py index 61a16b115c9..2fb0979c225 100644 --- a/Orange/widgets/data/tests/test_owdiscretize.py +++ b/Orange/widgets/data/tests/test_owdiscretize.py @@ -1,178 +1,581 @@ # Test methods with long descriptive names can omit docstrings # pylint: disable=missing-docstring,unsubscriptable-object,protected-access import unittest +from functools import partial +from unittest.mock import patch, Mock -from AnyQt.QtCore import Qt, QPoint +import numpy as np + +from AnyQt.QtCore import QPoint, Qt, QModelIndex from AnyQt.QtWidgets import QWidget, QApplication, QStyleOptionViewItem +from AnyQt.QtGui import QIcon + +from orangewidget.settings import Context -from Orange.data import Table, DiscreteVariable -from Orange.widgets.data.owdiscretize import OWDiscretize, Default, EqualFreq, \ - Remove, Leave, Custom, IncreasingNumbersListValidator, 
DiscDelegate, MDL, \ - EqualWidth, DState, show_tip -from Orange.widgets.tests.base import WidgetTest -from Orange.widgets.tests.base import GuiTest -from Orange.widgets.utils.itemmodels import select_row, VariableListModel +from Orange.data import Table, ContinuousVariable, TimeVariable, Domain +from Orange.preprocess.discretize import TooManyIntervals +from Orange.widgets.data.owdiscretize import OWDiscretize, \ + IncreasingNumbersListValidator, VarHint, Methods, DefaultKey, \ + _fixed_width_discretization, _fixed_time_width_discretization, \ + _custom_discretization, variable_key, Options, DefaultHint, \ + _mdl_discretization, ListViewSearch, format_desc, DefaultDiscModel +from Orange.widgets.tests.base import WidgetTest, GuiTest +from Orange.widgets.utils.itemmodels import select_rows -class TestOWDiscretize(WidgetTest): +class DataMixin: + def prepare_data(self): + self.domain = Domain([ContinuousVariable("x"), + ContinuousVariable("y"), + ContinuousVariable("z"), + TimeVariable("t"), + TimeVariable("u")]) + self.data = Table.from_numpy(self.domain, np.arange(20).reshape(4, 5)) + self.var_hints = { + DefaultKey: VarHint(Methods.Keep, ()), + ("x", False): VarHint(Methods.EqualFreq, (3, )), + ("y", False): VarHint(Methods.Keep, ()), + ("z", False): VarHint(Methods.Remove, ()), + ("t", True): VarHint(Methods.Binning, (2, )) + } + # Copy the following line to tests, for reference: + # Def: Keep, x: EqFreq 3, y: Keep, z: Remove, t (time): Bin 2, u (time): + +class TestOWDiscretize(WidgetTest, DataMixin): def setUp(self): super().setUp() + self.prepare_data() self.widget = self.create_widget(OWDiscretize) + def test_empty_data(self): data = Table("iris") widget = self.widget self.send_signal(self.widget.Inputs.data, Table.from_domain(data.domain)) - for m in (OWDiscretize.Leave, OWDiscretize.MDL, OWDiscretize.EqualFreq, - OWDiscretize.EqualWidth, OWDiscretize.Remove, - OWDiscretize.Custom): - widget.default_method = m + for m in range(len(Methods)): + 
widget.var_hints = {DefaultKey: VarHint(m, ())} widget.commit.now() self.assertIsNotNone(self.get_output(widget.Outputs.data)) - def test_select_method(self): - widget = self.widget - data = Table("iris")[::5] - self.send_signal(self.widget.Inputs.data, data) - - model = widget.varmodel - view = widget.varview - defbg = widget.default_button_group - varbg = widget.variable_button_group - self.assertSequenceEqual(list(model), data.domain.attributes) - defbg.button(OWDiscretize.EqualFreq).click() - self.assertEqual(widget.default_method, OWDiscretize.EqualFreq) - self.assertTrue( - all(isinstance(m, Default) and isinstance(m.method, EqualFreq) - for m in map(widget.method_for_index, - range(len(data.domain.attributes))))) - - # change method for first variable - select_row(view, 0) - varbg.button(OWDiscretize.Remove).click() - met = widget.method_for_index(0) - self.assertIsInstance(met, Remove) - - # select a second var - selmodel = view.selectionModel() - selmodel.select(model.index(2), selmodel.Select) - # the current checked button must unset - self.assertEqual(varbg.checkedId(), -1) - - varbg.button(OWDiscretize.Leave).click() - self.assertIsInstance(widget.method_for_index(0), Leave) - self.assertIsInstance(widget.method_for_index(2), Leave) - # reset both back to default - varbg.button(OWDiscretize.Default).click() - self.assertIsInstance(widget.method_for_index(0), Default) - self.assertIsInstance(widget.method_for_index(2), Default) - - def test_migration(self): - w = self.create_widget(OWDiscretize, stored_settings={ - "default_method": 0 - }) - self.assertEqual(w.default_method, OWDiscretize.Leave) - - def test_manual_cuts_edit(self): - widget = self.widget - data = Table("iris")[::5] - self.send_signal(self.widget.Inputs.data, data) - view = widget.varview - varbg = widget.variable_button_group - widget.set_default_method(OWDiscretize.Custom) - widget.default_cutpoints = (0, 2, 4) - ledit = widget.manual_cuts_edit - self.assertEqual(ledit.text(), "0, 2, 
4") - ledit.setText("3, 4, 5") - ledit.editingFinished.emit() - self.assertEqual(widget.default_cutpoints, (3, 4, 5)) - self.assertEqual(widget._current_default_method(), Custom((3, 4, 5))) - self.assertTrue( - all(widget.method_for_index(i) == Default(Custom((3, 4, 5))) - for i in range(len(data.domain.attributes))) - ) - select_row(view, 0) - varbg.button(OWDiscretize.Custom).click() - ledit = widget.manual_cuts_specific - ledit.setText("1, 2, 3") - ledit.editingFinished.emit() - self.assertEqual(widget.method_for_index(0), Custom((1, 2, 3))) - ledit.setText("") - ledit.editingFinished.emit() - self.assertEqual(widget.method_for_index(0), Custom(())) - - def test_manual_cuts_copy(self): - widget = self.widget - data = Table("iris")[::5] - self.send_signal(self.widget.Inputs.data, data) - view = widget.varview - select_row(view, 0) - varbg = widget.variable_button_group - varbg.button(OWDiscretize.EqualWidth).click() - v = widget.discretized_var(0) - points = tuple(v.compute_value.points) - cc_button = widget.copy_current_to_manual_button - cc_button.click() - self.assertEqual(widget.method_for_index(0), Custom(points)) - self.assertEqual(varbg.checkedId(), OWDiscretize.Custom) - def test_report(self): - widget = self.widget - data = Table("iris")[::5] - self.send_signal(widget.Inputs.data, data) - widget.send_report() + data = Table("brown-selected") + + w = self.create_widget( + OWDiscretize, + {"var_hints": + {None: VarHint(Methods.EqualFreq, (3,)), + ('alpha 0', False): VarHint(Methods.Keep, ()), + ('alpha 7', False): VarHint(Methods.Remove, ()), + ('alpha 14', False): VarHint(Methods.Binning, (2, )), + ('alpha 21', False): VarHint(Methods.FixedWidth, ("0.05", )), + ('alpha 28', False): VarHint(Methods.EqualFreq, (4, )), + ('alpha 35', False): VarHint(Methods.MDL, ()), + ('alpha 42', False): VarHint(Methods.Custom, ("0, 0.125", )), + ('alpha 49', False): VarHint(Methods.MDL, ())}, + "__version__": 3}) + self.send_signal(w.Inputs.data, data) + + 
self.widget.send_report() + + def test_all(self): + data = Table("brown-selected") + + w = self.create_widget( + OWDiscretize, + {"var_hints": + {None: VarHint(Methods.EqualFreq, (3,)), + ('alpha 0', False): VarHint(Methods.Keep, ()), + ('alpha 7', False): VarHint(Methods.Remove, ()), + ('alpha 14', False): VarHint(Methods.Binning, (2, )), + ('alpha 21', False): VarHint(Methods.FixedWidth, ("0.05", )), + ('alpha 28', False): VarHint(Methods.EqualFreq, (4, )), + ('alpha 35', False): VarHint(Methods.MDL, ()), + ('alpha 42', False): VarHint(Methods.Custom, ("0, 0.125", )), + ('alpha 49', False): VarHint(Methods.MDL, ())}, + "__version__": 3}) + + self.send_signal(w.Inputs.data, data) + + self.assertTrue(w.button_group.button(Methods.MDL).isEnabled()) + self.assertEqual(w.varview.default_view.model().hint, + VarHint(Methods.EqualFreq, (3, ))) + + out = self.get_output(w.Outputs.data) + dom = out.domain + self.assertIsInstance(dom["alpha 0"], ContinuousVariable) + self.assertNotIn("alpha 7", dom) + self.assertEqual(dom["alpha 14"].values, ('< 0', '≥ 0')) + self.assertEqual(dom["alpha 21"].values, + ('< -0.15', "-0.15 - -0.10", "-0.10 - -0.05", + "-0.05 - 0.00", "0.00 - 0.05", "0.05 - 0.10", + '≥ 0.10')) + self.assertEqual(len(dom["alpha 28"].values), 4) + self.assertNotIn("alpha 35", dom) # removed by MDL + self.assertEqual(dom["alpha 42"].values, ('< 0', '0 - 0.125', '≥ 0.125')) + self.assertEqual(len(dom["alpha 49"].values), 2) + + self.send_signal(w.Inputs.data, None) + self.assertIsNone(self.get_output(w.Outputs.data)) + self.assertIsNone(w.data) + self.assertEqual(w.discretized_vars, {}) + self.assertEqual(len(w.varview.model()), 0) + + self.send_signal(w.Inputs.data, data) + self.assertIsNotNone(self.get_output(w.Outputs.data)) + w.button_group.button(Methods.MDL).setChecked(True) + self.assertTrue(w.button_group.button(Methods.MDL).isEnabled()) + self.assertTrue(w.button_group.button(Methods.MDL).isChecked()) + + self.send_signal(w.Inputs.data, data[:, 0]) + 
self.assertFalse(w.button_group.button(Methods.MDL).isEnabled()) + self.assertFalse(w.button_group.button(Methods.MDL).isChecked()) + + self.send_signal(w.Inputs.data, data) + self.assertTrue(w.button_group.button(Methods.MDL).isEnabled()) + + def test_get_values(self): + w = self.widget + + w.binning_spin.setValue(5) + w.width_line.setText("6") + w.width_time_line.setText("7") + w.width_time_unit.setCurrentIndex(1) + w.freq_spin.setValue(8) + w.width_spin.setValue(9) + w.threshold_line.setText("1, 2, 3, 4, 5") + + self.assertEqual(w._get_values(Methods.Keep), ()) + self.assertEqual(w._get_values(Methods.Remove), ()) + self.assertEqual(w._get_values(Methods.Binning), (5, )) + self.assertEqual(w._get_values(Methods.FixedWidth), ("6", )) + self.assertEqual(w._get_values(Methods.FixedWidthTime), ("7", 1)) + self.assertEqual(w._get_values(Methods.EqualFreq), (8, )) + self.assertEqual(w._get_values(Methods.EqualWidth), (9, )) + self.assertEqual(w._get_values(Methods.MDL), ()) + self.assertEqual(w._get_values(Methods.Custom), ("1, 2, 3, 4, 5", )) + + def test_set_values(self): + w = self.widget + + w._set_values(Methods.Keep, ()) + w._set_values(Methods.Remove, ()) + w._set_values(Methods.Binning, (5,)) + w._set_values(Methods.FixedWidth, ("6",)) + w._set_values(Methods.FixedWidthTime, ("7", 1)) + w._set_values(Methods.EqualFreq, (8,)) + w._set_values(Methods.EqualWidth, (9,)) + w._set_values(Methods.MDL, ()) + w._set_values(Methods.Custom, ("1, 2, 3, 4, 5",)) + + self.assertEqual(w.binning_spin.value(), 5) + self.assertEqual(w.width_line.text(), "6") + self.assertEqual(w.width_time_line.text(), "7") + self.assertEqual(w.width_time_unit.currentIndex(), 1) + self.assertEqual(w.freq_spin.value(), 8) + self.assertEqual(w.width_spin.value(), 9) + self.assertEqual(w.threshold_line.text(), "1, 2, 3, 4, 5") + + def test_varkeys_for_selection(self): + w = self.widget + self.send_signal(w.Inputs.data, self.data) + select_rows(w.varview, (0, 4)) + 
self.assertEqual(w.varkeys_for_selection(), [("x", False), ("u", True)]) + + def test_change_selection_update_interface(self): + w = self.widget + self.send_signal(w.Inputs.data, self.data) + w.var_hints = { + DefaultKey: DefaultHint, + ("x", False): VarHint(Methods.FixedWidth, ("10", )), + ("y", False): VarHint(Methods.FixedWidth, ("10", )), + ("z", False): VarHint(Methods.FixedWidth, ("5", )), + ("t", False): VarHint(Methods.Binning, (5, )) + } + + select_rows(w.varview, (0, 1)) + self.assertTrue(w.button_group.button(Methods.FixedWidth).isChecked()) + self.assertTrue(w.button_group.button(Methods.FixedWidth).isEnabled()) + self.assertFalse(w.button_group.button(Methods.FixedWidthTime).isEnabled()) + self.assertTrue(w.button_group.button(Methods.Custom).isEnabled()) + self.assertTrue(w.copy_to_custom.isEnabled()) + self.assertEqual(w.width_line.text(), "10") + + select_rows(w.varview, (1, 2)) + self.assertFalse(w.button_group.button(Methods.FixedWidth).isChecked()) + self.assertTrue(w.button_group.button(Methods.FixedWidth).isEnabled()) + self.assertFalse(w.button_group.button(Methods.FixedWidthTime).isEnabled()) + self.assertTrue(w.button_group.button(Methods.Custom).isEnabled()) + self.assertTrue(w.copy_to_custom.isEnabled()) + + select_rows(w.varview, (2, 4)) + self.assertFalse(w.button_group.button(Methods.FixedWidth).isChecked()) + self.assertFalse(w.button_group.button(Methods.FixedWidth).isEnabled()) + self.assertFalse(w.button_group.button(Methods.FixedWidthTime).isEnabled()) + self.assertFalse(w.button_group.button(Methods.Custom).isEnabled()) + + select_rows(w.varview, (3, 4)) + self.assertFalse(w.button_group.button(Methods.FixedWidth).isChecked()) + self.assertFalse(w.button_group.button(Methods.FixedWidth).isEnabled()) + self.assertTrue(w.button_group.button(Methods.FixedWidthTime).isEnabled()) + self.assertFalse(w.button_group.button(Methods.Custom).isEnabled()) + self.assertFalse(w.copy_to_custom.isEnabled()) + + select_rows(w.varview.default_view, 
(0, )) + self.assertEqual(len(w.varview.selectionModel().selectedIndexes()), 0) + self.assertFalse(w.button_group.button(Methods.FixedWidth).isChecked()) + self.assertTrue(w.button_group.button(Methods.FixedWidth).isEnabled()) + self.assertTrue(w.button_group.button(Methods.FixedWidthTime).isEnabled()) + self.assertTrue(w.button_group.button(Methods.Custom).isEnabled()) + self.assertFalse(w.copy_to_custom.isEnabled()) + self.assertFalse(w.button_group.button(Methods.Default).isEnabled()) + w._check_button(Methods.FixedWidth, True) + self.assertTrue(w.button_group.button(Methods.FixedWidth).isChecked()) + + select_rows(w.varview, (3, )) + self.assertEqual(len(w.varview.default_view.selectionModel().selectedIndexes()), 0) + self.assertFalse(w.button_group.button(Methods.FixedWidth).isChecked()) + self.assertFalse(w.button_group.button(Methods.FixedWidth).isEnabled()) + self.assertTrue(w.button_group.button(Methods.FixedWidthTime).isEnabled()) + self.assertFalse(w.button_group.button(Methods.Custom).isEnabled()) + self.assertTrue(w.button_group.button(Methods.Default).isEnabled()) + + def test_update_hints(self): + w = self.widget + update_disc = w._update_discretizations + w._update_discretizations = Mock() + w.width_line.setText("10") + self.send_signal(w.Inputs.data, self.data) + w.var_hints = { + DefaultKey: DefaultHint, + ("x", False): VarHint(Methods.EqualFreq, (3, )), + ("y", False): VarHint(Methods.EqualFreq, (3, )), + ("z", False): VarHint(Methods.EqualFreq, (4, )), + ("t", True): VarHint(Methods.Binning, (5, )) + } + update_disc() + self.assertEqual(len(w.discretized_vars), 5) + + select_rows(w.varview, (0, )) + w.button_group.button(Methods.Default).click() + self.assertNotIn(("x", False), w.var_hints) + # Check that "x" is invalidated + self.assertEqual(len(w.discretized_vars), 4) + self.assertNotIn(("x", False), w.discretized_vars) + update_disc() + self.assertEqual(len(w.discretized_vars), 5) + self.assertIn(("x", False), w.discretized_vars) + + 
select_rows(w.varview, (0, 1)) + w.button_group.button(Methods.FixedWidth).click() + self.assertEqual(w.var_hints[("x", False)], + VarHint(Methods.FixedWidth, ("10", ))) + self.assertEqual(w.var_hints[("y", False)], + VarHint(Methods.FixedWidth, ("10", ))) + # Check that "x" and "y" are invalidated + self.assertEqual(len(w.discretized_vars), 3) + self.assertNotIn(("x", False), w.discretized_vars) + self.assertNotIn(("y", False), w.discretized_vars) + update_disc() + self.assertEqual(len(w.discretized_vars), 5) + self.assertIn(("x", False), w.discretized_vars) + self.assertIn(("y", False), w.discretized_vars) + + w.width_line.setText("5") + self.assertEqual(w.var_hints[("x", False)], + VarHint(Methods.FixedWidth, ("5", ))) + self.assertEqual(w.var_hints[("y", False)], + VarHint(Methods.FixedWidth, ("5", ))) + # Check that "x" and "y" are invalidated + self.assertEqual(len(w.discretized_vars), 3) + self.assertNotIn(("x", False), w.discretized_vars) + self.assertNotIn(("y", False), w.discretized_vars) + update_disc() + self.assertEqual(len(w.discretized_vars), 5) + self.assertIn(("x", False), w.discretized_vars) + self.assertIn(("y", False), w.discretized_vars) + + select_rows(w.varview.default_view, (0, )) + w.button_group.button(Methods.FixedWidth).click() + self.assertEqual(len(w.discretized_vars), 4) + self.assertNotIn(("u", True), w.discretized_vars) + update_disc() + self.assertEqual(len(w.discretized_vars), 5) + self.assertIn(("u", True), w.discretized_vars) + + def test_discretize_var(self): + w = self.widget + self.send_signal(w.Inputs.data, self.data) + + x = self.data.domain["x"] + t = self.data.domain["t"] + + s, dvar = w._discretize_var(x, VarHint(Methods.FixedWidthTime, ("10", 0))) + self.assertIn("keep", s) + self.assertIs(dvar, x) + + s, dvar = w._discretize_var(t, VarHint(Methods.FixedWidth, ("10", ))) + self.assertIn("keep", s) + self.assertIs(dvar, t) + + try: + Options[42] = Mock() + + # Errored + # Unit test - mocked function + 
Options[42].function = lambda *_: "foo error" + s, dvar = w._discretize_var(t, VarHint(42, ())) + self.assertIn("foo error", s) + self.assertIsNone(dvar) + # Real error + s, dvar = w._discretize_var(t, VarHint(Methods.MDL, ())) + self.assertIn("<", s) + self.assertIsNone(dvar) + + # Removed attribute + Options[42].function = lambda *_: None + s, dvar = w._discretize_var(t, VarHint(42, ())) + self.assertEqual("", s) + self.assertIsNone(dvar) + # Really removed + s, dvar = w._discretize_var(t, VarHint(Methods.Remove, ())) + self.assertEqual("", s) + self.assertIsNone(dvar) + + # No intervals + var = Mock(compute_value=Mock(points=[])) + Options[42].function = lambda *_: var + s, dvar = w._discretize_var(t, VarHint(42, ())) + self.assertIn("removed", s) + self.assertIsNone(dvar) + s, dvar = w._discretize_var(x, VarHint(Methods.FixedWidth, ("1000", ))) + self.assertIn("removed", s) + self.assertIsNone(dvar) + + # All fine + var = Mock(compute_value=Mock(points=[1, 2, 3])) + Options[42].function = lambda *_: var + s, dvar = w._discretize_var(t, VarHint(42, ())) + self.assertIn("1, 2, 3", s) + self.assertIs(dvar, var) + s, dvar = w._discretize_var(x, VarHint(Methods.EqualWidth, (3, ))) + self.assertEqual(dvar.compute_value.points, [5, 10]) + + finally: + del Options[42] + + def test_update_discretizations(self): + w = self.widget + # Def: Keep, x: EqFreq 3, y: Keep, z: Remove, t (time): Bin 2, u (time): + w.var_hints = self.var_hints + y, t, u = map(self.domain.__getitem__, "ytu") + + # no data: do nothing, but don't crash + w._update_discretizations() + + self.send_signal(w.Inputs.data, self.data) + d = w.discretized_vars + self.assertEqual(len(d), 5) + self.assertEqual(len(d[("x", False)].values), 3) + self.assertIs(d[("y", False)], y) + self.assertIsNone(d[("z", False)]) + self.assertIsNot(d[("t", True)], t) + self.assertIsNotNone(d[("t", True)], t) + self.assertIs(d[("u", True)], u) + + d[("t", True)] = t + del d[("x", False)] + del d[("u", True)] + 
w._update_discretizations() + self.assertEqual(len(d[("x", False)].values), 3) + self.assertIs(d[("t", True)], t) + self.assertIs(d[("u", True)], u) + + w.var_hints[None] = VarHint(Methods.Remove, ()) + del d[("u", True)] + w._update_discretizations() + self.assertIsNone(d[("u", True)]) + + def test_copy_to_manual(self): + w = self.widget + w.var_hints = { DefaultKey: VarHint(Methods.EqualFreq, (5, )) } + self.send_signal(w.Inputs.data, self.data) + w.button_group.button(Methods.MDL).setChecked(True) + + select_rows(w.varview, (0, 2)) + self.assertTrue(w.copy_to_custom.isEnabled()) + w.copy_to_custom.click() + self.assertFalse(any(w.button_group.button(i).isChecked() + for i in Methods)) + self.assertEqual(w.var_hints[("x", False)], + VarHint(Methods.Custom, ('2.5, 7.5, 12.5', ))) + self.assertEqual(w.var_hints[("z", False)], + VarHint(Methods.Custom, ('4.5, 9.5, 14.5', ))) + self.assertNotIn(("y", False), w.var_hints) + + select_rows(w.varview, (1, )) + self.assertTrue(w.copy_to_custom.isEnabled()) + w.copy_to_custom.click() + self.assertTrue(w.button_group.button(Methods.Custom).isChecked()) + self.assertEqual(w.var_hints[("x", False)], + VarHint(Methods.Custom, ('2.5, 7.5, 12.5', ))) + self.assertEqual(w.var_hints[("z", False)], + VarHint(Methods.Custom, ('4.5, 9.5, 14.5', ))) + self.assertEqual(w.var_hints[("y", False)], + VarHint(Methods.Custom, ('3.5, 8.5, 13.5', ))) + self.assertEqual(w.threshold_line.text(), '3.5, 8.5, 13.5') + + select_rows(w.varview, (1, 4)) + w.copy_to_custom.click() + self.assertNotIn(("u", False), w.var_hints) + + def test_migration_2_3(self): + # Obsolete, don't want to cause confusion by public import + # pylint: disable=import-outside-toplevel + from Orange.widgets.data.owdiscretize import \ + Default, EqualFreq, Leave, Custom, MDL, EqualWidth, DState + context_values = { + 'saved_var_states': + ({(2, 'age'): DState(method=Leave()), + (2, 'rest SBP'): DState(method=EqualWidth(k=4)), + (2, 'cholesterol'): 
DState(method=EqualFreq(k=6)), + (4, 'max HR'): DState( + method=Custom(points=(1.0, 2.0, 3.0))), + (2, 'ST by exercise'): DState(method=MDL()), + (2, 'major vessels colored'): + DState(method=Default(method=EqualFreq(k=3)))}, -2), + '__version__': 2} + + settings = {'autosend': True, 'controlAreaVisible': True, + 'default_cutpoints': (), 'default_k': 3, + 'default_method_name': 'EqualFreq', + '__version__': 2, + "context_settings": [Context(values=context_values)]} + + OWDiscretize.migrate_settings(settings, 2) + self.assertNotIn("default_method_name", settings) + self.assertNotIn("default_k", settings) + self.assertNotIn("default_cutpoints", settings) + self.assertNotIn("context_settings", settings) + self.assertEqual( + settings["var_hints"], + {None: VarHint(Methods.EqualFreq, (3,)), + ('ST by exercise', False): VarHint(Methods.MDL, ()), + ('age', False): VarHint(Methods.Keep, ()), + ('cholesterol', False): VarHint(Methods.EqualFreq, (6,)), + ('max HR', True): VarHint(Methods.Custom, (('1, 2, 3'),)), + ('rest SBP', False): VarHint(Methods.EqualWidth, (4,))}) class TestValidator(unittest.TestCase): def test_validate(self): v = IncreasingNumbersListValidator() - self.assertEqual(v.validate("", 0), (v.Acceptable, '', 0)) + self.assertEqual(v.validate("", 0), (v.Intermediate, '', 0)) self.assertEqual(v.validate("1", 1), (v.Acceptable, '1', 1)) - self.assertEqual(v.validate(",", 0), (v.Acceptable, ',', 0)) + self.assertEqual(v.validate(",", 0), (v.Intermediate, ',', 0)) self.assertEqual(v.validate("-", 0), (v.Intermediate, '-', 0)) - self.assertEqual(v.validate("1,,", 1), (v.Acceptable, '1,,', 1)) - self.assertEqual(v.validate("1,a,", 1), (v.Invalid, '1,a,', 1)) + self.assertEqual(v.validate("1,,", 1), (v.Intermediate, '1,,', 1)) + self.assertEqual(v.validate("1,a,", 1), (v.Invalid, '1,a,', 3)) self.assertEqual(v.validate("a", 1), (v.Invalid, 'a', 1)) self.assertEqual(v.validate("1,1", 0), (v.Intermediate, '1,1', 0)) self.assertEqual(v.validate("1,12", 0), 
(v.Acceptable, '1,12', 0)) - def test_fixup(self): - v = IncreasingNumbersListValidator() - self.assertEqual(v.fixup(""), "") - self.assertEqual(v.fixup("1,,2"), "1, 2") - self.assertEqual(v.fixup("1,,"), "1") - self.assertEqual(v.fixup("1,"), "1") - self.assertEqual(v.fixup(",1"), "1") - self.assertEqual(v.fixup(","), "") + self.assertEqual(v.validate("1, 2 ", 5), (v.Intermediate, "1, 2, ", 6)) -class TestDelegate(GuiTest): +class TestModels(WidgetTest, DataMixin): + def setUp(self): + self.prepare_data() + self.widget = self.create_widget(OWDiscretize) + def test_delegate(self): - cases = ( - (DState(Default(Leave()), None, None), ""), - (DState(Leave(), None, None), "(leave)"), - (DState(MDL(), [1], None), "(entropy)"), - (DState(MDL(), [], None), ""), - (DState(EqualFreq(2), [1], None), "(equal frequency k=2)"), - (DState(EqualWidth(2), [1], None), "(equal width k=2)"), - (DState(Remove(), None, None), "(removed)"), - (DState(Custom([1]), None, None), "(custom)"), - ) - delegate = DiscDelegate() - var = DiscreteVariable("C", ("a", "b")) - model = VariableListModel() - model.append(var) - for state, text in cases: - model.setData(model.index(0), state, Qt.UserRole) - option = QStyleOptionViewItem() - delegate.initStyleOption(option, model.index(0)) - self.assertIn(text, option.text) - - -class TestShowTip(GuiTest): + self.prepare_data() + w = self.widget + w.var_hints = self.var_hints + # Def: Keep, x: EqFreq 3, y: Keep, z: Remove, t (time): Bin 2, u (time): + self.send_signal(w.Inputs.data, self.data) + + model = w.varview.model() + delegate: ListViewSearch.DiscDelegate = w.varview.itemDelegate() + option = QStyleOptionViewItem() + delegate.initStyleOption(option, model.index(0)) + self.assertTrue(option.font.bold()) + + option = QStyleOptionViewItem() + delegate.initStyleOption(option, model.index(4)) + self.assertFalse(option.font.bold()) + + def test_layout(self): + # Not much to test, just don't crash + self.widget.varview.updateGeometries() + + def 
test_model(self): + self.prepare_data() + w = self.widget + w.var_hints = self.var_hints + # Def: Keep, x: EqFreq 3, y: Keep, z: Remove, t (time): Bin 2, u (time): + self.send_signal(w.Inputs.data, self.data) + + model = w.varview.model() + display = model.index(0).data() + self.assertIn("x", display) + self.assertIn("equal", display) + self.assertIn("3", display) + self.assertIn( + str(w.discretized_vars[("x", False)].compute_value.points[0])[:3], + display) + + tooltip = model.index(0).data(Qt.ToolTipRole) + self.assertIn("x", tooltip) + self.assertIn( + str(w.discretized_vars[("x", False)].compute_value.points[0])[:3], + tooltip) + + display = model.index(1).data() + self.assertIn("y", display) + self.assertIn("keep", display) + + self.assertIsNone(model.index(1).data(Qt.ToolTipRole)) + + w.var_hints[("x", False)] = VarHint(Methods.EqualWidth, (7, )) + del w.discretized_vars[("x", False)] + w._update_discretizations() + display = model.index(0).data() + self.assertIn("x", display) + self.assertIn("equal", display) + self.assertIn("3", display) + self.assertIn( + str(w.discretized_vars[("x", False)].compute_value.points[0])[:3], + display) + + +class TestDefaultDiscModel(GuiTest): + def test_counts(self): + model = DefaultDiscModel() + self.assertEqual(model.rowCount(QModelIndex()), 1) + self.assertEqual(model.rowCount(model.index(0)), 0) + + self.assertEqual(model.columnCount(QModelIndex()), 1) + self.assertEqual(model.columnCount(model.index(0)), 0) + + def test_data(self): + model = DefaultDiscModel() + self.assertIn(format_desc(DefaultHint), model.index(0).data()) + self.assertIsInstance(model.index(0).data(Qt.DecorationRole), QIcon) + self.assertIsInstance(model.index(0).data(Qt.ToolTipRole), str) + + hint = VarHint(Methods.FixedWidth, ("314", )) + model.setData(model.index(0), hint, Qt.UserRole) + self.assertIn(format_desc(hint), model.index(0).data()) + self.assertIsInstance(model.index(0).data(Qt.DecorationRole), QIcon) + 
self.assertIsInstance(model.index(0).data(Qt.ToolTipRole), str) + + + +class TestUtils(GuiTest): def test_show_tip(self): w = QWidget() + show_tip = IncreasingNumbersListValidator.show_tip show_tip(w, QPoint(100, 100), "Ha Ha") app = QApplication.instance() windows = app.topLevelWidgets() @@ -185,3 +588,94 @@ def test_show_tip(self): self.assertTrue(label.text() == "Ha") show_tip(w, QPoint(100, 100), "") self.assertFalse(label.isVisible()) + + def test_format_desc(self): + self.assertEqual(format_desc(VarHint(Methods.MDL, ())), + Options[Methods.MDL].short_desc) + self.assertEqual(format_desc(VarHint(Methods.EqualWidth, ("10", ))), + Options[Methods.EqualWidth].short_desc.format(10)) + self.assertEqual(format_desc(None), + Options[Methods.Default].short_desc) + + fwt = Methods.FixedWidthTime + desc = Options[fwt].short_desc.format + self.assertEqual(format_desc(VarHint(fwt, ("1", 0))), desc("1", "year")) + self.assertEqual(format_desc(VarHint(fwt, ("2", 0))), desc("2", "years")) + self.assertEqual(format_desc(VarHint(fwt, ("1", 2))), desc("1", "day")) + self.assertEqual(format_desc(VarHint(fwt, ("2", 2))), desc("2", "days")) + self.assertEqual(format_desc(VarHint(fwt, ("x", 2))), desc("x", "day(s)")) + self.assertEqual(format_desc(VarHint(fwt, ("", 2))), desc("", "day(s)")) + + def test_fixed_width_disc(self): + fw = partial(_fixed_width_discretization, None, None) + for arg in ("", "5.3.1", "abc", "-5", "0"): + self.assertIsInstance(fw(arg), str) + + with patch("Orange.preprocess.discretize.FixedWidth") as disc: + self.assertNotIsInstance(fw("5.13"), str) + disc.assert_called_with(5.13, 2) + + self.assertNotIsInstance(fw("5"), str) + disc.assert_called_with(5, 0) + + with patch("Orange.preprocess.discretize.FixedWidth", + side_effect=TooManyIntervals): + self.assertIsInstance(fw("42"), str) + + def test_fixed_time_width_disc(self): + ftw = partial(_fixed_time_width_discretization, None, None) + + for arg in ("", "5.3.1", "5.3", "abc", "-5", "0"): + 
self.assertIsInstance(ftw(arg, 1), str) + + with patch("Orange.preprocess.discretize.FixedTimeWidth") as disc: + self.assertNotIsInstance(ftw("5", 2), str) + disc.assert_called_with(5, 2) + + self.assertNotIsInstance(ftw("5", 3), str) + disc.assert_called_with(35, 2) + + self.assertNotIsInstance(ftw("5", 4), str) + disc.assert_called_with(5, 3) + + with patch("Orange.preprocess.discretize.FixedTimeWidth", + side_effect=TooManyIntervals): + self.assertIsInstance(ftw("42", 3), str) + + def test_custom_discretization(self): + cd = partial(_custom_discretization, None, None) + + for arg in ("", "4 5", "2, 1, 5", "1, foo, 13"): + self.assertIsInstance(cd(arg), str) + + with patch("Orange.preprocess.discretize.Discretizer." + "create_discretized_var") as disc: + cd("1, 1.25, 1.5, 4") + disc.assert_called_with(None, [1, 1.25, 1.5, 4]) + + def test_mdl_discretization(self): + mdl = _mdl_discretization + data = Table("iris")[::10] + var = data.domain[0] + with patch("Orange.preprocess.discretize.EntropyMDL") as mdldisc: + mdl(data, var) + mdldisc.return_value.assert_called_with(data, var) + mdldisc.reset_mock() + + data = data[:, :4] + self.assertIsInstance(mdl(data, var), str) + mdldisc.assert_not_called() + + data = data.transform(Domain(data.domain[:3], data.domain[3])) + self.assertIsInstance(mdl(data, var), str) + mdldisc.assert_not_called() + + def test_var_key(self): + self.assertEqual(variable_key(ContinuousVariable("foo")), + ("foo", False)) + self.assertEqual(variable_key(TimeVariable("bar")), + ("bar", True)) + + +if __name__ == "__main__": + unittest.main() diff --git a/doc/visual-programming/source/widgets/data/discretize.md b/doc/visual-programming/source/widgets/data/discretize.md index a019c7c046a..d58557bf292 100644 --- a/doc/visual-programming/source/widgets/data/discretize.md +++ b/doc/visual-programming/source/widgets/data/discretize.md @@ -1,7 +1,7 @@ Discretize ========== -Discretizes continuous attributes from an input dataset. 
+Converts numeric attributes to categorical. **Inputs** @@ -11,24 +11,36 @@ Discretizes continuous attributes from an input dataset. - Data: dataset with discretized values -The **Discretize** widget [discretizes](https://en.wikipedia.org/wiki/Discretization) continuous attributes with a selected method. +The **Discretize** widget [discretizes](https://en.wikipedia.org/wiki/Discretization) numeric variables. -![](images/Discretize-All-stamped.png) +![](images/Discretize.png) -1. The basic version of the widget is rather simple. It allows choosing between three different discretizations. - - [Entropy-MDL](http://ijcai.org/Past%20Proceedings/IJCAI-93-VOL2/PDF/022.pdf), invented by Fayyad and Irani is a top-down discretization, which recursively splits the attribute at a cut maximizing information gain, until the gain is lower than the minimal description length of the cut. This discretization can result in an arbitrary number of intervals, including a single interval, in which case the attribute is discarded as useless (removed). - - [Equal-frequency](http://www.saedsayad.com/unsupervised_binning.htm) splits the attribute into a given number of intervals, so that they each contain approximately the same number of instances. - - [Equal-width](https://en.wikipedia.org/wiki/Data_binning) evenly splits the range between the smallest and the largest observed value. The *Number of intervals* can be set manually. - - The widget can also be set to leave the attributes continuous or to remove them. -2. To treat attributes individually, go to **Individual Attribute Settings**. They show a specific discretization of each attribute and allow changes. First, the top left list shows the cut-off points for each attribute. 
In the snapshot, we used the entropy-MDL discretization, which determines the optimal number of intervals automatically; we can see it discretized the age into seven intervals with cut-offs at 21.50, 23.50, 27.50, 35.50, 43.50, 54.50 and 61.50, respectively, while the capital-gain got split into many intervals with several cut-offs. The final weight (fnlwgt), for instance, was left with a single interval and thus removed. -On the right, we can select a specific discretization method for each attribute. Attribute *“fnlwgt”* would be removed by the MDL-based discretization, so to prevent its removal, we select the attribute and choose, for instance, **Equal-frequency discretization**. We could also choose to leave the attribute continuous. -3. Produce a report. -4. Tick *Apply automatically* for the widget to automatically commit changes. Alternatively, press *Apply*. +1. Set the default method for discretization. + +2. Select variables to set specific discretization methods for each. Hovering over a variable shows its intervals. + +3. Discretization methods + + - **Keep numeric** keeps the variable as it is. + - **Remove** removes the variable. + - **Natural binning** finds nice thresholds for the variable's range of values, for instance 10, 20, 30 or 0.2, 0.4, 0.6, 0.8. We can set the desired number of bins; the actual number will depend on the interval. + - **Fixed width** uses a user-defined bin width. Boundaries of bins will be multiples of the width. For instance, if the width is 10 and the variable's values range from 35 to 68, the resulting bins will be <40, 40-50, 50-60, >60. This method does not work for time variables. If the width is too large (resulting in a single interval) or too small (resulting in more than 100 intervals), the variable is removed. + - **Time interval** is similar to Fixed width, but for time variables. We specify the width and a time unit, e.g. 4 months or 3 days. Bin boundaries will be multiples of the interval; e.g.
with 4 months, bins will always be Jan-Apr, May-Aug and Sep-Dec. + - **[Equal-frequency](http://www.saedsayad.com/unsupervised_binning.htm)** splits the attribute into a given number of intervals with approximately the same number of instances. + - **[Equal-width](https://en.wikipedia.org/wiki/Data_binning)** evenly splits the range between the smallest and the largest observed value. + - **[Entropy-MDL](http://ijcai.org/Past%20Proceedings/IJCAI-93-VOL2/PDF/022.pdf)** is a top-down discretization invented by Fayyad and Irani, which recursively splits the attribute at a cut maximizing information gain, until the gain is lower than the minimal description length of the cut. This discretization can result in an arbitrary number of intervals, including a single interval, in which case the variable is discarded as useless (removed). + - **Custom** allows entering an increasing, comma-separated list of thresholds. This is not applicable to time variables. + - **Use default setting** (enabled only for individual variables, not for the default setting) sets the method to the one specified as "Default setting". + +4. The CC button sets the method for the currently selected variables to Custom, using their current thresholds. This allows for manual editing of automatically determined bins. Example ------- -In the schema below, we show the *Iris* dataset with continuous attributes -(as in the original data file) and with discretized attributes. +In the schema below, we took the *Heart disease* data set and +- discretized *age* to a fixed interval of 10 (years), +- *max HR* to approximately 6 bins (the closest match was 7 bins with a width of 25), +- removed *Cholesterol*, +- and used *Entropy-MDL* for the remaining variables, which resulted in removing *rest SBP* and in two intervals for *ST by exercise* and *major vessels colored*.
![](images/Discretize-Example.png) diff --git a/doc/visual-programming/source/widgets/data/images/Discretize-All-stamped.png b/doc/visual-programming/source/widgets/data/images/Discretize-All-stamped.png deleted file mode 100644 index a504dbe4698..00000000000 Binary files a/doc/visual-programming/source/widgets/data/images/Discretize-All-stamped.png and /dev/null differ diff --git a/doc/visual-programming/source/widgets/data/images/Discretize-All.png b/doc/visual-programming/source/widgets/data/images/Discretize-All.png deleted file mode 100644 index 51f224d5d6f..00000000000 Binary files a/doc/visual-programming/source/widgets/data/images/Discretize-All.png and /dev/null differ diff --git a/doc/visual-programming/source/widgets/data/images/Discretize-Example.png b/doc/visual-programming/source/widgets/data/images/Discretize-Example.png index 67dc0540deb..3db56a32d89 100644 Binary files a/doc/visual-programming/source/widgets/data/images/Discretize-Example.png and b/doc/visual-programming/source/widgets/data/images/Discretize-Example.png differ diff --git a/doc/visual-programming/source/widgets/data/images/Discretize.png b/doc/visual-programming/source/widgets/data/images/Discretize.png new file mode 100644 index 00000000000..11f80cf05c8 Binary files /dev/null and b/doc/visual-programming/source/widgets/data/images/Discretize.png differ