Skip to content

Commit

Permalink
Merge pull request #6878 from janezd/unique-discrete-values
Browse files Browse the repository at this point in the history
DiscreteVariable: Raise an exception if values are not unique
  • Loading branch information
markotoplak authored Nov 8, 2024
2 parents 52df166 + a410361 commit 871b406
Show file tree
Hide file tree
Showing 6 changed files with 280 additions and 38 deletions.
4 changes: 4 additions & 0 deletions Orange/data/tests/test_variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,10 @@ def test_no_duplicated_values(self):
self.assertEqual(list(a.values), ["a", "b", "c"])
self.assertEqual(list(a._value_index), ["a", "b", "c"])

def test_no_duplicates_in_constructor(self):
self.assertRaises(ValueError, DiscreteVariable,
"foo", values=("a", "b", "a"))

def test_unpickle(self):
d1 = DiscreteVariable("A", values=("two", "one"))
s = pickle.dumps(d1)
Expand Down
2 changes: 2 additions & 0 deletions Orange/data/variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -690,6 +690,8 @@ def __init__(
values = tuple(values) # some people (including me) pass a generator
if not all(isinstance(value, str) for value in values):
raise TypeError("values of DiscreteVariables must be strings")
if len(set(values)) < len(values):
raise ValueError("Duplicate values in DiscreteVariable")

super().__init__(name, compute_value, sparse=sparse)
self._values = values
Expand Down
105 changes: 78 additions & 27 deletions Orange/preprocess/discretize.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from numbers import Number
from typing import NamedTuple, List, Union, Callable, Optional
import datetime
from itertools import count
from itertools import count, chain

import numpy as np
import scipy.sparse as sp
Expand Down Expand Up @@ -49,38 +49,89 @@ def transform(self, c):
return np.array([], dtype=int)

@staticmethod
def _fmt_interval(low, high, formatter):
assert low is not None or high is not None
def _fmt_interval(low, high, formatter, strip_zeros=True):
assert low is None or high is None or low < high
if low is None or np.isinf(low):
return f"< {formatter(high)}"
if high is None or np.isinf(high):
return f"≥ {formatter(low)}"
return f"{formatter(low)} - {formatter(high)}"

def strip0(s):
if strip_zeros and re.match(r"^\d+\.\d+", s):
return s.rstrip("0").rstrip(".")
return s

lows = (low is not None and not np.isinf(low)
and strip0(formatter(low)))
highs = (high is not None and not np.isinf(high)
and strip0(formatter(high)))
assert lows or highs
if lows == highs:
raise ValueError(f"Formatter returned identical thresholds: {lows}")

if not lows:
return f"< {highs}"
if not highs:
return f"≥ {lows}"
return f"{lows} - {highs}"

@classmethod
def create_discretized_var(cls, var, points, ndigits=None):
if ndigits is None:
def fmt(val):
sval = var.str_val(val)
# For decimal numbers, remove trailing 0's and . if no decimals left
if re.match(r"^\d+\.\d+", sval):
return sval.rstrip("0").rstrip(".")
return sval
else:
def fmt(val):
return f"{val:.{ndigits}f}"

lpoints = list(points)
if lpoints:
values = [
cls._fmt_interval(low, high, fmt)
for low, high in zip([-np.inf] + lpoints, lpoints + [np.inf])]
to_sql = BinSql(var, lpoints)
else:
def _get_labels(cls, fmt, points, strip_zeros=True):
return [
cls._fmt_interval(low, high, fmt, strip_zeros=strip_zeros)
for low, high in zip(
chain([-np.inf], points),
chain(points, [np.inf]))]

@classmethod
def _get_discretized_values(cls, var, points, ndigits=None):
if len(points) == 0:
values = ["single_value"]
to_sql = SingleValueSql(values[0])
return points, values, to_sql

npoints = np.array(points, dtype=np.float64)
if len(points) > 1:
mindiff = np.min(npoints[1:] - npoints[:-1])
if mindiff == 0:
raise ValueError("Some interval thresholds are identical")
else:
mindiff = 1 # prevent warnings

if ndigits is None or len(points) == 1:
try:
values = cls._get_labels(var.str_val, points)
except ValueError: # points would create identical formatted thresholds
pass
else:
if len(values) == len(set(values)):
to_sql = BinSql(var, points)
return points, values, to_sql

mindigits = max(ndigits or 0,
int(-np.log10(mindiff)))
maxdigits = np.finfo(npoints.dtype).precision + 2
for digits in range(mindigits, maxdigits + 1):
# ensure that builtin round is used for compatibility with float formatting
# de-numpyize points p (otherwise np.floats use numpy's round)
npoints = [round(float(p), digits) for p in points]
if len(npoints) == len(set(npoints)):
def fmt_fixed(val):
# We break the loop, pylint: disable=cell-var-from-loop
return f"{val:.{digits}f}"

points = list(npoints)
break
else:
# pragma: no cover
assert False

values = cls._get_labels(
fmt_fixed, points,
strip_zeros=digits != ndigits)
assert len(values) == len(set(values))
to_sql = BinSql(var, points)
return points, values, to_sql

@classmethod
def create_discretized_var(cls, var, points, ndigits=None):
points, values, to_sql = cls._get_discretized_values(var, points, ndigits)
dvar = DiscreteVariable(name=var.name, values=values,
compute_value=cls(var, points),
sparse=var.sparse)
Expand Down
182 changes: 180 additions & 2 deletions Orange/preprocess/tests/test_discretize.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
from Orange.data import ContinuousVariable, TimeVariable, Table, Domain
from Orange.preprocess.discretize import \
_time_binnings, time_binnings, BinDefinition, Discretizer, FixedWidth, \
FixedTimeWidth , Binning, \
TooManyIntervals
FixedTimeWidth, Binning, \
TooManyIntervals, SingleValueSql, BinSql


class TestFixedWidth(unittest.TestCase):
Expand Down Expand Up @@ -1101,6 +1101,184 @@ def test_equality(self):
self.assertNotEqual(t1, t1a)
self.assertNotEqual(hash(t1), hash(t1a))

def test_fmt_interval(self):
def fmt(x):
return f"{x:.2f}"

f = Discretizer._fmt_interval
self.assertEqual(f(1, 2, str), "1 - 2")
self.assertEqual(f(1, 2, fmt), "1 - 2")
self.assertEqual(f(1, 2, fmt, strip_zeros=False), "1.00 - 2.00")

self.assertEqual(f(-np.inf, 2, fmt), "< 2")
self.assertEqual(f(-np.inf, 2, fmt, strip_zeros=False), "< 2.00")
self.assertEqual(f(None, 2, fmt), "< 2")
self.assertEqual(f(None, 2, fmt, strip_zeros=False), "< 2.00")

self.assertEqual(f(2, np.inf, fmt), "≥ 2")
self.assertEqual(f(2, np.inf, fmt, strip_zeros=False), "≥ 2.00")
self.assertEqual(f(2, None, fmt), "≥ 2")
self.assertEqual(f(2, None, fmt, strip_zeros=False), "≥ 2.00")

with self.assertRaises(ValueError):
f(1.122, 1.123, fmt)


def test_get_labels(self):
points = [2.46, 2.68, 2.794]
self.assertEqual(
Discretizer._get_labels(lambda x: f"{x:.1f}", points),
['< 2.5', '2.5 - 2.7', '2.7 - 2.8', '≥ 2.8']
)
self.assertEqual(
Discretizer._get_labels(lambda x: f"{x:.2f}", points),
['< 2.46', '2.46 - 2.68', '2.68 - 2.79', '≥ 2.79']
)
self.assertEqual(
Discretizer._get_labels(lambda x: f"{x:.4f}", points),
['< 2.46', '2.46 - 2.68', '2.68 - 2.794', '≥ 2.794']
)

self.assertEqual(
Discretizer._get_labels(lambda x: f"{x:.4f}", points,
strip_zeros=False),
['< 2.4600', '2.4600 - 2.6800', '2.6800 - 2.7940', '≥ 2.7940']
)

self.assertEqual(
Discretizer._get_labels(lambda x: f"{x:.4f}", [100, 200]),
['< 100', '100 - 200', '≥ 200']
)

self.assertEqual(
Discretizer._get_labels(lambda x: f"{x:.4f}", [0]),
['< 0', '≥ 0']
)

def test_get_discretized_values_empty(self):
x = ContinuousVariable("x")
d = Discretizer(x, [])
points, values, to_sql = d._get_discretized_values(None, np.array([]))
self.assertEqual(len(points), 0)
self.assertEqual(values, ["single_value"])
self.assertIsInstance(to_sql, SingleValueSql)

def test_get_discretized_values_identical_points(self):
x = ContinuousVariable("x")
with self.assertRaises(ValueError):
Discretizer._get_discretized_values(x, np.array([0, 1, 1, 2]))

def test_get_discretized_values_no_ndigits(self):
x = ContinuousVariable("x", number_of_decimals=2)
points, values, to_sql \
= Discretizer._get_discretized_values(x, np.array([1, 2, 3, 4]))
np.testing.assert_equal(points, [1, 2, 3, 4])
self.assertEqual(values, ['< 1', '1 - 2', '2 - 3', '3 - 4', '≥ 4'])
self.assertIsInstance(to_sql, BinSql)

points, values, to_sql \
= Discretizer._get_discretized_values(x, np.array([1, 2.1, 3, 4]))
np.testing.assert_equal(points, [1, 2.1, 3, 4])
self.assertEqual(values, ['< 1', '1 - 2.1', '2.1 - 3', '3 - 4', '≥ 4'])
self.assertIsInstance(to_sql, BinSql)

points, values, to_sql \
= Discretizer._get_discretized_values(x,
np.array([1, 2.1, 3.1234, 4]))
np.testing.assert_equal(points, [1, 2.1, 3.1234, 4])
self.assertEqual(values, ['< 1', '1 - 2.1', '2.1 - 3.1234', '3.1234 - 4', '≥ 4'])
self.assertIsInstance(to_sql, BinSql)

points, values, to_sql \
= Discretizer._get_discretized_values(x,
np.array([1,
2.1000000001,
2.1000000002,
4]))
np.testing.assert_equal(points, [1, 2.1000000001, 2.1000000002, 4])
self.assertEqual(values, ['< 1',
'1 - 2.1000000001',
'2.1000000001 - 2.1000000002',
'2.1000000002 - 4',
'≥ 4'])
self.assertIsInstance(to_sql, BinSql)

points, values, to_sql \
= Discretizer._get_discretized_values(x,
np.array([1,
2.1000000001,
2.1000000002,
2.1000000003]))
np.testing.assert_equal(points,
[1, 2.1000000001, 2.1000000002, 2.1000000003])
self.assertEqual(values, ['< 1',
'1 - 2.1000000001',
'2.1000000001 - 2.1000000002',
'2.1000000002 - 2.1000000003',
'≥ 2.1000000003'])
self.assertIsInstance(to_sql, BinSql)

points, values, to_sql \
= Discretizer._get_discretized_values(x,
np.array([1,
2.1000000001,
2.100000000211,
2.1000000003]))
np.testing.assert_equal(points,
[1, 2.1000000001, 2.1000000002, 2.1000000003])
self.assertEqual(values, ['< 1',
'1 - 2.1000000001',
'2.1000000001 - 2.1000000002',
'2.1000000002 - 2.1000000003',
'≥ 2.1000000003'])
self.assertIsInstance(to_sql, BinSql)

def test_get_discretized_values_round_builtin_vs_numpy(self):
x = ContinuousVariable("x", number_of_decimals=0)
points, values, _ \
= Discretizer._get_discretized_values(x,
np.array([2.3455,
2.346]))
np.testing.assert_equal(points,
[2.345, 2.346])
self.assertEqual(values, ['< 2.345',
'2.345 - 2.346',
'≥ 2.346'])

points, values, _ \
= Discretizer._get_discretized_values(x,
np.array([2.1345,
2.135]))
np.testing.assert_equal(points,
[2.1345, 2.135])
self.assertEqual(values, ['< 2.1345',
'2.1345 - 2.135',
'≥ 2.135'])

def test_get_discretized_values_with_ndigits(self):
x = ContinuousVariable("x")
apoints = [1, 2, 3, 4]
points, values, to_sql \
= Discretizer._get_discretized_values(x, apoints, ndigits=0)
np.testing.assert_array_equal(points, np.array([1, 2, 3, 4]))
self.assertEqual(values, ['< 1', '1 - 2', '2 - 3', '3 - 4', '≥ 4'])
self.assertIsInstance(to_sql, BinSql)

points, values, to_sql \
= Discretizer._get_discretized_values(x, apoints, ndigits=2)
np.testing.assert_array_equal(points, np.array([1, 2, 3, 4]))
self.assertEqual(values, ['< 1.00', '1.00 - 2.00', '2.00 - 3.00',
'3.00 - 4.00', '≥ 4.00'])
self.assertIsInstance(to_sql, BinSql)

apoints = [1, 2.1234, 2.1345, 4]
points, values, to_sql \
= Discretizer._get_discretized_values(x, apoints, ndigits=1)
np.testing.assert_array_equal(points, np.array([1, 2.12, 2.13, 4]))
self.assertEqual(values,
['< 1', '1 - 2.12', '2.12 - 2.13', '2.13 - 4', '≥ 4'])
self.assertIsInstance(to_sql, BinSql)


if __name__ == '__main__':
unittest.main()
4 changes: 2 additions & 2 deletions Orange/widgets/data/tests/test_oweditdomain.py
Original file line number Diff line number Diff line change
Expand Up @@ -1431,7 +1431,7 @@ def test_as_time(self):
["07.02.2022 01:02:03", "18.04.2021 01:02:03"], # datetime
# datetime with timezone
["2021-02-08 01:02:03+01:00", "2021-02-07 01:02:03+01:00"],
["010203", "010203"], # time
["010203", "010204"], # time
["02-07", "04-18"],
)
formats = [
Expand All @@ -1441,7 +1441,7 @@ def test_as_time(self):
[d("2022-02-07"), d("2021-04-18")],
[d("2022-02-07 01:02:03"), d("2021-04-18 01:02:03")],
[d("2021-02-08 01:02:03+0100"), d("2021-02-07 01:02:03+0100")],
[d("01:02:03"), d("01:02:03")],
[d("01:02:03"), d("01:02:04")],
[d("1900-02-07"), d("1900-04-18")],
]
variables = [StringVariable(f"s{i}") for i in range(len(times))]
Expand Down
21 changes: 14 additions & 7 deletions i18n/si/msgs.jaml
Original file line number Diff line number Diff line change
Expand Up @@ -1615,6 +1615,7 @@ data/variable.py:
categorical: false
def `__init__`:
values of DiscreteVariables must be strings: false
Duplicate values in DiscreteVariable: false
def `get_mapper_from`:
def `mapper`:
In-place mapping of sparse matrices must map 0 to 0: false
Expand Down Expand Up @@ -2611,13 +2612,19 @@ preprocess/discretize.py:
BinDefinition: false
class `Discretizer`:
def `_fmt_interval`:
< {formatter(high)}: false
≥ {formatter(low)}: false
{formatter(low)} - {formatter(high)}: false
def `create_discretized_var`:
def `fmt`:
{val:.{ndigits}f}: false
single_value: false
def `strip0`:
^\d+\.\d+: false
0: false
.: false
'Formatter returned identical thresholds: {lows}': false
< {highs}: true
≥ {lows}: true
{lows} - {highs}: true
def `_get_discretized_values`:
single_value: konstanta
Some interval thresholds are identical: false
def `fmt_fixed`:
{val:.{digits}f}: false
class `BinSql`:
def `__call__`:
'width_bucket({self.var.to_sql()}, ': false
Expand Down

0 comments on commit 871b406

Please sign in to comment.