Skip to content

Commit

Permalink
Fix guessing strategy for date and time variables
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Nov 27, 2019
1 parent 2e07ccf commit 8665c52
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 17 deletions.
33 changes: 21 additions & 12 deletions Orange/data/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,30 +145,39 @@ def guess_data_type(orig_values, namask=None):
"""
valuemap, values = None, orig_values
is_discrete = is_discrete_values(orig_values)
orig_values = np.asarray(orig_values, dtype=str)
if namask is None:
namask = isnastr(orig_values)
if is_discrete:
valuemap = sorted(is_discrete)
coltype = DiscreteVariable
else:
# try to parse as float
orig_values = np.asarray(orig_values)
if namask is None:
namask = isnastr(orig_values)
values = np.empty_like(orig_values, dtype=float)
values[namask] = np.nan
try:
np.copyto(values, orig_values, where=~namask, casting="unsafe")
except ValueError:
tvar = TimeVariable('_')
try:
values[~namask] = [tvar.parse(i) for i in orig_values[~namask]]
except ValueError:
coltype = StringVariable
# return original_values
values = orig_values
else:
coltype = TimeVariable
values = orig_values
coltype = StringVariable
else:
coltype = ContinuousVariable

if coltype is not ContinuousVariable:
# A column that is not continuous can still be a time variable, even if it
# was previously recognized as discrete.
tvar = TimeVariable('_')
# A separate array is used so that orig_values and values are not overwritten.
temp_values = np.empty_like(orig_values, dtype=float)
try:
temp_values[~namask] = [
tvar.parse_exact_iso(i) for i in orig_values[~namask]]
except ValueError:
pass
else:
valuemap = None
coltype = TimeVariable
values = temp_values
return valuemap, values, coltype


Expand Down
27 changes: 26 additions & 1 deletion Orange/data/tests/test_io.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import unittest
import numpy as np

from Orange.data import ContinuousVariable, DiscreteVariable, StringVariable
from Orange.data import ContinuousVariable, DiscreteVariable, StringVariable, \
TimeVariable
from Orange.data.io import guess_data_type


Expand Down Expand Up @@ -68,3 +69,27 @@ def test_guess_data_type_string(self):
self.assertEqual(StringVariable, coltype)
self.assertIsNone(valuemap)
np.testing.assert_array_equal(in_values, values)

def test_guess_data_type_time(self):
    """Columns of ISO date / date-time strings are guessed as TimeVariable.

    Each case is a column with repeated values; for every case the guessed
    column type must be TimeVariable and no value map may be returned.
    """
    columns = (
        # date only
        ["2019-10-10", "2019-10-10", "2019-10-10", "2019-10-01"],
        # date and time separated by 'T'
        ["2019-10-10T12:08:51", "2019-10-10T12:08:51",
         "2019-10-10T12:08:51", "2019-10-01T12:08:51"],
        # date and time separated by a space
        ["2019-10-10 12:08:51", "2019-10-10 12:08:51",
         "2019-10-10 12:08:51", "2019-10-01 12:08:51"],
        # date and time without seconds
        ["2019-10-10 12:08", "2019-10-10 12:08",
         "2019-10-10 12:08", "2019-10-01 12:08"],
    )
    for in_values in columns:
        valuemap, _, coltype = guess_data_type(in_values)
        self.assertEqual(TimeVariable, coltype)
        self.assertIsNone(valuemap)
23 changes: 19 additions & 4 deletions Orange/data/variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -889,6 +889,13 @@ class TimeVariable(ContinuousVariable):
r'\d{2}\d{2}\d{2}\.\d+|'
r'\d{1,4}(-?\d{2,3})?'
r')$')

class InvalidDateTimeFormatError(ValueError):
    """ValueError whose message reports the rejected datetime string."""

    def __init__(self, date_string):
        # Build the message first, then hand it to ValueError.
        message = (
            "Invalid datetime format '{}'. "
            "Only ISO 8601 supported."
        ).format(date_string)
        super().__init__(message)

_matches_iso_format = re.compile(REGEX).match

# UTC offset and associated timezone. If parsed datetime values provide an
Expand Down Expand Up @@ -954,16 +961,14 @@ def parse(self, datestr):
return Unknown
datestr = datestr.strip().rstrip('Z')

ERROR = ValueError("Invalid datetime format '{}'. "
"Only ISO 8601 supported.".format(datestr))
if not self._matches_iso_format(datestr):
try:
# If it is a number, assume it is a unix timestamp
value = float(datestr)
self.have_date = self.have_time = 1
return value
except ValueError:
raise ERROR
raise self.InvalidDateTimeFormatError(datestr)

for i, (have_date, have_time, fmt) in enumerate(self._ISO_FORMATS):
try:
Expand All @@ -984,7 +989,7 @@ def parse(self, datestr):
self.UNIX_EPOCH.day)
break
else:
raise ERROR
raise self.InvalidDateTimeFormatError(datestr)

# Remember UTC offset. If not all parsed values share the same offset,
# remember none of it.
Expand All @@ -1010,6 +1015,16 @@ def parse(self, datestr):
except OverflowError:
return -(self.UNIX_EPOCH - dt).total_seconds()

def parse_exact_iso(self, datestr):
    """
    Parse `datestr` with `parse`, but only if it matches the ISO format.

    `parse` falls back to interpreting a string that does not match the
    ISO regex as a float (unix timestamp). This method rejects such
    strings instead.

    The ISO check is made on the string normalized the same way `parse`
    normalizes it (surrounding whitespace and trailing 'Z' removed), so
    that every ISO string accepted by `parse` is accepted here as well.

    Raises `InvalidDateTimeFormatError` (a `ValueError`) when the
    normalized string does not match the ISO regex.
    """
    # Mirror the normalization done at the start of `parse`; the original
    # string is still passed on, since `parse` normalizes it itself.
    if not self._matches_iso_format(datestr.strip().rstrip('Z')):
        raise self.InvalidDateTimeFormatError(datestr)
    return self.parse(datestr)

def to_val(self, s):
"""
Convert a value, given as an instance of an arbitrary type, to a float.
Expand Down

0 comments on commit 8665c52

Please sign in to comment.