Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FIX] Fix guessing strategy for date and time variables #4226

Merged
merged 1 commit into from
Nov 29, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 21 additions & 12 deletions Orange/data/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,30 +145,39 @@ def guess_data_type(orig_values, namask=None):
"""
valuemap, values = None, orig_values
is_discrete = is_discrete_values(orig_values)
orig_values = np.asarray(orig_values, dtype=str)
if namask is None:
namask = isnastr(orig_values)
if is_discrete:
valuemap = sorted(is_discrete)
coltype = DiscreteVariable
else:
# try to parse as float
orig_values = np.asarray(orig_values)
if namask is None:
namask = isnastr(orig_values)
values = np.empty_like(orig_values, dtype=float)
values[namask] = np.nan
try:
np.copyto(values, orig_values, where=~namask, casting="unsafe")
except ValueError:
tvar = TimeVariable('_')
try:
values[~namask] = [tvar.parse(i) for i in orig_values[~namask]]
except ValueError:
coltype = StringVariable
# return original_values
values = orig_values
else:
coltype = TimeVariable
values = orig_values
coltype = StringVariable
else:
coltype = ContinuousVariable

if coltype is not ContinuousVariable:
# when it is not a continuous variable, it can still be a time variable,
# even if it was previously recognized as discrete
tvar = TimeVariable('_')
# introducing a new variable prevents overwriting orig_values and values
temp_values = np.empty_like(orig_values, dtype=float)
try:
temp_values[~namask] = [
tvar.parse_exact_iso(i) for i in orig_values[~namask]]
except ValueError:
pass
else:
valuemap = None
coltype = TimeVariable
values = temp_values
return valuemap, values, coltype


Expand Down
27 changes: 26 additions & 1 deletion Orange/data/tests/test_io.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import unittest
import numpy as np

from Orange.data import ContinuousVariable, DiscreteVariable, StringVariable
from Orange.data import ContinuousVariable, DiscreteVariable, StringVariable, \
TimeVariable
from Orange.data.io import guess_data_type


Expand Down Expand Up @@ -68,3 +69,27 @@ def test_guess_data_type_string(self):
self.assertEqual(StringVariable, coltype)
self.assertIsNone(valuemap)
np.testing.assert_array_equal(in_values, values)

def test_guess_data_type_time(self):
    """
    Columns of date / date-time strings are guessed as `TimeVariable`
    and no value map is returned for them.
    """
    # One column per textual layout: date only, 'T'-separated date-time,
    # space-separated date-time, and date-time without seconds.
    columns = (
        ["2019-10-10", "2019-10-10", "2019-10-10", "2019-10-01"],
        ["2019-10-10T12:08:51", "2019-10-10T12:08:51",
         "2019-10-10T12:08:51", "2019-10-01T12:08:51"],
        ["2019-10-10 12:08:51", "2019-10-10 12:08:51",
         "2019-10-10 12:08:51", "2019-10-01 12:08:51"],
        ["2019-10-10 12:08", "2019-10-10 12:08",
         "2019-10-10 12:08", "2019-10-01 12:08"],
    )
    for in_values in columns:
        valuemap, _, coltype = guess_data_type(in_values)
        self.assertEqual(TimeVariable, coltype)
        self.assertIsNone(valuemap)
23 changes: 19 additions & 4 deletions Orange/data/variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -889,6 +889,13 @@ class TimeVariable(ContinuousVariable):
r'\d{2}\d{2}\d{2}\.\d+|'
r'\d{1,4}(-?\d{2,3})?'
r')$')

class InvalidDateTimeFormatError(ValueError):
    """
    Error raised for a date string that is not in a supported format.

    The offending string is embedded in the exception message.
    """

    def __init__(self, date_string):
        # Adjacent literals are joined first, so `format` fills the single
        # placeholder of the complete sentence.
        message = ("Invalid datetime format '{}'. "
                   "Only ISO 8601 supported.").format(date_string)
        super().__init__(message)

_matches_iso_format = re.compile(REGEX).match

# UTC offset and associated timezone. If parsed datetime values provide an
Expand Down Expand Up @@ -954,16 +961,14 @@ def parse(self, datestr):
return Unknown
datestr = datestr.strip().rstrip('Z')

ERROR = ValueError("Invalid datetime format '{}'. "
"Only ISO 8601 supported.".format(datestr))
if not self._matches_iso_format(datestr):
try:
# If it is a number, assume it is a unix timestamp
value = float(datestr)
self.have_date = self.have_time = 1
return value
except ValueError:
raise ERROR
raise self.InvalidDateTimeFormatError(datestr)

for i, (have_date, have_time, fmt) in enumerate(self._ISO_FORMATS):
try:
Expand All @@ -984,7 +989,7 @@ def parse(self, datestr):
self.UNIX_EPOCH.day)
break
else:
raise ERROR
raise self.InvalidDateTimeFormatError(datestr)

# Remember UTC offset. If not all parsed values share the same offset,
# remember none of it.
Expand All @@ -1010,6 +1015,16 @@ def parse(self, datestr):
except OverflowError:
return -(self.UNIX_EPOCH - dt).total_seconds()

def parse_exact_iso(self, datestr):
    """
    Parse `datestr` only when it matches the ISO format pattern.

    This is a stricter front end to `parse`: `parse` also accepts a
    number-like string and treats it as a unix timestamp, whereas this
    method rejects anything the ISO pattern does not match.

    :param datestr: the string to parse
    :return: whatever `parse` returns for `datestr`
    :raises InvalidDateTimeFormatError: if `datestr` does not match the
        ISO format pattern
    """
    if self._matches_iso_format(datestr):
        return self.parse(datestr)
    raise self.InvalidDateTimeFormatError(datestr)

def to_val(self, s):
"""
Convert a value, given as an instance of an arbitrary type, to a float.
Expand Down