Skip to content

Commit

Permalink
pandas_compat: fix conversion of datetime
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Aug 10, 2021
1 parent 376530d commit 25b7a48
Show file tree
Hide file tree
Showing 3 changed files with 310 additions and 10 deletions.
57 changes: 49 additions & 8 deletions Orange/data/pandas_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,13 +157,56 @@ def _is_datetime(s):
return True
try:
if is_object_dtype(s):
pd.to_datetime(s, infer_datetime_format=True)
# utc=True - to allow different timezones in a series object
pd.to_datetime(s, infer_datetime_format=True, utc=True)
return True
except Exception: # pylint: disable=broad-except
pass
return False


def _convert_datetime(series, var):
    """
    Convert a datetime-like series to seconds relative to a reference date.

    Besides returning the numeric values, the function records what the
    column contains by setting ``var.have_date`` and ``var.have_time`` and,
    when the parsed data is timezone-aware, ``var.timezone``.

    Parameters
    ----------
    series : pd.Series
        Column with values that ``pd.to_datetime`` can parse.
    var : object
        Variable describing the column (``vars_from_df`` pairs this
        converter with a ``TimeVariable``); it is modified in place.

    Returns
    -------
    np.ndarray
        Seconds since 1970-01-01 for date and datetime columns (timezone-aware
        values are converted to UTC first), or seconds since midnight for
        time-only columns. Missing values become nan.
    """
    # Evaluate "today" exactly once: it is used both to detect time-only
    # columns and as the reference subtracted from them. Two separate
    # evaluations could disagree if the call happens to span midnight.
    today = pd.Timestamp("now").normalize()

    def col_type(dt):
        """Test if is date, time or datetime"""
        dt_nonnat = dt[~pd.isnull(dt)]  # nat == nat is False
        if (dt_nonnat.dt.floor("d") == dt_nonnat).all():
            # all times are 00:00:00.0 - pure date
            return 1, 0
        if (dt_nonnat.dt.date == today.date()).all():
            # all dates are today's date - pure time
            return 0, 1
        # else datetime
        return 1, 1

    try:
        dt = pd.to_datetime(series)
    except ValueError:
        # series with type object and different timezones will raise a
        # ValueError - normalizing to utc
        dt = pd.to_datetime(series, utc=True)
    if not pd.api.types.is_datetime64_any_dtype(dt):
        # Depending on the pandas version, values with mixed UTC offsets are
        # returned as an object series of datetime objects instead of raising.
        # Such a series has no ``.dt`` accessor, so normalize to UTC as well.
        dt = pd.to_datetime(series, utc=True)

    # set variable type to date, time or datetime
    var.have_date, var.have_time = col_type(dt)

    if dt.dt.tz is not None:
        # set timezone if available and convert to utc
        var.timezone = dt.dt.tz
        dt = dt.dt.tz_convert("UTC")

    if var.have_time and not var.have_date:
        # if time only measure seconds from midnight - equal to setting date
        # to unix epoch
        reference = today
    else:
        reference = pd.Timestamp("1970-01-01")

    return ((dt.dt.tz_localize(None) - reference) / pd.Timedelta("1s")).values


def vars_from_df(df, role=None, force_nominal=False):
if role is None and hasattr(df, 'orange_role'):
_role = df.orange_role
Expand Down Expand Up @@ -210,6 +253,11 @@ def vars_from_df(df, role=None, force_nominal=False):
Mcols.append(column)
Mexpr.append(None)
metas.append(var)
elif _is_datetime(s):
var = TimeVariable(str(column))
attrs.append(var)
Xcols.append(column)
Xexpr.append(_convert_datetime)
elif _is_discrete(s, force_nominal):
discrete = s.astype('category').cat
var = DiscreteVariable(str(column),
Expand All @@ -224,13 +272,6 @@ def to_cat(s, _):
return np.asarray(x)

Xexpr.append(to_cat)
elif _is_datetime(s):
var = TimeVariable(str(column))
attrs.append(var)
Xcols.append(column)
Xexpr.append(lambda s, v: np.asarray(
s.astype('str').replace('NaT', np.nan).map(v.parse)
))
elif is_numeric_dtype(s):
var = ContinuousVariable(
# set number of decimals to 0 if int else keeps default behaviour
Expand Down
245 changes: 244 additions & 1 deletion Orange/data/tests/test_pandas.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
# pylint: disable=import-outside-toplevel

import unittest
from datetime import date, datetime, timezone

import numpy as np
import pytz
from scipy.sparse import csr_matrix
import scipy.sparse as sp

from Orange.data import ContinuousVariable, DiscreteVariable, TimeVariable, Table, Domain, \
StringVariable
from Orange.data.pandas_compat import OrangeDataFrame
from Orange.data.tests.test_variable import TestTimeVariable

try:
import pandas as pd
Expand Down Expand Up @@ -164,6 +167,246 @@ def test_not_orangedf(self):
for v1, v2 in zip(vars1, vars2):
self.assertEqual(type(v1), type(v2))

def test_table_from_frame_date(self):
    """
    Date-only columns convert to timestamps and are flagged as dates.

    The same two dates are supplied as pandas Timestamps, as strings and
    as ``datetime.date`` objects; each variant must give the same values
    with ``have_date == 1`` and ``have_time == 0``.
    """
    from Orange.data.pandas_compat import table_from_frame

    variants = (
        [[pd.Timestamp("2017-12-19")], [pd.Timestamp("1724-12-20")], [np.nan]],
        [["2017-12-19"], ["1724-12-20"], [np.nan]],
        [[date(2017, 12, 19)], [date(1724, 12, 20)], [np.nan]],
    )
    for rows in variants:
        table = table_from_frame(pd.DataFrame(rows))
        np.testing.assert_equal(
            table.X,
            [
                [pd.Timestamp("2017-12-19").timestamp()],
                [pd.Timestamp("1724-12-20").timestamp()],
                [np.nan],
            ],
        )
        time_var = table.domain.variables[0]
        self.assertEqual(time_var.have_time, 0)
        self.assertEqual(time_var.have_date, 1)

def test_table_from_frame_time(self):
    """
    Time-only columns convert to seconds on 1970-01-01 and are flagged.

    The times are supplied once as pandas Timestamps and once as strings;
    both variants must give the same values with ``have_time == 1`` and
    ``have_date == 0``.
    """
    from Orange.data.pandas_compat import table_from_frame

    variants = (
        [[pd.Timestamp("00:00:00.25")], [pd.Timestamp("20:20:20.30")], [np.nan]],
        [["00:00:00.25"], ["20:20:20.30"], [np.nan]],
    )
    for rows in variants:
        table = table_from_frame(pd.DataFrame(rows))
        np.testing.assert_equal(
            table.X,
            [
                [pd.Timestamp("1970-01-01 00:00:00.25").timestamp()],
                [pd.Timestamp("1970-01-01 20:20:20.30").timestamp()],
                [np.nan],
            ],
        )
        time_var = table.domain.variables[0]
        self.assertEqual(time_var.have_time, 1)
        self.assertEqual(time_var.have_date, 0)

def test_table_from_frame_datetime(self):
    """
    Columns with both date and time convert to timestamps and are flagged.

    The values are supplied as pandas Timestamps, as strings and as
    ``datetime.datetime`` objects; each variant must give the same values
    with ``have_time == 1`` and ``have_date == 1``.
    """
    from Orange.data.pandas_compat import table_from_frame

    variants = (
        [
            [pd.Timestamp("2017-12-19 00:00:00.50")],
            [pd.Timestamp("1724-12-20 20:20:20.30")],
            [np.nan],
        ],
        [["2017-12-19 00:00:00.50"], ["1724-12-20 20:20:20.30"], [np.nan]],
        [
            [datetime(2017, 12, 19, 0, 0, 0, 500000)],
            [datetime(1724, 12, 20, 20, 20, 20, 300000)],
            [np.nan],
        ],
    )
    for rows in variants:
        table = table_from_frame(pd.DataFrame(rows))
        np.testing.assert_equal(
            table.X,
            [
                [pd.Timestamp("2017-12-19 00:00:00.50").timestamp()],
                [pd.Timestamp("1724-12-20 20:20:20.30").timestamp()],
                [np.nan],
            ],
        )
        time_var = table.domain.variables[0]
        self.assertEqual(time_var.have_time, 1)
        self.assertEqual(time_var.have_date, 1)

def test_table_from_frame_timezones(self):
    """
    Check the timezone stored on the variable and the converted values.

    Covers naive timestamps, explicit UTC ("Z"), a fixed "+1" offset, the
    named zone "CET", and a column mixing an aware and a naive timestamp.
    """
    from Orange.data.pandas_compat import table_from_frame

    def convert(stamp_a, stamp_b):
        # every frame is a single column: two timestamps and a missing value
        return table_from_frame(pd.DataFrame([[stamp_a], [stamp_b], [np.nan]]))

    def check(table, expected_tz, expected_a, expected_b):
        self.assertEqual(expected_tz, table.domain.variables[0].timezone)
        np.testing.assert_equal(
            table.X,
            [
                [pd.Timestamp(expected_a).timestamp()],
                [pd.Timestamp(expected_b).timestamp()],
                [np.nan],
            ],
        )

    # naive timestamps: no offset recorded, timezone stays at UTC
    table = convert(
        pd.Timestamp("2017-12-19 00:00:00"), pd.Timestamp("1724-12-20 20:20:20")
    )
    self.assertIsNone(table.domain.variables[0].utc_offset)
    self.assertEqual(table.domain.variables[0].timezone, timezone.utc)

    # explicit UTC
    check(
        convert(
            pd.Timestamp("2017-12-19 00:00:00Z"),
            pd.Timestamp("1724-12-20 20:20:20Z"),
        ),
        pytz.utc,
        "2017-12-19 00:00:00",
        "1724-12-20 20:20:20",
    )

    # fixed offset of one hour
    check(
        convert(
            pd.Timestamp("2017-12-19 00:00:00+1"),
            pd.Timestamp("1724-12-20 20:20:20+1"),
        ),
        pytz.FixedOffset(60),
        "2017-12-19 00:00:00+1",
        "1724-12-20 20:20:20+1",
    )

    # named timezone
    check(
        convert(
            pd.Timestamp("2017-12-19 00:00:00", tz="CET"),
            pd.Timestamp("1724-12-20 20:20:20", tz="CET"),
        ),
        pytz.timezone("CET"),
        "2017-12-19 00:00:00+1",
        "1724-12-20 20:20:20+1",
    )

    # aware and naive values mixed in one column
    check(
        convert(
            pd.Timestamp("2017-12-19 00:00:00", tz="CET"),
            pd.Timestamp("1724-12-20 20:20:20"),
        ),
        pytz.utc,
        "2017-12-19 00:00:00+1",
        "1724-12-20 20:20:20",
    )

def test_time_variable_compatible(self):
    """
    Values converted from pandas must agree with ``TimeVariable`` parsing.

    Every entry of ``TestTimeVariable.TESTS`` is parsed with
    ``TimeVariable.to_val`` and converted through ``table_from_frame``;
    entries that raise ``ValueError`` on the pandas side are skipped.
    """
    from Orange.data.pandas_compat import table_from_frame

    def to_df(val):
        return pd.DataFrame([[pd.Timestamp(val)]])

    for datestr, timestamp, outstr in TestTimeVariable.TESTS:
        var = TimeVariable("time")
        var_parse = var.to_val(datestr)
        try:
            pandas_parse = table_from_frame(to_df(datestr)).X[0, 0]
        except ValueError:
            # pandas cannot automatically parse some formats in the list
            # skip them
            continue
        if not (np.isnan(var_parse) and np.isnan(pandas_parse)):
            # nan == nan => False
            self.assertEqual(var_parse, pandas_parse)
            self.assertEqual(pandas_parse, timestamp)

        # compare the representation of both parsed values; comparing
        # repr_val(var_parse) with itself could never fail
        self.assertEqual(var.repr_val(var_parse), var.repr_val(pandas_parse))
        self.assertEqual(outstr, var.repr_val(var_parse))

@unittest.skip("Convert all Orange demo dataset. It takes about 5s which is way to slow")
def test_table_to_frame_on_all_orange_dataset(self):
from os import listdir
Expand Down
18 changes: 17 additions & 1 deletion Orange/data/variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -934,14 +934,30 @@ def __init__(self, date_string):
# UTC offset and associated timezone. If parsed datetime values provide an
# offset, it is used for display. If not all values have the same offset,
# +0000 (=UTC) timezone is used and utc_offset is set to False.
utc_offset = None
_utc_offset = None
timezone = timezone.utc

def __init__(self, *args, have_date=0, have_time=0, **kwargs):
    """
    Initialise the variable and store the date/time flags.

    Positional and remaining keyword arguments are forwarded to the parent
    initialiser; ``have_date`` and ``have_time`` are stored on the instance
    as given.
    """
    super().__init__(*args, **kwargs)
    self.have_date, self.have_time = have_date, have_time

@property
def utc_offset(self):
    """
    Deprecated accessor for the value stored in ``_utc_offset``.

    Reading it emits an ``OrangeDeprecationWarning``.
    """
    warnings.warn(
        "utc_offset is deprecated and will be removed in Orange 3.31",
        OrangeDeprecationWarning,
        # attribute the warning to the code accessing the property
        stacklevel=2,
    )
    return self._utc_offset

@utc_offset.setter
def utc_offset(self, val):
    """Store ``val`` in ``_utc_offset``, emitting a deprecation warning."""
    warnings.warn(
        "utc_offset is deprecated and will be removed in Orange 3.31",
        OrangeDeprecationWarning,
        # attribute the warning to the code assigning the property
        stacklevel=2,
    )
    self._utc_offset = val

def copy(self, compute_value=Variable._CopyComputeValue, *, name=None, **_):
    """
    Return a copy made by the parent class, keeping the date/time flags.

    ``have_date`` and ``have_time`` of this variable are passed on to the
    parent ``copy``; any other keyword arguments are ignored.
    """
    flags = {"have_date": self.have_date, "have_time": self.have_time}
    return super().copy(compute_value=compute_value, name=name, **flags)
Expand Down

0 comments on commit 25b7a48

Please sign in to comment.