Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FIX] pandas_compat: fix conversion of datetime series #5547

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 49 additions & 8 deletions Orange/data/pandas_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,13 +157,56 @@ def _is_datetime(s):
return True
try:
if is_object_dtype(s):
pd.to_datetime(s, infer_datetime_format=True)
# utc=True - to allow different timezones in a series object
pd.to_datetime(s, infer_datetime_format=True, utc=True)
return True
except Exception: # pylint: disable=broad-except
pass
return False


def _convert_datetime(series, var):
def col_type(dt):
"""Test if is date, time or datetime"""
dt_nonnat = dt[~pd.isnull(dt)] # nat == nat is False
if (dt_nonnat.dt.floor("d") == dt_nonnat).all():
# all times are 00:00:00.0 - pure date
return 1, 0
elif (dt_nonnat.dt.date == pd.Timestamp("now").date()).all():
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does pandas automatically assume today's date if only time is present?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep. When pd.to_datetime() is called on a column of times, the column is converted to datetime64 and every value gets today's date.

# all dates are today's date - pure time
return 0, 1 # pure time
else:
# else datetime
return 1, 1

try:
dt = pd.to_datetime(series)
except ValueError:
# series with type object and different timezones will raise a
# ValueError - normalizing to utc
dt = pd.to_datetime(series, utc=True)

# set variable type to date, time or datetime
var.have_date, var.have_time = col_type(dt)

if dt.dt.tz is not None:
# set timezone if available and convert to utc
var.timezone = dt.dt.tz
dt = dt.dt.tz_convert("UTC")

if var.have_time and not var.have_date:
# if time only measure seconds from midnight - equal to setting date
# to unix epoch
return (
(dt.dt.tz_localize(None) - pd.Timestamp("now").normalize())
/ pd.Timedelta("1s")
).values

return (
(dt.dt.tz_localize(None) - pd.Timestamp("1970-01-01")) / pd.Timedelta("1s")
).values


def vars_from_df(df, role=None, force_nominal=False):
if role is None and hasattr(df, 'orange_role'):
_role = df.orange_role
Expand Down Expand Up @@ -210,6 +253,11 @@ def vars_from_df(df, role=None, force_nominal=False):
Mcols.append(column)
Mexpr.append(None)
metas.append(var)
elif _is_datetime(s):
var = TimeVariable(str(column))
attrs.append(var)
Xcols.append(column)
Xexpr.append(_convert_datetime)
elif _is_discrete(s, force_nominal):
discrete = s.astype('category').cat
var = DiscreteVariable(str(column),
Expand All @@ -224,13 +272,6 @@ def to_cat(s, _):
return np.asarray(x)

Xexpr.append(to_cat)
elif _is_datetime(s):
var = TimeVariable(str(column))
attrs.append(var)
Xcols.append(column)
Xexpr.append(lambda s, v: np.asarray(
s.astype('str').replace('NaT', np.nan).map(v.parse)
))
elif is_numeric_dtype(s):
var = ContinuousVariable(
# set number of decimals to 0 if int else keeps default behaviour
Expand Down
243 changes: 242 additions & 1 deletion Orange/data/tests/test_pandas.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
# pylint: disable=import-outside-toplevel

import unittest
from datetime import date, datetime, timezone

import numpy as np
import pytz
from scipy.sparse import csr_matrix
import scipy.sparse as sp

from Orange.data import ContinuousVariable, DiscreteVariable, TimeVariable, Table, Domain, \
StringVariable
from Orange.data.pandas_compat import OrangeDataFrame
from Orange.data.tests.test_variable import TestTimeVariable

try:
import pandas as pd
Expand Down Expand Up @@ -164,6 +167,244 @@ def test_not_orangedf(self):
for v1, v2 in zip(vars1, vars2):
self.assertEqual(type(v1), type(v2))

def test_table_from_frame_date(self):
    """Date-only columns convert to epoch seconds, flagged as date without time.

    The same two dates (followed by a missing value) are supplied as
    pandas Timestamps, as ISO strings and as ``datetime.date`` objects;
    all three inputs must produce the same table.
    """
    from Orange.data.pandas_compat import table_from_frame

    newer, older = "2017-12-19", "1724-12-20"
    inputs = (
        [[pd.Timestamp(newer)], [pd.Timestamp(older)], [np.nan]],
        [[newer], [older], [np.nan]],
        [[date(2017, 12, 19)], [date(1724, 12, 20)], [np.nan]],
    )
    for rows in inputs:
        table = table_from_frame(pd.DataFrame(rows))
        np.testing.assert_equal(
            table.X,
            [
                [pd.Timestamp(newer).timestamp()],
                [pd.Timestamp(older).timestamp()],
                [np.nan],
            ],
        )
        time_var = table.domain.variables[0]
        self.assertEqual(time_var.have_time, 0)
        self.assertEqual(time_var.have_date, 1)

def test_table_from_frame_time(self):
    """Time-only columns convert to seconds on 1970-01-01, flagged as time without date.

    The same two times (followed by a missing value) are supplied as
    pandas Timestamps and as plain strings.
    """
    from Orange.data.pandas_compat import table_from_frame

    early, late = "00:00:00.25", "20:20:20.30"
    inputs = (
        [[pd.Timestamp(early)], [pd.Timestamp(late)], [np.nan]],
        [[early], [late], [np.nan]],
    )
    for rows in inputs:
        table = table_from_frame(pd.DataFrame(rows))
        np.testing.assert_equal(
            table.X,
            [
                [pd.Timestamp("1970-01-01 00:00:00.25").timestamp()],
                [pd.Timestamp("1970-01-01 20:20:20.30").timestamp()],
                [np.nan],
            ],
        )
        time_var = table.domain.variables[0]
        self.assertEqual(time_var.have_time, 1)
        self.assertEqual(time_var.have_date, 0)

def test_table_from_frame_datetime(self):
    """Columns with both date and time convert to epoch seconds with both flags set.

    The same two moments (followed by a missing value) are supplied as
    pandas Timestamps, as strings and as ``datetime.datetime`` objects.
    """
    from Orange.data.pandas_compat import table_from_frame

    newer, older = "2017-12-19 00:00:00.50", "1724-12-20 20:20:20.30"
    inputs = (
        [[pd.Timestamp(newer)], [pd.Timestamp(older)], [np.nan]],
        [[newer], [older], [np.nan]],
        [
            [datetime(2017, 12, 19, 0, 0, 0, 500000)],
            [datetime(1724, 12, 20, 20, 20, 20, 300000)],
            [np.nan],
        ],
    )
    for rows in inputs:
        table = table_from_frame(pd.DataFrame(rows))
        np.testing.assert_equal(
            table.X,
            [
                [pd.Timestamp(newer).timestamp()],
                [pd.Timestamp(older).timestamp()],
                [np.nan],
            ],
        )
        time_var = table.domain.variables[0]
        self.assertEqual(time_var.have_time, 1)
        self.assertEqual(time_var.have_date, 1)

def test_table_from_frame_timezones(self):
    """Check the timezone stored on the variable and the converted values.

    Naive input is expected to report ``timezone.utc``. Timezone-aware
    input is expected to report its own timezone, and a column mixing
    aware and naive values is expected to report UTC.
    """
    from Orange.data.pandas_compat import table_from_frame

    def convert(stamps):
        # one single-column row per timestamp, followed by a missing value
        return table_from_frame(pd.DataFrame([[s] for s in stamps] + [[np.nan]]))

    # naive timestamps: only the reported timezone is checked
    table = convert(
        [pd.Timestamp("2017-12-19 00:00:00"), pd.Timestamp("1724-12-20 20:20:20")]
    )
    self.assertEqual(table.domain.variables[0].timezone, timezone.utc)

    # (input timestamps, expected timezone, timestamps giving the expected values)
    cases = (
        (
            [
                pd.Timestamp("2017-12-19 00:00:00Z"),
                pd.Timestamp("1724-12-20 20:20:20Z"),
            ],
            pytz.utc,
            [
                pd.Timestamp("2017-12-19 00:00:00"),
                pd.Timestamp("1724-12-20 20:20:20"),
            ],
        ),
        (
            [
                pd.Timestamp("2017-12-19 00:00:00+1"),
                pd.Timestamp("1724-12-20 20:20:20+1"),
            ],
            pytz.FixedOffset(60),
            [
                pd.Timestamp("2017-12-19 00:00:00+1"),
                pd.Timestamp("1724-12-20 20:20:20+1"),
            ],
        ),
        (
            [
                pd.Timestamp("2017-12-19 00:00:00", tz="CET"),
                pd.Timestamp("1724-12-20 20:20:20", tz="CET"),
            ],
            pytz.timezone("CET"),
            [
                pd.Timestamp("2017-12-19 00:00:00+1"),
                pd.Timestamp("1724-12-20 20:20:20+1"),
            ],
        ),
        (
            # one aware and one naive value in the same column
            [
                pd.Timestamp("2017-12-19 00:00:00", tz="CET"),
                pd.Timestamp("1724-12-20 20:20:20"),
            ],
            pytz.utc,
            [
                pd.Timestamp("2017-12-19 00:00:00+1"),
                pd.Timestamp("1724-12-20 20:20:20"),
            ],
        ),
    )
    for stamps, expected_tz, reference in cases:
        table = convert(stamps)
        self.assertEqual(expected_tz, table.domain.variables[0].timezone)
        np.testing.assert_equal(
            table.X, [[ref.timestamp()] for ref in reference] + [[np.nan]]
        )

def test_time_variable_compatible(self):
    """Values converted through pandas must agree with TimeVariable's own parser.

    Every ``(datestr, timestamp, outstr)`` entry of ``TestTimeVariable.TESTS``
    is parsed with ``TimeVariable.to_val`` and, wrapped in a one-cell
    DataFrame, with ``table_from_frame``. Both results must match each other
    and the reference timestamp, and both must be rendered as ``outstr``.
    """
    from Orange.data.pandas_compat import table_from_frame

    def to_df(val):
        return pd.DataFrame([[pd.Timestamp(val)]])

    for datestr, timestamp, outstr in TestTimeVariable.TESTS:
        var = TimeVariable("time")
        var_parse = var.to_val(datestr)
        try:
            pandas_parse = table_from_frame(to_df(datestr)).X[0, 0]
        except ValueError:
            # pandas cannot parse some of the formats in the list; skip them
            continue
        if not (np.isnan(var_parse) and np.isnan(pandas_parse)):
            # nan == nan is False, so compare only when a value is present
            self.assertEqual(var_parse, pandas_parse)
            self.assertEqual(pandas_parse, timestamp)

        # compare the two renderings; the original compared
        # repr_val(var_parse) with itself, which can never fail
        self.assertEqual(var.repr_val(var_parse), var.repr_val(pandas_parse))
        self.assertEqual(outstr, var.repr_val(var_parse))

@unittest.skip("Convert all Orange demo dataset. It takes about 5s which is way to slow")
def test_table_to_frame_on_all_orange_dataset(self):
from os import listdir
Expand Down
10 changes: 9 additions & 1 deletion Orange/data/tests/test_variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
import numpy as np
import scipy.sparse as sp

import Orange
from Orange.data import Variable, ContinuousVariable, DiscreteVariable, \
StringVariable, TimeVariable, Unknown, Value, Table
from Orange.data.io import CSVReader
Expand Down Expand Up @@ -695,6 +694,15 @@ def varcls_modified(self, name):
var.have_time = 1
return var

def test_remove_deprecated_utc_offset(self):
    """Fail once the Orange version reaches 3.32, as a clean-up reminder.

    When this test starts to fail:
    - remove all marked locations in TimeVariable class
    - uncomment new implementation for setting timezones in parse method
    - remove this test
    """
    import Orange  # pylint: disable=import-outside-toplevel
    # Compare (major, minor) numerically: as plain strings "3.4" sorts
    # after "3.32" and "3.100" sorts before it.
    major_minor = tuple(
        int(part) for part in Orange.__version__.split(".")[:2]
    )
    self.assertLess(major_minor, (3, 32))


PickleContinuousVariable = create_pickling_tests(
"PickleContinuousVariable",
Expand Down
Loading