Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FIX] pandas_compat: fix table_from_frames for "normal" dataframe #5652

Merged
merged 3 commits into from
Nov 14, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
210 changes: 96 additions & 114 deletions Orange/data/pandas_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,26 @@ def __finalize__(self, other, method=None, **_):
pd.DataFrame.__finalize__ = __finalize__


def _reset_index(df: pd.DataFrame) -> pd.DataFrame:
"""If df index is not a simple RangeIndex (or similar), include it into a table"""
if (
# not range-like index - test first to skip slow startswith(_o) check
not (
df.index.is_integer()
and (df.index.is_monotonic_increasing or df.index.is_monotonic_decreasing)
)
# check that it does not contain Orange index
and (
# startswith is slow (for long dfs) - firs check if col has strings
isinstance(df.index, pd.MultiIndex)
or not is_string_dtype(df.index)
or not any(str(i).startswith("_o") for i in df.index)
)
):
df = df.reset_index()
return df


def _is_discrete(s, force_nominal):
return (is_categorical_dtype(s) or
is_object_dtype(s) and (force_nominal or
Expand All @@ -157,6 +177,16 @@ def _is_datetime(s):
return True
try:
if is_object_dtype(s):
# pd.to_datetime would successfully parse a column of numbers to datetime,
# but a column of object dtype holding numbers should become either
# discrete or string - so first try to parse the column as numeric and,
# if the conversion to numeric succeeds, return False
try:
pd.to_numeric(s)
return False
except (ValueError, TypeError):
pass

# utc=True - to allow different timezones in a series object
pd.to_datetime(s, infer_datetime_format=True, utc=True)
return True
Expand Down Expand Up @@ -207,136 +237,81 @@ def col_type(dt):
).values


def vars_from_df(df, role=None, force_nominal=False):
if role is None and hasattr(df, 'orange_role'):
_role = df.orange_role
else:
_role = role
def to_categorical(s, _):
    """Return the category codes of ``s`` as an array, with code -1 as NaN.

    The second positional argument is accepted but not used.
    """
    codes = s.astype("category").cat.codes
    # masking with where() gives the same result as
    # codes.replace(-1, np.nan), but is much faster
    return np.asarray(codes.where(codes != -1, np.nan))

# If df index is not a simple RangeIndex (or similar), put it into data
if (
# not range-like index - test first to skip slow startswith(_o) check
not (
df.index.is_integer()
and (df.index.is_monotonic_increasing or df.index.is_monotonic_decreasing)
)
# check that it does not contain Orange index
and (
# startswith is slow (for long drs) - firs check if col has strings
isinstance(df.index, pd.MultiIndex)
or not is_string_dtype(df.index)
or not any(str(i).startswith("_o") for i in df.index)
)
):
df = df.reset_index()

Xcols, Ycols, Mcols = [], [], []
Xexpr, Yexpr, Mexpr = [], [], []
attrs, class_vars, metas = [], [], []
def vars_from_df(df, role=None, force_nominal=False):
if role is None and hasattr(df, 'orange_role'):
role = df.orange_role
df = _reset_index(df)

contains_strings = _role == Role.Meta
cols = [], [], []
exprs = [], [], []
vars_ = [], [], []

for column in df.columns:
s = df[column]
_role = Role.Attribute if role is None else role
if hasattr(df, 'orange_variables') and column in df.orange_variables:
original_var = df.orange_variables[column]
var = original_var.copy(compute_value=None)
if _role == Role.Attribute:
Xcols.append(column)
Xexpr.append(None)
attrs.append(var)
elif _role == Role.ClassAttribute:
Ycols.append(column)
Yexpr.append(None)
class_vars.append(var)
else: # if role == Role.Meta:
Mcols.append(column)
Mexpr.append(None)
metas.append(var)
expr = None
elif _is_datetime(s):
var = TimeVariable(str(column))
attrs.append(var)
Xcols.append(column)
Xexpr.append(_convert_datetime)
expr = _convert_datetime
elif _is_discrete(s, force_nominal):
discrete = s.astype('category').cat
var = DiscreteVariable(str(column),
discrete.categories.astype(str).tolist())
attrs.append(var)
Xcols.append(column)

def to_cat(s, _):
x = s.astype("category").cat.codes
# it is same than x.replace(-1, np.nan), but much faster
x = x.where(x != -1, np.nan)
return np.asarray(x)

Xexpr.append(to_cat)
discrete = s.astype("category").cat
var = DiscreteVariable(
str(column), discrete.categories.astype(str).tolist()
)
expr = to_categorical
elif is_numeric_dtype(s):
var = ContinuousVariable(
# set number of decimals to 0 if int else keeps default behaviour
str(column), number_of_decimals=(0 if is_integer_dtype(s) else None)
)
attrs.append(var)
Xcols.append(column)
Xexpr.append(None)
expr = None
else:
contains_strings = True
if role is not None and role != Role.Meta:
raise ValueError("String variable must be in metas.")
_role = Role.Meta
var = StringVariable(str(column))
metas.append(var)
Mcols.append(column)
Mexpr.append(lambda s, _: np.asarray(s, dtype=object))

# if role isn't explicitly set, try to
# export dataframes into one contiguous block.
# for this all columns must be of the same role
if isinstance(df, OrangeDataFrame) \
and not role \
and contains_strings \
and not force_nominal:
attrs.extend(class_vars)
attrs.extend(metas)
metas = attrs
Xcols.extend(Ycols)
Xcols.extend(Mcols)
Mcols = Xcols
Xexpr.extend(Yexpr)
Xexpr.extend(Mexpr)
Mexpr = Xexpr

attrs, class_vars = [], []
Xcols, Ycols = [], []
Xexpr, Yexpr = [], []

XYM = []
for Avars, Acols, Aexpr in zip(
(attrs, class_vars, metas),
(Xcols, Ycols, Mcols),
(Xexpr, Yexpr, Mexpr)):
if not Acols:
A = None if Acols != Xcols else np.empty((df.shape[0], 0))
XYM.append(A)
continue
if not any(Aexpr):
Adf = df if all(c in Acols
for c in df.columns) else df[Acols]
if all(isinstance(a, SparseDtype) for a in Adf.dtypes):
A = csr_matrix(Adf.sparse.to_coo())
expr = lambda s, _: np.asarray(s, dtype=object)

cols[_role].append(column)
exprs[_role].append(expr)
vars_[_role].append(var)

xym = []
for a_vars, a_cols, a_expr in zip(vars_, cols, exprs):
if not a_cols:
arr = None if a_cols != cols[0] else np.empty((df.shape[0], 0))
elif not any(a_expr):
# if all c in columns table will share memory with dataframe
a_df = df if all(c in a_cols for c in df.columns) else df[a_cols]
if all(isinstance(a, SparseDtype) for a in a_df.dtypes):
arr = csr_matrix(a_df.sparse.to_coo())
else:
A = np.asarray(Adf)
XYM.append(A)
continue
# we'll have to copy the table to resolve any expressions
# TODO eliminate expr (preprocessing for pandas -> table)
A = np.array([expr(df[col], var) if expr else np.asarray(df[col])
for var, col, expr in zip(Avars, Acols, Aexpr)]).T
XYM.append(A)
arr = np.asarray(a_df)
else:
# we'll have to copy the table to resolve any expressions
arr = np.array(
[
expr(df[col], var) if expr else np.asarray(df[col])
for var, col, expr in zip(a_vars, a_cols, a_expr)
]
).T
xym.append(arr)

# Let the tables share memory with pandas frame
if XYM[1] is not None and XYM[1].ndim == 2 and XYM[1].shape[1] == 1:
XYM[1] = XYM[1][:, 0]
if xym[1] is not None and xym[1].ndim == 2 and xym[1].shape[1] == 1:
xym[1] = xym[1][:, 0]

return XYM, Domain(attrs, class_vars, metas)
return xym, Domain(*vars_)


def table_from_frame(df, *, force_nominal=False):
Expand Down Expand Up @@ -373,6 +348,15 @@ def table_from_frame(df, *, force_nominal=False):


def table_from_frames(xdf, ydf, mdf):
if not (xdf.index.equals(ydf.index) and xdf.index.equals(mdf.index)):
raise ValueError(
"Indexes not equal. Make sure that all three dataframes have equal index"
)

# drop the index from x and y - this ensures that a non-range index
# is placed in metas only (mdf keeps its index)
xdf = xdf.reset_index(drop=True)
ydf = ydf.reset_index(drop=True)
dfs = xdf, ydf, mdf

if not all(df.shape[0] == xdf.shape[0] for df in dfs):
Expand All @@ -386,25 +370,23 @@ def table_from_frames(xdf, ydf, mdf):
XYM = (xXYM[0], yXYM[1], mXYM[2])
domain = Domain(xDomain.attributes, yDomain.class_vars, mDomain.metas)

indexes = [df.index for df in dfs]
ids = [
int(x[2:])
if str(x).startswith("_o") and x[2:].isdigit() and x == y == m
int(idx[2:])
if str(idx).startswith("_o") and idx[2:].isdigit()
else Table.new_id()
for x, y, m in zip(*indexes)
for idx in mdf.index
]

attributes = {}
W = None
for df in dfs:
if isinstance(df, OrangeDataFrame):
W = [df.orange_weights[i] for i in df.index
if i in df.orange_weights]
W = [df.orange_weights[i] for i in df.index if i in df.orange_weights]
if len(W) != len(df.index):
W = None
attributes.update(df.orange_attributes)
else:
W = None
attributes.update(df.orange_attributes)

return Table.from_numpy(
domain,
Expand Down
68 changes: 67 additions & 1 deletion Orange/data/tests/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,25 @@ def test_table_from_frame_timezones(self):
],
)

def test_table_from_frame_no_datetim(self):
    """
    An object-dtype column that contains only numbers must not become a
    TimeVariable, even though pd.to_datetime is able to parse numbers as
    datetimes. Because its dtype is object, such a column has to end up as
    either a StringVariable or a DiscreteVariable.
    """
    from Orange.data.pandas_compat import table_from_frame

    # all values distinct: expected in metas as a StringVariable
    unique_values = pd.DataFrame([[1], [2], [3]], dtype="object")
    meta_var = table_from_frame(unique_values).domain.metas[0]
    self.assertIsInstance(meta_var, StringVariable)

    # one value repeated: expected in attributes as a DiscreteVariable
    repeated_values = pd.DataFrame([[1], [2], [2]], dtype="object")
    attr_var = table_from_frame(repeated_values).domain.attributes[0]
    self.assertIsInstance(attr_var, DiscreteVariable)

def test_time_variable_compatible(self):
from Orange.data.pandas_compat import table_from_frame

Expand Down Expand Up @@ -443,6 +462,53 @@ def test_table_from_frames(self):
self.assertTupleEqual(table.domain.metas, new_table.domain.metas)
self.assertEqual(table.domain.class_var, new_table.domain.class_var)

def test_table_from_frames_not_orange_dataframe(self):
    """Plain pandas frames (not OrangeDataFrame) convert into a Table."""
    attr_df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["x1", "x2", "x3"])
    class_df = pd.DataFrame([[5], [6]], columns=["y"])
    meta_df = pd.DataFrame([[1, 2], [4, 5]], columns=["m1", "m2"])

    table = Table.from_pandas_dfs(attr_df, class_df, meta_df)

    # values of each frame must end up in the matching part of the table
    np.testing.assert_array_equal(attr_df, table.X)
    np.testing.assert_array_equal(class_df.values.flatten(), table.Y)
    np.testing.assert_array_equal(meta_df, table.metas)

    # variable names must follow the column names of the frames
    domain = table.domain
    attr_names = [var.name for var in domain.attributes]
    meta_names = [var.name for var in domain.metas]
    self.assertListEqual(attr_df.columns.tolist(), attr_names)
    self.assertEqual(class_df.columns[0], domain.class_var.name)
    self.assertListEqual(meta_df.columns.tolist(), meta_names)

def test_table_from_frames_same_index(self):
    """
    A shared non-range index must be placed in metas, and frames whose
    indexes differ must be rejected with ValueError.
    """
    attr_columns = ["x1", "x2", "x3"]
    meta_columns = ["m1", "m2"]
    index = np.array(["a", "b"])

    x = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=attr_columns, index=index)
    y = pd.DataFrame([[5], [6]], columns=["y"], index=index)
    m = pd.DataFrame([[1, 2], [4, 5]], columns=meta_columns, index=index)
    table = Table.from_pandas_dfs(x, y, m)

    # X and Y carry the frame values; metas gets the index as first column
    np.testing.assert_array_equal(x, table.X)
    np.testing.assert_array_equal(y.values.flatten(), table.Y)
    expected_metas = np.hstack((index[:, None], m.values.astype("object")))
    np.testing.assert_array_equal(expected_metas, table.metas)

    domain = table.domain
    self.assertListEqual(
        x.columns.tolist(), [var.name for var in domain.attributes]
    )
    self.assertEqual(y.columns[0], domain.class_var.name)
    self.assertListEqual(
        ["index"] + m.columns.tolist(), [var.name for var in domain.metas]
    )

    # y gets a different index than x and m - conversion must fail
    other_index = np.array(["a", "c"])
    x = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=attr_columns, index=index)
    y = pd.DataFrame([[5], [6]], columns=["y"], index=other_index)
    m = pd.DataFrame([[1, 2], [4, 5]], columns=meta_columns, index=index)
    with self.assertRaises(ValueError):
        Table.from_pandas_dfs(x, y, m)


class TestTablePandas(unittest.TestCase):
def setUp(self):
Expand Down Expand Up @@ -579,7 +645,7 @@ def test_merge(self):
table3 = df3.to_orange_table()

self.assertEqual(len(table2), len(table3))
self.assertFalse(any(table3.W))
self.assertEqual(0, table3.W.size)
self.assertEqual(self.table.attributes, table3.attributes)

d1 = table2.domain
Expand Down