Skip to content

Commit

Permalink
Table: Add methods 'join' and 'with_column'
Browse files Browse the repository at this point in the history
  • Loading branch information
janezd committed Feb 10, 2021
1 parent 5cda891 commit 63d69d0
Show file tree
Hide file tree
Showing 2 changed files with 199 additions and 0 deletions.
74 changes: 74 additions & 0 deletions Orange/data/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -942,6 +942,80 @@ def collect(attr):
conc.attributes.update(table.attributes)
return conc

@classmethod
def join(cls, *tables):
    """
    Horizontally join the given tables into a new table.

    - All variable names must be unique.
    - Ids are copied from the first table.
    - Weights are copied from the first table in which they are defined.
    - The dictionaries of the tables' attributes are merged. If the same
      attribute appears in multiple dictionaries, the value from the
      earliest table is used.

    Args:
        *tables (Table): tables to be joined

    Returns:
        table (Table)

    Raises:
        ValueError: if no tables are given
    """
    if not tables:
        raise ValueError('need at least one table to join')

    def all_of(objs, names):
        # For each name, yield a tuple with that attribute of every object
        return (tuple(getattr(obj, name) for obj in objs)
                for name in names)

    def stack(arrs):
        # Put non-empty arrays side by side; 1-d arrays become single
        # columns. Return None if there is nothing to stack.
        non_empty = tuple(arr if arr.ndim == 2 else arr[:, np.newaxis]
                          for arr in arrs
                          if arr is not None and arr.size > 0)
        return np.hstack(non_empty) if non_empty else None

    doms, Ws, table_attrss = all_of(tables, ("domain", "W", "attributes"))
    Xs, Ys, Ms = map(stack, all_of(tables, ("X", "Y", "metas")))
    # A single class column is passed on as a 1-d vector. The check must be
    # on the number of columns (shape[1]), not rows: checking shape[0] would
    # turn a one-row, multi-class array into a vector of the wrong length.
    if Ys is not None and Ys.shape[1] == 1:
        Ys = Ys.flatten()
    # Weights of the first table that has any; if none has, fall back to
    # the (empty) weights of the last table
    W = next((w for w in Ws if w.size), Ws[-1])

    parts = all_of(doms, ("attributes", "class_vars", "metas"))
    domain = Domain(*(tuple(chain(*lst)) for lst in parts))
    table = cls.from_numpy(domain, Xs, Ys, Ms, W, ids=tables[0].ids)
    # Update in reverse order so that values from earlier tables override
    # those from later ones
    for ta in reversed(table_attrss):
        table.attributes.update(ta)

    return table

def with_column(self, variable, data, to_metas=None):
    """
    Return a new table that has one more column than this one.

    The name of the new column must be unique.

    Args:
        variable (Variable): descriptor of the column to add
        data (np.ndarray): values to store in the new column
        to_metas (bool, optional): if `True`, the column becomes a meta
            column. Otherwise a primitive variable is appended to
            attributes and a non-primitive one to metas.

    Returns:
        table (Table): a new table with the additional column
    """
    source = self.domain
    attributes = source.attributes
    class_vars = source.class_vars
    meta_vars = source.metas
    addition = (variable, )
    if to_metas or not variable.is_primitive():
        meta_vars = meta_vars + addition
    else:
        attributes = attributes + addition

    # Converting to the extended domain creates the column; its contents
    # are then overwritten with the given data
    result = self.transform(Domain(attributes, class_vars, meta_vars))
    column = result.get_column_view(variable)[0]
    column[:] = data
    return result

def is_view(self):
"""
Return `True` if all arrays represent a view referring to another table
Expand Down
125 changes: 125 additions & 0 deletions Orange/data/tests/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,131 @@ def test_from_numpy_sparse(self):
t = Table.from_numpy(domain, sp.bsr_matrix(x))
self.assertTrue(sp.isspmatrix_csr(t.X))

@staticmethod
def _new_table(attrs, classes, metas, s):
    """
    Build a five-row table filled with consecutive integers.

    X starts counting at `s`, Y at `100 + s` and metas at `200 + s`.
    Y is left one-dimensional unless there is more than one class.
    A part without variables is passed as an empty array of shape (5, 0).
    """
    n_rows = 5

    def numbered(start, n_vars, two_dim):
        values = np.arange(start, start + n_vars * n_rows)
        if two_dim:
            values = values.reshape(n_rows, -1)
        if values.size == 0:
            values = np.empty((n_rows, 0))
        return values

    domain = Domain(attrs, classes, metas)
    x_part = numbered(s, len(attrs), True)
    y_part = numbered(100 + s, len(classes), len(classes) > 1)
    meta_part = numbered(200 + s, len(metas), True)
    return Table.from_numpy(domain, x_part, y_part, meta_part)

def test_join(self):
    """Check domains, data, attributes, ids and weights of joined tables."""
    a, b, c, d, e, f, g = (ContinuousVariable(name) for name in "abcdefg")

    def check_domain(table, attributes, class_vars, metas):
        domain = table.domain
        self.assertEqual(domain.attributes, attributes)
        self.assertEqual(domain.class_vars, class_vars)
        self.assertEqual(domain.metas, metas)

    # Typical case: a single class and no empty parts in the result
    first = self._new_table((a, b), (c, ), (d, ), 0)
    second = self._new_table((e, ), (), (f, g), 1000)
    merged = Table.join(first, second)
    check_domain(merged, (a, b, e), (c, ), (d, f, g))
    np.testing.assert_equal(merged.X, np.hstack((first.X, second.X)))
    np.testing.assert_equal(merged.Y, first.Y)
    np.testing.assert_equal(
        merged.metas, np.hstack((first.metas, second.metas)))

    # Each table contributes to a part that is empty in the other
    first = self._new_table((a, b), (), (), 0)
    second = self._new_table((), (), (c, ), 1000)
    merged = Table.join(first, second)
    check_domain(merged, (a, b), (), (c, ))
    np.testing.assert_equal(merged.X, np.hstack((first.X, second.X)))
    np.testing.assert_equal(
        merged.metas, np.hstack((first.metas, second.metas)))

    # Two class variables; metas are empty in both tables
    first = self._new_table((a, b), (c, ), (), 0)
    second = self._new_table((), (d, ), (), 1000)
    merged = Table.join(first, second)
    check_domain(merged, (a, b), (c, d), ())
    np.testing.assert_equal(merged.X, np.hstack((first.X, second.X)))
    np.testing.assert_equal(merged.Y, np.vstack((first.Y, second.Y)).T)

    # Table attributes are merged; weights come from the first table
    # that has them, ids from the first table
    first = self._new_table((a, b), (c, ), (), 0)
    first.attributes = {"a": 5, "b": 7}
    second = self._new_table((d, ), (e, ), (), 1000)
    second.W = np.arange(5)
    third = self._new_table((f, g), (), (), 2000)
    third.attributes = {"a": 1, "c": 4}
    third.W = np.arange(5, 10)
    merged = Table.join(first, second, third)
    check_domain(merged, (a, b, d, f, g), (c, e), ())
    np.testing.assert_equal(
        merged.X, np.hstack((first.X, second.X, third.X)))
    np.testing.assert_equal(merged.Y, np.vstack((first.Y, second.Y)).T)
    self.assertEqual(merged.attributes, {"a": 5, "b": 7, "c": 4})
    np.testing.assert_equal(merged.ids, first.ids)
    np.testing.assert_equal(merged.W, second.W)

    # Joining nothing is an error
    with self.assertRaises(ValueError):
        Table.join()

def test_with_column(self):
    """Check where `with_column` puts the new column and what it holds."""
    a, b, c, d, e, f, g = (ContinuousVariable(name) for name in "abcdefg")
    values = np.arange(9, 14)
    as_column = values.reshape(5, -1)
    base = self._new_table((a, b, c), (d, ), (e, f), 0)

    # Primitive variable is appended to attributes by default
    extended = base.with_column(g, values)
    self.assertEqual(extended.domain.attributes, (a, b, c, g))
    np.testing.assert_equal(extended.X, np.hstack((base.X, as_column)))
    np.testing.assert_equal(extended.Y, base.Y)
    np.testing.assert_equal(extended.metas, base.metas)

    # The same variable is appended to metas on request
    extended = base.with_column(g, values, to_metas=True)
    self.assertEqual(extended.domain.metas, (e, f, g))
    np.testing.assert_equal(extended.X, base.X)
    np.testing.assert_equal(extended.Y, base.Y)
    np.testing.assert_equal(
        extended.metas, np.hstack((base.metas, as_column)))

    # Table without attributes
    base = self._new_table((), (d, ), (e, f), 0)
    extended = base.with_column(g, values)
    self.assertEqual(extended.domain.attributes, (g, ))
    np.testing.assert_equal(extended.X, as_column)
    np.testing.assert_equal(extended.Y, base.Y)
    np.testing.assert_equal(extended.metas, base.metas)

    # Table without metas
    base = self._new_table((a, b, c), (d, ), (), 0)
    extended = base.with_column(g, values, to_metas=True)
    self.assertEqual(extended.domain.metas, (g, ))
    np.testing.assert_equal(extended.X, base.X)
    np.testing.assert_equal(extended.Y, base.Y)
    np.testing.assert_equal(extended.metas, as_column)

    # Column data given as a plain list
    base = self._new_table((a, ), (d, ), (e, f), 0)
    extended = base.with_column(g, [4, 2, -1, 2, 5])
    self.assertEqual(extended.domain.attributes, (a, g))
    expected_x = np.array([[0, 1, 2, 3, 4], [4, 2, -1, 2, 5]]).T
    np.testing.assert_equal(extended.X, expected_x)

    # Non-primitive variable ends up in metas even without `to_metas`;
    # the expected array combines the existing metas with strings
    base = self._new_table((a, ), (d, ), (e, f), 0)
    t = StringVariable("t")
    extended = base.with_column(t, list("abcde"))
    self.assertEqual(extended.domain.attributes, (a, ))
    self.assertEqual(extended.domain.metas, (e, f, t))
    letters = np.array(list("abcde")).reshape(5, -1)
    np.testing.assert_equal(
        extended.metas, np.hstack((base.metas, letters)))


class TestTableFilters(unittest.TestCase):
def setUp(self):
Expand Down

0 comments on commit 63d69d0

Please sign in to comment.