From c79669f77f9564fd035e7926b4423342db37b2c9 Mon Sep 17 00:00:00 2001
From: Jernej Urankar
Date: Mon, 8 May 2017 11:32:47 +0200
Subject: [PATCH 1/2] Merge Data: work with sparse

---
 Orange/widgets/data/owmergedata.py            | 41 ++++++++++++-------
 Orange/widgets/data/tests/test_owmergedata.py | 26 ++++++++++++
 2 files changed, 52 insertions(+), 15 deletions(-)

diff --git a/Orange/widgets/data/owmergedata.py b/Orange/widgets/data/owmergedata.py
index aad4dc10dfb..dd8550435ec 100644
--- a/Orange/widgets/data/owmergedata.py
+++ b/Orange/widgets/data/owmergedata.py
@@ -4,9 +4,11 @@
 from AnyQt.QtWidgets import QApplication, QStyle, QSizePolicy

 import numpy as np
+import scipy.sparse as sp

 import Orange
 from Orange.data import StringVariable, ContinuousVariable
+from Orange.data.util import hstack
 from Orange.widgets import widget, gui, settings
 from Orange.widgets.utils import itemmodels
 from Orange.widgets.utils.sql import check_sql_input
@@ -362,20 +364,29 @@ def _join_table_by_indices(self, reduced_extra, indices):
     def _join_array_by_indices(left, right, indices, string_cols=None):
         """Join (horizontally) two arrays, taking pairs of rows given in indices
         """
-        tpe = object if object in (left.dtype, right.dtype) else left.dtype
-        left_width, right_width = left.shape[1], right.shape[1]
-        arr = np.full((indices.shape[1], left_width + right_width), np.nan, tpe)
-        if string_cols:
-            arr[:, string_cols] = ""
-        for indices, to_change, lookup in (
-                (indices[0], arr[:, :left_width], left),
-                (indices[1], arr[:, left_width:], right)):
-            known = indices != -1
-            to_change[known] = lookup[indices[known]]
-        return arr
-
-
-def test():
+        def prepare(arr, inds, str_cols):
+            try:
+                newarr = arr[inds]
+            except IndexError:
+                newarr = np.full_like(arr, np.nan)
+            else:
+                empty = np.full(arr.shape[1], np.nan)
+                if str_cols:
+                    assert arr.dtype == object
+                    empty = empty.astype(object)
+                    empty[str_cols] = ''
+                newarr[inds == -1] = empty
+            return newarr
+
+        left_width = left.shape[1]
+        str_left = [i for i in string_cols or () if i < left_width]
+        str_right = [i - left_width for i in string_cols or () if i >= left_width]
+        res = hstack((prepare(left, indices[0], str_left),
+                      prepare(right, indices[1], str_right)))
+        return res
+
+
+def main():
     app = QApplication([])
     w = OWMergeData()
     data = Orange.data.Table("tests/data-gender-region")
@@ -388,4 +399,4 @@ def test():


 if __name__ == "__main__":
-    test()
+    main()
diff --git a/Orange/widgets/data/tests/test_owmergedata.py b/Orange/widgets/data/tests/test_owmergedata.py
index 8e0929bb8c8..fc2a4560c78 100644
--- a/Orange/widgets/data/tests/test_owmergedata.py
+++ b/Orange/widgets/data/tests/test_owmergedata.py
@@ -3,6 +3,7 @@
 from itertools import chain

 import numpy as np
+import scipy.sparse as sp

 from Orange.data import Table, Domain, DiscreteVariable, StringVariable
 from Orange.widgets.data.owmergedata import OWMergeData, INSTANCEID, INDEX
@@ -425,3 +426,28 @@ def test_best_match(self):
         self.assertEqual(self.widget.attr_merge_extra, zoo_images.domain[-1])
         self.assertEqual(self.widget.attr_combine_data, zoo.domain[-1])
         self.assertEqual(self.widget.attr_combine_extra, zoo_images.domain[-1])
+
+    def test_sparse(self):
+        """
+        Merge should work with sparse.
+        GH-2295
+        GH-2155
+        """
+        data = Table("iris")[::25]
+        data_ed_dense = Table("titanic")[::300]
+        data_ed_sparse = Table("titanic")[::300]
+        data_ed_sparse.X = sp.csr_matrix(data_ed_sparse.X)
+        self.send_signal("Data", data)
+
+        self.send_signal("Extra Data", data_ed_dense)
+        output_dense = self.get_output("Data")
+        self.assertFalse(sp.issparse(output_dense.X))
+        self.assertFalse(output_dense.is_sparse())
+
+        self.send_signal("Extra Data", data_ed_sparse)
+        output_sparse = self.get_output("Data")
+        self.assertTrue(sp.issparse(output_sparse.X))
+        self.assertTrue(output_sparse.is_sparse())
+
+        output_sparse.X = output_sparse.X.toarray()
+        self.assertTablesEqual(output_dense, output_sparse)

From e461dc32684b1dc97a8e035066bd16d6ecb4cb97 Mon Sep 17 00:00:00 2001
From: Jernej Urankar
Date: Mon, 22 May 2017 13:55:32 +0200
Subject: [PATCH 2/2] Table: fix sparse indices

---
 Orange/data/table.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Orange/data/table.py b/Orange/data/table.py
index 65e3b5c3870..175d9cc07ef 100644
--- a/Orange/data/table.py
+++ b/Orange/data/table.py
@@ -57,15 +57,15 @@ def __init__(self, table, row_index):
         self.id = table.ids[row_index]
         self._x = table.X[row_index]
         if sp.issparse(self._x):
-            self.sparse_x = self._x
+            self.sparse_x = sp.csr_matrix(self._x)
             self._x = np.asarray(self._x.todense())[0]
         self._y = table._Y[row_index]
         if sp.issparse(self._y):
-            self.sparse_y = self._y
+            self.sparse_y = sp.csr_matrix(self._y)
             self._y = np.asarray(self._y.todense())[0]
         self._metas = table.metas[row_index]
         if sp.issparse(self._metas):
-            self.sparse_metas = self._metas
+            self.sparse_metas = sp.csr_matrix(self._metas)
             self._metas = np.asarray(self._metas.todense())[0]

     @property