Skip to content

Commit

Permalink
Load data: Add data loader for H5AD format
Browse files Browse the repository at this point in the history
  • Loading branch information
pavlin-policar committed Sep 2, 2019
1 parent 96efc46 commit 231f024
Show file tree
Hide file tree
Showing 6 changed files with 116 additions and 4 deletions.
Binary file not shown.
Binary file not shown.
20 changes: 19 additions & 1 deletion orangecontrib/single_cell/tests/test_load_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from orangecontrib.single_cell.widgets.load_data import (
LoomLoader, ExcelLoader, MtxLoader, CountLoader, Loader, PickleLoader,
CsvLoader, get_data_loader, Concatenate
CsvLoader, H5ADLoader, get_data_loader, Concatenate,
)


Expand All @@ -21,6 +21,8 @@ def test_get_data_loader(self):
self.assertIsInstance(loader, Loader)
self.assertIsInstance(get_data_loader("data.xls"), ExcelLoader)
self.assertIsInstance(get_data_loader("data.loom"), LoomLoader)
self.assertIsInstance(get_data_loader("data_sparse.h5ad"), H5ADLoader)
self.assertIsInstance(get_data_loader("data_dense.h5ad"), H5ADLoader)

def test_get_data_loader_pickle(self):
self.assertIsInstance(get_data_loader("data.pkl"), PickleLoader)
Expand Down Expand Up @@ -89,6 +91,22 @@ def test_file_summary_loom(self):
self.assertEqual(loader.n_cols, 20)
self.assertEqual(round(loader.sparsity, 2), 0.93)

def test_file_summary_h5ad_sparse(self):
    """Check the file summary reported for the ``data_sparse.h5ad`` fixture."""
    tests_dir = os.path.dirname(__file__)
    h5ad_path = os.path.join(tests_dir, "data/data_sparse.h5ad")
    summary = H5ADLoader(h5ad_path)

    # Size on disk, matrix shape, and sparsity rounded to two decimals.
    self.assertEqual(19560, summary.file_size)
    self.assertEqual(20, summary.n_rows)
    self.assertEqual(25, summary.n_cols)
    rounded_sparsity = round(summary.sparsity, 2)
    self.assertEqual(0.59, rounded_sparsity)


def test_file_summary_h5ad_dense(self):
    """Check the file summary reported for the ``data_dense.h5ad`` fixture."""
    tests_dir = os.path.dirname(__file__)
    h5ad_path = os.path.join(tests_dir, "data/data_dense.h5ad")
    summary = H5ADLoader(h5ad_path)

    # Size on disk, matrix shape, and sparsity rounded to two decimals.
    self.assertEqual(11060, summary.file_size)
    self.assertEqual(20, summary.n_rows)
    self.assertEqual(25, summary.n_cols)
    rounded_sparsity = round(summary.sparsity, 2)
    self.assertEqual(0.59, rounded_sparsity)


def test_load_data_mtx(self):
file_name = os.path.join(os.path.dirname(__file__),
"data/10x/mm10/matrix.mtx")
Expand Down
41 changes: 40 additions & 1 deletion orangecontrib/single_cell/tests/test_owloaddata.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import pandas as pd

from Orange.data import ContinuousVariable, Variable
from Orange.widgets.data.owtable import OWDataTable
from Orange.widgets.tests.base import WidgetTest

from orangecontrib.single_cell.widgets.owloaddata import OWLoadData
Expand Down Expand Up @@ -241,6 +240,46 @@ def test_load_data_loom_sample(self):
self.assertListEqual([attr.name for attr in data.domain.attributes],
["0", "1", "2", "3", "6", "7"])

def test_load_data_h5ad_sparse_sample(self):
    """Load ``data_sparse.h5ad`` through the widget with sampling enabled.

    Row and column sampling are both switched on (settings 30 and 10),
    the widget is committed, and the "Data" output is compared against
    the expected matrix, metas and attribute names.
    """
    widget = self.widget
    widget.set_current_path(os.path.join(self._path, "data_sparse.h5ad"))
    for checkbox in (widget.sample_rows_cb, widget.sample_cols_cb):
        checkbox.setChecked(True)
    widget.set_sample_rows_p(30)
    widget.set_sample_cols_p(10)
    widget.commit()
    output = self.get_output("Data")

    expected_x = np.array([
        [0, 2, 1, 0],
        [1, 0, 0, 0],
        [2, 1, 2, 1],
        [1, 1, 0, 0],
        [1, 0, 0, 0],
        [0, 1, 0, 2],
        [1, 0, 0, 1],
        [0, 2, 0, 1],
        [0, 1, 1, 0],
    ])
    npt.assert_array_equal(output.X, expected_x)

    expected_metas = np.array([
        [0, 0, 10],
        [1, 0, 10],
        [2, 0, 15],
        [3, 0, 10],
        [6, 1, 11],
        [7, 1, 15],
        [8, 1, 11],
        [9, 1, 20],
        [14, 2, 12],
    ])
    npt.assert_array_equal(output.metas, expected_metas)

    attribute_names = [attr.name for attr in output.domain.attributes]
    self.assertListEqual(attribute_names,
                         ["Gene 0", "Gene 1", "Gene 2", "Gene 3"])


def test_load_data_h5ad_dense_sample(self):
    """Load ``data_dense.h5ad`` through the widget with sampling enabled.

    Row and column sampling are both switched on (settings 30 and 10),
    the widget is committed, and the "Data" output is compared against
    the expected matrix, metas and attribute names.
    """
    widget = self.widget
    widget.set_current_path(os.path.join(self._path, "data_dense.h5ad"))
    for checkbox in (widget.sample_rows_cb, widget.sample_cols_cb):
        checkbox.setChecked(True)
    widget.set_sample_rows_p(30)
    widget.set_sample_cols_p(10)
    widget.commit()
    output = self.get_output("Data")

    expected_x = np.array([
        [0, 2, 1, 0],
        [1, 0, 0, 0],
        [2, 1, 2, 1],
        [1, 1, 0, 0],
        [1, 0, 0, 0],
        [0, 1, 0, 2],
        [1, 0, 0, 1],
        [0, 2, 0, 1],
        [0, 1, 1, 0],
    ])
    npt.assert_array_equal(output.X, expected_x)

    expected_metas = np.array([
        [0, 0, 10],
        [1, 0, 10],
        [2, 0, 15],
        [3, 0, 10],
        [6, 1, 11],
        [7, 1, 15],
        [8, 1, 11],
        [9, 1, 20],
        [14, 2, 12],
    ])
    npt.assert_array_equal(output.metas, expected_metas)

    attribute_names = [attr.name for attr in output.domain.attributes]
    self.assertListEqual(attribute_names,
                         ["Gene 0", "Gene 1", "Gene 2", "Gene 3"])


def test_not_enough_headers(self):
file_name = os.path.join(self._path, "DATA_MATRIX_LOG_TPM.txt")
self.widget.set_current_path(file_name)
Expand Down
58 changes: 56 additions & 2 deletions orangecontrib/single_cell/widgets/load_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@
import numpy as np
import pandas as pd
import scipy.io
import scipy.sparse as sp
import loompy as lp
import xlrd

from Orange.data import (
ContinuousVariable, DiscreteVariable, Domain, Table, StringVariable
ContinuousVariable, DiscreteVariable, Domain, Table
)
from Orange.data.io import (
Compression, open_compressed, PickleReader,
Expand Down Expand Up @@ -49,6 +50,8 @@ def get_data_loader(file_name):
return ExcelLoader(file_name)
elif ext == ".loom":
return LoomLoader(file_name)
elif ext == ".h5ad":
return H5ADLoader(file_name)
else:
return Loader(file_name)

Expand Down Expand Up @@ -425,7 +428,7 @@ def __into_orange_table(self, attrs, X, meta_parts):
@staticmethod
def __guess_metas(meta_parts):
def guessed_var(i, var_name, dtype):
if np.issubdtype(dtype, np.number):
if pd.core.dtypes.common.is_numeric_dtype(dtype):
return ContinuousVariable.make(var_name)
orig_values = M[:, i]
val_map, values, var_type = guess_data_type(orig_values)
Expand Down Expand Up @@ -666,6 +669,57 @@ def _load_data(self, skip_row=None, skip_col=None, **kwargs):
attrs = [ContinuousVariable.make(str(g)) for g in gene_names]
meta_df = pd.DataFrame({key: ds.ca[key][self._use_cols_mask]
for key in ds.ca.keys()})

return attrs, X, meta_df, meta_df.index


class H5ADLoader(Loader):
    """Loader for AnnData ``.h5ad`` files.

    The file is read with ``anndata.read_h5ad``; ``anndata`` is imported
    lazily inside the methods that need it.
    """

    def __init__(self, file_name):
        super().__init__(file_name)
        self.header_rows_count = 0
        self.header_cols_count = 0
        self.FIXED_FORMAT = False
        self.ENABLE_ANNOTATIONS = False
        self.transposed = False
        self.row_annotations_enabled = False
        self.col_annotations_enabled = False

    def _set_file_parameters(self):
        """Fill in ``n_rows``, ``n_cols`` and ``sparsity`` from the file.

        Sparsity is the fraction of matrix entries that are zero.  If the
        file cannot be read (``OSError``) the attributes are left untouched.
        For an empty matrix the shape is recorded but ``sparsity`` is left
        untouched, because the fraction is undefined.
        """
        import anndata

        try:
            adata = anndata.read_h5ad(self._file_name)
        except OSError:
            # Best effort: an unreadable file simply yields no summary.
            return

        self.n_rows, self.n_cols = adata.shape
        all_el = self.n_rows * self.n_cols
        if not all_el:
            # Avoid dividing by zero when either dimension is empty.
            return

        if sp.issparse(adata.X):
            n_nonzero = adata.X.tocsr().count_nonzero()
        else:
            n_nonzero = np.count_nonzero(adata.X)
        self.sparsity = (all_el - n_nonzero) / all_el

    @staticmethod
    def _mask_from_skip(size, skip=None):
        """Return a boolean keep-mask of length ``size``.

        ``skip`` is an optional callable taking an index and returning a
        truthy value for entries to drop; without it everything is kept.
        """
        if skip is None:
            return np.ones(size, dtype=bool)
        # An explicit dtype keeps the mask boolean even when ``size`` is 0
        # (an empty list would otherwise produce a float array).
        return np.array([not skip(i) for i in range(size)], dtype=bool)

    def _load_data(self, skip_row=None, skip_col=None, **kwargs):
        """Read the file and return ``(attrs, X, meta_df, meta_df.index)``.

        Parameters
        ----------
        skip_row, skip_col : callable or None
            Called with a row / column index; a truthy result drops that
            row / column.  ``None`` keeps everything.

        Returns
        -------
        attrs : list of ContinuousVariable
            One variable per kept column, named after ``adata.var_names``.
        X : array
            Data matrix of the kept rows and columns; a sparse matrix is
            converted with ``toarray()``.
        meta_df : pandas.DataFrame
            The ``adata.obs`` columns of the kept rows.
        index : pandas.Index
            Index of ``meta_df``.
        """
        import anndata

        adata = anndata.read_h5ad(self._file_name)

        n_rows, n_cols = adata.shape
        self._use_rows_mask = self._mask_from_skip(n_rows, skip_row)
        self._use_cols_mask = self._mask_from_skip(n_cols, skip_col)

        adata = adata[self._use_rows_mask, :]
        adata = adata[:, self._use_cols_mask]

        attrs = [ContinuousVariable.make(str(g)) for g in adata.var_names]

        obs = adata.obs
        if len(obs.columns):
            meta_df = pd.DataFrame({key: obs[key] for key in obs.keys()})
        else:
            # Building a frame from an empty dict would drop the observation
            # index, so the returned index would no longer match the rows of X.
            meta_df = pd.DataFrame(index=obs.index)

        X = adata.X.toarray() if sp.issparse(adata.X) else adata.X

        return attrs, X, meta_df, meta_df.index



Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def include_documentation(local_dir, install_dir):
'fastdtw==0.3.2',
'pandas>=0.23',
'loompy>=2.0.10',
'anndata>=0.6.21',
'numpy',
'scikit-learn',
],
Expand Down

0 comments on commit 231f024

Please sign in to comment.