Skip to content

Commit

Permalink
Merge pull request #1422 from nikicc/sparse-fixup
Browse files Browse the repository at this point in the history
Statistics.util.stats: Support sparse data
  • Loading branch information
kernc authored Jul 7, 2016
2 parents 2f5975f + 78183e8 commit d689844
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 9 deletions.
42 changes: 33 additions & 9 deletions Orange/statistics/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,12 +163,36 @@ def stats(X, weights=None, compute_variance=False):
if weights is not None:
X = X * weights
is_numeric = np.issubdtype(X.dtype, np.number)
nans = (np.isnan(X) if is_numeric else ~X.astype(bool)).sum(axis=0)
variance = np.nanvar(X, axis=0) if compute_variance and is_numeric else np.zeros(X.shape[1])
return np.column_stack((
np.nanmin(X, axis=0) if is_numeric else np.tile(np.inf, X.shape[1]),
np.nanmax(X, axis=0) if is_numeric else np.tile(-np.inf, X.shape[1]),
np.nanmean(X, axis=0) if is_numeric else np.zeros(X.shape[1]),
variance,
nans,
X.shape[0] - nans))
is_sparse = issparse(X)

if is_numeric and not is_sparse:
nans = np.isnan(X).sum(axis=0)
return np.column_stack((
np.nanmin(X, axis=0),
np.nanmax(X, axis=0),
np.nanmean(X, axis=0),
np.nanvar(X, axis=0) if compute_variance else np.zeros(X.shape[1]),
nans,
X.shape[0] - nans))
elif is_sparse:
if compute_variance:
raise NotImplementedError

non_zero = np.bincount(X.nonzero()[1])
X = X.tocsc()
return np.column_stack((
X.min(axis=0).toarray().ravel(),
X.max(axis=0).toarray().ravel(),
np.asarray(X.mean(axis=0)).ravel(),
np.zeros(X.shape[1]), # variance not supported
X.shape[1] - non_zero,
non_zero))
else:
nans = ~X.astype(bool).sum(axis=0)
return np.column_stack((
np.tile(np.inf, X.shape[1]),
np.tile(-np.inf, X.shape[1]),
np.zeros(X.shape[1]),
np.zeros(X.shape[1]),
nans,
X.shape[0] - nans))
9 changes: 9 additions & 0 deletions Orange/tests/test_statistics.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import unittest
import numpy as np
from scipy.sparse import csr_matrix

from Orange.statistics.util import bincount, countnans, contingency, stats

Expand Down Expand Up @@ -32,3 +33,11 @@ def test_stats(self):
X[1, 1] = np.nan
np.testing.assert_equal(stats(X), [[0, 2, 1, 0, 0, 2],
[1, 1, 1, 0, 1, 1]])

def test_stats_sparse(self):
    """Check ``stats`` on a sparse (CSR) 5x5 identity matrix.

    Every column of the identity matrix holds a single 1 and four 0s,
    so each of the five result rows is expected to be the same:
    min 0, max 1, mean 0.2, variance 0, followed by the counts 4 and 1.
    """
    size = 5
    sparse_identity = csr_matrix(np.identity(size))
    # One expected statistics row, repeated once per column of the input.
    expected_row = [0, 1, .2, 0, 4, 1]
    expected = [list(expected_row) for _ in range(size)]
    np.testing.assert_equal(stats(sparse_identity), expected)

0 comments on commit d689844

Please sign in to comment.