Skip to content

Commit

Permalink
feat(4.0): Remove 5000 cell limit for raw validation (#634)
Browse files Browse the repository at this point in the history
- Remove `max_values_to_check` from `Validator._is_raw`
- Add unit test for the `Validator._is_raw`
- Used [memory_profiler](https://pypi.org/project/memory-profiler/) to verify memory usage does not explode with this change. A 1,395,601 cell dataset used <5GB of memory. The original size of the file was 50GB.
  • Loading branch information
Bento007 authored Sep 19, 2023
1 parent 1f482ae commit aa7679f
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 10 deletions.
13 changes: 3 additions & 10 deletions cellxgene_schema_cli/cellxgene_schema/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,8 +289,8 @@ def _validate_feature_id(self, feature_id: str, df_name: str):

return

@staticmethod
def _chunk_matrix(
self,
matrix: Union[np.ndarray, sparse.spmatrix],
obs_chunk_size: Optional[int] = 10_000,
):
Expand Down Expand Up @@ -898,18 +898,16 @@ def _get_raw_x_loc(self) -> str:
else:
return "X"

def _is_raw(self, max_values_to_check: int = 5000, force: bool = False) -> bool:
def _is_raw(self, force: bool = False) -> bool:
"""
Checks if the first non-zero "max_values_to_check" in the best guess for the raw matrix (adata.X or adata.raw.X)
Checks if the non-zero values for the raw matrix (adata.X or adata.raw.X)
are integers. Returns False if at least one value is not an integer,
True otherwise.
Since this process is memory intensive, it will return a cache value if this function has been called before.
If calculation needs to be repeated use `force = True`
:param int max_values_to_check: total values to check, default set to 5000 due to performance concerns.
:rtype bool
:return False if at least one value is not an integer, True otherwise
"""
Expand All @@ -921,7 +919,6 @@ def _is_raw(self, max_values_to_check: int = 5000, force: bool = False) -> bool:
raw_loc = self._get_raw_x_loc()
x = self.adata.raw.X if raw_loc == "raw.X" else self.adata.X

num_values_checked = 0
format = get_matrix_format(self.adata, x)
assert format != "unknown"
self._raw_layer_exists = True
Expand All @@ -931,10 +928,6 @@ def _is_raw(self, max_values_to_check: int = 5000, force: bool = False) -> bool:
self._raw_layer_exists = False
break

num_values_checked += matrix_chunk.nnz if format != "dense" else np.count_nonzero(matrix_chunk)
if num_values_checked > max_values_to_check:
break

return self._raw_layer_exists

def _validate_x_raw_x_dimensions(self):
Expand Down
50 changes: 50 additions & 0 deletions cellxgene_schema_cli/tests/test_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
import os
import tempfile
import unittest
from typing import Union
from unittest import mock

import anndata
import numpy as np
import pytest
from cellxgene_schema.ontology import OntologyChecker
from cellxgene_schema.schema import get_schema_definition
from cellxgene_schema.validate import Validator, validate
Expand All @@ -22,7 +24,9 @@
h5ad_invalid,
h5ad_valid,
)
from numpy import ndarray
from scipy import sparse
from scipy.sparse import spmatrix

# Tests for internal functions of the Validator and LabelWriter classes.

Expand Down Expand Up @@ -352,3 +356,49 @@ def test_determine_seurat_convertibility(self):
self.assertTrue(len(self.validator.errors) == 1)
self.assertFalse(self.validator.is_seurat_convertible)
self.assertFalse(self.validator.is_valid)


class TestIsRaw:
    """Tests for ``Validator._is_raw`` over dense and sparse input matrices."""

    # Each case is (matrix, format label, expected return value of ``_is_raw``).
    _IS_RAW_CASES = [
        # Integer values in a dense matrix
        (np.array([[1, 2, 3], [4, 5, 6]], dtype=int), "dense", True),
        # Float values in a dense matrix
        (np.array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6]]), "dense", False),
        # Integer values in a sparse matrix (CSR format)
        (sparse.csr_matrix([[1, 0, 3], [0, 5, 0]], dtype=int), "csr", True),
        # Float values in a sparse matrix (CSC format)
        (sparse.csc_matrix([[1.1, 0, 3.3], [0, 5.5, 0]]), "csc", False),
        # Mixed integer and float values in a dense matrix
        (np.array([[1, 2.2, 3], [4.4, 5, 6.6]]), "dense", False),
    ]

    @staticmethod
    def create_validator(data: Union[ndarray, spmatrix], format: str) -> Validator:
        """
        Build a Validator whose ``adata`` attribute wraps the given matrix.

        The matrix becomes ``X`` of a new AnnData object and is additionally stored
        in ``obsm`` under the key ``"X_" + format``.

        :param data: The data matrix.
        :param format: Label of the matrix format (e.g., "dense", "csr", "csc");
            here it is only used to name the ``obsm`` entry.
        :return Validator: A Validator instance with ``adata`` set to the new AnnData.
        """
        instance = Validator()

        sample = anndata.AnnData(X=data)
        obsm_key = "X_" + format
        sample.obsm[obsm_key] = data

        instance.adata = sample
        return instance

    @pytest.mark.parametrize("data, format, expected_result", _IS_RAW_CASES)
    def test_is_raw(self, data, format, expected_result):
        """``_is_raw`` returns the expected flag for every parametrized matrix."""
        observed = self.create_validator(data, format)._is_raw()
        assert observed == expected_result

    @mock.patch("cellxgene_schema.validate.get_matrix_format", return_value="unknown")
    def test_is_raw_with_unknown_format(self, mock_get_matrix_format):
        """``_is_raw`` raises AssertionError when the matrix format is reported as "unknown"."""
        # The patched get_matrix_format always answers "unknown", whatever the matrix is.
        integer_matrix = np.array([[1, 2, 3], [4, 5, 6]], dtype=int)
        instance = self.create_validator(integer_matrix, "unknown")
        with pytest.raises(AssertionError):
            instance._is_raw()

0 comments on commit aa7679f

Please sign in to comment.