From aa7679fb92fe4a45222e159e3629fd6ee456fdda Mon Sep 17 00:00:00 2001 From: Trent Smith <1429913+Bento007@users.noreply.github.com> Date: Tue, 19 Sep 2023 16:54:45 -0700 Subject: [PATCH] feat(4.0): Remove 5000 cell limit for raw validation (#634) - remove `max_values_to_check` from `Validator._is_raw` - Add unit test for the `Validator._is_raw` - Used [memory_profiler](https://pypi.org/project/memory-profiler/) to verify memory usage does not explode with this change. A 1,395,601 cell dataset used <5GB of memory. The original size of the file was 50GB. --- .../cellxgene_schema/validate.py | 13 ++--- cellxgene_schema_cli/tests/test_validate.py | 50 +++++++++++++++++++ 2 files changed, 53 insertions(+), 10 deletions(-) diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index ba07b73b4..83b6e2b8e 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -289,8 +289,8 @@ def _validate_feature_id(self, feature_id: str, df_name: str): return + @staticmethod def _chunk_matrix( - self, matrix: Union[np.ndarray, sparse.spmatrix], obs_chunk_size: Optional[int] = 10_000, ): @@ -898,9 +898,9 @@ def _get_raw_x_loc(self) -> str: else: return "X" - def _is_raw(self, max_values_to_check: int = 5000, force: bool = False) -> bool: + def _is_raw(self, force: bool = False) -> bool: """ - Checks if the first non-zero "max_values_to_check" in the best guess for the raw matrix (adata.X or adata.raw.X) + Checks if the non-zero values for the raw matrix (adata.X or adata.raw.X) are integers. Returns False if at least one value is not an integer, True otherwise. @@ -908,8 +908,6 @@ def _is_raw(self, max_values_to_check: int = 5000, force: bool = False) -> bool: Since this process is memory intensive, it will return a cache value if this function has been called before. 
If calculation needs to be repeated use `force = True` - :param int max_values_to_check: total values to check, default set to 5000 due to performance concerns. - :rtype bool :return False if at least one value is not an integer, True otherwise """ @@ -921,7 +919,6 @@ def _is_raw(self, max_values_to_check: int = 5000, force: bool = False) -> bool: raw_loc = self._get_raw_x_loc() x = self.adata.raw.X if raw_loc == "raw.X" else self.adata.X - num_values_checked = 0 format = get_matrix_format(self.adata, x) assert format != "unknown" self._raw_layer_exists = True @@ -931,10 +928,6 @@ def _is_raw(self, max_values_to_check: int = 5000, force: bool = False) -> bool: self._raw_layer_exists = False break - num_values_checked += matrix_chunk.nnz if format != "dense" else np.count_nonzero(matrix_chunk) - if num_values_checked > max_values_to_check: - break - return self._raw_layer_exists def _validate_x_raw_x_dimensions(self): diff --git a/cellxgene_schema_cli/tests/test_validate.py b/cellxgene_schema_cli/tests/test_validate.py index 48f4c2566..0dcf36141 100644 --- a/cellxgene_schema_cli/tests/test_validate.py +++ b/cellxgene_schema_cli/tests/test_validate.py @@ -3,10 +3,12 @@ import os import tempfile import unittest +from typing import Union from unittest import mock import anndata import numpy as np +import pytest from cellxgene_schema.ontology import OntologyChecker from cellxgene_schema.schema import get_schema_definition from cellxgene_schema.validate import Validator, validate @@ -22,7 +24,9 @@ h5ad_invalid, h5ad_valid, ) +from numpy import ndarray from scipy import sparse +from scipy.sparse import spmatrix # Tests for internal functions of the Validator and LabelWriter classes. 
@@ -352,3 +356,49 @@ def test_determine_seurat_convertibility(self): self.assertTrue(len(self.validator.errors) == 1) self.assertFalse(self.validator.is_seurat_convertible) self.assertFalse(self.validator.is_valid) + + +class TestIsRaw: + @staticmethod + def create_validator(data: Union[ndarray, spmatrix], format: str) -> Validator: + """ + Create a sample AnnData instance with the given data and format. + + :param data: The data matrix. + :param format: The format of the data matrix (e.g., "dense", "csr", "csc"). + + :return anndata.AnnData: An AnnData instance with the specified data and format. + """ + validator = Validator() + + adata = anndata.AnnData(X=data) + adata.obsm["X_" + format] = data + + validator.adata = adata + return validator + + @pytest.mark.parametrize( + "data, format, expected_result", + [ + # Test case with integer values in a dense matrix + (np.array([[1, 2, 3], [4, 5, 6]], dtype=int), "dense", True), + # Test case with float values in a dense matrix + (np.array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6]]), "dense", False), + # Test case with integer values in a sparse matrix (CSR format) + (sparse.csr_matrix([[1, 0, 3], [0, 5, 0]], dtype=int), "csr", True), + # Test case with float values in a sparse matrix (CSC format) + (sparse.csc_matrix([[1.1, 0, 3.3], [0, 5.5, 0]]), "csc", False), + # Test case with mixed integer and float values in a dense matrix + (np.array([[1, 2.2, 3], [4.4, 5, 6.6]]), "dense", False), + ], + ) + def test_is_raw(self, data, format, expected_result): + validator = self.create_validator(data, format) + assert validator._is_raw() == expected_result + + @mock.patch("cellxgene_schema.validate.get_matrix_format", return_value="unknown") + def test_is_raw_with_unknown_format(self, mock_get_matrix_format): + data = np.array([[1, 2, 3], [4, 5, 6]], dtype=int) + validator = self.create_validator(data, "unknown") + with pytest.raises(AssertionError): + validator._is_raw()