From ccad8d95fd694e8abdc34d0f3edd5cbbb3df1643 Mon Sep 17 00:00:00 2001 From: Daniel Hegeman Date: Fri, 15 Sep 2023 08:58:04 -0700 Subject: [PATCH] feat: validate raw matrix does not have more genes than raw var (#619) * feat: validate raw matrix does not have more genes than raw var * only test raw matrix dimensionality if it exists * update tests to validate raw.var and raw.X dimension consistency if raw exists + add check if raw exists (#621) * remove unused imports --------- Co-authored-by: Nayib Gloria <55710092+nayib-jose-gloria@users.noreply.github.com> --- .../cellxgene_schema/validate.py | 8 ++++++++ cellxgene_schema_cli/tests/test_validate.py | 19 +++++++++++++++---- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index 62d987101..80dd15f7a 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -801,6 +801,14 @@ def _validate_seurat_convertibility(self): self.is_seurat_convertible = False + if self.adata.raw and self.adata.raw.X.shape[1] != self.adata.raw.var.shape[0]: + self.warnings.append( + "This dataset cannot be converted to the .rds (Seurat v4) format. " + "There is a mismatch in the number of variables in the raw matrix and the raw var key-indexed " + "variables." + ) + self.is_seurat_convertible = False + def _validate_embedding_dict(self): """ Validates the embedding dictionary -- it checks that all values of adata.obsm are numpy arrays with the correct diff --git a/cellxgene_schema_cli/tests/test_validate.py b/cellxgene_schema_cli/tests/test_validate.py index 0c71f6fa6..01becd50a 100644 --- a/cellxgene_schema_cli/tests/test_validate.py +++ b/cellxgene_schema_cli/tests/test_validate.py @@ -54,16 +54,16 @@ def test_schema_definition(self): self.assertTrue("type" in self.schema_def["components"]["obs"]["columns"][i]) if i == "curie": self.assertIsInstance( - self.schema_def["components"]["obs"]["columns"][i]["curie_constrains"], + self.schema_def["components"]["obs"]["columns"][i]["curie_constraints"], dict, ) self.assertIsInstance( - self.schema_def["components"]["obs"]["columns"][i]["curie_constrains"]["ontolgies"], + self.schema_def["components"]["obs"]["columns"][i]["curie_constraints"]["ontolgies"], list, ) # Check that the allowed ontologies are in the ontology checker - for ontology_name in self.schema_def["components"]["obs"]["columns"][i]["curie_constrains"][ + for ontology_name in self.schema_def["components"]["obs"]["columns"][i]["curie_constraints"][ "ontolgies" ]: self.assertTrue(self.OntologyChecker.is_valid_ontology(ontology_name)) @@ -281,8 +281,10 @@ def test__validate_with_h5ad_invalid_and_without_labels(self): class TestSeuratConvertibility(unittest.TestCase): - def validation_helper(self, matrix): + def validation_helper(self, matrix, raw=None): data = anndata.AnnData(X=matrix, obs=good_obs, uns=good_uns, obsm=good_obsm, var=good_var) + if raw: + data.raw = raw self.validator: Validator = Validator() self.validator._set_schema_def() self.validator.schema_def["max_size_for_seurat"] = 2**3 - 1 # Reduce size required to fail (faster tests) @@ -319,3 +321,12 @@ def test_determine_seurat_convertibility(self): self.validator._validate_seurat_convertibility() self.assertTrue(len(self.validator.warnings) == 0) self.assertTrue(self.validator.is_seurat_convertible) + + # h5ad where raw matrix variable count != length of raw var variables array is not Seurat-convertible + matrix = sparse.csr_matrix(np.zeros([good_obs.shape[0], good_var.shape[0]], dtype=np.float32)) + raw = anndata.AnnData(X=matrix, var=good_var) + raw.var.drop("ENSSASG00005000004", axis=0, inplace=True) + self.validation_helper(matrix, raw) + self.validator._validate_seurat_convertibility() + self.assertTrue(len(self.validator.warnings) == 1) + self.assertFalse(self.validator.is_seurat_convertible)