Skip to content

Commit

Permalink
feat: validate raw matrix does not have more genes than raw var (#619)
Browse files (browse the repository at this point in the history)
* feat: validate raw matrix does not have more genes than raw var

* only test raw matrix dimensionality if it exists

* update tests to validate raw.var and raw.X dimension consistency when raw exists, and add a check for whether raw exists (#621)

* remove unused imports

---------

Co-authored-by: Nayib Gloria <[email protected]>
  • Loading branch information
Daniel Hegeman and nayib-jose-gloria authored Sep 15, 2023
1 parent 9bfa58c commit ccad8d9
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 4 deletions.
8 changes: 8 additions & 0 deletions cellxgene_schema_cli/cellxgene_schema/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -801,6 +801,14 @@ def _validate_seurat_convertibility(self):

self.is_seurat_convertible = False

if self.adata.raw and self.adata.raw.X.shape[1] != self.adata.raw.var.shape[0]:
self.warnings.append(
"This dataset cannot be converted to the .rds (Seurat v4) format. "
"There is a mismatch in the number of variables in the raw matrix and the raw var key-indexed "
"variables."
)
self.is_seurat_convertible = False

def _validate_embedding_dict(self):
"""
Validates the embedding dictionary -- it checks that all values of adata.obsm are numpy arrays with the correct
Expand Down
19 changes: 15 additions & 4 deletions cellxgene_schema_cli/tests/test_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,16 +54,16 @@ def test_schema_definition(self):
self.assertTrue("type" in self.schema_def["components"]["obs"]["columns"][i])
if i == "curie":
self.assertIsInstance(
self.schema_def["components"]["obs"]["columns"][i]["curie_constrains"],
self.schema_def["components"]["obs"]["columns"][i]["curie_constraints"],
dict,
)
self.assertIsInstance(
self.schema_def["components"]["obs"]["columns"][i]["curie_constrains"]["ontolgies"],
self.schema_def["components"]["obs"]["columns"][i]["curie_constraints"]["ontolgies"],
list,
)

# Check that the allowed ontologies are in the ontology checker
for ontology_name in self.schema_def["components"]["obs"]["columns"][i]["curie_constrains"][
for ontology_name in self.schema_def["components"]["obs"]["columns"][i]["curie_constraints"][
"ontolgies"
]:
self.assertTrue(self.OntologyChecker.is_valid_ontology(ontology_name))
Expand Down Expand Up @@ -281,8 +281,10 @@ def test__validate_with_h5ad_invalid_and_without_labels(self):


class TestSeuratConvertibility(unittest.TestCase):
def validation_helper(self, matrix):
def validation_helper(self, matrix, raw=None):
data = anndata.AnnData(X=matrix, obs=good_obs, uns=good_uns, obsm=good_obsm, var=good_var)
if raw:
data.raw = raw
self.validator: Validator = Validator()
self.validator._set_schema_def()
self.validator.schema_def["max_size_for_seurat"] = 2**3 - 1 # Reduce size required to fail (faster tests)
Expand Down Expand Up @@ -319,3 +321,12 @@ def test_determine_seurat_convertibility(self):
self.validator._validate_seurat_convertibility()
self.assertTrue(len(self.validator.warnings) == 0)
self.assertTrue(self.validator.is_seurat_convertible)

# h5ad where raw matrix variable count != length of raw var variables array is not Seurat-convertible
matrix = sparse.csr_matrix(np.zeros([good_obs.shape[0], good_var.shape[0]], dtype=np.float32))
raw = anndata.AnnData(X=matrix, var=good_var)
raw.var.drop("ENSSASG00005000004", axis=0, inplace=True)
self.validation_helper(matrix, raw)
self.validator._validate_seurat_convertibility()
self.assertTrue(len(self.validator.warnings) == 1)
self.assertFalse(self.validator.is_seurat_convertible)

0 comments on commit ccad8d9

Please sign in to comment.