From a6c9086bc57d0d374701d5ac2a5941cf1517c2de Mon Sep 17 00:00:00 2001 From: Joyce Yan <5653616+joyceyan@users.noreply.github.com> Date: Tue, 10 Dec 2024 15:13:00 -0800 Subject: [PATCH] chore: update logging to log by donor id instead (#1150) --- cellxgene_schema_cli/cellxgene_schema/validate.py | 9 ++++++--- cellxgene_schema_cli/tests/test_schema_compliance.py | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index 781b72d1..6b62f796 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -554,9 +554,10 @@ def is_valid_row(row): invalid_rows = ~self.adata.obs.apply(is_valid_row, axis=1) if invalid_rows.any(): - invalid_indices = self.adata.obs.index[invalid_rows].tolist() + donor_ids = self.adata.obs[donor_id_column].tolist() + unique_donor_ids = list(set(donor_ids)) self.errors.append( - f"obs rows with indices {invalid_indices} have invalid genetic_ancestry_* values. All " + f"obs rows with donor ids {unique_donor_ids} have invalid genetic_ancestry_* values. All " f"observations with the same donor_id must contain the same genetic_ancestry_* values. If " f"organism_ontolology_term_id is NOT 'NCBITaxon:9606' for Homo sapiens, then all genetic" f"ancestry values MUST be float('nan'). If organism_ontolology_term_id is 'NCBITaxon:9606' " @@ -959,7 +960,6 @@ def _validate_dataframe(self, df_name: str): f"Column '{column_name}' in dataframe '{df_name}' contains a category '{category}' with " f"zero observations. These categories will be removed when `--add-labels` flag is present." ) - self._validate_genetic_ancestry() categorical_types = {type(x) for x in column.dtype.categories.values} # Check for columns that have illegal categories, which are not supported by anndata 0.8.0 # TODO: check if this can be removed after upgading to anndata 0.10.0 @@ -2058,6 +2058,9 @@ def _deep_check(self): # Checks spatial self._check_spatial() + # Validate genetic ancestry + self._validate_genetic_ancestry() + # Checks each component for component_name, component_def in self.schema_def["components"].items(): logger.debug(f"Validating component: {component_name}") diff --git a/cellxgene_schema_cli/tests/test_schema_compliance.py b/cellxgene_schema_cli/tests/test_schema_compliance.py index 85baab4b..bf83c97b 100644 --- a/cellxgene_schema_cli/tests/test_schema_compliance.py +++ b/cellxgene_schema_cli/tests/test_schema_compliance.py @@ -1740,7 +1740,7 @@ def test_genetic_ancestry_same_donor_id(self, validator_with_adata): validator.adata.obs["genetic_ancestry_South_Asian"] = [0.0, 0.0] validator.reset(None, 2) validator.validate_adata() - assert len(validator.errors) > 0 + assert len(validator.errors) == 1 # Change the donor id back to two different donor id's. Now, this should pass validation validator.adata.obs["donor_id"] = original_donor_id_column