multi-species filtering per the 2.0.0 census schema

chanzuckerberg · Mar 19, 2024 · cc0ed89 · cc0ed89
1 parent 36cd170
commit cc0ed89
Show file tree

Hide file tree

Showing 7 changed files with 76 additions and 41 deletions.
diff --git a/docs/cellxgene_census_schema.md b/docs/cellxgene_census_schema.md
@@ -97,12 +97,12 @@ The table below shows all possible combinations of organisms for both observatio
   <tr>
     <td>"NCBITaxon:9606" for <i>Homo sapiens</i></td>
     <td>"NCBITaxon:9606" for <i>Homo sapiens</i> <b>AND</b> "NCBITaxon:10090" for <i>Mus musculus</i></td>
-    <td>All observations MUST be included. The Census MUST only contain feautes from "NCBITaxon:9606" for <i>Homo sapiens</i>.</td>
+    <td>All observations MUST be included. The Census MUST only contain features from "NCBITaxon:9606" for <i>Homo sapiens</i>.</td>
   </tr>
   <tr>
     <td>"NCBITaxon:10090" for <i>Mus musculus</i></td>
     <td>"NCBITaxon:9606" for <i>Homo sapiens</i> <b>AND</b> "NCBITaxon:10090" for <i>Mus musculus</i></td>
-    <td>All observations MUST be included. The Census MUST only contain feautes from "NCBITaxon:10090" for <i>Mus musculus</i>.</td>
+    <td>All observations MUST be included. The Census MUST only contain features from "NCBITaxon:10090" for <i>Mus musculus</i>.</td>
   </tr>
   <tr>
     <td>"NCBITaxon:9606" for <i>Homo sapiens</i> <b>AND</b> "NCBITaxon:10090" for <i>Mus musculus</i></td>

diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/anndata.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/anndata.py
@@ -14,14 +14,14 @@
 
 from ..util import urlcat
 from .datasets import Dataset
-from .globals import CXG_SCHEMA_VERSION, FEATURE_REFERENCE_IGNORE
+from .globals import CXG_SCHEMA_VERSION
 
 logger = logging.getLogger(__name__)
 
 
 class AnnDataFilterSpec(TypedDict):
-    organism_ontology_term_id: str | None
-    assay_ontology_term_ids: list[str] | None
+    organism_ontology_term_id: str
+    assay_ontology_term_ids: list[str]
 
 
 # Indexing types
@@ -175,7 +175,12 @@ def get_estimated_density(self) -> float:
         This is NOT the density for any given slice.
 
         Approach: divide the whole file nnz by the product of the shape.
+
+        Arbitarily picks density of 1.0 if the file is empty on either axis
         """
+        if self.n_obs * self.n_vars == 0:
+            return 1.0
+
         nnz: int
         if isinstance(self._X, CSRDataset | CSCDataset):
             nnz = self._X.group["data"].size
@@ -298,46 +303,43 @@ def __call__(self, ad: AnnDataProxy) -> AnnDataProxy:
 def make_anndata_cell_filter(filter_spec: AnnDataFilterSpec) -> AnnDataFilterFunction:
     """Return an anndata sliced/filtered for those cells/genes of interest.
 
+    See: https://github.com/chanzuckerberg/cellxgene-census/blob/main/docs/cellxgene_census_schema.md
+
     obs filter:
     * not organoid or cell culture
     * Caller-specified assays only
     * Caller-specified taxa (obs.organism_ontology_term_id == '<user-supplied>')
     * Organism term ID value not equal to gene feature_reference value
+    * Single organism
 
     var filter:
     * genes only  (var.feature_biotype == 'gene')
+    * Single organism
     """
     organism_ontology_term_id = filter_spec.get("organism_ontology_term_id", None)
     assay_ontology_term_ids = filter_spec.get("assay_ontology_term_ids", None)
 
     def _filter(ad: AnnDataProxy) -> AnnDataProxy:
-        # Multi-organism datasets are dropped - any dataset with 2+ feature_reference organisms is ignored,
-        # exclusive of values in FEATURE_REFERENCE_IGNORE. See also, cell filter for mismatched
-        # cell/feature organism values.
-        feature_reference_organisms = set(ad.var.feature_reference.unique()) - FEATURE_REFERENCE_IGNORE
-        if len(feature_reference_organisms) > 1:
-            logger.info(f"H5AD ignored due to multi-organism feature_reference: {ad.filename}")
+        """Filter observations and features per Census schema."""
+        var_mask = ad.var.feature_biotype == "gene"
+        obs_mask = ad.obs.tissue_type == "tissue"
+
+        # Handle multi-species edge case
+        var_organisms = set(ad.var.feature_reference[var_mask].unique())
+        obs_organisms = set(ad.obs.organism_ontology_term_id[obs_mask].unique())
+        if len(var_organisms) > 1 and len(obs_organisms) > 1:
+            # if multi-species on both axis -- drop everything
+            logger.info(f"H5AD ignored - multi-species content on both axes: {ad.filename}")
             return ad[0:0]  # ie., drop all cells
 
-        #
-        # Filter cells per Census schema
-        #
-        obs_mask = ad.obs.tissue_type == "tissue"
-        if organism_ontology_term_id is not None:
-            obs_mask = obs_mask & (ad.obs.organism_ontology_term_id == organism_ontology_term_id)
-        if assay_ontology_term_ids is not None:
+        # Filter by the species specified in the filter-spec
+        var_mask = var_mask & (ad.var.feature_reference == organism_ontology_term_id)
+        obs_mask = obs_mask & (ad.obs.organism_ontology_term_id == organism_ontology_term_id)
+        if assay_ontology_term_ids:
             obs_mask = obs_mask & ad.obs.assay_ontology_term_id.isin(assay_ontology_term_ids)
 
-        # multi-organism dataset cell filter - exclude any cells where organism != feature_reference
-        feature_references = set(ad.var.feature_reference.unique()) - FEATURE_REFERENCE_IGNORE
-        assert len(feature_references) == 1  # else there is a bug in the test above
-        feature_reference_organism_ontology_id = feature_references.pop()
-        obs_mask = obs_mask & (ad.obs.organism_ontology_term_id == feature_reference_organism_ontology_id)
-
-        #
-        # Filter features per Census schema
-        #
-        var_mask = ad.var.feature_biotype == "gene"
+        if not (var_mask.any() and obs_mask.any()):
+            return ad[0:0]  # i.e., drop all cells
 
         return ad[
             slice(None) if obs_mask.all() else obs_mask.to_numpy(),

diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/experiment_builder.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/experiment_builder.py
@@ -93,15 +93,17 @@ class ExperimentSpecification:
 
     name: str
     anndata_cell_filter_spec: AnnDataFilterSpec
+    organism_ontology_term_id: str
 
     @classmethod
     def create(
         cls,
         name: str,
         anndata_cell_filter_spec: AnnDataFilterSpec,
+        organism_ontology_term_id: str,
     ) -> Self:
         """Factory method. Do not instantiate the class directly."""
-        return cls(name, anndata_cell_filter_spec)
+        return cls(name, anndata_cell_filter_spec, organism_ontology_term_id)
 
 
 class ExperimentBuilder:

diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/experiment_specs.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/experiment_specs.py
@@ -18,13 +18,15 @@ def make_experiment_specs() -> list[ExperimentSpecification]:
                 "organism_ontology_term_id": "NCBITaxon:9606",
                 "assay_ontology_term_ids": RNA_SEQ,
             },
+            organism_ontology_term_id="NCBITaxon:9606",
         ),
         ExperimentSpecification.create(
             name="mus_musculus",
             anndata_cell_filter_spec={
                 "organism_ontology_term_id": "NCBITaxon:10090",
                 "assay_ontology_term_ids": RNA_SEQ,
             },
+            organism_ontology_term_id="NCBITaxon:10090",
         ),
     ]
 

diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/globals.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/globals.py
@@ -388,10 +388,6 @@
 
 DONOR_ID_IGNORE = ["pooled", "unknown"]
 
-# Feature_reference values which are ignored (not considered) in
-# multi-organism filtering. Currently the null set.
-FEATURE_REFERENCE_IGNORE: set[str] = set()
-
 
 # The default configuration for TileDB contexts used in the builder.
 # Ref: https://docs.tiledb.com/main/how-to/configuration#configuration-parameters

diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/manifest.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/manifest.py
@@ -12,6 +12,8 @@
 logger = logging.getLogger(__name__)
 
 CXG_BASE_URI = "https://api.cellxgene.cziscience.com/"
+# The DEV base, occasionally used for testing.
+# CXG_BASE_URI = "https://api.cellxgene.dev.single-cell.czi.technology/"
 
 
 def parse_manifest_file(manifest_fp: io.TextIOBase) -> list[Dataset]:

diff --git a/tools/cellxgene_census_builder/tests/anndata/test_anndata.py b/tools/cellxgene_census_builder/tests/anndata/test_anndata.py
@@ -11,7 +11,7 @@
 from cellxgene_census_builder.build_soma.datasets import Dataset
 from cellxgene_census_builder.build_state import CensusBuildArgs
 
-from ..conftest import ORGANISMS, get_anndata
+from ..conftest import GENE_IDS, ORGANISMS, get_anndata
 
 
 def test_open_anndata(datasets: list[Dataset]) -> None:
@@ -34,9 +34,15 @@ def test_open_anndata_filters_out_datasets_with_mixed_feature_reference(
     datasets_with_mixed_feature_reference: list[Dataset],
 ) -> None:
     """Datasets with a "mixed" feature_reference will not be included by the filter pipeline"""
-    ad_filter = make_anndata_cell_filter({})
+    ad_filter = make_anndata_cell_filter(
+        {
+            "organism_ontology_term_id": ORGANISMS[0].organism_ontology_term_id,
+            "assay_ontology_term_ids": [],
+        }
+    )
+
     result = [ad_filter(open_anndata(d, base_path=".")) for d in datasets_with_mixed_feature_reference]
-    assert all(len(ad) == 0 for ad in result)
+    assert all(len(ad) == 4 and ad.n_vars == len(GENE_IDS[0]) - 1 for ad in result)
 
 
 def test_open_anndata_filters_out_wrong_schema_version_datasets(
@@ -53,7 +59,12 @@ def test_make_anndata_cell_filter(tmp_path: pathlib.Path, h5ad_simple: str) -> N
     dataset = Dataset(dataset_id="test", dataset_asset_h5ad_uri="test", dataset_h5ad_path=h5ad_simple)
     adata_simple = open_anndata(dataset, base_path=tmp_path.as_posix())
 
-    func = make_anndata_cell_filter({})
+    func = make_anndata_cell_filter(
+        {
+            "organism_ontology_term_id": ORGANISMS[0].organism_ontology_term_id,
+            "assay_ontology_term_ids": [],
+        }
+    )
     filtered_adata_simple = func(adata_simple)
 
     assert adata_simple.var.equals(filtered_adata_simple.var)
@@ -70,7 +81,12 @@ def test_make_anndata_cell_filter_filters_out_organoids_cell_culture(
     )
     adata_with_organoids_and_cell_culture = open_anndata(dataset, base_path=tmp_path.as_posix())
 
-    func = make_anndata_cell_filter({})
+    func = make_anndata_cell_filter(
+        {
+            "organism_ontology_term_id": ORGANISMS[0].organism_ontology_term_id,
+            "assay_ontology_term_ids": [],
+        }
+    )
     filtered_adata_with_organoids_and_cell_culture = func(adata_with_organoids_and_cell_culture)
 
     assert adata_with_organoids_and_cell_culture.var.equals(filtered_adata_with_organoids_and_cell_culture.var)
@@ -81,7 +97,12 @@ def test_make_anndata_cell_filter_organism(tmp_path: pathlib.Path, h5ad_with_org
     dataset = Dataset(dataset_id="test", dataset_asset_h5ad_uri="test", dataset_h5ad_path=h5ad_with_organism)
     adata_with_organism = open_anndata(dataset, base_path=tmp_path.as_posix())
 
-    func = make_anndata_cell_filter({"organism_ontology_term_id": ORGANISMS[0].organism_ontology_term_id})
+    func = make_anndata_cell_filter(
+        {
+            "organism_ontology_term_id": ORGANISMS[0].organism_ontology_term_id,
+            "assay_ontology_term_ids": [],
+        }
+    )
     filtered_adata_with_organism = func(adata_with_organism)
 
     assert adata_with_organism.var.equals(filtered_adata_with_organism.var)
@@ -92,7 +113,12 @@ def test_make_anndata_cell_filter_feature_biotype_gene(tmp_path: pathlib.Path, h
     dataset = Dataset(dataset_id="test", dataset_asset_h5ad_uri="test", dataset_h5ad_path=h5ad_with_feature_biotype)
     adata_with_feature_biotype = open_anndata(dataset, base_path=tmp_path.as_posix())
 
-    func = make_anndata_cell_filter({})
+    func = make_anndata_cell_filter(
+        {
+            "organism_ontology_term_id": ORGANISMS[0].organism_ontology_term_id,
+            "assay_ontology_term_ids": [],
+        }
+    )
     filtered_adata_with_feature_biotype = func(adata_with_feature_biotype)
 
     assert adata_with_feature_biotype.obs.equals(filtered_adata_with_feature_biotype.obs)
@@ -103,7 +129,12 @@ def test_make_anndata_cell_filter_assay(tmp_path: pathlib.Path, h5ad_with_assays
     dataset = Dataset(dataset_id="test", dataset_asset_h5ad_uri="test", dataset_h5ad_path=h5ad_with_assays)
     adata_with_assays = open_anndata(dataset, base_path=tmp_path.as_posix(), include_filter_columns=True)
 
-    func = make_anndata_cell_filter({"assay_ontology_term_ids": ["EFO:1234", "EFO:1235"]})
+    func = make_anndata_cell_filter(
+        {
+            "organism_ontology_term_id": ORGANISMS[0].organism_ontology_term_id,
+            "assay_ontology_term_ids": ["EFO:1234", "EFO:1235"],
+        }
+    )
     filtered_adata_with_assays = func(adata_with_assays)
 
     assert filtered_adata_with_assays.obs.shape[0] == 2