Skip to content

Commit

Permalink
multi-species filtering per the 2.0.0 census schema
Browse files Browse the repository at this point in the history
  • Loading branch information
bkmartinjr committed Mar 19, 2024
1 parent 36cd170 commit cc0ed89
Show file tree
Hide file tree
Showing 7 changed files with 76 additions and 41 deletions.
4 changes: 2 additions & 2 deletions docs/cellxgene_census_schema.md
Original file line number Diff line number Diff line change
Expand Up @@ -97,12 +97,12 @@ The table below shows all possible combinations of organisms for both observatio
<tr>
<td>"NCBITaxon:9606" for <i>Homo sapiens</i></td>
<td>"NCBITaxon:9606" for <i>Homo sapiens</i> <b>AND</b> "NCBITaxon:10090" for <i>Mus musculus</i></td>
<td>All observations MUST be included. The Census MUST only contain feautes from "NCBITaxon:9606" for <i>Homo sapiens</i>.</td>
<td>All observations MUST be included. The Census MUST only contain features from "NCBITaxon:9606" for <i>Homo sapiens</i>.</td>
</tr>
<tr>
<td>"NCBITaxon:10090" for <i>Mus musculus</i></td>
<td>"NCBITaxon:9606" for <i>Homo sapiens</i> <b>AND</b> "NCBITaxon:10090" for <i>Mus musculus</i></td>
<td>All observations MUST be included. The Census MUST only contain feautes from "NCBITaxon:10090" for <i>Mus musculus</i>.</td>
<td>All observations MUST be included. The Census MUST only contain features from "NCBITaxon:10090" for <i>Mus musculus</i>.</td>
</tr>
<tr>
<td>"NCBITaxon:9606" for <i>Homo sapiens</i> <b>AND</b> "NCBITaxon:10090" for <i>Mus musculus</i></td>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,14 @@

from ..util import urlcat
from .datasets import Dataset
from .globals import CXG_SCHEMA_VERSION, FEATURE_REFERENCE_IGNORE
from .globals import CXG_SCHEMA_VERSION

logger = logging.getLogger(__name__)


class AnnDataFilterSpec(TypedDict):
organism_ontology_term_id: str | None
assay_ontology_term_ids: list[str] | None
organism_ontology_term_id: str
assay_ontology_term_ids: list[str]


# Indexing types
Expand Down Expand Up @@ -175,7 +175,12 @@ def get_estimated_density(self) -> float:
This is NOT the density for any given slice.
Approach: divide the whole file nnz by the product of the shape.
Arbitarily picks density of 1.0 if the file is empty on either axis
"""
if self.n_obs * self.n_vars == 0:
return 1.0

nnz: int
if isinstance(self._X, CSRDataset | CSCDataset):
nnz = self._X.group["data"].size
Expand Down Expand Up @@ -298,46 +303,43 @@ def __call__(self, ad: AnnDataProxy) -> AnnDataProxy:
def make_anndata_cell_filter(filter_spec: AnnDataFilterSpec) -> AnnDataFilterFunction:
"""Return an anndata sliced/filtered for those cells/genes of interest.
See: https://github.com/chanzuckerberg/cellxgene-census/blob/main/docs/cellxgene_census_schema.md
obs filter:
* not organoid or cell culture
* Caller-specified assays only
* Caller-specified taxa (obs.organism_ontology_term_id == '<user-supplied>')
* Organism term ID value not equal to gene feature_reference value
* Single organism
var filter:
* genes only (var.feature_biotype == 'gene')
* Single organism
"""
organism_ontology_term_id = filter_spec.get("organism_ontology_term_id", None)
assay_ontology_term_ids = filter_spec.get("assay_ontology_term_ids", None)

def _filter(ad: AnnDataProxy) -> AnnDataProxy:
# Multi-organism datasets are dropped - any dataset with 2+ feature_reference organisms is ignored,
# exclusive of values in FEATURE_REFERENCE_IGNORE. See also, cell filter for mismatched
# cell/feature organism values.
feature_reference_organisms = set(ad.var.feature_reference.unique()) - FEATURE_REFERENCE_IGNORE
if len(feature_reference_organisms) > 1:
logger.info(f"H5AD ignored due to multi-organism feature_reference: {ad.filename}")
"""Filter observations and features per Census schema."""
var_mask = ad.var.feature_biotype == "gene"
obs_mask = ad.obs.tissue_type == "tissue"

# Handle multi-species edge case
var_organisms = set(ad.var.feature_reference[var_mask].unique())
obs_organisms = set(ad.obs.organism_ontology_term_id[obs_mask].unique())
if len(var_organisms) > 1 and len(obs_organisms) > 1:
# if multi-species on both axis -- drop everything
logger.info(f"H5AD ignored - multi-species content on both axes: {ad.filename}")
return ad[0:0] # ie., drop all cells

#
# Filter cells per Census schema
#
obs_mask = ad.obs.tissue_type == "tissue"
if organism_ontology_term_id is not None:
obs_mask = obs_mask & (ad.obs.organism_ontology_term_id == organism_ontology_term_id)
if assay_ontology_term_ids is not None:
# Filter by the species specified in the filter-spec
var_mask = var_mask & (ad.var.feature_reference == organism_ontology_term_id)
obs_mask = obs_mask & (ad.obs.organism_ontology_term_id == organism_ontology_term_id)
if assay_ontology_term_ids:
obs_mask = obs_mask & ad.obs.assay_ontology_term_id.isin(assay_ontology_term_ids)

# multi-organism dataset cell filter - exclude any cells where organism != feature_reference
feature_references = set(ad.var.feature_reference.unique()) - FEATURE_REFERENCE_IGNORE
assert len(feature_references) == 1 # else there is a bug in the test above
feature_reference_organism_ontology_id = feature_references.pop()
obs_mask = obs_mask & (ad.obs.organism_ontology_term_id == feature_reference_organism_ontology_id)

#
# Filter features per Census schema
#
var_mask = ad.var.feature_biotype == "gene"
if not (var_mask.any() and obs_mask.any()):
return ad[0:0] # i.e., drop all cells

return ad[
slice(None) if obs_mask.all() else obs_mask.to_numpy(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,15 +93,17 @@ class ExperimentSpecification:

name: str
anndata_cell_filter_spec: AnnDataFilterSpec
organism_ontology_term_id: str

@classmethod
def create(
cls,
name: str,
anndata_cell_filter_spec: AnnDataFilterSpec,
organism_ontology_term_id: str,
) -> Self:
"""Factory method. Do not instantiate the class directly."""
return cls(name, anndata_cell_filter_spec)
return cls(name, anndata_cell_filter_spec, organism_ontology_term_id)


class ExperimentBuilder:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,15 @@ def make_experiment_specs() -> list[ExperimentSpecification]:
"organism_ontology_term_id": "NCBITaxon:9606",
"assay_ontology_term_ids": RNA_SEQ,
},
organism_ontology_term_id="NCBITaxon:9606",
),
ExperimentSpecification.create(
name="mus_musculus",
anndata_cell_filter_spec={
"organism_ontology_term_id": "NCBITaxon:10090",
"assay_ontology_term_ids": RNA_SEQ,
},
organism_ontology_term_id="NCBITaxon:10090",
),
]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -388,10 +388,6 @@

DONOR_ID_IGNORE = ["pooled", "unknown"]

# Feature_reference values which are ignored (not considered) in
# multi-organism filtering. Currently the null set.
FEATURE_REFERENCE_IGNORE: set[str] = set()


# The default configuration for TileDB contexts used in the builder.
# Ref: https://docs.tiledb.com/main/how-to/configuration#configuration-parameters
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
logger = logging.getLogger(__name__)

CXG_BASE_URI = "https://api.cellxgene.cziscience.com/"
# The DEV base, occasionally used for testing.
# CXG_BASE_URI = "https://api.cellxgene.dev.single-cell.czi.technology/"


def parse_manifest_file(manifest_fp: io.TextIOBase) -> list[Dataset]:
Expand Down
47 changes: 39 additions & 8 deletions tools/cellxgene_census_builder/tests/anndata/test_anndata.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from cellxgene_census_builder.build_soma.datasets import Dataset
from cellxgene_census_builder.build_state import CensusBuildArgs

from ..conftest import ORGANISMS, get_anndata
from ..conftest import GENE_IDS, ORGANISMS, get_anndata


def test_open_anndata(datasets: list[Dataset]) -> None:
Expand All @@ -34,9 +34,15 @@ def test_open_anndata_filters_out_datasets_with_mixed_feature_reference(
datasets_with_mixed_feature_reference: list[Dataset],
) -> None:
"""Datasets with a "mixed" feature_reference will not be included by the filter pipeline"""
ad_filter = make_anndata_cell_filter({})
ad_filter = make_anndata_cell_filter(
{
"organism_ontology_term_id": ORGANISMS[0].organism_ontology_term_id,
"assay_ontology_term_ids": [],
}
)

result = [ad_filter(open_anndata(d, base_path=".")) for d in datasets_with_mixed_feature_reference]
assert all(len(ad) == 0 for ad in result)
assert all(len(ad) == 4 and ad.n_vars == len(GENE_IDS[0]) - 1 for ad in result)


def test_open_anndata_filters_out_wrong_schema_version_datasets(
Expand All @@ -53,7 +59,12 @@ def test_make_anndata_cell_filter(tmp_path: pathlib.Path, h5ad_simple: str) -> N
dataset = Dataset(dataset_id="test", dataset_asset_h5ad_uri="test", dataset_h5ad_path=h5ad_simple)
adata_simple = open_anndata(dataset, base_path=tmp_path.as_posix())

func = make_anndata_cell_filter({})
func = make_anndata_cell_filter(
{
"organism_ontology_term_id": ORGANISMS[0].organism_ontology_term_id,
"assay_ontology_term_ids": [],
}
)
filtered_adata_simple = func(adata_simple)

assert adata_simple.var.equals(filtered_adata_simple.var)
Expand All @@ -70,7 +81,12 @@ def test_make_anndata_cell_filter_filters_out_organoids_cell_culture(
)
adata_with_organoids_and_cell_culture = open_anndata(dataset, base_path=tmp_path.as_posix())

func = make_anndata_cell_filter({})
func = make_anndata_cell_filter(
{
"organism_ontology_term_id": ORGANISMS[0].organism_ontology_term_id,
"assay_ontology_term_ids": [],
}
)
filtered_adata_with_organoids_and_cell_culture = func(adata_with_organoids_and_cell_culture)

assert adata_with_organoids_and_cell_culture.var.equals(filtered_adata_with_organoids_and_cell_culture.var)
Expand All @@ -81,7 +97,12 @@ def test_make_anndata_cell_filter_organism(tmp_path: pathlib.Path, h5ad_with_org
dataset = Dataset(dataset_id="test", dataset_asset_h5ad_uri="test", dataset_h5ad_path=h5ad_with_organism)
adata_with_organism = open_anndata(dataset, base_path=tmp_path.as_posix())

func = make_anndata_cell_filter({"organism_ontology_term_id": ORGANISMS[0].organism_ontology_term_id})
func = make_anndata_cell_filter(
{
"organism_ontology_term_id": ORGANISMS[0].organism_ontology_term_id,
"assay_ontology_term_ids": [],
}
)
filtered_adata_with_organism = func(adata_with_organism)

assert adata_with_organism.var.equals(filtered_adata_with_organism.var)
Expand All @@ -92,7 +113,12 @@ def test_make_anndata_cell_filter_feature_biotype_gene(tmp_path: pathlib.Path, h
dataset = Dataset(dataset_id="test", dataset_asset_h5ad_uri="test", dataset_h5ad_path=h5ad_with_feature_biotype)
adata_with_feature_biotype = open_anndata(dataset, base_path=tmp_path.as_posix())

func = make_anndata_cell_filter({})
func = make_anndata_cell_filter(
{
"organism_ontology_term_id": ORGANISMS[0].organism_ontology_term_id,
"assay_ontology_term_ids": [],
}
)
filtered_adata_with_feature_biotype = func(adata_with_feature_biotype)

assert adata_with_feature_biotype.obs.equals(filtered_adata_with_feature_biotype.obs)
Expand All @@ -103,7 +129,12 @@ def test_make_anndata_cell_filter_assay(tmp_path: pathlib.Path, h5ad_with_assays
dataset = Dataset(dataset_id="test", dataset_asset_h5ad_uri="test", dataset_h5ad_path=h5ad_with_assays)
adata_with_assays = open_anndata(dataset, base_path=tmp_path.as_posix(), include_filter_columns=True)

func = make_anndata_cell_filter({"assay_ontology_term_ids": ["EFO:1234", "EFO:1235"]})
func = make_anndata_cell_filter(
{
"organism_ontology_term_id": ORGANISMS[0].organism_ontology_term_id,
"assay_ontology_term_ids": ["EFO:1234", "EFO:1235"],
}
)
filtered_adata_with_assays = func(adata_with_assays)

assert filtered_adata_with_assays.obs.shape[0] == 2
Expand Down

0 comments on commit cc0ed89

Please sign in to comment.