[builder] CxG schema 5 / Census schema 2 #1024

Merged Mar 26, 2024: 31 commits from bkmartinjr/993-schema-five into main

Commits
9376141
initial work to adopt CxG schema 5 / Census schema 2
Feb 28, 2024
b42ad17
more schema 5 updates
Feb 28, 2024
0091581
fix doc link
Feb 28, 2024
cfea53e
expand accepted assays in schema
Feb 29, 2024
563221f
lint
pablo-gar Feb 29, 2024
36d8599
fix lint and spelling
Feb 29, 2024
6bcd84f
another typo
Feb 29, 2024
18a8be4
add updated assay terms to RNA_SEQ filter
Mar 1, 2024
6240b1f
add full-gene sequencing assays
pablo-gar Mar 2, 2024
dd0bc56
lint
pablo-gar Mar 2, 2024
026a868
builder update to match schema assay revision
Mar 2, 2024
d12e78d
fix TruDrop, must not be in full-gene assays
pablo-gar Mar 2, 2024
5c40123
remove TruDrop
Mar 2, 2024
4758971
Merge branch 'main' into bkmartinjr/993-schema-five
Mar 7, 2024
db5743e
Merge branch 'main' into bkmartinjr/993-schema-five
Mar 10, 2024
e7990e5
merge with main
Mar 12, 2024
709fb2f
revert URL to production DP
Mar 14, 2024
0d8732d
add missing init call
Mar 14, 2024
ed8ec55
Merge branch 'main' into bkmartinjr/993-schema-five
Mar 14, 2024
163d1bc
tweak tests
Mar 14, 2024
d59e70c
real-time output for builder tests
Mar 14, 2024
afade1c
respect parallelism limits
Mar 14, 2024
08f4e37
refactor dask shutdown
Mar 14, 2024
c41c6c5
revert debugging change
Mar 14, 2024
21784ee
reduce verbiage
Mar 14, 2024
36cd170
update multi-species handling in schema
Mar 15, 2024
cc0ed89
multi-species filtering per the 2.0.0 census schema
Mar 19, 2024
c46fc7c
PR feedback
Mar 20, 2024
67003f5
Merge branch 'main' into bkmartinjr/993-schema-five
Mar 20, 2024
93a7246
add "organisms" table to census_info
Mar 22, 2024
bcb0814
Merge branch 'main' into bkmartinjr/993-schema-five
Mar 25, 2024
269 changes: 80 additions & 189 deletions docs/cellxgene_census_schema.md

Large diffs are not rendered by default.

108 changes: 108 additions & 0 deletions docs/census_accepted_assays.csv
@@ -0,0 +1,108 @@
EFO:0003755,FL-cDNA
EFO:0004158,random RNA-Seq across whole transcriptome
EFO:0005684,RNA-seq of coding RNA from single cells
EFO:0005685,RNA-seq of non coding RNA from single cells
EFO:0008440,tag based single cell RNA sequencing
EFO:0008441,full length single cell RNA sequencing
EFO:0008640,3'T-fill
EFO:0008641,3’-end-seq
EFO:0008643,3′-Seq
EFO:0008661,Bru-Seq
EFO:0008669,CAGEscan
EFO:0008673,CapSeq
EFO:0008675,CaptureSeq
EFO:0008679,CEL-seq
EFO:0008694,ClickSeq
EFO:0008697,cP-RNA-Seq
EFO:0008708,DeepCAGE
EFO:0008710,Digital RNA
EFO:0008718,DP-Seq
EFO:0008720,DroNc-seq
EFO:0008722,Drop-seq
EFO:0008735,FACS-seq
EFO:0008747,FRISCR
EFO:0008748,FRT-Seq
EFO:0008752,GMUCT 1.0
EFO:0008753,GMUCT 2.0
EFO:0008756,GRO-CAP
EFO:0008763,Hi-SCL
EFO:0008780,inDrop
EFO:0008796,MARS-seq
EFO:0008797,MATQ-seq
EFO:0008824,NanoCAGE
EFO:0008825,Nanogrid RNA-Seq
EFO:0008826,NET-Seq
EFO:0008850,PAS-Seq
EFO:0008859,PEAT
EFO:0008863,PLATE-Seq
EFO:0008868,PRO-cap
EFO:0008869,PRO-seq
EFO:0008877,Quartz-seq
EFO:0008896,RNA-Seq
EFO:0008897,RNAtag-Seq
EFO:0008898,RNET-seq
EFO:0008903,SC3-seq
EFO:0008908,SCI-seq
EFO:0008913,single-cell RNA sequencing
EFO:0008919,Seq-Well
EFO:0008929,SMA
EFO:0008930,Smart-seq
EFO:0008931,Smart-seq2
EFO:0008937,snDrop-seq
EFO:0008941,sNuc-Seq
EFO:0008945,SPET-seq
EFO:0008953,STRT-seq
EFO:0008954,STRT-seq-2i
EFO:0008956,SUPeR-seq
EFO:0008962,TARDIS
EFO:0008966,TCR Chain Paring
EFO:0008967,TCR-LA-MC PCR
EFO:0008972,TL-seq
EFO:0008974,Tomo-Seq
EFO:0008975,TRAP-Seq
EFO:0008978,TSS Sequencing
EFO:0008980,UMI Method
EFO:0009309,Div-Seq
EFO:0009809,single nucleus RNA sequencing
EFO:0009810,full length single nucleus RNA sequencing
EFO:0009811,tag based single nucleus RNA sequencing
EFO:0009899,10x 3' v2
EFO:0009900,10x 5' v2
EFO:0009901,10x 3' v1
EFO:0009919,SPLiT-seq
EFO:0009922,10x 3' v3
EFO:0009991,Nuc-Seq
EFO:0010003,RASL-seq
EFO:0010004,SCRB-seq
EFO:0010010,CEL-seq2
EFO:0010022,Smart-3Seq
EFO:0010034,Cappable-Seq
EFO:0010041,Nascent-Seq
EFO:0010058,Fluidigm C1-based library preparation
EFO:0010184,Smart-like
EFO:0010550,sci-RNA-seq
EFO:0010713,10x immune profiling
EFO:0010714,10x TCR enrichment
EFO:0010715,10x Ig enrichment
EFO:0010964,barcoded plate-based single cell RNA-seq
EFO:0011025,10x 5' v1
EFO:0022396,TruSeq
EFO:0022488,Smart-seq3
EFO:0022490,ScaleBio single cell RNA sequencing
EFO:0030002,microwell-seq
EFO:0030003,10x 3' transcription profiling
EFO:0030004,10x 5' transcription profiling
EFO:0030019,Seq-Well S3
EFO:0030021,Nx1-seq
EFO:0030028,sci-RNA-seq3
EFO:0030030,Quant-seq
EFO:0030031,SCOPE-chip
EFO:0030061,mcSCRB-seq
EFO:0030074,SORT-seq
EFO:0030078,droplet-based single-cell RNA library preparation
EFO:0030080,10x transcription profiling
EFO:0700003,BD Rhapsody Whole Transcriptome Analysis
EFO:0700004,BD Rhapsody Targeted mRNA
EFO:0700010,TruDrop
EFO:0700011,GEXSCOPE technology
EFO:0700016,Smart-seq v4
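
This file is a headerless two-column CSV: the EFO term ID, then the human-readable assay label. A minimal sketch of loading it into an accepted-assay lookup (the path and variable names are illustrative, not part of this PR):

import pandas as pd

# Headerless CSV: column 0 is the EFO term ID, column 1 is the assay label.
accepted_assays = pd.read_csv(
    "docs/census_accepted_assays.csv",
    header=None,
    names=["ontology_term_id", "label"],
)
accepted_assay_ids = set(accepted_assays["ontology_term_id"])
assert "EFO:0009922" in accepted_assay_ids  # 10x 3' v3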
19 changes: 19 additions & 0 deletions docs/census_accepted_assays_full_gene.csv
@@ -0,0 +1,19 @@
EFO:0003755,FL-cDNA
EFO:0008441,full length single cell RNA sequencing
EFO:0008747,FRISCR
EFO:0008763,Hi-SCL
EFO:0008797,MATQ-seq
EFO:0008877,Quartz-seq
EFO:0008930,Smart-seq
EFO:0008931,Smart-seq2
EFO:0008956,SUPeR-seq
EFO:0009810,full length single nucleus RNA sequencing
EFO:0010004,SCRB-seq
EFO:0010022,Smart-3Seq
EFO:0010058,Fluidigm C1-based library preparation
EFO:0010184,Smart-like
EFO:0022396,TruSeq
EFO:0022488,Smart-seq3
EFO:0030031,SCOPE-chip
EFO:0030061,mcSCRB-seq
EFO:0700016,Smart-seq v4
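
These full-gene sequencing assays are a subset of the accepted-assay list above. Continuing the previous sketch, a quick consistency check (same assumed paths and variable names):

full_gene_assays = pd.read_csv(
    "docs/census_accepted_assays_full_gene.csv",
    header=None,
    names=["ontology_term_id", "label"],
)
full_gene_ids = set(full_gene_assays["ontology_term_id"])
# Every full-gene assay also appears in the general accepted list.
assert full_gene_ids <= accepted_assay_ids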
@@ -14,14 +14,14 @@

from ..util import urlcat
from .datasets import Dataset
from .globals import CXG_SCHEMA_VERSION, FEATURE_REFERENCE_IGNORE
from .globals import CXG_SCHEMA_VERSION

logger = logging.getLogger(__name__)


class AnnDataFilterSpec(TypedDict):
organism_ontology_term_id: str | None
assay_ontology_term_ids: list[str] | None
organism_ontology_term_id: str
assay_ontology_term_ids: list[str]


# Indexing types
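
With this change, both AnnDataFilterSpec fields are required rather than optional. A minimal sketch of a spec under the new typing (the specific term IDs are illustrative):

spec: AnnDataFilterSpec = {
    "organism_ontology_term_id": "NCBITaxon:9606",  # human
    "assay_ontology_term_ids": ["EFO:0009922", "EFO:0008931"],  # 10x 3' v3, Smart-seq2
}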
@@ -175,7 +175,12 @@ def get_estimated_density(self) -> float:
This is NOT the density for any given slice.

Approach: divide the whole file nnz by the product of the shape.

Arbitrarily picks a density of 1.0 if the file is empty on either axis.
"""
if self.n_obs * self.n_vars == 0:
return 1.0

nnz: int
if isinstance(self._X, CSRDataset | CSCDataset):
nnz = self._X.group["data"].size
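
As a standalone sketch, the estimate described in the docstring reduces to the following (assuming n_obs, n_vars, and the whole-file nnz are already known; this is not the class method itself):

def estimated_density(n_obs: int, n_vars: int, nnz: int) -> float:
    """Whole-file nnz divided by the product of the shape; 1.0 if either axis is empty."""
    if n_obs * n_vars == 0:
        return 1.0
    return nnz / (n_obs * n_vars)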
@@ -298,46 +303,43 @@ def __call__(self, ad: AnnDataProxy) -> AnnDataProxy:
def make_anndata_cell_filter(filter_spec: AnnDataFilterSpec) -> AnnDataFilterFunction:
"""Return an anndata sliced/filtered for those cells/genes of interest.

See: https://github.com/chanzuckerberg/cellxgene-census/blob/main/docs/cellxgene_census_schema.md

obs filter:
* not organoid or cell culture
* Caller-specified assays only
* Caller-specified taxa (obs.organism_ontology_term_id == '<user-supplied>')
* Organism term ID value not equal to gene feature_reference value
* Single organism

var filter:
* genes only (var.feature_biotype == 'gene')
* Single organism
"""
organism_ontology_term_id = filter_spec.get("organism_ontology_term_id", None)
assay_ontology_term_ids = filter_spec.get("assay_ontology_term_ids", None)

def _filter(ad: AnnDataProxy) -> AnnDataProxy:
# Multi-organism datasets are dropped - any dataset with 2+ feature_reference organisms is ignored,
# exclusive of values in FEATURE_REFERENCE_IGNORE. See also, cell filter for mismatched
# cell/feature organism values.
feature_reference_organisms = set(ad.var.feature_reference.unique()) - FEATURE_REFERENCE_IGNORE
if len(feature_reference_organisms) > 1:
logger.info(f"H5AD ignored due to multi-organism feature_reference: {ad.filename}")
"""Filter observations and features per Census schema."""
var_mask = ad.var.feature_biotype == "gene"
obs_mask = ad.obs.tissue_type == "tissue"

# Handle multi-species edge case
var_organisms = set(ad.var.feature_reference[var_mask].unique())
obs_organisms = set(ad.obs.organism_ontology_term_id[obs_mask].unique())
if len(var_organisms) > 1 and len(obs_organisms) > 1:
# if multi-species on both axes -- drop everything
logger.info(f"H5AD ignored - multi-species content on both axes: {ad.filename}")
return ad[0:0] # i.e., drop all cells

#
# Filter cells per Census schema
#
obs_mask = ad.obs.tissue_type == "tissue"
if organism_ontology_term_id is not None:
obs_mask = obs_mask & (ad.obs.organism_ontology_term_id == organism_ontology_term_id)
if assay_ontology_term_ids is not None:
# Filter by the species specified in the filter-spec
var_mask = var_mask & (ad.var.feature_reference == organism_ontology_term_id)
obs_mask = obs_mask & (ad.obs.organism_ontology_term_id == organism_ontology_term_id)
if assay_ontology_term_ids:
obs_mask = obs_mask & ad.obs.assay_ontology_term_id.isin(assay_ontology_term_ids)

# multi-organism dataset cell filter - exclude any cells where organism != feature_reference
feature_references = set(ad.var.feature_reference.unique()) - FEATURE_REFERENCE_IGNORE
assert len(feature_references) == 1 # else there is a bug in the test above
feature_reference_organism_ontology_id = feature_references.pop()
obs_mask = obs_mask & (ad.obs.organism_ontology_term_id == feature_reference_organism_ontology_id)

#
# Filter features per Census schema
#
var_mask = ad.var.feature_biotype == "gene"
if not (var_mask.any() and obs_mask.any()):
return ad[0:0] # i.e., drop all cells

return ad[
slice(None) if obs_mask.all() else obs_mask.to_numpy(),
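
To make the new multi-species handling concrete, here is a self-contained sketch of the mask logic on toy obs/var DataFrames (the data is invented, and the real code operates on an AnnDataProxy rather than bare DataFrames):

import pandas as pd

obs = pd.DataFrame(
    {
        "tissue_type": ["tissue", "tissue", "organoid"],
        "organism_ontology_term_id": ["NCBITaxon:9606", "NCBITaxon:10090", "NCBITaxon:9606"],
        "assay_ontology_term_id": ["EFO:0009922", "EFO:0009922", "EFO:0008931"],
    }
)
var = pd.DataFrame(
    {
        "feature_biotype": ["gene", "gene"],
        "feature_reference": ["NCBITaxon:9606", "NCBITaxon:9606"],
    }
)

var_mask = var.feature_biotype == "gene"
obs_mask = obs.tissue_type == "tissue"

# Multi-species content on BOTH axes would drop the dataset entirely.
drop_all = (
    var.feature_reference[var_mask].nunique() > 1
    and obs.organism_ontology_term_id[obs_mask].nunique() > 1
)
assert not drop_all  # var here is single-species, so the dataset survives

# Otherwise both axes are restricted to the organism named in the filter spec.
organism = "NCBITaxon:9606"
var_mask &= var.feature_reference == organism
obs_mask &= obs.organism_ontology_term_id == organism
assert obs_mask.tolist() == [True, False, False]  # only the human tissue cell remains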
@@ -73,67 +73,61 @@ def build(args: CensusBuildArgs, *, validate: bool = True) -> int:

prepare_file_system(args)

try:
with create_dask_client(args, n_workers=cpu_count(), threads_per_worker=1, memory_limit=0) as client:
# Step 1 - get all source datasets
datasets = build_step1_get_source_datasets(args)
n_workers = clamp(cpu_count(), 1, args.config.max_worker_processes)
with create_dask_client(args, n_workers=n_workers, threads_per_worker=1, memory_limit=0) as client:
# Step 1 - get all source datasets
datasets = build_step1_get_source_datasets(args)

# Step 2 - create root collection, and all child objects, but do not populate any dataframes or matrices
root_collection = build_step2_create_root_collection(args.soma_path.as_posix(), experiment_builders)
# Step 2 - create root collection, and all child objects, but do not populate any dataframes or matrices
root_collection = build_step2_create_root_collection(args.soma_path.as_posix(), experiment_builders)

# Step 3 - populate axes
filtered_datasets = build_step3_populate_obs_and_var_axes(
args.h5ads_path.as_posix(), datasets, experiment_builders, args
)
# Step 3 - populate axes
filtered_datasets = build_step3_populate_obs_and_var_axes(
args.h5ads_path.as_posix(), datasets, experiment_builders, args
)

# Constraining parallelism is critical at this step, as each worker utilizes (max) ~64GiB+ of memory to
# process the X array (partitions are large to reduce TileDB fragment count, which reduces consolidation time).
#
# TODO: when global order writes are supported, processing of much smaller slices will be
# possible, and this budget should drop considerably. When that is implemented, n_workers should
# be much larger (e.g., use the default value of #CPUs or some such).
# https://github.com/single-cell-data/TileDB-SOMA/issues/2054
MEM_BUDGET = 64 * 1024**3
n_workers = clamp(int(psutil.virtual_memory().total // MEM_BUDGET), 1, args.config.max_worker_processes)
logger.info(f"Scaling cluster to {n_workers} workers.")
client.cluster.scale(n_workers)

# Step 4 - populate X layers
build_step4_populate_X_layers(args.h5ads_path.as_posix(), filtered_datasets, experiment_builders, args)

# Prune datasets that we will not use, and do not want to include in the build
prune_unused_datasets(args.h5ads_path, datasets, filtered_datasets)

# Step 5 - write out dataset manifest and summary information
build_step5_save_axis_and_summary_info(
root_collection, experiment_builders, filtered_datasets, args.config.build_tag
)
# Constraining parallelism is critical at this step, as each worker utilizes (max) ~64GiB+ of memory to
# process the X array (partitions are large to reduce TileDB fragment count, which reduces consolidation time).
#
# TODO: when global order writes are supported, processing of much smaller slices will be
# possible, and this budget should drop considerably. When that is implemented, n_workers should
# be much larger (e.g., use the default value of #CPUs or some such).
# https://github.com/single-cell-data/TileDB-SOMA/issues/2054
MEM_BUDGET = 64 * 1024**3
n_workers = clamp(int(psutil.virtual_memory().total // MEM_BUDGET), 1, args.config.max_worker_processes)
logger.info(f"Scaling cluster to {n_workers} workers.")
client.cluster.scale(n_workers)

# Step 4 - populate X layers
build_step4_populate_X_layers(args.h5ads_path.as_posix(), filtered_datasets, experiment_builders, args)

# Prune datasets that we will not use, and do not want to include in the build
prune_unused_datasets(args.h5ads_path, datasets, filtered_datasets)

# Step 5 - write out dataset manifest and summary information
build_step5_save_axis_and_summary_info(
root_collection, experiment_builders, filtered_datasets, args.config.build_tag
)

# Temporary work-around. Can be removed when single-cell-data/TileDB-SOMA#1969 fixed.
tiledb_soma_1969_work_around(root_collection.uri)

# Scale the cluster up as we are no longer memory constrained in the following phases
n_workers = clamp(cpu_count(), 1, args.config.max_worker_processes)
logger.info(f"Scaling cluster to {n_workers} workers.")
client.cluster.scale(n=n_workers)

if args.config.consolidate:
for f in dask.distributed.as_completed(
submit_consolidate(root_collection.uri, pool=client, vacuum=True)
):
assert f.result()
if validate:
for f in dask.distributed.as_completed(validate_soma(args, client)):
assert f.result()
if args.config.consolidate and validate:
validate_consolidation(args)
logger.info("Validation & consolidation complete.")

shutdown_dask_cluster(client)

except TimeoutError:
# quiet tornado race conditions (harmless) on shutdown
pass
# Temporary work-around. Can be removed when single-cell-data/TileDB-SOMA#1969 fixed.
tiledb_soma_1969_work_around(root_collection.uri)

# Scale the cluster up as we are no longer memory constrained in the following phases
n_workers = clamp(cpu_count(), 1, args.config.max_worker_processes)
logger.info(f"Scaling cluster to {n_workers} workers.")
client.cluster.scale(n=n_workers)

if args.config.consolidate:
for f in dask.distributed.as_completed(submit_consolidate(root_collection.uri, pool=client, vacuum=True)):
assert f.result()
if validate:
for f in dask.distributed.as_completed(validate_soma(args, client)):
assert f.result()
if args.config.consolidate and validate:
validate_consolidation(args)
logger.info("Validation & consolidation complete.")

shutdown_dask_cluster(client)

return 0

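
The memory-budgeted worker count used for the X-layer step boils down to the following sketch (clamp is shown with an assumed implementation matching its usage here, and max_worker_processes stands in for the args.config value):

import psutil


def clamp(n: int, lo: int, hi: int) -> int:
    """Constrain n to the inclusive range [lo, hi] (assumed implementation)."""
    return max(lo, min(n, hi))


MEM_BUDGET = 64 * 1024**3  # ~64 GiB of RAM budgeted per worker
max_worker_processes = 16  # illustrative stand-in for args.config.max_worker_processes

n_workers = clamp(int(psutil.virtual_memory().total // MEM_BUDGET), 1, max_worker_processes)
print(f"Scaling cluster to {n_workers} workers.")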