support multi-tables
quentinblampey committed Apr 3, 2024
1 parent bc248c2 commit 70162f6
Showing 17 changed files with 100 additions and 78 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,11 @@
+## [1.0.9] - 2024-04-03
+
+### Added:
+- Support multiple tables
+
+### Fixed
+- Spatial elements not saved when data is not backed
+
## [1.0.8] - 2024-04-02

Hotfix: resolve issues related to `spatialdata>=1.0.0`
4 changes: 2 additions & 2 deletions docs/tutorials/api_usage.ipynb
@@ -513,7 +513,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Now, `sdata.table` is an `AnnData` object.\n",
"Now, `sdata.tables[\"table\"]` is an `AnnData` object.\n",
"- If you count the transcripts, then `adata.X` are the raw counts\n",
"- If you average the channel intensities, then `adata.X` are the channels intensities\n",
"- If you both count the transcript and average the intensities, then `adata.X` are the raw counts, and `adata.obsm[\"intensities\"]` are the channels intensities"
@@ -585,7 +585,7 @@
" \"CD3\": \"T cell\"\n",
"}\n",
"\n",
"higher_z_score(sdata.table, marker_cell_dict)"
"higher_z_score(sdata.tables[\"table\"], marker_cell_dict)"
]
},
{
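A minimal sketch of the new access pattern shown in this notebook, assuming the `SpatialData` object was produced by sopa's aggregation step and uses the default table key `"table"`:

```python
from anndata import AnnData
from spatialdata import SpatialData

def get_default_table(sdata: SpatialData) -> AnnData:
    # `sdata.tables` is a mapping of named AnnData tables; the old
    # single-table shortcut `sdata.table` is what this commit replaces
    return sdata.tables["table"]

# Depending on the aggregation mode, `adata.X` holds raw transcript counts
# or channel intensities; when both modes are used, intensities land in
# `adata.obsm["intensities"]` (see the notebook cell above).
```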
4 changes: 2 additions & 2 deletions docs/tutorials/he.ipynb
@@ -437,7 +437,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"The resulting `GeoDataFrame` may have more columns than cells, because one cell may be inside multiple patches. We will keep only the first patch, and then save the resulting `\"cluster\"` column into the `sdata.table`."
"The resulting `GeoDataFrame` may have more columns than cells, because one cell may be inside multiple patches. We will keep only the first patch, and then save the resulting `\"cluster\"` column into the `sdata.tables[\"table\"]`."
]
},
{
@@ -446,7 +446,7 @@
"metadata": {},
"outputs": [],
"source": [
"sdata.table.obs[\"cluster\"] = res_gdf[~res_gdf.index.duplicated()][\"cluster\"].values"
"sdata.tables[\"table\"].obs[\"cluster\"] = res_gdf[~res_gdf.index.duplicated()][\"cluster\"].values"
]
},
{
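The `~res_gdf.index.duplicated()` idiom above keeps only the first patch assignment per cell. A self-contained toy example of the pattern, with illustrative values:

```python
import geopandas as gpd
from shapely.geometry import Point

# Cell "c1" falls inside two patches, so its index appears twice
res_gdf = gpd.GeoDataFrame(
    {"cluster": [0, 1, 2]},
    geometry=[Point(0, 0), Point(0, 0), Point(1, 1)],
    index=["c1", "c1", "c2"],
)

# `duplicated()` marks every repeat after the first; `~` keeps the first
deduplicated = res_gdf[~res_gdf.index.duplicated()]
print(deduplicated["cluster"].tolist())  # [0, 2]
```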
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "sopa"
version = "1.0.8"
version = "1.0.9"
description = "Spatial-omics pipeline and analysis"
documentation = "https://gustaveroussy.github.io/sopa"
homepage = "https://gustaveroussy.github.io/sopa"
1 change: 1 addition & 0 deletions sopa/_constants.py
@@ -2,6 +2,7 @@ class SopaKeys:
CELLPOSE_BOUNDARIES = "cellpose_boundaries"
BAYSOR_BOUNDARIES = "baysor_boundaries"
PATCHES = "sopa_patches"
TABLE = "table"

BOUNDS = "bboxes"
PATCHES_ILOCS = "ilocs"
29 changes: 21 additions & 8 deletions sopa/_sdata.py
@@ -130,13 +130,17 @@ def get_item(sdata: SpatialData, attr: str, key: str | None = None):

def get_intensities(sdata: SpatialData) -> pd.DataFrame | None:
"""Gets the intensity dataframe of shape `n_obs x n_channels`"""
-if not sdata.table.uns[SopaKeys.UNS_KEY][SopaKeys.UNS_HAS_INTENSITIES]:
+assert SopaKeys.TABLE in sdata.tables, f"No '{SopaKeys.TABLE}' found in sdata.tables"
+
+adata = sdata.tables[SopaKeys.TABLE]
+
+if not adata.uns[SopaKeys.UNS_KEY][SopaKeys.UNS_HAS_INTENSITIES]:
    return None

-if sdata.table.uns[SopaKeys.UNS_KEY][SopaKeys.UNS_HAS_TRANSCRIPTS]:
-    return sdata.table.obsm[SopaKeys.INTENSITIES_OBSM]
+if adata.uns[SopaKeys.UNS_KEY][SopaKeys.UNS_HAS_TRANSCRIPTS]:
+    return adata.obsm[SopaKeys.INTENSITIES_OBSM]

-return sdata.table.to_df()
+return adata.to_df()


def iter_scales(image: MultiscaleSpatialImage) -> Iterator[xr.DataArray]:
@@ -187,6 +191,9 @@ def save_shapes(
name: str,
overwrite: bool = False,
) -> None:
+if not sdata.is_backed():
+    return
+
elem_group = sdata._init_add_element(name=name, element_type="shapes", overwrite=overwrite)
write_shapes(
shapes=sdata.shapes[name],
@@ -200,6 +207,9 @@ def save_image(
name: str,
overwrite: bool = False,
) -> None:
+if not sdata.is_backed():
+    return
+
elem_group = sdata._init_add_element(name=name, element_type="images", overwrite=overwrite)
write_image(
image=sdata.images[name],
@@ -213,11 +223,14 @@ def save_image(
assert elem_group.path == "images"
path = Path(elem_group.store.path) / "images" / name
image = _read_multiscale(path, raster_type="image")
-sdata._add_image_in_memory(name=name, image=image, overwrite=True)
+sdata.images[name] = image


-def save_table(sdata: SpatialData):
+def save_table(sdata: SpatialData, name: str):
+    if not sdata.is_backed():
+        return
+
    store = parse_url(sdata.path, mode="r+").store
    root = zarr.group(store=store)
-    elem_group = root.require_group(name="table")
-    write_table(table=sdata.table, group=elem_group, name="table")
+    elem_group = root.require_group(name="tables")
+    write_table(table=sdata.tables[name], group=elem_group, name=name)
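In short, the `save_*` helpers above now silently skip writing when the `SpatialData` object has no on-disk Zarr store, and `save_table` takes the table name and writes it under the plural `tables` Zarr group. A rough usage sketch relying only on the internal helpers defined in this diff:

```python
from spatialdata import SpatialData

from sopa._constants import SopaKeys
from sopa._sdata import save_table

def persist_default_table(sdata: SpatialData) -> None:
    # No-op when `sdata` is in-memory only: save_table checks
    # sdata.is_backed() and returns early. Otherwise the AnnData table
    # is written to <sdata.path>/tables/table in the Zarr store.
    save_table(sdata, SopaKeys.TABLE)
```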
4 changes: 3 additions & 1 deletion sopa/annotation/tangram/run.py
@@ -41,7 +41,9 @@ def tangram_annotate(
bag_size: Size of each bag on which tangram will be run. Use smaller bags to lower the RAM usage
max_obs_reference: Maximum number of cells used in `adata_sc` at each level. Decrease it to lower the RAM usage.
"""
-ad_sp = sdata.table
+assert SopaKeys.TABLE in sdata.tables, f"No '{SopaKeys.TABLE}' found in sdata.tables"
+
+ad_sp = sdata.tables[SopaKeys.TABLE]

MultiLevelAnnotation(
ad_sp,
16 changes: 8 additions & 8 deletions sopa/cli/annotate.py
@@ -21,17 +21,17 @@ def fluorescence(
For each cell, one z-score statistic is computed and the population with the highest z-score is attributed.
"""
-from pathlib import Path
-
+from sopa._constants import SopaKeys
+from sopa._sdata import save_table
from sopa.annotation.fluorescence import higher_z_score
from sopa.io.standardize import read_zarr_standardized

sdata = read_zarr_standardized(sdata_path)

-assert sdata.table is not None, "Annotation requires `sdata.table` to be not None"
+assert SopaKeys.TABLE in sdata.tables, f"No '{SopaKeys.TABLE}' found in sdata.tables"

-higher_z_score(sdata.table, marker_cell_dict, cell_type_key)
-sdata.table.write_zarr(Path(sdata_path) / "table" / "table")
+higher_z_score(sdata.tables[SopaKeys.TABLE], marker_cell_dict, cell_type_key)
+save_table(sdata, SopaKeys.TABLE)


@app_annotate.command()
@@ -55,10 +55,10 @@ def tangram(
),
):
"""Tangram segmentation (i.e., uses an annotated scRNAseq reference to transfer cell-types)"""
-from pathlib import Path
-
import anndata

+from sopa._constants import SopaKeys
+from sopa._sdata import save_table
from sopa.annotation.tangram.run import tangram_annotate
from sopa.io.standardize import read_zarr_standardized

@@ -73,4 +73,4 @@
bag_size=bag_size,
max_obs_reference=max_obs_reference,
)
-sdata.table.write_zarr(Path(sdata_path) / "table" / "table")
+save_table(sdata, SopaKeys.TABLE)
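Programmatically, the updated `fluorescence` command boils down to the following sketch; the Zarr path and the marker dictionary are illustrative, and all imports are taken from the diff above:

```python
from sopa._constants import SopaKeys
from sopa._sdata import save_table
from sopa.annotation.fluorescence import higher_z_score
from sopa.io.standardize import read_zarr_standardized

sdata = read_zarr_standardized("/path/to/data.zarr")  # illustrative path

marker_cell_dict = {"CD3": "T cell"}  # illustrative marker -> cell-type map
higher_z_score(sdata.tables[SopaKeys.TABLE], marker_cell_dict)

# Replaces the old hard-coded `sdata.table.write_zarr(path / "table" / "table")`
save_table(sdata, SopaKeys.TABLE)
```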
17 changes: 9 additions & 8 deletions sopa/io/explorer/converter.py
@@ -7,6 +7,7 @@
import geopandas as gpd
from spatialdata import SpatialData

+from ..._constants import SopaKeys
from ..._sdata import get_boundaries, get_element, get_spatial_image, to_intrinsic
from . import (
write_cell_categories,
@@ -87,7 +88,7 @@ def write(
points_key: Name of the transcripts (key of `sdata.points`).
gene_column: Column name of the points dataframe containing the gene names.
pixel_size: Number of microns in a pixel. Invalid value can lead to inconsistent scales in the Explorer.
-layer: Layer of `sdata.table` where the gene counts are saved. If `None`, uses `sdata.table.X`.
+layer: Layer of the AnnData table where the gene counts are saved. If `None`, uses `table.X`.
polygon_max_vertices: Maximum number of vertices for the cell polygons.
lazy: If `True`, will not load the full images in memory (except if the image memory is below `ram_threshold_gb`).
ram_threshold_gb: Threshold (in gigabytes) from which image can be loaded in memory. If `None`, the image is never loaded in memory.
@@ -100,8 +101,8 @@
image_key, image = get_spatial_image(sdata, image_key, return_key=True)

### Saving cell categories and gene counts
-if sdata.table is not None:
-    adata = sdata.table
+if SopaKeys.TABLE in sdata.tables:
+    adata = sdata.tables[SopaKeys.TABLE]

shapes_key = adata.uns["spatialdata_attrs"]["region"]
geo_df = sdata[shapes_key]
@@ -120,7 +121,7 @@
if _should_save(mode, "b") and geo_df is not None:
geo_df = to_intrinsic(sdata, geo_df, image_key)

-if sdata.table is not None:
+if SopaKeys.TABLE in sdata.tables:
geo_df = geo_df.loc[adata.obs[adata.uns["spatialdata_attrs"]["instance_key"]]]

write_polygons(path, geo_df.geometry, polygon_max_vertices, pixel_size=pixel_size)
@@ -145,16 +146,16 @@
if _should_save(mode, "m"):
write_metadata(path, image_key, shapes_key, _get_n_obs(sdata, geo_df), pixel_size)

-if save_h5ad:
-    sdata.table.write_h5ad(path / FileNames.H5AD)
+if save_h5ad and SopaKeys.TABLE in sdata.tables:
+    sdata.tables[SopaKeys.TABLE].write_h5ad(path / FileNames.H5AD)

log.info(f"Saved files in the following directory: {path}")
log.info(f"You can open the experiment with 'open {path / FileNames.METADATA}'")


def _get_n_obs(sdata: SpatialData, geo_df: gpd.GeoDataFrame) -> int:
-if sdata.table is not None:
-    return sdata.table.n_obs
+if SopaKeys.TABLE in sdata.tables:
+    return sdata.tables[SopaKeys.TABLE].n_obs
return len(geo_df) if geo_df is not None else 0


35 changes: 17 additions & 18 deletions sopa/io/report/generate.py
@@ -56,11 +56,12 @@ def _kdeplot_vmax_quantile(values: np.ndarray, quantile: float = 0.95):
class SectionBuilder:
def __init__(self, sdata: SpatialData):
self.sdata = sdata
+self.adata = self.sdata.tables.get(SopaKeys.TABLE)

def _table_has(self, key, default=False):
-if SopaKeys.UNS_KEY not in self.sdata.table.uns:
+if SopaKeys.UNS_KEY not in self.adata.uns:
    return default
-return self.sdata.table.uns[SopaKeys.UNS_KEY].get(key, default)
+return self.adata.uns[SopaKeys.UNS_KEY].get(key, default)

def general_section(self):
log.info("Writing general section")
@@ -87,15 +88,15 @@ def cell_section(self):
coord_system = get_intrinsic_cs(self.sdata, shapes_key)

fig = plt.figure()
-_kdeplot_vmax_quantile(self.sdata.table.obs[SopaKeys.AREA_OBS])
+_kdeplot_vmax_quantile(self.adata.obs[SopaKeys.AREA_OBS])
plt.xlabel("Area (coordinate_system_unit ^ 2)")

return Section(
"Cells",
[
SubSection(
"Number",
Paragraph(f"Number of cells:<br>{Message(f'{self.sdata.table.n_obs} cells')}"),
Paragraph(f"Number of cells:<br>{Message(f'{self.adata.n_obs} cells')}"),
),
SubSection(
"Areas",
@@ -155,7 +156,7 @@ def transcripts_section(self):

log.info("Writing transcript section")

-mean_transcript_count = self.sdata.table.X.mean(0).A1
+mean_transcript_count = self.adata.X.mean(0).A1
low_average = mean_transcript_count < LOW_AVERAGE_COUNT

QC_subsubsections = []
Expand All @@ -166,15 +167,15 @@ def transcripts_section(self):
)
)
QC_subsubsections.append(
Message(", ".join(self.sdata.table.var_names[low_average]), color="danger")
Message(", ".join(self.adata.var_names[low_average]), color="danger")
)

fig1 = plt.figure()
_kdeplot_vmax_quantile(mean_transcript_count)
plt.xlabel("Count per transcript (average / cells)")

fig2 = plt.figure()
-_kdeplot_vmax_quantile(self.sdata.table.X.sum(1).A1)
+_kdeplot_vmax_quantile(self.adata.X.sum(1).A1)
plt.xlabel("Transcript count per cell")

QC_subsubsections.append(Columns([Image(fig1), Image(fig2)]))
@@ -184,23 +185,21 @@ def transcripts_section(self):
def representation_section(self, max_obs: int = 400_000):
log.info("Writing representation section")

-adata = self.sdata.table
-
if self._table_has(SopaKeys.UNS_HAS_TRANSCRIPTS):
-sc.pp.normalize_total(adata)
-sc.pp.log1p(adata)
+sc.pp.normalize_total(self.adata)
+sc.pp.log1p(self.adata)

-if adata.n_obs > max_obs:
-    sc.pp.subsample(adata, n_obs=max_obs)
+if self.adata.n_obs > max_obs:
+    sc.pp.subsample(self.adata, n_obs=max_obs)

log.info(f"Computing UMAP on {adata.n_obs} cells")
log.info(f"Computing UMAP on {self.adata.n_obs} cells")

-sc.pp.pca(adata)
-sc.pp.neighbors(adata)
-sc.tl.umap(adata)
+sc.pp.pca(self.adata)
+sc.pp.neighbors(self.adata)
+sc.tl.umap(self.adata)

colors = self._table_has(SopaKeys.UNS_CELL_TYPES, None)
-sc.pl.umap(adata, color=colors, show=False)
+sc.pl.umap(self.adata, color=colors, show=False)

return Section(
"Representation",
12 changes: 6 additions & 6 deletions sopa/io/standardize.py
@@ -6,7 +6,7 @@
import spatialdata
from spatialdata import SpatialData

-from .._constants import VALID_DIMENSIONS
+from .._constants import VALID_DIMENSIONS, SopaKeys
from .._sdata import get_spatial_image
from ..utils.image import _check_integer_dtype

@@ -42,12 +42,12 @@ def sanity_check(sdata: SpatialData, delete_table: bool = False, warn: bool = Fa
# log.warn(f"Channel names are not strings. Converting {image_channels} to string values.")
# sdata[image_key].data = sdata[image_key].assign_coords(c=image_channels.astype(str))

-if sdata.table is not None:
+if SopaKeys.TABLE in sdata.tables:
if delete_table:
log.info(
"The table (i.e. `sdata.table`) will not be saved, since it will be created later by sopa"
f"The table `sdata.tables['{SopaKeys.TABLE}']` will not be saved, since it will be created later by sopa"
)
-del sdata.table
+del sdata.tables[SopaKeys.TABLE]


def read_zarr_standardized(path: str, warn: bool = False) -> SpatialData:
@@ -73,8 +73,8 @@ def write_standardized(sdata: SpatialData, sdata_path: str, delete_table: bool =
sanity_check(sdata, delete_table)

assert (
-    sdata.table is None
-), "sdata.table exists. Delete it you want to use sopa, to avoid conflicts with future table generation"
+    SopaKeys.TABLE not in sdata.tables
+), f"sdata.tables['{SopaKeys.TABLE}'] exists. Delete it you want to use sopa, to avoid conflicts with future table generation"

if len(sdata.points) == 0:
log.warn("No transcripts found. Some tools from sopa will not be available.")
20 changes: 9 additions & 11 deletions sopa/segmentation/aggregate.py
@@ -63,13 +63,13 @@ def __init__(
self.shapes_key = shapes_key
self.geo_df = self.sdata[shapes_key]

-if sdata.table is not None and len(self.geo_df) != sdata.table.n_obs:
-    log.warn(
-        f"Table already existing with {sdata.table.n_obs} obs, but aggregating on {len(self.geo_df)} cells. Deleting table."
-    )
-    del sdata.table
-
-self.table = sdata.table
+self.table = None
+if SopaKeys.TABLE in sdata.tables:
+    table = sdata.tables[SopaKeys.TABLE]
+    if len(self.geo_df) != table.n_obs:
+        log.warn("Not using existing table (aggregating on a different number of cells)")
+    else:
+        self.table = table

def standardize_table(self):
self.table.obs_names = list(map(str_cell_id, range(self.table.n_obs)))
@@ -101,10 +101,8 @@ def standardize_table(self):
instance_key=SopaKeys.INSTANCE_KEY,
)

-if self.sdata.table is not None and self.overwrite:
-    del self.sdata.table
-self.sdata.tables["table"] = self.table
-save_table(self.sdata)
+self.sdata.tables[SopaKeys.TABLE] = self.table
+save_table(self.sdata, SopaKeys.TABLE)

def filter_cells(self, where_filter: np.ndarray):
log.info(f"Filtering {where_filter.sum()} cells")
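The behavioral change above: a table whose `n_obs` does not match the current segmentation is now ignored rather than deleted. Condensed into a standalone sketch (`n_cells` stands in for `len(self.geo_df)`):

```python
from anndata import AnnData
from spatialdata import SpatialData

from sopa._constants import SopaKeys

def reusable_table(sdata: SpatialData, n_cells: int) -> AnnData | None:
    # Reuse the existing table only when it has exactly one observation
    # per segmented cell; a mismatched table is left in place, not deleted.
    if SopaKeys.TABLE in sdata.tables:
        table = sdata.tables[SopaKeys.TABLE]
        if table.n_obs == n_cells:
            return table
    return None
```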
(The diffs for the remaining changed files were not loaded on this page.)
