From 70162f69fb8a1a439a922de3f185dd9ab64ea1ad Mon Sep 17 00:00:00 2001 From: Blampey Quentin Date: Wed, 3 Apr 2024 11:03:54 +0200 Subject: [PATCH] support multi-tables --- CHANGELOG.md | 8 +++++++ docs/tutorials/api_usage.ipynb | 4 ++-- docs/tutorials/he.ipynb | 4 ++-- pyproject.toml | 2 +- sopa/_constants.py | 1 + sopa/_sdata.py | 29 +++++++++++++++++------- sopa/annotation/tangram/run.py | 4 +++- sopa/cli/annotate.py | 16 ++++++------- sopa/io/explorer/converter.py | 17 +++++++------- sopa/io/report/generate.py | 35 ++++++++++++++--------------- sopa/io/standardize.py | 12 +++++----- sopa/segmentation/aggregate.py | 20 ++++++++--------- sopa/segmentation/baysor/resolve.py | 11 ++++----- sopa/spatial/_build.py | 4 +++- sopa/spatial/distance.py | 5 +++-- sopa/spatial/morpho.py | 4 ++-- workflow/utils.py | 2 +- 17 files changed, 100 insertions(+), 78 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 908786de..1561fa6e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,11 @@ +## [1.0.9] - 2024-04-03 + +### Added: +- Support multiple tables + +### Fixed +- Spatial elements not saved when data is not backed + ## [1.0.8] - 2024-04-02 Hotfix: resolve issues related to `spatialdata>=1.0.0` diff --git a/docs/tutorials/api_usage.ipynb b/docs/tutorials/api_usage.ipynb index 390f911f..f8e13927 100644 --- a/docs/tutorials/api_usage.ipynb +++ b/docs/tutorials/api_usage.ipynb @@ -513,7 +513,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now, `sdata.table` is an `AnnData` object.\n", + "Now, `sdata.tables[\"table\"]` is an `AnnData` object.\n", "- If you count the transcripts, then `adata.X` are the raw counts\n", "- If you average the channel intensities, then `adata.X` are the channels intensities\n", "- If you both count the transcript and average the intensities, then `adata.X` are the raw counts, and `adata.obsm[\"intensities\"]` are the channels intensities" @@ -585,7 +585,7 @@ " \"CD3\": \"T cell\"\n", "}\n", "\n", - "higher_z_score(sdata.table, marker_cell_dict)" + "higher_z_score(sdata.tables[\"table\"], marker_cell_dict)" ] }, { diff --git a/docs/tutorials/he.ipynb b/docs/tutorials/he.ipynb index 00910b88..ce3c2a0c 100644 --- a/docs/tutorials/he.ipynb +++ b/docs/tutorials/he.ipynb @@ -437,7 +437,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The resulting `GeoDataFrame` may have more columns than cells, because one cell may be inside multiple patches. We will keep only the first patch, and then save the resulting `\"cluster\"` column into the `sdata.table`." + "The resulting `GeoDataFrame` may have more columns than cells, because one cell may be inside multiple patches. We will keep only the first patch, and then save the resulting `\"cluster\"` column into the `sdata.tables[\"table\"]`." ] }, { @@ -446,7 +446,7 @@ "metadata": {}, "outputs": [], "source": [ - "sdata.table.obs[\"cluster\"] = res_gdf[~res_gdf.index.duplicated()][\"cluster\"].values" + "sdata.tables[\"table\"].obs[\"cluster\"] = res_gdf[~res_gdf.index.duplicated()][\"cluster\"].values" ] }, { diff --git a/pyproject.toml b/pyproject.toml index b3de3424..51b77ca6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "sopa" -version = "1.0.8" +version = "1.0.9" description = "Spatial-omics pipeline and analysis" documentation = "https://gustaveroussy.github.io/sopa" homepage = "https://gustaveroussy.github.io/sopa" diff --git a/sopa/_constants.py b/sopa/_constants.py index cf1f8600..5c72c651 100644 --- a/sopa/_constants.py +++ b/sopa/_constants.py @@ -2,6 +2,7 @@ class SopaKeys: CELLPOSE_BOUNDARIES = "cellpose_boundaries" BAYSOR_BOUNDARIES = "baysor_boundaries" PATCHES = "sopa_patches" + TABLE = "table" BOUNDS = "bboxes" PATCHES_ILOCS = "ilocs" diff --git a/sopa/_sdata.py b/sopa/_sdata.py index cca5e8b5..40c0fe67 100644 --- a/sopa/_sdata.py +++ b/sopa/_sdata.py @@ -130,13 +130,17 @@ def get_item(sdata: SpatialData, attr: str, key: str | None = None): def get_intensities(sdata: SpatialData) -> pd.DataFrame | None: """Gets the intensity dataframe of shape `n_obs x n_channels`""" - if not sdata.table.uns[SopaKeys.UNS_KEY][SopaKeys.UNS_HAS_INTENSITIES]: + assert SopaKeys.TABLE in sdata.tables, f"No '{SopaKeys.TABLE}' found in sdata.tables" + + adata = sdata.tables[SopaKeys.TABLE] + + if not adata.uns[SopaKeys.UNS_KEY][SopaKeys.UNS_HAS_INTENSITIES]: return None - if sdata.table.uns[SopaKeys.UNS_KEY][SopaKeys.UNS_HAS_TRANSCRIPTS]: - return sdata.table.obsm[SopaKeys.INTENSITIES_OBSM] + if adata.uns[SopaKeys.UNS_KEY][SopaKeys.UNS_HAS_TRANSCRIPTS]: + return adata.obsm[SopaKeys.INTENSITIES_OBSM] - return sdata.table.to_df() + return adata.to_df() def iter_scales(image: MultiscaleSpatialImage) -> Iterator[xr.DataArray]: @@ -187,6 +191,9 @@ def save_shapes( name: str, overwrite: bool = False, ) -> None: + if not sdata.is_backed(): + return + elem_group = sdata._init_add_element(name=name, element_type="shapes", overwrite=overwrite) write_shapes( shapes=sdata.shapes[name], @@ -200,6 +207,9 @@ def save_image( name: str, overwrite: bool = False, ) -> None: + if not sdata.is_backed(): + return + elem_group = sdata._init_add_element(name=name, element_type="images", overwrite=overwrite) write_image( image=sdata.images[name], @@ -213,11 +223,14 @@ def save_image( assert elem_group.path == "images" path = Path(elem_group.store.path) / "images" / name image = _read_multiscale(path, raster_type="image") - sdata._add_image_in_memory(name=name, image=image, overwrite=True) + sdata.images[name] = image + +def save_table(sdata: SpatialData, name: str): + if not sdata.is_backed(): + return -def save_table(sdata: SpatialData): store = parse_url(sdata.path, mode="r+").store root = zarr.group(store=store) - elem_group = root.require_group(name="table") - write_table(table=sdata.table, group=elem_group, name="table") + elem_group = root.require_group(name="tables") + write_table(table=sdata.tables[name], group=elem_group, name=name) diff --git a/sopa/annotation/tangram/run.py b/sopa/annotation/tangram/run.py index c7b6a423..85507af2 100644 --- a/sopa/annotation/tangram/run.py +++ b/sopa/annotation/tangram/run.py @@ -41,7 +41,9 @@ def tangram_annotate( bag_size: Size of each bag on which tangram will be run. Use smaller bags to lower the RAM usage max_obs_reference: Maximum number of cells used in `adata_sc` at each level. Decrease it to lower the RAM usage. """ - ad_sp = sdata.table + assert SopaKeys.TABLE in sdata.tables, f"No '{SopaKeys.TABLE}' found in sdata.tables" + + ad_sp = sdata.tables[SopaKeys.TABLE] MultiLevelAnnotation( ad_sp, diff --git a/sopa/cli/annotate.py b/sopa/cli/annotate.py index 74060f11..deb028ca 100644 --- a/sopa/cli/annotate.py +++ b/sopa/cli/annotate.py @@ -21,17 +21,17 @@ def fluorescence( For each cell, one z-score statistic is computed and the population with the highest z-score is attributed. """ - from pathlib import Path - + from sopa._constants import SopaKeys + from sopa._sdata import save_table from sopa.annotation.fluorescence import higher_z_score from sopa.io.standardize import read_zarr_standardized sdata = read_zarr_standardized(sdata_path) - assert sdata.table is not None, "Annotation requires `sdata.table` to be not None" + assert SopaKeys.TABLE in sdata.tables, f"No '{SopaKeys.TABLE}' found in sdata.tables" - higher_z_score(sdata.table, marker_cell_dict, cell_type_key) - sdata.table.write_zarr(Path(sdata_path) / "table" / "table") + higher_z_score(sdata.tables[SopaKeys.TABLE], marker_cell_dict, cell_type_key) + save_table(sdata, SopaKeys.TABLE) @app_annotate.command() @@ -55,10 +55,10 @@ def tangram( ), ): """Tangram segmentation (i.e., uses an annotated scRNAseq reference to transfer cell-types)""" - from pathlib import Path - import anndata + from sopa._constants import SopaKeys + from sopa._sdata import save_table from sopa.annotation.tangram.run import tangram_annotate from sopa.io.standardize import read_zarr_standardized @@ -73,4 +73,4 @@ def tangram( bag_size=bag_size, max_obs_reference=max_obs_reference, ) - sdata.table.write_zarr(Path(sdata_path) / "table" / "table") + save_table(sdata, SopaKeys.TABLE) diff --git a/sopa/io/explorer/converter.py b/sopa/io/explorer/converter.py index d0d7cbed..ceeb7576 100644 --- a/sopa/io/explorer/converter.py +++ b/sopa/io/explorer/converter.py @@ -7,6 +7,7 @@ import geopandas as gpd from spatialdata import SpatialData +from ..._constants import SopaKeys from ..._sdata import get_boundaries, get_element, get_spatial_image, to_intrinsic from . import ( write_cell_categories, @@ -87,7 +88,7 @@ def write( points_key: Name of the transcripts (key of `sdata.points`). gene_column: Column name of the points dataframe containing the gene names. pixel_size: Number of microns in a pixel. Invalid value can lead to inconsistent scales in the Explorer. - layer: Layer of `sdata.table` where the gene counts are saved. If `None`, uses `sdata.table.X`. + layer: Layer of the AnnData table where the gene counts are saved. If `None`, uses `table.X`. polygon_max_vertices: Maximum number of vertices for the cell polygons. lazy: If `True`, will not load the full images in memory (except if the image memory is below `ram_threshold_gb`). ram_threshold_gb: Threshold (in gygabytes) from which image can be loaded in memory. If `None`, the image is never loaded in memory. @@ -100,8 +101,8 @@ def write( image_key, image = get_spatial_image(sdata, image_key, return_key=True) ### Saving cell categories and gene counts - if sdata.table is not None: - adata = sdata.table + if SopaKeys.TABLE in sdata.tables: + adata = sdata.tables[SopaKeys.TABLE] shapes_key = adata.uns["spatialdata_attrs"]["region"] geo_df = sdata[shapes_key] @@ -120,7 +121,7 @@ def write( if _should_save(mode, "b") and geo_df is not None: geo_df = to_intrinsic(sdata, geo_df, image_key) - if sdata.table is not None: + if SopaKeys.TABLE in sdata.tables: geo_df = geo_df.loc[adata.obs[adata.uns["spatialdata_attrs"]["instance_key"]]] write_polygons(path, geo_df.geometry, polygon_max_vertices, pixel_size=pixel_size) @@ -145,16 +146,16 @@ def write( if _should_save(mode, "m"): write_metadata(path, image_key, shapes_key, _get_n_obs(sdata, geo_df), pixel_size) - if save_h5ad: - sdata.table.write_h5ad(path / FileNames.H5AD) + if save_h5ad and SopaKeys.TABLE in sdata.tables: + sdata.tables[SopaKeys.TABLE].write_h5ad(path / FileNames.H5AD) log.info(f"Saved files in the following directory: {path}") log.info(f"You can open the experiment with 'open {path / FileNames.METADATA}'") def _get_n_obs(sdata: SpatialData, geo_df: gpd.GeoDataFrame) -> int: - if sdata.table is not None: - return sdata.table.n_obs + if SopaKeys.TABLE in sdata.tables: + return sdata.tables[SopaKeys.TABLE].n_obs return len(geo_df) if geo_df is not None else 0 diff --git a/sopa/io/report/generate.py b/sopa/io/report/generate.py index 0a58ccda..71f7bc86 100644 --- a/sopa/io/report/generate.py +++ b/sopa/io/report/generate.py @@ -56,11 +56,12 @@ def _kdeplot_vmax_quantile(values: np.ndarray, quantile: float = 0.95): class SectionBuilder: def __init__(self, sdata: SpatialData): self.sdata = sdata + self.adata = self.sdata.tables.get(SopaKeys.TABLE) def _table_has(self, key, default=False): - if SopaKeys.UNS_KEY not in self.sdata.table.uns: + if SopaKeys.UNS_KEY not in self.adata.uns: return default - return self.sdata.table.uns[SopaKeys.UNS_KEY].get(key, default) + return self.adata.uns[SopaKeys.UNS_KEY].get(key, default) def general_section(self): log.info("Writing general section") @@ -87,7 +88,7 @@ def cell_section(self): coord_system = get_intrinsic_cs(self.sdata, shapes_key) fig = plt.figure() - _kdeplot_vmax_quantile(self.sdata.table.obs[SopaKeys.AREA_OBS]) + _kdeplot_vmax_quantile(self.adata.obs[SopaKeys.AREA_OBS]) plt.xlabel("Area (coordinate_system_unit ^ 2)") return Section( @@ -95,7 +96,7 @@ def cell_section(self): [ SubSection( "Number", - Paragraph(f"Number of cells:
{Message(f'{self.sdata.table.n_obs} cells')}"), + Paragraph(f"Number of cells:
{Message(f'{self.adata.n_obs} cells')}"), ), SubSection( "Areas", @@ -155,7 +156,7 @@ def transcripts_section(self): log.info("Writing transcript section") - mean_transcript_count = self.sdata.table.X.mean(0).A1 + mean_transcript_count = self.adata.X.mean(0).A1 low_average = mean_transcript_count < LOW_AVERAGE_COUNT QC_subsubsections = [] @@ -166,7 +167,7 @@ def transcripts_section(self): ) ) QC_subsubsections.append( - Message(", ".join(self.sdata.table.var_names[low_average]), color="danger") + Message(", ".join(self.adata.var_names[low_average]), color="danger") ) fig1 = plt.figure() @@ -174,7 +175,7 @@ def transcripts_section(self): plt.xlabel("Count per transcript (average / cells)") fig2 = plt.figure() - _kdeplot_vmax_quantile(self.sdata.table.X.sum(1).A1) + _kdeplot_vmax_quantile(self.adata.X.sum(1).A1) plt.xlabel("Transcript count per cell") QC_subsubsections.append(Columns([Image(fig1), Image(fig2)])) @@ -184,23 +185,21 @@ def transcripts_section(self): def representation_section(self, max_obs: int = 400_000): log.info("Writing representation section") - adata = self.sdata.table - if self._table_has(SopaKeys.UNS_HAS_TRANSCRIPTS): - sc.pp.normalize_total(adata) - sc.pp.log1p(adata) + sc.pp.normalize_total(self.adata) + sc.pp.log1p(self.adata) - if adata.n_obs > max_obs: - sc.pp.subsample(adata, n_obs=max_obs) + if self.adata.n_obs > max_obs: + sc.pp.subsample(self.adata, n_obs=max_obs) - log.info(f"Computing UMAP on {adata.n_obs} cells") + log.info(f"Computing UMAP on {self.adata.n_obs} cells") - sc.pp.pca(adata) - sc.pp.neighbors(adata) - sc.tl.umap(adata) + sc.pp.pca(self.adata) + sc.pp.neighbors(self.adata) + sc.tl.umap(self.adata) colors = self._table_has(SopaKeys.UNS_CELL_TYPES, None) - sc.pl.umap(adata, color=colors, show=False) + sc.pl.umap(self.adata, color=colors, show=False) return Section( "Representation", diff --git a/sopa/io/standardize.py b/sopa/io/standardize.py index 78af74ae..1a2fbbb1 100644 --- a/sopa/io/standardize.py +++ b/sopa/io/standardize.py @@ -6,7 +6,7 @@ import spatialdata from spatialdata import SpatialData -from .._constants import VALID_DIMENSIONS +from .._constants import VALID_DIMENSIONS, SopaKeys from .._sdata import get_spatial_image from ..utils.image import _check_integer_dtype @@ -42,12 +42,12 @@ def sanity_check(sdata: SpatialData, delete_table: bool = False, warn: bool = Fa # log.warn(f"Channel names are not strings. Converting {image_channels} to string values.") # sdata[image_key].data = sdata[image_key].assign_coords(c=image_channels.astype(str)) - if sdata.table is not None: + if SopaKeys.TABLE in sdata.tables: if delete_table: log.info( - "The table (i.e. `sdata.table`) will not be saved, since it will be created later by sopa" + f"The table `sdata.tables['{SopaKeys.TABLE}']` will not be saved, since it will be created later by sopa" ) - del sdata.table + del sdata.tables[SopaKeys.TABLE] def read_zarr_standardized(path: str, warn: bool = False) -> SpatialData: @@ -73,8 +73,8 @@ def write_standardized(sdata: SpatialData, sdata_path: str, delete_table: bool = sanity_check(sdata, delete_table) assert ( - sdata.table is None - ), "sdata.table exists. Delete it you want to use sopa, to avoid conflicts with future table generation" + SopaKeys.TABLE not in sdata.tables + ), f"sdata.tables['{SopaKeys.TABLE}'] exists. Delete it you want to use sopa, to avoid conflicts with future table generation" if len(sdata.points) == 0: log.warn("No transcripts found. Some tools from sopa will not be available.") diff --git a/sopa/segmentation/aggregate.py b/sopa/segmentation/aggregate.py index 3f02a82c..de4141b0 100644 --- a/sopa/segmentation/aggregate.py +++ b/sopa/segmentation/aggregate.py @@ -63,13 +63,13 @@ def __init__( self.shapes_key = shapes_key self.geo_df = self.sdata[shapes_key] - if sdata.table is not None and len(self.geo_df) != sdata.table.n_obs: - log.warn( - f"Table already existing with {sdata.table.n_obs} obs, but aggregating on {len(self.geo_df)} cells. Deleting table." - ) - del sdata.table - - self.table = sdata.table + self.table = None + if SopaKeys.TABLE in sdata.tables: + table = sdata.tables[SopaKeys.TABLE] + if len(self.geo_df) != table.n_obs: + log.warn("Not using existing table (aggregating on a different number of cells)") + else: + self.table = table def standardize_table(self): self.table.obs_names = list(map(str_cell_id, range(self.table.n_obs))) @@ -101,10 +101,8 @@ def standardize_table(self): instance_key=SopaKeys.INSTANCE_KEY, ) - if self.sdata.table is not None and self.overwrite: - del self.sdata.table - self.sdata.tables["table"] = self.table - save_table(self.sdata) + self.sdata.tables[SopaKeys.TABLE] = self.table + save_table(self.sdata, SopaKeys.TABLE) def filter_cells(self, where_filter: np.ndarray): log.info(f"Filtering {where_filter.sum()} cells") diff --git a/sopa/segmentation/baysor/resolve.py b/sopa/segmentation/baysor/resolve.py index a66d0190..dd7816ce 100644 --- a/sopa/segmentation/baysor/resolve.py +++ b/sopa/segmentation/baysor/resolve.py @@ -16,7 +16,7 @@ from tqdm import tqdm from ..._constants import SopaKeys -from ..._sdata import get_element, get_key, save_shapes +from ..._sdata import get_element, get_key, save_shapes, save_table from .. import aggregate, shapes log = logging.getLogger(__name__) @@ -174,12 +174,9 @@ def resolve( sdata.shapes[SopaKeys.BAYSOR_BOUNDARIES] = geo_df save_shapes(sdata, SopaKeys.BAYSOR_BOUNDARIES, overwrite=True) - if sdata.table is not None: - log.warn("Table already existing. It will be replaced by the new one.") - del sdata.table - - sdata.table = table + sdata.tables[SopaKeys.TABLE] = table + save_table(sdata, SopaKeys.TABLE) log.info( - f"Added sdata.table, and {len(geo_df)} cell boundaries to sdata['{SopaKeys.BAYSOR_BOUNDARIES}']" + f"Added sdata.tables['{SopaKeys.TABLE}'], and {len(geo_df)} cell boundaries to sdata['{SopaKeys.BAYSOR_BOUNDARIES}']" ) diff --git a/sopa/spatial/_build.py b/sopa/spatial/_build.py index 196311eb..d5d01590 100644 --- a/sopa/spatial/_build.py +++ b/sopa/spatial/_build.py @@ -19,6 +19,8 @@ from sklearn.metrics.pairwise import euclidean_distances from spatialdata import SpatialData +from .._constants import SopaKeys + log = logging.getLogger(__name__) __all__ = ["spatial_neighbors"] @@ -40,7 +42,7 @@ def spatial_neighbors( set_diag: Whether to set the diagonal of the spatial connectivities to `1.0`. """ if isinstance(adata, SpatialData): - adata = adata.table + adata = adata.tables[SopaKeys.TABLE] assert ( radius is None or len(radius) == 2 diff --git a/sopa/spatial/distance.py b/sopa/spatial/distance.py index 211e1845..f59c8858 100644 --- a/sopa/spatial/distance.py +++ b/sopa/spatial/distance.py @@ -8,6 +8,7 @@ from spatialdata import SpatialData from tqdm import tqdm +from .._constants import SopaKeys from ._build import _check_has_delaunay log = logging.getLogger(__name__) @@ -31,7 +32,7 @@ def cells_to_groups( A `Dataframe` of shape `n_obs * n_groups`, or `None` if `key_added_prefix` was provided (in this case, the dataframe is saved in `"{key_added_prefix}{group_key}"`) """ if isinstance(adata, SpatialData): - adata = adata.table + adata = adata.tables[SopaKeys.TABLE] _check_has_delaunay(adata) @@ -91,7 +92,7 @@ def mean_distance( `DataFrame` of shape `n_groups * n_groups_target` of mean hop-distances """ if isinstance(adata, SpatialData): - adata = adata.table + adata = adata.tables[SopaKeys.TABLE] target_group_key = group_key if target_group_key is None else target_group_key diff --git a/sopa/spatial/morpho.py b/sopa/spatial/morpho.py index 84402675..47cc6e69 100644 --- a/sopa/spatial/morpho.py +++ b/sopa/spatial/morpho.py @@ -42,7 +42,7 @@ def geometrize_niches( A `GeoDataFrame` with geometries for each niche component. We also compute the area/perimeter/roundness of each component. """ if isinstance(adata, SpatialData): - adata = adata.table + adata = adata.tables[SopaKeys.TABLE] _check_has_delaunay(adata) data = {"geometry": [], niche_key: []} @@ -138,7 +138,7 @@ def niches_geometry_stats( A `DataFrame` of shape `n_niches * n_statistics` """ if isinstance(adata, SpatialData): - adata = adata.table + adata = adata.tables[SopaKeys.TABLE] gdf = geometrize_niches(adata, niche_key, **geometrize_niches_kwargs) value_counts = gdf[niche_key].value_counts() diff --git a/workflow/utils.py b/workflow/utils.py index f78a0b0f..c45dfe69 100644 --- a/workflow/utils.py +++ b/workflow/utils.py @@ -47,7 +47,7 @@ def __init__(self, config: dict) -> None: self.shapes_dir = self.sdata_path / "shapes" self.points_dir = self.sdata_path / "points" self.images_dir = self.sdata_path / "images" - self.table_dir = self.sdata_path / "table" + self.table_dir = self.sdata_path / "tables" # sdata.shapes self.baysor_boundaries = self.shapes_dir / "baysor_boundaries"