Skip to content

Commit

Permalink
Merge pull request #46 from gustaveroussy/dev
Browse files Browse the repository at this point in the history
support multi-tables
  • Loading branch information
quentinblampey authored Apr 3, 2024
2 parents 4b07024 + 70162f6 commit 97597d1
Show file tree
Hide file tree
Showing 17 changed files with 100 additions and 78 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
## [1.0.9] - 2024-04-03

### Added:
- Support multiple tables

### Fixed
- Spatial elements not saved when data is not backed

## [1.0.8] - 2024-04-02

Hotfix: resolve issues related to `spatialdata>=1.0.0`
Expand Down
4 changes: 2 additions & 2 deletions docs/tutorials/api_usage.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -513,7 +513,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Now, `sdata.table` is an `AnnData` object.\n",
"Now, `sdata.tables[\"table\"]` is an `AnnData` object.\n",
"- If you count the transcripts, then `adata.X` are the raw counts\n",
"- If you average the channel intensities, then `adata.X` are the channels intensities\n",
"- If you both count the transcript and average the intensities, then `adata.X` are the raw counts, and `adata.obsm[\"intensities\"]` are the channels intensities"
Expand Down Expand Up @@ -585,7 +585,7 @@
" \"CD3\": \"T cell\"\n",
"}\n",
"\n",
"higher_z_score(sdata.table, marker_cell_dict)"
"higher_z_score(sdata.tables[\"table\"], marker_cell_dict)"
]
},
{
Expand Down
4 changes: 2 additions & 2 deletions docs/tutorials/he.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -437,7 +437,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"The resulting `GeoDataFrame` may have more columns than cells, because one cell may be inside multiple patches. We will keep only the first patch, and then save the resulting `\"cluster\"` column into the `sdata.table`."
"The resulting `GeoDataFrame` may have more columns than cells, because one cell may be inside multiple patches. We will keep only the first patch, and then save the resulting `\"cluster\"` column into the `sdata.tables[\"table\"]`."
]
},
{
Expand All @@ -446,7 +446,7 @@
"metadata": {},
"outputs": [],
"source": [
"sdata.table.obs[\"cluster\"] = res_gdf[~res_gdf.index.duplicated()][\"cluster\"].values"
"sdata.tables[\"table\"].obs[\"cluster\"] = res_gdf[~res_gdf.index.duplicated()][\"cluster\"].values"
]
},
{
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "sopa"
version = "1.0.8"
version = "1.0.9"
description = "Spatial-omics pipeline and analysis"
documentation = "https://gustaveroussy.github.io/sopa"
homepage = "https://gustaveroussy.github.io/sopa"
Expand Down
1 change: 1 addition & 0 deletions sopa/_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ class SopaKeys:
CELLPOSE_BOUNDARIES = "cellpose_boundaries"
BAYSOR_BOUNDARIES = "baysor_boundaries"
PATCHES = "sopa_patches"
TABLE = "table"

BOUNDS = "bboxes"
PATCHES_ILOCS = "ilocs"
Expand Down
29 changes: 21 additions & 8 deletions sopa/_sdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,13 +130,17 @@ def get_item(sdata: SpatialData, attr: str, key: str | None = None):

def get_intensities(sdata: SpatialData) -> pd.DataFrame | None:
"""Gets the intensity dataframe of shape `n_obs x n_channels`"""
if not sdata.table.uns[SopaKeys.UNS_KEY][SopaKeys.UNS_HAS_INTENSITIES]:
assert SopaKeys.TABLE in sdata.tables, f"No '{SopaKeys.TABLE}' found in sdata.tables"

adata = sdata.tables[SopaKeys.TABLE]

if not adata.uns[SopaKeys.UNS_KEY][SopaKeys.UNS_HAS_INTENSITIES]:
return None

if sdata.table.uns[SopaKeys.UNS_KEY][SopaKeys.UNS_HAS_TRANSCRIPTS]:
return sdata.table.obsm[SopaKeys.INTENSITIES_OBSM]
if adata.uns[SopaKeys.UNS_KEY][SopaKeys.UNS_HAS_TRANSCRIPTS]:
return adata.obsm[SopaKeys.INTENSITIES_OBSM]

return sdata.table.to_df()
return adata.to_df()


def iter_scales(image: MultiscaleSpatialImage) -> Iterator[xr.DataArray]:
Expand Down Expand Up @@ -187,6 +191,9 @@ def save_shapes(
name: str,
overwrite: bool = False,
) -> None:
if not sdata.is_backed():
return

elem_group = sdata._init_add_element(name=name, element_type="shapes", overwrite=overwrite)
write_shapes(
shapes=sdata.shapes[name],
Expand All @@ -200,6 +207,9 @@ def save_image(
name: str,
overwrite: bool = False,
) -> None:
if not sdata.is_backed():
return

elem_group = sdata._init_add_element(name=name, element_type="images", overwrite=overwrite)
write_image(
image=sdata.images[name],
Expand All @@ -213,11 +223,14 @@ def save_image(
assert elem_group.path == "images"
path = Path(elem_group.store.path) / "images" / name
image = _read_multiscale(path, raster_type="image")
sdata._add_image_in_memory(name=name, image=image, overwrite=True)
sdata.images[name] = image


def save_table(sdata: SpatialData, name: str):
if not sdata.is_backed():
return

def save_table(sdata: SpatialData):
store = parse_url(sdata.path, mode="r+").store
root = zarr.group(store=store)
elem_group = root.require_group(name="table")
write_table(table=sdata.table, group=elem_group, name="table")
elem_group = root.require_group(name="tables")
write_table(table=sdata.tables[name], group=elem_group, name=name)
4 changes: 3 additions & 1 deletion sopa/annotation/tangram/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,9 @@ def tangram_annotate(
bag_size: Size of each bag on which tangram will be run. Use smaller bags to lower the RAM usage
max_obs_reference: Maximum number of cells used in `adata_sc` at each level. Decrease it to lower the RAM usage.
"""
ad_sp = sdata.table
assert SopaKeys.TABLE in sdata.tables, f"No '{SopaKeys.TABLE}' found in sdata.tables"

ad_sp = sdata.tables[SopaKeys.TABLE]

MultiLevelAnnotation(
ad_sp,
Expand Down
16 changes: 8 additions & 8 deletions sopa/cli/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,17 @@ def fluorescence(
For each cell, one z-score statistic is computed and the population with the highest z-score is attributed.
"""
from pathlib import Path

from sopa._constants import SopaKeys
from sopa._sdata import save_table
from sopa.annotation.fluorescence import higher_z_score
from sopa.io.standardize import read_zarr_standardized

sdata = read_zarr_standardized(sdata_path)

assert sdata.table is not None, "Annotation requires `sdata.table` to be not None"
assert SopaKeys.TABLE in sdata.tables, f"No '{SopaKeys.TABLE}' found in sdata.tables"

higher_z_score(sdata.table, marker_cell_dict, cell_type_key)
sdata.table.write_zarr(Path(sdata_path) / "table" / "table")
higher_z_score(sdata.tables[SopaKeys.TABLE], marker_cell_dict, cell_type_key)
save_table(sdata, SopaKeys.TABLE)


@app_annotate.command()
Expand All @@ -55,10 +55,10 @@ def tangram(
),
):
"""Tangram segmentation (i.e., uses an annotated scRNAseq reference to transfer cell-types)"""
from pathlib import Path

import anndata

from sopa._constants import SopaKeys
from sopa._sdata import save_table
from sopa.annotation.tangram.run import tangram_annotate
from sopa.io.standardize import read_zarr_standardized

Expand All @@ -73,4 +73,4 @@ def tangram(
bag_size=bag_size,
max_obs_reference=max_obs_reference,
)
sdata.table.write_zarr(Path(sdata_path) / "table" / "table")
save_table(sdata, SopaKeys.TABLE)
17 changes: 9 additions & 8 deletions sopa/io/explorer/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import geopandas as gpd
from spatialdata import SpatialData

from ..._constants import SopaKeys
from ..._sdata import get_boundaries, get_element, get_spatial_image, to_intrinsic
from . import (
write_cell_categories,
Expand Down Expand Up @@ -87,7 +88,7 @@ def write(
points_key: Name of the transcripts (key of `sdata.points`).
gene_column: Column name of the points dataframe containing the gene names.
pixel_size: Number of microns in a pixel. Invalid value can lead to inconsistent scales in the Explorer.
layer: Layer of `sdata.table` where the gene counts are saved. If `None`, uses `sdata.table.X`.
layer: Layer of the AnnData table where the gene counts are saved. If `None`, uses `table.X`.
polygon_max_vertices: Maximum number of vertices for the cell polygons.
lazy: If `True`, will not load the full images in memory (except if the image memory is below `ram_threshold_gb`).
ram_threshold_gb: Threshold (in gygabytes) from which image can be loaded in memory. If `None`, the image is never loaded in memory.
Expand All @@ -100,8 +101,8 @@ def write(
image_key, image = get_spatial_image(sdata, image_key, return_key=True)

### Saving cell categories and gene counts
if sdata.table is not None:
adata = sdata.table
if SopaKeys.TABLE in sdata.tables:
adata = sdata.tables[SopaKeys.TABLE]

shapes_key = adata.uns["spatialdata_attrs"]["region"]
geo_df = sdata[shapes_key]
Expand All @@ -120,7 +121,7 @@ def write(
if _should_save(mode, "b") and geo_df is not None:
geo_df = to_intrinsic(sdata, geo_df, image_key)

if sdata.table is not None:
if SopaKeys.TABLE in sdata.tables:
geo_df = geo_df.loc[adata.obs[adata.uns["spatialdata_attrs"]["instance_key"]]]

write_polygons(path, geo_df.geometry, polygon_max_vertices, pixel_size=pixel_size)
Expand All @@ -145,16 +146,16 @@ def write(
if _should_save(mode, "m"):
write_metadata(path, image_key, shapes_key, _get_n_obs(sdata, geo_df), pixel_size)

if save_h5ad:
sdata.table.write_h5ad(path / FileNames.H5AD)
if save_h5ad and SopaKeys.TABLE in sdata.tables:
sdata.tables[SopaKeys.TABLE].write_h5ad(path / FileNames.H5AD)

log.info(f"Saved files in the following directory: {path}")
log.info(f"You can open the experiment with 'open {path / FileNames.METADATA}'")


def _get_n_obs(sdata: SpatialData, geo_df: gpd.GeoDataFrame) -> int:
if sdata.table is not None:
return sdata.table.n_obs
if SopaKeys.TABLE in sdata.tables:
return sdata.tables[SopaKeys.TABLE].n_obs
return len(geo_df) if geo_df is not None else 0


Expand Down
35 changes: 17 additions & 18 deletions sopa/io/report/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,12 @@ def _kdeplot_vmax_quantile(values: np.ndarray, quantile: float = 0.95):
class SectionBuilder:
def __init__(self, sdata: SpatialData):
self.sdata = sdata
self.adata = self.sdata.tables.get(SopaKeys.TABLE)

def _table_has(self, key, default=False):
if SopaKeys.UNS_KEY not in self.sdata.table.uns:
if SopaKeys.UNS_KEY not in self.adata.uns:
return default
return self.sdata.table.uns[SopaKeys.UNS_KEY].get(key, default)
return self.adata.uns[SopaKeys.UNS_KEY].get(key, default)

def general_section(self):
log.info("Writing general section")
Expand All @@ -87,15 +88,15 @@ def cell_section(self):
coord_system = get_intrinsic_cs(self.sdata, shapes_key)

fig = plt.figure()
_kdeplot_vmax_quantile(self.sdata.table.obs[SopaKeys.AREA_OBS])
_kdeplot_vmax_quantile(self.adata.obs[SopaKeys.AREA_OBS])
plt.xlabel("Area (coordinate_system_unit ^ 2)")

return Section(
"Cells",
[
SubSection(
"Number",
Paragraph(f"Number of cells:<br>{Message(f'{self.sdata.table.n_obs} cells')}"),
Paragraph(f"Number of cells:<br>{Message(f'{self.adata.n_obs} cells')}"),
),
SubSection(
"Areas",
Expand Down Expand Up @@ -155,7 +156,7 @@ def transcripts_section(self):

log.info("Writing transcript section")

mean_transcript_count = self.sdata.table.X.mean(0).A1
mean_transcript_count = self.adata.X.mean(0).A1
low_average = mean_transcript_count < LOW_AVERAGE_COUNT

QC_subsubsections = []
Expand All @@ -166,15 +167,15 @@ def transcripts_section(self):
)
)
QC_subsubsections.append(
Message(", ".join(self.sdata.table.var_names[low_average]), color="danger")
Message(", ".join(self.adata.var_names[low_average]), color="danger")
)

fig1 = plt.figure()
_kdeplot_vmax_quantile(mean_transcript_count)
plt.xlabel("Count per transcript (average / cells)")

fig2 = plt.figure()
_kdeplot_vmax_quantile(self.sdata.table.X.sum(1).A1)
_kdeplot_vmax_quantile(self.adata.X.sum(1).A1)
plt.xlabel("Transcript count per cell")

QC_subsubsections.append(Columns([Image(fig1), Image(fig2)]))
Expand All @@ -184,23 +185,21 @@ def transcripts_section(self):
def representation_section(self, max_obs: int = 400_000):
log.info("Writing representation section")

adata = self.sdata.table

if self._table_has(SopaKeys.UNS_HAS_TRANSCRIPTS):
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
sc.pp.normalize_total(self.adata)
sc.pp.log1p(self.adata)

if adata.n_obs > max_obs:
sc.pp.subsample(adata, n_obs=max_obs)
if self.adata.n_obs > max_obs:
sc.pp.subsample(self.adata, n_obs=max_obs)

log.info(f"Computing UMAP on {adata.n_obs} cells")
log.info(f"Computing UMAP on {self.adata.n_obs} cells")

sc.pp.pca(adata)
sc.pp.neighbors(adata)
sc.tl.umap(adata)
sc.pp.pca(self.adata)
sc.pp.neighbors(self.adata)
sc.tl.umap(self.adata)

colors = self._table_has(SopaKeys.UNS_CELL_TYPES, None)
sc.pl.umap(adata, color=colors, show=False)
sc.pl.umap(self.adata, color=colors, show=False)

return Section(
"Representation",
Expand Down
12 changes: 6 additions & 6 deletions sopa/io/standardize.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import spatialdata
from spatialdata import SpatialData

from .._constants import VALID_DIMENSIONS
from .._constants import VALID_DIMENSIONS, SopaKeys
from .._sdata import get_spatial_image
from ..utils.image import _check_integer_dtype

Expand Down Expand Up @@ -42,12 +42,12 @@ def sanity_check(sdata: SpatialData, delete_table: bool = False, warn: bool = Fa
# log.warn(f"Channel names are not strings. Converting {image_channels} to string values.")
# sdata[image_key].data = sdata[image_key].assign_coords(c=image_channels.astype(str))

if sdata.table is not None:
if SopaKeys.TABLE in sdata.tables:
if delete_table:
log.info(
"The table (i.e. `sdata.table`) will not be saved, since it will be created later by sopa"
f"The table `sdata.tables['{SopaKeys.TABLE}']` will not be saved, since it will be created later by sopa"
)
del sdata.table
del sdata.tables[SopaKeys.TABLE]


def read_zarr_standardized(path: str, warn: bool = False) -> SpatialData:
Expand All @@ -73,8 +73,8 @@ def write_standardized(sdata: SpatialData, sdata_path: str, delete_table: bool =
sanity_check(sdata, delete_table)

assert (
sdata.table is None
), "sdata.table exists. Delete it you want to use sopa, to avoid conflicts with future table generation"
SopaKeys.TABLE not in sdata.tables
), f"sdata.tables['{SopaKeys.TABLE}'] exists. Delete it you want to use sopa, to avoid conflicts with future table generation"

if len(sdata.points) == 0:
log.warn("No transcripts found. Some tools from sopa will not be available.")
Expand Down
20 changes: 9 additions & 11 deletions sopa/segmentation/aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,13 +63,13 @@ def __init__(
self.shapes_key = shapes_key
self.geo_df = self.sdata[shapes_key]

if sdata.table is not None and len(self.geo_df) != sdata.table.n_obs:
log.warn(
f"Table already existing with {sdata.table.n_obs} obs, but aggregating on {len(self.geo_df)} cells. Deleting table."
)
del sdata.table

self.table = sdata.table
self.table = None
if SopaKeys.TABLE in sdata.tables:
table = sdata.tables[SopaKeys.TABLE]
if len(self.geo_df) != table.n_obs:
log.warn("Not using existing table (aggregating on a different number of cells)")
else:
self.table = table

def standardize_table(self):
self.table.obs_names = list(map(str_cell_id, range(self.table.n_obs)))
Expand Down Expand Up @@ -101,10 +101,8 @@ def standardize_table(self):
instance_key=SopaKeys.INSTANCE_KEY,
)

if self.sdata.table is not None and self.overwrite:
del self.sdata.table
self.sdata.tables["table"] = self.table
save_table(self.sdata)
self.sdata.tables[SopaKeys.TABLE] = self.table
save_table(self.sdata, SopaKeys.TABLE)

def filter_cells(self, where_filter: np.ndarray):
log.info(f"Filtering {where_filter.sum()} cells")
Expand Down
Loading

0 comments on commit 97597d1

Please sign in to comment.