Merge pull request #12 from HiDiHlabs/dev

Prepare release 0.2.0
HiDiHlabs · Oct 11, 2024 · b281743 · b281743
2 parents 3911e7f + a7fe73d
commit b281743
Show file tree

Hide file tree

Showing 22 changed files with 465 additions and 216 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,23 +1,24 @@
 [package]
-name    = "_utils_rust"
-version = "0.1.0"
-edition = "2021"
+name         = "_utils_rust"
+version      = "0.1.0"
+edition      = "2021"
+rust-version = "1.76.0"
 
 [lib]
 name       = "_utils_rust"
 crate-type = ["cdylib"]
 
 [dependencies]
 bincode       = { version = "1.3" }
-indexmap      = { version = "2.1.0", features = ["rayon"] }
+indexmap      = { version = ">= 2.1, < 2.6", features = ["rayon"] }
 itertools     = { version = "0.12.1" }
 ndarray       = { version = "0.15.6", features = ["rayon"] }
 ndarray-stats = { version = "0.5.1" }
 num           = { version = "0.4.1" }
-numpy         = { version = "0.21.0" }
+numpy         = { version = "0.21" }
 polars        = { version = "0.41", features = ["partition_by", "dtype-categorical"] }
 polars-arrow  = { version = "0.41" }
-pyo3          = { version = "0.21.0", features = ["extension-module"] }
-pyo3-polars   = { version = "0.15.0" }
-rayon         = { version = "1.8.0" }
-sprs          = { version = "0.11.1", features = ["serde"] }
+pyo3          = { version = "0.21", features = ["extension-module"] }
+pyo3-polars   = { version = "0.15" }
+rayon         = { version = "1.8" }
+sprs          = { version = "= 0.11.1", features = ["serde"] }
diff --git a/README.md b/README.md
@@ -21,9 +21,20 @@ by exporting data in [`AnnData`](https://anndata.readthedocs.io/) or
 
 ## Installation
 
-`sainsc` will be made available on [PyPI](https://pypi.org/) and
-[bioconda](https://bioconda.github.io/). For detailed installation instructions
-please refer to the [documentation](https://sainsc.readthedocs.io/en/stable/installation.html).
+`sainsc` is available on [PyPI](https://pypi.org/) and [bioconda](https://bioconda.github.io/).
+
+```sh
+# PyPI
+pip install sainsc
+```
+
+```sh
+# or conda
+conda install bioconda::sainsc
+```
+
+For detailed installation instructions please refer to the
+[documentation](https://sainsc.readthedocs.io/page/installation.html).
 
 ## Documentation
 

diff --git a/docs/source/installation.rst b/docs/source/installation.rst
@@ -24,7 +24,7 @@ Bioconda and ``conda``
 
 If you prefer the installation using
 `Miniconda <https://docs.anaconda.com/miniconda/>`_ you can install from the
-`bioconda <https://bioconda.github.io/>`_ channel.
+`bioconda <https://bioconda.github.io/recipes/sainsc/README.html>`_ channel.
 
 .. code-block:: bash
 

diff --git a/docs/source/quickstart.ipynb b/docs/source/quickstart.ipynb
@@ -53,7 +53,7 @@
    "source": [
     "from pathlib import Path\n",
     "\n",
-    "from sainsc import read_StereoSeq"
+    "from sainsc.io import read_StereoSeq"
    ]
   },
   {
@@ -331,9 +331,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python [conda env:muellni-sainsc2]",
+   "display_name": "Python 3",
    "language": "python",
-   "name": "conda-env-muellni-sainsc2-py"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {

diff --git a/docs/source/usage.ipynb b/docs/source/usage.ipynb
@@ -40,7 +40,7 @@
     "import numpy as np\n",
     "import pandas as pd\n",
     "\n",
-    "from sainsc import read_StereoSeq"
+    "from sainsc.io import read_StereoSeq"
    ]
   },
   {
@@ -992,9 +992,9 @@
    "metadata": {},
    "source": [
     "To analyse other technologies beside Stereo-seq such as imaging-based (e.g. Xenium) we only need to generate a [`sainsc.GridCounts`](#sainsc.GridCounts) or [`sainsc.LazyKDE`](#sainsc.LazyKDE) instance from a dataframe and then proceed as previously described. \n",
-    "If the data is stored in GEM file format you can use the [`sainsc.io.read_gem_file`](#sainsc.io.read_gem_file) function to conveniently read the file and ensure the correct format.\n",
-    "Otherwise we can manually prepare the dataframe as shown below for the case of Xenium data (obtained from [10x website](https://www.10xgenomics.com/datasets/fresh-frozen-mouse-brain-replicates-1-standard)).\n",
+    "If the data is stored in GEM file format you can use the [`sainsc.io.read_gem_file`](#sainsc.io.read_gem_file) function to conveniently read the file and ensure the correct format. Other technologies, such as Xenium or Vizgen, are already supported as well. Have a look at the [`sainsc.io`](#sainsc.io) module.\n",
     "\n",
+    "In case the technology/file format is not supported, we can manually prepare the data as shown below for the case of Xenium (obtained from [10x website](https://www.10xgenomics.com/datasets/fresh-frozen-mouse-brain-replicates-1-standard)).\n",
     "The dataframe needs to contain a 'gene', 'x', and 'y' column. If a 'count' column is present it will be used otherwise a count of 1 (single transcript) will be assumed for each row. If the 'x' and 'y' columns are integers they will be assumed as indices of a square grid. Otherwise we can additionally define a bin size to use when instantiating the objects."
    ]
   },
@@ -1028,7 +1028,7 @@
    "id": "48494ac5-9f69-4ece-9dd1-a10c94cf88aa",
    "metadata": {},
    "source": [
-    "We can now generate a [`sainsc.GridCounts`](#sainsc.GridCounts) object from this dataframe. We will generate bins with a size of 500 nm."
+    "We can now generate a [`sainsc.GridCounts`](#sainsc.GridCounts) object from this `polars.DataFrame`. We will generate bins with a size of 500 nm."
    ]
   },
   {
@@ -1062,7 +1062,7 @@
    "id": "72bda2b1-b6ed-4204-a144-5390ba8ad35a",
    "metadata": {},
    "source": [
-    "Alternatively we can directly generate a [`sainsc.LazyKDE`](#sainsc.LazyKDE) object. This has the additional benefit, that we can supply either a `polars.DataFrame` or a `pandas.DataFrame`. [`sainsc.GridCounts`](#sainsc.GridCounts) can only be generated from a `polars.DataFrame`."
+    "Alternatively, we can directly generate a [`sainsc.LazyKDE`](#sainsc.LazyKDE) object. This has the additional benefit that we can supply either a `polars.DataFrame` or a `pandas.DataFrame`. [`sainsc.GridCounts`](#sainsc.GridCounts) can only be generated from a `polars.DataFrame`."
    ]
   },
   {
@@ -1208,9 +1208,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python [conda env:muellni-sainsc2]",
+   "display_name": "Python 3",
    "language": "python",
-   "name": "conda-env-muellni-sainsc2-py"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {

diff --git a/pyproject.toml b/pyproject.toml
@@ -25,6 +25,7 @@ dependencies = [
     "scikit-image>=0.18",
     "scipy>=1.9",
     "seaborn>=0.11",
+    "typing-extensions>=4",
 ]
 classifiers = [
     "Intended Audience :: Science/Research",
@@ -52,10 +53,10 @@ data = ["pooch>=1"]
 dev = ["sainsc[data,docs,spatialdata]", "pre-commit"]
 
 [project.urls]
-homepage      = "https://github.com/HiDiHlabs/sainsc"
-documentation = "https://sainsc.readthedocs.io"
-repository    = "https://github.com/HiDiHlabs/sainsc"
-
+Homepage      = "https://github.com/HiDiHlabs/sainsc"
+Documentation = "https://sainsc.readthedocs.io"
+Repository    = "https://github.com/HiDiHlabs/sainsc"
+Issues        = "https://github.com/HiDiHlabs/sainsc/issues"
 
 [tool]
 
@@ -79,6 +80,9 @@ target-version = ["py310", "py311", "py312"]
 [tool.ruff]
 target-version = "py310"
 
+[tool.ruff.lint.per-file-ignores]
+"__init__.py" = ["E402"]
+
 [tool.mypy]
 python_version         = "3.10"
 ignore_missing_imports = true

diff --git a/sainsc/__init__.py b/sainsc/__init__.py
@@ -5,14 +5,10 @@
 except PackageNotFoundError:
     __version__ = "unknown version"
 
+del PackageNotFoundError, version
+
+from . import io, lazykde, utils
 from ._utils_rust import GridCounts
-from .io import read_StereoSeq, read_StereoSeq_bins
-from .lazykde import LazyKDE, gaussian_kernel
+from .lazykde import LazyKDE
 
-__all__ = [
-    "GridCounts",
-    "LazyKDE",
-    "gaussian_kernel",
-    "read_StereoSeq",
-    "read_StereoSeq_bins",
-]
+__all__ = ["io", "lazykde", "utils", "GridCounts", "LazyKDE"]
diff --git a/sainsc/_utils.py b/sainsc/_utils.py
@@ -1,5 +1,6 @@
+import functools
 import os
-from typing import NoReturn
+from typing import Callable, NoReturn, ParamSpec, TypeVar
 
 import numpy as np
 import pandas as pd
@@ -9,15 +10,40 @@
 
 
 def _get_n_cpus() -> int:
-    return len(os.sched_getaffinity(0))
+    """
+    Return the default number of CPUs to use, capped at 32.
+
+    The count is taken from the set of CPUs the current process is restricted
+    to. On platforms without `os.sched_getaffinity` the total CPU count
+    reported by `os.cpu_count` is used instead (falling back to 1 if unknown).
+    """
+    try:
+        available_cpus = len(os.sched_getaffinity(0))
+    except AttributeError:
+        # `os.sched_getaffinity` is only available on some Unix platforms.
+        available_cpus = os.cpu_count() or 1
+    return min(available_cpus, 32)
+
+
+P = ParamSpec("P")
+T = TypeVar("T")
+
+
+def _validate_n_threads(n_threads: int | None) -> int:
+    """
+    Resolve a user-supplied thread count to a concrete positive number.
+
+    `None` is treated like 0. A value of 0 resolves to `_get_n_cpus()`,
+    positive values are returned unchanged.
+
+    Raises
+    ------
+    ValueError
+        If `n_threads` is negative.
+    """
+    requested = 0 if n_threads is None else n_threads
+    if requested < 0:
+        raise ValueError("`n_threads` must be >= 0.")
+    if requested > 0:
+        return requested
+    return _get_n_cpus()
+
+
+def validate_threads(func: Callable[P, T]) -> Callable[P, T]:
+    """
+    Decorate `func` so that its `n_threads` keyword argument is validated.
+
+    The wrapper only inspects `n_threads` when it is passed by keyword; a
+    missing value is treated like 0. The value is passed through
+    `_validate_n_threads` and the result is forwarded to `func` as the
+    `n_threads` keyword argument.
+
+    Raises
+    ------
+    TypeError
+        If `n_threads` is neither `None` nor an `int`.
+    """
+
+    @functools.wraps(func)
+    def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
+        n_threads = kwargs.get("n_threads", 0)
+        # Explicit check instead of `assert`, which is removed under `python -O`.
+        if n_threads is not None and not isinstance(n_threads, int):
+            raise TypeError("`n_threads` must be an int or None.")
+        kwargs["n_threads"] = _validate_n_threads(n_threads)
+        return func(*args, **kwargs)
+
+    return wrapper
 
 
 def _get_coordinate_index(
     x: NDArray[np.integer],
     y: NDArray[np.integer],
     *,
     name: str | None = None,
-    n_threads: int = 1,
+    n_threads: int | None = None,
 ) -> pd.Index:
     x_i32: NDArray[np.int32] = x.astype(np.int32, copy=False)
     y_i32: NDArray[np.int32] = y.astype(np.int32, copy=False)
@@ -27,18 +53,6 @@ def _get_coordinate_index(
     )
 
 
-def _bin_coordinates(df: pd.DataFrame, bin_size: float) -> pd.DataFrame:
-    df = df.assign(
-        x=lambda df: _get_bin_coordinate(df["x"].to_numpy(), bin_size),
-        y=lambda df: _get_bin_coordinate(df["y"].to_numpy(), bin_size),
-    )
-    return df
-
-
-def _get_bin_coordinate(coor: NDArray[np.number], bin_size: float) -> NDArray[np.int32]:
-    return np.floor(coor / bin_size).astype(np.int32, copy=False)
-
-
 def _raise_module_load_error(e: Exception, fn: str, pkg: str, extra: str) -> NoReturn:
     raise ModuleNotFoundError(
         f"`{fn}` requires '{pkg}' to be installed, e.g. via the '{extra}' extra."

diff --git a/sainsc/_utils_rust.pyi b/sainsc/_utils_rust.pyi
@@ -1,6 +1,7 @@
 import numpy as np
 from numpy.typing import NDArray
 from polars import DataFrame
+from typing_extensions import Self
 
 from ._typealias import _Csx, _CsxArray
 
@@ -92,8 +93,8 @@ class GridCounts:
         resolution : float, optional
             Resolution as nm / pixel.
         n_threads : int, optional
-            Number of threads used for reading and processing file. If `None` this will
-            default to the number of logical CPUs.
+            Number of threads used for processing. If `None` or 0 this will default to
+            the number of logical CPUs.
 
         Raises
         ------
@@ -109,7 +110,7 @@ class GridCounts:
         resolution: float | None = None,
         binsize: float | None = None,
         n_threads: int | None = None,
-    ):  # -> Self
+    ) -> Self:
         """
         Initialize from dataframe.
 
@@ -127,8 +128,8 @@ class GridCounts:
             Resolution of each coordinate unit in nm. The default is 1,000 i.e. measurements
             are in um.
         n_threads : int, optional
-            Number of threads used for initializing :py:class:`sainsc.LazyKDE`.
-            If `None` this will default to the number of logical CPUs.
+            Number of threads used for processing. If `None` or 0 this will default to
+            the number of logical CPUs.
 
         Returns
         -------
@@ -251,7 +252,7 @@ class GridCounts:
         """
 
     @resolution.setter
-    def resolution(self, resolution: float): ...
+    def resolution(self, resolution: float | None): ...
     @property
     def n_threads(self) -> int:
         """
@@ -264,4 +265,4 @@ class GridCounts:
         """
 
     @n_threads.setter
-    def n_threads(self, n_threads: int): ...
+    def n_threads(self, n_threads: int | None): ...
diff --git a/sainsc/io/__init__.py b/sainsc/io/__init__.py
@@ -1,3 +1,26 @@
-from ._io import read_gem_file, read_StereoSeq, read_StereoSeq_bins
+"""
+This module contains functionality supporting reading data of different
+spatially-resolved transcriptomics technologies and file formats.
+"""
 
-__all__ = ["read_gem_file", "read_StereoSeq", "read_StereoSeq_bins"]
+from ._io import (
+    VIZGEN_CTRLS,
+    XENIUM_CTRLS,
+    read_gem_file,
+    read_gem_header,
+    read_StereoSeq,
+    read_StereoSeq_bins,
+    read_Vizgen,
+    read_Xenium,
+)
+
+__all__ = [
+    "VIZGEN_CTRLS",
+    "XENIUM_CTRLS",
+    "read_gem_file",
+    "read_gem_header",
+    "read_StereoSeq",
+    "read_StereoSeq_bins",
+    "read_Vizgen",
+    "read_Xenium",
+]