boilerplate setup

tansey-lab · Feb 8, 2024 · fda1994 · fda1994
1 parent 61e2eaf
commit fda1994
Show file tree

Hide file tree

Showing 22 changed files with 1,658 additions and 795 deletions.
diff --git a/.github/workflows/containers.yml b/.github/workflows/containers.yml
@@ -24,8 +24,8 @@ jobs:
           context: .
           file: Dockerfile
           push: true
-          tags: jeffquinnmsk/batchie:latest
-          cache-from: type=registry,ref=jeffquinnmsk/batchie:latest
+          tags: jeffquinnmsk/nuc2seg:latest
+          cache-from: type=registry,ref=jeffquinnmsk/nuc2seg:latest
           cache-to: type=inline
   nextflow-integration:
     runs-on: ubuntu-20.04

diff --git a/.github/workflows/python-unittest.yml b/.github/workflows/python-unittest.yml
@@ -6,7 +6,7 @@ on:
 
 jobs:
   test_data_models:
-    name: Run python unittests for batchie
+    name: Run python unittests for nuc2seg
     runs-on: ubuntu-latest
 
     steps:

diff --git a/docs/Makefile b/docs/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = .
+BUILDDIR      = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/api.rst b/docs/api.rst
@@ -0,0 +1,5 @@
+nuc2seg package
+===============
+
+Submodules
+----------
diff --git a/docs/command_line_interface.rst b/docs/command_line_interface.rst
@@ -0,0 +1,6 @@
+.. _cli:
+
+Command Line Interface
+======================
+
+nuc2seg provides a suite of command-line utilities that let users script the pipeline end to end.
diff --git a/docs/conf.py b/docs/conf.py
@@ -0,0 +1,88 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+import os
+import sys
+
+sys.path.insert(0, os.path.abspath("../src"))
+
+
+# -- Project information -----------------------------------------------------
+
+project = "nuc2seg"
+copyright = "2024, Wesley Tansey, Jeffrey Quinn"
+author = "Wesley Tansey, Jeffrey Quinn"
+
+# The full version, including alpha/beta/rc tags
+release = "0.1"
+
+
+# -- General configuration ---------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    "sphinx.ext.autodoc",
+    "sphinxarg.ext",
+    "sphinx_math_dollar",
+    "sphinx.ext.mathjax",
+]
+
+html_theme = "classic"
+
+mathjax_config = {
+    "tex2jax": {
+        "inlineMath": [["\\(", "\\)"]],
+        "displayMath": [["\\[", "\\]"]],
+    },
+}
+
+mathjax3_config = {
+    "tex": {
+        "inlineMath": [["\\(", "\\)"]],
+        "displayMath": [["\\[", "\\]"]],
+    }
+}
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ["_templates"]
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = []
+
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+html_theme = "alabaster"
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ["_static"]
+
+autodoc_mock_imports = [
+    "numpy",
+    "pandas",
+    "scipy",
+    "tqdm",
+    "h5py",
+    "torch",
+    "matplotlib",
+    "seaborn",
+    "geopandas",
+    "shapely",
+]
diff --git a/docs/index.rst b/docs/index.rst
@@ -0,0 +1,24 @@
+nuc2seg
+=======
+
+Welcome to the documentation for the Python implementation of nuc2seg.
+
+
+Contents
+=========
+
+.. toctree::
+    :maxdepth: 2
+
+    install
+    command_line_interface
+    api
+
+
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
diff --git a/docs/install.rst b/docs/install.rst
@@ -0,0 +1,29 @@
+Installation
+============
+
+This tutorial will walk you through the process of setting up an environment
+to run nuc2seg.
+
+.. _install-nextflow-docker:
+
+Option 1 (Recommended): Using Nextflow + Docker
+-----------------------------------------------
+
+nuc2seg uses several Python packages with C extensions,
+so the easiest way to get started is to use the up-to-date
+Docker image we maintain on Docker Hub.
+
+.. code::
+
+    docker pull jeffquinnmsk/nuc2seg:latest
+
+To install Nextflow, see the instructions at https://www.nextflow.io/docs/latest/getstarted.html
+
+Option 2: Install Using pip
+---------------------------
+
+For advanced usage, nuc2seg can be installed directly as a Python package using pip.
+
+.. code::
+
+    pip install git+https://github.com/tansey-lab/nuc2seg
diff --git a/docs/make.bat b/docs/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=.
+set BUILDDIR=_build
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.https://www.sphinx-doc.org/
+	exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -0,0 +1,3 @@
+sphinx-argparse
+sphinx-math-dollar
+sphinx-immaterial
diff --git a/nextflow.config b/nextflow.config
@@ -157,10 +157,10 @@ dag {
 }
 
 manifest {
-    name            = 'batchie'
-    author          = """Chris Tosh, Jeffrey Quinn, Wesley Tansey"""
-    homePage        = 'https://github.com/tansey-lab/batchie'
-    description     = """BATCHIE pipeline"""
+    name            = 'nuc2seg'
+    author          = """Wesley Tansey, Jeffrey Quinn"""
+    homePage        = 'https://github.com/tansey-lab/nuc2seg'
+    description     = """nuc2seg pipeline"""
     mainScript      = 'main.nf'
     nextflowVersion = '!>=23.04.0'
     version         = '1.0dev'

diff --git a/pyproject.toml b/pyproject.toml
@@ -32,7 +32,8 @@ dependencies = [
     "torch",
     "torchvision",
     "spatialdata-io",
-    "matplotlib"
+    "matplotlib",
+    "autograd-minimize"
 ]
 
 [project.optional-dependencies]

diff --git a/src/nuc2seg/data_loading.py b/src/nuc2seg/data_loading.py
@@ -1,5 +1,6 @@
 import logging
 import os
+import torch
 from os.path import join
 from pathlib import Path
 
@@ -13,74 +14,73 @@ def xenium_collate_fn(data):
     for sample in data:
         for key, val in sample.items():
             outputs[key].append(val)
-    outputs['X'] = pad_sequence(outputs['X'], batch_first=True, padding_value=-1)
-    outputs['Y'] = pad_sequence(outputs['Y'], batch_first=True, padding_value=-1)
-    outputs['gene'] = pad_sequence(outputs['gene'], batch_first=True, padding_value=-1)
-    outputs['labels'] = torch.stack(outputs['labels'])
-    outputs['angles'] = torch.stack(outputs['angles'])
-    outputs['classes'] = torch.stack(outputs['classes'])
-    outputs['label_mask'] = torch.stack(outputs['label_mask']).type(torch.bool)
-    outputs['nucleus_mask'] = torch.stack(outputs['nucleus_mask']).type(torch.bool)
-    outputs['location'] = torch.stack(outputs['location']).type(torch.long)
+    outputs["X"] = pad_sequence(outputs["X"], batch_first=True, padding_value=-1)
+    outputs["Y"] = pad_sequence(outputs["Y"], batch_first=True, padding_value=-1)
+    outputs["gene"] = pad_sequence(outputs["gene"], batch_first=True, padding_value=-1)
+    outputs["labels"] = torch.stack(outputs["labels"])
+    outputs["angles"] = torch.stack(outputs["angles"])
+    outputs["classes"] = torch.stack(outputs["classes"])
+    outputs["label_mask"] = torch.stack(outputs["label_mask"]).type(torch.bool)
+    outputs["nucleus_mask"] = torch.stack(outputs["nucleus_mask"]).type(torch.bool)
+    outputs["location"] = torch.stack(outputs["location"]).type(torch.long)
 
     # Edge case: pad_sequence will squeeze tensors if there are no entries.
     # In that case, we just need to add the dimension back.
-    if len(outputs['gene'].shape) == 1:
-        outputs['X'] = outputs['X'][:,None]
-        outputs['Y'] = outputs['Y'][:,None]
-        outputs['gene'] = outputs['gene'][:,None]
+    if len(outputs["gene"].shape) == 1:
+        outputs["X"] = outputs["X"][:, None]
+        outputs["Y"] = outputs["Y"][:, None]
+        outputs["gene"] = outputs["gene"][:, None]
 
     return outputs
 
+
 class XeniumDataset(Dataset):
     def __init__(self, tiles_dir: str):
-        self.transcripts_dir = Path(join(tiles_dir, 'transcripts/'))
-        self.labels_dir = Path(join(tiles_dir, 'labels/'))
-        self.angles_dir = Path(join(tiles_dir, 'angles/'))
-        self.classes_dir = Path(join(tiles_dir, 'classes/'))
+        self.transcripts_dir = Path(join(tiles_dir, "transcripts/"))
+        self.labels_dir = Path(join(tiles_dir, "labels/"))
+        self.angles_dir = Path(join(tiles_dir, "angles/"))
+        self.classes_dir = Path(join(tiles_dir, "classes/"))
 
-        self.locations = np.load(join(tiles_dir, 'locations.npy'))
+        self.locations = np.load(join(tiles_dir, "locations.npy"))
 
         self.ids = np.arange(self.locations.shape[0])
 
-        logging.info(f'Creating dataset with {len(self.ids)} examples')
-        self.class_counts = np.load(join(tiles_dir, 'class_counts.npy'))
-        self.transcript_counts = np.load(join(tiles_dir, 'transcript_counts.npy'))
+        logging.info(f"Creating dataset with {len(self.ids)} examples")
+        self.class_counts = np.load(join(tiles_dir, "class_counts.npy"))
+        self.transcript_counts = np.load(join(tiles_dir, "transcript_counts.npy"))
         self.max_length = self.transcript_counts.max()
-        self.label_values = np.arange(self.class_counts.shape[1])-1
-        self.n_classes = self.class_counts.shape[1]-2
-        self.gene_ids = {int(i): j for i,j in np.load(join(tiles_dir, 'gene_ids.npy'))}
-        self.n_genes = max(self.gene_ids)+1
+        self.label_values = np.arange(self.class_counts.shape[1]) - 1
+        self.n_classes = self.class_counts.shape[1] - 2
+        self.gene_ids = {int(i): j for i, j in np.load(join(tiles_dir, "gene_ids.npy"))}
+        self.n_genes = max(self.gene_ids) + 1
 
         # Note: class IDs are 1-based since ID=0 is background
-        logging.info(f'Unique label values: {self.label_values}')
-
+        logging.info(f"Unique label values: {self.label_values}")
 
     def __len__(self):
         return len(self.ids)
 
     def __getitem__(self, idx):
-        transcripts_file = os.path.join(self.transcripts_dir, f'{idx}.npz')
-        labels_file = os.path.join(self.labels_dir, f'{idx}.npz')
-        angles_file = os.path.join(self.angles_dir, f'{idx}.npz')
-        classes_file = os.path.join(self.classes_dir, f'{idx}.npz')
-
-
-        xyg = np.load(transcripts_file)['arr_0']
-        labels = np.load(labels_file)['arr_0']
-        angles = np.load(angles_file)['arr_0']
-        classes = np.load(classes_file)['arr_0']
+        transcripts_file = os.path.join(self.transcripts_dir, f"{idx}.npz")
+        labels_file = os.path.join(self.labels_dir, f"{idx}.npz")
+        angles_file = os.path.join(self.angles_dir, f"{idx}.npz")
+        classes_file = os.path.join(self.classes_dir, f"{idx}.npz")
+
+        xyg = np.load(transcripts_file)["arr_0"]
+        labels = np.load(labels_file)["arr_0"]
+        angles = np.load(angles_file)["arr_0"]
+        classes = np.load(classes_file)["arr_0"]
         labels_mask = labels > -1
         nucleus_mask = labels > 0
 
         return {
-                'X': torch.as_tensor(np.array(xyg[:,0])).long().contiguous(),
-                'Y': torch.as_tensor(np.array(xyg[:,1])).long().contiguous(),
-                'gene': torch.as_tensor(np.array(xyg[:,2])).long().contiguous(),
-                'labels': torch.as_tensor(labels).long().contiguous(),
-                'angles': torch.as_tensor(angles).float().contiguous(),
-                'classes': torch.as_tensor(classes).long().contiguous(),
-                'label_mask': torch.as_tensor(labels_mask).bool().contiguous(),
-                'nucleus_mask': torch.as_tensor(nucleus_mask).bool().contiguous(),
-                'location': self.locations[idx]
+            "X": torch.as_tensor(np.array(xyg[:, 0])).long().contiguous(),
+            "Y": torch.as_tensor(np.array(xyg[:, 1])).long().contiguous(),
+            "gene": torch.as_tensor(np.array(xyg[:, 2])).long().contiguous(),
+            "labels": torch.as_tensor(labels).long().contiguous(),
+            "angles": torch.as_tensor(angles).float().contiguous(),
+            "classes": torch.as_tensor(classes).long().contiguous(),
+            "label_mask": torch.as_tensor(labels_mask).bool().contiguous(),
+            "nucleus_mask": torch.as_tensor(nucleus_mask).bool().contiguous(),
+            "location": self.locations[idx],
         }