From 1e3dd6fb4ee4b88dff73e0c3e2321d2a4a87ea2c Mon Sep 17 00:00:00 2001 From: Martin Kim Date: Fri, 17 Feb 2023 11:02:24 -0800 Subject: [PATCH 1/3] Add documentation to scripts --- bin/composition_baseline.py | 79 +++++++++++++++++++++++++++++++++---- bin/compute_2dreps.py | 45 +++++++++++++++------ bin/compute_rf.py | 34 ++++++++++------ bin/utils.py | 8 ++++ 4 files changed, 132 insertions(+), 34 deletions(-) diff --git a/bin/composition_baseline.py b/bin/composition_baseline.py index 31e7719..69238a0 100644 --- a/bin/composition_baseline.py +++ b/bin/composition_baseline.py @@ -1,21 +1,50 @@ -from typing import Union, Literal, Optional +from typing import Literal, Optional, Union -from anndata import AnnData +import numpy as np import scanpy as sc +from anndata import AnnData from scvi.model import SCVI from sklearn.metrics import pairwise_distances -import numpy as np class CompositionBaseline: + """ + Baseline method for computing sample-sample distances within clusters. + + Parameters + ---------- + adata + Annotated data matrix. + batch_key + Key in `adata.obs` corresponding to batch/site information. + sample_key + Key in `adata.obs` corresponding to sample information. + rep + Representation to compute distances on. One of the following: + + * `"PCA"`: :meth:`~scanpy.pp.pca` + * `"SCVI"`: :class:`~scvi.model.SCVI` + model_kwargs + Keyword arguments to pass into :class:`~scvi.model.SCVI`. Also contains the + following keys: + + * `"n_dim"`: Number of components to use for PCA + * `"clustering_on"`: One of `"leiden"` or `"cluster_key"`. If `"leiden"`, + clusters are computed using :meth:`~scanpy.tl.leiden`. If `"cluster_key"`, + clusters correspond to values in `adata.obs[cluster_key]`. + * `"cluster_key"`: Key in `adata.obs` corresponding to cluster information. + train_kwargs + Keyword arguments to pass into :meth:`~scvi.model.SCVI.train`. + """ + def __init__( self, adata: AnnData, batch_key: Optional[str], sample_key: Optional[str], rep: Union[Literal["PCA"], Literal["SCVI"]], - model_kwargs=None, - train_kwargs=None, + model_kwargs: dict = None, + train_kwargs: dict = None, ): assert rep in ["PCA", "SCVI"] self.adata = adata.copy() @@ -26,16 +55,17 @@ def __init__( self.rep = rep self.rep_key = "X_rep" - self.n_dim = model_kwargs.pop("n_dim", 50) - self.clustering_on = model_kwargs.pop( + self.n_dim = self.model_kwargs.pop("n_dim", 50) + self.clustering_on = self.model_kwargs.pop( "clustering_on", "leiden" ) # one of leiden, cluster_key - self.cluster_key = model_kwargs.pop("cluster_key", None) + self.cluster_key = self.model_kwargs.pop("cluster_key", None) def preprocess_data(self): if self.rep == "PCA": sc.pp.normalize_total(self.adata, target_sum=1e4) sc.pp.log1p(self.adata) + # TODO: snakemake workflow had hvg selection, should we add it here too? def fit(self): self.preprocess_data() @@ -52,6 +82,19 @@ def get_cell_representation(self): return self.adata.obsm[self.rep_key] def get_local_sample_representation(self): + """ + Computes local sample representations within each cluster. + + Represents samples as proportion of total sample cells within Leiden subclusters. + + Returns + ------- + Dictionary + Keys are unique labels in `adata.obs[self.cluster_key]`. + Values are :class:`pandas.DataFrame` with rows corresponding to samples + within that cluster and columns corresponding to the frequency of each + sample within each subcluster. 
+ """ if self.clustering_on == "leiden": sc.pp.neighbors(self.adata, n_neighbors=30, use_rep=self.rep_key) sc.tl.leiden(self.adata, resolution=1.0, key_added="leiden_1.0") @@ -73,20 +116,25 @@ def get_local_sample_representation(self): sc.pp.neighbors(subann, n_neighbors=30, use_rep=self.rep_key) sc.tl.leiden(subann, resolution=1.0, key_added=self.cluster_key) + # Step 2: compute number of cells per subcluster-sample pair szs = ( subann.obs.groupby([self.cluster_key, self.sample_key]) .size() .to_frame("n_cells") .reset_index() ) + # Step 3: compute total number of cells per sample szs_total = ( szs.groupby(self.sample_key) .sum() .rename(columns={"n_cells": "n_cells_total"}) ) + # Step 4: compute frequency of each subcluster per sample comps = szs.merge(szs_total, on=self.sample_key).assign( freqs=lambda x: x.n_cells / x.n_cells_total ) + # Step 5: compute representation of each sample as the vector + # of frequencies in each subcluster freqs = ( comps.loc[:, [self.sample_key, self.cluster_key, "freqs"]] .set_index([self.sample_key, self.cluster_key]) @@ -99,15 +147,30 @@ def get_local_sample_representation(self): return freqs_all def get_local_sample_distances(self): + """ + Computes local sample distances for each cell. + + Sample-sample distances for a particular cell are computed as the Euclidean + distance between the sample representations within the cell's cluster. + Distances for samples absent in a cluster are imputed as `1.0`. + + Returns + ------- + :class:`numpy.ndarray` + Array of shape `(n_cells, n_samples, n_samples)`. Cells in the same cluster + have the same distance matrix. + """ freqs_all = self.get_local_sample_representation() n_sample = self.adata.obs[self.sample_key].nunique() sample_order = self.adata.obs[self.sample_key].unique() local_dists = np.zeros((self.adata.n_obs, n_sample, n_sample)) + for cluster, freqs in freqs_all.items(): cell_is_selected = self.adata.obs[self.cluster_key] == cluster ordered_freqs = freqs.reindex(sample_order).fillna(1.) cluster_reps = ordered_freqs.values cluster_dists = pairwise_distances(cluster_reps, metric="euclidean") local_dists[cell_is_selected] = cluster_dists + return local_dists \ No newline at end of file diff --git a/bin/compute_2dreps.py b/bin/compute_2dreps.py index ea7d02e..1535098 100644 --- a/bin/compute_2dreps.py +++ b/bin/compute_2dreps.py @@ -1,7 +1,9 @@ +import pymde import scanpy as sc +from anndata import AnnData from tsnecuda import TSNE -import pymde -from utils import make_parents, wrap_kwargs + +from utils import write_h5ad, wrap_kwargs mde_kwargs = dict( @@ -16,22 +18,37 @@ @wrap_kwargs def compute_2dreps( *, - adata_in, - adata_out, -): - """Computes low dimensional representations for existing latent representations. + adata_in: str, + adata_out: str, +) -> AnnData: + """ + Computes low dimensional representations for existing latent representations. + + Iterates over keys in `.uns["latent_keys"]` and computes the following: + + * :meth:`~pymde.pca` with `embedding_dim=2` + * :meth:`~tsnecuda.TSNE.fit_transform` with `perplexity=30` + * :meth:`~pymde.preserve_neighbors` with `mde_kwargs` Parameters ---------- - adata_in : - Path to anndata containing representations - adata_out : - Path to save the file + adata_in + Path to :class:`anndata.AnnData` containing representations. + adata_out + Path to save the output :class:`anndata.AnnData` with 2D representations. 
+ + Returns + ------- + :class:`anndata.AnnData` + Annotated data matrix with the following: + + * `.uns["latent_pca_keys"]`: Keys in `.obsm` corresponding to PCA representations. + * `.uns["latent_tsne_keys"]`: Keys in `.obsm` corresponding to t-SNE representations. + * `.uns["latent_mde_keys"]`: Keys in `.obsm` corresponding to MDE representations. """ adata = sc.read_h5ad(adata_in) latent_keys = adata.uns.get("latent_keys", None) - make_parents(adata_out) - + if latent_keys is None: adata.write_h5ad(adata_out) @@ -55,11 +72,13 @@ def compute_2dreps( latent_mde_key = f"{latent_key}_mde" adata.obsm[latent_mde_key] = latent_mde latent_mde_keys.append(latent_mde_key) + adata.uns["latent_pca_keys"] = latent_pca_keys adata.uns["latent_tsne_keys"] = latent_tsne_keys adata.uns["latent_mde_keys"] = latent_mde_keys - adata.write_h5ad(adata_out) + return write_h5ad(adata, adata_out) + if __name__ == "__main__": compute_2dreps() diff --git a/bin/compute_rf.py b/bin/compute_rf.py index 4c3fbab..6ee03f1 100644 --- a/bin/compute_rf.py +++ b/bin/compute_rf.py @@ -6,8 +6,13 @@ import xarray as xr from scipy.cluster.hierarchy import linkage, to_tree from scipy.spatial.distance import squareform -from utils import (determine_if_file_empty, load_config, make_parents, - wrap_kwargs) + +from utils import ( + determine_if_file_empty, + load_config, + make_parents, + wrap_kwargs, +) def linkage_to_ete(linkage_obj): @@ -50,21 +55,24 @@ def hierarchical_clustering(dist_mtx, method="ward"): @wrap_kwargs def compute_rf( *, - distance_matrices, - distance_matrices_gt, - config_in, - table_out, + distance_matrices: str, + distance_matrices_gt: str, + config_in: str, + table_out: str, ): - """Main fn for computing RF distance. + """ + Computes the Robinson-Foulds distance between two hierarchical clusterings. Parameters ---------- - adata_in : - path to anndata file containing cell-type specific distance matrices - config_in : - path to config file - table_out : - desired output table file + distance_matrices + Path to the distance matrices. + distance_matrices_gt + + config_in + Path to the configuration file corresponding to `adata_in`. + table_out + Path to save the output table. 
""" basename = os.path.basename(distance_matrices) model_name = basename.split(".")[1] diff --git a/bin/utils.py b/bin/utils.py index 0256494..814c476 100644 --- a/bin/utils.py +++ b/bin/utils.py @@ -9,6 +9,7 @@ import click import pandas as pd import scanpy as sc +from anndata import AnnData from remote_pdb import RemotePdb INCH_TO_CM = 1 / 2.54 @@ -20,6 +21,13 @@ def make_parents(*paths) -> None: pathlib.Path(p).parent.mkdir(parents=True, exist_ok=True) +def write_h5ad(adata: AnnData, path: str) -> AnnData: + """Write an AnnData object to an HDF5 file.""" + make_parents(path) + adata.write_h5ad(path) + return adata + + def wrap_kwargs(fn: Callable) -> Callable: """Wrap a function to accept keyword arguments from the command line.""" for param in signature(fn).parameters: From 60e2e268f5336b370ca1522825e615f2252399c6 Mon Sep 17 00:00:00 2001 From: Martin Kim Date: Fri, 17 Feb 2023 14:51:28 -0800 Subject: [PATCH 2/3] pre-commit changes --- README.md | 36 +++++++++--- bin/composition_baseline.py | 19 ++++--- bin/compute_2dreps.py | 23 ++++---- bin/compute_rf.py | 57 ++++++++++++++----- bin/fit_mrvi.py | 1 - bin/fit_scviv2.py | 3 +- bin/get_latent_scviv2.py | 11 +++- bin/produce_figures_symsim_new.py | 5 +- bin/scib.py | 10 ++-- bin/vendi.py | 6 +- .../sciplex_A549_significant_all_phases.json | 2 +- .../sciplex_K562_significant_all_phases.json | 2 +- .../sciplex_MCF7_significant_all_phases.json | 2 +- env/gpu/run_models.yaml | 4 +- modules/get_latent_scviv2.nf | 2 +- notebooks/sciplex_analysis.py | 6 +- .../sciplex_get_significant_product_dose.R | 22 +++---- notebooks/sciplex_preprocess.py | 2 +- 18 files changed, 132 insertions(+), 81 deletions(-) diff --git a/README.md b/README.md index e09e81b..3229efb 100644 --- a/README.md +++ b/README.md @@ -1,26 +1,32 @@ # scvi-v2-reproducibility ## Running existing workflows + Existing workflows can be discovered under `workflows/`. These can be run as follows from the root of the repository: + ``` nextflow main.nf --workflow workflow_name --profile profile_name ``` -Setting both ``--workflow`` and ``--profile`` is required. Available profiles include -`standard` (run without GPU) and `gpu` (run with GPU). + +Setting both `--workflow` and `--profile` is required. Available profiles include +`standard` (run without GPU) and `gpu` (run with GPU). By default, intermediate and final outputs are placed in `results/`. This can be changed by modifying the `publishDir` directive in the configuration. ### Simple pipeline + Currently, this pipeline just scans `data/` in the root directory for any `.h5ad` files -and runs each subworkflow sequentially. It expects a configuration JSON file to be +and runs each subworkflow sequentially. It expects a configuration JSON file to be present in `conf/datasets/` with the same name as the `.h5ad` file. ### AWS pipeline + This pipeline pulls data from `s3://largedonor` and runs each subworkflow locally. In order to run this pipeline, create the file `conf/aws_credentials.config` (ignored by git) with the following entries: + ``` aws { accessKey = "access_key" @@ -28,14 +34,17 @@ aws { region = "us-west-1" } ``` + You can specify the individual AnnDatas being processed by modifying `params.inputs` in `conf/aws_pipeline.config`. ## Adding new workflows -Workflows are intended to connect subworkflows into an end-to-end pipeline. In order to + +Workflows are intended to connect subworkflows into an end-to-end pipeline. In order to add a new workflow, follow these steps: 1. 
Add a `workflow_name.nf` file to `workflows/` with the following template: + ``` include { subworkflow } from "${params.subworkflows.subworkflow_name}" @@ -46,8 +55,9 @@ workflow run_main { } ``` -2. Create the associated `workflow_name.config` file in `conf/` with the following -template: +2. Create the associated `workflow_name.config` file in `conf/` with the following + template: + ``` params { inputs = //path or list of paths to inputs @@ -55,10 +65,12 @@ params { ``` ## Adding new subworkflows -Subworkflows are intended to be reusable pieces across different workflows. To add a new + +Subworkflows are intended to be reusable pieces across different workflows. To add a new subworkflow, follow these steps: 1. Add a `main.nf` file to `subworkflows/subworkflow_name/` with the following template: + ``` include { module } from "${params.modules.module_name}" @@ -73,8 +85,10 @@ workflow subworkflow_name { module.out } ``` + 2. Add a reference to the new subworkflow under `nextflow.config` to be able to import -it as `params.subworkflows.subworkflow_name + it as `params.subworkflows.subworkflow_name + ``` params { subworkflow_name = "${params.subworkflows.root}/subworkflow_name/main.nf" @@ -88,6 +102,7 @@ They can be found under `modules/` and `bin/`, respecively. To add a new module, these steps: 1. Add a `module_name.nf` file to `modules/` with the following template: + ``` process module_name { input: @@ -107,6 +122,7 @@ process module_name { ``` 2. Add the corresponding Python script to `bin/` with the following template: + ``` from utils import wrap_kwargs @@ -119,7 +135,8 @@ if __name__ == "__main__": ``` 3. Add references to the new module and script under `nextflow.config` to be able to -import them as `params.modules.module_name` and `params.bin.module_name`, respectively. + import them as `params.modules.module_name` and `params.bin.module_name`, respectively. + ``` params { modules { @@ -132,6 +149,7 @@ params { ``` 4. Specify the conda environment for the new module in `nextflow.config`: + ``` params { env { diff --git a/bin/composition_baseline.py b/bin/composition_baseline.py index 5482544..efd4ab5 100644 --- a/bin/composition_baseline.py +++ b/bin/composition_baseline.py @@ -22,7 +22,7 @@ class CompositionBaseline: rep Representation to compute distances on. One of the following: - * `"PCA"`: :meth:`~scanpy.pp.pca` + * `"PCA"`: :func:`~scanpy.pp.pca` * `"SCVI"`: :class:`~scvi.model.SCVI` model_kwargs Keyword arguments to pass into :class:`~scvi.model.SCVI`. Also contains the @@ -30,11 +30,11 @@ class CompositionBaseline: * `"n_dim"`: Number of components to use for PCA * `"clustering_on"`: One of `"leiden"` or `"cluster_key"`. If `"leiden"`, - clusters are computed using :meth:`~scanpy.tl.leiden`. If `"cluster_key"`, + clusters are computed using :func:`~scanpy.tl.leiden`. If `"cluster_key"`, clusters correspond to values in `adata.obs[cluster_key]`. * `"cluster_key"`: Key in `adata.obs` corresponding to cluster information. train_kwargs - Keyword arguments to pass into :meth:`~scvi.model.SCVI.train`. + Keyword arguments to pass into :func:`~scvi.model.SCVI.train`. 
""" def __init__( @@ -50,8 +50,8 @@ def __init__( self.adata = adata.copy() self.batch_key = batch_key self.sample_key = sample_key - self.model_kwargs = model_kwargs if model_kwargs is not None else dict() - self.train_kwargs = train_kwargs if train_kwargs is not None else dict() + self.model_kwargs = model_kwargs if model_kwargs is not None else {} + self.train_kwargs = train_kwargs if train_kwargs is not None else {} self.rep = rep self.rep_key = "X_rep" @@ -63,12 +63,14 @@ def __init__( self.subcluster_key = "leiden_subcluster" def preprocess_data(self): + """Preprocess data for PCA or SCVI.""" if self.rep == "PCA": sc.pp.normalize_total(self.adata, target_sum=1e4) sc.pp.log1p(self.adata) - # TODO: snakemake workflow had hvg selection, should we add it here too? + # TODO: snakemake workflow had hvg selection, should we add it here too? def fit(self): + """Fit PCA or SCVI.""" self.preprocess_data() if self.rep == "PCA": sc.pp.pca(self.adata, n_comps=self.n_dim) # saves "X_pca" in obsm @@ -80,6 +82,7 @@ def fit(self): self.adata.obsm[self.rep_key] = scvi_model.get_latent_representation() def get_cell_representation(self): + """Get cell representations.""" return self.adata.obsm[self.rep_key] def get_local_sample_representation(self): @@ -91,7 +94,7 @@ def get_local_sample_representation(self): Returns ------- Dictionary - Keys are unique labels in `adata.obs[self.cluster_key]`. + Keys are unique labels in `adata.obs[self.cluster_key]`. Values are :class:`pandas.DataFrame` with rows corresponding to samples within that cluster and columns corresponding to the frequency of each sample within each subcluster. @@ -108,7 +111,7 @@ def get_local_sample_representation(self): "If clustering_on is cluster_key, cluster_key must be provided." ) - freqs_all = dict() + freqs_all = {} for unique_cluster in self.adata.obs[self.cluster_key].unique(): cell_is_selected = self.adata.obs[self.cluster_key] == unique_cluster subann = self.adata[cell_is_selected].copy() diff --git a/bin/compute_2dreps.py b/bin/compute_2dreps.py index df04795..a683309 100644 --- a/bin/compute_2dreps.py +++ b/bin/compute_2dreps.py @@ -1,16 +1,15 @@ import pymde import scanpy as sc from anndata import AnnData - from utils import write_h5ad, wrap_kwargs -mde_kwargs = dict( - embedding_dim=2, - constraint=pymde.Standardized(), - repulsive_fraction=0.7, - device="cuda", - n_neighbors=15, -) +mde_kwargs = { + "embedding_dim": 2, + "constraint": pymde.Standardized(), + "repulsive_fraction": 0.7, + "device": "cuda", + "n_neighbors": 15, +} @wrap_kwargs @@ -24,9 +23,9 @@ def compute_2dreps( Iterates over keys in `.uns["latent_keys"]` and computes the following: - * :meth:`~pymde.pca` with `embedding_dim=2` - * :meth:`~tsnecuda.TSNE.fit_transform` with `perplexity=30` - * :meth:`~pymde.preserve_neighbors` with `mde_kwargs` + * :func:`~pymde.pca` with `embedding_dim=2` + * :func:`~tsnecuda.TSNE.fit_transform` with `perplexity=30` + * :func:`~pymde.preserve_neighbors` with `mde_kwargs` Parameters ---------- @@ -46,7 +45,7 @@ def compute_2dreps( """ adata = sc.read_h5ad(adata_in) latent_keys = adata.uns.get("latent_keys", None) - + if latent_keys is None: adata.write_h5ad(adata_out) diff --git a/bin/compute_rf.py b/bin/compute_rf.py index 11f85f8..e3337d4 100644 --- a/bin/compute_rf.py +++ b/bin/compute_rf.py @@ -6,21 +6,28 @@ import numpy as np import pandas as pd import xarray as xr -from scipy.cluster.hierarchy import linkage, to_tree +from scipy.cluster.hierarchy import ClusterNode, linkage, to_tree from scipy.linalg import issymmetric 
from scipy.spatial.distance import squareform +from utils import (determine_if_file_empty, load_config, make_parents, + wrap_kwargs) -from utils import ( - determine_if_file_empty, - load_config, - make_parents, - wrap_kwargs, -) +def linkage_to_ete(linkage_obj: np.ndarray) -> ete3.Tree: + """ + Converts a linkage matrix to an :class:`~ete3.Tree` object. + + Parameters + ---------- + linkage_obj + A linkage matrix. -def linkage_to_ete(linkage_obj): - """Converts to ete3 tree representation.""" - R = to_tree(linkage_obj) + Returns + ------- + :class:`~ete3.Tree` + Tree representation of `linkage_obj`. + """ + R: ClusterNode = to_tree(linkage_obj) root = ete3.Tree() root.dist = 0 root.name = "root" @@ -44,18 +51,39 @@ def linkage_to_ete(linkage_obj): item2node[node.get_id()].add_child(ch) item2node[ch_node_id] = ch to_visit.append(ch_node) + return root -def hierarchical_clustering(dist_mtx, method="ward"): - """Perform hierarchical clustering on squared distance matrix.""" +def hierarchical_clustering(dist_mtx: np.ndarray, method: str = "ward"): + """ + Perform hierarchical clustering on a squared distance matrix. + + Parameters + ---------- + dist_mtx + Squared distance matrix. + method + Method to use for hierarchical clustering. See :func:`~scipy.cluster.hierarchy.linkage`. + + Returns + ------- + :class:`~ete3.Tree` + Tree representation of the hierarchical clustering. + """ assert dist_mtx.shape[0] == dist_mtx.shape[1] + is_symmetric = issymmetric(dist_mtx) has_zero_diag = (dist_mtx.diagonal() == 0).all() if not (is_symmetric and has_zero_diag): - warnings.warn("Distance matrix may be invalid.") + warnings.warn( + "`dist_mtx` is not symmetric and has a non-zero diagonal. " + "The matrix will be symmetrized and the diagonal will be set to zero.", + stacklevel=2, + ) dist_mtx = dist_mtx - np.diag(dist_mtx.diagonal()) dist_mtx = (dist_mtx + dist_mtx.T) / 2.0 + red_mtx = squareform(dist_mtx) z = linkage(red_mtx, method=method) return linkage_to_ete(z) @@ -77,7 +105,7 @@ def compute_rf( distance_matrices Path to the distance matrices. distance_matrices_gt - + Path to the ground truth distance matrices. config_in Path to the configuration file corresponding to `adata_in`. 
table_out @@ -123,6 +151,7 @@ def compute_rf( model_name=model_name ) dists.to_csv(table_out, index=False) + return dists if __name__ == "__main__": diff --git a/bin/fit_mrvi.py b/bin/fit_mrvi.py index d93fcc9..3b7da64 100644 --- a/bin/fit_mrvi.py +++ b/bin/fit_mrvi.py @@ -1,6 +1,5 @@ import mrvi import scanpy as sc - from utils import load_config, make_parents, wrap_kwargs diff --git a/bin/fit_scviv2.py b/bin/fit_scviv2.py index 660a19c..a06e66b 100644 --- a/bin/fit_scviv2.py +++ b/bin/fit_scviv2.py @@ -1,6 +1,5 @@ -import scvi_v2 import scanpy as sc - +import scvi_v2 from utils import load_config, make_parents, wrap_kwargs diff --git a/bin/get_latent_scviv2.py b/bin/get_latent_scviv2.py index e50c0e0..4ac103c 100644 --- a/bin/get_latent_scviv2.py +++ b/bin/get_latent_scviv2.py @@ -1,7 +1,6 @@ import scanpy as sc import scvi_v2 from anndata import AnnData -import numpy as np from utils import load_config, make_parents, wrap_kwargs @@ -55,13 +54,19 @@ def get_latent_scviv2( cell_reps.to_netcdf(cell_representations_out) del cell_reps - cell_dists = model.get_local_sample_distances(adata, keep_cell=False, groupby=labels_key) + cell_dists = model.get_local_sample_distances( + adata, keep_cell=False, groupby=labels_key + ) make_parents(distance_matrices_out) cell_dists.to_netcdf(distance_matrices_out) del cell_dists cell_normalized_dists = model.get_local_sample_distances( - adata, use_mean=False, normalize_distances=True, keep_cell=False, groupby=labels_key + adata, + use_mean=False, + normalize_distances=True, + keep_cell=False, + groupby=labels_key, ) make_parents(normalized_distance_matrices_out) cell_normalized_dists.to_netcdf(normalized_distance_matrices_out) diff --git a/bin/produce_figures_symsim_new.py b/bin/produce_figures_symsim_new.py index a064835..d032521 100644 --- a/bin/produce_figures_symsim_new.py +++ b/bin/produce_figures_symsim_new.py @@ -2,12 +2,11 @@ import argparse import glob import os -import re -import numpy as np import matplotlib.pyplot as plt -import seaborn as sns +import numpy as np import plotnine as p9 +import seaborn as sns import xarray as xr from utils import INCH_TO_CM, load_results diff --git a/bin/scib.py b/bin/scib.py index 1065386..0cf3bfa 100644 --- a/bin/scib.py +++ b/bin/scib.py @@ -5,11 +5,11 @@ import scanpy as sc import scib_metrics as metrics from anndata import AnnData - from utils import load_config, make_parents, wrap_kwargs def categorical_obs(adata: AnnData, key: Optional[str]) -> Optional[np.ndarray]: + """Get categorical observation as an array.""" if key is None: return return np.array(adata.obs[key].astype("category").cat.codes).ravel() @@ -51,7 +51,7 @@ def compute_scib( isolated_label_score = None if labels is not None and batch is not None: isolated_label_score = metrics.isolated_labels(X_latent, labels, batch) - + silhouette_label_score = None if labels is not None: silhouette_label_score = metrics.silhouette_label( @@ -62,7 +62,7 @@ def compute_scib( if sample is not None: silhouette_sample_score = metrics.silhouette_label( X_latent, sample, rescale=True - ) + ) # ( # nmi_kmeans_label_score, # ari_kmeans_label_score, @@ -126,7 +126,9 @@ def compute_scib( all_metrics.update(latent_metrics) df = pd.DataFrame.from_dict( - all_metrics, orient="index", columns=["latent_key", "metric_name", "metric_value"] + all_metrics, + orient="index", + columns=["latent_key", "metric_name", "metric_value"], ) make_parents(table_out) df.to_csv(table_out) diff --git a/bin/vendi.py b/bin/vendi.py index d5b9caf..acae1f1 100644 --- a/bin/vendi.py +++ 
b/bin/vendi.py @@ -1,10 +1,8 @@ +import numpy as np import pandas as pd -import scanpy as sc -from vendi_score import vendi import xarray as xr -import numpy as np - from utils import load_config, make_parents, wrap_kwargs +from vendi_score import vendi @wrap_kwargs diff --git a/conf/datasets/sciplex_A549_significant_all_phases.json b/conf/datasets/sciplex_A549_significant_all_phases.json index 5a47586..fde87da 100644 --- a/conf/datasets/sciplex_A549_significant_all_phases.json +++ b/conf/datasets/sciplex_A549_significant_all_phases.json @@ -12,4 +12,4 @@ "max_epochs": 4 }, "clustering_method": "ward" -} \ No newline at end of file +} diff --git a/conf/datasets/sciplex_K562_significant_all_phases.json b/conf/datasets/sciplex_K562_significant_all_phases.json index 5a47586..fde87da 100644 --- a/conf/datasets/sciplex_K562_significant_all_phases.json +++ b/conf/datasets/sciplex_K562_significant_all_phases.json @@ -12,4 +12,4 @@ "max_epochs": 4 }, "clustering_method": "ward" -} \ No newline at end of file +} diff --git a/conf/datasets/sciplex_MCF7_significant_all_phases.json b/conf/datasets/sciplex_MCF7_significant_all_phases.json index 5a47586..fde87da 100644 --- a/conf/datasets/sciplex_MCF7_significant_all_phases.json +++ b/conf/datasets/sciplex_MCF7_significant_all_phases.json @@ -12,4 +12,4 @@ "max_epochs": 4 }, "clustering_method": "ward" -} \ No newline at end of file +} diff --git a/env/gpu/run_models.yaml b/env/gpu/run_models.yaml index 4e422f4..4782420 100644 --- a/env/gpu/run_models.yaml +++ b/env/gpu/run_models.yaml @@ -13,8 +13,8 @@ dependencies: - torchvision - torchaudio - cudatoolkit=11.6 - - jax - - jaxlib + - jax=0.4.3 + - jaxlib=0.4.3 - cuda-nvcc - xarray=2022.12.0 - dask=2022.12.1 diff --git a/modules/get_latent_scviv2.nf b/modules/get_latent_scviv2.nf index e400777..7e9447a 100644 --- a/modules/get_latent_scviv2.nf +++ b/modules/get_latent_scviv2.nf @@ -19,7 +19,7 @@ process get_latent_scviv2 { --adata_out ${adata_out} \\ --cell_representations_out ${cell_representations_out} \\ --distance_matrices_out ${distance_matrices_out} \\ - --normalized_distance_matrices_out ${normalized_distance_matrices_out} + --normalized_distance_matrices_out ${normalized_distance_matrices_out} """ output: diff --git a/notebooks/sciplex_analysis.py b/notebooks/sciplex_analysis.py index d2c459d..a9bc40b 100644 --- a/notebooks/sciplex_analysis.py +++ b/notebooks/sciplex_analysis.py @@ -1,13 +1,13 @@ # %% import os +import matplotlib.pyplot as plt import numpy as np -import xarray as xr -import seaborn as sns import scanpy as sc import scvi import scvi_v2 -import matplotlib.pyplot as plt +import seaborn as sns +import xarray as xr from matplotlib.patches import Patch from scipy.stats import norm diff --git a/notebooks/sciplex_get_significant_product_dose.R b/notebooks/sciplex_get_significant_product_dose.R index 357499b..2f6ade4 100644 --- a/notebooks/sciplex_get_significant_product_dose.R +++ b/notebooks/sciplex_get_significant_product_dose.R @@ -24,17 +24,17 @@ for (cell_line in names(sciPlex_cds.list)) { Var1 ~ Var2, value.var = "Freq" ) - + weighted.mat = product_cluster_mat.list[[cell_line]] ntc.counts = product_cluster_mat.list[[cell_line]]["Vehicle_0", ] - + cluster.enrichment.df[[cell_line]] = do.call(rbind, lapply(rownames(weighted.mat), function(product_dose) { do.call(rbind, lapply(1:ncol(weighted.mat), function(Cluster) { test = fisher.test(cbind(c(weighted.mat[product_dose, Cluster], sum( weighted.mat[product_dose,-Cluster] )), c(ntc.counts[Cluster], sum(ntc.counts[-Cluster])))) - + 
data.frame( product_dose = product_dose, Cluster = Cluster, @@ -43,22 +43,22 @@ for (cell_line in names(sciPlex_cds.list)) { ) })) })) - + cluster.enrichment.df[[cell_line]]$q.value = p.adjust(cluster.enrichment.df[[cell_line]]$p.value, "BH") - + cluster.enrichment.df[[cell_line]]$log2.odds = with(cluster.enrichment.df[[cell_line]], ifelse(odds.ratio == 0,-5, round(log2(odds.ratio), 2))) - + cluster.enrichment.df[[cell_line]]$product_name <- sapply(cluster.enrichment.df[[cell_line]]$product_dose, function(x) { stringr::str_split(x, pattern = "_")[[1]][1] }) - + cluster.enrichment.df[[cell_line]]$dose <- sapply(cluster.enrichment.df[[cell_line]]$product_dose, function(x) { stringr::str_split(x, pattern = "_")[[1]][2] }) - + } @@ -73,12 +73,12 @@ for (cell_line in names(sciPlex_cds.list)) { log2.odds > 2.5) %>% distinct(product_dose) )$product_dose - + print(length(significant_product_dose_combinations.list[[cell_line]])) - + } # Save to CSV for (cell_line in names(sciPlex_cds.list)) { write.table(significant_product_dose_combinations.list[[cell_line]], file = paste("notebooks/output/", cell_line, ".csv", sep = ""), row.names = FALSE, col.names = FALSE, sep=",") -} \ No newline at end of file +} diff --git a/notebooks/sciplex_preprocess.py b/notebooks/sciplex_preprocess.py index 89b81da..76c0de8 100644 --- a/notebooks/sciplex_preprocess.py +++ b/notebooks/sciplex_preprocess.py @@ -1,7 +1,7 @@ # %% +import numpy as np import pandas as pd import scanpy as sc -import numpy as np # %% adata = sc.read( From 1685e6d28b0cce2bd337d4daedb6bbc204bac07c Mon Sep 17 00:00:00 2001 From: Martin Kim Date: Fri, 17 Feb 2023 14:54:56 -0800 Subject: [PATCH 3/3] pre-commit changes --- bin/compute_2dreps.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bin/compute_2dreps.py b/bin/compute_2dreps.py index a683309..0ed455c 100644 --- a/bin/compute_2dreps.py +++ b/bin/compute_2dreps.py @@ -1,7 +1,7 @@ import pymde import scanpy as sc from anndata import AnnData -from utils import write_h5ad, wrap_kwargs +from utils import wrap_kwargs, write_h5ad mde_kwargs = { "embedding_dim": 2, @@ -59,7 +59,9 @@ def compute_2dreps( adata.obsm[latent_pca_key] = latent_pca latent_pca_keys.append(latent_pca_key) - latent_mde = pymde.preserve_neighbors(latent, **mde_kwargs).embed().cpu().numpy() + latent_mde = ( + pymde.preserve_neighbors(latent, **mde_kwargs).embed().cpu().numpy() + ) latent_mde_key = f"{latent_key}_mde" adata.obsm[latent_mde_key] = latent_mde latent_mde_keys.append(latent_mde_key)
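
For reference, a minimal usage sketch of the `CompositionBaseline` API documented in patch 1/3. This is illustrative only and not part of the patches: the input path and the `batch_key`/`sample_key`/`cluster_key` values are placeholders, and `bin/` is assumed to be on the Python path so the module can be imported directly.

```
# Illustrative sketch, not part of the patch series.
# Assumes bin/ is on sys.path and that the obs keys below exist in the AnnData.
import scanpy as sc

from composition_baseline import CompositionBaseline

adata = sc.read_h5ad("data/example.h5ad")  # hypothetical input file

baseline = CompositionBaseline(
    adata,
    batch_key="site",      # placeholder batch/site column
    sample_key="sample",   # placeholder sample column
    rep="PCA",
    model_kwargs={
        "n_dim": 50,
        "clustering_on": "cluster_key",
        "cluster_key": "cell_type",  # placeholder cluster column
    },
    train_kwargs={},
)

# Normalizes/log1p's the data and stores the PCA representation in .obsm["X_rep"].
baseline.fit()

# Dict keyed by cluster label; values are per-sample subcluster frequency DataFrames.
freqs = baseline.get_local_sample_representation()

# Array of shape (n_cells, n_samples, n_samples); cells in the same cluster
# share the same sample-sample distance matrix.
dists = baseline.get_local_sample_distances()
```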