From 1e3dd6fb4ee4b88dff73e0c3e2321d2a4a87ea2c Mon Sep 17 00:00:00 2001 From: Martin Kim Date: Fri, 17 Feb 2023 11:02:24 -0800 Subject: [PATCH 1/3] Add documentation to scripts --- bin/composition_baseline.py | 79 +++++++++++++++++++++++++++++++++---- bin/compute_2dreps.py | 45 +++++++++++++++------ bin/compute_rf.py | 34 ++++++++++------ bin/utils.py | 8 ++++ 4 files changed, 132 insertions(+), 34 deletions(-) diff --git a/bin/composition_baseline.py b/bin/composition_baseline.py index 31e7719..69238a0 100644 --- a/bin/composition_baseline.py +++ b/bin/composition_baseline.py @@ -1,21 +1,50 @@ -from typing import Union, Literal, Optional +from typing import Literal, Optional, Union -from anndata import AnnData +import numpy as np import scanpy as sc +from anndata import AnnData from scvi.model import SCVI from sklearn.metrics import pairwise_distances -import numpy as np class CompositionBaseline: + """ + Baseline method for computing sample-sample distances within clusters. + + Parameters + ---------- + adata + Annotated data matrix. + batch_key + Key in `adata.obs` corresponding to batch/site information. + sample_key + Key in `adata.obs` corresponding to sample information. + rep + Representation to compute distances on. One of the following: + + * `"PCA"`: :meth:`~scanpy.pp.pca` + * `"SCVI"`: :class:`~scvi.model.SCVI` + model_kwargs + Keyword arguments to pass into :class:`~scvi.model.SCVI`. Also contains the + following keys: + + * `"n_dim"`: Number of components to use for PCA + * `"clustering_on"`: One of `"leiden"` or `"cluster_key"`. If `"leiden"`, + clusters are computed using :meth:`~scanpy.tl.leiden`. If `"cluster_key"`, + clusters correspond to values in `adata.obs[cluster_key]`. + * `"cluster_key"`: Key in `adata.obs` corresponding to cluster information. + train_kwargs + Keyword arguments to pass into :meth:`~scvi.model.SCVI.train`. + """ + def __init__( self, adata: AnnData, batch_key: Optional[str], sample_key: Optional[str], rep: Union[Literal["PCA"], Literal["SCVI"]], - model_kwargs=None, - train_kwargs=None, + model_kwargs: dict = None, + train_kwargs: dict = None, ): assert rep in ["PCA", "SCVI"] self.adata = adata.copy() @@ -26,16 +55,17 @@ def __init__( self.rep = rep self.rep_key = "X_rep" - self.n_dim = model_kwargs.pop("n_dim", 50) - self.clustering_on = model_kwargs.pop( + self.n_dim = self.model_kwargs.pop("n_dim", 50) + self.clustering_on = self.model_kwargs.pop( "clustering_on", "leiden" ) # one of leiden, cluster_key - self.cluster_key = model_kwargs.pop("cluster_key", None) + self.cluster_key = self.model_kwargs.pop("cluster_key", None) def preprocess_data(self): if self.rep == "PCA": sc.pp.normalize_total(self.adata, target_sum=1e4) sc.pp.log1p(self.adata) + # TODO: snakemake workflow had hvg selection, should we add it here too? def fit(self): self.preprocess_data() @@ -52,6 +82,19 @@ def get_cell_representation(self): return self.adata.obsm[self.rep_key] def get_local_sample_representation(self): + """ + Computes local sample representations within each cluster. + + Represents samples as proportion of total sample cells within Leiden subclusters. + + Returns + ------- + Dictionary + Keys are unique labels in `adata.obs[self.cluster_key]`. + Values are :class:`pandas.DataFrame` with rows corresponding to samples + within that cluster and columns corresponding to the frequency of each + sample within each subcluster. 
+ """ if self.clustering_on == "leiden": sc.pp.neighbors(self.adata, n_neighbors=30, use_rep=self.rep_key) sc.tl.leiden(self.adata, resolution=1.0, key_added="leiden_1.0") @@ -73,20 +116,25 @@ def get_local_sample_representation(self): sc.pp.neighbors(subann, n_neighbors=30, use_rep=self.rep_key) sc.tl.leiden(subann, resolution=1.0, key_added=self.cluster_key) + # Step 2: compute number of cells per subcluster-sample pair szs = ( subann.obs.groupby([self.cluster_key, self.sample_key]) .size() .to_frame("n_cells") .reset_index() ) + # Step 3: compute total number of cells per sample szs_total = ( szs.groupby(self.sample_key) .sum() .rename(columns={"n_cells": "n_cells_total"}) ) + # Step 4: compute frequency of each subcluster per sample comps = szs.merge(szs_total, on=self.sample_key).assign( freqs=lambda x: x.n_cells / x.n_cells_total ) + # Step 5: compute representation of each sample as the vector + # of frequencies in each subcluster freqs = ( comps.loc[:, [self.sample_key, self.cluster_key, "freqs"]] .set_index([self.sample_key, self.cluster_key]) @@ -99,15 +147,30 @@ def get_local_sample_representation(self): return freqs_all def get_local_sample_distances(self): + """ + Computes local sample distances for each cell. + + Sample-sample distances for a particular cell are computed as the Euclidean + distance between the sample representations within the cell's cluster. + Distances for samples absent in a cluster are imputed as `1.0`. + + Returns + ------- + :class:`numpy.ndarray` + Array of shape `(n_cells, n_samples, n_samples)`. Cells in the same cluster + have the same distance matrix. + """ freqs_all = self.get_local_sample_representation() n_sample = self.adata.obs[self.sample_key].nunique() sample_order = self.adata.obs[self.sample_key].unique() local_dists = np.zeros((self.adata.n_obs, n_sample, n_sample)) + for cluster, freqs in freqs_all.items(): cell_is_selected = self.adata.obs[self.cluster_key] == cluster ordered_freqs = freqs.reindex(sample_order).fillna(1.) cluster_reps = ordered_freqs.values cluster_dists = pairwise_distances(cluster_reps, metric="euclidean") local_dists[cell_is_selected] = cluster_dists + return local_dists \ No newline at end of file diff --git a/bin/compute_2dreps.py b/bin/compute_2dreps.py index ea7d02e..1535098 100644 --- a/bin/compute_2dreps.py +++ b/bin/compute_2dreps.py @@ -1,7 +1,9 @@ +import pymde import scanpy as sc +from anndata import AnnData from tsnecuda import TSNE -import pymde -from utils import make_parents, wrap_kwargs + +from utils import write_h5ad, wrap_kwargs mde_kwargs = dict( @@ -16,22 +18,37 @@ @wrap_kwargs def compute_2dreps( *, - adata_in, - adata_out, -): - """Computes low dimensional representations for existing latent representations. + adata_in: str, + adata_out: str, +) -> AnnData: + """ + Computes low dimensional representations for existing latent representations. + + Iterates over keys in `.uns["latent_keys"]` and computes the following: + + * :meth:`~pymde.pca` with `embedding_dim=2` + * :meth:`~tsnecuda.TSNE.fit_transform` with `perplexity=30` + * :meth:`~pymde.preserve_neighbors` with `mde_kwargs` Parameters ---------- - adata_in : - Path to anndata containing representations - adata_out : - Path to save the file + adata_in + Path to :class:`anndata.AnnData` containing representations. + adata_out + Path to save the output :class:`anndata.AnnData` with 2D representations. 
+ + Returns + ------- + :class:`anndata.AnnData` + Annotated data matrix with the following: + + * `.uns["latent_pca_keys"]`: Keys in `.obsm` corresponding to PCA representations. + * `.uns["latent_tsne_keys"]`: Keys in `.obsm` corresponding to t-SNE representations. + * `.uns["latent_mde_keys"]`: Keys in `.obsm` corresponding to MDE representations. """ adata = sc.read_h5ad(adata_in) latent_keys = adata.uns.get("latent_keys", None) - make_parents(adata_out) - + if latent_keys is None: adata.write_h5ad(adata_out) @@ -55,11 +72,13 @@ def compute_2dreps( latent_mde_key = f"{latent_key}_mde" adata.obsm[latent_mde_key] = latent_mde latent_mde_keys.append(latent_mde_key) + adata.uns["latent_pca_keys"] = latent_pca_keys adata.uns["latent_tsne_keys"] = latent_tsne_keys adata.uns["latent_mde_keys"] = latent_mde_keys - adata.write_h5ad(adata_out) + return write_h5ad(adata, adata_out) + if __name__ == "__main__": compute_2dreps() diff --git a/bin/compute_rf.py b/bin/compute_rf.py index 4c3fbab..6ee03f1 100644 --- a/bin/compute_rf.py +++ b/bin/compute_rf.py @@ -6,8 +6,13 @@ import xarray as xr from scipy.cluster.hierarchy import linkage, to_tree from scipy.spatial.distance import squareform -from utils import (determine_if_file_empty, load_config, make_parents, - wrap_kwargs) + +from utils import ( + determine_if_file_empty, + load_config, + make_parents, + wrap_kwargs, +) def linkage_to_ete(linkage_obj): @@ -50,21 +55,24 @@ def hierarchical_clustering(dist_mtx, method="ward"): @wrap_kwargs def compute_rf( *, - distance_matrices, - distance_matrices_gt, - config_in, - table_out, + distance_matrices: str, + distance_matrices_gt: str, + config_in: str, + table_out: str, ): - """Main fn for computing RF distance. + """ + Computes the Robinson-Foulds distance between two hierarchical clusterings. Parameters ---------- - adata_in : - path to anndata file containing cell-type specific distance matrices - config_in : - path to config file - table_out : - desired output table file + distance_matrices + Path to the distance matrices. + distance_matrices_gt + + config_in + Path to the configuration file corresponding to `adata_in`. + table_out + Path to save the output table. 
""" basename = os.path.basename(distance_matrices) model_name = basename.split(".")[1] diff --git a/bin/utils.py b/bin/utils.py index 0256494..814c476 100644 --- a/bin/utils.py +++ b/bin/utils.py @@ -9,6 +9,7 @@ import click import pandas as pd import scanpy as sc +from anndata import AnnData from remote_pdb import RemotePdb INCH_TO_CM = 1 / 2.54 @@ -20,6 +21,13 @@ def make_parents(*paths) -> None: pathlib.Path(p).parent.mkdir(parents=True, exist_ok=True) +def write_h5ad(adata: AnnData, path: str) -> AnnData: + """Write an AnnData object to an HDF5 file.""" + make_parents(path) + adata.write_h5ad(path) + return adata + + def wrap_kwargs(fn: Callable) -> Callable: """Wrap a function to accept keyword arguments from the command line.""" for param in signature(fn).parameters: From 60e2e268f5336b370ca1522825e615f2252399c6 Mon Sep 17 00:00:00 2001 From: Martin Kim Date: Fri, 17 Feb 2023 14:51:28 -0800 Subject: [PATCH 2/3] pre-commit changes --- README.md | 36 +++++++++--- bin/composition_baseline.py | 19 ++++--- bin/compute_2dreps.py | 23 ++++---- bin/compute_rf.py | 57 ++++++++++++++----- bin/fit_mrvi.py | 1 - bin/fit_scviv2.py | 3 +- bin/get_latent_scviv2.py | 11 +++- bin/produce_figures_symsim_new.py | 5 +- bin/scib.py | 10 ++-- bin/vendi.py | 6 +- .../sciplex_A549_significant_all_phases.json | 2 +- .../sciplex_K562_significant_all_phases.json | 2 +- .../sciplex_MCF7_significant_all_phases.json | 2 +- env/gpu/run_models.yaml | 4 +- modules/get_latent_scviv2.nf | 2 +- notebooks/sciplex_analysis.py | 6 +- .../sciplex_get_significant_product_dose.R | 22 +++---- notebooks/sciplex_preprocess.py | 2 +- 18 files changed, 132 insertions(+), 81 deletions(-) diff --git a/README.md b/README.md index e09e81b..3229efb 100644 --- a/README.md +++ b/README.md @@ -1,26 +1,32 @@ # scvi-v2-reproducibility ## Running existing workflows + Existing workflows can be discovered under `workflows/`. These can be run as follows from the root of the repository: + ``` nextflow main.nf --workflow workflow_name --profile profile_name ``` -Setting both ``--workflow`` and ``--profile`` is required. Available profiles include -`standard` (run without GPU) and `gpu` (run with GPU). + +Setting both `--workflow` and `--profile` is required. Available profiles include +`standard` (run without GPU) and `gpu` (run with GPU). By default, intermediate and final outputs are placed in `results/`. This can be changed by modifying the `publishDir` directive in the configuration. ### Simple pipeline + Currently, this pipeline just scans `data/` in the root directory for any `.h5ad` files -and runs each subworkflow sequentially. It expects a configuration JSON file to be +and runs each subworkflow sequentially. It expects a configuration JSON file to be present in `conf/datasets/` with the same name as the `.h5ad` file. ### AWS pipeline + This pipeline pulls data from `s3://largedonor` and runs each subworkflow locally. In order to run this pipeline, create the file `conf/aws_credentials.config` (ignored by git) with the following entries: + ``` aws { accessKey = "access_key" @@ -28,14 +34,17 @@ aws { region = "us-west-1" } ``` + You can specify the individual AnnDatas being processed by modifying `params.inputs` in `conf/aws_pipeline.config`. ## Adding new workflows -Workflows are intended to connect subworkflows into an end-to-end pipeline. In order to + +Workflows are intended to connect subworkflows into an end-to-end pipeline. In order to add a new workflow, follow these steps: 1. 
Add a `workflow_name.nf` file to `workflows/` with the following template: + ``` include { subworkflow } from "${params.subworkflows.subworkflow_name}" @@ -46,8 +55,9 @@ workflow run_main { } ``` -2. Create the associated `workflow_name.config` file in `conf/` with the following -template: +2. Create the associated `workflow_name.config` file in `conf/` with the following + template: + ``` params { inputs = //path or list of paths to inputs @@ -55,10 +65,12 @@ params { ``` ## Adding new subworkflows -Subworkflows are intended to be reusable pieces across different workflows. To add a new + +Subworkflows are intended to be reusable pieces across different workflows. To add a new subworkflow, follow these steps: 1. Add a `main.nf` file to `subworkflows/subworkflow_name/` with the following template: + ``` include { module } from "${params.modules.module_name}" @@ -73,8 +85,10 @@ workflow subworkflow_name { module.out } ``` + 2. Add a reference to the new subworkflow under `nextflow.config` to be able to import -it as `params.subworkflows.subworkflow_name + it as `params.subworkflows.subworkflow_name + ``` params { subworkflow_name = "${params.subworkflows.root}/subworkflow_name/main.nf" @@ -88,6 +102,7 @@ They can be found under `modules/` and `bin/`, respecively. To add a new module, these steps: 1. Add a `module_name.nf` file to `modules/` with the following template: + ``` process module_name { input: @@ -107,6 +122,7 @@ process module_name { ``` 2. Add the corresponding Python script to `bin/` with the following template: + ``` from utils import wrap_kwargs @@ -119,7 +135,8 @@ if __name__ == "__main__": ``` 3. Add references to the new module and script under `nextflow.config` to be able to -import them as `params.modules.module_name` and `params.bin.module_name`, respectively. + import them as `params.modules.module_name` and `params.bin.module_name`, respectively. + ``` params { modules { @@ -132,6 +149,7 @@ params { ``` 4. Specify the conda environment for the new module in `nextflow.config`: + ``` params { env { diff --git a/bin/composition_baseline.py b/bin/composition_baseline.py index 5482544..efd4ab5 100644 --- a/bin/composition_baseline.py +++ b/bin/composition_baseline.py @@ -22,7 +22,7 @@ class CompositionBaseline: rep Representation to compute distances on. One of the following: - * `"PCA"`: :meth:`~scanpy.pp.pca` + * `"PCA"`: :func:`~scanpy.pp.pca` * `"SCVI"`: :class:`~scvi.model.SCVI` model_kwargs Keyword arguments to pass into :class:`~scvi.model.SCVI`. Also contains the @@ -30,11 +30,11 @@ class CompositionBaseline: * `"n_dim"`: Number of components to use for PCA * `"clustering_on"`: One of `"leiden"` or `"cluster_key"`. If `"leiden"`, - clusters are computed using :meth:`~scanpy.tl.leiden`. If `"cluster_key"`, + clusters are computed using :func:`~scanpy.tl.leiden`. If `"cluster_key"`, clusters correspond to values in `adata.obs[cluster_key]`. * `"cluster_key"`: Key in `adata.obs` corresponding to cluster information. train_kwargs - Keyword arguments to pass into :meth:`~scvi.model.SCVI.train`. + Keyword arguments to pass into :func:`~scvi.model.SCVI.train`. 
""" def __init__( @@ -50,8 +50,8 @@ def __init__( self.adata = adata.copy() self.batch_key = batch_key self.sample_key = sample_key - self.model_kwargs = model_kwargs if model_kwargs is not None else dict() - self.train_kwargs = train_kwargs if train_kwargs is not None else dict() + self.model_kwargs = model_kwargs if model_kwargs is not None else {} + self.train_kwargs = train_kwargs if train_kwargs is not None else {} self.rep = rep self.rep_key = "X_rep" @@ -63,12 +63,14 @@ def __init__( self.subcluster_key = "leiden_subcluster" def preprocess_data(self): + """Preprocess data for PCA or SCVI.""" if self.rep == "PCA": sc.pp.normalize_total(self.adata, target_sum=1e4) sc.pp.log1p(self.adata) - # TODO: snakemake workflow had hvg selection, should we add it here too? + # TODO: snakemake workflow had hvg selection, should we add it here too? def fit(self): + """Fit PCA or SCVI.""" self.preprocess_data() if self.rep == "PCA": sc.pp.pca(self.adata, n_comps=self.n_dim) # saves "X_pca" in obsm @@ -80,6 +82,7 @@ def fit(self): self.adata.obsm[self.rep_key] = scvi_model.get_latent_representation() def get_cell_representation(self): + """Get cell representations.""" return self.adata.obsm[self.rep_key] def get_local_sample_representation(self): @@ -91,7 +94,7 @@ def get_local_sample_representation(self): Returns ------- Dictionary - Keys are unique labels in `adata.obs[self.cluster_key]`. + Keys are unique labels in `adata.obs[self.cluster_key]`. Values are :class:`pandas.DataFrame` with rows corresponding to samples within that cluster and columns corresponding to the frequency of each sample within each subcluster. @@ -108,7 +111,7 @@ def get_local_sample_representation(self): "If clustering_on is cluster_key, cluster_key must be provided." ) - freqs_all = dict() + freqs_all = {} for unique_cluster in self.adata.obs[self.cluster_key].unique(): cell_is_selected = self.adata.obs[self.cluster_key] == unique_cluster subann = self.adata[cell_is_selected].copy() diff --git a/bin/compute_2dreps.py b/bin/compute_2dreps.py index df04795..a683309 100644 --- a/bin/compute_2dreps.py +++ b/bin/compute_2dreps.py @@ -1,16 +1,15 @@ import pymde import scanpy as sc from anndata import AnnData - from utils import write_h5ad, wrap_kwargs -mde_kwargs = dict( - embedding_dim=2, - constraint=pymde.Standardized(), - repulsive_fraction=0.7, - device="cuda", - n_neighbors=15, -) +mde_kwargs = { + "embedding_dim": 2, + "constraint": pymde.Standardized(), + "repulsive_fraction": 0.7, + "device": "cuda", + "n_neighbors": 15, +} @wrap_kwargs @@ -24,9 +23,9 @@ def compute_2dreps( Iterates over keys in `.uns["latent_keys"]` and computes the following: - * :meth:`~pymde.pca` with `embedding_dim=2` - * :meth:`~tsnecuda.TSNE.fit_transform` with `perplexity=30` - * :meth:`~pymde.preserve_neighbors` with `mde_kwargs` + * :func:`~pymde.pca` with `embedding_dim=2` + * :func:`~tsnecuda.TSNE.fit_transform` with `perplexity=30` + * :func:`~pymde.preserve_neighbors` with `mde_kwargs` Parameters ---------- @@ -46,7 +45,7 @@ def compute_2dreps( """ adata = sc.read_h5ad(adata_in) latent_keys = adata.uns.get("latent_keys", None) - + if latent_keys is None: adata.write_h5ad(adata_out) diff --git a/bin/compute_rf.py b/bin/compute_rf.py index 11f85f8..e3337d4 100644 --- a/bin/compute_rf.py +++ b/bin/compute_rf.py @@ -6,21 +6,28 @@ import numpy as np import pandas as pd import xarray as xr -from scipy.cluster.hierarchy import linkage, to_tree +from scipy.cluster.hierarchy import ClusterNode, linkage, to_tree from scipy.linalg import issymmetric 
from scipy.spatial.distance import squareform +from utils import (determine_if_file_empty, load_config, make_parents, + wrap_kwargs) -from utils import ( - determine_if_file_empty, - load_config, - make_parents, - wrap_kwargs, -) +def linkage_to_ete(linkage_obj: np.ndarray) -> ete3.Tree: + """ + Converts a linkage matrix to an :class:`~ete3.Tree` object. + + Parameters + ---------- + linkage_obj + A linkage matrix. -def linkage_to_ete(linkage_obj): - """Converts to ete3 tree representation.""" - R = to_tree(linkage_obj) + Returns + ------- + :class:`~ete3.Tree` + Tree representation of `linkage_obj`. + """ + R: ClusterNode = to_tree(linkage_obj) root = ete3.Tree() root.dist = 0 root.name = "root" @@ -44,18 +51,39 @@ def linkage_to_ete(linkage_obj): item2node[node.get_id()].add_child(ch) item2node[ch_node_id] = ch to_visit.append(ch_node) + return root -def hierarchical_clustering(dist_mtx, method="ward"): - """Perform hierarchical clustering on squared distance matrix.""" +def hierarchical_clustering(dist_mtx: np.ndarray, method: str = "ward"): + """ + Perform hierarchical clustering on a squared distance matrix. + + Parameters + ---------- + dist_mtx + Squared distance matrix. + method + Method to use for hierarchical clustering. See :func:`~scipy.cluster.hierarchy.linkage`. + + Returns + ------- + :class:`~ete3.Tree` + Tree representation of the hierarchical clustering. + """ assert dist_mtx.shape[0] == dist_mtx.shape[1] + is_symmetric = issymmetric(dist_mtx) has_zero_diag = (dist_mtx.diagonal() == 0).all() if not (is_symmetric and has_zero_diag): - warnings.warn("Distance matrix may be invalid.") + warnings.warn( + "`dist_mtx` is not symmetric and has a non-zero diagonal. " + "The matrix will be symmetrized and the diagonal will be set to zero.", + stacklevel=2, + ) dist_mtx = dist_mtx - np.diag(dist_mtx.diagonal()) dist_mtx = (dist_mtx + dist_mtx.T) / 2.0 + red_mtx = squareform(dist_mtx) z = linkage(red_mtx, method=method) return linkage_to_ete(z) @@ -77,7 +105,7 @@ def compute_rf( distance_matrices Path to the distance matrices. distance_matrices_gt - + Path to the ground truth distance matrices. config_in Path to the configuration file corresponding to `adata_in`. 
table_out @@ -123,6 +151,7 @@ def compute_rf( model_name=model_name ) dists.to_csv(table_out, index=False) + return dists if __name__ == "__main__": diff --git a/bin/fit_mrvi.py b/bin/fit_mrvi.py index d93fcc9..3b7da64 100644 --- a/bin/fit_mrvi.py +++ b/bin/fit_mrvi.py @@ -1,6 +1,5 @@ import mrvi import scanpy as sc - from utils import load_config, make_parents, wrap_kwargs diff --git a/bin/fit_scviv2.py b/bin/fit_scviv2.py index 660a19c..a06e66b 100644 --- a/bin/fit_scviv2.py +++ b/bin/fit_scviv2.py @@ -1,6 +1,5 @@ -import scvi_v2 import scanpy as sc - +import scvi_v2 from utils import load_config, make_parents, wrap_kwargs diff --git a/bin/get_latent_scviv2.py b/bin/get_latent_scviv2.py index e50c0e0..4ac103c 100644 --- a/bin/get_latent_scviv2.py +++ b/bin/get_latent_scviv2.py @@ -1,7 +1,6 @@ import scanpy as sc import scvi_v2 from anndata import AnnData -import numpy as np from utils import load_config, make_parents, wrap_kwargs @@ -55,13 +54,19 @@ def get_latent_scviv2( cell_reps.to_netcdf(cell_representations_out) del cell_reps - cell_dists = model.get_local_sample_distances(adata, keep_cell=False, groupby=labels_key) + cell_dists = model.get_local_sample_distances( + adata, keep_cell=False, groupby=labels_key + ) make_parents(distance_matrices_out) cell_dists.to_netcdf(distance_matrices_out) del cell_dists cell_normalized_dists = model.get_local_sample_distances( - adata, use_mean=False, normalize_distances=True, keep_cell=False, groupby=labels_key + adata, + use_mean=False, + normalize_distances=True, + keep_cell=False, + groupby=labels_key, ) make_parents(normalized_distance_matrices_out) cell_normalized_dists.to_netcdf(normalized_distance_matrices_out) diff --git a/bin/produce_figures_symsim_new.py b/bin/produce_figures_symsim_new.py index a064835..d032521 100644 --- a/bin/produce_figures_symsim_new.py +++ b/bin/produce_figures_symsim_new.py @@ -2,12 +2,11 @@ import argparse import glob import os -import re -import numpy as np import matplotlib.pyplot as plt -import seaborn as sns +import numpy as np import plotnine as p9 +import seaborn as sns import xarray as xr from utils import INCH_TO_CM, load_results diff --git a/bin/scib.py b/bin/scib.py index 1065386..0cf3bfa 100644 --- a/bin/scib.py +++ b/bin/scib.py @@ -5,11 +5,11 @@ import scanpy as sc import scib_metrics as metrics from anndata import AnnData - from utils import load_config, make_parents, wrap_kwargs def categorical_obs(adata: AnnData, key: Optional[str]) -> Optional[np.ndarray]: + """Get categorical observation as an array.""" if key is None: return return np.array(adata.obs[key].astype("category").cat.codes).ravel() @@ -51,7 +51,7 @@ def compute_scib( isolated_label_score = None if labels is not None and batch is not None: isolated_label_score = metrics.isolated_labels(X_latent, labels, batch) - + silhouette_label_score = None if labels is not None: silhouette_label_score = metrics.silhouette_label( @@ -62,7 +62,7 @@ def compute_scib( if sample is not None: silhouette_sample_score = metrics.silhouette_label( X_latent, sample, rescale=True - ) + ) # ( # nmi_kmeans_label_score, # ari_kmeans_label_score, @@ -126,7 +126,9 @@ def compute_scib( all_metrics.update(latent_metrics) df = pd.DataFrame.from_dict( - all_metrics, orient="index", columns=["latent_key", "metric_name", "metric_value"] + all_metrics, + orient="index", + columns=["latent_key", "metric_name", "metric_value"], ) make_parents(table_out) df.to_csv(table_out) diff --git a/bin/vendi.py b/bin/vendi.py index d5b9caf..acae1f1 100644 --- a/bin/vendi.py +++ 
b/bin/vendi.py @@ -1,10 +1,8 @@ +import numpy as np import pandas as pd -import scanpy as sc -from vendi_score import vendi import xarray as xr -import numpy as np - from utils import load_config, make_parents, wrap_kwargs +from vendi_score import vendi @wrap_kwargs diff --git a/conf/datasets/sciplex_A549_significant_all_phases.json b/conf/datasets/sciplex_A549_significant_all_phases.json index 5a47586..fde87da 100644 --- a/conf/datasets/sciplex_A549_significant_all_phases.json +++ b/conf/datasets/sciplex_A549_significant_all_phases.json @@ -12,4 +12,4 @@ "max_epochs": 4 }, "clustering_method": "ward" -} \ No newline at end of file +} diff --git a/conf/datasets/sciplex_K562_significant_all_phases.json b/conf/datasets/sciplex_K562_significant_all_phases.json index 5a47586..fde87da 100644 --- a/conf/datasets/sciplex_K562_significant_all_phases.json +++ b/conf/datasets/sciplex_K562_significant_all_phases.json @@ -12,4 +12,4 @@ "max_epochs": 4 }, "clustering_method": "ward" -} \ No newline at end of file +} diff --git a/conf/datasets/sciplex_MCF7_significant_all_phases.json b/conf/datasets/sciplex_MCF7_significant_all_phases.json index 5a47586..fde87da 100644 --- a/conf/datasets/sciplex_MCF7_significant_all_phases.json +++ b/conf/datasets/sciplex_MCF7_significant_all_phases.json @@ -12,4 +12,4 @@ "max_epochs": 4 }, "clustering_method": "ward" -} \ No newline at end of file +} diff --git a/env/gpu/run_models.yaml b/env/gpu/run_models.yaml index 4e422f4..4782420 100644 --- a/env/gpu/run_models.yaml +++ b/env/gpu/run_models.yaml @@ -13,8 +13,8 @@ dependencies: - torchvision - torchaudio - cudatoolkit=11.6 - - jax - - jaxlib + - jax=0.4.3 + - jaxlib=0.4.3 - cuda-nvcc - xarray=2022.12.0 - dask=2022.12.1 diff --git a/modules/get_latent_scviv2.nf b/modules/get_latent_scviv2.nf index e400777..7e9447a 100644 --- a/modules/get_latent_scviv2.nf +++ b/modules/get_latent_scviv2.nf @@ -19,7 +19,7 @@ process get_latent_scviv2 { --adata_out ${adata_out} \\ --cell_representations_out ${cell_representations_out} \\ --distance_matrices_out ${distance_matrices_out} \\ - --normalized_distance_matrices_out ${normalized_distance_matrices_out} + --normalized_distance_matrices_out ${normalized_distance_matrices_out} """ output: diff --git a/notebooks/sciplex_analysis.py b/notebooks/sciplex_analysis.py index d2c459d..a9bc40b 100644 --- a/notebooks/sciplex_analysis.py +++ b/notebooks/sciplex_analysis.py @@ -1,13 +1,13 @@ # %% import os +import matplotlib.pyplot as plt import numpy as np -import xarray as xr -import seaborn as sns import scanpy as sc import scvi import scvi_v2 -import matplotlib.pyplot as plt +import seaborn as sns +import xarray as xr from matplotlib.patches import Patch from scipy.stats import norm diff --git a/notebooks/sciplex_get_significant_product_dose.R b/notebooks/sciplex_get_significant_product_dose.R index 357499b..2f6ade4 100644 --- a/notebooks/sciplex_get_significant_product_dose.R +++ b/notebooks/sciplex_get_significant_product_dose.R @@ -24,17 +24,17 @@ for (cell_line in names(sciPlex_cds.list)) { Var1 ~ Var2, value.var = "Freq" ) - + weighted.mat = product_cluster_mat.list[[cell_line]] ntc.counts = product_cluster_mat.list[[cell_line]]["Vehicle_0", ] - + cluster.enrichment.df[[cell_line]] = do.call(rbind, lapply(rownames(weighted.mat), function(product_dose) { do.call(rbind, lapply(1:ncol(weighted.mat), function(Cluster) { test = fisher.test(cbind(c(weighted.mat[product_dose, Cluster], sum( weighted.mat[product_dose,-Cluster] )), c(ntc.counts[Cluster], sum(ntc.counts[-Cluster])))) - + 
data.frame( product_dose = product_dose, Cluster = Cluster, @@ -43,22 +43,22 @@ for (cell_line in names(sciPlex_cds.list)) { ) })) })) - + cluster.enrichment.df[[cell_line]]$q.value = p.adjust(cluster.enrichment.df[[cell_line]]$p.value, "BH") - + cluster.enrichment.df[[cell_line]]$log2.odds = with(cluster.enrichment.df[[cell_line]], ifelse(odds.ratio == 0,-5, round(log2(odds.ratio), 2))) - + cluster.enrichment.df[[cell_line]]$product_name <- sapply(cluster.enrichment.df[[cell_line]]$product_dose, function(x) { stringr::str_split(x, pattern = "_")[[1]][1] }) - + cluster.enrichment.df[[cell_line]]$dose <- sapply(cluster.enrichment.df[[cell_line]]$product_dose, function(x) { stringr::str_split(x, pattern = "_")[[1]][2] }) - + } @@ -73,12 +73,12 @@ for (cell_line in names(sciPlex_cds.list)) { log2.odds > 2.5) %>% distinct(product_dose) )$product_dose - + print(length(significant_product_dose_combinations.list[[cell_line]])) - + } # Save to CSV for (cell_line in names(sciPlex_cds.list)) { write.table(significant_product_dose_combinations.list[[cell_line]], file = paste("notebooks/output/", cell_line, ".csv", sep = ""), row.names = FALSE, col.names = FALSE, sep=",") -} \ No newline at end of file +} diff --git a/notebooks/sciplex_preprocess.py b/notebooks/sciplex_preprocess.py index 89b81da..76c0de8 100644 --- a/notebooks/sciplex_preprocess.py +++ b/notebooks/sciplex_preprocess.py @@ -1,7 +1,7 @@ # %% +import numpy as np import pandas as pd import scanpy as sc -import numpy as np # %% adata = sc.read( From 1685e6d28b0cce2bd337d4daedb6bbc204bac07c Mon Sep 17 00:00:00 2001 From: Martin Kim Date: Fri, 17 Feb 2023 14:54:56 -0800 Subject: [PATCH 3/3] pre-commit changes --- bin/compute_2dreps.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bin/compute_2dreps.py b/bin/compute_2dreps.py index a683309..0ed455c 100644 --- a/bin/compute_2dreps.py +++ b/bin/compute_2dreps.py @@ -1,7 +1,7 @@ import pymde import scanpy as sc from anndata import AnnData -from utils import write_h5ad, wrap_kwargs +from utils import wrap_kwargs, write_h5ad mde_kwargs = { "embedding_dim": 2, @@ -59,7 +59,9 @@ def compute_2dreps( adata.obsm[latent_pca_key] = latent_pca latent_pca_keys.append(latent_pca_key) - latent_mde = pymde.preserve_neighbors(latent, **mde_kwargs).embed().cpu().numpy() + latent_mde = ( + pymde.preserve_neighbors(latent, **mde_kwargs).embed().cpu().numpy() + ) latent_mde_key = f"{latent_key}_mde" adata.obsm[latent_mde_key] = latent_mde latent_mde_keys.append(latent_mde_key)
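
For reference, a minimal usage sketch of the `CompositionBaseline` API documented in patch 1/3. This is illustrative only and not part of the patches: the input path and the `batch_key`/`sample_key`/`cluster_key` values are placeholders, and `bin/` is assumed to be on the Python path so the module can be imported directly.

```
# Illustrative sketch, not part of the patch series.
# Assumes bin/ is on sys.path and that the obs keys below exist in the AnnData.
import scanpy as sc

from composition_baseline import CompositionBaseline

adata = sc.read_h5ad("data/example.h5ad")  # hypothetical input file

baseline = CompositionBaseline(
    adata,
    batch_key="site",      # placeholder batch/site column
    sample_key="sample",   # placeholder sample column
    rep="PCA",
    model_kwargs={
        "n_dim": 50,
        "clustering_on": "cluster_key",
        "cluster_key": "cell_type",  # placeholder cluster column
    },
    train_kwargs={},
)

# Normalizes/log1p's the data and stores the PCA representation in .obsm["X_rep"].
baseline.fit()

# Dict keyed by cluster label; values are per-sample subcluster frequency DataFrames.
freqs = baseline.get_local_sample_representation()

# Array of shape (n_cells, n_samples, n_samples); cells in the same cluster
# share the same sample-sample distance matrix.
dists = baseline.get_local_sample_distances()
```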