
Commit b384c9b

Merge pull request #48 from scverse-bot/template-update-YosefLab-PopV-v0.4.0

Update template to v0.4.0
canergen authored Dec 13, 2024
2 parents c453b64 + ce57fa9 commit b384c9b
Showing 26 changed files with 510 additions and 150 deletions.
7 changes: 4 additions & 3 deletions .cruft.json
@@ -1,7 +1,7 @@
 {
     "template": "https://github.com/scverse/cookiecutter-scverse",
-    "commit": "8e96abb5c3e2d5078c44713958da672711cf2a48",
-    "checkout": null,
+    "commit": "87a407a65408d75a949c0b54b19fd287475a56f8",
+    "checkout": "v0.4.0",
     "context": {
         "cookiecutter": {
             "project_name": "PopV",
@@ -13,7 +13,8 @@
     "project_repo": "https://github.com/YosefLab/PopV.git",
     "license": "MIT License",
     "_copy_without_render": [
-        ".github/workflows/**.yaml",
+        ".github/workflows/build.yaml",
+        ".github/workflows/test.yaml",
         "docs/_templates/autosummary/**.rst"
     ],
     "_render_devdocs": false,
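For context, `.cruft.json` is how cruft tracks which template revision a project is pinned to; the new pin can be read directly from the file. A minimal standalone sketch (not part of this diff):

```python
import json

# Read the cruft metadata to see which template commit/tag the project is pinned to.
with open(".cruft.json") as f:
    cruft = json.load(f)

print(cruft["template"])   # template repository URL
print(cruft["commit"])     # pinned template commit (87a407a... after this change)
print(cruft["checkout"])   # pinned ref ("v0.4.0" after this change)
```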
1 change: 1 addition & 0 deletions .gitignore
@@ -19,6 +19,7 @@ __pycache__/
 /.pytest_cache/
 /.cache/
 /data/
+/node_modules/

 # docs
 /docs/generated/
34 changes: 34 additions & 0 deletions .pre-commit-config.yaml.rej
@@ -0,0 +1,34 @@
diff a/.pre-commit-config.yaml b/.pre-commit-config.yaml (rejected hunks)
@@ -6,29 +6,18 @@ default_stages:
     - push
 minimum_pre_commit_version: 2.16.0
 repos:
-    - repo: https://github.com/psf/black
-      rev: "24.4.2"
-      hooks:
-          - id: black
-    - repo: https://github.com/asottile/blacken-docs
-      rev: 1.16.0
-      hooks:
-          - id: blacken-docs
     - repo: https://github.com/pre-commit/mirrors-prettier
       rev: v4.0.0-alpha.8
       hooks:
           - id: prettier
-            # Newer versions of node don't work on systems that have an older version of GLIBC
-            # (in particular Ubuntu 18.04 and Centos 7)
-            # EOL of Centos 7 is in 2024-06, we can probably get rid of this then.
-            # See https://github.com/scverse/cookiecutter-scverse/issues/143 and
-            # https://github.com/jupyterlab/jupyterlab/issues/12675
-            language_version: "17.9.1"
     - repo: https://github.com/astral-sh/ruff-pre-commit
       rev: v0.4.4
       hooks:
           - id: ruff
+            types_or: [python, pyi, jupyter]
             args: [--fix, --exit-non-zero-on-fix]
+          - id: ruff-format
+            types_or: [python, pyi, jupyter]
     - repo: https://github.com/pre-commit/pre-commit-hooks
       rev: v4.6.0
       hooks:
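Because this hunk was rejected, it has to be merged into `.pre-commit-config.yaml` by hand; afterwards the replacement of black/blacken-docs by ruff and ruff-format can be exercised locally. A small sketch driving the standard `pre-commit` CLI from Python (assumes pre-commit is installed):

```python
import subprocess

# Install the git hook scripts, then run every configured hook across the repo.
# A non-zero exit from "run" usually just means hooks rewrote some files.
subprocess.run(["pre-commit", "install"], check=True)
subprocess.run(["pre-commit", "run", "--all-files"], check=False)
```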
10 changes: 10 additions & 0 deletions README.md.rej
@@ -0,0 +1,10 @@
diff a/README.md b/README.md (rejected hunks)
@@ -17,7 +17,7 @@ Please refer to the [documentation][link-docs]. In particular, the

## Installation

-You need to have Python 3.9 or newer installed on your system. If you don't have
+You need to have Python 3.10 or newer installed on your system. If you don't have
Python installed, we recommend installing [Mambaforge](https://github.com/conda-forge/miniforge#mambaforge).

There are several alternative options to install PopV:
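Since this hunk raises the floor to Python 3.10, an explicit runtime guard is one way to fail early on older interpreters; a minimal sketch (hypothetical, not part of the package):

```python
import sys

# The template update raises PopV's minimum supported Python to 3.10.
if sys.version_info < (3, 10):
    raise RuntimeError(f"PopV requires Python 3.10+, found {sys.version.split()[0]}")
```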
10 changes: 5 additions & 5 deletions docs/conf.py
@@ -1,5 +1,5 @@
 # Configuration file for the Sphinx documentation builder.
-#
+
 # This file only contains a selection of the most common options. For a full
 # list see the documentation:
 # https://www.sphinx-doc.org/en/master/usage/configuration.html
@@ -45,10 +45,10 @@

 html_context = {
     "display_github": True,  # Integrate GitHub
-    "github_user": "cane11",  # Username
-    "github_repo": project_name,  # Repo name
-    "github_version": "main",  # Version
-    "conf_py_path": "/docs/",  # Path in the checkout to the docs root
+    "github_user": "cane11",
+    "github_repo": "https://github.com/YosefLab/PopV.git",
+    "github_version": "main",
+    "conf_py_path": "/docs/",
 }

 # -- General configuration ---------------------------------------------------
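One thing worth flagging in review: `github_repo` in `html_context` is conventionally the bare repository name, not a full clone URL. A sketch of the usual shape (values here are assumptions based on common Sphinx theme usage, not this PR's final config):

```python
# Hypothetical conf.py fragment; "display_github" themes typically join
# github_user/github_repo/github_version into the edit-on-GitHub link.
html_context = {
    "display_github": True,
    "github_user": "YosefLab",  # repository owner
    "github_repo": "PopV",      # bare repository name, not a URL
    "github_version": "main",   # branch used for edit links
    "conf_py_path": "/docs/",   # docs root inside the checkout
}
```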
5 changes: 3 additions & 2 deletions docs/contributing.md
@@ -51,7 +51,7 @@ and [prettier][prettier-editors].
 ## Writing tests

 ```{note}
-Remember to first install the package with `pip install '-e[dev,test]'`
+Remember to first install the package with `pip install -e '.[dev,test]'`
 ```

 This package uses the [pytest][] for automated testing. Please [write tests][scanpy-test-docs] for every function added
@@ -93,7 +93,7 @@ Before making a release, you need to update the version number in the `pyproject
 > Additional labels for pre-release and build metadata are available as extensions to the MAJOR.MINOR.PATCH format.
 Once you are done, commit and push your changes and navigate to the "Releases" page of this project on GitHub.
-Specify `vX.X.X` as a tag name and create a release. For more information, see [managing Github releases][]. This will automatically create a git tag and trigger a Github workflow that creates a release on PyPI.
+Specify `vX.X.X` as a tag name and create a release. For more information, see [managing GitHub releases][]. This will automatically create a git tag and trigger a Github workflow that creates a release on PyPI.

## Writing documentation

@@ -157,3 +157,4 @@ open _build/html/index.html
 [numpydoc]: https://numpydoc.readthedocs.io/en/latest/format.html
 [sphinx autodoc typehints]: https://github.com/tox-dev/sphinx-autodoc-typehints
 [pypi]: https://pypi.org/
+[managing GitHub releases]: https://docs.github.com/en/repositories/releasing-projects-on-github/managing-releases-in-a-repository
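As a quick sanity check before tagging `vX.X.X`, the declared version can be read programmatically; a minimal sketch assuming Python 3.11+ for `tomllib` and a static `[project] version` field in `pyproject.toml`:

```python
import tomllib

# Confirm the version bump landed before creating the GitHub release tag.
with open("pyproject.toml", "rb") as f:  # tomllib requires binary mode
    pyproject = tomllib.load(f)

print(pyproject["project"]["version"])  # should match the vX.X.X tag
```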
4 changes: 3 additions & 1 deletion popv/_settings.py
@@ -101,7 +101,9 @@ def verbosity(self, level: str | int):
         console = Console(force_terminal=True)
         if console.is_jupyter is True:
             console.is_jupyter = False
-        ch = RichHandler(level=level, show_path=False, console=console, show_time=False)
+        ch = RichHandler(
+            level=level, show_path=False, console=console, show_time=False
+        )
         formatter = logging.Formatter("%(message)s")
         ch.setFormatter(formatter)
         popv_logger.addHandler(ch)
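The reflowed call is behavior-neutral. For reference, a self-contained sketch of the same RichHandler wiring outside PopV's settings object:

```python
import logging

from rich.console import Console
from rich.logging import RichHandler

# Force terminal rendering so colored output survives redirection, as in popv._settings.
console = Console(force_terminal=True)
if console.is_jupyter:
    console.is_jupyter = False  # avoid notebook-specific rendering of log records

logger = logging.getLogger("demo")
handler = RichHandler(level=logging.INFO, show_path=False, console=console, show_time=False)
handler.setFormatter(logging.Formatter("%(message)s"))
logger.addHandler(handler)
logger.setLevel(logging.INFO)
logger.info("rich logging configured")
```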
22 changes: 17 additions & 5 deletions popv/_utils.py
@@ -49,7 +49,9 @@ def subsample_dataset(
         if labels_counts[label] < n_samples_per_label:
             sample_idx.append(label_locs)
         else:
-            label_subset = np.random.choice(label_locs, n_samples_per_label, replace=False)
+            label_subset = np.random.choice(
+                label_locs, n_samples_per_label, replace=False
+            )
             sample_idx.append(label_subset)
     sample_idx = np.concatenate(sample_idx)
     return adata.obs_names[sample_idx]
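The wrapped `np.random.choice` call is formatting-only; the surrounding logic caps each label at `n_samples_per_label` while keeping rare labels intact. A standalone rerun of that pattern on synthetic labels:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
labels = pd.Series(rng.choice(["B cell", "T cell", "NK cell"], size=1000, p=[0.6, 0.3, 0.1]))
n_samples_per_label = 100

# Subsample each label without replacement, keeping labels that are already small.
sample_idx = []
for label, count in labels.value_counts().items():
    label_locs = np.where(labels == label)[0]
    if count < n_samples_per_label:
        sample_idx.append(label_locs)
    else:
        sample_idx.append(rng.choice(label_locs, n_samples_per_label, replace=False))
sample_idx = np.concatenate(sample_idx)

print(labels.iloc[sample_idx].value_counts())  # every label capped at 100
```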
@@ -79,7 +81,9 @@ def check_genes_is_subset(ref_genes, query_genes):
         logging.info("All ref genes are in query dataset. Can use pretrained models.")
         is_subset = True
     else:
-        logging.info("Not all reference genes are in query dataset. Set 'prediction_mode' to 'retrain'.")
+        logging.info(
+            "Not all reference genes are in query dataset. Set 'prediction_mode' to 'retrain'."
+        )
         is_subset = False
     return is_subset

@@ -95,7 +99,9 @@ def make_batch_covariate(adata, batch_keys, new_batch_key):
     batch_keys
         List of keys in adat.obs corresponding to batches
     """
-    adata.obs[new_batch_key] = adata.obs[batch_keys].astype(str).sum(1).astype("category")
+    adata.obs[new_batch_key] = (
+        adata.obs[batch_keys].astype(str).sum(1).astype("category")
+    )


 def calculate_depths(g):
@@ -142,7 +148,9 @@ def make_ontology_dag(obofile, lowercase=False):
     """
     co = obonet.read_obo(obofile, encoding="utf-8")
     id_to_name = {id_: data.get("name") for id_, data in co.nodes(data=True)}
-    name_to_id = {data["name"]: id_ for id_, data in co.nodes(data=True) if ("name" in data)}
+    name_to_id = {
+        data["name"]: id_ for id_, data in co.nodes(data=True) if ("name" in data)
+    }

     # get all node ids that are celltypes (start with CL)
     cl_ids = {id_: True for _, id_ in name_to_id.items() if id_.startswith("CL:")}
@@ -160,7 +168,11 @@
     for node in co.nodes():
         if node in cl_ids:
             for child, parent, key in co.out_edges(node, keys=True):
-                if child.startswith("CL:") and parent.startswith("CL:") and key == "is_a":
+                if (
+                    child.startswith("CL:")
+                    and parent.startswith("CL:")
+                    and key == "is_a"
+                ):
                     childname = id_to_name[child]
                     parentname = id_to_name[parent]
                     g.add_edge(childname, parentname, key=key)
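The reformatted condition keeps the same logic: read the OBO file, then retain only `is_a` edges between Cell Ontology (CL) terms. A condensed sketch of that traversal, using a hypothetical local `cl.obo` path:

```python
import networkx as nx
import obonet

# obonet returns a networkx MultiDiGraph whose nodes are term IDs like "CL:0000084".
co = obonet.read_obo("cl.obo", encoding="utf-8")
id_to_name = {id_: data.get("name") for id_, data in co.nodes(data=True)}

g = nx.MultiDiGraph()
for node in co.nodes():
    if not node.startswith("CL:"):
        continue  # restrict the DAG to cell-type terms
    for child, parent, key in co.out_edges(node, keys=True):
        # Keep only "is_a" edges that stay within the CL namespace.
        if child.startswith("CL:") and parent.startswith("CL:") and key == "is_a":
            g.add_edge(id_to_name[child], id_to_name[parent], key=key)

print(g.number_of_nodes(), g.number_of_edges())
```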
16 changes: 12 additions & 4 deletions popv/algorithms/_bbknn.py
@@ -86,7 +86,9 @@ def predict(self, adata):
             ]
         )
         if smallest_neighbor_graph < 15:
-            logging.warning(f"BBKNN found only {smallest_neighbor_graph} neighbors. Reduced neighbors in KNN.")
+            logging.warning(
+                f"BBKNN found only {smallest_neighbor_graph} neighbors. Reduced neighbors in KNN."
+            )
             self.classifier_dict["n_neighbors"] = smallest_neighbor_graph

         knn = KNeighborsClassifier(metric="precomputed", **self.classifier_dict)
@@ -95,9 +97,15 @@ def predict(self, adata):
         adata.obs[self.result_key] = knn.predict(test_distances)

         if adata.uns["_return_probabilities"]:
-            adata.obs[self.result_key + "_probabilities"] = np.max(knn.predict_proba(test_distances), axis=1)
+            adata.obs[self.result_key + "_probabilities"] = np.max(
+                knn.predict_proba(test_distances), axis=1
+            )

     def compute_embedding(self, adata):
         if adata.uns["_compute_embedding"]:
-            logging.info(f'Saving UMAP of bbknn results to adata.obs["{self.embedding_key}"]')
-            adata.obsm[self.embedding_key] = sc.tl.umap(adata, copy=True, **self.embedding_dict).obsm["X_umap"]
+            logging.info(
+                f'Saving UMAP of bbknn results to adata.obs["{self.embedding_key}"]'
+            )
+            adata.obsm[self.embedding_key] = sc.tl.umap(
+                adata, copy=True, **self.embedding_dict
+            ).obsm["X_umap"]
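The wrapped calls are formatting-only; the underlying pattern is a KNN classifier over a precomputed distance matrix. A toy sketch of that pattern on synthetic data (not PopV's BBKNN graph):

```python
import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import KNeighborsClassifier

rng = np.random.default_rng(0)
train = rng.normal(size=(100, 10))
test = rng.normal(size=(20, 10))
labels = rng.integers(0, 3, size=100)

# With metric="precomputed", fit/predict consume distance matrices, not features.
knn = KNeighborsClassifier(metric="precomputed", n_neighbors=15)
knn.fit(pairwise_distances(train, train), labels)

test_distances = pairwise_distances(test, train)  # shape (n_test, n_train)
pred = knn.predict(test_distances)
confidence = np.max(knn.predict_proba(test_distances), axis=1)  # per-cell max probability
```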
8 changes: 6 additions & 2 deletions popv/algorithms/_celltypist.py
@@ -63,12 +63,16 @@ def predict(self, adata):
             **self.classifier_dict,
         )
         out_column = (
-            "majority_voting" if "majority_voting" in predictions.predicted_labels.columns else "predicted_labels"
+            "majority_voting"
+            if "majority_voting" in predictions.predicted_labels.columns
+            else "predicted_labels"
         )

         adata.obs[self.result_key] = predictions.predicted_labels[out_column]
         if adata.uns["_return_probabilities"]:
-            adata.obs[self.result_key + "_probabilities"] = predictions.probability_matrix.max(axis=1).values
+            adata.obs[self.result_key + "_probabilities"] = (
+                predictions.probability_matrix.max(axis=1).values
+            )

     def compute_embedding(self, adata):
         pass
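For orientation, the surrounding `predict()` drives CellTypist roughly as in the sketch below. The dataset, normalization, and model name are placeholders, and exact keyword support should be checked against the installed celltypist version:

```python
import celltypist
import scanpy as sc
from celltypist import models

# Placeholder query data, normalized the way CellTypist expects
# (10,000 counts per cell, then log1p); not PopV's own preprocessing.
adata = sc.datasets.pbmc3k()
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

models.download_models(model=["Immune_All_Low.pkl"])  # fetch a published model
predictions = celltypist.annotate(adata, model="Immune_All_Low.pkl", majority_voting=True)

# With majority voting enabled, predicted_labels gains a "majority_voting"
# column, which the code above prefers over raw per-cell calls.
out_column = (
    "majority_voting"
    if "majority_voting" in predictions.predicted_labels.columns
    else "predicted_labels"
)
adata.obs["celltypist_prediction"] = predictions.predicted_labels[out_column]
adata.obs["celltypist_confidence"] = predictions.probability_matrix.max(axis=1).values
```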
16 changes: 12 additions & 4 deletions popv/algorithms/_harmony.py
@@ -61,7 +61,9 @@ def __init__(
     def compute_integration(self, adata):
         logging.info("Integrating data with harmony")

-        adata.obsm["X_pca_harmony"] = harmonize(adata.obsm["X_pca"], adata.obs, batch_key=self.batch_key)
+        adata.obsm["X_pca_harmony"] = harmonize(
+            adata.obsm["X_pca"], adata.obs, batch_key=self.batch_key
+        )

     def predict(self, adata, result_key="popv_knn_on_harmony_prediction"):
         logging.info(f'Saving knn on harmony results to adata.obs["{result_key}"]')
@@ -75,7 +77,9 @@ def predict(self, adata, result_key="popv_knn_on_harmony_prediction"):
                 n_neighbors=self.classifier_dict["n_neighbors"],
                 parallel_batch_queries=True,
             ),
-            KNeighborsClassifier(metric="precomputed", weights=self.classifier_dict["weights"]),
+            KNeighborsClassifier(
+                metric="precomputed", weights=self.classifier_dict["weights"]
+            ),
         )

         knn.fit(train_X, train_Y)
@@ -91,6 +95,10 @@ def predict(self, adata, result_key="popv_knn_on_harmony_prediction"):

     def compute_embedding(self, adata):
         if adata.uns["_compute_embedding"]:
-            logging.info(f'Saving UMAP of harmony results to adata.obs["{self.embedding_key}"]')
+            logging.info(
+                f'Saving UMAP of harmony results to adata.obs["{self.embedding_key}"]'
+            )
             sc.pp.neighbors(adata, use_rep="X_pca_harmony")
-            adata.obsm[self.embedding_key] = sc.tl.umap(adata, copy=True, **self.embedding_dict).obsm["X_umap"]
+            adata.obsm[self.embedding_key] = sc.tl.umap(
+                adata, copy=True, **self.embedding_dict
+            ).obsm["X_umap"]
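The integration step here reduces to one `harmonize()` call on the PCA embedding, followed downstream by neighbors and UMAP on the corrected space. A sketch on synthetic data, assuming harmony-pytorch's `harmonize` and a preexisting `X_pca`:

```python
import anndata as ad
import numpy as np
import scanpy as sc
from harmony import harmonize  # harmony-pytorch

rng = np.random.default_rng(0)
adata = ad.AnnData(X=rng.normal(size=(300, 50)).astype(np.float32))
adata.obsm["X_pca"] = rng.normal(size=(300, 20)).astype(np.float32)
adata.obs["batch"] = np.where(rng.random(300) < 0.5, "b0", "b1")  # two synthetic batches

# Batch-correct the PCA embedding, mirroring compute_integration().
adata.obsm["X_pca_harmony"] = harmonize(adata.obsm["X_pca"], adata.obs, batch_key="batch")

# compute_embedding() then builds a neighbor graph and UMAP on the corrected space.
sc.pp.neighbors(adata, use_rep="X_pca_harmony")
adata.obsm["X_umap_harmony"] = sc.tl.umap(adata, copy=True).obsm["X_umap"]
```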
38 changes: 27 additions & 11 deletions popv/algorithms/_onclass.py
@@ -106,10 +106,12 @@ def compute_integration(self, adata):
         pass

     def predict(self, adata):
-        logging.info(f'Computing Onclass. Storing prediction in adata.obs["{self.result_key}"]')
-        adata.obs.loc[adata.obs["_dataset"] == "query", self.cell_ontology_obs_key] = adata.uns[
-            "unknown_celltype_label"
-        ]
+        logging.info(
+            f'Computing Onclass. Storing prediction in adata.obs["{self.result_key}"]'
+        )
+        adata.obs.loc[adata.obs["_dataset"] == "query", self.cell_ontology_obs_key] = (
+            adata.uns["unknown_celltype_label"]
+        )

         train_idx = adata.obs["_dataset"] == "ref"
@@ -127,10 +129,14 @@ def predict(self, adata):
         cl_ontology_file = adata.uns["_cl_ontology_file"]
         nlp_emb_file = adata.uns["_nlp_emb_file"]

-        celltype_dict, clid_2_name = self.make_celltype_to_cell_ontology_id_dict(cl_obo_file)
+        celltype_dict, clid_2_name = self.make_celltype_to_cell_ontology_id_dict(
+            cl_obo_file
+        )
         self.make_cell_ontology_id(adata, celltype_dict, self.cell_ontology_obs_key)

-        train_model = OnClassModel(cell_type_nlp_emb_file=nlp_emb_file, cell_type_network_file=cl_ontology_file)
+        train_model = OnClassModel(
+            cell_type_nlp_emb_file=nlp_emb_file, cell_type_network_file=cl_ontology_file
+        )

         if adata.uns["_save_path_trained_models"] is not None:
             model_path = adata.uns["_save_path_trained_models"] + "/OnClass"
@@ -175,13 +181,17 @@ def predict(self, adata):
         )

         if adata.uns["_prediction_mode"] == "fast":
-            onclass_seen = np.argmax(train_model.model.predict(corr_test_feature), axis=1)
+            onclass_seen = np.argmax(
+                train_model.model.predict(corr_test_feature), axis=1
+            )
             pred_label = [train_model.i2co[ind] for ind in onclass_seen]
             pred_label_str = [clid_2_name[ind] for ind in pred_label]
             adata.obs[self.result_key] = pred_label_str
             adata.obs[self.seen_result_key] = pred_label_str
         else:
-            onclass_pred = train_model.Predict(corr_test_feature, use_normalize=False, refine=True, unseen_ratio=-1.0)
+            onclass_pred = train_model.Predict(
+                corr_test_feature, use_normalize=False, refine=True, unseen_ratio=-1.0
+            )
             pred_label = [train_model.i2co[ind] for ind in onclass_pred[2]]
             pred_label_str = [clid_2_name[ind] for ind in pred_label]
             adata.obs[self.result_key] = pred_label_str
@@ -192,9 +202,15 @@ def predict(self, adata):
             adata.obs[self.seen_result_key] = pred_label_str

         if adata.uns["_return_probabilities"]:
-            adata.obs[self.result_key + "_probabilities"] = np.max(onclass_pred[1], axis=1) / onclass_pred[1].sum(1)
-            adata.obsm["onclass_probabilities"] = onclass_pred[1] / onclass_pred[1].sum(1, keepdims=True)
-            adata.obs["popv_onclass_seen" + "_probabilities"] = np.max(onclass_pred[0], axis=1)
+            adata.obs[self.result_key + "_probabilities"] = np.max(
+                onclass_pred[1], axis=1
+            ) / onclass_pred[1].sum(1)
+            adata.obsm["onclass_probabilities"] = onclass_pred[1] / onclass_pred[
+                1
+            ].sum(1, keepdims=True)
+            adata.obs["popv_onclass_seen" + "_probabilities"] = np.max(
+                onclass_pred[0], axis=1
+            )

     def compute_embedding(self, adata):
         return None
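The reflowed probability block normalizes the OnClass score matrix row-wise and keeps each cell's maximum; in plain numpy the same arithmetic is:

```python
import numpy as np

rng = np.random.default_rng(0)
scores = rng.random((5, 8))  # stand-in for onclass_pred[1]: cells x cell types

row_sums = scores.sum(axis=1, keepdims=True)
probabilities = scores / row_sums           # each row now sums to 1
max_confidence = probabilities.max(axis=1)  # equals np.max(scores, axis=1) / scores.sum(1)

assert np.allclose(probabilities.sum(axis=1), 1.0)
```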
18 changes: 14 additions & 4 deletions popv/algorithms/_rf.py
@@ -48,13 +48,19 @@ def compute_integration(self, adata):
         pass

     def predict(self, adata):
-        logging.info(f'Computing random forest classifier. Storing prediction in adata.obs["{self.result_key}"]')
+        logging.info(
+            f'Computing random forest classifier. Storing prediction in adata.obs["{self.result_key}"]'
+        )

         test_x = adata.layers[self.layers_key] if self.layers_key else adata.X

         if adata.uns["_prediction_mode"] == "retrain":
             train_idx = adata.obs["_ref_subsample"]
-            train_x = adata[train_idx].layers[self.layers_key] if self.layers_key else adata[train_idx].X
+            train_x = (
+                adata[train_idx].layers[self.layers_key]
+                if self.layers_key
+                else adata[train_idx].X
+            )
             train_y = adata[train_idx].obs[self.labels_key].to_numpy()
             rf = RandomForestClassifier(**self.classifier_dict)
             rf.fit(train_x, train_y)
@@ -67,10 +73,14 @@ def predict(self, adata):
                 ),
             )
         else:
-            rf = pickle.load(open(adata.uns["_save_path_trained_models"] + "rf_classifier.pkl", "rb"))
+            rf = pickle.load(
+                open(adata.uns["_save_path_trained_models"] + "rf_classifier.pkl", "rb")
+            )
         adata.obs[self.result_key] = rf.predict(test_x)
         if adata.uns["_return_probabilities"]:
-            adata.obs[self.result_key + "_probabilities"] = np.max(rf.predict_proba(test_x), axis=1)
+            adata.obs[self.result_key + "_probabilities"] = np.max(
+                rf.predict_proba(test_x), axis=1
+            )

     def compute_embedding(self, adata):
         pass
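The random-forest path either trains and pickles a classifier ("retrain" mode) or reloads a pretrained one, then stores the max class probability per cell. A compact standalone sketch of the same flow (synthetic data, hypothetical file path):

```python
import pickle

import numpy as np
from sklearn.ensemble import RandomForestClassifier

rng = np.random.default_rng(0)
train_x, train_y = rng.normal(size=(200, 30)), rng.integers(0, 4, size=200)
test_x = rng.normal(size=(50, 30))

# "retrain" mode: fit and persist, mirroring popv/algorithms/_rf.py.
rf = RandomForestClassifier(n_estimators=100)
rf.fit(train_x, train_y)
with open("rf_classifier.pkl", "wb") as f:
    pickle.dump(rf, f)

# Pretrained mode: reload and predict with a per-cell confidence score.
with open("rf_classifier.pkl", "rb") as f:
    rf = pickle.load(f)
prediction = rf.predict(test_x)
confidence = np.max(rf.predict_proba(test_x), axis=1)
```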
(13 of 26 changed files shown above; the remaining files were not loaded on this page.)
