diff --git a/README.md.rej b/README.md.rej
index 1446723..9d5babc 100644
--- a/README.md.rej
+++ b/README.md.rej
@@ -1,10 +1,10 @@
 diff a/README.md b/README.md	(rejected hunks)
 @@ -17,7 +17,7 @@ Please refer to the [documentation][link-docs]. In particular, the
- 
+
 ## Installation
- 
+
 -You need to have Python 3.9 or newer installed on your system. If you don't have
 +You need to have Python 3.10 or newer installed on your system. If you don't have
  Python installed, we recommend installing [Mambaforge](https://github.com/conda-forge/miniforge#mambaforge).
- 
+
 There are several alternative options to install PopV:
diff --git a/popv/_settings.py b/popv/_settings.py
index 06bd242..b587517 100644
--- a/popv/_settings.py
+++ b/popv/_settings.py
@@ -101,7 +101,9 @@ def verbosity(self, level: str | int):
             console = Console(force_terminal=True)
             if console.is_jupyter is True:
                 console.is_jupyter = False
-            ch = RichHandler(level=level, show_path=False, console=console, show_time=False)
+            ch = RichHandler(
+                level=level, show_path=False, console=console, show_time=False
+            )
             formatter = logging.Formatter("%(message)s")
             ch.setFormatter(formatter)
             popv_logger.addHandler(ch)
diff --git a/popv/_utils.py b/popv/_utils.py
index c6fa62c..98ac74a 100644
--- a/popv/_utils.py
+++ b/popv/_utils.py
@@ -49,7 +49,9 @@ def subsample_dataset(
         if labels_counts[label] < n_samples_per_label:
             sample_idx.append(label_locs)
         else:
-            label_subset = np.random.choice(label_locs, n_samples_per_label, replace=False)
+            label_subset = np.random.choice(
+                label_locs, n_samples_per_label, replace=False
+            )
             sample_idx.append(label_subset)
     sample_idx = np.concatenate(sample_idx)
     return adata.obs_names[sample_idx]
@@ -79,7 +81,9 @@ def check_genes_is_subset(ref_genes, query_genes):
         logging.info("All ref genes are in query dataset. Can use pretrained models.")
         is_subset = True
     else:
-        logging.info("Not all reference genes are in query dataset. Set 'prediction_mode' to 'retrain'.")
+        logging.info(
+            "Not all reference genes are in query dataset. Set 'prediction_mode' to 'retrain'."
+        )
         is_subset = False
     return is_subset
@@ -95,7 +99,9 @@ def make_batch_covariate(adata, batch_keys, new_batch_key):
     batch_keys
         List of keys in adat.obs corresponding to batches
     """
-    adata.obs[new_batch_key] = adata.obs[batch_keys].astype(str).sum(1).astype("category")
+    adata.obs[new_batch_key] = (
+        adata.obs[batch_keys].astype(str).sum(1).astype("category")
+    )


 def calculate_depths(g):
@@ -142,7 +148,9 @@ def make_ontology_dag(obofile, lowercase=False):
     """
     co = obonet.read_obo(obofile, encoding="utf-8")
     id_to_name = {id_: data.get("name") for id_, data in co.nodes(data=True)}
-    name_to_id = {data["name"]: id_ for id_, data in co.nodes(data=True) if ("name" in data)}
+    name_to_id = {
+        data["name"]: id_ for id_, data in co.nodes(data=True) if ("name" in data)
+    }

     # get all node ids that are celltypes (start with CL)
     cl_ids = {id_: True for _, id_ in name_to_id.items() if id_.startswith("CL:")}
@@ -160,7 +168,11 @@ def make_ontology_dag(obofile, lowercase=False):
     for node in co.nodes():
         if node in cl_ids:
             for child, parent, key in co.out_edges(node, keys=True):
-                if child.startswith("CL:") and parent.startswith("CL:") and key == "is_a":
+                if (
+                    child.startswith("CL:")
+                    and parent.startswith("CL:")
+                    and key == "is_a"
+                ):
                     childname = id_to_name[child]
                     parentname = id_to_name[parent]
                     g.add_edge(childname, parentname, key=key)
diff --git a/popv/algorithms/_bbknn.py b/popv/algorithms/_bbknn.py
index 422fac8..aa6b090 100644
--- a/popv/algorithms/_bbknn.py
+++ b/popv/algorithms/_bbknn.py
@@ -86,7 +86,9 @@ def predict(self, adata):
             ]
         )
         if smallest_neighbor_graph < 15:
-            logging.warning(f"BBKNN found only {smallest_neighbor_graph} neighbors. Reduced neighbors in KNN.")
+            logging.warning(
+                f"BBKNN found only {smallest_neighbor_graph} neighbors. Reduced neighbors in KNN."
+            )
             self.classifier_dict["n_neighbors"] = smallest_neighbor_graph

         knn = KNeighborsClassifier(metric="precomputed", **self.classifier_dict)
@@ -95,9 +97,15 @@ def predict(self, adata):
         adata.obs[self.result_key] = knn.predict(test_distances)

         if adata.uns["_return_probabilities"]:
-            adata.obs[self.result_key + "_probabilities"] = np.max(knn.predict_proba(test_distances), axis=1)
+            adata.obs[self.result_key + "_probabilities"] = np.max(
+                knn.predict_proba(test_distances), axis=1
+            )

     def compute_embedding(self, adata):
         if adata.uns["_compute_embedding"]:
-            logging.info(f'Saving UMAP of bbknn results to adata.obs["{self.embedding_key}"]')
-            adata.obsm[self.embedding_key] = sc.tl.umap(adata, copy=True, **self.embedding_dict).obsm["X_umap"]
+            logging.info(
+                f'Saving UMAP of bbknn results to adata.obs["{self.embedding_key}"]'
+            )
+            adata.obsm[self.embedding_key] = sc.tl.umap(
+                adata, copy=True, **self.embedding_dict
+            ).obsm["X_umap"]
diff --git a/popv/algorithms/_celltypist.py b/popv/algorithms/_celltypist.py
index 295577a..fc01368 100644
--- a/popv/algorithms/_celltypist.py
+++ b/popv/algorithms/_celltypist.py
@@ -63,12 +63,16 @@ def predict(self, adata):
             **self.classifier_dict,
         )
         out_column = (
-            "majority_voting" if "majority_voting" in predictions.predicted_labels.columns else "predicted_labels"
+            "majority_voting"
+            if "majority_voting" in predictions.predicted_labels.columns
+            else "predicted_labels"
         )

         adata.obs[self.result_key] = predictions.predicted_labels[out_column]
         if adata.uns["_return_probabilities"]:
-            adata.obs[self.result_key + "_probabilities"] = predictions.probability_matrix.max(axis=1).values
+            adata.obs[
+                self.result_key + "_probabilities"
+            ] = predictions.probability_matrix.max(axis=1).values

     def compute_embedding(self, adata):
         pass
diff --git a/popv/algorithms/_harmony.py b/popv/algorithms/_harmony.py
index 15d1857..cb56136 100644
--- a/popv/algorithms/_harmony.py
+++ b/popv/algorithms/_harmony.py
@@ -61,7 +61,9 @@ def __init__(
     def compute_integration(self, adata):
         logging.info("Integrating data with harmony")

-        adata.obsm["X_pca_harmony"] = harmonize(adata.obsm["X_pca"], adata.obs, batch_key=self.batch_key)
+        adata.obsm["X_pca_harmony"] = harmonize(
+            adata.obsm["X_pca"], adata.obs, batch_key=self.batch_key
+        )

     def predict(self, adata, result_key="popv_knn_on_harmony_prediction"):
         logging.info(f'Saving knn on harmony results to adata.obs["{result_key}"]')
@@ -75,7 +77,9 @@ def predict(self, adata, result_key="popv_knn_on_harmony_prediction"):
                 n_neighbors=self.classifier_dict["n_neighbors"],
                 parallel_batch_queries=True,
             ),
-            KNeighborsClassifier(metric="precomputed", weights=self.classifier_dict["weights"]),
+            KNeighborsClassifier(
+                metric="precomputed", weights=self.classifier_dict["weights"]
+            ),
         )
         knn.fit(train_X, train_Y)
@@ -91,6 +95,10 @@ def predict(self, adata, result_key="popv_knn_on_harmony_prediction"):

     def compute_embedding(self, adata):
         if adata.uns["_compute_embedding"]:
-            logging.info(f'Saving UMAP of harmony results to adata.obs["{self.embedding_key}"]')
+            logging.info(
+                f'Saving UMAP of harmony results to adata.obs["{self.embedding_key}"]'
+            )
             sc.pp.neighbors(adata, use_rep="X_pca_harmony")
-            adata.obsm[self.embedding_key] = sc.tl.umap(adata, copy=True, **self.embedding_dict).obsm["X_umap"]
+            adata.obsm[self.embedding_key] = sc.tl.umap(
+                adata, copy=True, **self.embedding_dict
+            ).obsm["X_umap"]
diff --git a/popv/algorithms/_onclass.py b/popv/algorithms/_onclass.py
index 259cc43..1e012d1 100644
--- a/popv/algorithms/_onclass.py
+++ b/popv/algorithms/_onclass.py
@@ -106,10 +106,12 @@ def compute_integration(self, adata):
         pass

     def predict(self, adata):
-        logging.info(f'Computing Onclass. Storing prediction in adata.obs["{self.result_key}"]')
-        adata.obs.loc[adata.obs["_dataset"] == "query", self.cell_ontology_obs_key] = adata.uns[
-            "unknown_celltype_label"
-        ]
+        logging.info(
+            f'Computing Onclass. Storing prediction in adata.obs["{self.result_key}"]'
+        )
+        adata.obs.loc[
+            adata.obs["_dataset"] == "query", self.cell_ontology_obs_key
+        ] = adata.uns["unknown_celltype_label"]

         train_idx = adata.obs["_dataset"] == "ref"
@@ -127,10 +129,14 @@ def predict(self, adata):
         cl_ontology_file = adata.uns["_cl_ontology_file"]
         nlp_emb_file = adata.uns["_nlp_emb_file"]

-        celltype_dict, clid_2_name = self.make_celltype_to_cell_ontology_id_dict(cl_obo_file)
+        celltype_dict, clid_2_name = self.make_celltype_to_cell_ontology_id_dict(
+            cl_obo_file
+        )
         self.make_cell_ontology_id(adata, celltype_dict, self.cell_ontology_obs_key)

-        train_model = OnClassModel(cell_type_nlp_emb_file=nlp_emb_file, cell_type_network_file=cl_ontology_file)
+        train_model = OnClassModel(
+            cell_type_nlp_emb_file=nlp_emb_file, cell_type_network_file=cl_ontology_file
+        )

         if adata.uns["_save_path_trained_models"] is not None:
             model_path = adata.uns["_save_path_trained_models"] + "/OnClass"
@@ -175,13 +181,17 @@ def predict(self, adata):
         )

         if adata.uns["_prediction_mode"] == "fast":
-            onclass_seen = np.argmax(train_model.model.predict(corr_test_feature), axis=1)
+            onclass_seen = np.argmax(
+                train_model.model.predict(corr_test_feature), axis=1
+            )
             pred_label = [train_model.i2co[ind] for ind in onclass_seen]
             pred_label_str = [clid_2_name[ind] for ind in pred_label]
             adata.obs[self.result_key] = pred_label_str
             adata.obs[self.seen_result_key] = pred_label_str
         else:
-            onclass_pred = train_model.Predict(corr_test_feature, use_normalize=False, refine=True, unseen_ratio=-1.0)
+            onclass_pred = train_model.Predict(
+                corr_test_feature, use_normalize=False, refine=True, unseen_ratio=-1.0
+            )
             pred_label = [train_model.i2co[ind] for ind in onclass_pred[2]]
             pred_label_str = [clid_2_name[ind] for ind in pred_label]
             adata.obs[self.result_key] = pred_label_str
@@ -192,9 +202,15 @@ def predict(self, adata):
             adata.obs[self.seen_result_key] = pred_label_str

             if adata.uns["_return_probabilities"]:
-                adata.obs[self.result_key + "_probabilities"] = np.max(onclass_pred[1], axis=1) / onclass_pred[1].sum(1)
-                adata.obsm["onclass_probabilities"] = onclass_pred[1] / onclass_pred[1].sum(1, keepdims=True)
-                adata.obs["popv_onclass_seen" + "_probabilities"] = np.max(onclass_pred[0], axis=1)
+                adata.obs[self.result_key + "_probabilities"] = np.max(
+                    onclass_pred[1], axis=1
+                ) / onclass_pred[1].sum(1)
+                adata.obsm["onclass_probabilities"] = onclass_pred[1] / onclass_pred[
+                    1
+                ].sum(1, keepdims=True)
+                adata.obs["popv_onclass_seen" + "_probabilities"] = np.max(
+                    onclass_pred[0], axis=1
+                )

     def compute_embedding(self, adata):
         return None
diff --git a/popv/algorithms/_rf.py b/popv/algorithms/_rf.py
index 612731f..ca88ac7 100644
--- a/popv/algorithms/_rf.py
+++ b/popv/algorithms/_rf.py
@@ -48,13 +48,19 @@ def compute_integration(self, adata):
         pass

     def predict(self, adata):
-        logging.info(f'Computing random forest classifier. Storing prediction in adata.obs["{self.result_key}"]')
+        logging.info(
+            f'Computing random forest classifier. Storing prediction in adata.obs["{self.result_key}"]'
+        )
         test_x = adata.layers[self.layers_key] if self.layers_key else adata.X

         if adata.uns["_prediction_mode"] == "retrain":
             train_idx = adata.obs["_ref_subsample"]
-            train_x = adata[train_idx].layers[self.layers_key] if self.layers_key else adata[train_idx].X
+            train_x = (
+                adata[train_idx].layers[self.layers_key]
+                if self.layers_key
+                else adata[train_idx].X
+            )
             train_y = adata[train_idx].obs[self.labels_key].to_numpy()
             rf = RandomForestClassifier(**self.classifier_dict)
             rf.fit(train_x, train_y)
@@ -67,10 +73,14 @@ def predict(self, adata):
                     ),
                 )
         else:
-            rf = pickle.load(open(adata.uns["_save_path_trained_models"] + "rf_classifier.pkl", "rb"))
+            rf = pickle.load(
+                open(adata.uns["_save_path_trained_models"] + "rf_classifier.pkl", "rb")
+            )
         adata.obs[self.result_key] = rf.predict(test_x)

         if adata.uns["_return_probabilities"]:
-            adata.obs[self.result_key + "_probabilities"] = np.max(rf.predict_proba(test_x), axis=1)
+            adata.obs[self.result_key + "_probabilities"] = np.max(
+                rf.predict_proba(test_x), axis=1
+            )

     def compute_embedding(self, adata):
         pass
diff --git a/popv/algorithms/_scaffold_algorithm.py b/popv/algorithms/_scaffold_algorithm.py
index 5f90dc7..477c9e6 100644
--- a/popv/algorithms/_scaffold_algorithm.py
+++ b/popv/algorithms/_scaffold_algorithm.py
@@ -69,12 +69,16 @@ def compute_integration(self, adata):
         # adata.obsm["X_new_method"] = embedded_data

     def predict(self, adata):
-        logging.info(f'Computing new classifier method. Storing prediction in adata.obs["{self.result_key}"]')
+        logging.info(
+            f'Computing new classifier method. Storing prediction in adata.obs["{self.result_key}"]'
+        )

         # adata.obs[self.result_key] = classifier_results

     def compute_embedding(self, adata):
         if adata.uns["_compute_embedding"]:
-            logging.info(f'Saving UMAP of new integration method to adata.obs["{self.embedding_key}"]')
+            logging.info(
+                f'Saving UMAP of new integration method to adata.obs["{self.embedding_key}"]'
+            )
             # sc.pp.neighbors(adata, use_rep="embedding_space")
             # adata.obsm[self.embedding_key] = sc.tl.umap(
             #     adata, copy=True, **self.embedding_dict
diff --git a/popv/algorithms/_scanorama.py b/popv/algorithms/_scanorama.py
index f403156..afd6f51 100644
--- a/popv/algorithms/_scanorama.py
+++ b/popv/algorithms/_scanorama.py
@@ -62,7 +62,10 @@ def __init__(
     def compute_integration(self, adata):
         logging.info("Integrating data with scanorama")

-        _adatas = [adata[adata.obs[self.batch_key] == i] for i in np.unique(adata.obs[self.batch_key])]
+        _adatas = [
+            adata[adata.obs[self.batch_key] == i]
+            for i in np.unique(adata.obs[self.batch_key])
+        ]
         scanorama.integrate_scanpy(_adatas, **self.method_dict)
         tmp_adata = anndata.concat(_adatas)
         adata.obsm["X_scanorama"] = tmp_adata[adata.obs_names].obsm["X_scanorama"]
@@ -79,7 +82,9 @@ def predict(self, adata, result_key="popv_knn_on_scanorama_prediction"):
                 n_neighbors=self.classifier_dict["n_neighbors"],
                 parallel_batch_queries=True,
             ),
-            KNeighborsClassifier(metric="precomputed", weights=self.classifier_dict["weights"]),
+            KNeighborsClassifier(
+                metric="precomputed", weights=self.classifier_dict["weights"]
+            ),
         )
         knn.fit(train_X, train_Y)
@@ -89,10 +94,16 @@ def predict(self, adata, result_key="popv_knn_on_scanorama_prediction"):
         adata.obs[result_key] = knn_pred

         if adata.uns["_return_probabilities"]:
-            adata.obs[self.result_key + "_probabilities"] = np.max(knn.predict_proba(adata.obsm["X_scanorama"]), axis=1)
+            adata.obs[self.result_key + "_probabilities"] = np.max(
+                knn.predict_proba(adata.obsm["X_scanorama"]), axis=1
+            )

     def compute_embedding(self, adata):
         if adata.uns["_compute_embedding"]:
-            logging.info(f'Saving UMAP of scanorama results to adata.obs["{self.embedding_key}"]')
+            logging.info(
+                f'Saving UMAP of scanorama results to adata.obs["{self.embedding_key}"]'
+            )
             sc.pp.neighbors(adata, use_rep="X_scanorama")
-            adata.obsm[self.embedding_key] = sc.tl.umap(adata, copy=True, **self.embedding_dict).obsm["X_umap"]
+            adata.obsm[self.embedding_key] = sc.tl.umap(
+                adata, copy=True, **self.embedding_dict
+            ).obsm["X_umap"]
diff --git a/popv/algorithms/_scanvi.py b/popv/algorithms/_scanvi.py
index 804888a..6faf11b 100644
--- a/popv/algorithms/_scanvi.py
+++ b/popv/algorithms/_scanvi.py
@@ -84,9 +84,13 @@ def compute_integration(self, adata):
         if "subsampled_labels" not in adata.obs.columns:
             adata.obs["subsampled_labels"] = [
                 label if subsampled else adata.uns["unknown_celltype_label"]
-                for label, subsampled in zip(adata.obs["_labels_annotation"], adata.obs["_ref_subsample"])
+                for label, subsampled in zip(
+                    adata.obs["_labels_annotation"], adata.obs["_ref_subsample"]
+                )
             ]
-        adata.obs["subsampled_labels"] = adata.obs["subsampled_labels"].astype("category")
+        adata.obs["subsampled_labels"] = adata.obs["subsampled_labels"].astype(
+            "category"
+        )
         yprior = torch.tensor(
             [
                 adata.obs["_labels_annotation"].value_counts()[i] / adata.n_obs
@@ -96,11 +100,15 @@ def compute_integration(self, adata):
         )

         if self.n_epochs_unsupervised is None:
-            self.n_epochs_unsupervised = round(min(round((10000 / adata.n_obs) * 200), 200))
+            self.n_epochs_unsupervised = round(
+                min(round((10000 / adata.n_obs) * 200), 200)
+            )

         if adata.uns["_prediction_mode"] == "retrain":
             if adata.uns["_pretrained_scvi_path"] is not None:
-                scvi_model = scvi.model.SCVI.load(adata.uns["_save_path_trained_models"] + "/scvi", adata=adata)
+                scvi_model = scvi.model.SCVI.load(
+                    adata.uns["_save_path_trained_models"] + "/scvi", adata=adata
+                )
             else:
                 scvi.model.SCVI.setup_anndata(
                     adata,
@@ -164,15 +172,23 @@ def compute_integration(self, adata):
         )

     def predict(self, adata):
-        logging.info(f'Saving scanvi label prediction to adata.obs["{self.result_key}"]')
+        logging.info(
+            f'Saving scanvi label prediction to adata.obs["{self.result_key}"]'
+        )
         adata.obs[self.result_key] = self.model.predict(adata)
         if adata.uns["_return_probabilities"]:
-            adata.obs[self.result_key + "_probabilities"] = np.max(self.model.predict(adata, soft=True), axis=1)
+            adata.obs[self.result_key + "_probabilities"] = np.max(
+                self.model.predict(adata, soft=True), axis=1
+            )

     def compute_embedding(self, adata):
         if adata.uns["_compute_embedding"]:
-            logging.info(f'Saving UMAP of scanvi results to adata.obs["{self.embedding_key}"]')
+            logging.info(
+                f'Saving UMAP of scanvi results to adata.obs["{self.embedding_key}"]'
+            )
             adata.obsm["X_scanvi"] = self.model.get_latent_representation(adata)
             sc.pp.neighbors(adata, use_rep="X_scanvi")
-            adata.obsm[self.embedding_key] = sc.tl.umap(adata, copy=True, **self.embedding_dict).obsm["X_umap"]
+            adata.obsm[self.embedding_key] = sc.tl.umap(
+                adata, copy=True, **self.embedding_dict
+            ).obsm["X_umap"]
diff --git a/popv/algorithms/_scvi.py b/popv/algorithms/_scvi.py
index 968c37c..3d75dd4 100644
--- a/popv/algorithms/_scvi.py
+++ b/popv/algorithms/_scvi.py
@@ -83,9 +83,13 @@ def compute_integration(self, adata):
         if "subsampled_labels" not in adata.obs.columns:
             adata.obs["subsampled_labels"] = [
                 label if subsampled else adata.uns["unknown_celltype_label"]
-                for label, subsampled in zip(adata.obs["_labels_annotation"], adata.obs["_ref_subsample"])
+                for label, subsampled in zip(
+                    adata.obs["_labels_annotation"], adata.obs["_ref_subsample"]
+                )
             ]
-        adata.obs["subsampled_labels"] = adata.obs["subsampled_labels"].astype("category")
+        adata.obs["subsampled_labels"] = adata.obs["subsampled_labels"].astype(
+            "category"
+        )

         if adata.uns["_pretrained_scvi_path"] is None:
             SCVI.setup_anndata(
@@ -122,9 +126,14 @@ def compute_integration(self, adata):
                 plan_kwargs={"n_epochs_kl_warmup": min(20, self.max_epochs)},
             )

-        if adata.uns["_save_path_trained_models"] is not None and adata.uns["_prediction_mode"] == "retrain":
+        if (
+            adata.uns["_save_path_trained_models"] is not None
+            and adata.uns["_prediction_mode"] == "retrain"
+        ):
             # Update scvi for scanvi.
-            adata.uns["_pretrained_scvi_path"] = adata.uns["_save_path_trained_models"] + "/scvi"
+            adata.uns["_pretrained_scvi_path"] = (
+                adata.uns["_save_path_trained_models"] + "/scvi"
+            )
             model.save(
                 adata.uns["_save_path_trained_models"] + "/scvi",
                 save_anndata=False,
@@ -145,14 +154,17 @@ def predict(self, adata):
                 n_neighbors=self.classifier_dict["n_neighbors"],
                 parallel_batch_queries=True,
             ),
-            KNeighborsClassifier(metric="precomputed", weights=self.classifier_dict["weights"]),
+            KNeighborsClassifier(
+                metric="precomputed", weights=self.classifier_dict["weights"]
+            ),
         )
         knn.fit(train_X, train_Y)
         if adata.uns["_save_path_trained_models"]:
             pickle.dump(
                 knn,
                 open(
-                    adata.uns["_save_path_trained_models"] + "scvi_knn_classifier.pkl",
+                    adata.uns["_save_path_trained_models"]
+                    + "scvi_knn_classifier.pkl",
                     "wb",
                 ),
             )
@@ -169,10 +181,16 @@ def predict(self, adata):
         # save_results
         adata.obs[self.result_key] = knn_pred
         if adata.uns["_return_probabilities"]:
-            adata.obs[self.result_key + "_probabilities"] = np.max(knn.predict_proba(adata.obsm["X_scvi"]), axis=1)
+            adata.obs[self.result_key + "_probabilities"] = np.max(
+                knn.predict_proba(adata.obsm["X_scvi"]), axis=1
+            )

     def compute_embedding(self, adata):
         if adata.uns["_compute_embedding"]:
-            logging.info(f'Saving UMAP of scvi results to adata.obs["{self.embedding_key}"]')
+            logging.info(
+                f'Saving UMAP of scvi results to adata.obs["{self.embedding_key}"]'
+            )
             sc.pp.neighbors(adata, use_rep="X_scvi")
-            adata.obsm[self.embedding_key] = sc.tl.umap(adata, copy=True, **self.embedding_dict).obsm["X_umap"]
+            adata.obsm[self.embedding_key] = sc.tl.umap(
+                adata, copy=True, **self.embedding_dict
+            ).obsm["X_umap"]
diff --git a/popv/algorithms/_svm.py b/popv/algorithms/_svm.py
index e18c464..f9a424b 100644
--- a/popv/algorithms/_svm.py
+++ b/popv/algorithms/_svm.py
@@ -50,12 +50,18 @@ def compute_integration(self, adata):
         pass

     def predict(self, adata):
-        logging.info(f'Computing support vector machine. Storing prediction in adata.obs["{self.result_key}"]')
+        logging.info(
+            f'Computing support vector machine. Storing prediction in adata.obs["{self.result_key}"]'
+        )
         test_x = adata.layers[self.layers_key] if self.layers_key else adata.X

         if adata.uns["_prediction_mode"] == "retrain":
             train_idx = adata.obs["_ref_subsample"]
-            train_x = adata[train_idx].layers[self.layers_key] if self.layers_key else adata[train_idx].X
+            train_x = (
+                adata[train_idx].layers[self.layers_key]
+                if self.layers_key
+                else adata[train_idx].X
+            )
             train_y = adata[train_idx].obs[self.labels_key].to_numpy()
             clf = CalibratedClassifierCV(svm.LinearSVC(**self.classifier_dict))
             clf.fit(train_x, train_y)
@@ -68,12 +74,18 @@ def predict(self, adata):
                     ),
                 )
         else:
-            clf = pickle.load(open(adata.uns["_save_path_trained_models"] + "svm_classifier.pkl", "rb"))
+            clf = pickle.load(
+                open(
+                    adata.uns["_save_path_trained_models"] + "svm_classifier.pkl", "rb"
+                )
+            )
         adata.obs[self.result_key] = clf.predict(test_x)

         if adata.uns["_return_probabilities"]:
-            adata.obs[self.result_key + "_probabilities"] = np.max(clf.predict_proba(test_x), axis=1)
+            adata.obs[self.result_key + "_probabilities"] = np.max(
+                clf.predict_proba(test_x), axis=1
+            )
         adata.obs[self.result_key]
diff --git a/popv/annotation.py b/popv/annotation.py
index ae6584d..975b39c 100644
--- a/popv/annotation.py
+++ b/popv/annotation.py
@@ -116,7 +116,9 @@ def compute_consensus(adata: anndata.AnnData, prediction_keys: list) -> None:
         Saves the consensus percentage between methods in adata.obs['popv_majority_vote_score']
     """
-    consensus_prediction = adata.obs[prediction_keys].apply(_utils.majority_vote, axis=1)
+    consensus_prediction = adata.obs[prediction_keys].apply(
+        _utils.majority_vote, axis=1
+    )
     adata.obs["popv_majority_vote_prediction"] = consensus_prediction

     agreement = adata.obs[prediction_keys].apply(_utils.majority_count, axis=1)
@@ -149,9 +151,13 @@ def ontology_vote_onclass(
     if adata.uns["_prediction_mode"] == "retrain":
         G = _utils.make_ontology_dag(adata.uns["_cl_obo_file"])
         if adata.uns["_save_path_trained_models"] is not None:
-            pickle.dump(G, open(adata.uns["_save_path_trained_models"] + "obo_dag.pkl", "wb"))
+            pickle.dump(
+                G, open(adata.uns["_save_path_trained_models"] + "obo_dag.pkl", "wb")
+            )
     else:
-        G = pickle.load(open(adata.uns["_save_path_trained_models"] + "obo_dag.pkl", "rb"))
+        G = pickle.load(
+            open(adata.uns["_save_path_trained_models"] + "obo_dag.pkl", "rb")
+        )

     cell_type_root_to_node = {}
     aggregate_ontology_pred = [None] * adata.n_obs
@@ -174,7 +180,9 @@ def ontology_vote_onclass(
                     cell_type_root_to_node[cell_type] = root_to_node
                     depth[cell_type] = len(nx.shortest_path(G, cell_type, "cell"))
                     for ancestor_cell_type in root_to_node:
-                        depth[ancestor_cell_type] = len(nx.shortest_path(G, ancestor_cell_type, "cell"))
+                        depth[ancestor_cell_type] = len(
+                            nx.shortest_path(G, ancestor_cell_type, "cell")
+                        )
                 if pred_key == "popv_onclass_prediction":
                     onclass_depth[ind] = depth[cell_type]
                     for ancestor_cell_type in root_to_node:
@@ -197,11 +205,17 @@ def ontology_vote_onclass(
     adata.obs[save_key] = aggregate_ontology_pred
     adata.obs[save_key + "_score"] = scores
     adata.obs[save_key + "_depth"] = depths
-    adata.obs[save_key + "_onclass_relative_depth"] = np.array(onclass_depth) - adata.obs[save_key + "_depth"]
+    adata.obs[save_key + "_onclass_relative_depth"] = (
+        np.array(onclass_depth) - adata.obs[save_key + "_depth"]
+    )
     # Change numeric values to categoricals.
-    adata.obs[[save_key + "_score", save_key + "_depth", save_key + "_onclass_relative_depth"]] = adata.obs[
+    adata.obs[
         [save_key + "_score", save_key + "_depth", save_key + "_onclass_relative_depth"]
-    ].astype("category")
+    ] = adata.obs[
+        [save_key + "_score", save_key + "_depth", save_key + "_onclass_relative_depth"]
+    ].astype(
+        "category"
+    )
     return adata
@@ -234,9 +248,13 @@ def ontology_parent_onclass(
     if adata.uns["_prediction_mode"] == "retrain":
         G = _utils.make_ontology_dag(adata.uns["_cl_obo_file"])
         if adata.uns["_save_path_trained_models"] is not None:
-            pickle.dump(G, open(adata.uns["_save_path_trained_models"] + "obo_dag.pkl", "wb"))
+            pickle.dump(
+                G, open(adata.uns["_save_path_trained_models"] + "obo_dag.pkl", "wb")
+            )
     else:
-        G = pickle.load(open(adata.uns["_save_path_trained_models"] + "obo_dag.pkl", "rb"))
+        G = pickle.load(
+            open(adata.uns["_save_path_trained_models"] + "obo_dag.pkl", "rb")
+        )

     cell_type_root_to_node = {}
     aggregate_ontology_pred = []
@@ -255,11 +273,16 @@ def ontology_parent_onclass(
                     cell_type_root_to_node[cell_type] = root_to_node
                     depth[cell_type] = len(nx.shortest_path(G, cell_type, "cell"))
                     for ancestor_cell_type in root_to_node:
-                        depth[ancestor_cell_type] = len(nx.shortest_path(G, ancestor_cell_type, "cell"))
+                        depth[ancestor_cell_type] = len(
+                            nx.shortest_path(G, ancestor_cell_type, "cell")
+                        )
                 for ancestor_cell_type in list(root_to_node) + [cell_type]:
                     score[ancestor_cell_type] += 1
             score_popv[cell_type] += 1
-    score = {key: min(len(prediction_keys) - allowed_errors, value) for key, value in score.items()}
+    score = {
+        key: min(len(prediction_keys) - allowed_errors, value)
+        for key, value in score.items()
+    }

     # Find ancestor most present and deepest across all classifiers.
     # If tie, then highest in original classifier.
diff --git a/popv/preprocessing.py b/popv/preprocessing.py
index e9c2732..fb4572b 100644
--- a/popv/preprocessing.py
+++ b/popv/preprocessing.py
@@ -147,7 +147,9 @@ def __init__(
                 set(query_adata.var_names)
             ), "Query dataset misses genes that were used for reference model training. Retrain reference model, set mode='retrain'"
             self.query_adata = query_adata[:, self.genes].copy()
-            assert hvg is None, "Highly variable gene selection is not available if using trained reference model."
+            assert (
+                hvg is None
+            ), "Highly variable gene selection is not available if using trained reference model."
         else:
             self.query_adata = query_adata.copy()
             if query_layers_key is not None:
@@ -164,15 +166,23 @@
         self.compute_embedding = compute_embedding

         if cl_obo_folder is None:
-            self.cl_obo_file = os.path.dirname(os.path.dirname(__file__)) + "/ontology/cl.obo"
-            self.cl_ontology_file = os.path.dirname(os.path.dirname(__file__)) + "/ontology/cl.ontology"
-            self.nlp_emb_file = os.path.dirname(os.path.dirname(__file__)) + "/ontology/cl.ontology.nlp.emb"
+            self.cl_obo_file = (
+                os.path.dirname(os.path.dirname(__file__)) + "/ontology/cl.obo"
+            )
+            self.cl_ontology_file = (
+                os.path.dirname(os.path.dirname(__file__)) + "/ontology/cl.ontology"
+            )
+            self.nlp_emb_file = (
+                os.path.dirname(os.path.dirname(__file__))
+                + "/ontology/cl.ontology.nlp.emb"
+            )
             if not os.path.exists(self.nlp_emb_file):
                 subprocess.call(
                     [
                         "tar",
                         "-czf",
-                        os.path.dirname(os.path.dirname(__file__)) + "/ontology/nlp.emb.tar.gz",
+                        os.path.dirname(os.path.dirname(__file__))
+                        + "/ontology/nlp.emb.tar.gz",
                         "cl.ontology.nlp.emb",
                     ]
                 )
@@ -211,7 +221,9 @@
         self._preprocess()

     def _check_validity_anndata(self, adata, input_type):
-        assert check_nonnegative_integers(adata.X), f"Make sure input {input_type} adata contains raw_counts"
+        assert check_nonnegative_integers(
+            adata.X
+        ), f"Make sure input {input_type} adata contains raw_counts"
         assert len(set(adata.var_names)) == len(
             adata.var_names
         ), f"{input_type} dataset contains multiple genes with same gene name."
@@ -220,17 +232,25 @@
     def _setup_dataset(self, adata, key, add_meta=""):
         if isinstance(self.batch_key[key], list):
-            adata.obs["_batch_annotation"] = adata.obs[self.batch_key[key]].astype(str).sum(1).astype("category")
+            adata.obs["_batch_annotation"] = (
+                adata.obs[self.batch_key[key]].astype(str).sum(1).astype("category")
+            )
         elif isinstance(self.batch_key[key], str):
             adata.obs["_batch_annotation"] = adata.obs[self.batch_key[key]]
         else:
             adata.obs["_batch_annotation"] = self.unknown_celltype_label
-        adata.obs["_batch_annotation"] = adata.obs["_batch_annotation"].astype(str) + add_meta
-        adata.obs["_batch_annotation"] = adata.obs["_batch_annotation"].astype("category")
+        adata.obs["_batch_annotation"] = (
+            adata.obs["_batch_annotation"].astype(str) + add_meta
+        )
+        adata.obs["_batch_annotation"] = adata.obs["_batch_annotation"].astype(
+            "category"
+        )

         adata.obs["_labels_annotation"] = self.unknown_celltype_label
         if self.labels_key[key] is not None:
-            adata.obs["_labels_annotation"] = adata.obs[self.labels_key[key]].astype("category")
+            adata.obs["_labels_annotation"] = adata.obs[self.labels_key[key]].astype(
+                "category"
+            )

         # subsample the reference cells used for training certain models
         if key == "reference":
@@ -250,8 +270,12 @@
     def _preprocess(self):
         if self.genes is None:
-            self.ref_adata = self.ref_adata[:, np.intersect1d(self.ref_adata.var_names, self.query_adata.var_names)]
-            self.query_adata = self.query_adata[:, np.intersect1d(self.ref_adata.var_names, self.query_adata.var_names)]
+            self.ref_adata = self.ref_adata[
+                :, np.intersect1d(self.ref_adata.var_names, self.query_adata.var_names)
+            ]
+            self.query_adata = self.query_adata[
+                :, np.intersect1d(self.ref_adata.var_names, self.query_adata.var_names)
+            ]

         if self.prediction_mode == "fast":
             self.adata = self.query_adata
@@ -272,22 +296,28 @@
             self.adata = self.adata[
                 self.adata.obs["_batch_annotation"].isin(
                     self.adata.obs["_batch_annotation"]
-                    .value_counts()[self.adata.obs["_batch_annotation"].value_counts() > 8]
+                    .value_counts()[
+                        self.adata.obs["_batch_annotation"].value_counts() > 8
+                    ]
                     .index
                 )
             ].copy()
-            difference_batches = set(self.adata.obs["_batch_annotation"]) - batch_before_filtering
+            difference_batches = (
+                set(self.adata.obs["_batch_annotation"]) - batch_before_filtering
+            )
             if difference_batches:
                 logging.warning(
                     f"The following batches will be excluded from annotation because they have less than 9 cells:{difference_batches}."
                 )

             # Sort data based on batch for efficiency downstream during SCANORAMA
-            self.adata = self.adata[self.adata.obs.sort_values(by="_batch_annotation").index].copy()
+            self.adata = self.adata[
+                self.adata.obs.sort_values(by="_batch_annotation").index
+            ].copy()

-            self.adata.obs[self.labels_key["reference"]] = self.adata.obs[self.labels_key["reference"]].astype(
-                "category"
-            )
+            self.adata.obs[self.labels_key["reference"]] = self.adata.obs[
+                self.labels_key["reference"]
+            ].astype("category")

         # Remove any cell with expression below 10 counts.
         zero_cell_names = self.adata[self.adata.X.sum(1) < 10].obs_names
@@ -312,8 +342,12 @@ def _preprocess(self):
                     inplace=False,
                     batch_key="_batch_annotation",
                 )["highly_variable"]
-            except ValueError:  # seurat_v3 tends to error with singularities then use Poisson hvg.
-                self.adata.var["highly_variable"] = sc.experimental.pp.highly_variable_genes(
+            except (
+                ValueError
+            ):  # seurat_v3 tends to error with singularities then use Poisson hvg.
+                self.adata.var[
+                    "highly_variable"
+                ] = sc.experimental.pp.highly_variable_genes(
                     self.adata[self.adata.obs["_dataset"] == "ref"].copy(),
                     n_top_genes=self.hvg,
                     subset=False,
@@ -321,14 +355,18 @@ def _preprocess(self):
                     flavor="pearson_residuals",
                     inplace=False,
                     batch_key="_batch_annotation",
-                )["highly_variable"]
+                )[
+                    "highly_variable"
+                ]
             self.adata = self.adata[:, self.adata.var["highly_variable"]].copy()

         sc.pp.normalize_total(self.adata, target_sum=1e4)
         sc.pp.log1p(self.adata)
         self.adata.layers["scaled_counts"] = self.adata.X.copy()
         if self.prediction_mode != "fast":
-            sc.pp.scale(self.adata, max_value=10, zero_center=False, layer="scaled_counts")
+            sc.pp.scale(
+                self.adata, max_value=10, zero_center=False, layer="scaled_counts"
+            )
             self.adata.obsm["X_pca"] = sc.tl.pca(self.adata.layers["scaled_counts"])

         # Store values as default for current popv in adata
diff --git a/popv/reproducibility/_accuracy.py b/popv/reproducibility/_accuracy.py
index 8728e4f..cf6a3fb 100644
--- a/popv/reproducibility/_accuracy.py
+++ b/popv/reproducibility/_accuracy.py
@@ -30,7 +30,9 @@ def match_type(n1, n2):
         else:
             return "no match"

-    adata.obs[save_key] = adata.obs.apply(lambda x: match_type(x[pred_key], x[gt_key]), axis=1)
+    adata.obs[save_key] = adata.obs.apply(
+        lambda x: match_type(x[pred_key], x[gt_key]), axis=1
+    )


 def _fine_ontology_sibling_accuracy(adata, obofile, pred_key, gt_key, save_key=None):
@@ -43,7 +45,9 @@ def _fine_ontology_sibling_accuracy(adata, obofile, pred_key, gt_key, save_key=N
     ontology_distance_dict = {}

-    for name, pred_ct, gt_ct in zip(adata.obs_names, adata.obs[pred_key], adata.obs[gt_key]):
+    for name, pred_ct, gt_ct in zip(
+        adata.obs_names, adata.obs[pred_key], adata.obs[gt_key]
+    ):
         score = None
         combination = f"{pred_ct}_{gt_ct}"
         if combination in ontology_distance_dict:
@@ -57,7 +61,9 @@ def _fine_ontology_sibling_accuracy(adata, obofile, pred_key, gt_key, save_key=N
                 score = nx.shortest_path_length(dag, source=gt_ct, target=pred_ct) - 1
                 score *= -1
             else:
-                paths = nx.algorithms.simple_paths.shortest_simple_paths(nx.Graph(dag), source=pred_ct, target=gt_ct)
+                paths = nx.algorithms.simple_paths.shortest_simple_paths(
+                    nx.Graph(dag), source=pred_ct, target=gt_ct
+                )
                 path = next(paths, None)
                 if path is None:
                     score = 1000
diff --git a/popv/reproducibility/_alluvial.py b/popv/reproducibility/_alluvial.py
index a614ca6..39f1337 100644
--- a/popv/reproducibility/_alluvial.py
+++ b/popv/reproducibility/_alluvial.py
@@ -17,7 +17,15 @@ def plot(input_data, *args, **kwargs):


 class AlluvialTool:
-    def __init__(self, input_data=(), x_range=(0, 1), res=20, h_gap_frac=0.03, v_gap_frac=0.03, **kwargs):
+    def __init__(
+        self,
+        input_data=(),
+        x_range=(0, 1),
+        res=20,
+        h_gap_frac=0.03,
+        v_gap_frac=0.03,
+        **kwargs,
+    ):
         self.input = input_data
         self.x_range = x_range
         self.res = res  # defines the resolution of the splines for all veins
@@ -30,7 +38,14 @@ def __init__(self, input_data=(), x_range=(0, 1), res=20, h_gap_frac=0.03, v_gap
         self.h_gap = x_range[1] * h_gap_frac
         self.v_gap_frac = v_gap_frac
         self.v_gap = (
-            sum([width for b_item_counter in self.data_dic.values() for width in b_item_counter.values()]) * v_gap_frac
+            sum(
+                [
+                    width
+                    for b_item_counter in self.data_dic.values()
+                    for width in b_item_counter.values()
+                ]
+            )
+            * v_gap_frac
         )
         self.group_widths = self.get_group_widths()
         self.item_coord_dic = self.make_item_coordinate_dic()
@@ -104,7 +119,11 @@ def get_item_groups(self, a_sort=None, b_sort=None, **kwargs):
         _ = kwargs
         b_members = (
             sorted(
-                {b_item for b_item_counter in self.data_dic.values() for b_item in b_item_counter},
+                {
+                    b_item
+                    for b_item_counter in self.data_dic.values()
+                    for b_item in b_item_counter
+                },
                 key=lambda x: self.item_widths_dic[x],
             )
             if not b_sort
@@ -121,7 +140,9 @@
         return a_members, b_members

     def get_group_widths(self):
-        return [self.get_group_width(group) for group in (self.a_members, self.b_members)]
+        return [
+            self.get_group_width(group) for group in (self.a_members, self.b_members)
+        ]

     def make_item_coordinate_dic(
         self,
@@ -139,7 +160,10 @@
         return item_coord_dic

     def get_group_width(self, group):
-        return sum([self.item_widths_dic[item] for item in group]) + (len(group) - 1) * self.v_gap
+        return (
+            sum([self.item_widths_dic[item] for item in group])
+            + (len(group) - 1) * self.v_gap
+        )

     def generate_alluvial_vein(self, a_item, b_item):
         width = self.data_dic[a_item][b_item]
@@ -214,14 +238,22 @@ def plot(self, figsize=(10, 15), alpha=0.5, **kwargs):
         ax.autoscale()
         return ax

-    def get_color_array(self, colors=None, color_side=0, rand_seed=1, cmap=None, **kwargs):
+    def get_color_array(
+        self, colors=None, color_side=0, rand_seed=1, cmap=None, **kwargs
+    ):
         _ = kwargs
         color_items = self.b_members if color_side else self.a_members
         lci = len(color_items)
         if rand_seed is not None:
             np.random.seed(rand_seed)
-        cmap = cmap if cmap is not None else matplotlib.cm.get_cmap("hsv", lci * 10**3)
-        color_array = colors if colors is not None else [cmap(item) for ind, item in enumerate(np.random.rand(lci))]
+        cmap = (
+            cmap if cmap is not None else matplotlib.cm.get_cmap("hsv", lci * 10**3)
+        )
+        color_array = (
+            colors
+            if colors is not None
+            else [cmap(item) for ind, item in enumerate(np.random.rand(lci))]
+        )
         ind_dic = {item: ind for ind, item in enumerate(color_items)}
         polygon_colors = []
         for (
@@ -256,7 +288,15 @@ def auto_label_veins(self, fontname="Arial", **kwargs):
                 fontname=fontname,
             )

-    def label_sides(self, labels=None, label_shift=0, disp_width=False, wdisp_sep=7 * " ", fontname="Arial", **kwargs):
+    def label_sides(
+        self,
+        labels=None,
+        label_shift=0,
+        disp_width=False,
+        wdisp_sep=7 * " ",
+        fontname="Arial",
+        **kwargs,
+    ):
         if labels is not None:
             _ = kwargs
             y = max(self.group_widths) / 2
@@ -264,7 +304,9 @@ def label_sides(self, labels=None, label_shift=0, disp_width=False, wdisp_sep=7
             for side, sign in enumerate((-1, 1)):
                 plt.text(
                     self.x_range[side]
-                    + sign * (label_shift + itl + int(disp_width) * (len(wdisp_sep) + wtl)) * self.h_gap_frac,
+                    + sign
+                    * (label_shift + itl + int(disp_width) * (len(wdisp_sep) + wtl))
+                    * self.h_gap_frac,
                     y,
                     labels[side],
                     # bidi.algorithm.get_display(labels[side]),  # RTL languages
@@ -275,7 +317,9 @@ def label_sides(self, labels=None, label_shift=0, disp_width=False, wdisp_sep=7
                     rotation=90 - 180 * side,
                 )

-    def item_text(self, item, side, disp_width=False, wdisp_sep=7 * " ", width_in=True, **kwargs):
+    def item_text(
+        self, item, side, disp_width=False, wdisp_sep=7 * " ", width_in=True, **kwargs
+    ):
         _ = kwargs
         f_item = item
         # f_item = bidi.algorithm.get_display(item)  # for RTL languages
diff --git a/popv/visualization.py b/popv/visualization.py
index 437975c..749958e 100755
--- a/popv/visualization.py
+++ b/popv/visualization.py
@@ -17,14 +17,18 @@ def _sample_report(adata, cell_type_key, score_key, pred_keys):
     adata.obs["counts"] = np.zeros(len(adata.obs))

-    _counts_adata = adata.obs.groupby([cell_type_key, score_key]).count()[["counts"]].reset_index()
+    _counts_adata = (
+        adata.obs.groupby([cell_type_key, score_key]).count()[["counts"]].reset_index()
+    )
     counts_adata = _counts_adata.pivot(cell_type_key, score_key, "counts")
     counts_adata = counts_adata.dropna()
     np_counts = counts_adata.dropna().to_numpy()
     row_sums = np_counts.sum(axis=1)
     new_matrix = np_counts / row_sums[:, np.newaxis]
     ax = (
-        pd.DataFrame(data=new_matrix, index=counts_adata.index, columns=counts_adata.columns)
+        pd.DataFrame(
+            data=new_matrix, index=counts_adata.index, columns=counts_adata.columns
+        )
         .sort_values(7)
         .plot(kind="bar", stacked=True, figsize=(20, 7))
     )
@@ -38,16 +42,31 @@ def _sample_report(adata, cell_type_key, score_key, pred_keys):
     ax.bar_label(ax.containers[0])
     plt.show()
     for key in pred_keys:
-        counts_adata = adata.obs.groupby([key, cell_type_key]).count().reset_index().pivot(key, cell_type_key, "counts")
+        counts_adata = (
+            adata.obs.groupby([key, cell_type_key])
+            .count()
+            .reset_index()
+            .pivot(key, cell_type_key, "counts")
+        )
         np_counts = counts_adata.dropna().to_numpy()
         row_sums = np_counts.sum(axis=0)
         new_matrix = np_counts / row_sums[np.newaxis, :]
-        new_index = [counts_adata.index[r] + " " + str(np.sum(np_counts[r])) for r in range(new_matrix.shape[0])]
-        new_columns = [counts_adata.columns[c] + " " + str(np.sum(np_counts[:, c])) for c in range(new_matrix.shape[1])]
-        input_data = pd.DataFrame(data=new_matrix, index=new_index, columns=new_columns).to_dict()
+        new_index = [
+            counts_adata.index[r] + " " + str(np.sum(np_counts[r]))
+            for r in range(new_matrix.shape[0])
+        ]
+        new_columns = [
+            counts_adata.columns[c] + " " + str(np.sum(np_counts[:, c]))
+            for c in range(new_matrix.shape[1])
+        ]
+        input_data = pd.DataFrame(
+            data=new_matrix, index=new_index, columns=new_columns
+        ).to_dict()
         cmap = matplotlib.cm.get_cmap("jet")
-        sorted_index = np.array(new_index)[sorted(range(new_matrix.shape[0]), key=lambda r: np.sum(np_counts[r]))]
+        sorted_index = np.array(new_index)[
+            sorted(range(new_matrix.shape[0]), key=lambda r: np.sum(np_counts[r]))
+        ]
         sorted_columns = np.array(new_columns)[
             sorted(range(new_matrix.shape[1]), key=lambda c: np.sum(np_counts[:, c]))
         ]
@@ -108,7 +127,9 @@ def agreement_score_bar_plot(
         )
         for x in celltypes
     ]
-    mean_agreement = pd.DataFrame([mean_agreement], index=["agreement"], columns=celltypes).T
+    mean_agreement = pd.DataFrame(
+        [mean_agreement], index=["agreement"], columns=celltypes
+    ).T
     mean_agreement.dropna(inplace=True)
     mean_agreement = mean_agreement.sort_values("agreement", ascending=True)
     ax = mean_agreement.plot.bar(y="agreement", figsize=(15, 2), legend=False)
@@ -142,7 +163,13 @@ def prediction_score_bar_plot(
         Returns axis object of corresponding plot.

     """
-    ax = adata[adata.obs["_dataset"] == "query"].obs[popv_prediction_score].value_counts().sort_index().plot.bar()
+    ax = (
+        adata[adata.obs["_dataset"] == "query"]
+        .obs[popv_prediction_score]
+        .value_counts()
+        .sort_index()
+        .plot.bar()
+    )
     ax.set_xlabel("Score")
     ax.set_ylabel("Frequency")
@@ -188,7 +215,9 @@ def celltype_ratio_bar_plot(
     if normalize:
         prop = prop.div(prop.sum(axis=0), axis=1)

-    ax = prop.loc[cell_types].plot(kind="bar", figsize=(len(cell_types) * 0.5, 4), logy=(not normalize))
+    ax = prop.loc[cell_types].plot(
+        kind="bar", figsize=(len(cell_types) * 0.5, 4), logy=(not normalize)
+    )
     ax.set_ylabel("Celltype")
     ax.set_ylabel("Celltype Abundance")
     if save_folder is not None:
diff --git a/pyproject.toml.rej b/pyproject.toml.rej
index 7aa2c8e..2536abf 100644
--- a/pyproject.toml.rej
+++ b/pyproject.toml.rej
@@ -15,7 +15,7 @@ diff a/pyproject.toml b/pyproject.toml	(rejected hunks)
-    "session-info"
+    "session-info",
 ]
- 
+
 [project.optional-dependencies]
 dev = [
     "pre-commit",
diff --git a/tests/core/test_models.py b/tests/core/test_models.py
index 0a0b3f3..34370aa 100644
--- a/tests/core/test_models.py
+++ b/tests/core/test_models.py
@@ -180,37 +180,57 @@ def test_celltypist():
 def test_annotation():
     """Test Annotation and Plotting pipeline."""
     adata = _get_test_anndata().adata
-    popv.annotation.annotate_data(adata, methods=["svm", "rf"], save_path="tests/tmp_testing/popv_test_results/")
+    popv.annotation.annotate_data(
+        adata, methods=["svm", "rf"], save_path="tests/tmp_testing/popv_test_results/"
+    )
     popv.visualization.agreement_score_bar_plot(adata)
     popv.visualization.prediction_score_bar_plot(adata)
-    popv.visualization.make_agreement_plots(adata, prediction_keys=adata.uns["prediction_keys"], show=False)
+    popv.visualization.make_agreement_plots(
+        adata, prediction_keys=adata.uns["prediction_keys"], show=False
+    )
     popv.visualization.celltype_ratio_bar_plot(adata)
     obo_fn = "resources/ontology/cl.obo"
     _accuracy._ontology_accuracy(
-        adata[adata.obs["_dataset"] == "ref"], obofile=obo_fn, gt_key="cell_ontology_class", pred_key="popv_prediction"
+        adata[adata.obs["_dataset"] == "ref"],
+        obofile=obo_fn,
+        gt_key="cell_ontology_class",
+        pred_key="popv_prediction",
     )
     _accuracy._fine_ontology_sibling_accuracy(
-        adata[adata.obs["_dataset"] == "ref"], obofile=obo_fn, gt_key="cell_ontology_class", pred_key="popv_prediction"
+        adata[adata.obs["_dataset"] == "ref"],
+        obofile=obo_fn,
+        gt_key="cell_ontology_class",
+        pred_key="popv_prediction",
     )

     assert "popv_majority_vote_prediction" in adata.obs.columns
     assert not adata.obs["popv_majority_vote_prediction"].isnull().any()

     adata = _get_test_anndata(mode="inference").adata
-    popv.annotation.annotate_data(adata, save_path="tests/tmp_testing/popv_test_results/")
+    popv.annotation.annotate_data(
+        adata, save_path="tests/tmp_testing/popv_test_results/"
+    )

     adata = _get_test_anndata(mode="fast").adata
-    popv.annotation.annotate_data(adata, save_path="tests/tmp_testing/popv_test_results/")
+    popv.annotation.annotate_data(
+        adata, save_path="tests/tmp_testing/popv_test_results/"
+    )


 def test_annotation_no_ontology():
     """Test Annotation and Plotting pipeline without ontology."""
     adata = _get_test_anndata(cl_obo_folder=False).adata
-    popv.annotation.annotate_data(adata, methods=["svm", "rf"], save_path="tests/tmp_testing/popv_test_results/")
+    popv.annotation.annotate_data(
+        adata, methods=["svm", "rf"], save_path="tests/tmp_testing/popv_test_results/"
+    )
     popv.visualization.agreement_score_bar_plot(adata)
     popv.visualization.prediction_score_bar_plot(adata)
-    popv.visualization.make_agreement_plots(adata, prediction_keys=adata.uns["prediction_keys"])
-    popv.visualization.celltype_ratio_bar_plot(adata, save_folder="tests/tmp_testing/popv_test_results/")
+    popv.visualization.make_agreement_plots(
+        adata, prediction_keys=adata.uns["prediction_keys"]
+    )
+    popv.visualization.celltype_ratio_bar_plot(
+        adata, save_folder="tests/tmp_testing/popv_test_results/"
+    )
     popv.visualization.celltype_ratio_bar_plot(adata, normalize=False)
     adata.obs["empty_columns"] = "a"
     input_data = adata.obs[["empty_columns", "popv_rf_prediction"]].values.tolist()