From 0ab39d9871ea5873d7ee66db9bc37c06366feee3 Mon Sep 17 00:00:00 2001 From: eeaunin Date: Wed, 1 Dec 2021 02:39:46 +0000 Subject: [PATCH] added UMAP version check to clustering scripts --- README.md | 1 + gda_clustering.py | 9 +++++++++ gda_parameters.py | 5 ++++- 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 108eb5e..ba1ea1f 100755 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ Below is a diagram for a quick overview of what GDA does. (A) Features sets are derived from the genome reference sequence (seq), repeat finding (rep), gene annotations (gene) and evolutionary relationships between genes (orth). Values for each feature are determined for each non-overlapping window of e.g. 5kb across the genome. (B) The resulting matrix of feature values per window is embedded in two dimensions and clustered to identify groups of windows with similar properties. (C) The data can be explored in a number of ways using a web-browser based app. The clustering labels are mapped back to the chromosomes to highlight architectural features and a heatmap displays the features which define the clusters. + A more technical diagram of the components of the pipeline in the form of a flowchart can be seen [here](images/gda_pipeline_flowchart.png). A [Nextflow](https://www.nextflow.io/)-based pipeline that includes various third party tools extracts the values of a set of genomic variables that describe a genome assembly. The values of genomic variables along chromosomes are stored as [bedgraph files](https://genome.ucsc.edu/goldenPath/help/bedgraph.html). The bedgraph files corresponding to one genome assembly are then merged into one tab separated values (TSV) file. In the following text, this file is referred to as "merged TSV" file. Scaling of values, dimensionality reduction with [UMAP](https://umap-learn.readthedocs.io/en/latest/) and clustering with [HDBSCAN](https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html) are then applied to the numbers in this TSV file. The locations of clusters along chromosomes are stored in a BED file. diff --git a/gda_clustering.py b/gda_clustering.py index db7dcc0..a4ca444 100755 --- a/gda_clustering.py +++ b/gda_clustering.py @@ -272,10 +272,19 @@ def write_circos_json(json_dict, outfile, outdir): file.write(json.dumps(json_dict)) +def check_umap_version(): + '''Check UMAP version and print a warning if it is not what is expected. UMAP crashes if it's a version that is too old (e.g. 0.4.2)''' + installed_umap_version = umap.__version__ + expected_umap_version = "0.4.6" + if umap.__version__ != expected_umap_version: + sys.stderr.write("Warning: UMAP version appears to be different from what is expected for the GDA pipeline (the installed version is {} but the expected version is {})\n".format(installed_umap_version, expected_umap_version)) + + def main(umap_n_neighbors, hdbscan_min_cluster_size, pvalue_cutoff, cluster_position_histogram_window_number, outdir, tracks_file): ################# # Procedural code ################# + check_umap_version() sys.stderr.write('\t'.join([str(umap_n_neighbors), str(hdbscan_min_cluster_size), str(pvalue_cutoff), tracks_file])) sys.stderr.write('\n') diff --git a/gda_parameters.py b/gda_parameters.py index f0dafb1..d952239 100755 --- a/gda_parameters.py +++ b/gda_parameters.py @@ -41,7 +41,7 @@ import numpy as np from matplotlib.colors import ListedColormap -from gda_clustering import get_palette, run_umap, run_hdbscan +from gda_clustering import get_palette, run_umap, run_hdbscan, check_umap_version from sklearn import metrics # Ignore NUMBA warnings related to running UMAP @@ -185,12 +185,15 @@ def export_metrics_table(metrics_dict, outdir_full): metrics_df = metrics_df.sort_values(["silhouette_score", "unclassified_percentage", "davies_bouldin_index", "calinski_harabasz_score"], ascending=[False, True, True, False]) metrics_outfile = outdir_full + "/clustering_metrics.csv" metrics_df.to_csv(metrics_outfile, index=True) + def main(args): ################# # Procedural code ################# + + check_umap_version() neighbours_list = get_neighbours_list(args.n_neighbors) cluster_size_list = get_cluster_size_list(args.cluster_size_cutoff)