Merge pull request #129 from blab/minor-revisions

Minor revisions
blab · Sep 27, 2024 · 0883894 · 0883894
2 parents af8d222 + 54e3d18
commit 0883894
Show file tree

Hide file tree

Showing 26 changed files with 334 additions and 34 deletions.
diff --git a/ha-na-nextstrain/2022-02-23-seasonal-flu-ha-na-reassortment.ipynb b/ha-na-nextstrain/2022-02-23-seasonal-flu-ha-na-reassortment.ipynb
@@ -1473,16 +1473,57 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "mcc_labels_pca_ha = get_clade_label_chart(tips_to_keep_df, \"pca1_ha\", \"pca2_ha\", \"mcc_short\")\n",
-    "mcc_labels_pca_concat = get_clade_label_chart(tips_to_keep_df, \"pca1_concatenated\", \"pca2_concatenated\", \"mcc_short\")\n",
+    "mcc_labels_pca_ha = get_clade_label_chart(\n",
+    "    tips_to_keep_df,\n",
+    "    \"pca1_ha\",\n",
+    "    \"pca2_ha\",\n",
+    "    \"mcc_short\",\n",
+    "    xoffset_by_label={\n",
+    "        \"9\": 1,\n",
+    "    },\n",
+    "    yoffset_by_label={\n",
+    "        \"12\": 0.5,\n",
+    "    }\n",
+    ")\n",
+    "mcc_labels_pca_concat = get_clade_label_chart(\n",
+    "    tips_to_keep_df,\n",
+    "    \"pca1_concatenated\",\n",
+    "    \"pca2_concatenated\",\n",
+    "    \"mcc_short\",\n",
+    "    xoffset_by_label={\n",
+    "        \"9\": 1.25,\n",
+    "    },\n",
+    "    yoffset_by_label={\n",
+    "        \"9\": 0.25,\n",
+    "    }\n",
+    ")\n",
     "\n",
     "mcc_labels_mds_ha = get_clade_label_chart(tips_to_keep_df, \"mds1_ha\", \"mds2_ha\", \"mcc_short\")\n",
     "mcc_labels_mds_concat = get_clade_label_chart(tips_to_keep_df, \"mds1_concatenated\", \"mds2_concatenated\", \"mcc_short\")\n",
     "\n",
-    "mcc_labels_tsne_ha = get_clade_label_chart(tips_to_keep_df, \"tsne_x_ha\", \"tsne_y_ha\", \"mcc_short\")\n",
+    "mcc_labels_tsne_ha = get_clade_label_chart(\n",
+    "    tips_to_keep_df,\n",
+    "    \"tsne_x_ha\",\n",
+    "    \"tsne_y_ha\",\n",
+    "    \"mcc_short\",\n",
+    "    xoffset_by_label={\n",
+    "        \"10\": -1,\n",
+    "    }\n",
+    ")\n",
     "mcc_labels_tsne_concat = get_clade_label_chart(tips_to_keep_df, \"tsne_x_concatenated\", \"tsne_y_concatenated\", \"mcc_short\")\n",
     "\n",
-    "mcc_labels_umap_ha = get_clade_label_chart(tips_to_keep_df, \"umap_x_ha\", \"umap_y_ha\", \"mcc_short\")\n",
+    "mcc_labels_umap_ha = get_clade_label_chart(\n",
+    "    tips_to_keep_df,\n",
+    "    \"umap_x_ha\",\n",
+    "    \"umap_y_ha\",\n",
+    "    \"mcc_short\",\n",
+    "    xoffset_by_label={\n",
+    "        \"12\": -1.5,\n",
+    "    },\n",
+    "    yoffset_by_label={\n",
+    "        \"10\": 1,\n",
+    "    }\n",
+    ")\n",
     "mcc_labels_umap_concat = get_clade_label_chart(tips_to_keep_df, \"umap_x_concatenated\", \"umap_y_concatenated\", \"mcc_short\")"
    ]
   },
@@ -1601,7 +1642,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "302a9e10",
+   "id": "359312b0",
    "metadata": {},
    "outputs": [],
    "source": []

diff --git a/manuscript/cartography.bib b/manuscript/cartography.bib
@@ -1100,3 +1100,16 @@ @article{Huddleston2024
   doi          = {10.5281/zenodo.10846007},
   url          = {https://doi.org/10.5281/zenodo.10846007}
 }
+
+@article{McInnes2017,
+  doi = {10.21105/joss.00205},
+  url = {https://doi.org/10.21105/joss.00205},
+  year  = {2017},
+  month = {Mar},
+  publisher = {The Open Journal},
+  volume = {2},
+  number = {11},
+  author = {Leland McInnes and John Healy and Steve Astels},
+  title = {hdbscan: {H}ierarchical density based clustering},
+  journal = {The Journal of Open Source Software}
+}
diff --git a/manuscript/cartography.tex b/manuscript/cartography.tex
@@ -490,6 +490,7 @@ \subsection{Recommendations for application of methods to new pathogens}
 Then, choose which embedding method to use based on the question under investigation.
 For analyses that require the most accurate low-dimensional representation of pairwise genetic distances across local and global scales, use MDS with 3 dimensions.
 For analyses that need to find clusters of closely related samples, use t-SNE with a perplexity of 100 (or less, if using fewer than 100 samples) and a learning rate that scales with the number of samples in the data.
+Because the HDBSCAN algorithm relies on the density of samples in a given coordinate space to find clusters, and samples are less likely to be placed close together in higher dimensions \citep{campello2015hierarchical}, we recommend clustering with low-dimensional embeddings of sequences instead of the higher-dimensional pairwise distance matrices.
 In all cases, plot the relationship between pairwise genetic distances and Euclidean distances in each embedding.
 These plots reveal the range of genetic distances that an embedding can represent linearly and act as a sanity check akin to plotting the temporal signal present in samples prior to inferring a time-scaled phylogeny \citep{Rambaut2016,Sagulenko2018}.
 Before finding clusters in the t-SNE embedding, determine the minimum genetic distance desired between clusters, and use the pairwise genetic and Euclidean distance plot to find the corresponding Euclidean distance to use as a threshold for HDBSCAN.
@@ -620,7 +621,7 @@ \subsection{Selection of natural virus population data}
 
 For analyses that focused only on H3N2 HA data, we defined the early dataset between January 2016 and January 2018 and the late dataset between January 2018 and January 2020.
 These datasets reflected two years of recent H3N2 evolution up to the time when the SARS-CoV-2 pandemic disrupted seasonal influenza circulation.
-For both early and late datasets, we evenly sampled 25 sequences per country, year, and month.
+For both early and late datasets, we evenly sampled 25 sequences per country and per month of each year.
 We excluded outliers, which were sequences that were labeled as environmental samples, contained over 100 gap characters within the HA sequence, or were flagged by TreeTime \citep{Sagulenko2018} for having a phylogenetic divergence that exceeded four times the interquartile interval of residuals from a root-to-tip regression for all sequences in the same tree.
 With this sampling scheme, we selected 1,523 HA sequences for the early dataset and 1,073 for the late dataset.
 For analyses that combined H3N2 HA and NA data, we defined a single dataset between January 2016 and January 2018, keeping 1,607 samples for which both HA and NA have been sequenced.
@@ -680,7 +681,7 @@ \subsection{Definitions of genetic groups by experts or biologically-informed mo
 
 \subsection{Clustering of samples in embeddings}
 
-To understand how well embeddings of genetic data could capture previously defined genetic groups, we applied an unsupervised clustering algorithm, HDBSCAN \citep{campello2015hierarchical}, to each embedding.
+To understand how well embeddings of genetic data could capture previously defined genetic groups, we applied an unsupervised clustering algorithm, HDBSCAN \citep{campello2015hierarchical}, to each embedding using the Python-based \texttt{hdbscan} package \citep{McInnes2017}.
 In addition to the four embedding methods, we identified HDBSCAN clusters from a fifth ``method'' of precomputed pairwise genetic distances.
 This genetic distance method allowed us to understand how clusters differed between low- and high-dimensional inputs.
 HDBSCAN identifies initial clusters from high-density regions in the input space and merges these clusters hierarchically.
@@ -771,7 +772,7 @@ \section*{Author contributions statement}
 
 \section*{Acknowledgments}
 
-We thank James Hadfield, Katie Kistler, Maya Lewinsohn, Nicola Muller, Louise Moncla, Nidia Trovao, and Michael Zeller for constructive feedback on this project.
+We thank James Hadfield, Katie Kistler, Maya Lewinsohn, Nicola Muller, Louise Moncla, Nidia Trovao, Michael Zeller, and anonymous reviewers for their constructive feedback on this project.
 We gratefully acknowledge the originating and submitting laboratories of seasonal influenza and SARS-CoV-2 sequences from INSDC databases without whom this work would not be possible (Supplementary Table~S\ref{S_Table_accessions}).
 
 \subsection*{Funding}

diff --git a/manuscript/cartography_supplement.tex b/manuscript/cartography_supplement.tex
@@ -225,6 +225,7 @@ \section*{Supplementary data}
   Line segments in each embedding reflect phylogenetic relationships with internal node positions calculated from the mean positions of their immediate descendants in each dimension (see Methods).
   Line thickness in the embeddings scales by the square root of the number of leaves descending from a given node in the phylogeny.
   Clade labels in the tree and embeddings highlight larger clades.
+  Where clade labels overlap in an embedding, the most ancestral clade's label is shown.
 }
 \label{S_Fig_sarscov2_late_embeddings_by_Nextstrain_clade}
 \end{figure}
@@ -236,6 +237,7 @@ \section*{Supplementary data}
   Line segments in each embedding reflect phylogenetic relationships with internal node positions calculated from the mean positions of their immediate descendants in each dimension (see Methods).
   Line thickness in the embeddings scales by the square root of the number of leaves descending from a given node in the phylogeny.
   Clade labels in the tree and embeddings highlight larger Pango lineages.
+  Where clade labels overlap in an embedding, the most ancestral clade's label is shown.
 }
 \label{S_Fig_sarscov2_late_embeddings_by_Pango}
 \end{figure}

diff --git a/manuscript/figures/flu-2016-2018-ha-embeddings-by-clade.html b/manuscript/figures/flu-2016-2018-ha-embeddings-by-clade.html
diff --git a/manuscript/figures/flu-2016-2018-ha-embeddings-by-clade.png b/manuscript/figures/flu-2016-2018-ha-embeddings-by-clade.png
diff --git a/manuscript/figures/flu-2016-2018-ha-na-all-embeddings-by-mcc.html b/manuscript/figures/flu-2016-2018-ha-na-all-embeddings-by-mcc.html
diff --git a/manuscript/figures/flu-2016-2018-ha-na-all-embeddings-by-mcc.png b/manuscript/figures/flu-2016-2018-ha-na-all-embeddings-by-mcc.png
diff --git a/manuscript/figures/flu-2016-2018-ha-na-embeddings-by-mcc.html b/manuscript/figures/flu-2016-2018-ha-na-embeddings-by-mcc.html
diff --git a/manuscript/figures/flu-2016-2018-ha-na-embeddings-by-mcc.png b/manuscript/figures/flu-2016-2018-ha-na-embeddings-by-mcc.png
diff --git a/manuscript/figures/flu-2016-2018-mds-by-clade.html b/manuscript/figures/flu-2016-2018-mds-by-clade.html
diff --git a/manuscript/figures/flu-2016-2018-mds-by-clade.png b/manuscript/figures/flu-2016-2018-mds-by-clade.png
diff --git a/manuscript/figures/flu-2018-2020-ha-embeddings-by-clade.html b/manuscript/figures/flu-2018-2020-ha-embeddings-by-clade.html
diff --git a/manuscript/figures/flu-2018-2020-ha-embeddings-by-clade.png b/manuscript/figures/flu-2018-2020-ha-embeddings-by-clade.png
diff --git a/manuscript/figures/flu-2018-2020-mds-by-clade.html b/manuscript/figures/flu-2018-2020-mds-by-clade.html
diff --git a/manuscript/figures/flu-2018-2020-mds-by-clade.png b/manuscript/figures/flu-2018-2020-mds-by-clade.png
diff --git a/manuscript/figures/sarscov2-test-embeddings-by-Nextclade_pango_collapsed-clade.html b/manuscript/figures/sarscov2-test-embeddings-by-Nextclade_pango_collapsed-clade.html
diff --git a/manuscript/figures/sarscov2-test-embeddings-by-Nextclade_pango_collapsed-clade.png b/manuscript/figures/sarscov2-test-embeddings-by-Nextclade_pango_collapsed-clade.png
diff --git a/manuscript/figures/sarscov2-test-embeddings-by-Nextstrain_clade-clade.html b/manuscript/figures/sarscov2-test-embeddings-by-Nextstrain_clade-clade.html
diff --git a/manuscript/figures/sarscov2-test-embeddings-by-Nextstrain_clade-clade.png b/manuscript/figures/sarscov2-test-embeddings-by-Nextstrain_clade-clade.png
diff --git a/notebooks/scripts/Helpers.py b/notebooks/scripts/Helpers.py
@@ -246,14 +246,23 @@ def scatterplot_with_tooltip_interactive(finalDf, x, y, Titlex, Titley, ToolTip,
     return chart
 
 
-def get_clade_label_chart(df, x_column, y_column, text_column):
+def get_clade_label_chart(
+    df, x_column, y_column, text_column, drop_labels=None,
+    xoffset_by_label=None,
+    yoffset_by_label=None,
+):
     """Build an Altair chart of text labels from the given data frame, placing
     labels at the mean x and y position of the given x and y columns and using
     text from the given text column.
 
     """
+    if drop_labels is None:
+        drop_labels = {"other"}
+    else:
+        drop_labels = set(drop_labels) | {"other"}
+
     clade_label_positions = df.loc[
-        (~df["is_internal_node"]) & (df[text_column] != "other"),
+        (~df["is_internal_node"]) & (~df[text_column].isin(drop_labels)),
         [text_column, x_column, y_column]
     ].groupby(
         text_column
@@ -262,6 +271,18 @@ def get_clade_label_chart(df, x_column, y_column, text_column):
         y_column: "mean",
     }).reset_index()
 
+    if xoffset_by_label is not None:
+        clade_label_positions[x_column] = clade_label_positions.apply(
+            lambda record: record[x_column] + xoffset_by_label.get(record[text_column], 0),
+            axis=1
+        ).values
+
+    if yoffset_by_label is not None:
+        clade_label_positions[y_column] = clade_label_positions.apply(
+            lambda record: record[y_column] + yoffset_by_label.get(record[text_column], 0),
+            axis=1
+        ).values
+
     clade_labels_chart = alt.Chart(clade_label_positions).mark_text().encode(
         x=f"{x_column}:Q",
         y=f"{y_column}:Q",

diff --git a/sars-cov-2-nextstrain-2022-2023/Snakefile b/sars-cov-2-nextstrain-2022-2023/Snakefile
@@ -1013,6 +1013,7 @@ rule sarscov2_test_create_notebook_docs:
         umap_label=lambda wildcards: f"umap_label_for_{wildcards.clade_membership}",
         plot_branches=False,
         clades_to_label=get_late_sarscov2_clades_to_label,
+        dataset="late",
     conda: "../cartography.yml"
     notebook:
         "../sars-cov-2-nextstrain/2022-03-29-final-figures.py.ipynb"

diff --git a/sars-cov-2-nextstrain/2022-03-29-final-figures.py.ipynb b/sars-cov-2-nextstrain/2022-03-29-final-figures.py.ipynb
@@ -150,6 +150,30 @@
     "clades_to_label = snakemake.params.clades_to_label"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "70b5efa7",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "dataset = snakemake.params.dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bd5317af",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "dataset"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -965,17 +989,108 @@
     ")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4728db32",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "if dataset == \"late\":\n",
+    "    pca_label_kwargs = {\n",
+    "        \"drop_labels\": {\n",
+    "            \"22B\",\n",
+    "            \"22D\",\n",
+    "            \"22F\",\n",
+    "            \"23A\",\n",
+    "            \"23B\",\n",
+    "            \"BA.2.75\",\n",
+    "            \"BA.5\",\n",
+    "            \"XBB.1\",\n",
+    "            \"XBB.1.5\",\n",
+    "            \"XBB.1.16\",\n",
+    "        },\n",
+    "        \"xoffset_by_label\": {\n",
+    "            \"XBB.1.9\": -0.5,\n",
+    "        }\n",
+    "    }\n",
+    "    mds_label_kwargs = {\n",
+    "        \"drop_labels\": {\n",
+    "            \"23A\",\n",
+    "            \"BA.5\",\n",
+    "            \"XBB.1.5\",\n",
+    "        },\n",
+    "        \"xoffset_by_label\": {\n",
+    "            \"22B\": -12,\n",
+    "        },\n",
+    "        \"yoffset_by_label\": {\n",
+    "            \"22B\": -3,\n",
+    "        }\n",
+    "    }\n",
+    "    tsne_label_kwargs = {\n",
+    "        \"xoffset_by_label\": {\n",
+    "            \"XBB.1.9\": 5,\n",
+    "        },\n",
+    "    }\n",
+    "    umap_label_kwargs = {\n",
+    "        \"drop_labels\": {\n",
+    "            \"23A\",\n",
+    "            \"XBB.1.5\",\n",
+    "            \"XBB.1.9\",\n",
+    "        },\n",
+    "        \"xoffset_by_label\": {\n",
+    "            \"23F\": -1,\n",
+    "            \"XBB.1\": 1,\n",
+    "        },\n",
+    "        \"yoffset_by_label\": {\n",
+    "            \"23D\": 1,\n",
+    "            \"22F\": -1,\n",
+    "        }\n",
+    "    }\n",
+    "else:\n",
+    "    pca_label_kwargs = {}\n",
+    "    mds_label_kwargs = {}\n",
+    "    tsne_label_kwargs = {}\n",
+    "    umap_label_kwargs = {}"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "id": "756eab62",
    "metadata": {},
    "outputs": [],
    "source": [
-    "clade_labels_for_pca_chart = get_clade_label_chart(label_embeddings_df, \"pca1\", \"pca2\", \"clade_membership_short\")\n",
-    "clade_labels_for_mds_chart = get_clade_label_chart(label_embeddings_df, \"mds1\", \"mds2\", \"clade_membership_short\")\n",
-    "clade_labels_for_tsne_chart = get_clade_label_chart(label_embeddings_df, \"tsne_x\", \"tsne_y\", \"clade_membership_short\")\n",
-    "clade_labels_for_umap_chart = get_clade_label_chart(label_embeddings_df, \"umap_x\", \"umap_y\", \"clade_membership_short\")"
+    "clade_labels_for_pca_chart = get_clade_label_chart(\n",
+    "    label_embeddings_df,\n",
+    "    \"pca1\",\n",
+    "    \"pca2\",\n",
+    "    \"clade_membership_short\",\n",
+    "    **pca_label_kwargs,\n",
+    ")\n",
+    "clade_labels_for_mds_chart = get_clade_label_chart(\n",
+    "    label_embeddings_df,\n",
+    "    \"mds1\",\n",
+    "    \"mds2\",\n",
+    "    \"clade_membership_short\",\n",
+    "    **mds_label_kwargs,\n",
+    ")\n",
+    "clade_labels_for_tsne_chart = get_clade_label_chart(\n",
+    "    label_embeddings_df,\n",
+    "    \"tsne_x\",\n",
+    "    \"tsne_y\",\n",
+    "    \"clade_membership_short\",\n",
+    "    **tsne_label_kwargs,\n",
+    ")\n",
+    "clade_labels_for_umap_chart = get_clade_label_chart(\n",
+    "    label_embeddings_df,\n",
+    "    \"umap_x\",\n",
+    "    \"umap_y\",\n",
+    "    \"clade_membership_short\",\n",
+    "    **umap_label_kwargs,\n",
+    ")"
    ]
   },
   {

diff --git a/sars-cov-2-nextstrain/Snakefile b/sars-cov-2-nextstrain/Snakefile
@@ -1001,6 +1001,7 @@ rule sarscov2_create_notebook_docs:
         umap_label=lambda wildcards: f"umap_label_for_{wildcards.clade_membership}",
         plot_branches=True,
         clades_to_label=get_early_sarscov2_clades_to_label,
+        dataset="early",
     conda: "../cartography.yml"
     notebook:
         "2022-03-29-final-figures.py.ipynb"