Skip to content

Commit

Permalink
Merge pull request #129 from blab/minor-revisions
Browse files Browse the repository at this point in the history
Minor revisions
  • Loading branch information
huddlej authored Sep 27, 2024
2 parents af8d222 + 54e3d18 commit 0883894
Show file tree
Hide file tree
Showing 26 changed files with 334 additions and 34 deletions.
51 changes: 46 additions & 5 deletions ha-na-nextstrain/2022-02-23-seasonal-flu-ha-na-reassortment.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1473,16 +1473,57 @@
"metadata": {},
"outputs": [],
"source": [
"mcc_labels_pca_ha = get_clade_label_chart(tips_to_keep_df, \"pca1_ha\", \"pca2_ha\", \"mcc_short\")\n",
"mcc_labels_pca_concat = get_clade_label_chart(tips_to_keep_df, \"pca1_concatenated\", \"pca2_concatenated\", \"mcc_short\")\n",
"mcc_labels_pca_ha = get_clade_label_chart(\n",
" tips_to_keep_df,\n",
" \"pca1_ha\",\n",
" \"pca2_ha\",\n",
" \"mcc_short\",\n",
" xoffset_by_label={\n",
" \"9\": 1,\n",
" },\n",
" yoffset_by_label={\n",
" \"12\": 0.5,\n",
" }\n",
")\n",
"mcc_labels_pca_concat = get_clade_label_chart(\n",
" tips_to_keep_df,\n",
" \"pca1_concatenated\",\n",
" \"pca2_concatenated\",\n",
" \"mcc_short\",\n",
" xoffset_by_label={\n",
" \"9\": 1.25,\n",
" },\n",
" yoffset_by_label={\n",
" \"9\": 0.25,\n",
" }\n",
")\n",
"\n",
"mcc_labels_mds_ha = get_clade_label_chart(tips_to_keep_df, \"mds1_ha\", \"mds2_ha\", \"mcc_short\")\n",
"mcc_labels_mds_concat = get_clade_label_chart(tips_to_keep_df, \"mds1_concatenated\", \"mds2_concatenated\", \"mcc_short\")\n",
"\n",
"mcc_labels_tsne_ha = get_clade_label_chart(tips_to_keep_df, \"tsne_x_ha\", \"tsne_y_ha\", \"mcc_short\")\n",
"mcc_labels_tsne_ha = get_clade_label_chart(\n",
" tips_to_keep_df,\n",
" \"tsne_x_ha\",\n",
" \"tsne_y_ha\",\n",
" \"mcc_short\",\n",
" xoffset_by_label={\n",
" \"10\": -1,\n",
" }\n",
")\n",
"mcc_labels_tsne_concat = get_clade_label_chart(tips_to_keep_df, \"tsne_x_concatenated\", \"tsne_y_concatenated\", \"mcc_short\")\n",
"\n",
"mcc_labels_umap_ha = get_clade_label_chart(tips_to_keep_df, \"umap_x_ha\", \"umap_y_ha\", \"mcc_short\")\n",
"mcc_labels_umap_ha = get_clade_label_chart(\n",
" tips_to_keep_df,\n",
" \"umap_x_ha\",\n",
" \"umap_y_ha\",\n",
" \"mcc_short\",\n",
" xoffset_by_label={\n",
" \"12\": -1.5,\n",
" },\n",
" yoffset_by_label={\n",
" \"10\": 1,\n",
" }\n",
")\n",
"mcc_labels_umap_concat = get_clade_label_chart(tips_to_keep_df, \"umap_x_concatenated\", \"umap_y_concatenated\", \"mcc_short\")"
]
},
Expand Down Expand Up @@ -1601,7 +1642,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "302a9e10",
"id": "359312b0",
"metadata": {},
"outputs": [],
"source": []
Expand Down
13 changes: 13 additions & 0 deletions manuscript/cartography.bib
Original file line number Diff line number Diff line change
Expand Up @@ -1100,3 +1100,16 @@ @article{Huddleston2024
doi = {10.5281/zenodo.10846007},
url = {https://doi.org/10.5281/zenodo.10846007}
}

@article{McInnes2017,
doi = {10.21105/joss.00205},
url = {https://doi.org/10.21105/joss.00205},
year = {2017},
month = {Mar},
publisher = {The Open Journal},
volume = {2},
number = {11},
author = {Leland McInnes and John Healy and Steve Astels},
title = {hdbscan: {H}ierarchical density based clustering},
journal = {The Journal of Open Source Software}
}
7 changes: 4 additions & 3 deletions manuscript/cartography.tex
Original file line number Diff line number Diff line change
Expand Up @@ -490,6 +490,7 @@ \subsection{Recommendations for application of methods to new pathogens}
Then, choose which embedding method to use based on the question under investigation.
For analyses that require the most accurate low-dimensional representation of pairwise genetic distances across local and global scales, use MDS with 3 dimensions.
For analyses that need to find clusters of closely related samples, use t-SNE with a perplexity of 100 (or less, if using fewer than 100 samples) and a learning rate that scales with the number of samples in the data.
Since the HDBSCAN algorithm relies on the density of samples in a given coordinate space to find clusters, and samples are less likely to lie close together in higher dimensions \citep{campello2015hierarchical}, we recommend clustering with low-dimensional embeddings of sequences instead of the higher-dimensional pairwise distance matrices.
In all cases, plot the relationship between pairwise genetic distances and Euclidean distances in each embedding.
These plots reveal the range of genetic distances that an embedding can represent linearly and act as a sanity check akin to plotting the temporal signal present in samples prior to inferring a time-scaled phylogeny \citep{Rambaut2016,Sagulenko2018}.
Before finding clusters in the t-SNE embedding, determine the minimum genetic distance desired between clusters, and use the pairwise genetic and Euclidean distance plot to find the corresponding Euclidean distance to use as a threshold for HDBSCAN.
Expand Down Expand Up @@ -620,7 +621,7 @@ \subsection{Selection of natural virus population data}

For analyses that focused only on H3N2 HA data, we defined the early dataset between January 2016 and January 2018 and the late dataset between January 2018 and January 2020.
These datasets reflected two years of recent H3N2 evolution up to the time when the SARS-CoV-2 pandemic disrupted seasonal influenza circulation.
For both early and late datasets, we evenly sampled 25 sequences per country, year, and month.
For both early and late datasets, we evenly sampled 25 sequences per country and per month of each year.
We excluded outliers, which we defined as sequences that were labeled as environmental samples, contained over 100 gap characters within the HA sequence, or were flagged by TreeTime \citep{Sagulenko2018} for having a phylogenetic divergence that exceeded four times the interquartile interval of residuals from a root-to-tip regression for all sequences in the same tree.
With this sampling scheme, we selected 1,523 HA sequences for the early dataset and 1,073 for the late dataset.
For analyses that combined H3N2 HA and NA data, we defined a single dataset between January 2016 and January 2018, keeping 1,607 samples for which both HA and NA have been sequenced.
Expand Down Expand Up @@ -680,7 +681,7 @@ \subsection{Definitions of genetic groups by experts or biologically-informed mo

\subsection{Clustering of samples in embeddings}

To understand how well embeddings of genetic data could capture previously defined genetic groups, we applied an unsupervised clustering algorithm, HDBSCAN \citep{campello2015hierarchical}, to each embedding.
To understand how well embeddings of genetic data could capture previously defined genetic groups, we applied an unsupervised clustering algorithm, HDBSCAN \citep{campello2015hierarchical}, to each embedding using the Python-based \texttt{hdbscan} package \citep{McInnes2017}.
In addition to the four embedding methods, we identified HDBSCAN clusters from a fifth ``method'' of precomputed pairwise genetic distances.
This genetic distance method allowed us to understand how clusters differed between low- and high-dimensional inputs.
HDBSCAN identifies initial clusters from high-density regions in the input space and merges these clusters hierarchically.
Expand Down Expand Up @@ -771,7 +772,7 @@ \section*{Author contributions statement}

\section*{Acknowledgments}

We thank James Hadfield, Katie Kistler, Maya Lewinsohn, Nicola Muller, Louise Moncla, Nidia Trovao, and Michael Zeller for constructive feedback on this project.
We thank James Hadfield, Katie Kistler, Maya Lewinsohn, Nicola Muller, Louise Moncla, Nidia Trovao, Michael Zeller, and anonymous reviewers for their constructive feedback on this project.
We gratefully acknowledge the originating and submitting laboratories of seasonal influenza and SARS-CoV-2 sequences from INSDC databases without whom this work would not be possible (Supplementary Table~S\ref{S_Table_accessions}).

\subsection*{Funding}
Expand Down
2 changes: 2 additions & 0 deletions manuscript/cartography_supplement.tex
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,7 @@ \section*{Supplementary data}
Line segments in each embedding reflect phylogenetic relationships with internal node positions calculated from the mean positions of their immediate descendants in each dimension (see Methods).
Line thickness in the embeddings scales by the square root of the number of leaves descending from a given node in the phylogeny.
Clade labels in the tree and embeddings highlight larger clades.
Where clade labels overlap in an embedding, the most ancestral clade's label is shown.
}
\label{S_Fig_sarscov2_late_embeddings_by_Nextstrain_clade}
\end{figure}
Expand All @@ -236,6 +237,7 @@ \section*{Supplementary data}
Line segments in each embedding reflect phylogenetic relationships with internal node positions calculated from the mean positions of their immediate descendants in each dimension (see Methods).
Line thickness in the embeddings scales by the square root of the number of leaves descending from a given node in the phylogeny.
Clade labels in the tree and embeddings highlight larger Pango lineages.
Where clade labels overlap in an embedding, the most ancestral clade's label is shown.
}
\label{S_Fig_sarscov2_late_embeddings_by_Pango}
\end{figure}
Expand Down

Large diffs are not rendered by default.

Binary file modified manuscript/figures/flu-2016-2018-ha-embeddings-by-clade.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

Large diffs are not rendered by default.

Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

Large diffs are not rendered by default.

Binary file modified manuscript/figures/flu-2016-2018-ha-na-embeddings-by-mcc.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion manuscript/figures/flu-2016-2018-mds-by-clade.html

Large diffs are not rendered by default.

Binary file modified manuscript/figures/flu-2016-2018-mds-by-clade.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

Large diffs are not rendered by default.

Binary file modified manuscript/figures/flu-2018-2020-ha-embeddings-by-clade.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion manuscript/figures/flu-2018-2020-mds-by-clade.html

Large diffs are not rendered by default.

Binary file modified manuscript/figures/flu-2018-2020-mds-by-clade.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

Large diffs are not rendered by default.

Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

Large diffs are not rendered by default.

Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
25 changes: 23 additions & 2 deletions notebooks/scripts/Helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,14 +246,23 @@ def scatterplot_with_tooltip_interactive(finalDf, x, y, Titlex, Titley, ToolTip,
return chart


def get_clade_label_chart(df, x_column, y_column, text_column):
def get_clade_label_chart(
df, x_column, y_column, text_column, drop_labels=None,
xoffset_by_label=None,
yoffset_by_label=None,
):
"""Build an Altair chart of text labels from the given data frame, placing
labels at the mean x and y position of the given x and y columns and using
text from the given text column.
"""
if drop_labels is None:
drop_labels = {"other"}
else:
drop_labels = set(drop_labels) | {"other"}

clade_label_positions = df.loc[
(~df["is_internal_node"]) & (df[text_column] != "other"),
(~df["is_internal_node"]) & (~df[text_column].isin(drop_labels)),
[text_column, x_column, y_column]
].groupby(
text_column
Expand All @@ -262,6 +271,18 @@ def get_clade_label_chart(df, x_column, y_column, text_column):
y_column: "mean",
}).reset_index()

if xoffset_by_label is not None:
clade_label_positions[x_column] = clade_label_positions.apply(
lambda record: record[x_column] + xoffset_by_label.get(record[text_column], 0),
axis=1
).values

if yoffset_by_label is not None:
clade_label_positions[y_column] = clade_label_positions.apply(
lambda record: record[y_column] + yoffset_by_label.get(record[text_column], 0),
axis=1
).values

clade_labels_chart = alt.Chart(clade_label_positions).mark_text().encode(
x=f"{x_column}:Q",
y=f"{y_column}:Q",
Expand Down
1 change: 1 addition & 0 deletions sars-cov-2-nextstrain-2022-2023/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -1013,6 +1013,7 @@ rule sarscov2_test_create_notebook_docs:
umap_label=lambda wildcards: f"umap_label_for_{wildcards.clade_membership}",
plot_branches=False,
clades_to_label=get_late_sarscov2_clades_to_label,
dataset="late",
conda: "../cartography.yml"
notebook:
"../sars-cov-2-nextstrain/2022-03-29-final-figures.py.ipynb"
Expand Down
123 changes: 119 additions & 4 deletions sars-cov-2-nextstrain/2022-03-29-final-figures.py.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,30 @@
"clades_to_label = snakemake.params.clades_to_label"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "70b5efa7",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"dataset = snakemake.params.dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bd5317af",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down Expand Up @@ -965,17 +989,108 @@
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4728db32",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"if dataset == \"late\":\n",
" pca_label_kwargs = {\n",
" \"drop_labels\": {\n",
" \"22B\",\n",
" \"22D\",\n",
" \"22F\",\n",
" \"23A\",\n",
" \"23B\",\n",
" \"BA.2.75\",\n",
" \"BA.5\",\n",
" \"XBB.1\",\n",
" \"XBB.1.5\",\n",
" \"XBB.1.16\",\n",
" },\n",
" \"xoffset_by_label\": {\n",
" \"XBB.1.9\": -0.5,\n",
" }\n",
" }\n",
" mds_label_kwargs = {\n",
" \"drop_labels\": {\n",
" \"23A\",\n",
" \"BA.5\",\n",
" \"XBB.1.5\",\n",
" },\n",
" \"xoffset_by_label\": {\n",
" \"22B\": -12,\n",
" },\n",
" \"yoffset_by_label\": {\n",
" \"22B\": -3,\n",
" }\n",
" }\n",
" tsne_label_kwargs = {\n",
" \"xoffset_by_label\": {\n",
" \"XBB.1.9\": 5,\n",
" },\n",
" }\n",
" umap_label_kwargs = {\n",
" \"drop_labels\": {\n",
" \"23A\",\n",
" \"XBB.1.5\",\n",
" \"XBB.1.9\",\n",
" },\n",
" \"xoffset_by_label\": {\n",
" \"23F\": -1,\n",
" \"XBB.1\": 1,\n",
" },\n",
" \"yoffset_by_label\": {\n",
" \"23D\": 1,\n",
" \"22F\": -1,\n",
" }\n",
" }\n",
"else:\n",
" pca_label_kwargs = {}\n",
" mds_label_kwargs = {}\n",
" tsne_label_kwargs = {}\n",
" umap_label_kwargs = {}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "756eab62",
"metadata": {},
"outputs": [],
"source": [
"clade_labels_for_pca_chart = get_clade_label_chart(label_embeddings_df, \"pca1\", \"pca2\", \"clade_membership_short\")\n",
"clade_labels_for_mds_chart = get_clade_label_chart(label_embeddings_df, \"mds1\", \"mds2\", \"clade_membership_short\")\n",
"clade_labels_for_tsne_chart = get_clade_label_chart(label_embeddings_df, \"tsne_x\", \"tsne_y\", \"clade_membership_short\")\n",
"clade_labels_for_umap_chart = get_clade_label_chart(label_embeddings_df, \"umap_x\", \"umap_y\", \"clade_membership_short\")"
"clade_labels_for_pca_chart = get_clade_label_chart(\n",
" label_embeddings_df,\n",
" \"pca1\",\n",
" \"pca2\",\n",
" \"clade_membership_short\",\n",
" **pca_label_kwargs,\n",
")\n",
"clade_labels_for_mds_chart = get_clade_label_chart(\n",
" label_embeddings_df,\n",
" \"mds1\",\n",
" \"mds2\",\n",
" \"clade_membership_short\",\n",
" **mds_label_kwargs,\n",
")\n",
"clade_labels_for_tsne_chart = get_clade_label_chart(\n",
" label_embeddings_df,\n",
" \"tsne_x\",\n",
" \"tsne_y\",\n",
" \"clade_membership_short\",\n",
" **tsne_label_kwargs,\n",
")\n",
"clade_labels_for_umap_chart = get_clade_label_chart(\n",
" label_embeddings_df,\n",
" \"umap_x\",\n",
" \"umap_y\",\n",
" \"clade_membership_short\",\n",
" **umap_label_kwargs,\n",
")"
]
},
{
Expand Down
1 change: 1 addition & 0 deletions sars-cov-2-nextstrain/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -1001,6 +1001,7 @@ rule sarscov2_create_notebook_docs:
umap_label=lambda wildcards: f"umap_label_for_{wildcards.clade_membership}",
plot_branches=True,
clades_to_label=get_early_sarscov2_clades_to_label,
dataset="early",
conda: "../cartography.yml"
notebook:
"2022-03-29-final-figures.py.ipynb"
Expand Down
Loading

0 comments on commit 0883894

Please sign in to comment.