From 5d438e35c2e91f41b04aa69f19c49b9e9f4ae0e9 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Fri, 26 Apr 2024 15:05:57 -0700 Subject: [PATCH] Plot counts per recombinant lineage/t-SNE cluster Related to #92 --- sars-cov-2-nextstrain-2022-2023/Snakefile | 1 + .../2022-03-29-final-figures.py.ipynb | 161 +++++++++++++++++- 2 files changed, 161 insertions(+), 1 deletion(-) diff --git a/sars-cov-2-nextstrain-2022-2023/Snakefile b/sars-cov-2-nextstrain-2022-2023/Snakefile index ec24a1cc..d643ac1b 100644 --- a/sars-cov-2-nextstrain-2022-2023/Snakefile +++ b/sars-cov-2-nextstrain-2022-2023/Snakefile @@ -908,6 +908,7 @@ rule sarscov2_test_create_notebook_docs: PCA_Supplement_PNG="manuscript/figures/sarscov2-test-pca-by-{clade_membership}-clade.png", MDS_Supplement="manuscript/figures/sarscov2-test-mds-by-{clade_membership}-clade.html", MDS_Supplement_PNG="manuscript/figures/sarscov2-test-mds-by-{clade_membership}-clade.png", + tsne_recombinant_counts="manuscript/figures/sarscov2-test-tsne-recombinant-counts-{clade_membership}.png", params: clade_membership=lambda wildcards: wildcards.clade_membership, pca_label=lambda wildcards: f"pca_label_for_{wildcards.clade_membership}", diff --git a/sars-cov-2-nextstrain/2022-03-29-final-figures.py.ipynb b/sars-cov-2-nextstrain/2022-03-29-final-figures.py.ipynb index bce40c89..cb71325c 100644 --- a/sars-cov-2-nextstrain/2022-03-29-final-figures.py.ipynb +++ b/sars-cov-2-nextstrain/2022-03-29-final-figures.py.ipynb @@ -71,7 +71,9 @@ "static_pca_chart = snakemake.output.PCA_Supplement_PNG\n", "\n", "interactive_mds_chart = snakemake.output.MDS_Supplement\n", - "static_mds_chart = snakemake.output.MDS_Supplement_PNG" + "static_mds_chart = snakemake.output.MDS_Supplement_PNG\n", + "\n", + "output_tsne_recombinant_counts_png = snakemake.output.tsne_recombinant_counts" ] }, { @@ -1132,6 +1134,163 @@ "full_chart_by_cluster.save(static_chart_by_clusters, format=\"png\", scale_factor=2.0)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ca62b00", + "metadata": {}, + "outputs": [], + "source": [ + "tsne_label_column = [column for column in embeddings_df.columns if column.startswith(\"t-sne_label\")][0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8059ea3a", + "metadata": {}, + "outputs": [], + "source": [ + "tsne_label_column" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a15fbda6", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b2f3af09", + "metadata": {}, + "outputs": [], + "source": [ + "embeddings_df[tsne_label_column] != -1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a548586a", + "metadata": {}, + "outputs": [], + "source": [ + "(embeddings_df[\"Nextclade_pango_collapsed\"].str.startswith(\"X\").fillna(False))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bfa689b3", + "metadata": {}, + "outputs": [], + "source": [ + "embeddings_df.loc[embeddings_df[\"is_internal_node\"] == False]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b249281c", + "metadata": {}, + "outputs": [], + "source": [ + "tsne_recombinant_counts = embeddings_df.loc[\n", + " (\n", + " (embeddings_df[\"is_internal_node\"] == False) &\n", + " (embeddings_df[\"Nextclade_pango_collapsed\"].str.startswith(\"X\").fillna(False)) &\n", + " (embeddings_df[tsne_label_column] != -1)\n", + " ),\n", + " [\n", + " \"Nextclade_pango_collapsed\",\n", + " tsne_label_column,\n", + " ]\n", + "].value_counts().reset_index(name=\"count\").query(\"count >= 10\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae75274e", + "metadata": {}, + "outputs": [], + "source": [ + "tsne_recombinant_counts.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d0670bb6", + "metadata": {}, + "outputs": [], + "source": [ + "tsne_recombinant_counts_chart = alt.Chart(tsne_recombinant_counts).mark_circle().encode(\n", + " x=alt.X(\"Nextclade_pango_collapsed:N\", title=\"Recombinant Pango lineage\"),\n", + " y=alt.Y(f\"{tsne_label_column}:N\", title=\"Cluster from t-SNE\"),\n", + " size=\"count:Q\",\n", + " tooltip=[\"Nextclade_pango_collapsed:N\", f\"{tsne_label_column}:N\", \"count:Q\"],\n", + ").properties(\n", + " width=600,\n", + " height=600,\n", + ")\n", + "tsne_recombinant_counts_chart" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f4dad63", + "metadata": {}, + "outputs": [], + "source": [ + "tsne_recombinant_counts_chart.save(output_tsne_recombinant_counts_png, format=\"png\", scale_factor=2.0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee0c9198", + "metadata": {}, + "outputs": [], + "source": [ + "tsne_recombinant_counts[\"Nextclade_pango_collapsed\"].value_counts().shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56c46ef2", + "metadata": {}, + "outputs": [], + "source": [ + "(tsne_recombinant_counts[\"Nextclade_pango_collapsed\"].value_counts() == 1).sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0b7d48f", + "metadata": {}, + "outputs": [], + "source": [ + "tsne_recombinant_counts[tsne_label_column].value_counts().shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "942f5d1f", + "metadata": {}, + "outputs": [], + "source": [ + "(tsne_recombinant_counts[tsne_label_column].value_counts() == 1).sum()" + ] + }, { "cell_type": "markdown", "id": "e303df26",