diff --git a/ha-na-nextstrain/2022-02-23-seasonal-flu-ha-na-reassortment.ipynb b/ha-na-nextstrain/2022-02-23-seasonal-flu-ha-na-reassortment.ipynb index 1cfc4d14..c2d7647c 100644 --- a/ha-na-nextstrain/2022-02-23-seasonal-flu-ha-na-reassortment.ipynb +++ b/ha-na-nextstrain/2022-02-23-seasonal-flu-ha-na-reassortment.ipynb @@ -75,6 +75,7 @@ "output_mds_png = snakemake.output.HANAFullChartBrushableMDSPNG\n", "output_tsne_html = snakemake.output.HANAFullChartBrushableTSNEHTML\n", "output_tsne_png = snakemake.output.HANAFullChartBrushableTSNEPNG\n", + "output_tsne_mcc_counts_png = snakemake.output.tsne_mcc_counts\n", "output_umap_html = snakemake.output.HANAFullChartBrushableUMAPHTML\n", "output_umap_png = snakemake.output.HANAFullChartBrushableUMAPPNG\n", "output_ha_na_html = snakemake.output.HANAChartHTML\n", @@ -964,6 +965,158 @@ "tsne_final_chart.save(output_tsne_png, format=\"png\", scale_factor=2.0)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "aacebf2a", + "metadata": {}, + "outputs": [], + "source": [ + "tsne_MCC_counts = embeddings_df.loc[\n", + " (\n", + " (embeddings_df[\"is_internal_node\"] == False) &\n", + " (embeddings_df[\"MCC\"] != \"unassigned\") &\n", + " (embeddings_df[\"t-sne_concatenated_label\"] != -1)\n", + " ),\n", + " [\"MCC\", \"t-sne_concatenated_label\"]\n", + "].value_counts().reset_index(name=\"count\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39522c1f", + "metadata": {}, + "outputs": [], + "source": [ + "tsne_MCC_counts[\"MCC_number\"] = tsne_MCC_counts[\"MCC\"].apply(lambda mcc: int(mcc.split(\"_\")[-1]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e794f98", + "metadata": {}, + "outputs": [], + "source": [ + "tsne_MCC_counts = tsne_MCC_counts.sort_values([\"t-sne_concatenated_label\", \"MCC_number\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "246332c3", + "metadata": {}, + "outputs": [], + "source": [ + "tsne_MCC_counts[\"t-sne_concatenated_label\"] = tsne_MCC_counts[\"t-sne_concatenated_label\"].astype(int).astype(str)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b586e9b", + "metadata": {}, + "outputs": [], + "source": [ + "tsne_MCC_counts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cde59df4", + "metadata": {}, + "outputs": [], + "source": [ + "tsne_mcc_counts_chart = alt.Chart(tsne_MCC_counts.query(\"count >= 10\")).mark_circle().encode(\n", + " x=alt.X(\"MCC:N\", title=\"Reassortment group from TreeKnit\", sort={\"field\": \"MCC_number\"}),\n", + " y=alt.Y(\"t-sne_concatenated_label:N\", title=\"Cluster from t-SNE of HA and NA\", sort=None),\n", + " size=\"count:Q\",\n", + ")\n", + "tsne_mcc_counts_chart" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b1ea4e2", + "metadata": {}, + "outputs": [], + "source": [ + "tsne_mcc_counts_chart.save(output_tsne_mcc_counts_png, format=\"png\", scale_factor=2.0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "efb96a6b", + "metadata": {}, + "outputs": [], + "source": [ + "tsne_MCC_counts[\"MCC\"].drop_duplicates().shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a1459d4", + "metadata": {}, + "outputs": [], + "source": [ + "embeddings_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f3b31327", + "metadata": {}, + "outputs": [], + "source": [ + "tsne_clade_counts = embeddings_df.loc[\n", + " (\n", + " (embeddings_df[\"is_internal_node\"] == False) &\n", + " (embeddings_df[\"t-sne_concatenated_label\"] != -1)\n", + " ),\n", + " [\"clade_membership\", \"t-sne_concatenated_label\"]\n", + "].value_counts().reset_index(name=\"count\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c828c993", + "metadata": {}, + "outputs": [], + "source": [ + "tsne_clade_counts[\"t-sne_concatenated_label\"] = tsne_clade_counts[\"t-sne_concatenated_label\"].astype(int).astype(str)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2c24b8a0", + "metadata": {}, + "outputs": [], + "source": [ + "tsne_clade_counts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1550932d", + "metadata": {}, + "outputs": [], + "source": [ + "alt.Chart(tsne_clade_counts.query(\"count >= 10\")).mark_circle().encode(\n", + " x=\"clade_membership:N\",\n", + " y=\"t-sne_concatenated_label:N\",\n", + " size=\"count:Q\",\n", + ")" + ] + }, { "cell_type": "markdown", "id": "ce97a775", diff --git a/ha-na-nextstrain/Snakefile b/ha-na-nextstrain/Snakefile index 2cb18a84..d8163509 100644 --- a/ha-na-nextstrain/Snakefile +++ b/ha-na-nextstrain/Snakefile @@ -758,6 +758,7 @@ rule seasonal_flu_reassortment_create_notebook_docs: HANAFullChartBrushableMDSPNG = "manuscript/figures/flu-2016-2018-ha-na-mds-by-cluster.png", HANAFullChartBrushableTSNEHTML = "manuscript/figures/flu-2016-2018-ha-na-tsne-by-cluster.html", HANAFullChartBrushableTSNEPNG = "manuscript/figures/flu-2016-2018-ha-na-tsne-by-cluster.png", + tsne_mcc_counts = "manuscript/figures/flu-2016-2018-ha-na-tsne-mcc-counts.png", HANAFullChartBrushableUMAPHTML = "manuscript/figures/flu-2016-2018-ha-na-umap-by-cluster.html", HANAFullChartBrushableUMAPPNG = "manuscript/figures/flu-2016-2018-ha-na-umap-by-cluster.png", params: