From f6a5789e4ffb0827333f4f83fc0918da452f2b3b Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Thu, 9 Feb 2023 12:35:03 -0800 Subject: [PATCH 1/4] Parameterize Nextclade dataset in config Moves hardcoded Nextclade dataset name into the top-level of the build configuration, allowing users to choose which data to use for their final alignments, QC scores, etc. As a side effect of this flexibility, users can select a dataset like "sars-cov-2-21L" which provides additional metadata annotations in the QC output that users would like to include (immune escape and ACE2 binding scores). --- defaults/parameters.yaml | 1 + workflow/snakemake_rules/main_workflow.smk | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/defaults/parameters.yaml b/defaults/parameters.yaml index ae71149fa..4cf6391b4 100644 --- a/defaults/parameters.yaml +++ b/defaults/parameters.yaml @@ -47,6 +47,7 @@ sanitize_metadata: - "GC-Content=gc_content" reference_node_name: "USA/WA1/2020" +nextclade_dataset: sars-cov-2 # Define files used for external configuration. 
Common examples consist of a # list of strains to include and exclude from analyses, a reference sequence to diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk index c532e7177..bfb0bf7aa 100644 --- a/workflow/snakemake_rules/main_workflow.smk +++ b/workflow/snakemake_rules/main_workflow.smk @@ -458,7 +458,7 @@ rule prepare_nextclade: output: nextclade_dataset = "data/sars-cov-2-nextclade-defaults.zip", params: - name = "sars-cov-2", + name = config["nextclade_dataset"], conda: config["conda_environment"] shell: """ From 6f3f40cfe86e28b140ba9235748f7e3b007a6805 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Thu, 9 Feb 2023 12:36:36 -0800 Subject: [PATCH 2/4] Include immune escape and ACE2 scores if available Adds colorings to the default Auspice config JSON for immune escape and ACE2 binding scores and updates the list of columns to merge into the metadata from Nextclade's QC output. This change allows users to get these scores in their builds by changing the `nextclade_dataset` option in their build config to "sars-cov-2-21L", but it has no effect on the behavior of the default dataset, "sars-cov-2". 
--- defaults/auspice_config.json | 10 ++++++++++ scripts/join-metadata-and-clades.py | 7 +++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/defaults/auspice_config.json b/defaults/auspice_config.json index 7e0894a28..3c646bcdc 100644 --- a/defaults/auspice_config.json +++ b/defaults/auspice_config.json @@ -10,6 +10,16 @@ "title": "Emerging Lineage", "type": "categorical" }, + { + "key": "immune_escape", + "title": "Immune Escape vs BA.2", + "type": "continuous" + }, + { + "key": "ace2_binding", + "title": "ACE2 binding vs BA.2", + "type": "continuous" + }, { "key": "pango_lineage", "title": "PANGO Lineage", diff --git a/scripts/join-metadata-and-clades.py b/scripts/join-metadata-and-clades.py index 108f7770c..dccba30aa 100644 --- a/scripts/join-metadata-and-clades.py +++ b/scripts/join-metadata-and-clades.py @@ -37,7 +37,9 @@ "deletions": "deletions", "insertions": "insertions", "substitutions": "substitutions", - "aaSubstitutions": "aaSubstitutions" + "aaSubstitutions": "aaSubstitutions", + "immune_escape": "immune_escape", + "ace2_binding": "ace2_binding", } preferred_types = { @@ -137,7 +139,8 @@ def main(): result.loc[np.isnan(div_array)|np.isnan(t), "clock_deviation"] = np.nan for col in list(column_map.values()) + ["clock_deviation"]: - result[col] = result[col].fillna(VALUE_MISSING_DATA) + if col in result: + result[col] = result[col].fillna(VALUE_MISSING_DATA) # Move the new column so that it's next to other clade columns if INSERT_BEFORE_THIS_COLUMN in result.columns: From 9fe7e650e32c83d0c31d26626273c2d3c9148b3e Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Thu, 16 Mar 2023 15:52:32 -0700 Subject: [PATCH 3/4] Correct year for most recent feature --- docs/src/reference/change_log.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/reference/change_log.md b/docs/src/reference/change_log.md index abd7d707f..d26a69bf0 100644 --- a/docs/src/reference/change_log.md +++ b/docs/src/reference/change_log.md @@ 
-5,7 +5,7 @@ We also use this change log to document new features that maintain backward comp ## New features since last version update -- 30 January 2022: Include new clade 23A correspoding to Pango lineage XBB.1.5. See [PR 1043](https://github.com/nextstrain/ncov/pull/1043) for the rationale behind this clade update. +- 30 January 2023: Include new clade 23A corresponding to Pango lineage XBB.1.5. See [PR 1043](https://github.com/nextstrain/ncov/pull/1043) for the rationale behind this clade update. - 9 December 2022: Add `immune escape` and `ace2_binding` from metadata as colorings for `nextstrain-open` and `nextstrain-gisaid` builds. [PR 1036](https://github.com/nextstrain/ncov/pull/1036) From f80fc0c06d6f4d8b6b0257d053af2c11e0549006 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Thu, 16 Mar 2023 16:07:54 -0700 Subject: [PATCH 4/4] Note new feature in changelog and reference docs --- docs/src/reference/change_log.md | 2 ++ docs/src/reference/workflow-config-file.rst | 11 +++++++++++ 2 files changed, 13 insertions(+) diff --git a/docs/src/reference/change_log.md b/docs/src/reference/change_log.md index d26a69bf0..5372f596c 100644 --- a/docs/src/reference/change_log.md +++ b/docs/src/reference/change_log.md @@ -5,6 +5,8 @@ We also use this change log to document new features that maintain backward comp ## New features since last version update +- 16 March 2023: Add a build configuration option, `nextclade_dataset`, to allow users to change the Nextclade dataset used for alignment and quality control. For example, setting `nextclade_dataset: sars-cov-2-21L` will use the BA.2 (Nextstrain 21L) dataset that provides immune escape and ACE2 binding scores. [See the workflow configuration guide for more details](https://docs.nextstrain.org/projects/ncov/en/latest/reference/workflow-config-file.html#nextclade-dataset). [PR 1046](https://github.com/nextstrain/ncov/pull/1046) + - 30 January 2023: Include new clade 23A corresponding to Pango lineage XBB.1.5. 
See [PR 1043](https://github.com/nextstrain/ncov/pull/1043) for the rationale behind this clade update. - 9 December 2022: Add `immune escape` and `ace2_binding` from metadata as colorings for `nextstrain-open` and `nextstrain-gisaid` builds. [PR 1036](https://github.com/nextstrain/ncov/pull/1036) diff --git a/docs/src/reference/workflow-config-file.rst b/docs/src/reference/workflow-config-file.rst index 52a12e831..04878ddf5 100644 --- a/docs/src/reference/workflow-config-file.rst +++ b/docs/src/reference/workflow-config-file.rst @@ -374,6 +374,17 @@ Secondary configuration These parameters are other high-level parameters which may affect multiple Snakemake rules, or modify which rules are run. +nextclade_dataset +----------------- + +- type: string +- description: Name of a Nextclade dataset that appears in the output of ``nextclade dataset list``. The workflow will download the corresponding dataset by running ``nextclade dataset get --name {nextclade_dataset}`` where the value in the curly brackets is the value defined in the configuration file. The final alignment for each build will use the reference sequence and gene map from this dataset. +- default: ``sars-cov-2`` +- examples: + + - ``sars-cov-2-21L`` + - ``sars-cov-2-no-recomb`` + default_build_name ------------------