Merge pull request #1046 from nextstrain/config-nextclade-dataset

Parameterize Nextclade dataset in config
nextstrain · Mar 16, 2023 · cfa73be · cfa73be
2 parents 17cb597 + f80fc0c
commit cfa73be
Show file tree

Hide file tree

Showing 6 changed files with 31 additions and 4 deletions.
diff --git a/defaults/auspice_config.json b/defaults/auspice_config.json
@@ -10,6 +10,16 @@
       "title": "Emerging Lineage",
       "type": "categorical"
     },
+    {
+        "key": "immune_escape",
+        "title": "Immune Escape vs BA.2",
+        "type": "continuous"
+    },
+    {
+        "key": "ace2_binding",
+        "title": "ACE2 binding vs BA.2",
+        "type": "continuous"
+    },
     {
       "key": "pango_lineage",
       "title": "PANGO Lineage",

diff --git a/defaults/parameters.yaml b/defaults/parameters.yaml
@@ -47,6 +47,7 @@ sanitize_metadata:
     - "GC-Content=gc_content"
 
 reference_node_name: "USA/WA1/2020"
+nextclade_dataset: sars-cov-2
 
 # Define files used for external configuration. Common examples consist of a
 # list of strains to include and exclude from analyses, a reference sequence to

diff --git a/docs/src/reference/change_log.md b/docs/src/reference/change_log.md
@@ -5,7 +5,9 @@ We also use this change log to document new features that maintain backward comp
 
 ## New features since last version update
 
-- 30 January 2022: Include new clade 23A correspoding to Pango lineage XBB.1.5. See [PR 1043](https://github.com/nextstrain/ncov/pull/1043) for the rationale behind this clade update.
+- 16 March 2023: Add a build configuration option, `nextclade_dataset`, to allow users to change the Nextclade dataset used for alignment and quality control. For example, setting `nextclade_dataset: sars-cov-2-21L` will use the BA.2 (Nextstrain 21L) dataset that provides immune escape and ACE2 binding scores. [See the workflow configuration guide for more details](https://docs.nextstrain.org/projects/ncov/en/latest/reference/workflow-config-file.html#nextclade-dataset). [PR 1046](https://github.com/nextstrain/ncov/pull/1046)
+
+- 30 January 2023: Include new clade 23A corresponding to Pango lineage XBB.1.5. See [PR 1043](https://github.com/nextstrain/ncov/pull/1043) for the rationale behind this clade update.
 
 - 9 December 2022: Add `immune_escape` and `ace2_binding` from metadata as colorings for `nextstrain-open` and `nextstrain-gisaid` builds. [PR 1036](https://github.com/nextstrain/ncov/pull/1036)
 

diff --git a/docs/src/reference/workflow-config-file.rst b/docs/src/reference/workflow-config-file.rst
@@ -374,6 +374,17 @@ Secondary configuration
 
 These parameters are other high-level parameters which may affect multiple Snakemake rules, or modify which rules are run.
 
+nextclade_dataset
+-----------------
+
+- type: string
+- description: Name of a Nextclade dataset that appears in the output of ``nextclade dataset list``. The workflow will download the corresponding dataset by running ``nextclade dataset get --name {nextclade_dataset}`` where the value in the curly brackets is the value defined in the configuration file. The final alignment for each build will use the reference sequence and gene map from this dataset.
+- default: ``sars-cov-2``
+- examples:
+
+  - ``sars-cov-2-21L``
+  - ``sars-cov-2-no-recomb``
+
 default_build_name
 ------------------
 

diff --git a/scripts/join-metadata-and-clades.py b/scripts/join-metadata-and-clades.py
@@ -37,7 +37,9 @@
     "deletions": "deletions",
     "insertions": "insertions",
     "substitutions": "substitutions",
-    "aaSubstitutions": "aaSubstitutions"
+    "aaSubstitutions": "aaSubstitutions",
+    "immune_escape": "immune_escape",
+    "ace2_binding": "ace2_binding",
 }
 
 preferred_types = {
@@ -137,7 +139,8 @@ def main():
     result.loc[np.isnan(div_array)|np.isnan(t), "clock_deviation"] = np.nan
 
     for col in list(column_map.values()) + ["clock_deviation"]:
-        result[col] = result[col].fillna(VALUE_MISSING_DATA)
+        if col in result:
+            result[col] = result[col].fillna(VALUE_MISSING_DATA)
 
     # Move the new column so that it's next to other clade columns
     if INSERT_BEFORE_THIS_COLUMN in result.columns:

diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk
@@ -458,7 +458,7 @@ rule prepare_nextclade:
     output:
         nextclade_dataset = "data/sars-cov-2-nextclade-defaults.zip",
     params:
-        name = "sars-cov-2",
+        name = config["nextclade_dataset"],
     conda: config["conda_environment"]
     shell:
         """