# config.yaml

# Configuration file for `SARS2-spike-predictor-phenos.ipynb` notebook.

# Mutation effects on phenotypes.
# Top-level key is the viral clade that the mutation effects are computed relative to.
# The next key is the phenotype name.
# Under each phenotype name, `csv` gives the CSV file with the data and `columns`
# lists the columns in that file that hold the phenotypes.
# The CSV must have columns "site", "wildtype", and "mutant".
mutation_phenotype_csvs:
  # Nesting is: clade -> phenotype name -> {csv, columns}, using a uniform
  # 2-space indent at every level.
  XBB.1.5:
    spike pseudovirus DMS:
      # https://www.biorxiv.org/content/10.1101/2023.11.13.566961v1
      csv: data/spike_pseudovirus_DMS_XBB.1.5.csv
      # Columns of the CSV above that hold the phenotypes.
      columns:
        - human sera escape
        - ACE2 binding
        - spike mediated entry
    RBD yeast-display DMS:
      # https://journals.plos.org/plospathogens/article?id=10.1371/journal.ppat.1011901
      # https://jbloomlab.github.io/SARS2-RBD-escape-calc/
      csv: data/yeast_RBD_DMS_XBB.1.5.csv
      # Columns of the CSV above that hold the phenotypes.
      columns:
        - ACE2 affinity
        - RBD expression
        - escape
    EVEscape:
      # https://www.nature.com/articles/s41586-023-06617-0
      # https://api.evescape.org/download_variant_data?curr-virus=COVID19&curr-variant-or-id=Omicron(XBB)
      csv: data/EVEscape_XBB_single_mutation_predictions.csv
      # Columns of the CSV above that hold the phenotypes.
      columns:
        - EVEscape

# Pango clade definitions from https://github.com/corneliusroemer/pango-sequences
# Indicate a specific commit or just the latest version from main. Below is the latest
# version; to pin a specific commit, point to the same file on GitHub at that commit, as in:
# https://raw.githubusercontent.com/corneliusroemer/pango-sequences/3201a78cc273ebb7205d853739ad9ff41c032cd5/data/pango-consensus-sequences_summary.json
pango_json: https://raw.githubusercontent.com/corneliusroemer/pango-sequences/main/data/pango-consensus-sequences_summary.json

# Pango clade growth estimates from https://github.com/nextstrain/forecasts-ncov/
# Indicate a specific date or just the latest version. Below specifies the latest version.
# To pin a specific date, replace `latest` in the file name with that date (YYYY-MM-DD), as in:
# https://data.nextstrain.org/files/workflows/forecasts-ncov/gisaid/pango_lineages/global/mlr/2024-02-25_results.json
pango_growth_json: https://data.nextstrain.org/files/workflows/forecasts-ncov/gisaid/pango_lineages/global/mlr/latest_results.json

# Indicate whether each clade is a descendant of each of these parent clades, according to
# the definitions in `pango_json`.
classify_descendants_of:
  - BA.2.86
  - XBB
  - BA.2

# Number of randomizations of each phenotype. Each randomization is performed by
# shuffling the mutation phenotype values among mutations. The randomized values are
# useful for evaluating the "significance" of correlations.
# NOTE(review): presumably these feed the `*_randomized_csv` output files — confirm.
n_randomizations: 100

# Output files.
# NOTE(review): paths look relative to the directory the notebook is run from — confirm.
# Per-mutation phenotypes, plus (presumably) the shuffled versions from `n_randomizations`.
mutation_phenotypes_csv: results/mutation_phenotypes.csv
mutation_phenotypes_randomized_csv: results/mutation_phenotypes_randomized.csv
# Per-clade phenotypes, plus (presumably) the versions computed from the shuffled values.
clade_phenotypes_csv: results/clade_phenotypes.csv
clade_phenotypes_randomized_csv: results/clade_phenotypes_randomized.csv
# Chart of clade phenotypes, written as an HTML page under `docs/`.
clade_phenotype_chart: docs/index.html