From 7a26dcc2b0324d5cac46d2d8521e67ead0f2211d Mon Sep 17 00:00:00 2001
From: David Laehnemann <david.laehnemann@hhu.de>
Date: Fri, 20 May 2022 16:45:07 +0200
Subject: [PATCH] fix: automatically calculate genomic sites covered for tumor
 mutational burden calculation (#130)

* fix: automatically calculate breadth of coverage for tumor mutational burden calculation

* snakefmt

* also remove coding_genome_size from config.schema.yaml

* download ensembl annotation, extract coding exons and use to determined covered coding sites

* snakefmt

* also filter for transcript_biotype "protein_coding"
---
 .test/config-chm-eval/config.yaml             |  4 --
 .../config-no-candidate-filtering/config.yaml |  4 --
 .test/config-simple/config.yaml               |  4 --
 .test/config-sra/config.yaml                  |  4 --
 .test/config-target-regions/config.yaml       |  4 --
 .test/config_primers/config.yaml              |  4 --
 config/config.yaml                            |  4 --
 workflow/envs/awk_bedtools.yaml               |  7 ++++
 workflow/rules/common.smk                     |  1 +
 workflow/rules/mutational_burden.smk          | 23 +++++++++--
 workflow/rules/ref.smk                        | 38 +++++++++++++++++++
 workflow/rules/regions.smk                    | 24 ++++++++++--
 workflow/schemas/config.schema.yaml           |  4 --
 13 files changed, 85 insertions(+), 40 deletions(-)
 create mode 100644 workflow/envs/awk_bedtools.yaml

diff --git a/.test/config-chm-eval/config.yaml b/.test/config-chm-eval/config.yaml
index a5f626720..b3347c155 100644
--- a/.test/config-chm-eval/config.yaml
+++ b/.test/config-chm-eval/config.yaml
@@ -23,10 +23,6 @@ primers:
 # Estimation of tumor mutational burden.
 mutational_burden:
   activate: false
-  # Size of the sequenced coding genome for TMB estimation
-  # Attention: when doing panel sequencing, set this to the
-  # CAPTURED coding genome, not the entire one!
-  coding_genome_size: 3.0e7
   events:
     - SOMATIC_TUMOR_LOW
     - SOMATIC_TUMOR_MEDIUM
diff --git a/.test/config-no-candidate-filtering/config.yaml b/.test/config-no-candidate-filtering/config.yaml
index 5028e5087..9dbc7548d 100644
--- a/.test/config-no-candidate-filtering/config.yaml
+++ b/.test/config-no-candidate-filtering/config.yaml
@@ -24,10 +24,6 @@ primers:
 # Estimation of tumor mutational burden.
 mutational_burden:
   activate: false
-  # Size of the sequenced coding genome for TMB estimation
-  # Attention: when doing panel sequencing, set this to the
-  # CAPTURED coding genome, not the entire one!
-  coding_genome_size: 3.0e7
   events:
     - SOMATIC_TUMOR_LOW
     - SOMATIC_TUMOR_MEDIUM
diff --git a/.test/config-simple/config.yaml b/.test/config-simple/config.yaml
index 08ddec3be..fd47f9f12 100644
--- a/.test/config-simple/config.yaml
+++ b/.test/config-simple/config.yaml
@@ -25,10 +25,6 @@ primers:
 # Estimation of mutational burden.
 mutational_burden:
   activate: true
-  # Size of the sequenced coding genome for mutational burden estimation
-  # Attention: when doing panel sequencing, set this to the
-  # CAPTURED coding genome, not the entire one!
-  coding_genome_size: 3.0e7
   events:
     - present
 
diff --git a/.test/config-sra/config.yaml b/.test/config-sra/config.yaml
index a6e801722..d2b5190c3 100644
--- a/.test/config-sra/config.yaml
+++ b/.test/config-sra/config.yaml
@@ -23,10 +23,6 @@ primers:
 # Estimation of tumor mutational burden.
 mutational_burden:
   activate: false
-  # Size of the sequenced coding genome for TMB estimation
-  # Attention: when doing panel sequencing, set this to the
-  # CAPTURED coding genome, not the entire one!
-  coding_genome_size: 3.0e7
   events:
     - SOMATIC_TUMOR_LOW
     - SOMATIC_TUMOR_MEDIUM
diff --git a/.test/config-target-regions/config.yaml b/.test/config-target-regions/config.yaml
index 6b1d61ed4..498a0527e 100644
--- a/.test/config-target-regions/config.yaml
+++ b/.test/config-target-regions/config.yaml
@@ -26,10 +26,6 @@ primers:
 # Estimation of mutational burden.
 mutational_burden:
   activate: true
-  # Size of the sequenced coding genome for mutational burden estimation
-  # Attention: when doing panel sequencing, set this to the
-  # CAPTURED coding genome, not the entire one!
-  coding_genome_size: 3.0e7
   events:
     - present
 
diff --git a/.test/config_primers/config.yaml b/.test/config_primers/config.yaml
index c2d4c5528..dbcc04575 100644
--- a/.test/config_primers/config.yaml
+++ b/.test/config_primers/config.yaml
@@ -25,10 +25,6 @@ primers:
 # Estimation of tumor mutational burden.
 mutational_burden:
   activate: false
-  # Size of the sequenced coding genome for TMB estimation
-  # Attention: when doing panel sequencing, set this to the
-  # CAPTURED coding genome, not the entire one!
-  coding_genome_size: 3.0e7
   events:
     - SOMATIC_TUMOR_LOW
     - SOMATIC_TUMOR_MEDIUM
diff --git a/config/config.yaml b/config/config.yaml
index 126c46001..3c45fc7d0 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -37,10 +37,6 @@ primers:
 # Estimation of tumor mutational burden.
 mutational_burden:
   activate: false
-  # Size of the sequenced coding genome for mutational burden estimation
-  # Attention: when doing panel sequencing, set this to the
-  # CAPTURED coding genome, not the entire one!
-  coding_genome_size: 3e7
   # Plotting modes - hist (stratified histogram)
   # or curve (stratified curve)
   mode:
diff --git a/workflow/envs/awk_bedtools.yaml b/workflow/envs/awk_bedtools.yaml
new file mode 100644
index 000000000..7a026df35
--- /dev/null
+++ b/workflow/envs/awk_bedtools.yaml
@@ -0,0 +1,7 @@
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - bedtools =2.30
+  - gawk =5.1
+  - samtools =1.15
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 3654f97ea..7981dceda 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -647,6 +647,7 @@ wildcard_constraints:
     caller="|".join(["freebayes", "delly"]),
     filter="|".join(config["calling"]["filter"]),
     event="|".join(config["calling"]["fdr-control"]["events"].keys()),
+    regions_type="|".join(["target", "covered"]),
 
 
 caller = list(
diff --git a/workflow/rules/mutational_burden.smk b/workflow/rules/mutational_burden.smk
index 0265c6bad..afe0695e1 100644
--- a/workflow/rules/mutational_burden.smk
+++ b/workflow/rules/mutational_burden.smk
@@ -1,8 +1,24 @@
 if config["mutational_burden"]["activate"]:
 
+    rule calculate_covered_coding_sites:
+        input:
+            covered="results/regions/{group}.covered_regions.filtered.bed",
+            coding="resources/coding_regions.bed.gz",
+        output:
+            "results/regions/{group}.covered_regions.filtered.coding.coverage_breadth.txt",
+        conda:
+            "../envs/awk_bedtools.yaml"
+        log:
+            "logs/regions/{group}.covered_regions.filtered.coding.coverage_breadth.log",
+        shell:
+            "( bedtools intersect -a {input.covered} -b {input.coding} | "
+            "  awk '{{ covered += $3 - $2 }} END {{ print covered }}' >{output} "
+            ") 2> {log} "
+
     rule estimate_mutational_burden:
         input:
-            "results/final-calls/{group}.annotated.bcf",
+            calls="results/final-calls/{group}.annotated.bcf",
+            coverage_breadth="results/regions/{group}.covered_regions.filtered.coding.coverage_breadth.txt",
         output:
             report(
                 "results/plots/mutational-burden/{group}.{sample}.{mode}.mutational-burden.svg",
@@ -14,7 +30,6 @@ if config["mutational_burden"]["activate"]:
         log:
             "logs/estimate-mutational-burden/{group}.{sample}.{mode}.log",
         params:
-            coding_genome_size=config["mutational_burden"]["coding_genome_size"],
             events=get_mutational_burden_events,
             sample=get_sample_alias,
         conda:
@@ -22,7 +37,7 @@ if config["mutational_burden"]["activate"]:
         shell:
             "(varlociraptor estimate mutational-burden "
             "--plot-mode {wildcards.mode} "
-            "--coding-genome-size {params.coding_genome_size} "
+            "--coding-genome-size $( cat {input.coverage_breadth} ) "
             "--events {params.events} "
             "--sample {params.sample} "
-            "< {input} | vl2svg > {output}) 2> {log}"
+            "< {input.calls} | vl2svg > {output}) 2> {log}"
diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk
index 8d470f09f..f59b16dc2 100644
--- a/workflow/rules/ref.smk
+++ b/workflow/rules/ref.smk
@@ -59,6 +59,44 @@ rule get_known_variants:
         "v1.2.0/bio/reference/ensembl-variation"
 
 
+rule get_annotation_gz:
+    output:
+        "resources/annotation.gtf.gz",
+    params:
+        species=config["ref"]["species"],
+        release=config["ref"]["release"],
+        build=config["ref"]["build"],
+        flavor="",  # optional, e.g. chr_patch_hapl_scaff, see Ensembl FTP.
+    log:
+        "logs/get_annotation.log",
+    cache: True  # save space and time with between workflow caching (see docs)
+    wrapper:
+        "v1.5.0/bio/reference/ensembl-annotation"
+
+
+rule determine_coding_regions:
+    input:
+        "resources/annotation.gtf.gz",
+    output:
+        "resources/coding_regions.bed.gz",
+    log:
+        "logs/determine_coding_regions.log",
+    cache: True  # save space and time with between workflow caching (see docs)
+    conda:
+        "../envs/awk.yaml"
+    shell:
+        # filter for `exon` entries, but unclear how to exclude pseudogene exons...
+        """
+        ( zcat {input} | \\
+          awk 'BEGIN {{ IFS = "\\t"}} {{ if ($3 == "exon") {{ print $0 }} }}' | \\
+          grep 'transcript_biotype "protein_coding"' | \\
+          grep 'gene_biotype "protein_coding"' | \\
+          awk 'BEGIN {{ IFS = "\\t"; OFS = "\\t"}}  {{ print $1,$4-1,$5 }}' | \\
+          gzip > {output} \\
+        ) 2> {log}
+        """
+
+
 rule remove_iupac_codes:
     input:
         "resources/variation.vcf.gz",
diff --git a/workflow/rules/regions.smk b/workflow/rules/regions.smk
index 1c730bd58..7ea0f2249 100644
--- a/workflow/rules/regions.smk
+++ b/workflow/rules/regions.smk
@@ -15,7 +15,7 @@ rule build_sample_regions:
         "v1.2.0/bio/mosdepth"
 
 
-rule merge_group_regions:
+rule merge_target_group_regions:
     input:
         lambda wc: expand(
             "results/regions/{{group}}/{sample}.quantized.bed.gz",
@@ -31,18 +31,34 @@ rule merge_group_regions:
         "zcat {input} | sort -k1,1 -k2,2n - | mergeBed -i - -d 15000 > {output} 2> {log}"
 
 
+rule merge_covered_group_regions:
+    input:
+        lambda wc: expand(
+            "results/regions/{{group}}/{sample}.quantized.bed.gz",
+            sample=get_group_samples(wc.group),
+        ),
+    output:
+        "results/regions/{group}.covered_regions.bed",
+    log:
+        "logs/regions/{group}_covered_regions.log",
+    conda:
+        "../envs/bedtools.yaml"
+    shell:
+        "zcat {input} | sort -k1,1 -k2,2n - | mergeBed -i - > {output} 2> {log}"
+
+
 rule filter_group_regions:
     input:
-        regions="results/regions/{group}.target_regions.bed",
+        regions="results/regions/{group}.{regions_type}_regions.bed",
         predefined=config["targets_bed"] if "targets_bed" in config else [],
         fai="resources/genome.fasta.fai",
     output:
-        "results/regions/{group}.target_regions.filtered.bed",
+        "results/regions/{group}.{regions_type}_regions.filtered.bed",
     params:
         chroms=config["ref"]["n_chromosomes"],
         filter_targets=get_filter_targets,
     log:
-        "logs/regions/{group}.target_regions.filtered.log",
+        "logs/regions/{group}.{regions_type}_regions.filtered.log",
     shell:
         "cat {input.regions} {input.predefined} | grep -f <(head -n {params.chroms} {input.fai} | "
         'awk \'{{print "^"$1"\\t"}}\') {params.filter_targets} '
diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml
index 144de6d9e..e19e372cb 100644
--- a/workflow/schemas/config.schema.yaml
+++ b/workflow/schemas/config.schema.yaml
@@ -149,12 +149,8 @@ properties:
     properties:
       activate:
         type: boolean
-      coding_genome_size:
-        # TODO allow integer here!
-        type: string
     required:
       - activate
-      - coding_genome_size
 
 
   calling: