From 7a26dcc2b0324d5cac46d2d8521e67ead0f2211d Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Fri, 20 May 2022 16:45:07 +0200 Subject: [PATCH] fix: automatically calculate genomic sites covered for tumor mutational burden calculation (#130) * fix: automatically calculate breadth of coverage for tumor mutational burden calculation * snakefmt * also remove coding_genome_size from config.schema.yaml * download ensembl annotation, extract coding exons and use to determined covered coding sites * snakefmt * also filter for transcript_biotype "protein_coding" --- .test/config-chm-eval/config.yaml | 4 -- .../config-no-candidate-filtering/config.yaml | 4 -- .test/config-simple/config.yaml | 4 -- .test/config-sra/config.yaml | 4 -- .test/config-target-regions/config.yaml | 4 -- .test/config_primers/config.yaml | 4 -- config/config.yaml | 4 -- workflow/envs/awk_bedtools.yaml | 7 ++++ workflow/rules/common.smk | 1 + workflow/rules/mutational_burden.smk | 23 +++++++++-- workflow/rules/ref.smk | 38 +++++++++++++++++++ workflow/rules/regions.smk | 24 ++++++++++-- workflow/schemas/config.schema.yaml | 4 -- 13 files changed, 85 insertions(+), 40 deletions(-) create mode 100644 workflow/envs/awk_bedtools.yaml diff --git a/.test/config-chm-eval/config.yaml b/.test/config-chm-eval/config.yaml index a5f626720..b3347c155 100644 --- a/.test/config-chm-eval/config.yaml +++ b/.test/config-chm-eval/config.yaml @@ -23,10 +23,6 @@ primers: # Estimation of tumor mutational burden. mutational_burden: activate: false - # Size of the sequenced coding genome for TMB estimation - # Attention: when doing panel sequencing, set this to the - # CAPTURED coding genome, not the entire one! - coding_genome_size: 3.0e7 events: - SOMATIC_TUMOR_LOW - SOMATIC_TUMOR_MEDIUM diff --git a/.test/config-no-candidate-filtering/config.yaml b/.test/config-no-candidate-filtering/config.yaml index 5028e5087..9dbc7548d 100644 --- a/.test/config-no-candidate-filtering/config.yaml +++ b/.test/config-no-candidate-filtering/config.yaml @@ -24,10 +24,6 @@ primers: # Estimation of tumor mutational burden. mutational_burden: activate: false - # Size of the sequenced coding genome for TMB estimation - # Attention: when doing panel sequencing, set this to the - # CAPTURED coding genome, not the entire one! - coding_genome_size: 3.0e7 events: - SOMATIC_TUMOR_LOW - SOMATIC_TUMOR_MEDIUM diff --git a/.test/config-simple/config.yaml b/.test/config-simple/config.yaml index 08ddec3be..fd47f9f12 100644 --- a/.test/config-simple/config.yaml +++ b/.test/config-simple/config.yaml @@ -25,10 +25,6 @@ primers: # Estimation of mutational burden. mutational_burden: activate: true - # Size of the sequenced coding genome for mutational burden estimation - # Attention: when doing panel sequencing, set this to the - # CAPTURED coding genome, not the entire one! - coding_genome_size: 3.0e7 events: - present diff --git a/.test/config-sra/config.yaml b/.test/config-sra/config.yaml index a6e801722..d2b5190c3 100644 --- a/.test/config-sra/config.yaml +++ b/.test/config-sra/config.yaml @@ -23,10 +23,6 @@ primers: # Estimation of tumor mutational burden. mutational_burden: activate: false - # Size of the sequenced coding genome for TMB estimation - # Attention: when doing panel sequencing, set this to the - # CAPTURED coding genome, not the entire one! - coding_genome_size: 3.0e7 events: - SOMATIC_TUMOR_LOW - SOMATIC_TUMOR_MEDIUM diff --git a/.test/config-target-regions/config.yaml b/.test/config-target-regions/config.yaml index 6b1d61ed4..498a0527e 100644 --- a/.test/config-target-regions/config.yaml +++ b/.test/config-target-regions/config.yaml @@ -26,10 +26,6 @@ primers: # Estimation of mutational burden. mutational_burden: activate: true - # Size of the sequenced coding genome for mutational burden estimation - # Attention: when doing panel sequencing, set this to the - # CAPTURED coding genome, not the entire one! - coding_genome_size: 3.0e7 events: - present diff --git a/.test/config_primers/config.yaml b/.test/config_primers/config.yaml index c2d4c5528..dbcc04575 100644 --- a/.test/config_primers/config.yaml +++ b/.test/config_primers/config.yaml @@ -25,10 +25,6 @@ primers: # Estimation of tumor mutational burden. mutational_burden: activate: false - # Size of the sequenced coding genome for TMB estimation - # Attention: when doing panel sequencing, set this to the - # CAPTURED coding genome, not the entire one! - coding_genome_size: 3.0e7 events: - SOMATIC_TUMOR_LOW - SOMATIC_TUMOR_MEDIUM diff --git a/config/config.yaml b/config/config.yaml index 126c46001..3c45fc7d0 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -37,10 +37,6 @@ primers: # Estimation of tumor mutational burden. mutational_burden: activate: false - # Size of the sequenced coding genome for mutational burden estimation - # Attention: when doing panel sequencing, set this to the - # CAPTURED coding genome, not the entire one! - coding_genome_size: 3e7 # Plotting modes - hist (stratified histogram) # or curve (stratified curve) mode: diff --git a/workflow/envs/awk_bedtools.yaml b/workflow/envs/awk_bedtools.yaml new file mode 100644 index 000000000..7a026df35 --- /dev/null +++ b/workflow/envs/awk_bedtools.yaml @@ -0,0 +1,7 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bedtools =2.30 + - gawk =5.1 + - samtools =1.15 diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 3654f97ea..7981dceda 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -647,6 +647,7 @@ wildcard_constraints: caller="|".join(["freebayes", "delly"]), filter="|".join(config["calling"]["filter"]), event="|".join(config["calling"]["fdr-control"]["events"].keys()), + regions_type="|".join(["target", "covered"]), caller = list( diff --git a/workflow/rules/mutational_burden.smk b/workflow/rules/mutational_burden.smk index 0265c6bad..afe0695e1 100644 --- a/workflow/rules/mutational_burden.smk +++ b/workflow/rules/mutational_burden.smk @@ -1,8 +1,24 @@ if config["mutational_burden"]["activate"]: + rule calculate_covered_coding_sites: + input: + covered="results/regions/{group}.covered_regions.filtered.bed", + coding="resources/coding_regions.bed.gz", + output: + "results/regions/{group}.covered_regions.filtered.coding.coverage_breadth.txt", + conda: + "../envs/awk_bedtools.yaml" + log: + "logs/regions/{group}.covered_regions.filtered.coding.coverage_breadth.log", + shell: + "( bedtools intersect -a {input.covered} -b {input.coding} | " + " awk '{{ covered += $3 - $2 }} END {{ print covered }}' >{output} " + ") 2> {log} " + rule estimate_mutational_burden: input: - "results/final-calls/{group}.annotated.bcf", + calls="results/final-calls/{group}.annotated.bcf", + coverage_breadth="results/regions/{group}.covered_regions.filtered.coding.coverage_breadth.txt", output: report( "results/plots/mutational-burden/{group}.{sample}.{mode}.mutational-burden.svg", @@ -14,7 +30,6 @@ if config["mutational_burden"]["activate"]: log: "logs/estimate-mutational-burden/{group}.{sample}.{mode}.log", params: - coding_genome_size=config["mutational_burden"]["coding_genome_size"], events=get_mutational_burden_events, sample=get_sample_alias, conda: @@ -22,7 +37,7 @@ if config["mutational_burden"]["activate"]: shell: "(varlociraptor estimate mutational-burden " "--plot-mode {wildcards.mode} " - "--coding-genome-size {params.coding_genome_size} " + "--coding-genome-size $( cat {input.coverage_breadth} ) " "--events {params.events} " "--sample {params.sample} " - "< {input} | vl2svg > {output}) 2> {log}" + "< {input.calls} | vl2svg > {output}) 2> {log}" diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk index 8d470f09f..f59b16dc2 100644 --- a/workflow/rules/ref.smk +++ b/workflow/rules/ref.smk @@ -59,6 +59,44 @@ rule get_known_variants: "v1.2.0/bio/reference/ensembl-variation" +rule get_annotation_gz: + output: + "resources/annotation.gtf.gz", + params: + species=config["ref"]["species"], + release=config["ref"]["release"], + build=config["ref"]["build"], + flavor="", # optional, e.g. chr_patch_hapl_scaff, see Ensembl FTP. + log: + "logs/get_annotation.log", + cache: True # save space and time with between workflow caching (see docs) + wrapper: + "v1.5.0/bio/reference/ensembl-annotation" + + +rule determine_coding_regions: + input: + "resources/annotation.gtf.gz", + output: + "resources/coding_regions.bed.gz", + log: + "logs/determine_coding_regions.log", + cache: True # save space and time with between workflow caching (see docs) + conda: + "../envs/awk.yaml" + shell: + # filter for `exon` entries, but unclear how to exclude pseudogene exons... + """ + ( zcat {input} | \\ + awk 'BEGIN {{ IFS = "\\t"}} {{ if ($3 == "exon") {{ print $0 }} }}' | \\ + grep 'transcript_biotype "protein_coding"' | \\ + grep 'gene_biotype "protein_coding"' | \\ + awk 'BEGIN {{ IFS = "\\t"; OFS = "\\t"}} {{ print $1,$4-1,$5 }}' | \\ + gzip > {output} \\ + ) 2> {log} + """ + + rule remove_iupac_codes: input: "resources/variation.vcf.gz", diff --git a/workflow/rules/regions.smk b/workflow/rules/regions.smk index 1c730bd58..7ea0f2249 100644 --- a/workflow/rules/regions.smk +++ b/workflow/rules/regions.smk @@ -15,7 +15,7 @@ rule build_sample_regions: "v1.2.0/bio/mosdepth" -rule merge_group_regions: +rule merge_target_group_regions: input: lambda wc: expand( "results/regions/{{group}}/{sample}.quantized.bed.gz", @@ -31,18 +31,34 @@ rule merge_group_regions: "zcat {input} | sort -k1,1 -k2,2n - | mergeBed -i - -d 15000 > {output} 2> {log}" +rule merge_covered_group_regions: + input: + lambda wc: expand( + "results/regions/{{group}}/{sample}.quantized.bed.gz", + sample=get_group_samples(wc.group), + ), + output: + "results/regions/{group}.covered_regions.bed", + log: + "logs/regions/{group}_covered_regions.log", + conda: + "../envs/bedtools.yaml" + shell: + "zcat {input} | sort -k1,1 -k2,2n - | mergeBed -i - > {output} 2> {log}" + + rule filter_group_regions: input: - regions="results/regions/{group}.target_regions.bed", + regions="results/regions/{group}.{regions_type}_regions.bed", predefined=config["targets_bed"] if "targets_bed" in config else [], fai="resources/genome.fasta.fai", output: - "results/regions/{group}.target_regions.filtered.bed", + "results/regions/{group}.{regions_type}_regions.filtered.bed", params: chroms=config["ref"]["n_chromosomes"], filter_targets=get_filter_targets, log: - "logs/regions/{group}.target_regions.filtered.log", + "logs/regions/{group}.{regions_type}_regions.filtered.log", shell: "cat {input.regions} {input.predefined} | grep -f <(head -n {params.chroms} {input.fai} | " 'awk \'{{print "^"$1"\\t"}}\') {params.filter_targets} ' diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml index 144de6d9e..e19e372cb 100644 --- a/workflow/schemas/config.schema.yaml +++ b/workflow/schemas/config.schema.yaml @@ -149,12 +149,8 @@ properties: properties: activate: type: boolean - coding_genome_size: - # TODO allow integer here! - type: string required: - activate - - coding_genome_size calling: