Skip to content

Commit

Permalink
fix: automatically calculate genomic sites covered for tumor mutation…
Browse files Browse the repository at this point in the history
…al burden calculation (#130)

* fix: automatically calculate breadth of coverage for tumor mutational burden calculation

* snakefmt

* also remove coding_genome_size from config.schema.yaml

* download ensembl annotation, extract coding exons and use to determined covered coding sites

* snakefmt

* also filter for transcript_biotype "protein_coding"
  • Loading branch information
dlaehnemann authored May 20, 2022
1 parent 7db7b35 commit 7a26dcc
Show file tree
Hide file tree
Showing 13 changed files with 85 additions and 40 deletions.
4 changes: 0 additions & 4 deletions .test/config-chm-eval/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,6 @@ primers:
# Estimation of tumor mutational burden.
mutational_burden:
activate: false
# Size of the sequenced coding genome for TMB estimation
# Attention: when doing panel sequencing, set this to the
# CAPTURED coding genome, not the entire one!
coding_genome_size: 3.0e7
events:
- SOMATIC_TUMOR_LOW
- SOMATIC_TUMOR_MEDIUM
Expand Down
4 changes: 0 additions & 4 deletions .test/config-no-candidate-filtering/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,6 @@ primers:
# Estimation of tumor mutational burden.
mutational_burden:
activate: false
# Size of the sequenced coding genome for TMB estimation
# Attention: when doing panel sequencing, set this to the
# CAPTURED coding genome, not the entire one!
coding_genome_size: 3.0e7
events:
- SOMATIC_TUMOR_LOW
- SOMATIC_TUMOR_MEDIUM
Expand Down
4 changes: 0 additions & 4 deletions .test/config-simple/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,6 @@ primers:
# Estimation of mutational burden.
mutational_burden:
activate: true
# Size of the sequenced coding genome for mutational burden estimation
# Attention: when doing panel sequencing, set this to the
# CAPTURED coding genome, not the entire one!
coding_genome_size: 3.0e7
events:
- present

Expand Down
4 changes: 0 additions & 4 deletions .test/config-sra/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,6 @@ primers:
# Estimation of tumor mutational burden.
mutational_burden:
activate: false
# Size of the sequenced coding genome for TMB estimation
# Attention: when doing panel sequencing, set this to the
# CAPTURED coding genome, not the entire one!
coding_genome_size: 3.0e7
events:
- SOMATIC_TUMOR_LOW
- SOMATIC_TUMOR_MEDIUM
Expand Down
4 changes: 0 additions & 4 deletions .test/config-target-regions/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,6 @@ primers:
# Estimation of mutational burden.
mutational_burden:
activate: true
# Size of the sequenced coding genome for mutational burden estimation
# Attention: when doing panel sequencing, set this to the
# CAPTURED coding genome, not the entire one!
coding_genome_size: 3.0e7
events:
- present

Expand Down
4 changes: 0 additions & 4 deletions .test/config_primers/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,6 @@ primers:
# Estimation of tumor mutational burden.
mutational_burden:
activate: false
# Size of the sequenced coding genome for TMB estimation
# Attention: when doing panel sequencing, set this to the
# CAPTURED coding genome, not the entire one!
coding_genome_size: 3.0e7
events:
- SOMATIC_TUMOR_LOW
- SOMATIC_TUMOR_MEDIUM
Expand Down
4 changes: 0 additions & 4 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,6 @@ primers:
# Estimation of tumor mutational burden.
mutational_burden:
activate: false
# Size of the sequenced coding genome for mutational burden estimation
# Attention: when doing panel sequencing, set this to the
# CAPTURED coding genome, not the entire one!
coding_genome_size: 3e7
# Plotting modes - hist (stratified histogram)
# or curve (stratified curve)
mode:
Expand Down
7 changes: 7 additions & 0 deletions workflow/envs/awk_bedtools.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
channels:
- conda-forge
- bioconda
dependencies:
- bedtools =2.30
- gawk =5.1
- samtools =1.15
1 change: 1 addition & 0 deletions workflow/rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -647,6 +647,7 @@ wildcard_constraints:
caller="|".join(["freebayes", "delly"]),
filter="|".join(config["calling"]["filter"]),
event="|".join(config["calling"]["fdr-control"]["events"].keys()),
regions_type="|".join(["target", "covered"]),


caller = list(
Expand Down
23 changes: 19 additions & 4 deletions workflow/rules/mutational_burden.smk
Original file line number Diff line number Diff line change
@@ -1,8 +1,24 @@
if config["mutational_burden"]["activate"]:

rule calculate_covered_coding_sites:
input:
covered="results/regions/{group}.covered_regions.filtered.bed",
coding="resources/coding_regions.bed.gz",
output:
"results/regions/{group}.covered_regions.filtered.coding.coverage_breadth.txt",
conda:
"../envs/awk_bedtools.yaml"
log:
"logs/regions/{group}.covered_regions.filtered.coding.coverage_breadth.log",
shell:
"( bedtools intersect -a {input.covered} -b {input.coding} | "
" awk '{{ covered += $3 - $2 }} END {{ print covered }}' >{output} "
") 2> {log} "

rule estimate_mutational_burden:
input:
"results/final-calls/{group}.annotated.bcf",
calls="results/final-calls/{group}.annotated.bcf",
coverage_breadth="results/regions/{group}.covered_regions.filtered.coding.coverage_breadth.txt",
output:
report(
"results/plots/mutational-burden/{group}.{sample}.{mode}.mutational-burden.svg",
Expand All @@ -14,15 +30,14 @@ if config["mutational_burden"]["activate"]:
log:
"logs/estimate-mutational-burden/{group}.{sample}.{mode}.log",
params:
coding_genome_size=config["mutational_burden"]["coding_genome_size"],
events=get_mutational_burden_events,
sample=get_sample_alias,
conda:
"../envs/varlociraptor.yaml"
shell:
"(varlociraptor estimate mutational-burden "
"--plot-mode {wildcards.mode} "
"--coding-genome-size {params.coding_genome_size} "
"--coding-genome-size $( cat {input.coverage_breadth} ) "
"--events {params.events} "
"--sample {params.sample} "
"< {input} | vl2svg > {output}) 2> {log}"
"< {input.calls} | vl2svg > {output}) 2> {log}"
38 changes: 38 additions & 0 deletions workflow/rules/ref.smk
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,44 @@ rule get_known_variants:
"v1.2.0/bio/reference/ensembl-variation"


rule get_annotation_gz:
output:
"resources/annotation.gtf.gz",
params:
species=config["ref"]["species"],
release=config["ref"]["release"],
build=config["ref"]["build"],
flavor="", # optional, e.g. chr_patch_hapl_scaff, see Ensembl FTP.
log:
"logs/get_annotation.log",
cache: True # save space and time with between workflow caching (see docs)
wrapper:
"v1.5.0/bio/reference/ensembl-annotation"


rule determine_coding_regions:
input:
"resources/annotation.gtf.gz",
output:
"resources/coding_regions.bed.gz",
log:
"logs/determine_coding_regions.log",
cache: True # save space and time with between workflow caching (see docs)
conda:
"../envs/awk.yaml"
shell:
# filter for `exon` entries, but unclear how to exclude pseudogene exons...
"""
( zcat {input} | \\
awk 'BEGIN {{ IFS = "\\t"}} {{ if ($3 == "exon") {{ print $0 }} }}' | \\
grep 'transcript_biotype "protein_coding"' | \\
grep 'gene_biotype "protein_coding"' | \\
awk 'BEGIN {{ IFS = "\\t"; OFS = "\\t"}} {{ print $1,$4-1,$5 }}' | \\
gzip > {output} \\
) 2> {log}
"""


rule remove_iupac_codes:
input:
"resources/variation.vcf.gz",
Expand Down
24 changes: 20 additions & 4 deletions workflow/rules/regions.smk
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ rule build_sample_regions:
"v1.2.0/bio/mosdepth"


rule merge_group_regions:
rule merge_target_group_regions:
input:
lambda wc: expand(
"results/regions/{{group}}/{sample}.quantized.bed.gz",
Expand All @@ -31,18 +31,34 @@ rule merge_group_regions:
"zcat {input} | sort -k1,1 -k2,2n - | mergeBed -i - -d 15000 > {output} 2> {log}"


rule merge_covered_group_regions:
input:
lambda wc: expand(
"results/regions/{{group}}/{sample}.quantized.bed.gz",
sample=get_group_samples(wc.group),
),
output:
"results/regions/{group}.covered_regions.bed",
log:
"logs/regions/{group}_covered_regions.log",
conda:
"../envs/bedtools.yaml"
shell:
"zcat {input} | sort -k1,1 -k2,2n - | mergeBed -i - > {output} 2> {log}"


rule filter_group_regions:
input:
regions="results/regions/{group}.target_regions.bed",
regions="results/regions/{group}.{regions_type}_regions.bed",
predefined=config["targets_bed"] if "targets_bed" in config else [],
fai="resources/genome.fasta.fai",
output:
"results/regions/{group}.target_regions.filtered.bed",
"results/regions/{group}.{regions_type}_regions.filtered.bed",
params:
chroms=config["ref"]["n_chromosomes"],
filter_targets=get_filter_targets,
log:
"logs/regions/{group}.target_regions.filtered.log",
"logs/regions/{group}.{regions_type}_regions.filtered.log",
shell:
"cat {input.regions} {input.predefined} | grep -f <(head -n {params.chroms} {input.fai} | "
'awk \'{{print "^"$1"\\t"}}\') {params.filter_targets} '
Expand Down
4 changes: 0 additions & 4 deletions workflow/schemas/config.schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -149,12 +149,8 @@ properties:
properties:
activate:
type: boolean
coding_genome_size:
# TODO allow integer here!
type: string
required:
- activate
- coding_genome_size


calling:
Expand Down

0 comments on commit 7a26dcc

Please sign in to comment.