Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: gene coverage report #270

Merged
merged 13 commits into from
Nov 10, 2023
3 changes: 3 additions & 0 deletions .test/config-chm-eval/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,9 @@ tables:
event_prob: true
generate_excel: true

gene_coverage:
min_avg_coverage: 5

report:
activate: false
max_read_depth: 250
Expand Down
3 changes: 3 additions & 0 deletions .test/config-giab/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -146,3 +146,6 @@ params:
preprocess: "--max-depth 300"
freebayes:
min_alternate_fraction: 0.05 # Reduce for calling variants with lower VAFs

gene_coverage:
min_avg_coverage: 5
3 changes: 3 additions & 0 deletions .test/config-no-candidate-filtering/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,9 @@ params:
freebayes:
min_alternate_fraction: 0.05 # Reduce for calling variants with lower VAFs

gene_coverage:
min_avg_coverage: 5

report:
activate: true
max_read_depth: 250
Expand Down
3 changes: 3 additions & 0 deletions .test/config-simple/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,9 @@ params:
freebayes:
min_alternate_fraction: 0.05 # Reduce for calling variants with lower VAFs

gene_coverage:
min_avg_coverage: 0

report:
activate: true
max_read_depth: 250
Expand Down
3 changes: 3 additions & 0 deletions .test/config-sra/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,9 @@ params:
freebayes:
min_alternate_fraction: 0.05 # Reduce for calling variants with lower VAFs

gene_coverage:
min_avg_coverage: 5

report:
activate: false
max_read_depth: 250
Expand Down
3 changes: 3 additions & 0 deletions .test/config-target-regions/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,9 @@ params:
freebayes:
min_alternate_fraction: 0.05 # Reduce for calling variants with lower VAFs

gene_coverage:
min_avg_coverage: 5

report:
activate: true
max_read_depth: 250
Expand Down
3 changes: 3 additions & 0 deletions .test/config-target-regions/config_multiple_beds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,9 @@ params:
freebayes:
min_alternate_fraction: 0.05 # Reduce for calling variants with lower VAFs

gene_coverage:
min_avg_coverage: 5

report:
activate: true
max_read_depth: 250
Expand Down
3 changes: 3 additions & 0 deletions .test/config_primers/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,9 @@ params:
freebayes:
min_alternate_fraction: 0.05 # Reduce for calling variants with lower VAFs

gene_coverage:
min_avg_coverage: 5

report:
activate: true
max_read_depth: 250
Expand Down
6 changes: 6 additions & 0 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,12 @@ mutational_burden:
- somatic_tumor_medium
- somatic_tumor_high

# Sets the minimum average coverage for each gene.
# Genes with lower average coverage will not be concidered in gene coverage datavzrd report
# If not present min_avg_coverage will be set to 0 rendering all genes.
gene_coverage:
min_avg_coverage: 5
FelixMoelder marked this conversation as resolved.
Show resolved Hide resolved

# printing of variants in interactive tables
report:
# if stratification is deactivated, one report for all
Expand Down
21 changes: 21 additions & 0 deletions workflow/resources/datavzrd/gene-coverage-template.datavzrd.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
name: "Mean read depth per gene"

datasets:
gene-coverage:
path: ?input.csv
separator: "\t"

views:
?f"{wildcards.group}":
dataset: gene-coverage
render-table:
columns:
?for sample in params.samples:
?sample:
plot:
heatmap:
scale: linear
range:
- white
- "#ff5555"
aux-domain-columns: ?list(params.samples)
4 changes: 4 additions & 0 deletions workflow/rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,10 @@ def get_final_output(wildcards):
group=groups,
)

final_output.extend(
expand("results/datavzrd-report/{group}.coverage", group=groups)
)

if config["report"]["activate"]:
final_output.extend(
expand(
Expand Down
66 changes: 66 additions & 0 deletions workflow/rules/datavzrd.smk
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,69 @@ rule datavzrd_variants_calls:
"logs/datavzrd_report/{batch}.{event}.log",
wrapper:
"v2.10.0/utils/datavzrd"


rule bedtools_merge:
input:
left="results/regions/{group}/{sample}.regions.bed.gz",
right="results/regions/{group}.covered_regions.bed",
output:
"results/coverage/{group}/{sample}.regions.filtered.bed",
params:
## Add optional parameters
extra="-wa",
log:
"logs/bedtools/{group}/{sample}.log",
wrapper:
"v2.6.0/bio/bedtools/intersect"


rule coverage_table:
input:
lambda wc: expand(
"results/coverage/{{group}}/{sample}.regions.filtered.bed",
sample=get_group_samples(wc.group),
),
output:
"results/coverage/{group}.csv",
params:
min_cov=config["gene_coverage"].get("min_avg_coverage", 0),
conda:
"../envs/pandas.yaml"
log:
"logs/coverage/{group}_coverage_table.log",
script:
"../scripts/coverage_table.py"


rule render_datavzrd_gene_coverage_template:
input:
template=workflow.source_path(
"../resources/datavzrd/gene-coverage-template.datavzrd.yaml"
),
csv="results/coverage/{group}.csv",
output:
"resources/datavzrd/{group}.coverage.yaml",
params:
samples=lambda wc: get_group_samples(wc.group),
log:
"logs/datavzrd_render/{group}.coverage.log",
template_engine:
"yte"


rule datavzrd_coverage:
input:
csv="results/coverage/{group}.csv",
config="resources/datavzrd/{group}.coverage.yaml",
output:
report(
directory("results/datavzrd-report/{group}.coverage"),
htmlindex="index.html",
category="Mean read depth per gene",
labels=lambda wc: {"Group": wc.group},
),
log:
"logs/datavzrd_report/{group}.coverage.log",
wrapper:
"v2.10.0/utils/datavzrd"
8 changes: 4 additions & 4 deletions workflow/rules/ref.smk
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,9 @@ rule get_known_variants:
"v2.5.0/bio/reference/ensembl-variation"


rule get_annotation_gz:
rule get_annotation:
output:
"resources/annotation.gtf.gz",
"resources/annotation.gtf",
FelixMoelder marked this conversation as resolved.
Show resolved Hide resolved
params:
species=config["ref"]["species"],
release=config["ref"]["release"],
Expand All @@ -76,7 +76,7 @@ rule get_annotation_gz:

rule determine_coding_regions:
input:
"resources/annotation.gtf.gz",
"resources/annotation.gtf",
output:
"resources/coding_regions.bed.gz",
log:
Expand All @@ -87,7 +87,7 @@ rule determine_coding_regions:
shell:
# filter for `exon` entries, but unclear how to exclude pseudogene exons...
"""
( zcat {input} | \\
( cat {input} | \\
awk 'BEGIN {{ IFS = "\\t"}} {{ if ($3 == "exon") {{ print $0 }} }}' | \\
grep 'transcript_biotype "protein_coding"' | \\
grep 'gene_biotype "protein_coding"' | \\
Expand Down
14 changes: 14 additions & 0 deletions workflow/rules/regions.smk
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,27 @@ rule get_target_regions:
"""


rule transform_gene_annotations:
input:
"resources/annotation.gtf",
output:
"resources/gene_annotation.bed",
log:
"logs/plot_coverage/transform_gene_regions.log",
script:
"../scripts/transform_gene_regions.py"


rule build_sample_regions:
input:
bam="results/recal/{sample}.bam",
bai="results/recal/{sample}.bai",
bed="resources/gene_annotation.bed",
output:
"results/regions/{group}/{sample}.mosdepth.global.dist.txt",
"results/regions/{group}/{sample}.quantized.bed.gz",
"results/regions/{group}/{sample}.regions.bed.gz",
"results/regions/{group}/{sample}.mosdepth.region.dist.txt",
summary="results/regions/{group}/{sample}.mosdepth.summary.txt", # this named output is required for prefix parsing
log:
"logs/mosdepth/regions/{group}_{sample}.log",
Expand Down
6 changes: 6 additions & 0 deletions workflow/schemas/config.schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,12 @@ properties:
library_length:
type: integer

gene_coverage:
type: object
properties:
min_avg_coverage:
type: integer

report:
type: object
activate:
Expand Down
39 changes: 39 additions & 0 deletions workflow/scripts/coverage_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import pandas as pd
import numpy as np

sys.stderr = open(snakemake.log[0], "w")


def add_missing_columns(df, samples):
missing_columns = set(samples).difference(df.columns)
df[list(missing_columns)] = np.nan
return df


group_regions = dict()
samples = []
for bed in snakemake.input:
sample = bed.split("/")[-1].split(".")[0]
samples.append(sample)
with open(bed, "r") as covered_regions:
for line in covered_regions:
line = line.strip().split("\t")
chromosome = line[0]
gene = line[3]
coverage = line[4]
if float(coverage) < float(snakemake.params.min_cov):
continue
if (chromosome, gene) not in group_regions:
group_regions[(chromosome, gene)] = dict()
group_regions[(chromosome, gene)][sample] = coverage

if bool(group_regions):
df = pd.DataFrame.from_dict(group_regions).T
df.index.names = ("chromosome", "gene")
df.reset_index(inplace=True)
df = add_missing_columns(df, samples)
else:
df = pd.DataFrame(columns=["chromosome", "gene"] + samples)

with open(snakemake.output[0], "w") as csv_file:
df.to_csv(csv_file, index=False, sep="\t")
22 changes: 22 additions & 0 deletions workflow/scripts/transform_gene_regions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
sys.stderr = open(snakemake.log[0], "w")

out_file = open(snakemake.output[0], "w")

with open(snakemake.input[0], "r") as annotations:
for line in annotations.readlines():
if line.startswith("#"):
continue
line = line.split("\t")
chromosome = line[0]
feature = line[2]
if feature != "gene" or chromosome not in list(map(str, range(1, 23))) + [
"X",
"Y",
]:
continue
start = str(int(line[3]) - 1)
end = line[4]
desc = dict([x.split(" ") for x in line[8].split("; ")])
gene = desc.get("gene_name", desc["gene_id"])[1:-1]
print("\t".join([chromosome, start, end, gene]), file=out_file)
out_file.close()
Loading