Merge branch 'master' into fix/numpy-yte-parsing

snakemake-workflows · Dec 3, 2024 · 8058ee0 · 8058ee0
2 parents 9170582 + 7c24fb8
commit 8058ee0
Show file tree

Hide file tree

Showing 15 changed files with 64 additions and 85 deletions.
diff --git a/.test/config-chm-eval/config.yaml b/.test/config-chm-eval/config.yaml
@@ -15,7 +15,6 @@ ref:
 
 primers:
   trimming:
-    activate: false
     primers_fa1: ""
     primers_fa2: ""
     library_length: 0

diff --git a/.test/config-giab/config.yaml b/.test/config-giab/config.yaml
@@ -16,10 +16,9 @@ ref:
 
 primers:
   trimming:
-    activate: false
     # path to fasta files containing primer sequences
-    primers_fa1: "path/to/primer-fa1"
-    primers_fa2: "path/to/primer-fa2"
+    primers_fa1: ""
+    primers_fa2: ""
     # Library mean + error determines the maximum insert size between the outer primer ends.
     # Specify 0 to have yara autodetect the primer library insert size error.
     library_error: 0

diff --git a/.test/config-no-candidate-filtering/config.yaml b/.test/config-no-candidate-filtering/config.yaml
@@ -15,7 +15,6 @@ ref:
 
 primers:
   trimming:
-    activate: false
     primers_fa1: ""
     primers_fa2: ""
     library_length: 0

diff --git a/.test/config-simple/config.yaml b/.test/config-simple/config.yaml
@@ -15,7 +15,6 @@ ref:
 
 primers:
   trimming:
-    activate: false
     primers_fa1: ""
     primers_fa2: ""
     library_error: 0

diff --git a/.test/config-sra/config.yaml b/.test/config-sra/config.yaml
@@ -15,7 +15,6 @@ ref:
 
 primers:
   trimming:
-    activate: false
     primers_fa1: ""
     primers_fa2: ""
     library_length: 0

diff --git a/.test/config-target-regions/config.yaml b/.test/config-target-regions/config.yaml
@@ -17,7 +17,6 @@ ref:
 
 primers:
   trimming:
-    activate: false
     primers_fa1: ""
     primers_fa2: ""
     library_length: 0

diff --git a/.test/config-target-regions/config_multiple_beds.yaml b/.test/config-target-regions/config_multiple_beds.yaml
@@ -19,7 +19,6 @@ ref:
 
 primers:
   trimming:
-    activate: false
     primers_fa1: ""
     primers_fa2: ""
     library_length: 0

diff --git a/.test/config_primers/config.yaml b/.test/config_primers/config.yaml
@@ -16,7 +16,6 @@ ref:
 
 primers:
   trimming:
-    activate: true
     primers_fa1: "a.scerevisiae.1_primers.fq"
     primers_fa2: "a.scerevisiae.2_primers.fq"
     library_length: 400

diff --git a/config/README.md b/config/README.md
@@ -46,6 +46,7 @@ Defining primers directly in the config file is prefered when all samples come f
 In case of different panels, primers have to be set panel-wise in a separate tsv-file.
 For each panel the following columns need to be set: `panel`, `fa1` and `fa2` (optional).
 Additionally, for each sample the corresponding panel must be defined in `samples.tsv` (column `panel`).
+If a panel is not provided for a sample, trimming will not be performed on that sample. 
 For single primer trimming only, the first entry in the config (or in the tsv file, respectively) needs to be defined.
 
 # Annotating UMIS

diff --git a/config/config.yaml b/config/config.yaml
@@ -30,12 +30,13 @@ ref:
   # This is usually only relevant for testing.
   # chromosome: 21
 
+# Trimming will be applied if global primer sequences are
+# provided or primer panels are set in the sample sheet
 primers:
   trimming:
-    activate: false
-    # path to fasta files containg primer sequences
-    primers_fa1: "path/to/primer-fa1"
-    primers_fa2: "path/to/primer-fa2"
+    # path to fasta files containing primer sequences
+    primers_fa1: ""
+    primers_fa2: ""
     # optional primer file allowing to define primers per sample
     # overwrites primers_fa1 and primers_fa2
     # the tsv file requires three fields: panel, fa1 and fa2 (optional)

diff --git a/workflow/resources/datavzrd/clinical_significance.js b/workflow/resources/datavzrd/clinical_significance.js
diff --git a/workflow/resources/datavzrd/variant-calls-template.datavzrd.yaml b/workflow/resources/datavzrd/variant-calls-template.datavzrd.yaml
@@ -202,7 +202,8 @@ views:
                 url: https://www.ensembl.org/Homo_sapiens/Transcript/Summary?t={feature}
           'clinical significance':
             optional: true
-            custom: ?read_file(input.clin_sig)
+            spell:
+              url: v1.2.1/med/clin-sig
           hgvsp:
             custom: ?hgvs_content
           hgvsc:
@@ -305,18 +306,18 @@ views:
             optional: true
             display-mode: detail
             spell:
-              url: v1.1.3/logic/boolean
+              url: v1.2.1/logic/boolean
               with:
                 true_value: "True"
-                false_value: ""
+                false_value: "False"
           mane_plus_clinical:
             optional: true
             display-mode: detail
             spell:
-              url: v1.1.3/logic/boolean
+              url: v1.2.1/logic/boolean
               with:
                 true_value: "True"
-                false_value: ""
+                false_value: "False"
           ?for alias in params.samples.loc[params.samples["group"] == group, "alias"]:
             '?f"{alias}: short observations"':
               optional: true
@@ -476,18 +477,18 @@ views:
             optional: true
             display-mode: detail
             spell:
-              url: v1.1.3/logic/boolean
+              url: v1.2.1/logic/boolean
               with:
                 true_value: "True"
-                false_value: ""
+                false_value: "False"
           mane_plus_clinical:
             optional: true
             display-mode: detail
             spell:
-              url: v1.1.3/logic/boolean
+              url: v1.2.1/logic/boolean
               with:
                 true_value: "True"
-                false_value: ""
+                false_value: "False"
           ?for alias in params.samples.loc[params.samples["group"] == group, "alias"]:
             '?f"{alias}: short observations"':
               optional: true

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -260,20 +260,11 @@ def get_control_fdr_input(wildcards):
         return "results/final-calls/{group}.{calling_type}.annotated.bcf"
 
 
-def get_recalibrate_quality_input(wildcards, bai=False):
-    ext = "bai" if bai else "bam"
-    datatype = get_sample_datatype(wildcards.sample)
-    if datatype == "rna":
-        return "results/split/{{sample}}.{ext}".format(ext=ext)
-    # Post-processing of DNA samples
-    if is_activated("calc_consensus_reads"):
-        return "results/consensus/{{sample}}.{ext}".format(ext=ext)
-    elif is_activated("primers/trimming"):
-        return "results/trimmed/{{sample}}.trimmed.{ext}".format(ext=ext)
-    elif is_activated("remove_duplicates"):
-        return "results/dedup/{{sample}}.{ext}".format(ext=ext)
+def get_aligner(wildcards):
+    if get_sample_datatype(wildcards.sample) == "rna":
+        return "star"
     else:
-        return "results/mapped/bwa/{{sample}}.{ext}".format(ext=ext)
+        return "bwa"
 
 
 def get_cutadapt_input(wildcards):
@@ -376,11 +367,6 @@ def is_paired_end(sample):
     return all_paired
 
 
-def group_is_paired_end(group):
-    samples = get_group_samples(group)
-    return all([is_paired_end(sample) for sample in samples])
-
-
 def get_map_reads_input(wildcards):
     if is_paired_end(wildcards.sample):
         return [
@@ -428,31 +414,42 @@ def get_sample_datatype(sample):
 
 
 def get_markduplicates_input(wildcards):
-    aligner = "star" if get_sample_datatype(wildcards.sample) == "rna" else "bwa"
+    aligner = get_aligner(wildcards)
     if sample_has_umis(wildcards.sample):
-        return "results/mapped/{aligner}/{{sample}}.annotated.bam".format(
-            aligner=aligner
-        )
+        return f"results/mapped/{aligner}/{{sample}}.annotated.bam"
+    else:
+        return f"results/mapped/{aligner}/{{sample}}.bam"
+
+
+def get_recalibrate_quality_input(wildcards, bai=False):
+    ext = "bai" if bai else "bam"
+    datatype = get_sample_datatype(wildcards.sample)
+    if datatype == "rna":
+        return "results/split/{{sample}}.{ext}".format(ext=ext)
+    # Post-processing of DNA samples
+    if is_activated("calc_consensus_reads"):
+        return "results/consensus/{{sample}}.{ext}".format(ext=ext)
     else:
-        return "results/mapped/{aligner}/{{sample}}.bam".format(aligner=aligner)
+        return get_consensus_input(wildcards, bai)
 
 
-def get_consensus_input(wildcards):
-    if is_activated("primers/trimming"):
-        return "results/trimmed/{sample}.trimmed.bam"
-    elif is_activated("remove_duplicates"):
-        return "results/dedup/{sample}.bam"
+def get_consensus_input(wildcards, bai=False):
+    ext = "bai" if bai else "bam"
+    if sample_has_primers(wildcards):
+        return "results/trimmed/{{sample}}.trimmed.{ext}".format(ext=ext)
     else:
-        aligner = "star" if get_sample_datatype(wildcards.sample) == "rna" else "bwa"
-        return "results/mapped/{aligner}/{{sample}}.bam".format(aligner=aligner)
+        return get_trimming_input(wildcards, bai)
 
 
-def get_trimming_input(wildcards):
+def get_trimming_input(wildcards, bai=False):
+    ext = "bai" if bai else "bam"
     if is_activated("remove_duplicates"):
-        return "results/dedup/{sample}.bam"
+        return "results/dedup/{{sample}}.{ext}".format(ext=ext)
     else:
-        aligner = "star" if get_sample_datatype(wildcards.sample) == "rna" else "bwa"
-        return "results/mapped/{aligner}/{{sample}}.bam".format(aligner=aligner)
+        aligner = get_aligner(wildcards)
+        return "results/mapped/{aligner}/{{sample}}.{ext}".format(
+            aligner=aligner, ext=ext
+        )
 
 
 def get_primer_bed(wc):
@@ -549,8 +546,6 @@ def get_markduplicates_extra(wc):
 
 def get_group_bams(wildcards, bai=False):
     ext = "bai" if bai else "bam"
-    if is_activated("primers/trimming") and not group_is_paired_end(wildcards.group):
-        WorkflowError("Primer trimming is only available for paired end data.")
     return expand(
         "results/recal/{sample}.{ext}",
         sample=get_group_samples(wildcards.group),
@@ -1373,6 +1368,21 @@ def get_umi_fastq(wildcards):
         return umi_read
 
 
+def sample_has_primers(wildcards):
+    sample_name = wildcards.sample
+
+    if config["primers"]["trimming"].get("primers_fa1") or (
+        "panel" in samples.columns
+        and samples.loc[samples["sample_name"] == sample_name, "panel"].notna().any()
+    ):
+        if not is_paired_end(sample_name):
+            raise WorkflowError(
+                f"Primer trimming is only available for paired-end data. Sample '{sample_name}' is not paired-end."
+            )
+        return True
+    return False
+
+
 def sample_has_umis(sample):
     return pd.notna(extract_unique_sample_column_value(sample, "umi_read"))
 

diff --git a/workflow/rules/datavzrd.smk b/workflow/rules/datavzrd.smk
@@ -68,7 +68,6 @@ rule datavzrd_variants_calls:
         spec_short_observations=workflow.source_path(
             "../resources/datavzrd/spec_short_observations.json"
         ),
-        clin_sig=workflow.source_path("../resources/datavzrd/clinical_significance.js"),
         data_short_observations=workflow.source_path(
             "../resources/datavzrd/data_short_observations.js"
         ),

diff --git a/workflow/rules/mapping.smk b/workflow/rules/mapping.smk
@@ -20,8 +20,6 @@ rule merge_untrimmed_fastqs:
         get_untrimmed_fastqs,
     output:
         temp("results/untrimmed/{sample}_{read}.fastq.gz"),
-    conda:
-        "../envs/fgbio.yaml"
     log:
         "logs/merge-fastqs/untrimmed/{sample}_{read}.log",
     wildcard_constraints: