diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index c2f9fa0..7126e2d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -49,6 +49,8 @@ docker-run: parallel: matrix: - MATRIX_NAME: [ + "str-usersex", + "str-infersex", "missing-ref", "two-models", "all_phased", @@ -73,8 +75,6 @@ docker-run: "wf-human-snp_svrefine", "wf-human-sv", "wf-human-sv-phase", - "str-usersex", - "str-infersex", "wf-human-cnv-spectre", "wf-human-phase_all", "wf-human-phase_all_lp", diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 87626df..e950f47 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -8,7 +8,7 @@ repos: always_run: true pass_filenames: false additional_dependencies: - - epi2melabs==0.0.56 + - epi2melabs==0.0.57 - id: build_models name: build_models entry: datamodel-codegen --strict-nullable --base-class workflow_glue.results_schema_helpers.BaseModel --use-schema-description --disable-timestamp --input results_schema.yml --input-file-type openapi --output bin/workflow_glue/results_schema.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 9055720..20a4489 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,13 +4,14 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [unreleased] +## [v2.4.0] ### Added - IGV configuration for the EPI2ME App includes the output VCF files. ### Changed - Emit indexes for the input reference, when generated by the workflow. - Links to the reference genomes in the README now point to `bgzip`-compressed fasta files. - Updated `modkit` to v0.3.3. +- Reconciled workflow with wf-template v5.2.5 ### Fixed - `ERROR ~ No such variable: colors` when the workflow cannot find the reference file. - Incorrect bin size unit in QDNAseq wrapper script help text (@HudoGriz, #209). 
diff --git a/README.md b/README.md index f48ed02..f3c1881 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,7 @@ therefore Nextflow will need to be installed before attempting to run the workflow. The workflow can currently be run using either -[Docker](https://www.docker.com/products/docker-desktop +[Docker](https://www.docker.com/products/docker-desktop) or [Singularity](https://docs.sylabs.io/guides/3.0/user-guide/index.html) to provide isolation of the required software. Both methods are automated out-of-the-box provided diff --git a/base.config b/base.config index 1dab263..7d98459 100644 --- a/base.config +++ b/base.config @@ -9,7 +9,7 @@ params { str_tag = "shadd2f2963fe39351d4e0d6fa3ca54e1064c6ec057" spectre_tag = "sha49a9fe474da9860f84f08f17f137b47a010b1834" snpeff_tag = "shadcc812849019640d4d2939703fbb8777256e41ad" - common_sha = "shad399cf22079b5b153920ac39ee40095a677933f1" + common_sha = "shae58638742cf84dbeeec683ba24bcdee67f64b986" } } diff --git a/bin/workflow_glue/check_bam_headers_in_dir.py b/bin/workflow_glue/check_bam_headers_in_dir.py index 44e689b..199e056 100755 --- a/bin/workflow_glue/check_bam_headers_in_dir.py +++ b/bin/workflow_glue/check_bam_headers_in_dir.py @@ -29,7 +29,13 @@ def main(args): for xam_file in target_files: # get the `@SQ` and `@HD` lines in the header with pysam.AlignmentFile(xam_file, check_sq=False) as f: - sq_lines = f.header.get("SQ") + # compare only the SN/LN/M5 elements of SQ to avoid labelling XAM with + # same reference but different SQ.UR as mixed_header (see CW-4842) + sq_lines = [{ + "SN": sq["SN"], + "LN": sq["LN"], + "M5": sq.get("M5"), + } for sq in f.header.get("SQ", [])] hd_lines = f.header.get("HD") # Check if it is sorted. 
# When there is more than one BAM, merging/sorting diff --git a/bin/workflow_glue/check_xam_index.py b/bin/workflow_glue/check_xam_index.py index 3beae14..f9f631e 100755 --- a/bin/workflow_glue/check_xam_index.py +++ b/bin/workflow_glue/check_xam_index.py @@ -14,12 +14,12 @@ def validate_xam_index(xam_file): Invalid indexes will fail the call with a ValueError: ValueError: fetch called on bamfile without index """ - alignments = pysam.AlignmentFile(xam_file, check_sq=False) - try: - alignments.fetch() - has_valid_index = True - except ValueError: - has_valid_index = False + with pysam.AlignmentFile(xam_file, check_sq=False) as alignments: + try: + alignments.fetch() + has_valid_index = True + except ValueError: + has_valid_index = False return has_valid_index diff --git a/bin/workflow_glue/report_snp.py b/bin/workflow_glue/report_snp.py index d955707..2bc158d 100755 --- a/bin/workflow_glue/report_snp.py +++ b/bin/workflow_glue/report_snp.py @@ -5,6 +5,7 @@ import os import sys +from bokeh.models import HoverTool from dominate.tags import a, h6, p from ezcharts.components.bcfstats import load_bcfstats from ezcharts.components.clinvar import load_clinvar_vcf @@ -227,6 +228,9 @@ def main(args): else: sizes = indel_sizes(bcfstats['IDD']) plt = barplot(data=sizes, x="nlength", y="count", color=Colors.cerulean) + # Add tooltips + hover = plt._fig.select(dict(type=HoverTool)) + hover.tooltips = [("Number of variants", "@top")] EZChart(plt, 'epi2melabs') # write report diff --git a/bin/workflow_glue/report_str.py b/bin/workflow_glue/report_str.py index 806b99c..81f2b70 100755 --- a/bin/workflow_glue/report_str.py +++ b/bin/workflow_glue/report_str.py @@ -2,7 +2,7 @@ """Plot STRs.""" from bokeh.models import BoxZoomTool, ColumnDataSource, HoverTool -from bokeh.models import PanTool, Range1d, ResetTool, WheelZoomTool +from bokeh.models import PanTool, Range1d, ResetTool, Title, WheelZoomTool from dominate.tags import b, code, h3, p, span, table, tbody, td, th, thead, tr from 
ezcharts.components import fastcat from ezcharts.components.ezchart import EZChart @@ -11,7 +11,6 @@ from ezcharts.layout.snippets import Grid, Tabs from ezcharts.plots import BokehPlot from ezcharts.plots.distribution import histplot -from ezcharts.plots.distribution import MakeRectangles from natsort import natsorted import pandas as pd from .util import wf_parser  # noqa: ABS101 @@ -91,50 +90,37 @@ def argparser(): return parser -def add_triangle(plt, x_pos, idx): -    """Draw triangle.""" -    plt.add_series(dict( -        type='line', -        datasetIndex=idx, -        markPoint={ -            'data': [{ -                'symbol': 'triangle', -                'coord': [x_pos, 0], -                'symbolSize': [20, 20], -                'symbolOffset': [0, 15], -                'itemStyle': { -                    'color': 'rgb(255, 255, 255)', -                    'borderColor': 'rgb(0, 0, 0)', -                    'borderWidth': 1.5 -                } -            }] -        } -    )) - - -def add_rectangle(plt, x_start, x_end, height, idx, rectangle): +def add_line(plt, x_pos, height): +    """Draw vertical line.""" +    plt._fig.line( +        [x_pos, x_pos], +        [0, height], +        line_width=2, +        line_dash='dashed', +        color='rgba(0, 0, 0, 0.4)' +    ) + + +def add_rectangle(plt, x_start, x_end, height, rectangle): """Draw rectangle.""" if rectangle == 'normal': colour = 'rgba(144, 198, 231, 0.4)' elif rectangle == 'pathogenic': colour = 'rgba(239, 65, 53, 0.4)' -    plt.add_dataset(dict( -        source=[[x_start, x_end, height]], -        dimensions=['x_starts', 'ends', 'heights'] -    )) -    plt.add_series(dict( -        type='custom', -        name=rectangle+" range", -        renderItem=MakeRectangles(), -        datasetIndex=idx, -        encode={ -            'x': ['x_starts', 'ends'], -            'y': ['heights'] -        }, -        itemStyle={ -            'color': colour}, -        clip=True -    )) +    # X/Y values in bokeh rect point to the central position on the +    # axis of the figure. 
+ plt._fig.rect( + x=(x_end+x_start)/2, + y=0 + height/2, + width=x_end-x_start, + height=height, + legend_label=rectangle, + fill_color=colour, + line_color=colour + ) + + plt._fig.legend.location = "top" + plt._fig.legend.orientation = "horizontal" def parse_vcf(fname, info_cols=None, nrows=None): @@ -204,17 +190,22 @@ def histogram_with_mean_and_median( raise ValueError("`series` must be `pd.Series`.") plt = histplot(data=series, bins=bins) - plt.title = dict( - text=title, - subtext=( - f"Mean: {series.mean().round(round_digits)}. " - f"Median: {series.median().round(round_digits)}" - ), + subtext = ( + f"Mean: {series.mean().round(round_digits)}. " + + f"Median: {series.median().round(round_digits)}" + ) + plt._fig.add_layout( + Title(text=subtext, text_font_size="0.8em"), + 'above' + ) + plt._fig.add_layout( + Title(text=title, text_font_size="1.5em"), + 'above' ) if x_axis_name is not None: - plt.xAxis.name = x_axis_name + plt._fig.xaxis.axis_label = x_axis_name if y_axis_name is not None: - plt.yAxis.name = y_axis_name + plt._fig.yaxis.axis_label = y_axis_name return plt @@ -223,38 +214,25 @@ def create_str_histogram( cn1, cn2, disease): """Create a histogram of STR results for a given repeat.""" h3(disease + ' (' + repeat + ')') + df = hist_data[hist_data['VARID'] == repeat]['copy_number'] + plt = histplot( + data=df, + x='copy_number', + binwidth=1 + ) + histogram_data = plt._fig.renderers[0].data_source.to_df() - plt = histplot(data=hist_data[hist_data['VARID'] == repeat] - ['copy_number'].values, binwidth=1) - histogram_data = plt.dataset[0].source max_cols = histogram_data.max(axis=0) - max_rectangle_height = max_cols[2] - - xaxis = { - 'name': "Repeat number", - 'nameGap': '30', - 'nameLocation': 'middle', - 'nameTextStyle': {'fontSize': '14', 'fontStyle': 'bold'}, - 'min': '0', - 'max': pathologic_max - } - - yaxis = { - 'name': "Number of supporting reads", - 'nameGap': '30', - 'nameLocation': 'middle', - 'nameTextStyle': {'fontSize': '14', 
'fontStyle': 'bold'}, - 'max': max_rectangle_height - } - - plt.xAxis = xaxis - plt.yAxis = yaxis + max_rectangle_height = max_cols.top + + plt._fig.xaxis.axis_label = "Repeat number" + plt._fig.yaxis.axis_label = "Number of supporting reads" add_rectangle( - plt, 0, normal_max, max_rectangle_height, 1, 'normal' + plt, 0, normal_max, max_rectangle_height, 'normal' ) add_rectangle( - plt, pathologic_min, pathologic_max, max_rectangle_height, 2, + plt, pathologic_min, pathologic_max, max_rectangle_height, 'pathogenic' ) @@ -265,8 +243,14 @@ def create_str_histogram( {"name": "pathogenic range"}, ]} - add_triangle(plt, cn1, 3) - add_triangle(plt, cn2, 4) + # add_triangle(plt, cn1) + # add_triangle(plt, cn2) + add_line(plt, cn1, max_rectangle_height) + add_line(plt, cn2, max_rectangle_height) + + # Remove hover + hover = plt._fig.select(dict(type=HoverTool)) + hover.tooltips = None EZChart(plt, theme='epi2melabs') @@ -456,7 +440,7 @@ def make_report( """ The tabs below contain short tandem repeat (STR) expansion plots for each repeat genotyped in the sample. The coloured boxes denote the normal and - pathogenic ranges of repeat numbers, and the triangles denote the median + pathogenic ranges of repeat numbers, and the dashed lines denote the median number of repeats in each allele. 
""" ) @@ -693,7 +677,10 @@ def make_report( ) plt = fastcat.read_length_plot(read_lengths) - plt.xAxis.max = max_read_length_to_show + plt._fig.x_range.end = max_read_length_to_show + # Add tooltips + hover = plt._fig.select(dict(type=HoverTool)) + hover.tooltips = [("Number of reads", "@top")] EZChart(plt, theme='epi2melabs') plt = histogram_with_mean_and_median( @@ -702,6 +689,9 @@ def make_report( x_axis_name="Quality", y_axis_name="Number of reads" ) + # Add tooltips + hover = plt._fig.select(dict(type=HoverTool)) + hover.tooltips = [("Number of reads", "@top")] EZChart(plt, theme='epi2melabs') return report diff --git a/bin/workflow_glue/report_sv.py b/bin/workflow_glue/report_sv.py index 044f463..010b53a 100755 --- a/bin/workflow_glue/report_sv.py +++ b/bin/workflow_glue/report_sv.py @@ -4,6 +4,7 @@ import json import sys +from bokeh.models import HoverTool, Title from dominate.tags import a, h4, p from ezcharts.components.ezchart import EZChart from ezcharts.components.reports.labs import LabsReport @@ -194,16 +195,16 @@ def sv_size_plots(vcf_data, max_size=5000): # Add deletion plot plt = histplot(data=indels, bins=bins, stat='count') # override excharts axisLabel interval - plt.xAxis = dict( - name='Length', - axisLabel=dict( - interval="auto", - rotate=30 - ), - max=max_size, - min=-max_size - ) - plt.title = {"text": "Indels size distribution"} + plt._fig.add_layout( + Title(text="Indels size distribution", text_font_size="1.5em"), + 'above' + ) + plt._fig.xaxis.axis_label = 'Length' + plt._fig.x_range.start = -max_size + plt._fig.x_range.end = max_size + # Add tooltips + hover = plt._fig.select(dict(type=HoverTool)) + hover.tooltips = [("Number of variants", "@top")] EZChart(plt, 'epi2melabs') p("The plot shows Indels with |length| < 5Kb.") else: diff --git a/bin/workflow_glue/report_utils/sections.py b/bin/workflow_glue/report_utils/sections.py index 9ace65e..652e219 100755 --- a/bin/workflow_glue/report_utils/sections.py +++ 
b/bin/workflow_glue/report_utils/sections.py @@ -1,6 +1,7 @@ """Sections for ezcharts report.""" import os +from bokeh.models import HoverTool, Title import pandas as pd import dominate.tags as dom_tags # noqa: I100,I202 @@ -206,12 +207,17 @@ def depths(report, depth_df, sample_name): x="total_mean_pos", y="depth", hue="chrom", + linewidth=1, + marker=False ) - plt.title = {"text": "Coverage along reference"} - plt.xAxis.name = "Position along reference" - plt.yAxis.name = "Sequencing depth" - for s in plt.series: - s.showSymbol = False + plt._fig.add_layout( + Title(text="Coverage along reference", text_font_size="1.5em"), + 'above' + ) + plt._fig.xaxis.axis_label = "Position along reference" + plt._fig.yaxis.axis_label = "Sequencing depth" + hover = plt._fig.select(dict(type=HoverTool)) + hover.tooltips = [("Depth", "@y")] EZChart(plt, theme=THEME) diff --git a/docs/04_install_and_run.md b/docs/04_install_and_run.md index 5477810..5c306f1 100644 --- a/docs/04_install_and_run.md +++ b/docs/04_install_and_run.md @@ -9,7 +9,7 @@ therefore Nextflow will need to be installed before attempting to run the workflow. The workflow can currently be run using either -[Docker](https://www.docker.com/products/docker-desktop +[Docker](https://www.docker.com/products/docker-desktop) or [Singularity](https://docs.sylabs.io/guides/3.0/user-guide/index.html) to provide isolation of the required software. 
Both methods are automated out-of-the-box provided diff --git a/lib/common.nf b/lib/common.nf index 2a31d49..3a8568d 100644 --- a/lib/common.nf +++ b/lib/common.nf @@ -15,6 +15,7 @@ process getParams { } process configure_igv { + publishDir "${params.out_dir}/", mode: 'copy', pattern: 'igv.json', enabled: params.containsKey("igv") && params.igv label "wf_common" cpus 1 memory "2 GB" diff --git a/lib/ingress.nf b/lib/ingress.nf index 6d14a83..2931357 100644 --- a/lib/ingress.nf +++ b/lib/ingress.nf @@ -197,15 +197,15 @@ def fastq_ingress(Map arguments) .map { meta, files, stats -> // new `arity: '1..*'` would be nice here files = files instanceof List ? files : [files] - new_keys = [ + def new_keys = [ "group_key": groupKey(meta["alias"], files.size()), "n_fastq": files.size()] - grp_index = (0.. - new_keys = [ + def new_keys = [ "group_index": "${meta["alias"]}_${grp_i}"] [meta + new_keys, files, stats] } @@ -279,17 +279,19 @@ def xam_ingress(Map arguments) // sorted, the index will be used. meta, paths -> boolean is_array = paths instanceof ArrayList - String xai_fn + String src_xam + String src_xai // Using `.uri` or `.Uri()` leads to S3 paths to be prefixed with `s3:///` // instead of `s3://`, causing the workflow to not find the index file. // `.toUriString()` returns the correct path. 
if (!is_array){ + src_xam = paths.toUriString() def xai = file(paths.toUriString() + ".bai") if (xai.exists()){ - xai_fn = xai.toUriString() + src_xai = xai.toUriString() } } - [meta + [xai_fn: xai_fn], paths] + [meta + [src_xam: src_xam, src_xai: src_xai], paths] } | checkBamHeaders | map { meta, paths, is_unaligned_env, mixed_headers_env, is_sorted_env -> @@ -331,9 +333,9 @@ def xam_ingress(Map arguments) // - between 1 and `N_OPEN_FILES_LIMIT` aligned files no_files: n_files == 0 indexed: \ - n_files == 1 && (meta["is_unaligned"] || meta["is_sorted"]) && meta["xai_fn"] - to_index: - n_files == 1 && (meta["is_unaligned"] || meta["is_sorted"]) && !meta["xai_fn"] + n_files == 1 && (meta["is_unaligned"] || meta["is_sorted"]) && meta["src_xai"] + to_index: \ + n_files == 1 && (meta["is_unaligned"] || meta["is_sorted"]) && !meta["src_xai"] to_catsort: \ (n_files == 1) || (n_files > N_OPEN_FILES_LIMIT) || meta["is_unaligned"] to_merge: true @@ -358,20 +360,20 @@ def xam_ingress(Map arguments) .map { meta, files, stats -> // new `arity: '1..*'` would be nice here files = files instanceof List ? files : [files] - new_keys = [ + def new_keys = [ "group_key": groupKey(meta["alias"], files.size()), "n_fastq": files.size()] - grp_index = (0.. 
- new_keys = [ + def new_keys = [ "group_index": "${meta["alias"]}_${grp_i}"] [meta + new_keys, files, stats] } .map { meta, path, stats -> - [meta.findAll { it.key !in ['xai_fn', 'is_sorted'] }, path, stats] + [meta.findAll { it.key !in ['is_sorted', 'src_xam', 'src_xai'] }, path, stats] } // add number of reads, run IDs, and basecall models to meta @@ -388,10 +390,18 @@ def xam_ingress(Map arguments) | sortBam | groupTuple | mergeBams + | map{ + meta, bam, bai -> + [meta + [src_xam: null, src_xai: null], bam, bai] + } // now handle samples with too many files for `samtools merge` ch_catsorted = ch_result.to_catsort | catSortBams + | map{ + meta, bam, bai -> + [meta + [src_xam: null, src_xai: null], bam, bai] + } // Validate the index of the input BAM. // If the input BAM index is invalid, regenerate it. @@ -399,7 +409,7 @@ def xam_ingress(Map arguments) ch_to_validate = ch_result.indexed | map{ meta, paths -> - bai = paths && meta.xai_fn ? file(meta.xai_fn) : null + def bai = paths && meta.src_xai ? file(meta.src_xai) : null [meta, paths, bai] } | branch { @@ -429,6 +439,10 @@ def xam_ingress(Map arguments) ch_indexed = ch_result.to_index | mix( ch_validated.invalid_idx ) | samtools_index + | map{ + meta, bam, bai -> + [meta + [src_xai: null], bam, bai] + } // Add extra null for the missing index to input.missing // as well as the missing metadata. 
@@ -439,7 +453,7 @@ def xam_ingress(Map arguments) ) | map{ meta, paths -> - [meta + [xai_fn: null, is_sorted: false], paths, null] + [meta + [src_xam: null, src_xai: null, is_sorted: false], paths, null] } // Combine all possible inputs @@ -480,7 +494,7 @@ def xam_ingress(Map arguments) } // Remove metadata that are unnecessary downstream: - // meta.xai_fn: not needed, as it will be part of the channel as a file + // meta.src_xai: not needed, as it will be part of the channel as a file // meta.is_sorted: if data are aligned, they will also be sorted/indexed // // The output meta can contain the following flags: @@ -498,7 +512,7 @@ def xam_ingress(Map arguments) ch_result | map{ meta, bam, bai, stats -> - [meta.findAll { it.key !in ['xai_fn', 'is_sorted'] }, [bam, bai], stats] + [meta.findAll { it.key !in ['is_sorted'] }, [bam, bai], stats] }, "xam" ) @@ -508,6 +522,19 @@ def xam_ingress(Map arguments) | map{ it.flatten() } + // Final check to ensure that src_xam/src_xai is not an s3 + // path. If so, drop it. We check src_xam also for src_xai + // as, the latter is irrelevant if the former is in s3. + | map{ + meta, bam, bai, stats -> + def xam = meta.src_xam + def xai = meta.src_xai + if (meta.src_xam){ + xam = meta.src_xam.startsWith('s3://') ? null : meta.src_xam + xai = meta.src_xam.startsWith('s3://') ? null : meta.src_xai + } + [ meta + [src_xam: xam, src_xai: xai], bam, bai, stats ] + } return ch_result } diff --git a/nextflow.config b/nextflow.config index eff1e84..6d83d37 100755 --- a/nextflow.config +++ b/nextflow.config @@ -131,7 +131,7 @@ manifest { description = 'SNV calling, SV calling, modified base calling, CNV calling, and STR genotyping of human samples.' mainScript = 'main.nf' nextflowVersion = '>=23.04.2' - version = '2.3.1' + version = '2.4.0' } epi2melabs {