diff --git a/CHANGELOG.md b/CHANGELOG.md index fb468e4..1e74321 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,16 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [v1.1.0] +### Changed +* Updated Dorado container image to use Dorado v0.1.1 + * Latest models are now v4.0.0 + * Workflow prints a more helpful error when Dorado fails due to unknown model name +* Updated wf-human-snp container image to load new Clair3 models for v4 basecalling +* Default `basecaller_cfg` set to `dna_r10.4.1_e8.2_400bps_sup@v4.0.0` +### Added +* `--basecaller_args` may be used to provide custom arguments to the basecalling process + ## [v1.0.1] ### Changed * Default `basecaller_cfg` set to `dna_r10.4.1_e8.2_400bps_sup@v3.5.2` diff --git a/data/clair3_models.tsv b/data/clair3_models.tsv index c441d94..280d39b 100644 --- a/data/clair3_models.tsv +++ b/data/clair3_models.tsv @@ -1,26 +1,30 @@ -basecaller basecall_model_name clair3_model_name clair3_nomodel_reason -dorado dna_r10.4.1_e8.2_260bps_fast@v3.5.2 - 260bps models are not supported for variant calling with Clair3 -dorado dna_r10.4.1_e8.2_260bps_hac@v3.5.2 - 260bps models are not supported for variant calling with Clair3 -dorado dna_r10.4.1_e8.2_260bps_sup@v3.5.2 - 260bps models are not supported for variant calling with Clair3 -dorado dna_r10.4.1_e8.2_400bps_fast@v3.5.2 - data called with fast models is not suitable for variant calling -dorado dna_r10.4.1_e8.2_400bps_hac@v3.5.2 r1041_e82_400bps_hac_g632 - -dorado dna_r10.4.1_e8.2_400bps_sup@v3.5.2 r1041_e82_400bps_sup_g615 - -dorado dna_r9.4.1_e8_fast@v3.4 - data called with fast models is not suitable for variant calling -dorado dna_r9.4.1_e8_hac@v3.3 r941_prom_sup_g5014 - -dorado dna_r9.4.1_e8_sup@v3.3 r941_prom_sup_g5014 - -guppy dna_r10.4.1_e8.2_400bps_hac_prom 
r1041_e82_400bps_hac_g632 - -guppy dna_r9.4.1_450bps_hac_prom r941_prom_sup_g5014 - -guppy dna_r10.3_450bps_hac - Clair3 has not been trained on this basecalling configuration -guppy dna_r10.3_450bps_hac_prom - Clair3 has not been trained on this basecalling configuration -guppy dna_r10.4.1_e8.2_260bps_hac - 260bps models are not supported for variant calling with Clair3 -guppy dna_r10.4.1_e8.2_260bps_hac_prom - 260bps models are not supported for variant calling with Clair3 -guppy dna_r10.4.1_e8.2_400bps_hac r1041_e82_400bps_hac_g632 - -guppy dna_r10.4_e8.1_hac - Clair3 has not been trained on this basecalling configuration -guppy dna_r10.4_e8.1_hac_prom - Clair3 has not been trained on this basecalling configuration -guppy dna_r10_450bps_hac - Clair3 has not been trained on this basecalling configuration -guppy dna_r9.4.1_450bps_hac r941_prom_sup_g5014 - -guppy dna_r9.4.1_e8.1_hac - Clair3 has not been trained on this basecalling configuration -guppy dna_r9.4.1_e8.1_hac_prom - Clair3 has not been trained on this basecalling configuration -guppy dna_r9.5_450bps - Clair3 has not been trained on this basecalling configuration -guppy rna_r9.4.1_70bps_hac - RNA data is not suitable for this workflow -guppy rna_r9.4.1_70bps_hac_prom - RNA data is not suitable for this workflow \ No newline at end of file +basecaller basecall_model_name clair3_model_name clair3_nomodel_reason +dorado dna_r10.4.1_e8.2_260bps_hac@v4.0.0 - 260bps models are not supported for variant calling with Clair3 +dorado dna_r10.4.1_e8.2_260bps_sup@v4.0.0 - 260bps models are not supported for variant calling with Clair3 +dorado dna_r10.4.1_e8.2_400bps_hac@v4.0.0 r1041_e82_400bps_hac_v400 - +dorado dna_r10.4.1_e8.2_400bps_sup@v4.0.0 r1041_e82_400bps_sup_v400 - +dorado dna_r10.4.1_e8.2_260bps_fast@v3.5.2 - 260bps models are not supported for variant calling with Clair3 +dorado dna_r10.4.1_e8.2_260bps_hac@v3.5.2 - 260bps models are not supported for variant calling with Clair3 +dorado 
dna_r10.4.1_e8.2_260bps_sup@v3.5.2 - 260bps models are not supported for variant calling with Clair3 +dorado dna_r10.4.1_e8.2_400bps_fast@v3.5.2 - data called with fast models is not suitable for variant calling +dorado dna_r10.4.1_e8.2_400bps_hac@v3.5.2 r1041_e82_400bps_hac_g632 - +dorado dna_r10.4.1_e8.2_400bps_sup@v3.5.2 r1041_e82_400bps_sup_g615 - +dorado dna_r9.4.1_e8_fast@v3.4 - data called with fast models is not suitable for variant calling +dorado dna_r9.4.1_e8_hac@v3.3 r941_prom_sup_g5014 - +dorado dna_r9.4.1_e8_sup@v3.3 r941_prom_sup_g5014 - +guppy dna_r10.4.1_e8.2_400bps_hac_prom r1041_e82_400bps_hac_g632 - +guppy dna_r9.4.1_450bps_hac_prom r941_prom_sup_g5014 - +guppy dna_r10.3_450bps_hac - Clair3 has not been trained on this basecalling configuration +guppy dna_r10.3_450bps_hac_prom - Clair3 has not been trained on this basecalling configuration +guppy dna_r10.4.1_e8.2_260bps_hac - 260bps models are not supported for variant calling with Clair3 +guppy dna_r10.4.1_e8.2_260bps_hac_prom - 260bps models are not supported for variant calling with Clair3 +guppy dna_r10.4.1_e8.2_400bps_hac r1041_e82_400bps_hac_g632 - +guppy dna_r10.4_e8.1_hac - Clair3 has not been trained on this basecalling configuration +guppy dna_r10.4_e8.1_hac_prom - Clair3 has not been trained on this basecalling configuration +guppy dna_r10_450bps_hac - Clair3 has not been trained on this basecalling configuration +guppy dna_r9.4.1_450bps_hac r941_prom_sup_g5014 - +guppy dna_r9.4.1_e8.1_hac - Clair3 has not been trained on this basecalling configuration +guppy dna_r9.4.1_e8.1_hac_prom - Clair3 has not been trained on this basecalling configuration +guppy dna_r9.5_450bps - Clair3 has not been trained on this basecalling configuration +guppy rna_r9.4.1_70bps_hac - RNA data is not suitable for this workflow +guppy rna_r9.4.1_70bps_hac_prom - RNA data is not suitable for this workflow diff --git a/main.nf b/main.nf index a90fe51..f779995 100644 --- a/main.nf +++ b/main.nf @@ -197,6 +197,7 
@@ workflow { else { // map basecalling model to clair3 model lookup_table = Channel.fromPath("${projectDir}/data/clair3_models.tsv", checkIfExists: true) + // TODO: account for basecaller_model_path here; this lookup currently keys only on params.basecaller_cfg. clair3_model = lookup_clair3_model(lookup_table, params.basecaller_cfg) } diff --git a/modules/local/wf-human-snp.nf b/modules/local/wf-human-snp.nf index 941997e..d2c2b46 100644 --- a/modules/local/wf-human-snp.nf +++ b/modules/local/wf-human-snp.nf @@ -571,5 +571,7 @@ process lookup_clair3_model { ''' clair3_model=$(resolve_clair3_model.py lookup_table '!{basecall_model}') cp -r ${CLAIR_MODELS_PATH}/${clair3_model} model + echo "Basecall model: !{basecall_model}" + echo "Clair3 model : ${clair3_model}" ''' } diff --git a/nextflow.config b/nextflow.config index f368086..6e0326f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -18,7 +18,7 @@ params { disable_ping = false threads = 4 - wfversion = "v1.0.1" + wfversion = "v1.1.0" aws_image_prefix = null aws_queue = null @@ -46,7 +46,8 @@ params { // imported from wf-basecalling /// common basecaller_chunk_size = 25 - basecaller_cfg = "dna_r10.4.1_e8.2_400bps_sup@v3.5.2" + basecaller_cfg = "dna_r10.4.1_e8.2_400bps_sup@v4.0.0" + basecaller_args = null basecaller_basemod_threads = 2 cuda_device = "cuda:all" ubam_map_threads = 8 @@ -104,19 +105,22 @@ params { name = "wf-human-variation" template_version = "195cab5" example_cmd = [ - '--mode snp', - '--bam demo_data/chr6_chr20.bam', - '--bed demo_data/chr6_chr20.bed', - '--ref demo_data/chr6_chr20.fasta', - '--model demo_data/ont_r104_e81_sup_g5015', + "--snp", + "--sv", + "--bam demo_data/demo.bam", + "--bed demo_data/demo.bed", + "--ref demo_data/demo.fasta", + "--basecaller_cfg 'dna_r10.4.1_e8.2_400bps_hac@v4.0.0'", + "--sample_name MY_SAMPLE" ] + agent = null // container sha e2l_base_tag = "shac4db03c19b6ff1277a24ec28a19e564d628d478f" - e2l_snp_tag = "shab9dc5d954dc98f49bcedd6ac43c886f548380644" + e2l_snp_tag = "sha800ab96e243576f7f5fb17a7c4ead9e538a48931" e2l_sv_tag = 
"sha4963fc850f9e8807777b5a902473ba3eb4657930" e2l_methyl_tag = "sha44a13bcf48db332b2277bb9f95b56d64e393a1d5" - basecaller_container = "dorado:shaa939a6e58395033a8cc78dc4977a24bf6d9e4129" + basecaller_container = "dorado:sha097d9c8abc39b8266e3ee58f531f5ef8944a02c3" } } @@ -127,7 +131,7 @@ manifest { description = 'Basecalling, SNV calling, SV calling, methylation calling of human samples.' mainScript = 'main.nf' nextflowVersion = '>=21.05.0' - version = '1.0.1' + version = '1.1.0' } epi2melabs { @@ -230,14 +234,17 @@ profiles { timeline { enabled = true file = "${params.out_dir}/execution/timeline.html" + overwrite = true } report { enabled = true file = "${params.out_dir}/execution/report.html" + overwrite = true } trace { enabled = true file = "${params.out_dir}/execution/trace.txt" + overwrite = true } env { diff --git a/nextflow_schema.json b/nextflow_schema.json index cf78b29..7943ec7 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -74,7 +74,7 @@ "type": "string", "description": "Name of the model to use for converting signal and selecting a small variant calling model.", "help_text": "Required for basecalling and small variant calling. The basecaller configuration is used to automatically select the appropriate small variant calling model. Refer to the [model table on the Dorado repository for selecting a simplex basecalling model](https://github.com/nanoporetech/dorado#available-basecalling-models).", - "default": "dna_r10.4.1_e8.2_400bps_sup@v3.5.2" + "default": "dna_r10.4.1_e8.2_400bps_sup@v4.0.0" }, "bed": { "type": "string", @@ -301,6 +301,10 @@ "description": "Override the inferred model with a custom remora model", "help_text": "For typical use, users should set --remora_cfg which will use a named model from inside the container. 
Experimental or custom models will not be available in the container and can be loaded from the host with --remora_model_path.", "hidden": true + }, + "basecaller_args": { + "type": "string", + "description": "Additional command line arguments to pass to the basecaller process." } }, "required": [] @@ -429,7 +433,7 @@ }, "wfversion": { "type": "string", - "default": "v1.0.1", + "default": "v1.1.0", "hidden": true }, "monochrome_logs": { diff --git a/workflows/_basecalling.nf b/workflows/_basecalling.nf index 40da798..5b93796 100644 --- a/workflows/_basecalling.nf +++ b/workflows/_basecalling.nf @@ -18,12 +18,27 @@ process dorado { path("${chunk_idx}.ubam") script: def remora_model = remora_model_override ? "remora_model" : "\${DRD_MODELS_PATH}/${remora_cfg}" - def remora_args = (params.basecaller_basemod_threads > 0 && (params.remora_cfg || remora_model_override)) ? "--remora-models ${remora_model} --remora-threads ${params.basecaller_basemod_threads} --remora-batchsize 1024" : '' + def remora_args = (params.basecaller_basemod_threads > 0 && (params.remora_cfg || remora_model_override)) ? "--modified-bases-models ${remora_model}" : '' def model_arg = basecaller_model_override ? "dorado_model" : "\${DRD_MODELS_PATH}/${basecaller_cfg}" + def basecaller_args = params.basecaller_args ?: '' """ + echo '***' + echo 'Available models:' + list-models | sed 's,^,- ,' | sed "s,\${DRD_MODELS_PATH}/,," + echo '***' + echo 'You selected:' + echo "Basecalling model: ${basecaller_cfg}" + echo "Remora model : ${remora_cfg}" + echo '***' + echo 'A file open error below indicates that you have entered an unknown model name.' + echo 'It is possible the model you selected worked previously but has been updated to a new version.' + echo 'Resubmit this workflow with an appropriate model from the model list above.' + echo '***' + dorado basecaller \ ${model_arg} . \ ${remora_args} \ + ${basecaller_args} \ --device ${params.cuda_device} | samtools view -b -o ${chunk_idx}.ubam - """ }