diff --git a/.editorconfig b/.editorconfig index a30ae1e1..f61a6a78 100644 --- a/.editorconfig +++ b/.editorconfig @@ -23,11 +23,3 @@ indent_size = unset [/assets/email*] indent_size = unset -# To prevent errors for these test diamond databases -[/assets/test*/*.dmnd] -charset = unset -end_of_line = unset -insert_final_newline = unset -trim_trailing_whitespace = unset -indent_style = unset -indent_size = unset diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 58919f2a..89aae5cc 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -38,7 +38,7 @@ body: id: system attributes: label: System information - description: "* Nextflow version _(eg. 23.04.1)_ + description: "* Nextflow version _(eg. 22.10.1)_ * Hardware _(eg. HPC, Desktop, Cloud)_ diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 307a3b42..fede1317 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,7 +1,12 @@ name: nf-core CI # This workflow runs the pipeline with the minimal test dataset to check that it completes without any syntax errors on: - workflow_dispatch: + push: + branches: + - dev + pull_request: + release: + types: [published] env: NXF_ANSI_LOG: false @@ -19,7 +24,7 @@ jobs: strategy: matrix: NXF_VER: - - "23.04.1" + - "22.10.1" - "latest-everything" steps: - name: Check out pipeline code @@ -30,9 +35,19 @@ jobs: with: version: "${{ matrix.NXF_VER }}" + - name: Download the NCBI taxdump database + run: | + mkdir ncbi_taxdump + curl -L https://ftp.ncbi.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz | tar -C ncbi_taxdump -xzf - + + - name: Download the BUSCO lineage database + run: | + mkdir busco_database + curl -L https://tolit.cog.sanger.ac.uk/test-data/resources/busco/blobtoolkit.GCA_922984935.2.2023-08-03.lineages.tar.gz | tar -C busco_database -xzf - + - name: Run pipeline with test data # You can customise CI pipeline run tests as required # For example: adding multiple test runs with different parameters # Remember that you can parallelise this by using strategy.matrix run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --taxdump $PWD/ncbi_taxdump --busco $PWD/busco_database --outdir ./results diff --git a/.github/workflows/fix-linting.yml b/.github/workflows/fix-linting.yml index b7779fad..ff57bf69 100644 --- a/.github/workflows/fix-linting.yml +++ b/.github/workflows/fix-linting.yml @@ -8,21 +8,21 @@ jobs: # Only run if comment is on a PR with the main repo, and if it contains the magic keywords if: > contains(github.event.comment.html_url, '/pull/') && - contains(github.event.comment.body, '@nf-core-bot fix linting') && + contains(github.event.comment.body, '@sanger-tolsoft fix linting') && github.repository == 'sanger-tol/blobtoolkit' runs-on: ubuntu-latest steps: - # Use the @nf-core-bot token to check out so we can push later + # Use the @sanger-tolsoft token to check out so we can push later - uses: actions/checkout@v3 with: - token: ${{ secrets.nf_core_bot_auth_token }} + token: ${{ secrets.sangertolsoft_access_token }} # Action runs on the issue comment, so we don't get the PR by default # Use the gh cli to check out the PR - name: Checkout Pull Request run: gh pr checkout ${{ github.event.issue.number }} env: - GITHUB_TOKEN: ${{ secrets.nf_core_bot_auth_token }} + GITHUB_TOKEN: ${{ secrets.sangertolsoft_access_token }} - uses: actions/setup-node@v3 @@ -46,8 +46,8 @@ jobs: - name: 
Commit & push changes if: steps.prettier_status.outputs.result == 'fail' run: | - git config user.email "core@nf-co.re" - git config user.name "nf-core-bot" + git config user.email "105875386+sanger-tolsoft@users.noreply.github.com" + git config user.name "sanger-tolsoft" git config push.default upstream git add . git status diff --git a/.github/workflows/sangertest.yml b/.github/workflows/sanger_test.yml similarity index 78% rename from .github/workflows/sangertest.yml rename to .github/workflows/sanger_test.yml index 95479500..406a6280 100644 --- a/.github/workflows/sangertest.yml +++ b/.github/workflows/sanger_test.yml @@ -1,4 +1,4 @@ -name: nf-core Sanger LSF tests +name: sanger-tol LSF tests on: workflow_dispatch: @@ -13,12 +13,11 @@ jobs: if: github.event_name == 'workflow_dispatch' - name: Launch workflow via tower - uses: nf-core/tower-action@v2 + uses: seqeralabs/action-tower-launch@v2 with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} - pipeline: ${{ github.repository }} revision: ${{ env.REVISION }} workdir: ${{ secrets.TOWER_WORKDIR_PARENT }}/work/${{ github.repository }}/work-${{ env.REVISION }} parameters: | @@ -26,3 +25,9 @@ jobs: "outdir": "${{ secrets.TOWER_WORKDIR_PARENT }}/results/${{ github.repository }}/results-${{ env.REVISION }}", } profiles: test,sanger,singularity,cleanup + - uses: actions/upload-artifact@v3 + with: + name: Tower debug log file + path: | + tower_action_*.log + tower_action_*.json diff --git a/.github/workflows/sangerfulltest.yml b/.github/workflows/sanger_test_full.yml similarity index 80% rename from .github/workflows/sangerfulltest.yml rename to .github/workflows/sanger_test_full.yml index addef9bc..e3a25f7b 100644 --- a/.github/workflows/sangerfulltest.yml +++ b/.github/workflows/sanger_test_full.yml @@ -1,4 +1,4 @@ -name: nf-core Sanger LSF full size tests +name: sanger-tol LSF full size tests on: workflow_dispatch: @@ -18,12 +18,11 @@ jobs: if: github.event_name == 'workflow_dispatch' - name: Launch workflow via tower - uses: nf-core/tower-action@v2 + uses: seqeralabs/action-tower-launch@v2 with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} - pipeline: ${{ github.repository }} revision: ${{ env.REVISION }} workdir: ${{ secrets.TOWER_WORKDIR_PARENT }}/work/${{ github.repository }}/work-${{ env.REVISION }} parameters: | @@ -31,3 +30,9 @@ jobs: "outdir": "${{ secrets.TOWER_WORKDIR_PARENT }}/results/${{ github.repository }}/results-${{ env.REVISION }}", } profiles: test_full,sanger,singularity,cleanup + - uses: actions/upload-artifact@v3 + with: + name: Tower debug log file + path: | + tower_action_*.log + tower_action_*.json diff --git a/.nf-core.yml b/.nf-core.yml index 5c98553f..6f8fbccc 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -8,6 +8,7 @@ lint: files_unchanged: - LICENSE - .github/ISSUE_TEMPLATE/bug_report.yml + - .github/workflows/linting.yml - assets/sendmail_template.txt - lib/NfcoreTemplate.groovy - .prettierignore diff --git a/README.md b/README.md index 58008e6e..80d707b8 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ [![GitHub Actions Linting Status](https://github.com/sanger-tol/blobtoolkit/workflows/nf-core%20linting/badge.svg)](https://github.com/sanger-tol/blobtoolkit/actions?query=workflow%3A%22nf-core+linting%22) [![Cite with 
Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.7949058-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.7949058)

-[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A523.04.1-23aa62.svg)](https://www.nextflow.io/)
+[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A522.10.1-23aa62.svg)](https://www.nextflow.io/)
[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)
[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)
[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)
@@ -60,7 +60,7 @@ mMelMel1,illumina,GCA_922984935.2.illumina.mMelMel1.cram
mMelMel3,ont,GCA_922984935.2.ont.mMelMel3.cram
```

-Each row represents an aligned file. Rows with the same sample identifier are considered technical replicates. The datatype refers to the sequencing technology used to generate the underlying raw data and follows a controlled vocabulary (ont, hic, pacbio, illumina). The aligned read files can be generated using the [sanger-tol/readmapping](https://github.com/sanger-tol/readmapping) pipeline.
+Each row represents an aligned file. Rows with the same sample identifier are considered technical replicates. The datatype refers to the sequencing technology used to generate the underlying raw data and follows a controlled vocabulary (ont, hic, pacbio, pacbio_clr, illumina). The aligned read files can be generated using the [sanger-tol/readmapping](https://github.com/sanger-tol/readmapping) pipeline.

Now, you can run the pipeline using:
diff --git a/assets/schema_input.json b/assets/schema_input.json
index c315cedb..f08ccb89 100644
--- a/assets/schema_input.json
+++ b/assets/schema_input.json
@@ -16,7 +16,7 @@
"datatype": {
"type": "string",
"pattern": "^\\S+$",
-            "enum": ["hic", "illumina", "ont", "pacbio"],
+            "enum": ["hic", "illumina", "ont", "pacbio", "pacbio_clr"],
-            "errorMessage": "Data type, and must be one of: 'hic' or 'illumina' or 'ont' or 'pacbio'"
+            "errorMessage": "Data type must be one of: 'hic', 'illumina', 'ont', 'pacbio' or 'pacbio_clr'"
},
"datafile": {
diff --git a/assets/test/mCerEla1.1.buscogenes.dmnd b/assets/test/mCerEla1.1.buscogenes.dmnd
deleted file mode 100644
index bccca41d..00000000
Binary files a/assets/test/mCerEla1.1.buscogenes.dmnd and /dev/null differ
diff --git a/assets/test/samplesheet_raw.csv b/assets/test/samplesheet_raw.csv
new file mode 100644
index 00000000..830753a7
--- /dev/null
+++ b/assets/test/samplesheet_raw.csv
@@ -0,0 +1,4 @@
+sample,datatype,datafile
+mMelMel1,illumina,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/genomic_data/mMelMel1/illumina/31231_3#1_subset.cram
+mMelMel2,illumina,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/genomic_data/mMelMel2/illumina/31231_4#1_subset.cram
+mMelMel3,hic,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/genomic_data/mMelMel3/hic-arima2/35528_2#1_subset.cram
diff --git a/assets/test_full/full_samplesheet.csv b/assets/test_full/full_samplesheet.csv
index 88fc7462..6a3ba69d 100644
--- a/assets/test_full/full_samplesheet.csv
+++ b/assets/test_full/full_samplesheet.csv
@@ -1,3 +1,3 @@
sample,datatype,datafile
-gfLaeSulp1,hic,/lustre/scratch123/tol/projects/.sandbox/data/fungi/Laetiporus_sulphureus/analysis/gfLaeSulp1.1/read_mapping/hic/GCA_927399515.1.unmasked.hic.gfLaeSulp1.cram
-gfLaeSulp1,pacbio,/lustre/scratch123/tol/projects/.sandbox/data/fungi/Laetiporus_sulphureus/analysis/gfLaeSulp1.1/read_mapping/pacbio/GCA_927399515.1.unmasked.pacbio.gfLaeSulp1.cram +gfLaeSulp1,hic,/lustre/scratch123/tol/resources/nextflow/test-data/Laetiporus_sulphureus/analysis/gfLaeSulp1.1/read_mapping/hic/GCA_927399515.1.unmasked.hic.gfLaeSulp1.cram +gfLaeSulp1,pacbio,/lustre/scratch123/tol/resources/nextflow/test-data/Laetiporus_sulphureus/analysis/gfLaeSulp1.1/read_mapping/pacbio/GCA_927399515.1.unmasked.pacbio.gfLaeSulp1.cram diff --git a/assets/test_full/gfLaeSulp1.1.buscogenes.dmnd b/assets/test_full/gfLaeSulp1.1.buscogenes.dmnd deleted file mode 100644 index a0d0e1d2..00000000 Binary files a/assets/test_full/gfLaeSulp1.1.buscogenes.dmnd and /dev/null differ diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 72e3f485..f5bf5c5b 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -24,12 +24,16 @@ class RowChecker: """ - VALID_FORMATS = (".cram",) + VALID_FORMATS = ( + ".cram", + ".bam", + ) VALID_DATATYPES = ( "hic", "illumina", "pacbio", + "pacbio_clr", "ont", ) diff --git a/conf/modules.config b/conf/modules.config index ebf62694..d29e500f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -14,22 +14,55 @@ process { withName: "SAMPLESHEET_CHECK" { publishDir = [ - path: { "${params.outdir}/blobtoolkit_info" }, + path: { "${params.outdir}/pipeline_info/blobtoolkit" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals("versions.yml") ? null : filename } ] } - withName: "GOAT_TAXONSEARCH" { - ext.args = "-l -b" + withName: "WINDOWMASKER_MKCOUNTS" { + ext.args = "-infmt fasta -sformat obinary" + } + + withName: "WINDOWMASKER_USTAT" { + ext.args = "-infmt fasta -dust T -outfmt fasta" + } + + withName: "MINIMAP2_HIC" { + ext.args = "-ax sr" + } + + withName: "MINIMAP2_ILMN" { + ext.args = "-ax sr" + } + + withName: "MINIMAP2_CCS" { + ext.args = "-ax map-hifi --cs=short" + } + + withName: "MINIMAP2_CLR" { + ext.args = "-ax map-pb" + } + + withName: "MINIMAP2_ONT" { + ext.args = "-ax map-ont" } withName: "SAMTOOLS_VIEW" { ext.args = "--output-fmt bam --write-index" } + withName: "SAMTOOLS_INDEX" { + ext.args = "-c" + } + + withName: "GOAT_TAXONSEARCH" { + ext.args = "--lineage --busco" + } + withName: "BUSCO" { scratch = true + // Overridden in the test profile, see at the end of this file ext.args = "--mode genome --force" } @@ -68,7 +101,7 @@ process { withName: "CUSTOM_DUMPSOFTWAREVERSIONS" { publishDir = [ - path: { "${params.outdir}/blobtoolkit_info" }, + path: { "${params.outdir}/pipeline_info/blobtoolkit" }, mode: params.publish_dir_mode, pattern: "*_versions.yml" ] @@ -84,3 +117,22 @@ process { } } + + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Additional configuration to speed processes up during testing. 
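+
+    The profile-scoped `process` block below takes precedence over the
+    top-level `process` scope above, so BUSCO's `ext.args` is only swapped
+    for the faster, less sensitive MetaEuk settings (`-s=2`) when the
+    `test` profile is active.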
+ +---------------------------------------------------------------------------------------- +*/ + +profiles { + test { + process { + withName: BUSCO { + // Note: BUSCO *must* see the double-quotes around the parameters + ext.args = '--mode genome --force --metaeuk_parameters \'"-s=2"\' --metaeuk_rerun_parameters \'"-s=2"\'' + } + } + } +} diff --git a/conf/test.config b/conf/test.config index 165bfff6..1ef3c7a2 100644 --- a/conf/test.config +++ b/conf/test.config @@ -12,7 +12,7 @@ params { config_profile_name = 'Test profile' - config_profile_description = 'Minimal test dataset to check pipeline function' + config_profile_description = 'Minimal aligned test dataset to check pipeline function' // Limit resources so that this can run on GitHub Actions max_cpus = 2 @@ -22,15 +22,15 @@ params { // Input test data // Specify the paths to your test data // Give any required params for the test so that command line flags are not needed - input = "${projectDir}/assets/test/samplesheet.csv" + input = "${projectDir}/assets/test/samplesheet_s3.csv" // Fasta references - fasta = "/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.fasta.gz" + fasta = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.fasta.gz" accession = "GCA_922984935.2" taxon = "Meles meles" // Databases taxdump = "/lustre/scratch123/tol/teams/grit/geval_pipeline/btk_databases/taxdump" - busco = "/lustre/scratch123/tol/resources/nextflow/busco_2021_06_reduced/" - uniprot = "${projectDir}/assets/test/mCerEla1.1.buscogenes.dmnd" + busco = "/lustre/scratch123/tol/resources/nextflow/busco/blobtoolkit.GCA_922984935.2.2023-08-03" + uniprot = "https://tolit.cog.sanger.ac.uk/test-data/resources/diamond/mCerEla1.1.buscogenes.dmnd" } diff --git a/conf/test_full.config b/conf/test_full.config index ee22dba2..1db6fb01 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -22,12 +22,12 @@ params { input = "${projectDir}/assets/test_full/full_samplesheet.csv" // Fasta references - fasta = "/lustre/scratch124/tol/projects/darwin/data/fungi/Laetiporus_sulphureus/assembly/release/gfLaeSulp1.1/insdc/GCA_927399515.1.fasta.gz" + fasta = "https://tolit.cog.sanger.ac.uk/test-data/Laetiporus_sulphureus/assembly/release/gfLaeSulp1.1/insdc/GCA_927399515.1.fasta.gz" accession = "GCA_927399515.1" taxon = "Laetiporus sulphureus" // Databases taxdump = "/lustre/scratch123/tol/teams/grit/geval_pipeline/btk_databases/taxdump" busco = "/lustre/scratch123/tol/resources/busco/v5/" - uniprot = "${projectDir}/assets/test_full/gfLaeSulp1.1.buscogenes.dmnd" + uniprot = "https://tolit.cog.sanger.ac.uk/test-data/resources/diamond/gfLaeSulp1.1.buscogenes.dmnd" } diff --git a/conf/test_raw.config b/conf/test_raw.config new file mode 100644 index 00000000..868cee00 --- /dev/null +++ b/conf/test_raw.config @@ -0,0 +1,37 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
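+
+    The samplesheet lists unaligned CRAM files; `align = true` sends them
+    through the new minimap2 alignment subworkflow (see the usage line
+    below) before coverage statistics and BlobDir generation.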
+
+    Use as follows:
+        nextflow run sanger-tol/blobtoolkit -profile test_raw,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+params {
+    config_profile_name        = 'Raw test profile'
+    config_profile_description = 'Minimal raw test dataset to check pipeline function'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 2
+    max_memory = '6.GB'
+    max_time   = '6.h'
+
+    // Input test data
+    // Specify the paths to your test data
+    // Give any required params for the test so that command line flags are not needed
+    input = "${projectDir}/assets/test/samplesheet_raw.csv"
+    align = true
+
+    // Fasta references
+    fasta     = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.fasta.gz"
+    accession = "GCA_922984935.2"
+    taxon     = "Meles meles"
+
+    // Databases
+    taxdump = "/lustre/scratch123/tol/teams/grit/geval_pipeline/btk_databases/taxdump"
+    busco   = "/lustre/scratch123/tol/resources/nextflow/busco/blobtoolkit.GCA_922984935.2.2023-08-03"
+    uniprot = "https://tolit.cog.sanger.ac.uk/test-data/resources/diamond/mCerEla1.1.buscogenes.dmnd"
+}
diff --git a/docs/output.md b/docs/output.md
index 437c6df7..33040ea0 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -52,7 +52,7 @@ Results generated by MultiQC collate pipeline QC from supported tools.
Output files -- `blobtoolkit_info/` +- `pipeline_info/blobtoolkit/` - Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`. - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameter's are used when running the pipeline. - Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`. diff --git a/docs/usage.md b/docs/usage.md index 71b07d05..d1e3f34a 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -43,7 +43,7 @@ sample3,ont,ont.cram | Column | Description | | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (\_). | -| `datatype` | Type of sequencing data. Must be one of `hic`, `illumina`, `pacbio`, or `ont`. | +| `datatype` | Type of sequencing data. Must be one of `hic`, `illumina`, `pacbio`, `pacbio_clr` or `ont`. | | `datafile` | Full path to read data file. | An [example samplesheet](https://raw.githubusercontent.com/sanger-tol/blobtoolkit/main/assets/test/samplesheet.csv) has been provided with the pipeline. diff --git a/lib/NfcoreTemplate.groovy b/lib/NfcoreTemplate.groovy index 2777ae2b..e463d9ed 100755 --- a/lib/NfcoreTemplate.groovy +++ b/lib/NfcoreTemplate.groovy @@ -154,7 +154,7 @@ class NfcoreTemplate { } // Write summary e-mail HTML to a file - def output_d = new File("${params.outdir}/blobtoolkit_info/") + def output_d = new File("${params.outdir}/pipeline_info/blobtoolkit/") if (!output_d.exists()) { output_d.mkdirs() } diff --git a/modules.json b/modules.json index cd615550..ee078010 100644 --- a/modules.json +++ b/modules.json @@ -7,13 +7,13 @@ "nf-core": { "busco": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "6d6552cb582f56b6101c452e16ee7c23073f91de", "installed_by": ["modules"], "patch": "modules/nf-core/busco/busco.diff" }, "custom/dumpsoftwareversions": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "05c280924b6c768d484c7c443dad5e605c4ff4b4", "installed_by": ["modules"] }, "diamond/blastp": { @@ -33,23 +33,48 @@ }, "gunzip": { "branch": "master", - "git_sha": "5c460c5a4736974abde2843294f35307ee2b0e5e", + "git_sha": "e06548bfa36ee31869b81041879dd6b3a83b1d57", + "installed_by": ["modules"] + }, + "minimap2/align": { + "branch": "master", + "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", "installed_by": ["modules"] }, "mosdepth": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "ebb27711cd5f4de921244bfa81c676504072d31c", "installed_by": ["modules"] }, "multiqc": { + "branch": "master", + "git_sha": "a6e11ac655e744f7ebc724be669dd568ffdc0e80", + "installed_by": ["modules"] + }, + "samtools/fasta": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, - "samtools/view": { + "samtools/index": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] + }, + "samtools/view": { + "branch": "master", + "git_sha": 
"3ffae3598260a99e8db3207dead9f73f87f90d1f", + "installed_by": ["modules"] + }, + "windowmasker/mkcounts": { + "branch": "master", + "git_sha": "30c3ed32e8bd5ddaf349ba2f4f99d38182fdc08c", + "installed_by": ["modules"] + }, + "windowmasker/ustat": { + "branch": "master", + "git_sha": "726ee59cd9360a965d96ea9ea8770f16b8ddd6cc", + "installed_by": ["modules"] } } }, diff --git a/modules/local/blobtoolkit/blobdir.nf b/modules/local/blobtoolkit/blobdir.nf index 3f064bce..baf46df1 100644 --- a/modules/local/blobtoolkit/blobdir.nf +++ b/modules/local/blobtoolkit/blobdir.nf @@ -5,7 +5,7 @@ process BLOBTOOLKIT_BLOBDIR { if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { exit 1, "BLOBTOOLKIT_BLOBDIR module does not support Conda. Please use Docker / Singularity / Podman instead." } - container "genomehubs/blobtoolkit:4.1.5" + container "docker.io/genomehubs/blobtoolkit:4.1.5" input: tuple val(meta), path(window, stageAs: 'windowstats/*') diff --git a/modules/local/blobtoolkit/config.nf b/modules/local/blobtoolkit/config.nf index ce1e3adc..ed7bba45 100644 --- a/modules/local/blobtoolkit/config.nf +++ b/modules/local/blobtoolkit/config.nf @@ -5,7 +5,7 @@ process BLOBTOOLKIT_CONFIG { if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { exit 1, "GENERATE_CONFIG module does not support Conda. Please use Docker / Singularity / Podman instead." } - container "genomehubs/blobtoolkit:4.1.5" + container "docker.io/genomehubs/blobtoolkit:4.1.5" input: tuple val(meta), path(fasta) diff --git a/modules/local/blobtoolkit/countbuscos.nf b/modules/local/blobtoolkit/countbuscos.nf index 1379cbac..b118f702 100644 --- a/modules/local/blobtoolkit/countbuscos.nf +++ b/modules/local/blobtoolkit/countbuscos.nf @@ -5,7 +5,7 @@ process BLOBTOOLKIT_COUNTBUSCOS { if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { exit 1, "BLOBTOOLKIT_COUNTBUSCOS module does not support Conda. Please use Docker / Singularity / Podman instead." } - container "genomehubs/blobtoolkit:4.1.5" + container "docker.io/genomehubs/blobtoolkit:4.1.5" input: tuple val(meta), path(table, stageAs: 'dir??/*') diff --git a/modules/local/blobtoolkit/extractbuscos.nf b/modules/local/blobtoolkit/extractbuscos.nf index 23c73247..cd9441b6 100644 --- a/modules/local/blobtoolkit/extractbuscos.nf +++ b/modules/local/blobtoolkit/extractbuscos.nf @@ -5,7 +5,7 @@ process BLOBTOOLKIT_EXTRACTBUSCOS { if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { exit 1, "BLOBTOOLKIT_EXTRACTBUSCOS module does not support Conda. Please use Docker / Singularity / Podman instead." } - container "genomehubs/blobtoolkit:4.1.5" + container "docker.io/genomehubs/blobtoolkit:4.1.5" input: tuple val(meta), path(fasta) diff --git a/modules/local/blobtoolkit/images.nf b/modules/local/blobtoolkit/images.nf index 11bdd485..8cfbae22 100644 --- a/modules/local/blobtoolkit/images.nf +++ b/modules/local/blobtoolkit/images.nf @@ -5,7 +5,7 @@ process BLOBTOOLKIT_IMAGES { if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { exit 1, "BLOBTOOLKIT_IMAGES module does not support Conda. Please use Docker / Singularity / Podman instead." 
} - container "genomehubs/blobtk:0.3.3" + container "docker.io/genomehubs/blobtk:0.3.3" input: tuple val(meta), path(blobdir) diff --git a/modules/local/blobtoolkit/metadata.nf b/modules/local/blobtoolkit/metadata.nf index 32339c48..30f47538 100644 --- a/modules/local/blobtoolkit/metadata.nf +++ b/modules/local/blobtoolkit/metadata.nf @@ -5,7 +5,7 @@ process BLOBTOOLKIT_METADATA { if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { exit 1, "BLOBTOOLKIT_METADATA module does not support Conda. Please use Docker / Singularity / Podman instead." } - container "genomehubs/blobtoolkit:4.1.5" + container "docker.io/genomehubs/blobtoolkit:4.1.5" input: tuple val(meta), path(yaml) diff --git a/modules/local/blobtoolkit/summary.nf b/modules/local/blobtoolkit/summary.nf index d1059d8a..36e4ad25 100644 --- a/modules/local/blobtoolkit/summary.nf +++ b/modules/local/blobtoolkit/summary.nf @@ -5,7 +5,7 @@ process BLOBTOOLKIT_SUMMARY { if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { exit 1, "BLOBTOOLKIT_SUMMARY module does not support Conda. Please use Docker / Singularity / Podman instead." } - container "genomehubs/blobtoolkit:4.1.5" + container "docker.io/genomehubs/blobtoolkit:4.1.5" input: tuple val(meta), path(blobdir) diff --git a/modules/local/blobtoolkit/windowstats.nf b/modules/local/blobtoolkit/windowstats.nf index 0517535f..06dd3c42 100644 --- a/modules/local/blobtoolkit/windowstats.nf +++ b/modules/local/blobtoolkit/windowstats.nf @@ -5,7 +5,7 @@ process BLOBTOOLKIT_WINDOWSTATS { if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { exit 1, "GET_WINDOW_STATS module does not support Conda. Please use Docker / Singularity / Podman instead." } - container "genomehubs/blobtoolkit:4.1.5" + container "docker.io/genomehubs/blobtoolkit:4.1.5" input: tuple val(meta), path(tsv) diff --git a/modules/local/create_bed.nf b/modules/local/create_bed.nf index 034ab1e6..3158a732 100644 --- a/modules/local/create_bed.nf +++ b/modules/local/create_bed.nf @@ -5,7 +5,7 @@ process CREATE_BED { conda "conda-forge::gawk=5.1.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/gawk:5.1.0' : - 'quay.io/biocontainers/gawk:5.1.0' }" + 'biocontainers/gawk:5.1.0' }" input: tuple val(meta), path(tsv) //path to tsv output from fasta windows diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf index 5798da0e..760f3e44 100644 --- a/modules/local/samplesheet_check.nf +++ b/modules/local/samplesheet_check.nf @@ -5,7 +5,7 @@ process SAMPLESHEET_CHECK { conda "conda-forge::python=3.9.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/python:3.9--1' : - 'quay.io/biocontainers/python:3.9--1' }" + 'biocontainers/python:3.9--1' }" input: path samplesheet diff --git a/modules/local/windowstats_input.nf b/modules/local/windowstats_input.nf index f366025d..1c660442 100644 --- a/modules/local/windowstats_input.nf +++ b/modules/local/windowstats_input.nf @@ -5,7 +5,7 @@ process WINDOWSTATS_INPUT { conda "conda-forge::pandas=1.5.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/pandas:1.5.2': - 'quay.io/biocontainers/pandas:1.5.2' }" + 'biocontainers/pandas:1.5.2' }" input: tuple val(meta), path(freq) diff --git a/modules/nf-core/busco/busco.diff b/modules/nf-core/busco/busco.diff index 2aa7184a..dfa3fa05 100644 --- a/modules/nf-core/busco/busco.diff +++ b/modules/nf-core/busco/busco.diff @@ -8,24 +8,5 @@ Changes in module 'nf-core/busco' label 'process_medium' conda "bioconda::busco=5.4.3" -@@ -14,11 +14,13 @@ - path config_file // Optional: busco configuration file - - output: -- tuple val(meta), path("*-busco.batch_summary.txt"), emit: batch_summary -- tuple val(meta), path("short_summary.*.txt") , emit: short_summaries_txt, optional: true -- tuple val(meta), path("short_summary.*.json") , emit: short_summaries_json, optional: true -- tuple val(meta), path("*-busco") , emit: busco_dir -- path "versions.yml" , emit: versions -+ tuple val(meta), path("*-busco.batch_summary.txt") , emit: batch_summary -+ tuple val(meta), path("short_summary.*.txt") , emit: short_summaries_txt, optional: true -+ tuple val(meta), path("short_summary.*.json") , emit: short_summaries_json, optional: true -+ tuple val(meta), path("*-busco") , emit: busco_dir -+ tuple val(meta), path("*-busco/*/run_*/full_table.tsv") , emit: full_table, optional: true -+ tuple val(meta), path("*-busco/*/run_*/busco_sequences"), emit: seq_dir, optional: true -+ path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when ************************************************************ diff --git a/modules/nf-core/busco/main.nf b/modules/nf-core/busco/main.nf index 254ee9fd..4fc88bd6 100644 --- a/modules/nf-core/busco/main.nf +++ b/modules/nf-core/busco/main.nf @@ -14,13 +14,16 @@ process BUSCO { path config_file // Optional: busco configuration file output: - tuple val(meta), path("*-busco.batch_summary.txt") , emit: batch_summary - tuple val(meta), path("short_summary.*.txt") , emit: short_summaries_txt, optional: true - tuple val(meta), path("short_summary.*.json") , emit: short_summaries_json, optional: true - tuple val(meta), path("*-busco") , emit: busco_dir - tuple val(meta), path("*-busco/*/run_*/full_table.tsv") , emit: full_table, optional: true - tuple val(meta), path("*-busco/*/run_*/busco_sequences"), emit: seq_dir, optional: true - path "versions.yml" , emit: versions + tuple val(meta), path("*-busco.batch_summary.txt") , emit: batch_summary + tuple val(meta), path("short_summary.*.txt") , emit: short_summaries_txt, optional: true + tuple val(meta), path("short_summary.*.json") , emit: short_summaries_json, optional: true + tuple val(meta), path("*-busco/*/run_*/full_table.tsv") , emit: full_table, optional: true + tuple val(meta), path("*-busco/*/run_*/missing_busco_list.tsv") , emit: missing_busco_list, optional: true + tuple val(meta), path("*-busco/*/run_*/single_copy_proteins.faa") , emit: single_copy_proteins, optional: true + tuple val(meta), path("*-busco/*/run_*/busco_sequences") , emit: seq_dir + tuple val(meta), path("*-busco/*/translated_proteins") , emit: translated_dir, optional: true + tuple val(meta), path("*-busco") , emit: busco_dir + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when diff --git a/modules/nf-core/busco/meta.yml b/modules/nf-core/busco/meta.yml index ef8c5245..77d15fbd 100644 --- a/modules/nf-core/busco/meta.yml +++ b/modules/nf-core/busco/meta.yml @@ -25,7 +25,7 @@ input: description: Nucleic or amino acid sequence file in FASTA format. 
pattern: "*.{fasta,fna,fa,fasta.gz,fna.gz,fa.gz}" - lineage: - type: value + type: string description: The BUSCO lineage to use, or "auto" to automatically select lineage - busco_lineages_path: type: directory @@ -56,6 +56,26 @@ output: type: directory description: BUSCO lineage specific output pattern: "*-busco" + - full_table: + type: file + description: Full BUSCO results table + pattern: "full_table.tsv" + - missing_busco_list: + type: file + description: List of missing BUSCOs + pattern: "missing_busco_list.tsv" + - single_copy_proteins: + type: file + description: Fasta file of single copy proteins (transcriptome mode) + pattern: "single_copy_proteins.faa" + - seq_dir: + type: directory + description: BUSCO sequence directory + pattern: "busco_sequences" + - translated_proteins: + type: directory + description: Six frame translations of each transcript made by the transcriptome mode + pattern: "translated_proteins" - versions: type: file description: File containing software versions diff --git a/modules/nf-core/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf index ebc87273..c9d014b1 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/main.nf +++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf @@ -2,10 +2,10 @@ process CUSTOM_DUMPSOFTWAREVERSIONS { label 'process_single' // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container - conda "bioconda::multiqc=1.14" + conda "bioconda::multiqc=1.15" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.14--pyhdfd78af_0' : - 'biocontainers/multiqc:1.14--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.15--pyhdfd78af_0' : + 'biocontainers/multiqc:1.15--pyhdfd78af_0' }" input: path versions diff --git a/modules/nf-core/gunzip/main.nf b/modules/nf-core/gunzip/main.nf index e7189d2f..73bf08cd 100644 --- a/modules/nf-core/gunzip/main.nf +++ b/modules/nf-core/gunzip/main.nf @@ -21,10 +21,14 @@ process GUNZIP { def args = task.ext.args ?: '' gunzip = archive.toString() - '.gz' """ - gunzip \\ - -f \\ + # Not calling gunzip itself because it creates files + # with the original group ownership rather than the + # default one for that user / the work directory + gzip \\ + -cd \\ $args \\ - $archive + $archive \\ + > $gunzip cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/minimap2/align/main.nf b/modules/nf-core/minimap2/align/main.nf new file mode 100644 index 00000000..4da47c18 --- /dev/null +++ b/modules/nf-core/minimap2/align/main.nf @@ -0,0 +1,48 @@ +process MINIMAP2_ALIGN { + tag "$meta.id" + label 'process_medium' + + // Note: the versions here need to match the versions used in the mulled container below and minimap2/index + conda "bioconda::minimap2=2.24 bioconda::samtools=1.14" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0' : + 'biocontainers/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0' }" + + input: + tuple val(meta), path(reads) + path reference + val bam_format + val cigar_paf_format + val cigar_bam + + output: + tuple val(meta), path("*.paf"), optional: true, emit: paf + tuple val(meta), path("*.bam"), optional: true, emit: bam + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def bam_output = bam_format ? "-a | samtools sort | samtools view -@ ${task.cpus} -b -h -o ${prefix}.bam" : "-o ${prefix}.paf" + def cigar_paf = cigar_paf_format && !bam_format ? "-c" : '' + def set_cigar_bam = cigar_bam && bam_format ? "-L" : '' + """ + minimap2 \\ + $args \\ + -t $task.cpus \\ + "${reference ?: reads}" \\ + "$reads" \\ + $cigar_paf \\ + $set_cigar_bam \\ + $bam_output + + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + minimap2: \$(minimap2 --version 2>&1) + END_VERSIONS + """ +} diff --git a/modules/nf-core/minimap2/align/meta.yml b/modules/nf-core/minimap2/align/meta.yml new file mode 100644 index 00000000..991b39a0 --- /dev/null +++ b/modules/nf-core/minimap2/align/meta.yml @@ -0,0 +1,65 @@ +name: minimap2_align +description: A versatile pairwise aligner for genomic and spliced nucleotide sequences +keywords: + - align + - fasta + - fastq + - genome + - paf + - reference +tools: + - minimap2: + description: | + A versatile pairwise aligner for genomic and spliced nucleotide sequences. + homepage: https://github.com/lh3/minimap2 + documentation: https://github.com/lh3/minimap2#uguide + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FASTA or FASTQ files of size 1 and 2 for single-end + and paired-end data, respectively. + - reference: + type: file + description: | + Reference database in FASTA format. + - bam_format: + type: boolean + description: Specify that output should be in BAM format + - cigar_paf_format: + type: boolean + description: Specify that output CIGAR should be in PAF format + - cigar_bam: + type: boolean + description: | + Write CIGAR with >65535 ops at the CG tag. This is recommended when + doing XYZ (https://github.com/lh3/minimap2#working-with-65535-cigar-operations) +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - paf: + type: file + description: Alignment in PAF format + pattern: "*.paf" + - bam: + type: file + description: Alignment in BAM format + pattern: "*.bam" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@heuermh" + - "@sofstam" + - "@sateeshperi" + - "@jfy133" diff --git a/modules/nf-core/mosdepth/main.nf b/modules/nf-core/mosdepth/main.nf index c17e4e65..74db3a27 100644 --- a/modules/nf-core/mosdepth/main.nf +++ b/modules/nf-core/mosdepth/main.nf @@ -35,10 +35,10 @@ process MOSDEPTH { def reference = fasta ? "--fasta ${fasta}" : "" def interval = bed ? "--by ${bed}" : "" if (bed && args.contains("--by")) { - exit 1, "'--by' can only be specified once when running mosdepth! 
Either remove input BED file definition or remove '--by' from 'ext.args' definition" + error "'--by' can only be specified once when running mosdepth! Either remove input BED file definition or remove '--by' from 'ext.args' definition" } if (!bed && args.contains("--thresholds")) { - exit 1, "'--thresholds' can only be specified in conjunction with '--by'" + error "'--thresholds' can only be specified in conjunction with '--by'" } """ diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf index 1fc387be..65d7dd0d 100644 --- a/modules/nf-core/multiqc/main.nf +++ b/modules/nf-core/multiqc/main.nf @@ -1,10 +1,10 @@ process MULTIQC { label 'process_single' - conda "bioconda::multiqc=1.14" + conda "bioconda::multiqc=1.15" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.14--pyhdfd78af_0' : - 'biocontainers/multiqc:1.14--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.15--pyhdfd78af_0' : + 'biocontainers/multiqc:1.15--pyhdfd78af_0' }" input: path multiqc_files, stageAs: "?/*" diff --git a/modules/nf-core/samtools/fasta/main.nf b/modules/nf-core/samtools/fasta/main.nf new file mode 100644 index 00000000..31459656 --- /dev/null +++ b/modules/nf-core/samtools/fasta/main.nf @@ -0,0 +1,44 @@ +process SAMTOOLS_FASTA { + tag "$meta.id" + label 'process_low' + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input) + val(interleave) + + output: + tuple val(meta), path("*_{1,2}.fasta.gz") , optional:true, emit: fasta + tuple val(meta), path("*_interleaved.fasta.gz"), optional:true, emit: interleaved + tuple val(meta), path("*_singleton.fasta.gz") , optional:true, emit: singleton + tuple val(meta), path("*_other.fasta.gz") , optional:true, emit: other + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def output = ( interleave && ! meta.single_end ) ? "> ${prefix}_interleaved.fasta.gz" : + meta.single_end ? 
"-1 ${prefix}_1.fasta.gz -s ${prefix}_singleton.fasta.gz" : + "-1 ${prefix}_1.fasta.gz -2 ${prefix}_2.fasta.gz -s ${prefix}_singleton.fasta.gz" + """ + samtools \\ + fasta \\ + $args \\ + --threads ${task.cpus-1} \\ + -0 ${prefix}_other.fasta.gz \\ + $input \\ + $output + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' )) + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/fasta/meta.yml b/modules/nf-core/samtools/fasta/meta.yml new file mode 100644 index 00000000..8e459860 --- /dev/null +++ b/modules/nf-core/samtools/fasta/meta.yml @@ -0,0 +1,61 @@ +name: "samtools_fasta" +description: Converts a SAM/BAM/CRAM file to FASTA +keywords: + - bam + - sam + - cram + - fasta +tools: + - "samtools": + description: "Tools for dealing with SAM, BAM and CRAM files" + homepage: "http://www.htslib.org" + documentation: "https://www.htslib.org/doc/samtools-fasta.html" + tool_dev_url: "https://github.com/samtools/samtools" + doi: "10.1093/bioinformatics/btp352" + licence: "['MIT']" + +input: + # Only when we have meta + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - interleave: + type: boolean + description: Set true for interleaved fasta files + +output: + #Only when we have meta + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - fasta: + type: file + description: Compressed FASTA file(s) with reads with either the READ1 or READ2 flag set in separate files. + pattern: "*_{1,2}.fasta.gz" + - interleaved: + type: file + description: Compressed FASTA file with reads with either the READ1 or READ2 flag set in a combined file. Needs collated input file. + pattern: "*_interleaved.fasta.gz" + - singleton: + type: file + description: Compressed FASTA file with singleton reads + pattern: "*_singleton.fasta.gz" + - other: + type: file + description: Compressed FASTA file with reads with either both READ1 and READ2 flags set or unset + pattern: "*_other.fasta.gz" + +authors: + - "@priyanka-surana" diff --git a/modules/nf-core/samtools/index/main.nf b/modules/nf-core/samtools/index/main.nf new file mode 100644 index 00000000..0b20aa4b --- /dev/null +++ b/modules/nf-core/samtools/index/main.nf @@ -0,0 +1,48 @@ +process SAMTOOLS_INDEX { + tag "$meta.id" + label 'process_low' + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input) + + output: + tuple val(meta), path("*.bai") , optional:true, emit: bai + tuple val(meta), path("*.csi") , optional:true, emit: csi + tuple val(meta), path("*.crai"), optional:true, emit: crai + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + samtools \\ + index \\ + -@ ${task.cpus-1} \\ + $args \\ + $input + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + """ + touch ${input}.bai + touch ${input}.crai + touch ${input}.csi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/index/meta.yml b/modules/nf-core/samtools/index/meta.yml new file mode 100644 index 00000000..8bd2fa6f --- /dev/null +++ b/modules/nf-core/samtools/index/meta.yml @@ -0,0 +1,53 @@ +name: samtools_index +description: Index SAM/BAM/CRAM file +keywords: + - index + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bai: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + - crai: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + - csi: + type: file + description: CSI index file + pattern: "*.{csi}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@ewels" + - "@maxulysse" diff --git a/modules/nf-core/samtools/view/main.nf b/modules/nf-core/samtools/view/main.nf index b87369e5..cb91facf 100644 --- a/modules/nf-core/samtools/view/main.nf +++ b/modules/nf-core/samtools/view/main.nf @@ -9,7 +9,7 @@ process SAMTOOLS_VIEW { input: tuple val(meta), path(input), path(index) - path fasta + tuple val(meta2), path(fasta) path qname output: diff --git a/modules/nf-core/samtools/view/meta.yml b/modules/nf-core/samtools/view/meta.yml index 76916033..3b05450b 100644 --- a/modules/nf-core/samtools/view/meta.yml +++ b/modules/nf-core/samtools/view/meta.yml @@ -26,12 +26,17 @@ input: description: BAM/CRAM/SAM file pattern: "*.{bam,cram,sam}" - index: - type: optional file - description: BAM.BAI/BAM.CSI/CRAM.CRAI file + type: file + description: BAM.BAI/BAM.CSI/CRAM.CRAI file (optional) pattern: "*.{.bai,.csi,.crai}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. 
[ id:'test' ] - fasta: - type: optional file - description: Reference file the CRAM was created with + type: file + description: Reference file the CRAM was created with (optional) pattern: "*.{fasta,fa}" - qname: type: file diff --git a/modules/nf-core/windowmasker/mkcounts/main.nf b/modules/nf-core/windowmasker/mkcounts/main.nf new file mode 100644 index 00000000..bfa66f35 --- /dev/null +++ b/modules/nf-core/windowmasker/mkcounts/main.nf @@ -0,0 +1,55 @@ +process WINDOWMASKER_MKCOUNTS { + tag "$meta.id" + label 'process_low' + + conda "bioconda::blast=2.14.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/blast:2.14.0--h7d5a4b4_1': + 'biocontainers/blast:2.14.0--h7d5a4b4_1' }" + + input: + tuple val(meta), path(ref) + + output: + tuple val(meta), path("*.txt") , emit: counts + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: "" + def prefix = task.ext.prefix ?: "${meta.id}" + + def memory = 3072 + if (!task.memory) { + log.info '[WINDOWMASKER: MK_COUNTS] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + memory = (task.memory.toMega()).intValue() + } + + """ + windowmasker -mk_counts \\ + $args \\ + -mem ${memory} \\ + -in ${ref} \\ + -out ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + windowmasker: \$(windowmasker -version-full | head -n 1 | sed 's/^.*windowmasker: //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + touch ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + windowmasker: \$(windowmasker -version-full | head -n 1 | sed 's/^.*windowmasker: //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/windowmasker/mkcounts/meta.yml b/modules/nf-core/windowmasker/mkcounts/meta.yml new file mode 100644 index 00000000..788dc96c --- /dev/null +++ b/modules/nf-core/windowmasker/mkcounts/meta.yml @@ -0,0 +1,40 @@ +name: windowmasker_mkcounts +description: A program to generate frequency counts of repetitive units. +keywords: + - fasta + - interval + - windowmasker +tools: + - windowmasker: + description: | + A program to mask highly repetitive and low complexity DNA sequences within a genome. + homepage: https://github.com/ncbi/ncbi-cxx-toolkit-public + documentation: https://ncbi.github.io/cxx-toolkit/ + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - ref: + type: file + description: An input nucleotide fasta file. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - intervals: + type: file + description: | + An output file containing genomic locations of low + complexity and highly repetitive regions + pattern: "${prefix}.txt" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@DLBPointon" diff --git a/modules/nf-core/windowmasker/ustat/main.nf b/modules/nf-core/windowmasker/ustat/main.nf new file mode 100644 index 00000000..72a19dbf --- /dev/null +++ b/modules/nf-core/windowmasker/ustat/main.nf @@ -0,0 +1,69 @@ +process WINDOWMASKER_USTAT { + tag "$meta.id" + label 'process_low' + + conda "bioconda::blast=2.14.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/blast:2.14.0--h7d5a4b4_1': + 'biocontainers/blast:2.14.0--h7d5a4b4_1' }" + + input: + tuple val(meta) , path(counts) + tuple val(meta2), path(ref) + + output: + tuple val(meta), path("${output}") , emit: intervals + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: "" + def prefix = task.ext.prefix ?: "${meta.id}" + def outfmt = args.contains('-outfmt fasta') ? 'fasta' : + args.contains('-outfmt maskinfo_asn1_bin') ? 'maskinfo_asn1_bin' : + args.contains('-outfmt maskinfo_asn1_text') ? 'maskinfo_asn1_text' : + args.contains('-outfmt maskinfo_xml') ? 'maskinfo_xml' : + args.contains('-outfmt seqloc_asn1_bin') ? 'seqloc_asn1_bin' : + args.contains('-outfmt seqloc_asn1_text') ? 'seqloc_asn1_text' : + args.contains('-outfmt seqloc_xml') ? 'seqloc_xml' : + 'interval' + + output = "${prefix}.${outfmt}" + + """ + windowmasker -ustat \\ + ${counts} \\ + $args \\ + -in ${ref} \\ + -out ${output} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + windowmasker: \$(windowmasker -version-full | head -n 1 | sed 's/^.*windowmasker: //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: "" + def prefix = task.ext.prefix ?: "${meta.id}" + def outfmt = args.contains('-outfmt fasta') ? 'fasta' : + args.contains('-outfmt maskinfo_asn1_bin') ? 'maskinfo_asn1_bin' : + args.contains('-outfmt maskinfo_asn1_text') ? 'maskinfo_asn1_text' : + args.contains('-outfmt maskinfo_xml') ? 'maskinfo_xml' : + args.contains('-outfmt seqloc_asn1_bin') ? 'seqloc_asn1_bin' : + args.contains('-outfmt seqloc_asn1_text') ? 'seqloc_asn1_text' : + args.contains('-outfmt seqloc_xml') ? 'seqloc_xml' : + 'interval' + + output = "${prefix}.${outfmt}" + """ + touch ${output} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + windowmasker: \$(windowmasker -version-full | head -n 1 | sed 's/^.*windowmasker: //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/windowmasker/ustat/meta.yml b/modules/nf-core/windowmasker/ustat/meta.yml new file mode 100644 index 00000000..6acf2e50 --- /dev/null +++ b/modules/nf-core/windowmasker/ustat/meta.yml @@ -0,0 +1,48 @@ +name: windowmasker_ustat +description: A program to take a counts file and creates a file of genomic co-ordinates to be masked. +keywords: + - fasta + - interval + - windowmasker +tools: + - windowmasker: + description: | + A program to mask highly repetitive and low complexity DNA sequences within a genome. + homepage: https://github.com/ncbi/ncbi-cxx-toolkit-public + documentation: https://ncbi.github.io/cxx-toolkit/ + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test' ] + - counts: + type: file + description: Contains count data of repetitive regions. + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - ref: + type: file + description: An input nucleotide fasta file. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - wm_intervals: + type: file + description: | + An output file containing genomic locations of low + complexity and highly repetitive regions + pattern: "${output}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@DLBPointon" diff --git a/nextflow.config b/nextflow.config index 988b2a1c..91844aa9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,6 +13,8 @@ params { // Input options input = null yaml = null + align = false + mask = false // Reference options fasta = null @@ -36,7 +38,7 @@ params { // Boilerplate options outdir = 'results' - tracedir = "${params.outdir}/blobtoolkit_info" + tracedir = "${params.outdir}/pipeline_info/blobtoolkit" publish_dir_mode = 'copy' email = null email_on_fail = null @@ -177,6 +179,7 @@ profiles { } cleanup { cleanup = true } test { includeConfig 'conf/test.config' } + test_raw { includeConfig 'conf/test_raw.config' } test_full { includeConfig 'conf/test_full.config' } } @@ -220,7 +223,7 @@ manifest { homePage = 'https://github.com/sanger-tol/blobtoolkit' description = """Quality assessment of genome assemblies""" mainScript = 'main.nf' - nextflowVersion = '!>=23.04.1' + nextflowVersion = '!>=22.10.1' version = '0.2.0' doi = '10.5281/zenodo.7949058' } diff --git a/nextflow_schema.json b/nextflow_schema.json index a960bee2..0597fdcb 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -22,6 +22,16 @@ "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. 
It has to be a comma-separated file with 3 columns, and a header row.", "fa_icon": "fas fa-file-csv" }, + "align": { + "type": "boolean", + "description": "Turn on optional alignment before running the rest of the pipeline.", + "fa_icon": "fas fa-toggle-off" + }, + "mask": { + "type": "boolean", + "description": "Turn on optional genome masking if needed.", + "fa_icon": "fas fa-toggle-off" + }, "yaml": { "type": "string", "format": "file-path", @@ -287,7 +297,7 @@ "tracedir": { "type": "string", "description": "Directory to keep pipeline Nextflow logs and reports.", - "default": "${params.outdir}/blobtoolkit_info", + "default": "${params.outdir}/pipeline_info/blobtoolkit", "fa_icon": "fas fa-cogs", "hidden": true }, diff --git a/subworkflows/local/coverage_stats.nf b/subworkflows/local/coverage_stats.nf index 0d13824b..37af72df 100644 --- a/subworkflows/local/coverage_stats.nf +++ b/subworkflows/local/coverage_stats.nf @@ -2,33 +2,50 @@ // Calculate genome coverage and statistics // -include { SAMTOOLS_VIEW } from '../../modules/nf-core/samtools/view/main' -include { MOSDEPTH } from '../../modules/nf-core/mosdepth/main' -include { FASTAWINDOWS } from '../../modules/nf-core/fastawindows/main' -include { CREATE_BED } from '../../modules/local/create_bed' +include { SAMTOOLS_VIEW } from '../../modules/nf-core/samtools/view/main' +include { SAMTOOLS_INDEX } from '../../modules/nf-core/samtools/index/main' +include { MOSDEPTH } from '../../modules/nf-core/mosdepth/main' +include { FASTAWINDOWS } from '../../modules/nf-core/fastawindows/main' +include { CREATE_BED } from '../../modules/local/create_bed' workflow COVERAGE_STATS { take: - cram // channel: [ val(meta), path(cram) ] - fasta // channel: [ val(meta), path(fasta) ] + input // channel: [ val(meta), path(aln) ] + fasta // channel: [ val(meta), path(fasta) ] main: ch_versions = Channel.empty() - // Convert from CRAM to BAM - cram - | map { meta, cram -> [ meta, cram, [] ] } - | set { ch_cram_crai} + // Create aligned BAM and index CSI channel + input + | branch { meta, aln -> + bam : aln.toString().endsWith("bam") == true + return [ meta, aln ] + cram : aln.toString().endsWith("cram") == true + return [ meta, aln, [] ] + } + | set { ch_aln_idx} - fasta - | map { meta, fasta -> fasta } - | set { ch_fasta } + SAMTOOLS_VIEW ( ch_aln_idx.cram, fasta, [] ) + ch_versions = ch_versions.mix ( SAMTOOLS_VIEW.out.versions.first() ) - SAMTOOLS_VIEW ( ch_cram_crai, ch_fasta, [] ) - ch_versions = ch_versions.mix ( SAMTOOLS_VIEW.out.versions.first() ) + SAMTOOLS_VIEW.out.bam + | join ( SAMTOOLS_VIEW.out.csi ) + | set { ch_view } + + SAMTOOLS_INDEX ( ch_aln_idx.bam ) + ch_versions = ch_versions.mix ( SAMTOOLS_INDEX.out.versions.first() ) + + ch_aln_idx.bam + | join ( SAMTOOLS_INDEX.out.csi ) + | set { ch_index } + + ch_view + | mix ( ch_index ) + | set { ch_bam_csi } // Calculate genome statistics @@ -42,8 +59,7 @@ workflow COVERAGE_STATS { // Calculate coverage - SAMTOOLS_VIEW.out.bam - | join ( SAMTOOLS_VIEW.out.csi ) + ch_bam_csi | combine ( CREATE_BED.out.bed ) | map { meta, bam, csi, meta2, bed -> [ meta, bam, csi, bed ] } | set { ch_bam_csi_bed } diff --git a/subworkflows/local/minimap_alignment.nf b/subworkflows/local/minimap_alignment.nf new file mode 100644 index 00000000..b54cee07 --- /dev/null +++ b/subworkflows/local/minimap_alignment.nf @@ -0,0 +1,75 @@ +// +// Optional alignment subworkflow using Minimap2 +// + +include { SAMTOOLS_FASTA } from '../../modules/nf-core/samtools/fasta/main' +include { MINIMAP2_ALIGN as MINIMAP2_HIC } from 
'../../modules/nf-core/minimap2/align/main'
+include { MINIMAP2_ALIGN as MINIMAP2_ILMN } from '../../modules/nf-core/minimap2/align/main'
+include { MINIMAP2_ALIGN as MINIMAP2_CCS  } from '../../modules/nf-core/minimap2/align/main'
+include { MINIMAP2_ALIGN as MINIMAP2_CLR  } from '../../modules/nf-core/minimap2/align/main'
+include { MINIMAP2_ALIGN as MINIMAP2_ONT  } from '../../modules/nf-core/minimap2/align/main'
+
+
+workflow MINIMAP2_ALIGNMENT {
+    take:
+    input    // channel: [ val(meta), path(datafile) ]
+    fasta    // channel: [ val(meta), path(fasta) ]
+
+
+    main:
+    ch_versions = Channel.empty()
+
+
+    // Convert reads to FASTA
+    SAMTOOLS_FASTA ( input, true )
+    ch_versions = ch_versions.mix ( SAMTOOLS_FASTA.out.versions.first() )
+
+
+    // Branch input by sequencing data type
+    SAMTOOLS_FASTA.out.interleaved
+    | branch {
+        meta, reads ->
+            hic      : meta.datatype == "hic"
+            illumina : meta.datatype == "illumina"
+            pacbio   : meta.datatype == "pacbio"
+            clr      : meta.datatype == "pacbio_clr"
+            ont      : meta.datatype == "ont"
+    }
+    | set { ch_input }
+
+
+    // Align with Minimap2
+    fasta
+    | map { meta, genome -> genome }
+    | set { ch_ref }
+
+    MINIMAP2_HIC ( ch_input.hic, ch_ref, true, false, false )
+    ch_versions = ch_versions.mix ( MINIMAP2_HIC.out.versions.first() )
+
+    MINIMAP2_ILMN ( ch_input.illumina, ch_ref, true, false, false )
+    ch_versions = ch_versions.mix ( MINIMAP2_ILMN.out.versions.first() )
+
+    MINIMAP2_CCS ( ch_input.pacbio, ch_ref, true, false, false )
+    ch_versions = ch_versions.mix ( MINIMAP2_CCS.out.versions.first() )
+
+    MINIMAP2_CLR ( ch_input.clr, ch_ref, true, false, false )
+    ch_versions = ch_versions.mix ( MINIMAP2_CLR.out.versions.first() )
+
+    MINIMAP2_ONT ( ch_input.ont, ch_ref, true, false, false )
+    ch_versions = ch_versions.mix ( MINIMAP2_ONT.out.versions.first() )
+
+
+    // Combine aligned reads
+    Channel.empty()
+    | mix ( MINIMAP2_HIC.out.bam )
+    | mix ( MINIMAP2_ILMN.out.bam )
+    | mix ( MINIMAP2_CCS.out.bam )
+    | mix ( MINIMAP2_CLR.out.bam )
+    | mix ( MINIMAP2_ONT.out.bam )
+    | set { ch_aligned }
+
+
+    emit:
+    aln      = ch_aligned    // channel: [ val(meta), path(bam) ]
+    versions = ch_versions   // channel: [ versions.yml ]
+}
diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf
new file mode 100644
index 00000000..d1e31a72
--- /dev/null
+++ b/subworkflows/local/prepare_genome.nf
@@ -0,0 +1,49 @@
+//
+// Prepare genome for downstream processing
+//
+
+include { GUNZIP                } from '../../modules/nf-core/gunzip/main'
+include { WINDOWMASKER_MKCOUNTS } from '../../modules/nf-core/windowmasker/mkcounts/main'
+include { WINDOWMASKER_USTAT    } from '../../modules/nf-core/windowmasker/ustat/main'
+
+
+workflow PREPARE_GENOME {
+    take:
+    fasta    // channel: [ val(meta), path(genome) ]
+
+
+    main:
+    ch_versions = Channel.empty()
+
+
+    //
+    // MODULE: Decompress FASTA file if needed
+    //
+    if ( params.fasta.endsWith('.gz') ) {
+        ch_genome   = GUNZIP ( fasta ).gunzip
+        ch_versions = ch_versions.mix ( GUNZIP.out.versions )
+    } else {
+        ch_genome = fasta
+    }
+
+
+    //
+    // MODULES: Mask the genome if needed
+    //
+    if ( params.mask ) {
+        WINDOWMASKER_MKCOUNTS ( ch_genome )
+        ch_versions = ch_versions.mix ( WINDOWMASKER_MKCOUNTS.out.versions )
+
+        WINDOWMASKER_USTAT ( WINDOWMASKER_MKCOUNTS.out.counts, ch_genome )
+        ch_versions = ch_versions.mix ( WINDOWMASKER_USTAT.out.versions )
+
+        ch_fasta = WINDOWMASKER_USTAT.out.intervals
+    } else {
+        ch_fasta = ch_genome
+    }
+
+
+    emit:
+    genome   = ch_fasta      // channel: [ val(meta), path(genome) ]
+    versions = ch_versions   // channel: [ versions.yml ]
+}
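With the optional subworkflows above wired in, a run enabling both new
switches might look like the following sketch; the profile, file paths,
accession and taxon ID are placeholders, not values taken from this patch:

    nextflow run sanger-tol/blobtoolkit -profile docker \
        --input samplesheet.csv \
        --fasta genome.fa.gz \
        --accession GCA_XXXXXXXXX.X \
        --taxon 9606 \
        --taxdump /path/to/ncbi_taxdump \
        --uniprot /path/to/uniprot.dmnd \
        --align --mask \
        --outdir results
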
diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf
index c8dad117..919e17bb 100644
--- a/workflows/blobtoolkit.nf
+++ b/workflows/blobtoolkit.nf
@@ -16,7 +16,7 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true
 // Check mandatory parameters
 if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' }
-if (params.fasta && params.accession) { ch_fasta = Channel.of([ [ 'id': params.accession ], params.fasta ]).collect() } else { exit 1, 'Genome fasta file and accession must be specified!' }
+if (params.fasta && params.accession) { ch_fasta = Channel.of([ [ 'id': params.accession ], params.fasta ]).first() } else { exit 1, 'Genome fasta file and accession must be specified!' }
 if (params.taxon) { ch_taxon = Channel.of(params.taxon) } else { exit 1, 'NCBI Taxon ID not specified!' }
 if (params.uniprot) { ch_uniprot = file(params.uniprot) } else { exit 1, 'Diamond BLASTp database not specified!' }
 if (params.taxdump) { ch_taxdump = file(params.taxdump) } else { exit 1, 'NCBI Taxonomy database not specified!' }
@@ -50,12 +50,14 @@ include { BLOBTOOLKIT_CONFIG } from '../modules/local/blobtoolkit/config'
 //
 // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules
 //
-include { INPUT_CHECK    } from '../subworkflows/local/input_check'
-include { COVERAGE_STATS } from '../subworkflows/local/coverage_stats'
-include { BUSCO_DIAMOND  } from '../subworkflows/local/busco_diamond_blastp'
-include { COLLATE_STATS  } from '../subworkflows/local/collate_stats'
-include { BLOBTOOLS      } from '../subworkflows/local/blobtools'
-include { VIEW           } from '../subworkflows/local/view'
+include { PREPARE_GENOME     } from '../subworkflows/local/prepare_genome'
+include { MINIMAP2_ALIGNMENT } from '../subworkflows/local/minimap_alignment'
+include { INPUT_CHECK        } from '../subworkflows/local/input_check'
+include { COVERAGE_STATS     } from '../subworkflows/local/coverage_stats'
+include { BUSCO_DIAMOND      } from '../subworkflows/local/busco_diamond_blastp'
+include { COLLATE_STATS      } from '../subworkflows/local/collate_stats'
+include { BLOBTOOLS          } from '../subworkflows/local/blobtools'
+include { VIEW               } from '../subworkflows/local/view'

 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -66,7 +68,6 @@ include { VIEW } from '../subworkflows/local/view'
 //
 // MODULE: Installed directly from nf-core/modules
 //
-include { GUNZIP                      } from '../modules/nf-core/gunzip/main'
 include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main'
 include { MULTIQC                     } from '../modules/nf-core/multiqc/main'
@@ -84,14 +85,10 @@ workflow BLOBTOOLKIT {
     ch_versions = Channel.empty()

     //
-    // MODULE: Decompress FASTA file if needed
+    // SUBWORKFLOW: Prepare genome for downstream processing
     //
-    if ( params.fasta.endsWith('.gz') ) {
-        ch_genome   = GUNZIP ( ch_fasta ).gunzip
-        ch_versions = ch_versions.mix ( GUNZIP.out.versions.first() )
-    } else {
-        ch_genome = ch_fasta
-    }
+    PREPARE_GENOME ( ch_fasta )
+    ch_versions = ch_versions.mix ( PREPARE_GENOME.out.versions )

     //
     // SUBWORKFLOW: Check samplesheet and create channels for downstream analysis
@@ -99,10 +96,23 @@
     INPUT_CHECK ( ch_input )
     ch_versions = ch_versions.mix ( INPUT_CHECK.out.versions )

+    //
+    // SUBWORKFLOW: Optional read alignment
+    //
+    if ( params.align ) {
+        MINIMAP2_ALIGNMENT ( INPUT_CHECK.out.aln, PREPARE_GENOME.out.genome )
+        ch_versions = ch_versions.mix ( MINIMAP2_ALIGNMENT.out.versions )
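+        // With --align, reads are mapped to the prepared genome here;
+        // otherwise the samplesheet must provide pre-aligned BAM/CRAM files.
+        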
ch_aligned = MINIMAP2_ALIGNMENT.out.aln + } else { + ch_aligned = INPUT_CHECK.out.aln + } + // // SUBWORKFLOW: Calculate genome coverage and statistics // - COVERAGE_STATS ( INPUT_CHECK.out.aln, ch_genome ) + COVERAGE_STATS ( ch_aligned, PREPARE_GENOME.out.genome ) ch_versions = ch_versions.mix ( COVERAGE_STATS.out.versions ) // @@ -115,27 +123,46 @@ workflow BLOBTOOLKIT { ch_taxon_taxa = ch_fasta.combine(ch_taxon).map { meta, fasta, taxon -> [ meta, taxon, [] ] } } - BUSCO_DIAMOND ( ch_genome, ch_taxon_taxa, ch_busco_db, ch_uniprot, params.blastp_outext, params.blastp_cols ) + BUSCO_DIAMOND ( + PREPARE_GENOME.out.genome, + ch_taxon_taxa, + ch_busco_db, + ch_uniprot, + params.blastp_outext, + params.blastp_cols + ) ch_versions = ch_versions.mix ( BUSCO_DIAMOND.out.versions ) // // SUBWORKFLOW: Collate genome statistics by various window sizes // - COLLATE_STATS ( BUSCO_DIAMOND.out.full_table, COVERAGE_STATS.out.bed, COVERAGE_STATS.out.freq, COVERAGE_STATS.out.mononuc, COVERAGE_STATS.out.cov ) + COLLATE_STATS ( + BUSCO_DIAMOND.out.full_table, + COVERAGE_STATS.out.bed, + COVERAGE_STATS.out.freq, + COVERAGE_STATS.out.mononuc, + COVERAGE_STATS.out.cov + ) ch_versions = ch_versions.mix ( COLLATE_STATS.out.versions ) // // SUBWORKFLOW: Create BlobTools dataset // if ( !params.yaml ) { - BLOBTOOLKIT_CONFIG ( ch_genome ) + BLOBTOOLKIT_CONFIG ( PREPARE_GENOME.out.genome ) ch_config = BLOBTOOLKIT_CONFIG.out.yaml ch_versions = ch_versions.mix ( BLOBTOOLKIT_CONFIG.out.versions.first() ) } else { ch_config = ch_yaml } - BLOBTOOLS ( ch_config, COLLATE_STATS.out.window_tsv, BUSCO_DIAMOND.out.first_table, BUSCO_DIAMOND.out.blastp_txt.ifEmpty([[],[]]), ch_taxdump ) + BLOBTOOLS ( + ch_config, + COLLATE_STATS.out.window_tsv, + BUSCO_DIAMOND.out.first_table, + BUSCO_DIAMOND.out.blastp_txt.ifEmpty([[],[]]), + ch_taxdump + ) ch_versions = ch_versions.mix ( BLOBTOOLS.out.versions ) //