diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index ea27a584..4ecfbfe3 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -2,6 +2,7 @@ "name": "nfcore", "image": "nfcore/gitpod:latest", "remoteUser": "gitpod", + "runArgs": ["--privileged"], // Configure tool-specific properties. "customizations": { diff --git a/.editorconfig b/.editorconfig index a30ae1e1..84a786d8 100644 --- a/.editorconfig +++ b/.editorconfig @@ -31,3 +31,12 @@ insert_final_newline = unset trim_trailing_whitespace = unset indent_style = unset indent_size = unset + +# To prevent errors for these test blastn databases +[/assets/test*/nt_*/*.{ndb,nhr,nin,nog,nos,not,nsq,ntf,nto}] +charset = unset +end_of_line = unset +insert_final_newline = unset +trim_trailing_whitespace = unset +indent_style = unset +indent_size = unset diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 19adb352..1ff8015f 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -23,6 +23,9 @@ If you're not used to this workflow with git, you can start with some [docs from ## Tests +You can optionally test your changes by running the pipeline locally. Then it is recommended to use the `debug` profile to +receive warnings about process selectors and other debug info. Example: `nextflow run . -profile debug,test,docker --outdir `. + When you create a pull request with changes, [GitHub Actions](https://github.com/features/actions) will run automatic tests. Typically, pull-requests are only fully reviewed when these tests are passing, though of course we can help out before then. @@ -110,4 +113,3 @@ To get started: Devcontainer specs: - [DevContainer config](.devcontainer/devcontainer.json) -- [Dockerfile](.devcontainer/Dockerfile) diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 58919f2a..cfb4bf72 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -2,6 +2,14 @@ name: Bug report description: Report something that is broken or incorrect labels: bug body: + - type: markdown + attributes: + value: | + Before you post this issue, please check the documentation: + + - [nf-core website: troubleshooting](https://nf-co.re/usage/troubleshooting) + - [sanger-tol/blobtoolkit pipeline documentation](https://pipelines.tol.sanger.ac.uk/blobtoolkit/usage) + - type: textarea id: description attributes: @@ -9,46 +17,34 @@ body: description: A clear and concise description of what the bug is. validations: required: true + - type: textarea id: command_used attributes: label: Command used and terminal output - description: Steps to reproduce the behaviour. Please paste the command you used - to launch the pipeline and the output from your terminal. + description: Steps to reproduce the behaviour. Please paste the command you used to launch the pipeline and the output from your terminal. render: console - placeholder: "$ nextflow run ... - + placeholder: | + $ nextflow run ... Some output where something broke - " - type: textarea id: files attributes: label: Relevant files - description: "Please drag and drop the relevant files here. Create a `.zip` archive - if the extension is not allowed. - - Your verbose log file `.nextflow.log` is often useful _(this is a hidden file - in the directory where you launched the pipeline)_ as well as custom Nextflow - configuration files. + description: | + Please drag and drop the relevant files here. 
Create a `.zip` archive if the extension is not allowed. + Your verbose log file `.nextflow.log` is often useful _(this is a hidden file in the directory where you launched the pipeline)_ as well as custom Nextflow configuration files. - " - type: textarea id: system attributes: label: System information - description: "* Nextflow version _(eg. 23.04.1)_ - + description: | + * Nextflow version _(eg. 23.04.0)_ * Hardware _(eg. HPC, Desktop, Cloud)_ - * Executor _(eg. slurm, local, awsbatch)_ - - * Container engine: _(e.g. Docker, Singularity, Conda, Podman, Shifter, Charliecloud, - or Apptainer)_ - + * Container engine: _(e.g. Docker, Singularity, Conda, Podman, Shifter, Charliecloud, or Apptainer)_ * OS _(eg. CentOS Linux, macOS, Linux Mint)_ - * Version of sanger-tol/blobtoolkit _(eg. 1.1, 1.5, 1.8.2)_ - - " diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml deleted file mode 100644 index 1da35927..00000000 --- a/.github/ISSUE_TEMPLATE/config.yml +++ /dev/null @@ -1,7 +0,0 @@ -contact_links: - - name: Join nf-core - url: https://nf-co.re/join - about: Please join the nf-core community here - - name: "Slack #blobtoolkit channel" - url: https://nfcore.slack.com/channels/blobtoolkit - about: Discussion about the nf-core/blobtoolkit pipeline diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index fef3064b..b78be63e 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -8,16 +8,17 @@ These are the most common things requested on pull requests (PRs). Remember that PRs should be made against the dev branch, unless you're preparing a pipeline release. -Learn more about contributing: [CONTRIBUTING.md](https://github.com/sanger-tol/blobtoolkit/tree/master/.github/CONTRIBUTING.md) +Learn more about contributing: [CONTRIBUTING.md](.github/CONTRIBUTING.md) --> ## PR checklist - [ ] This comment contains a description of changes (with reason). - [ ] If you've fixed a bug or added code that should be tested, add tests! -- [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/sanger-tol/blobtoolkit/tree/master/.github/CONTRIBUTING.md) +- [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](.github/CONTRIBUTING.md) - [ ] Make sure your code lints (`nf-core lint`). - [ ] Ensure the test suite passes (`nextflow run . -profile test,docker --outdir `). +- [ ] Check for unexpected warnings in debug mode (`nextflow run . -profile debug,test,docker --outdir `). - [ ] Usage Documentation in `docs/usage.md` is updated. - [ ] Output Documentation in `docs/output.md` is updated. - [ ] `CHANGELOG.md` is updated. 
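For convenience, the lint and test commands referenced in the contributing guide and PR checklist above can be run locally as follows. A minimal sketch, assuming Nextflow, Docker and nf-core/tools are installed; `test-results` stands in for the output directory placeholder:

```bash
# Lint the pipeline code
nf-core lint

# Run the minimal test profile
nextflow run . -profile test,docker --outdir test-results

# Re-run with the debug profile to surface process-selector warnings
nextflow run . -profile debug,test,docker --outdir test-results
```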
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 307a3b42..589d7118 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,7 +1,12 @@ name: nf-core CI # This workflow runs the pipeline with the minimal test dataset to check that it completes without any syntax errors on: - workflow_dispatch: + push: + branches: + - dev + pull_request: + release: + types: [published] env: NXF_ANSI_LOG: false @@ -19,20 +24,30 @@ jobs: strategy: matrix: NXF_VER: - - "23.04.1" + - "23.04.0" - "latest-everything" steps: - name: Check out pipeline code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Nextflow uses: nf-core/setup-nextflow@v1 with: version: "${{ matrix.NXF_VER }}" + - name: Download the NCBI taxdump database + run: | + mkdir ncbi_taxdump + curl -L https://ftp.ncbi.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz | tar -C ncbi_taxdump -xzf - + + - name: Download the BUSCO lineage database + run: | + mkdir busco_database + curl -L https://tolit.cog.sanger.ac.uk/test-data/resources/busco/blobtoolkit.GCA_922984935.2.2023-08-03.lineages.tar.gz | tar -C busco_database -xzf - + - name: Run pipeline with test data # You can customise CI pipeline run tests as required # For example: adding multiple test runs with different parameters # Remember that you can parallelise this by using strategy.matrix run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --taxdump $PWD/ncbi_taxdump --busco $PWD/busco_database --outdir ./results diff --git a/.github/workflows/fix-linting.yml b/.github/workflows/fix-linting.yml index b7779fad..d5f2a971 100644 --- a/.github/workflows/fix-linting.yml +++ b/.github/workflows/fix-linting.yml @@ -8,23 +8,23 @@ jobs: # Only run if comment is on a PR with the main repo, and if it contains the magic keywords if: > contains(github.event.comment.html_url, '/pull/') && - contains(github.event.comment.body, '@nf-core-bot fix linting') && + contains(github.event.comment.body, '@sanger-tolsoft fix linting') && github.repository == 'sanger-tol/blobtoolkit' runs-on: ubuntu-latest steps: - # Use the @nf-core-bot token to check out so we can push later - - uses: actions/checkout@v3 + # Use the @sanger-tolsoft token to check out so we can push later + - uses: actions/checkout@v4 with: - token: ${{ secrets.nf_core_bot_auth_token }} + token: ${{ secrets.sangertolsoft_access_token }} # Action runs on the issue comment, so we don't get the PR by default # Use the gh cli to check out the PR - name: Checkout Pull Request run: gh pr checkout ${{ github.event.issue.number }} env: - GITHUB_TOKEN: ${{ secrets.nf_core_bot_auth_token }} + GITHUB_TOKEN: ${{ secrets.sangertolsoft_access_token }} - - uses: actions/setup-node@v3 + - uses: actions/setup-node@v4 - name: Install Prettier run: npm install -g prettier @prettier/plugin-php @@ -46,8 +46,8 @@ jobs: - name: Commit & push changes if: steps.prettier_status.outputs.result == 'fail' run: | - git config user.email "core@nf-co.re" - git config user.name "nf-core-bot" + git config user.email "105875386+sanger-tolsoft@users.noreply.github.com" + git config user.name "sanger-tolsoft" git config push.default upstream git add . 
git status diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 888cb4bc..905c58e4 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -14,9 +14,9 @@ jobs: EditorConfig: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - - uses: actions/setup-node@v3 + - uses: actions/setup-node@v4 - name: Install editorconfig-checker run: npm install -g editorconfig-checker @@ -27,9 +27,9 @@ jobs: Prettier: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - - uses: actions/setup-node@v3 + - uses: actions/setup-node@v4 - name: Install Prettier run: npm install -g prettier @@ -40,7 +40,7 @@ jobs: PythonBlack: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Check code lints with Black uses: psf/black@stable @@ -71,14 +71,14 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out pipeline code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Nextflow uses: nf-core/setup-nextflow@v1 - uses: actions/setup-python@v4 with: - python-version: "3.8" + python-version: "3.11" architecture: "x64" - name: Install dependencies diff --git a/.github/workflows/release-announcements.yml b/.github/workflows/release-announcements.yml new file mode 100644 index 00000000..6ad33927 --- /dev/null +++ b/.github/workflows/release-announcements.yml @@ -0,0 +1,68 @@ +name: release-announcements +# Automatic release toot and tweet anouncements +on: + release: + types: [published] + workflow_dispatch: + +jobs: + toot: + runs-on: ubuntu-latest + steps: + - uses: rzr/fediverse-action@master + with: + access-token: ${{ secrets.MASTODON_ACCESS_TOKEN }} + host: "mstdn.science" # custom host if not "mastodon.social" (default) + # GitHub event payload + # https://docs.github.com/en/developers/webhooks-and-events/webhooks/webhook-events-and-payloads#release + message: | + Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}! + + Please see the changelog: ${{ github.event.release.html_url }} + + send-tweet: + runs-on: ubuntu-latest + + steps: + - uses: actions/setup-python@v4 + with: + python-version: "3.10" + - name: Install dependencies + run: pip install tweepy==4.14.0 + - name: Send tweet + shell: python + run: | + import os + import tweepy + + client = tweepy.Client( + access_token=os.getenv("TWITTER_ACCESS_TOKEN"), + access_token_secret=os.getenv("TWITTER_ACCESS_TOKEN_SECRET"), + consumer_key=os.getenv("TWITTER_CONSUMER_KEY"), + consumer_secret=os.getenv("TWITTER_CONSUMER_SECRET"), + ) + tweet = os.getenv("TWEET") + client.create_tweet(text=tweet) + env: + TWEET: | + Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}! + + Please see the changelog: ${{ github.event.release.html_url }} + TWITTER_CONSUMER_KEY: ${{ secrets.TWITTER_CONSUMER_KEY }} + TWITTER_CONSUMER_SECRET: ${{ secrets.TWITTER_CONSUMER_SECRET }} + TWITTER_ACCESS_TOKEN: ${{ secrets.TWITTER_ACCESS_TOKEN }} + TWITTER_ACCESS_TOKEN_SECRET: ${{ secrets.TWITTER_ACCESS_TOKEN_SECRET }} + + bsky-post: + runs-on: ubuntu-latest + steps: + - uses: zentered/bluesky-post-action@v0.0.2 + with: + post: | + Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}! 
+ + Please see the changelog: ${{ github.event.release.html_url }} + env: + BSKY_IDENTIFIER: ${{ secrets.BSKY_IDENTIFIER }} + BSKY_PASSWORD: ${{ secrets.BSKY_PASSWORD }} + # diff --git a/.github/workflows/sangertest.yml b/.github/workflows/sanger_test.yml similarity index 78% rename from .github/workflows/sangertest.yml rename to .github/workflows/sanger_test.yml index 95479500..406a6280 100644 --- a/.github/workflows/sangertest.yml +++ b/.github/workflows/sanger_test.yml @@ -1,4 +1,4 @@ -name: nf-core Sanger LSF tests +name: sanger-tol LSF tests on: workflow_dispatch: @@ -13,12 +13,11 @@ jobs: if: github.event_name == 'workflow_dispatch' - name: Launch workflow via tower - uses: nf-core/tower-action@v2 + uses: seqeralabs/action-tower-launch@v2 with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} - pipeline: ${{ github.repository }} revision: ${{ env.REVISION }} workdir: ${{ secrets.TOWER_WORKDIR_PARENT }}/work/${{ github.repository }}/work-${{ env.REVISION }} parameters: | @@ -26,3 +25,9 @@ jobs: "outdir": "${{ secrets.TOWER_WORKDIR_PARENT }}/results/${{ github.repository }}/results-${{ env.REVISION }}", } profiles: test,sanger,singularity,cleanup + - uses: actions/upload-artifact@v3 + with: + name: Tower debug log file + path: | + tower_action_*.log + tower_action_*.json diff --git a/.github/workflows/sangerfulltest.yml b/.github/workflows/sanger_test_full.yml similarity index 80% rename from .github/workflows/sangerfulltest.yml rename to .github/workflows/sanger_test_full.yml index addef9bc..e3a25f7b 100644 --- a/.github/workflows/sangerfulltest.yml +++ b/.github/workflows/sanger_test_full.yml @@ -1,4 +1,4 @@ -name: nf-core Sanger LSF full size tests +name: sanger-tol LSF full size tests on: workflow_dispatch: @@ -18,12 +18,11 @@ jobs: if: github.event_name == 'workflow_dispatch' - name: Launch workflow via tower - uses: nf-core/tower-action@v2 + uses: seqeralabs/action-tower-launch@v2 with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} - pipeline: ${{ github.repository }} revision: ${{ env.REVISION }} workdir: ${{ secrets.TOWER_WORKDIR_PARENT }}/work/${{ github.repository }}/work-${{ env.REVISION }} parameters: | @@ -31,3 +30,9 @@ jobs: "outdir": "${{ secrets.TOWER_WORKDIR_PARENT }}/results/${{ github.repository }}/results-${{ env.REVISION }}", } profiles: test_full,sanger,singularity,cleanup + - uses: actions/upload-artifact@v3 + with: + name: Tower debug log file + path: | + tower_action_*.log + tower_action_*.json diff --git a/.gitpod.yml b/.gitpod.yml index 85d95ecc..acf72695 100644 --- a/.gitpod.yml +++ b/.gitpod.yml @@ -1,5 +1,12 @@ image: nfcore/gitpod:latest - +tasks: + - name: Update Nextflow and setup pre-commit + command: | + pre-commit install --install-hooks + nextflow self-update + - name: unset JAVA_TOOL_OPTIONS + command: | + unset JAVA_TOOL_OPTIONS vscode: extensions: # based on nf-core.nf-core-extensionpack - codezombiech.gitignore # Language support for .gitignore files diff --git a/.nf-core.yml b/.nf-core.yml index f2175469..3e9d09b4 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1,18 +1,34 @@ -repository_type: pipeline lint: files_exist: + - CODE_OF_CONDUCT.md - assets/nf-core-blobtoolkit_logo_light.png - docs/images/nf-core-blobtoolkit_logo_light.png - docs/images/nf-core-blobtoolkit_logo_dark.png + - .github/ISSUE_TEMPLATE/config.yml + - .github/workflows/awstest.yml + - 
.github/workflows/awsfulltest.yml + - conf/igenomes.config files_unchanged: - - LICENSE - - .github/ISSUE_TEMPLATE/bug_report.yml - - assets/sendmail_template.txt + - LICENCE - lib/NfcoreTemplate.groovy - - .prettierignore + - CODE_OF_CONDUCT.md + - assets/nf-core-blobtoolkit_logo_light.png + - docs/images/nf-core-blobtoolkit_logo_light.png + - docs/images/nf-core-blobtoolkit_logo_dark.png + - .github/ISSUE_TEMPLATE/bug_report.yml + - .github/PULL_REQUEST_TEMPLATE.md + multiqc_config: + - report_comment nextflow_config: - manifest.name - manifest.homePage - multiqc_config: - - report_comment - actions_ci: false + template_strings: False + merge_markers: False +repository_type: pipeline +template: + author: priyanka-surana + description: Quality assessment of genome assemblies + name: blobtoolkit + prefix: sanger-tol + skip: + - igenomes diff --git a/.prettierignore b/.prettierignore index 7b59d0aa..5bee2b1c 100644 --- a/.prettierignore +++ b/.prettierignore @@ -3,9 +3,11 @@ adaptivecard.json slackreport.json .nextflow* work/ +data/ results/ .DS_Store +testing/ +testing* *.pyc bin/ -assets/test/*yaml -assets/test_full/*yaml +assets/test* diff --git a/CHANGELOG.md b/CHANGELOG.md index 5a786073..bd4cc71d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,18 +3,27 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [[0.2.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.2.0)] – – [2023-MM-DD] +## [[0.2.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.2.0)] – Pikachu – [2023-12-22] ### Enhancements & fixes -... +- Template updated to nf-core/tools 2.11.1 +- Includes all subworkflows in the [Snakemake version](https://github.com/blobtoolkit/blobtoolkit) +- Added blastx and blastn subworkflows +- Replaced mosdepth with blobtk depth +- Updated config creation script ### Parameters -| Old parameter | New parameter | -| ------------- | ------------- | - -... +| Old parameter | New parameter | +| ------------- | --------------- | +| | --mask | +| | --align | +| --uniprot | --blastp | +| | --blastx | +| | --blastn | +| | --blastx_outext | +| | --blastx_cols | > **NB:** Parameter has been **updated** if both old and new parameter information is present.
**NB:** Parameter has been **added** if just the new parameter information is present.
**NB:** Parameter has been **removed** if new parameter information isn't present. @@ -22,19 +31,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 Note, since the pipeline is using Nextflow DSL2, each process will be run with its own [Biocontainer](https://biocontainers.pro/#/registry). This means that on occasion it is entirely possible for the pipeline to be using different versions of the same tool. However, the overall software dependency changes compared to the last release have been listed below for reference. Only `Docker` or `Singularity` containers are supported, `conda` is not supported. -| Dependency | Old version | New version | -| ------------- | ----------- | ----------- | -| blobtoolkit | 4.1.4 | | -| busco | 5.4.3 | | -| fasta_windows | 0.2.4 | | -| goat | 0.2.0 | | -| gunzip | 1.10 | | -| mosdepth | 0.3.3 | | -| nextflow | 22.10.6 | | -| python | 3.10.6 | | -| samtools | 1.15.1 | | -| tar | 1.30 | | -| yaml | 6.0 | | +| Dependency | Old version | New version | +| ------------ | ----------- | ----------- | +| blobtoolkit | 4.1.4 | 4.3.2 | +| busco | 5.4.3 | 5.5.0 | +| goat | 0.2.0 | 0.2.5 | +| mosdepth | 0.3.3 | | +| nextflow | 22.10.6 | 23.10.0 | +| python | 3.10.6 | 3.12.0 | +| samtools | 1.15.1 | 1.18 | +| tar | 1.30 | | +| yaml | 6.0 | 6.0.1 | +| blobtk | 0.3.3 | 0.5.1 | +| diamond | 2.0.15 | 2.1.8 | +| minimap2 | | 2.24-r1122 | +| blast | | 2.14.1 | +| windowmasker | | 2.14.0 | > **NB:** Dependency has been **updated** if both old and new version information is present.
**NB:** Dependency has been **added** if just the new version information is present.
**NB:** Dependency has been **removed** if version information isn't present. diff --git a/CITATIONS.md b/CITATIONS.md index 41f43458..8b960779 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -1,5 +1,7 @@ # sanger-tol/blobtoolkit: Citations +> Butt, Zaynab, et al. "sanger-tol/blobtoolkit" Zenodo, 2023, https://zenodo.org/doi/10.5281/zenodo.7949058. + ## [nf-core](https://nf-co.re) > Ewels, Philip A., et al. “The Nf-Core Framework for Community-Curated Bioinformatics Pipelines.” Nature Biotechnology, vol. 38, no. 3, Feb. 2020, pp. 276–78, https://doi.org/10.1038/s41587-020-0439-x. @@ -28,9 +30,9 @@ > Challis, Richard, et al. “Genomes on a Tree (GoaT): A versatile, scalable search engine for genomic and sequencing project metadata across the eukaryotic tree of life.” Wellcome Open Research, vol. 8, no. 24, 2023, https://doi.org/10.12688/wellcomeopenres.18658.1. -- [Mosdepth](https://github.com/brentp/mosdepth) +- [Minimap2](https://github.com/lh3/minimap2) - > Pedersen, Brent S., and Aaron R. Quinlan. “Mosdepth: Quick Coverage Calculation for Genomes and Exomes.” Bioinformatics, edited by John Hancock, vol. 34, no. 5, Oct. 2017, pp. 867–68, https://doi.org/10.1093/bioinformatics/btx699. + > Li, Heng. "Minimap2: pairwise alignment for nucleotide sequences." Bioinformatics, vol. 34, no. 18, Sep. 2018, pp. 3094-100, https://doi.org/10.1093/bioinformatics/bty191. - [MultiQC](https://multiqc.info) diff --git a/README.md b/README.md index 58008e6e..fab2a350 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,9 @@ # ![sanger-tol/blobtoolkit](docs/images/sanger-tol-blobtoolkit_logo.png) - +[![GitHub Actions CI Status](https://github.com/sanger-tol/blobtoolkit/workflows/nf-core%20CI/badge.svg)](https://github.com/sanger-tol/blobtoolkit/actions?query=workflow%3A%22nf-core+CI%22) +[![GitHub Actions Linting Status](https://github.com/sanger-tol/blobtoolkit/workflows/nf-core%20linting/badge.svg)](https://github.com/sanger-tol/blobtoolkit/actions?query=workflow%3A%22nf-core+linting%22)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.7949058-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.7949058) -[![GitHub Actions Linting Status](https://github.com/sanger-tol/blobtoolkit/workflows/nf-core%20linting/badge.svg)](https://github.com/sanger-tol/blobtoolkit/actions?query=workflow%3A%22nf-core+linting%22) -[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.7949058-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.7949058) - -[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A523.04.1-23aa62.svg)](https://www.nextflow.io/) +[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A523.04.0-23aa62.svg)](https://www.nextflow.io/) [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) @@ -29,22 +27,22 @@ 1. Calculate genome statistics in windows ([`fastawindows`](https://github.com/tolkit/fasta_windows)) -2. Calculate Coverage ([`mosdepth`](https://github.com/brentp/mosdepth)) +2. Calculate Coverage ([`blobtk/depth`](https://github.com/blobtoolkit/blobtk)) 3. Fetch associated BUSCO lineages ([`goat/taxonsearch`](https://github.com/genomehubs/goat-cli)) 4. 
Run BUSCO ([`busco`](https://busco.ezlab.org/)) -5. Extract BUSCO genes (blobtoolkit/extractbuscos) +5. Extract BUSCO genes ([`blobtoolkit/extractbuscos`](https://github.com/blobtoolkit/blobtoolkit)) 6. Run Diamond BLASTp against extracted BUSCO genes ([`diamond/blastp`](https://github.com/bbuchfink/diamond)) -7. Count BUSCO genes (blobtoolkit/countbuscos) -8. Generate combined sequence stats across various window sizes (blobtoolkit/windowstats) -9. Imports analysis results into a BlobDir dataset (blobtoolkit/blobdir) -10. Create static plot images (blobtoolkit/images) +7. Run BLASTn against extracted BUSCO genes ([`blast/blastn`](https://www.ncbi.nlm.nih.gov/books/NBK131777/)) +8. Run BLASTx against extracted BUSCO genes ([`blast/blastx`](https://www.ncbi.nlm.nih.gov/books/NBK131777/)) +9. Count BUSCO genes ([`blobtoolkit/countbuscos`](https://github.com/blobtoolkit/blobtoolkit)) +10. Generate combined sequence stats across various window sizes ([`blobtoolkit/windowstats`](https://github.com/blobtoolkit/blobtoolkit)) +11. Imports analysis results into a BlobDir dataset ([`blobtoolkit/blobdir`](https://github.com/blobtoolkit/blobtoolkit)) +12. Create static plot images ([`blobtk/images`](https://github.com/blobtoolkit/blobtk)) ## Usage -> **Note** -> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how -> to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) -> with `-profile test` before running the workflow on actual data. +> [!NOTE] +> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data. @@ -60,7 +58,7 @@ mMelMel1,illumina,GCA_922984935.2.illumina.mMelMel1.cram mMelMel3,ont,GCA_922984935.2.ont.mMelMel3.cram ``` -Each row represents an aligned file. Rows with the same sample identifier are considered technical replicates. The datatype refers to the sequencing technology used to generate the underlying raw data and follows a controlled vocabulary (ont, hic, pacbio, illumina). The aligned read files can be generated using the [sanger-tol/readmapping](https://github.com/sanger-tol/readmapping) pipeline. +Each row represents an aligned file. Rows with the same sample identifier are considered technical replicates. The datatype refers to the sequencing technology used to generate the underlying raw data and follows a controlled vocabulary (ont, hic, pacbio, pacbio_clr, illumina). The aligned read files can be generated using the [sanger-tol/readmapping](https://github.com/sanger-tol/readmapping) pipeline. Now, you can run the pipeline using: @@ -75,12 +73,13 @@ nextflow run sanger-tol/blobtoolkit \ --accession GCA_XXXXXXXXX.X \ --taxon XXXX \ --taxdump /path/to/taxdump/database \ - --uniprot /path/to/diamond/database + --blastp /path/to/diamond/database \ + --blastn /path/to/blastn/database \ + --blastx /path/to/blastx/database ``` -> **Warning:** -> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those -> provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; +> [!WARNING] +> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; > see [docs](https://nf-co.re/usage/configuration#custom-configuration-files). For more details, please refer to the [usage documentation](https://pipelines.tol.sanger.ac.uk/blobtoolkit/usage) and the [parameter documentation](https://pipelines.tol.sanger.ac.uk/blobtoolkit/parameters). @@ -105,8 +104,6 @@ If you would like to contribute to this pipeline, please see the [contributing g ## Citations - - If you use sanger-tol/blobtoolkit for your analysis, please cite it using the following doi: [10.5281/zenodo.7949058](https://doi.org/10.5281/zenodo.7949058) diff --git a/assets/methods_description_template.yml b/assets/methods_description_template.yml index 051e4609..56f46081 100644 --- a/assets/methods_description_template.yml +++ b/assets/methods_description_template.yml @@ -3,17 +3,22 @@ description: "Suggested text and references to use when describing pipeline usag section_name: "sanger-tol/blobtoolkit Methods Description" section_href: "https://github.com/sanger-tol/blobtoolkit" plot_type: "html" -## Update the HTML below to your prefered methods description, e.g. add publication citation for this pipeline +## Update the HTML below to your preferred methods description, e.g. add publication citation for this pipeline ## You inject any metadata in the Nextflow '${workflow}' object data: |

   <h4>Methods</h4>
-  <p>Data was processed using sanger-tol/blobtoolkit v${workflow.manifest.version} ${doi_text} of the sanger-tol collection of workflows, created using nf-core (Ewels et al., 2020).</p>
+  <p>Data was processed using sanger-tol/blobtoolkit v${workflow.manifest.version} ${doi_text} of the sanger-tol collection of workflows, created using nf-core template (Ewels et al., 2020), utilising reproducible software environments from the Bioconda (Grüning et al., 2018) and Biocontainers (da Veiga Leprevost et al., 2017) projects.</p>
   <p>The pipeline was executed with Nextflow v${workflow.nextflow.version} (Di Tommaso et al., 2017) with the following command:</p>
   <pre><code>${workflow.commandLine}</code></pre>
+  <p>${tool_citations}</p>
   <h4>References</h4>
   <ul>
-    <li>Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., & Notredame, C. (2017). Nextflow enables reproducible computational workflows. Nature Biotechnology, 35(4), 316-319. https://doi.org/10.1038/nbt.3820</li>
-    <li>Ewels, P. A., Peltzer, A., Fillinger, S., Patel, H., Alneberg, J., Wilm, A., Garcia, M. U., Di Tommaso, P., & Nahnsen, S. (2020). The nf-core framework for community-curated bioinformatics pipelines. Nature Biotechnology, 38(3), 276-278. https://doi.org/10.1038/s41587-020-0439-x</li>
+    <li>Butt, Z., Challis, R., Kumar, S., Muffato, M., Qi, G., Ramos Díaz, A., & Surana, P. (2023). sanger-tol/blobtoolkit. Zenodo. 10.5281/zenodo.7949058</li>
+    <li>Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., & Notredame, C. (2017). Nextflow enables reproducible computational workflows. Nature Biotechnology, 35(4), 316-319. doi: 10.1038/nbt.3820</li>
+    <li>Ewels, P. A., Peltzer, A., Fillinger, S., Patel, H., Alneberg, J., Wilm, A., Garcia, M. U., Di Tommaso, P., & Nahnsen, S. (2020). The nf-core framework for community-curated bioinformatics pipelines. Nature Biotechnology, 38(3), 276-278. doi: 10.1038/s41587-020-0439-x</li>
+    <li>Grüning, B., Dale, R., Sjödin, A., Chapman, B. A., Rowe, J., Tomkins-Tinch, C. H., Valieris, R., Köster, J., & Bioconda Team. (2018). Bioconda: sustainable and comprehensive software distribution for the life sciences. Nature Methods, 15(7), 475–476. doi: 10.1038/s41592-018-0046-7</li>
+    <li>da Veiga Leprevost, F., Grüning, B. A., Alves Aflitos, S., Röst, H. L., Uszkoreit, J., Barsnes, H., Vaudel, M., Moreno, P., Gatto, L., Weber, J., Bai, M., Jimenez, R. C., Sachsenberg, T., Pfeuffer, J., Vera Alvarez, R., Griss, J., Nesvizhskii, A. I., & Perez-Riverol, Y. (2017). BioContainers: an open-source and community-driven framework for software standardization. Bioinformatics (Oxford, England), 33(16), 2580–2582. doi: 10.1093/bioinformatics/btx192</li>
+    ${tool_bibliography}
   </ul>
   Notes:
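As a hedged illustration of the comment in this template ("You inject any metadata in the Nextflow '${workflow}' object"): the `${workflow.*}` placeholders are substituted at run time and the rendered text lands in the MultiQC report. Assuming a finished run with `--outdir results` (the report file name is MultiQC's default), one way to eyeball the result:

```bash
# Print the rendered methods sentence from the MultiQC report
grep -o 'Data was processed using sanger-tol/blobtoolkit[^<]*' \
    results/multiqc/multiqc_report.html
```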
diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index dec36a50..03f46915 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -1,6 +1,8 @@ report_comment: > - This report has been generated by the sanger-tol/blobtoolkit + + This report has been generated by the sanger-tol/blobtoolkit analysis pipeline. + report_section_order: "sanger-tol-blobtoolkit-methods-description": order: -1000 diff --git a/assets/schema_input.json b/assets/schema_input.json index c315cedb..f08ccb89 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -16,7 +16,7 @@ "datatype": { "type": "string", "pattern": "^\\S+$", - "enum": ["hic", "illumina", "ont", "pacbio"], + "enum": ["hic", "illumina", "ont", "pacbio", "pacbio_clr"], "errorMessage": "Data type, and must be one of: 'hic' or 'illumina' or 'ont' or 'pacbio'" }, "datafile": { diff --git a/assets/sendmail_template.txt b/assets/sendmail_template.txt index 619b2eb5..d9b56ea1 100644 --- a/assets/sendmail_template.txt +++ b/assets/sendmail_template.txt @@ -12,9 +12,9 @@ $email_html Content-Type: image/png;name="sanger-tol-blobtoolkit_logo.png" Content-Transfer-Encoding: base64 Content-ID: -Content-Disposition: inline; filename="sanger-tol-blobtoolkit_logo.png" +Content-Disposition: inline; filename="sanger-tol-blobtoolkit_logo_light.png" -<% out << new File("$projectDir/docs/images/nf-core-blobtoolkit_logo.png"). +<% out << new File("$projectDir/assets/sanger-tol-blobtoolkit_logo_light.png"). bytes. encodeBase64(). toString(). diff --git a/assets/slackreport.json b/assets/slackreport.json index 6db5d148..69cf63c3 100644 --- a/assets/slackreport.json +++ b/assets/slackreport.json @@ -3,7 +3,7 @@ { "fallback": "Plain-text summary of the attachment.", "color": "<% if (success) { %>good<% } else { %>danger<%} %>", - "author_name": "sanger-tol/blobtoolkit v${version} - ${runName}", + "author_name": "sanger-tol/blobtoolkit ${version} - ${runName}", "author_icon": "https://www.nextflow.io/docs/latest/_static/favicon.ico", "text": "<% if (success) { %>Pipeline completed successfully!<% } else { %>Pipeline completed with errors<% } %>", "fields": [ diff --git a/assets/test/mCerEla1.1.buscogenes.dmnd b/assets/test/mMelMel3.1.buscogenes.dmnd similarity index 99% rename from assets/test/mCerEla1.1.buscogenes.dmnd rename to assets/test/mMelMel3.1.buscogenes.dmnd index bccca41d..391345ba 100644 Binary files a/assets/test/mCerEla1.1.buscogenes.dmnd and b/assets/test/mMelMel3.1.buscogenes.dmnd differ diff --git a/assets/test/mMelMel3.1.buscoregions.dmnd b/assets/test/mMelMel3.1.buscoregions.dmnd new file mode 100644 index 00000000..91fa6042 Binary files /dev/null and b/assets/test/mMelMel3.1.buscoregions.dmnd differ diff --git a/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.ndb b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.ndb new file mode 100644 index 00000000..18062436 Binary files /dev/null and b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.ndb differ diff --git a/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nhr b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nhr new file mode 100644 index 00000000..0b5d4906 Binary files /dev/null and b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nhr differ diff --git a/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nin b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nin new file mode 100644 index 00000000..bebd568b Binary files /dev/null and b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nin differ diff --git a/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nog b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nog new file mode 100644 index 
00000000..e6ef79c7 Binary files /dev/null and b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nog differ diff --git a/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nos b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nos new file mode 100644 index 00000000..99700566 Binary files /dev/null and b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nos differ diff --git a/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.not b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.not new file mode 100644 index 00000000..047e8d38 Binary files /dev/null and b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.not differ diff --git a/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nsq b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nsq new file mode 100644 index 00000000..48497573 Binary files /dev/null and b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nsq differ diff --git a/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.ntf b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.ntf new file mode 100644 index 00000000..3be5ea5b Binary files /dev/null and b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.ntf differ diff --git a/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nto b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nto new file mode 100644 index 00000000..6d4a41c7 Binary files /dev/null and b/assets/test/nt_mMelMel3.1/nt_mMelMel3.1.nto differ diff --git a/assets/test/samplesheet_raw.csv b/assets/test/samplesheet_raw.csv new file mode 100644 index 00000000..830753a7 --- /dev/null +++ b/assets/test/samplesheet_raw.csv @@ -0,0 +1,4 @@ +sample,datatype,datafile +mMelMel1,illumina,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/genomic_data/mMelMel1/illumina/31231_3#1_subset.cram +mMelMel2,illumina,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/genomic_data/mMelMel2/illumina/31231_4#1_subset.cram +mMelMel3,hic,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/genomic_data/mMelMel3/hic-arima2/35528_2#1_subset.cram diff --git a/assets/test_full/full_samplesheet.csv b/assets/test_full/full_samplesheet.csv index 88fc7462..6a3ba69d 100644 --- a/assets/test_full/full_samplesheet.csv +++ b/assets/test_full/full_samplesheet.csv @@ -1,3 +1,3 @@ sample,datatype,datafile -gfLaeSulp1,hic,/lustre/scratch123/tol/projects/.sandbox/data/fungi/Laetiporus_sulphureus/analysis/gfLaeSulp1.1/read_mapping/hic/GCA_927399515.1.unmasked.hic.gfLaeSulp1.cram -gfLaeSulp1,pacbio,/lustre/scratch123/tol/projects/.sandbox/data/fungi/Laetiporus_sulphureus/analysis/gfLaeSulp1.1/read_mapping/pacbio/GCA_927399515.1.unmasked.pacbio.gfLaeSulp1.cram +gfLaeSulp1,hic,/lustre/scratch123/tol/resources/nextflow/test-data/Laetiporus_sulphureus/analysis/gfLaeSulp1.1/read_mapping/hic/GCA_927399515.1.unmasked.hic.gfLaeSulp1.cram +gfLaeSulp1,pacbio,/lustre/scratch123/tol/resources/nextflow/test-data/Laetiporus_sulphureus/analysis/gfLaeSulp1.1/read_mapping/pacbio/GCA_927399515.1.unmasked.pacbio.gfLaeSulp1.cram diff --git a/assets/test_full/gfLaeSulp1.1.buscoregions.dmnd b/assets/test_full/gfLaeSulp1.1.buscoregions.dmnd new file mode 100644 index 00000000..3f2a1a54 Binary files /dev/null and b/assets/test_full/gfLaeSulp1.1.buscoregions.dmnd differ diff --git a/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.ndb b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.ndb new file mode 100644 index 00000000..0905629a Binary files /dev/null and b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.ndb differ diff --git a/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nhr b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nhr new file mode 100644 index 00000000..1fa3521a Binary files /dev/null and 
b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nhr differ
diff --git a/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nin b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nin
new file mode 100644
index 00000000..0503c4c7
Binary files /dev/null and b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nin differ
diff --git a/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nog b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nog
new file mode 100644
index 00000000..7dcd60eb
Binary files /dev/null and b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nog differ
diff --git a/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nos b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nos
new file mode 100644
index 00000000..6bd1dcdf
Binary files /dev/null and b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nos differ
diff --git a/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.not b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.not
new file mode 100644
index 00000000..8bacddec
Binary files /dev/null and b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.not differ
diff --git a/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nsq b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nsq
new file mode 100644
index 00000000..6afe38e9
Binary files /dev/null and b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nsq differ
diff --git a/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.ntf b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.ntf
new file mode 100644
index 00000000..efd34086
Binary files /dev/null and b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.ntf differ
diff --git a/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nto b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nto
new file mode 100644
index 00000000..4b140ec3
Binary files /dev/null and b/assets/test_full/nt_gfLaeSulp1.1/nt_gfLaeSulp1.1.nto differ
diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
index 72e3f485..f5bf5c5b 100755
--- a/bin/check_samplesheet.py
+++ b/bin/check_samplesheet.py
@@ -24,12 +24,16 @@ class RowChecker:
     """
 
-    VALID_FORMATS = (".cram",)
+    VALID_FORMATS = (
+        ".cram",
+        ".bam",
+    )
 
     VALID_DATATYPES = (
         "hic",
         "illumina",
         "pacbio",
+        "pacbio_clr",
         "ont",
     )
 
diff --git a/bin/nohitlist.sh b/bin/nohitlist.sh
new file mode 100755
index 00000000..c935cebe
--- /dev/null
+++ b/bin/nohitlist.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+# input
+fasta=$1
+blast=$2
+prefix=$3
+E=$4
+
+# find ids of sequences with no hits in the blastx search
+grep '>' $fasta | \
+    grep -v -w -f <(awk -v evalue="$E" '{if ($14 < evalue) {print $1}}' $blast | sort | uniq) | \
+    cut -f1 | sed 's/>//' > $prefix.nohit.txt
+
+
diff --git a/bin/update_versions.py b/bin/update_versions.py
new file mode 100755
index 00000000..0978393c
--- /dev/null
+++ b/bin/update_versions.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python3
+
+"""Script to update the software versions in meta.json"""
+
+import argparse
+import sys
+import json
+import yaml
+
+
+def parse_args(args=None):
+    Description = "Update the software versions in meta.json."
+ + parser = argparse.ArgumentParser(description=Description) + parser.add_argument("--meta", help="Input JSON file.", required=True) + parser.add_argument("--software", help="Input YAML file.", required=True) + parser.add_argument("--version", action="version", version="%(prog)s 1.0.0") + return parser.parse_args(args) + + +def update_meta(meta, software): + with open(meta) as fh: + infile = json.load(fh) + + with open(software) as fh: + versions = yaml.safe_load(fh) + + new_dict = dict() + for k, v in versions.items(): + new_dict.update(v) + + infile["settings"]["pipeline"] = "https://github.com/sanger-tol/blobtoolkit" + infile["settings"]["release"] = new_dict["sanger-tol/blobtoolkit"] + + del new_dict["sanger-tol/blobtoolkit"] + infile["settings"]["software_versions"] = new_dict + + return infile + + +def main(args=None): + args = parse_args(args) + + data = update_meta(args.meta, args.software) + with open(args.meta, "w") as fh: + json.dump(data, fh) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/windowstats_input.py b/bin/windowstats_input.py index dcd1bedb..29d98c2f 100755 --- a/bin/windowstats_input.py +++ b/bin/windowstats_input.py @@ -12,7 +12,7 @@ def parse_args(args=None): parser = argparse.ArgumentParser(description=Description) parser.add_argument("--freq", help="Frequence fasta windows input file", required=True) parser.add_argument("--mononuc", help="Mononucleotide fasta windows input file", required=True) - parser.add_argument("--mosdepth", help="Mosdepth coverage input file", nargs="+", required=True) + parser.add_argument("--depth", help="Depth coverage input file", nargs="+", required=True) parser.add_argument("--countbusco", help="BUSCO gene counts by region", required=True) parser.add_argument("--output", help="Output TSV file.", required=True) parser.add_argument("--version", action="version", version="%(prog)s 1.0.0") @@ -24,7 +24,7 @@ def make_dir(path): os.makedirs(path, exist_ok=True) -def merge_all(freq, mononuc, mosdepth, countbusco): +def merge_all(freq, mononuc, depth, countbusco): freq_fw = pd.read_csv(freq, sep="\t") mononuc_fw = pd.read_csv(mononuc, sep="\t") combo_fw = freq_fw.merge(mononuc_fw).rename( @@ -32,7 +32,7 @@ def merge_all(freq, mononuc, mosdepth, countbusco): ) count_df = pd.read_csv(countbusco, sep="\t").rename(columns={"ID": "sequence"}) - for f in mosdepth: + for f in depth: tag = os.path.basename(f).replace(".regions.bed.gz", "") cov_df = pd.read_csv( f, @@ -52,7 +52,7 @@ def main(args=None): out_dir = os.path.dirname(args.output) make_dir(out_dir) - merge_all(args.freq, args.mononuc, args.mosdepth, args.countbusco).to_csv(args.output, sep="\t", index=False) + merge_all(args.freq, args.mononuc, args.depth, args.countbusco).to_csv(args.output, sep="\t", index=False) if __name__ == "__main__": diff --git a/conf/modules.config b/conf/modules.config index ebf62694..155111ab 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -14,34 +14,80 @@ process { withName: "SAMPLESHEET_CHECK" { publishDir = [ - path: { "${params.outdir}/blobtoolkit_info" }, + path: { "${params.outdir}/pipeline_info/blobtoolkit" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals("versions.yml") ? 
null : filename } ] } - withName: "GOAT_TAXONSEARCH" { - ext.args = "-l -b" + withName: "WINDOWMASKER_MKCOUNTS" { + ext.args = "-infmt fasta -sformat obinary" + } + + withName: "WINDOWMASKER_USTAT" { + ext.args = "-infmt fasta -dust T -outfmt fasta" + } + + withName: "MINIMAP2_HIC" { + ext.args = "-ax sr" + } + + withName: "MINIMAP2_ILMN" { + ext.args = "-ax sr" + } + + withName: "MINIMAP2_CCS" { + ext.args = "-ax map-hifi --cs=short" + } + + withName: "MINIMAP2_CLR" { + ext.args = "-ax map-pb" + } + + withName: "MINIMAP2_ONT" { + ext.args = "-ax map-ont" } withName: "SAMTOOLS_VIEW" { ext.args = "--output-fmt bam --write-index" } + withName: "SAMTOOLS_INDEX" { + ext.args = "-c" + } + + withName: "GOAT_TAXONSEARCH" { + ext.args = "--lineage --busco" + } + withName: "BUSCO" { scratch = true - ext.args = "--mode genome --force" + // Overridden in the test profile, see at the end of this file + ext.args = "--force" + publishDir = [ + path: { "${params.outdir}/BUSCO" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals("versions.yml") ? null : filename } + ] } withName: "DIAMOND_BLASTP" { ext.args = "--evalue 1.0e-25 --max-target-seqs 10 --max-hsps 1" } + withName: "DIAMOND_BLASTX" { + ext.args = "--evalue 1.0e-25 --max-target-seqs 10 --max-hsps 1" + } + + withName: "BLOBTK_DEPTH" { + ext.args = "-s 1000" + } + withName: "BLOBTOOLKIT_WINDOWSTATS" { ext.args = "--window 0.1 --window 0.01 --window 1 --window 100000 --window 1000000" } - withName: "BLOBTOOLKIT_BLOBDIR" { + withName: "BLOBTOOLKIT_CREATEBLOBDIR" { ext.args = "--evalue 1.0e-25 --hit-count 10" publishDir = [ path: { "${params.outdir}/" }, @@ -50,6 +96,15 @@ process { ] } + withName: "BLOBTOOLKIT_UPDATEBLOBDIR" { + ext.args = "--evalue 1.0e-25 --hit-count 10 --update-plot" + publishDir = [ + path: { "${params.outdir}/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals("versions.yml") ? null : filename } + ] + } + withName: "BLOBTOOLKIT_SUMMARY" { publishDir = [ path: { "${params.outdir}/${blobdir.name}/" }, @@ -58,7 +113,7 @@ process { ] } - withName: "BLOBTOOLKIT_IMAGES" { + withName: "BLOBTK_IMAGES" { publishDir = [ path: { "${params.outdir}/${blobdir.name}/" }, mode: params.publish_dir_mode, @@ -66,16 +121,40 @@ process { ] } + withName: "BLOBTOOLKIT_CHUNK" { + ext.args = "--chunk 100000 --overlap 0 --max-chunks 10 --min-length 1000" + } + + withName: "BLOBTOOLKIT_UNCHUNK" { + ext.args = "--count 10" + } + + withName: "NOHIT_LIST" { + ext.args = "1.0e-25" + } + + withName: "BLAST_BLASTN" { + ext.args = "-outfmt '6 qseqid staxids bitscore std' -max_target_seqs 10 -max_hsps 1 -evalue 1.0e-10 -lcase_masking -dust '20 64 1'" + } + withName: "CUSTOM_DUMPSOFTWAREVERSIONS" { publishDir = [ - path: { "${params.outdir}/blobtoolkit_info" }, + path: { "${params.outdir}/pipeline_info/blobtoolkit" }, mode: params.publish_dir_mode, pattern: "*_versions.yml" ] } - withName: MULTIQC { - ext.args = params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' + withName: "BLOBTOOLKIT_UPDATEMETA" { + publishDir = [ + path: { "${params.outdir}/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals("versions.yml") ? null : filename } + ] + } + + withName: 'MULTIQC' { + ext.args = { params.multiqc_title ? 
"--title \"$params.multiqc_title\"" : '' } publishDir = [ path: { "${params.outdir}/multiqc" }, mode: params.publish_dir_mode, @@ -84,3 +163,27 @@ process { } } + + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Additional configuration to speed processes up during testing. + +---------------------------------------------------------------------------------------- +*/ + +profiles { + test { + process { + withName: BUSCO { + // Note: BUSCO *must* see the double-quotes around the parameters + ext.args = '--force --metaeuk_parameters \'"-s=2"\' --metaeuk_rerun_parameters \'"-s=2"\'' + publishDir = [ + path: { "${params.outdir}/BUSCO" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals("versions.yml") ? null : filename } + ] + } + } + } +} diff --git a/conf/test.config b/conf/test.config index 165bfff6..221a0f22 100644 --- a/conf/test.config +++ b/conf/test.config @@ -12,7 +12,7 @@ params { config_profile_name = 'Test profile' - config_profile_description = 'Minimal test dataset to check pipeline function' + config_profile_description = 'Minimal aligned test dataset to check pipeline function' // Limit resources so that this can run on GitHub Actions max_cpus = 2 @@ -22,15 +22,17 @@ params { // Input test data // Specify the paths to your test data // Give any required params for the test so that command line flags are not needed - input = "${projectDir}/assets/test/samplesheet.csv" + input = "${projectDir}/assets/test/samplesheet_s3.csv" // Fasta references - fasta = "/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.fasta.gz" + fasta = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.fasta.gz" accession = "GCA_922984935.2" taxon = "Meles meles" // Databases - taxdump = "/lustre/scratch123/tol/teams/grit/geval_pipeline/btk_databases/taxdump" - busco = "/lustre/scratch123/tol/resources/nextflow/busco_2021_06_reduced/" - uniprot = "${projectDir}/assets/test/mCerEla1.1.buscogenes.dmnd" + taxdump = "/lustre/scratch123/tol/teams/grit/geval_pipeline/btk_databases/taxdump" + busco = "/lustre/scratch123/tol/resources/nextflow/busco/blobtoolkit.GCA_922984935.2.2023-08-03" + blastp = "${projectDir}/assets/test/mMelMel3.1.buscogenes.dmnd" + blastx = "${projectDir}/assets/test/mMelMel3.1.buscoregions.dmnd" + blastn = "${projectDir}/assets/test/nt_mMelMel3.1" } diff --git a/conf/test_full.config b/conf/test_full.config index ee22dba2..ff1ac068 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -10,8 +10,6 @@ ---------------------------------------------------------------------------------------- */ -// cleanup = true - params { config_profile_name = 'Full test profile' config_profile_description = 'Full test dataset to check pipeline function' @@ -22,12 +20,14 @@ params { input = "${projectDir}/assets/test_full/full_samplesheet.csv" // Fasta references - fasta = "/lustre/scratch124/tol/projects/darwin/data/fungi/Laetiporus_sulphureus/assembly/release/gfLaeSulp1.1/insdc/GCA_927399515.1.fasta.gz" + fasta = "https://tolit.cog.sanger.ac.uk/test-data/Laetiporus_sulphureus/assembly/release/gfLaeSulp1.1/insdc/GCA_927399515.1.fasta.gz" accession = "GCA_927399515.1" taxon = "Laetiporus sulphureus" // Databases - taxdump = "/lustre/scratch123/tol/teams/grit/geval_pipeline/btk_databases/taxdump" - busco = "/lustre/scratch123/tol/resources/busco/v5/" - uniprot = 
"${projectDir}/assets/test_full/gfLaeSulp1.1.buscogenes.dmnd" + taxdump = "/lustre/scratch123/tol/teams/grit/geval_pipeline/btk_databases/taxdump" + busco = "/lustre/scratch123/tol/resources/busco/latest" + blastp = "${projectDir}/assets/test_full/gfLaeSulp1.1.buscogenes.dmnd" + blastx = "${projectDir}/assets/test_full/gfLaeSulp1.1.buscoregions.dmnd" + blastn = "${projectDir}/assets/test_full/nt_gfLaeSulp1.1" } diff --git a/conf/test_raw.config b/conf/test_raw.config new file mode 100644 index 00000000..6d4174c2 --- /dev/null +++ b/conf/test_raw.config @@ -0,0 +1,39 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run sanger-tol/blobtoolkit -profile test_raw, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Raw test profile' + config_profile_description = 'Minimal raw test dataset to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input test data + // Specify the paths to your test data + // Give any required params for the test so that command line flags are not needed + input = "${projectDir}/assets/test/samplesheet_raw.csv" + align = true + + // Fasta references + fasta = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.fasta.gz" + accession = "GCA_922984935.2" + taxon = "Meles meles" + + // Databases + taxdump = "/lustre/scratch123/tol/teams/grit/geval_pipeline/btk_databases/taxdump" + busco = "/lustre/scratch123/tol/resources/nextflow/busco/blobtoolkit.GCA_922984935.2.2023-08-03" + blastp = "${projectDir}/assets/test/mMelMel3.1.buscogenes.dmnd" + blastx = "${projectDir}/assets/test/mMelMel3.1.buscoregions.dmnd" + blastn = "${projectDir}/assets/test/nt_mMelMel3.1/" +} diff --git a/docs/output.md b/docs/output.md index 437c6df7..ffa089a9 100644 --- a/docs/output.md +++ b/docs/output.md @@ -52,10 +52,11 @@ Results generated by MultiQC collate pipeline QC from supported tools. The pipel
Output files

-- `blobtoolkit_info/`
+- `pipeline_info/blobtoolkit/`
   - Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`.
   - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameters are used when running the pipeline.
   - Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`.
+  - Parameters used by the pipeline run: `params.json`.
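A quick sanity check of these files after a run; a sketch assuming `--outdir results`, with file names taken from the list above:

```bash
ls results/pipeline_info/blobtoolkit/
# Expected, per the list above: execution_report.html, execution_timeline.html,
# execution_trace.txt, pipeline_dag.*, samplesheet.valid.csv, params.json,
# software_versions.yml and, only with --email / --email_on_fail, pipeline_report.*
```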
diff --git a/docs/usage.md b/docs/usage.md index 71b07d05..4e4c9d7c 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -43,10 +43,10 @@ sample3,ont,ont.cram | Column | Description | | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (\_). | -| `datatype` | Type of sequencing data. Must be one of `hic`, `illumina`, `pacbio`, or `ont`. | +| `datatype` | Type of sequencing data. Must be one of `hic`, `illumina`, `pacbio`, `pacbio_clr` or `ont`. | | `datafile` | Full path to read data file. | -An [example samplesheet](https://raw.githubusercontent.com/sanger-tol/blobtoolkit/main/assets/test/samplesheet.csv) has been provided with the pipeline. +An [example samplesheet](assets/test/samplesheet.csv) has been provided with the pipeline. ## Getting databases ready for the pipeline @@ -159,14 +159,12 @@ find v5/data -name "*.tar.gz" | parallel "cd {//}; tar -xzf {/}" ## YAML File and Nextflow configuration -As in the Snakemake version [a YAML configuration file](https://github.com/blobtoolkit/blobtoolkit/tree/main/src/blobtoolkit-pipeline/src#configuration) is needed to generate metadata summary. This YAML config file can be generated with a genome accession value for released assemblies (for example, GCA_XXXXXXXXX.X) or can be passed for draft assemblies (for example, [GCA_922984935.2.yaml](https://raw.githubusercontent.com/sanger-tol/blobtoolkit/main/assets/test/GCA_922984935.2.yaml) using the `--yaml` parameter. Even for draft assemblies, a placeholder value should be passed with the `--accession` parameter. +As in the Snakemake version [a YAML configuration file](https://github.com/blobtoolkit/blobtoolkit/tree/main/src/blobtoolkit-pipeline/src#configuration) is needed to generate metadata summary. This YAML config file can be generated with a genome accession value for released assemblies (for example, GCA_XXXXXXXXX.X) or can be passed for draft assemblies (for example, [GCA_922984935.2.yaml](assets/test/GCA_922984935.2.yaml) using the `--yaml` parameter. Even for draft assemblies, a placeholder value should be passed with the `--accession` parameter. The data in the YAML is currently ignored in the Nextflow pipeline version. The YAML file is retained only to allow compatibility with the BlobDir dataset generated by the [Snakemake version](https://github.com/blobtoolkit/blobtoolkit/tree/main/src/blobtoolkit-pipeline/src). The taxonomic information in the YAML file can be obtained from [NCBI Taxonomy](https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/). ## Changes from Snakemake to Nextflow -The current version of Nextflow pipeline is not compatible with the public version of the [GenomeHubs BlobToolKit portal](https://blobtoolkit.genomehubs.org). 
- ### Commands Snakemake @@ -183,10 +181,10 @@ Nextflow ```bash # Public Assemblies -nextflow run sanger-tol/blobtoolkit --input SAMPLESHEET --fasta GENOME –-accession GCA_ACCESSION --taxon TAXON_ID --taxdump TAXDUMP_DB --uniprot DMND_db +nextflow run sanger-tol/blobtoolkit --input SAMPLESHEET --fasta GENOME --accession GCA_ACCESSION --taxon TAXON_ID --taxdump TAXDUMP_DB --blastp DMND_db --blastn BLASTN_DB --blastx BLASTX_DB # Draft Assemblies -nextflow run sanger-tol/blobtoolkit --input SAMPLESHEET --fasta GENOME –-accession TAG --taxon TAXON_ID --yaml CONFIG --taxdump TAXDUMP_DB --uniprot DMND_db +nextflow run sanger-tol/blobtoolkit --input SAMPLESHEET --fasta GENOME --accession TAG --taxon TAXON_ID --yaml CONFIG --taxdump TAXDUMP_DB --blastp DMND_db --blastn BLASTN_DB --blastx BLASTX_DB ``` ### Subworkflows @@ -194,31 +192,31 @@ nextflow run sanger-tol/blobtoolkit --input SAMPLESHEET --fasta GENOME –-acces Here is a full list of Snakemake subworkflows and their Nextflow counterparts: - **`minimap.smk`** - - Not implemented yet. - - Alignment is done using the [sanger-tol/readmapping](https://github.com/sanger-tol/readmapping) pipeline. + - Implemented as [`minimap_alignment.nf`](subworkflows/local/minimap_alignment.nf). + - Optimised alignment is done using the [sanger-tol/readmapping](https://github.com/sanger-tol/readmapping) pipeline. - **`windowmasker.smk`** - - Not implemented yet. - - Genomes downloaded by [sanger-tol/insdcdownload](https://github.com/sanger-tol/insdcdownload) is masked. + - Implemented as part of [`prepare_genome.nf`](subworkflows/local/prepare_genome.nf). + - Genomes downloaded by [sanger-tol/insdcdownload](https://github.com/sanger-tol/insdcdownload) are already masked. - **`chunk_stats.smk`** - - Subworkflow has been modified. + - Modified implementation as part of [`coverage_stats.nf`](subworkflows/local/coverage_stats.nf). - BED file and additional statistics calculated using [`fasta_windows`](https://github.com/tolkit/fasta_windows). - **`busco.smk`** - - Implemented as [`busco_diamond_blastp.nf`](https://github.com/sanger-tol/blobtoolkit/blob/main/subworkflows/local/busco_diamond_blastp.nf). + - Implemented as [`busco_diamond_blastp.nf`](subworkflows/local/busco_diamond_blastp.nf). - **`cov_stats.smk`** - - The coverage calculation are done using [`mosdepth`]() in subworkflow [`coverage_stats.nf`](https://github.com/sanger-tol/blobtoolkit/blob/main/subworkflows/local/coverage_stats.nf). - - Combining the various tsv files in done in subworkflow [`collate_stats.nf`](https://github.com/sanger-tol/blobtoolkit/blob/main/subworkflows/local/collate_stats.nf). + - Implemented as part of [`coverage_stats.nf`](subworkflows/local/coverage_stats.nf). + - Combining the various tsv files is done in subworkflow [`collate_stats.nf`](subworkflows/local/collate_stats.nf). - **`window_stats.smk`** - - The [`window_stats`]() process in implemented in subworkflow [`collate_stats.nf`](https://github.com/sanger-tol/blobtoolkit/blob/main/subworkflows/local/collate_stats.nf). + - Implemented as part of [`collate_stats.nf`](subworkflows/local/collate_stats.nf). - **`diamond_blastp.smk`** - - Implemented within [`busco_diamond_blastp.nf`](https://github.com/sanger-tol/blobtoolkit/blob/main/subworkflows/local/busco_diamond_blastp.nf). + - Implemented as [`busco_diamond_blastp.nf`](subworkflows/local/busco_diamond_blastp.nf). - **`diamond.smk`** - - Will be implemented as `diamond_blastx.nf`. + - Implemented as [`run_blastx.nf`](subworkflows/local/run_blastx.nf).
- **`blastn.smk`** - - Will be implemented as `blastn.nf`. + - Implemented as [`run_blastn.nf`](subworkflows/local/run_blastn.nf). - **`blobtools.smk`** - - Implemented as [`blobtools.nf`](https://github.com/sanger-tol/blobtoolkit/blob/main/subworkflows/local/blobtools.nf). + - Implemented as [`blobtools.nf`](subworkflows/local/blobtools.nf). - **`view.smk`** - - Implemented as [`view.nf`](https://github.com/sanger-tol/blobtoolkit/blob/main/subworkflows/local/view.nf). + - Implemented as [`view.nf`](subworkflows/local/view.nf). ### Software dependencies @@ -226,22 +224,21 @@ List of tools for any given dataset can be fetched from the API, for example htt | Dependency | Snakemake | Nextflow | | ----------------- | --------- | -------- | -| blobtoolkit | 4.1.5 | 4.1.5 | -| blast | 2.12.0 | | -| blobtk | 0.2.4 | | -| busco | 5.3.2 | 5.4.3 | -| diamond | 2.0.15 | | +| blobtoolkit | 4.3.2 | 4.3.2 | +| blast | 2.12.0 | 2.14.1 | +| blobtk | 0.5.0 | 0.5.1 | +| busco | 5.3.2 | 5.5.0 | +| diamond | 2.0.15 | 2.1.8 | | fasta_windows | | 0.2.4 | -| goat | | 0.2.0 | -| minimap2 | 2.24 | | -| mosdepth | | 0.3.3 | +| goat | | 0.2.5 | +| minimap2 | 2.24 | 2.24 | | ncbi-datasets-cli | 14.1.0 | | -| nextflow | | 22.10.6 | -| python | 3.9.13 | 3.10.6 | -| samtools | 1.15.1 | 1.15.1 | +| nextflow | | 23.10.0 | +| python | 3.9.13 | 3.12.0 | +| samtools | 1.15.1 | 1.18 | | seqtk | 1.3 | | | snakemake | 7.19.1 | | -| windowmasker | 2.12.0 | | +| windowmasker | 2.12.0 | 2.14.0 | > **NB:** Dependency has been **added** if only the Nextflow version information is present. > **NB:** Dependency has been **removed** if only the Snakemake version information is present. @@ -252,7 +249,7 @@ List of tools for any given dataset can be fetched from the API, for example htt The typical command for running the pipeline is as follows: ```bash -nextflow run sanger-tol/blobtoolkit --input samplesheet.csv --outdir --fasta genome.fasta -profile docker –-accession GCA_922984935.2 --taxon "Meles meles" --taxdump /path/to/taxdump --uniprot /path/to/buscogenes.dmnd +nextflow run sanger-tol/blobtoolkit --input samplesheet.csv --outdir <OUTDIR> --fasta genome.fasta -profile docker --accession GCA_accession --taxon "species name" --taxdump /path/to/taxdump --blastp /path/to/buscogenes.dmnd --blastn /path/to/blastn.nt --blastx /path/to/buscoregions.dmnd ``` This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. @@ -270,8 +267,11 @@ If you wish to repeatedly use the same parameters for multiple runs, rather than Pipeline settings can be provided in a `yaml` or `json` file via `-params-file <file>`. -> ⚠️ Do not use `-c ` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args). -> The above pipeline run specified with a params file in yaml format: +:::warning +Do not use `-c <file>` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args).
+::: + +The above pipeline run specified with a params file in yaml format: ```bash nextflow run sanger-tol/blobtoolkit -profile docker -params-file params.yaml ``` with `params.yaml` containing: ```yaml input: './samplesheet.csv' outdir: './results/' -genome: 'GRCh37' -input: 'data' +fasta: 'genome.fasta' <...> ``` @@ -307,11 +306,15 @@ This version number will be logged in reports when you run the pipeline, so that To further assist in reproducibility, you can share and re-use [parameter files](#running-the-pipeline) to repeat pipeline runs with the same settings without having to write out a command with every single parameter. -> 💡 If you wish to share such profile (such as upload as supplementary material for academic publications), make sure to NOT include cluster specific paths to files, nor institutional specific profiles. +:::tip +If you wish to share such a profile (such as uploading it as supplementary material for academic publications), make sure to NOT include cluster-specific paths to files, nor institution-specific profiles. +::: ## Core Nextflow arguments -> **NB:** These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen). +:::note +These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen). +::: ### `-profile` Use this parameter to choose a configuration profile. Profiles can give configur Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Apptainer, Conda) - see below. -> We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported. +:::info +We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility; however, when this is not possible, Conda is also supported. +::: The pipeline also dynamically loads configurations from [https://github.com/nf-core/configs](https://github.com/nf-core/configs) when it runs, making multiple config profiles for various institutional clusters available at run time. For more information and to see if your system is available in these configs please see the [nf-core/configs documentation](https://github.com/nf-core/configs#documentation). @@ -360,7 +365,7 @@ Specify the path to a specific config file (this is a core Nextflow command). Se ### Resource requests -Whilst the default requirements set within the pipeline will hopefully work for most people and with most input data, you may find that you want to customise the compute resources that the pipeline requests. Each step in the pipeline has a default set of requirements for number of CPUs, memory and time. For most of the steps in the pipeline, if the job exits with any of the error codes specified [here](https://github.com/sanger-tol/blobtoolkit/blob/56906ffb5737e4b985797bb5fb4b9c94cfe69600/conf/base.config#L18) it will automatically be resubmitted with higher requests (2 x original, then 3 x original). If it still fails after the third attempt then the pipeline execution is stopped. +Whilst the default requirements set within the pipeline will hopefully work for most people and with most input data, you may find that you want to customise the compute resources that the pipeline requests. Each step in the pipeline has a default set of requirements for number of CPUs, memory and time.
For most of the steps in the pipeline, if the job exits with any of the error codes specified [here](conf/base.config#L18) it will automatically be resubmitted with higher requests (2 x original, then 3 x original). If it still fails after the third attempt then the pipeline execution is stopped. To change the resource requests, please see the [max resources](https://nf-co.re/docs/usage/configuration#max-resources) and [tuning workflow resources](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources) section of the nf-core website. diff --git a/lib/NfcoreSchema.groovy b/lib/NfcoreSchema.groovy deleted file mode 100755 index 9b34804d..00000000 --- a/lib/NfcoreSchema.groovy +++ /dev/null @@ -1,530 +0,0 @@ -// -// This file holds several functions used to perform JSON parameter validation, help and summary rendering for the nf-core pipeline template. -// - -import nextflow.Nextflow -import org.everit.json.schema.Schema -import org.everit.json.schema.loader.SchemaLoader -import org.everit.json.schema.ValidationException -import org.json.JSONObject -import org.json.JSONTokener -import org.json.JSONArray -import groovy.json.JsonSlurper -import groovy.json.JsonBuilder - -class NfcoreSchema { - - // - // Resolve Schema path relative to main workflow directory - // - public static String getSchemaPath(workflow, schema_filename='nextflow_schema.json') { - return "${workflow.projectDir}/${schema_filename}" - } - - // - // Function to loop over all parameters defined in schema and check - // whether the given parameters adhere to the specifications - // - /* groovylint-disable-next-line UnusedPrivateMethodParameter */ - public static void validateParameters(workflow, params, log, schema_filename='nextflow_schema.json') { - def has_error = false - //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - // Check for nextflow core params and unexpected params - def json = new File(getSchemaPath(workflow, schema_filename=schema_filename)).text - def Map schemaParams = (Map) new JsonSlurper().parseText(json).get('definitions') - def nf_params = [ - // Options for base `nextflow` command - 'bg', - 'c', - 'C', - 'config', - 'd', - 'D', - 'dockerize', - 'h', - 'log', - 'q', - 'quiet', - 'syslog', - 'v', - - // Options for `nextflow run` command - 'ansi', - 'ansi-log', - 'bg', - 'bucket-dir', - 'c', - 'cache', - 'config', - 'dsl2', - 'dump-channels', - 'dump-hashes', - 'E', - 'entry', - 'latest', - 'lib', - 'main-script', - 'N', - 'name', - 'offline', - 'params-file', - 'pi', - 'plugins', - 'poll-interval', - 'pool-size', - 'profile', - 'ps', - 'qs', - 'queue-size', - 'r', - 'resume', - 'revision', - 'stdin', - 'stub', - 'stub-run', - 'test', - 'w', - 'with-apptainer', - 'with-charliecloud', - 'with-conda', - 'with-dag', - 'with-docker', - 'with-mpi', - 'with-notification', - 'with-podman', - 'with-report', - 'with-singularity', - 'with-timeline', - 'with-tower', - 'with-trace', - 'with-weblog', - 'without-docker', - 'without-podman', - 'work-dir' - ] - def unexpectedParams = [] - - // Collect expected parameters from the schema - def expectedParams = [] - def enums = [:] - for (group in schemaParams) { - for (p in group.value['properties']) { - expectedParams.push(p.key) - if (group.value['properties'][p.key].containsKey('enum')) { - enums[p.key] = group.value['properties'][p.key]['enum'] - } - } - } - - for (specifiedParam in params.keySet()) { - // nextflow params - if (nf_params.contains(specifiedParam)) { - log.error "ERROR: You used a core Nextflow option with two 
hyphens: '--${specifiedParam}'. Please resubmit with '-${specifiedParam}'" - has_error = true - } - // unexpected params - def params_ignore = params.schema_ignore_params.split(',') + 'schema_ignore_params' - def expectedParamsLowerCase = expectedParams.collect{ it.replace("-", "").toLowerCase() } - def specifiedParamLowerCase = specifiedParam.replace("-", "").toLowerCase() - def isCamelCaseBug = (specifiedParam.contains("-") && !expectedParams.contains(specifiedParam) && expectedParamsLowerCase.contains(specifiedParamLowerCase)) - if (!expectedParams.contains(specifiedParam) && !params_ignore.contains(specifiedParam) && !isCamelCaseBug) { - // Temporarily remove camelCase/camel-case params #1035 - def unexpectedParamsLowerCase = unexpectedParams.collect{ it.replace("-", "").toLowerCase()} - if (!unexpectedParamsLowerCase.contains(specifiedParamLowerCase)){ - unexpectedParams.push(specifiedParam) - } - } - } - - //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - // Validate parameters against the schema - InputStream input_stream = new File(getSchemaPath(workflow, schema_filename=schema_filename)).newInputStream() - JSONObject raw_schema = new JSONObject(new JSONTokener(input_stream)) - - // Remove anything that's in params.schema_ignore_params - raw_schema = removeIgnoredParams(raw_schema, params) - - Schema schema = SchemaLoader.load(raw_schema) - - // Clean the parameters - def cleanedParams = cleanParameters(params) - - // Convert to JSONObject - def jsonParams = new JsonBuilder(cleanedParams) - JSONObject params_json = new JSONObject(jsonParams.toString()) - - // Validate - try { - schema.validate(params_json) - } catch (ValidationException e) { - println '' - log.error 'ERROR: Validation of pipeline parameters failed!' 
- JSONObject exceptionJSON = e.toJSON() - printExceptions(exceptionJSON, params_json, log, enums) - println '' - has_error = true - } - - // Check for unexpected parameters - if (unexpectedParams.size() > 0) { - Map colors = NfcoreTemplate.logColours(params.monochrome_logs) - println '' - def warn_msg = 'Found unexpected parameters:' - for (unexpectedParam in unexpectedParams) { - warn_msg = warn_msg + "\n* --${unexpectedParam}: ${params[unexpectedParam].toString()}" - } - log.warn warn_msg - log.info "- ${colors.dim}Ignore this warning: params.schema_ignore_params = \"${unexpectedParams.join(',')}\" ${colors.reset}" - println '' - } - - if (has_error) { - Nextflow.error('Exiting!') - } - } - - // - // Beautify parameters for --help - // - public static String paramsHelp(workflow, params, command, schema_filename='nextflow_schema.json') { - Map colors = NfcoreTemplate.logColours(params.monochrome_logs) - Integer num_hidden = 0 - String output = '' - output += 'Typical pipeline command:\n\n' - output += " ${colors.cyan}${command}${colors.reset}\n\n" - Map params_map = paramsLoad(getSchemaPath(workflow, schema_filename=schema_filename)) - Integer max_chars = paramsMaxChars(params_map) + 1 - Integer desc_indent = max_chars + 14 - Integer dec_linewidth = 160 - desc_indent - for (group in params_map.keySet()) { - Integer num_params = 0 - String group_output = colors.underlined + colors.bold + group + colors.reset + '\n' - def group_params = params_map.get(group) // This gets the parameters of that particular group - for (param in group_params.keySet()) { - if (group_params.get(param).hidden && !params.show_hidden_params) { - num_hidden += 1 - continue; - } - def type = '[' + group_params.get(param).type + ']' - def description = group_params.get(param).description - def defaultValue = group_params.get(param).default != null ? " [default: " + group_params.get(param).default.toString() + "]" : '' - def description_default = description + colors.dim + defaultValue + colors.reset - // Wrap long description texts - // Loosely based on https://dzone.com/articles/groovy-plain-text-word-wrap - if (description_default.length() > dec_linewidth){ - List olines = [] - String oline = "" // " " * indent - description_default.split(" ").each() { wrd -> - if ((oline.size() + wrd.size()) <= dec_linewidth) { - oline += wrd + " " - } else { - olines += oline - oline = wrd + " " - } - } - olines += oline - description_default = olines.join("\n" + " " * desc_indent) - } - group_output += " --" + param.padRight(max_chars) + colors.dim + type.padRight(10) + colors.reset + description_default + '\n' - num_params += 1 - } - group_output += '\n' - if (num_params > 0){ - output += group_output - } - } - if (num_hidden > 0){ - output += colors.dim + "!! 
Hiding $num_hidden params, use --show_hidden_params to show them !!\n" + colors.reset - } - output += NfcoreTemplate.dashedLine(params.monochrome_logs) - return output - } - - // - // Groovy Map summarising parameters/workflow options used by the pipeline - // - public static LinkedHashMap paramsSummaryMap(workflow, params, schema_filename='nextflow_schema.json') { - // Get a selection of core Nextflow workflow options - def Map workflow_summary = [:] - if (workflow.revision) { - workflow_summary['revision'] = workflow.revision - } - workflow_summary['runName'] = workflow.runName - if (workflow.containerEngine) { - workflow_summary['containerEngine'] = workflow.containerEngine - } - if (workflow.container) { - workflow_summary['container'] = workflow.container - } - workflow_summary['launchDir'] = workflow.launchDir - workflow_summary['workDir'] = workflow.workDir - workflow_summary['projectDir'] = workflow.projectDir - workflow_summary['userName'] = workflow.userName - workflow_summary['profile'] = workflow.profile - workflow_summary['configFiles'] = workflow.configFiles.join(', ') - - // Get pipeline parameters defined in JSON Schema - def Map params_summary = [:] - def params_map = paramsLoad(getSchemaPath(workflow, schema_filename=schema_filename)) - for (group in params_map.keySet()) { - def sub_params = new LinkedHashMap() - def group_params = params_map.get(group) // This gets the parameters of that particular group - for (param in group_params.keySet()) { - if (params.containsKey(param)) { - def params_value = params.get(param) - def schema_value = group_params.get(param).default - def param_type = group_params.get(param).type - if (schema_value != null) { - if (param_type == 'string') { - if (schema_value.contains('$projectDir') || schema_value.contains('${projectDir}')) { - def sub_string = schema_value.replace('\$projectDir', '') - sub_string = sub_string.replace('\${projectDir}', '') - if (params_value.contains(sub_string)) { - schema_value = params_value - } - } - if (schema_value.contains('$params.outdir') || schema_value.contains('${params.outdir}')) { - def sub_string = schema_value.replace('\$params.outdir', '') - sub_string = sub_string.replace('\${params.outdir}', '') - if ("${params.outdir}${sub_string}" == params_value) { - schema_value = params_value - } - } - } - } - - // We have a default in the schema, and this isn't it - if (schema_value != null && params_value != schema_value) { - sub_params.put(param, params_value) - } - // No default in the schema, and this isn't empty - else if (schema_value == null && params_value != "" && params_value != null && params_value != false) { - sub_params.put(param, params_value) - } - } - } - params_summary.put(group, sub_params) - } - return [ 'Core Nextflow options' : workflow_summary ] << params_summary - } - - // - // Beautify parameters for summary and return as string - // - public static String paramsSummaryLog(workflow, params) { - Map colors = NfcoreTemplate.logColours(params.monochrome_logs) - String output = '' - def params_map = paramsSummaryMap(workflow, params) - def max_chars = paramsMaxChars(params_map) - for (group in params_map.keySet()) { - def group_params = params_map.get(group) // This gets the parameters of that particular group - if (group_params) { - output += colors.bold + group + colors.reset + '\n' - for (param in group_params.keySet()) { - output += " " + colors.blue + param.padRight(max_chars) + ": " + colors.green + group_params.get(param) + colors.reset + '\n' - } - output += '\n' - } - } - output 
+= "!! Only displaying parameters that differ from the pipeline defaults !!\n" - output += NfcoreTemplate.dashedLine(params.monochrome_logs) - return output - } - - // - // Loop over nested exceptions and print the causingException - // - private static void printExceptions(ex_json, params_json, log, enums, limit=5) { - def causingExceptions = ex_json['causingExceptions'] - if (causingExceptions.length() == 0) { - def m = ex_json['message'] =~ /required key \[([^\]]+)\] not found/ - // Missing required param - if (m.matches()) { - log.error "* Missing required parameter: --${m[0][1]}" - } - // Other base-level error - else if (ex_json['pointerToViolation'] == '#') { - log.error "* ${ex_json['message']}" - } - // Error with specific param - else { - def param = ex_json['pointerToViolation'] - ~/^#\// - def param_val = params_json[param].toString() - if (enums.containsKey(param)) { - def error_msg = "* --${param}: '${param_val}' is not a valid choice (Available choices" - if (enums[param].size() > limit) { - log.error "${error_msg} (${limit} of ${enums[param].size()}): ${enums[param][0..limit-1].join(', ')}, ... )" - } else { - log.error "${error_msg}: ${enums[param].join(', ')})" - } - } else { - log.error "* --${param}: ${ex_json['message']} (${param_val})" - } - } - } - for (ex in causingExceptions) { - printExceptions(ex, params_json, log, enums) - } - } - - // - // Remove an element from a JSONArray - // - private static JSONArray removeElement(json_array, element) { - def list = [] - int len = json_array.length() - for (int i=0;i - if(raw_schema.keySet().contains('definitions')){ - raw_schema.definitions.each { definition -> - for (key in definition.keySet()){ - if (definition[key].get("properties").keySet().contains(ignore_param)){ - // Remove the param to ignore - definition[key].get("properties").remove(ignore_param) - // If the param was required, change this - if (definition[key].has("required")) { - def cleaned_required = removeElement(definition[key].required, ignore_param) - definition[key].put("required", cleaned_required) - } - } - } - } - } - if(raw_schema.keySet().contains('properties') && raw_schema.get('properties').keySet().contains(ignore_param)) { - raw_schema.get("properties").remove(ignore_param) - } - if(raw_schema.keySet().contains('required') && raw_schema.required.contains(ignore_param)) { - def cleaned_required = removeElement(raw_schema.required, ignore_param) - raw_schema.put("required", cleaned_required) - } - } - return raw_schema - } - - // - // Clean and check parameters relative to Nextflow native classes - // - private static Map cleanParameters(params) { - def new_params = params.getClass().newInstance(params) - for (p in params) { - // remove anything evaluating to false - if (!p['value']) { - new_params.remove(p.key) - } - // Cast MemoryUnit to String - if (p['value'].getClass() == nextflow.util.MemoryUnit) { - new_params.replace(p.key, p['value'].toString()) - } - // Cast Duration to String - if (p['value'].getClass() == nextflow.util.Duration) { - new_params.replace(p.key, p['value'].toString().replaceFirst(/d(?!\S)/, "day")) - } - // Cast LinkedHashMap to String - if (p['value'].getClass() == LinkedHashMap) { - new_params.replace(p.key, p['value'].toString()) - } - } - return new_params - } - - // - // This function tries to read a JSON params file - // - private static LinkedHashMap paramsLoad(String json_schema) { - def params_map = new LinkedHashMap() - try { - params_map = paramsRead(json_schema) - } catch (Exception e) { - println "Could not 
read parameters settings from JSON. $e" - params_map = new LinkedHashMap() - } - return params_map - } - - // - // Method to actually read in JSON file using Groovy. - // Group (as Key), values are all parameters - // - Parameter1 as Key, Description as Value - // - Parameter2 as Key, Description as Value - // .... - // Group - // - - private static LinkedHashMap paramsRead(String json_schema) throws Exception { - def json = new File(json_schema).text - def Map schema_definitions = (Map) new JsonSlurper().parseText(json).get('definitions') - def Map schema_properties = (Map) new JsonSlurper().parseText(json).get('properties') - /* Tree looks like this in nf-core schema - * definitions <- this is what the first get('definitions') gets us - group 1 - title - description - properties - parameter 1 - type - description - parameter 2 - type - description - group 2 - title - description - properties - parameter 1 - type - description - * properties <- parameters can also be ungrouped, outside of definitions - parameter 1 - type - description - */ - - // Grouped params - def params_map = new LinkedHashMap() - schema_definitions.each { key, val -> - def Map group = schema_definitions."$key".properties // Gets the property object of the group - def title = schema_definitions."$key".title - def sub_params = new LinkedHashMap() - group.each { innerkey, value -> - sub_params.put(innerkey, value) - } - params_map.put(title, sub_params) - } - - // Ungrouped params - def ungrouped_params = new LinkedHashMap() - schema_properties.each { innerkey, value -> - ungrouped_params.put(innerkey, value) - } - params_map.put("Other parameters", ungrouped_params) - - return params_map - } - - // - // Get maximum number of characters across all parameter names - // - private static Integer paramsMaxChars(params_map) { - Integer max_chars = 0 - for (group in params_map.keySet()) { - def group_params = params_map.get(group) // This gets the parameters of that particular group - for (param in group_params.keySet()) { - if (param.size() > max_chars) { - max_chars = param.size() - } - } - } - return max_chars - } -} diff --git a/lib/NfcoreTemplate.groovy b/lib/NfcoreTemplate.groovy index 2777ae2b..a2210c6e 100755 --- a/lib/NfcoreTemplate.groovy +++ b/lib/NfcoreTemplate.groovy @@ -3,6 +3,8 @@ // import org.yaml.snakeyaml.Yaml +import groovy.json.JsonOutput +import nextflow.extension.FilesEx class NfcoreTemplate { @@ -128,7 +130,7 @@ class NfcoreTemplate { def email_html = html_template.toString() // Render the sendmail template - def max_multiqc_email_size = params.max_multiqc_email_size as nextflow.util.MemoryUnit + def max_multiqc_email_size = (params.containsKey('max_multiqc_email_size') ? 
params.max_multiqc_email_size : 0) as nextflow.util.MemoryUnit def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, projectDir: "$projectDir", mqcFile: mqc_report, mqcMaxSize: max_multiqc_email_size.toBytes() ] def sf = new File("$projectDir/assets/sendmail_template.txt") def sendmail_template = engine.createTemplate(sf).make(smail_fields) @@ -140,12 +142,14 @@ class NfcoreTemplate { try { if (params.plaintext_email) { throw GroovyException('Send plaintext e-mail, not HTML') } // Try to send HTML e-mail using sendmail + def sendmail_tf = new File(workflow.launchDir.toString(), ".sendmail_tmp.html") + sendmail_tf.withWriter { w -> w << sendmail_html } [ 'sendmail', '-t' ].execute() << sendmail_html log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Sent summary e-mail to $email_address (sendmail)-" } catch (all) { // Catch failures and try with plaintext def mail_cmd = [ 'mail', '-s', subject, '--content-type=text/html', email_address ] - if ( mqc_report.size() <= max_multiqc_email_size.toBytes() ) { + if ( mqc_report != null && mqc_report.size() <= max_multiqc_email_size.toBytes() ) { mail_cmd += [ '-A', mqc_report ] } mail_cmd.execute() << email_html @@ -154,14 +158,16 @@ class NfcoreTemplate { } // Write summary e-mail HTML to a file - def output_d = new File("${params.outdir}/blobtoolkit_info/") - if (!output_d.exists()) { - output_d.mkdirs() - } - def output_hf = new File(output_d, "pipeline_report.html") + def output_hf = new File(workflow.launchDir.toString(), ".pipeline_report.html") output_hf.withWriter { w -> w << email_html } - def output_tf = new File(output_d, "pipeline_report.txt") + FilesEx.copyTo(output_hf.toPath(), "${params.outdir}/pipeline_info/pipeline_report.html"); + output_hf.delete() + + // Write summary e-mail TXT to a file + def output_tf = new File(workflow.launchDir.toString(), ".pipeline_report.txt") output_tf.withWriter { w -> w << email_txt } + FilesEx.copyTo(output_tf.toPath(), "${params.outdir}/pipeline_info/pipeline_report.txt"); + output_tf.delete() } // @@ -222,6 +228,20 @@ class NfcoreTemplate { } } + // + // Dump pipeline parameters in a json file + // + public static void dump_parameters(workflow, params) { + def timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') + def filename = "params_${timestamp}.json" + def temp_pf = new File(workflow.launchDir.toString(), ".${filename}") + def jsonStr = JsonOutput.toJson(params) + temp_pf.text = JsonOutput.prettyPrint(jsonStr) + + FilesEx.copyTo(temp_pf.toPath(), "${params.outdir}/pipeline_info/blobtoolkit/params_${timestamp}.json") + temp_pf.delete() + } + // // Print pipeline summary on completion // diff --git a/lib/WorkflowBlobtoolkit.groovy b/lib/WorkflowBlobtoolkit.groovy index 8be37902..df59d762 100755 --- a/lib/WorkflowBlobtoolkit.groovy +++ b/lib/WorkflowBlobtoolkit.groovy @@ -12,6 +12,7 @@ class WorkflowBlobtoolkit { // public static void initialise(params, log) { + if (!params.fasta) { Nextflow.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file." 
} @@ -44,20 +45,59 @@ class WorkflowBlobtoolkit { return yaml_file_text } - public static String methodsDescriptionText(run_workflow, mqc_methods_yaml) { - // Convert to a named map so can be used as with familar NXF ${workflow} variable syntax in the MultiQC YML file + // + // Generate methods description for MultiQC + // + + public static String toolCitationText(params) { + + // Optionally add in-text citation tools to this list. + // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "Tool (Foo et al. 2023)" : "", + // Uncomment function in methodsDescriptionText to render in MultiQC report + def citation_text = [ + "Tools used in the workflow included:", + "MultiQC (Ewels et al. 2016)", + "." + ].join(' ').trim() + + return citation_text + } + + public static String toolBibliographyText(params) { + + // Optionally add bibliographic entries to this list. + // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "
<li>Author (2023) Pub name, Journal, DOI</li>" : "", + // Uncomment function in methodsDescriptionText to render in MultiQC report + def reference_text = [ + "<li>Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics , 32(19), 3047–3048. doi: /10.1093/bioinformatics/btw354</li>" + ].join(' ').trim() + + return reference_text + } + + public static String methodsDescriptionText(run_workflow, mqc_methods_yaml, params) { + // Convert to a named map so can be used as with familar NXF ${workflow} variable syntax in the MultiQC YML file def meta = [:] meta.workflow = run_workflow.toMap() meta["manifest_map"] = run_workflow.manifest.toMap() + // Pipeline DOI meta["doi_text"] = meta.manifest_map.doi ? "(doi: ${meta.manifest_map.doi})" : "" meta["nodoi_text"] = meta.manifest_map.doi ? "": "<li>If available, make sure to update the text to include the Zenodo DOI of version of the pipeline used. </li>
  • " + // Tool references + meta["tool_citations"] = "" + meta["tool_bibliography"] = "" + + // Only uncomment below if logic in toolCitationText/toolBibliographyText has been filled! + //meta["tool_citations"] = toolCitationText(params).replaceAll(", \\.", ".").replaceAll("\\. \\.", ".").replaceAll(", \\.", ".") + //meta["tool_bibliography"] = toolBibliographyText(params) + + def methods_text = mqc_methods_yaml.text def engine = new SimpleTemplateEngine() def description_html = engine.createTemplate(methods_text).make(meta) return description_html - } -} + }} diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index 53b29a95..0723e363 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -20,40 +20,11 @@ class WorkflowMain { " https://github.com/${workflow.manifest.name}/blob/master/CITATIONS.md" } - // - // Generate help string - // - public static String help(workflow, params) { - def command = "nextflow run ${workflow.manifest.name} --input samplesheet.csv --fasta reference.fa -profile docker" - def help_string = '' - help_string += NfcoreTemplate.logo(workflow, params.monochrome_logs) - help_string += NfcoreSchema.paramsHelp(workflow, params, command) - help_string += '\n' + citation(workflow) + '\n' - help_string += NfcoreTemplate.dashedLine(params.monochrome_logs) - return help_string - } - - // - // Generate parameter summary log string - // - public static String paramsSummaryLog(workflow, params) { - def summary_log = '' - summary_log += NfcoreTemplate.logo(workflow, params.monochrome_logs) - summary_log += NfcoreSchema.paramsSummaryLog(workflow, params) - summary_log += '\n' + citation(workflow) + '\n' - summary_log += NfcoreTemplate.dashedLine(params.monochrome_logs) - return summary_log - } // // Validate parameters and print summary to screen // public static void initialise(workflow, params, log) { - // Print help to screen if required - if (params.help) { - log.info help(workflow, params) - System.exit(0) - } // Print workflow version and exit on --version if (params.version) { @@ -62,14 +33,6 @@ class WorkflowMain { System.exit(0) } - // Print parameter summary log to screen - log.info paramsSummaryLog(workflow, params) - - // Validate workflow parameters via the JSON schema - if (params.validate_params) { - NfcoreSchema.validateParameters(workflow, params, log) - } - // Check that a -profile or Nextflow config has been provided to run the pipeline NfcoreTemplate.checkConfigProvided(workflow, log) diff --git a/main.nf b/main.nf index 5618f04c..fbd0f4ae 100644 --- a/main.nf +++ b/main.nf @@ -16,6 +16,22 @@ nextflow.enable.dsl = 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ +include { validateParameters; paramsHelp } from 'plugin/nf-validation' + +// Print help message if needed +if (params.help) { + def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) + def citation = '\n' + WorkflowMain.citation(workflow) + '\n' + def String command = "nextflow run ${workflow.manifest.name} --input samplesheet.csv --genome GRCh37 -profile docker" + log.info logo + paramsHelp(command) + citation + NfcoreTemplate.dashedLine(params.monochrome_logs) + System.exit(0) +} + +// Validate input parameters +if (params.validate_params) { + validateParameters() +} + WorkflowMain.initialise(workflow, params, log) /* diff --git a/modules.json b/modules.json index cd615550..38431cf4 100644 --- a/modules.json +++ b/modules.json @@ -5,50 +5,86 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { + 
"blast/blastn": { + "branch": "master", + "git_sha": "f0d13ae7e1f9b24a705764f8673af859268d7077", + "installed_by": ["modules"], + "patch": "modules/nf-core/blast/blastn/blast-blastn.diff" + }, "busco": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "e3126f437c336c826f242842fe51769cfce0ec2d", "installed_by": ["modules"], "patch": "modules/nf-core/busco/busco.diff" }, "custom/dumpsoftwareversions": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "bba7e362e4afead70653f84d8700588ea28d0f9e", "installed_by": ["modules"] }, "diamond/blastp": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "b29f6beb86d1d24d680277fb1a3f4de7b8b8a92c", + "installed_by": ["modules"] + }, + "diamond/blastx": { + "branch": "master", + "git_sha": "b29f6beb86d1d24d680277fb1a3f4de7b8b8a92c", "installed_by": ["modules"] }, "fastawindows": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "goat/taxonsearch": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "gunzip": { "branch": "master", - "git_sha": "5c460c5a4736974abde2843294f35307ee2b0e5e", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, - "mosdepth": { + "minimap2/align": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", "installed_by": ["modules"] }, "multiqc": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "4ab13872435962dadc239979554d13709e20bf29", + "installed_by": ["modules"] + }, + "samtools/fasta": { + "branch": "master", + "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", + "installed_by": ["modules"] + }, + "samtools/index": { + "branch": "master", + "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", "installed_by": ["modules"] }, "samtools/view": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", + "installed_by": ["modules"] + }, + "seqtk/subseq": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "windowmasker/mkcounts": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "windowmasker/ustat": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] } } diff --git a/modules/local/blobtk/depth.nf b/modules/local/blobtk/depth.nf new file mode 100644 index 00000000..b339c156 --- /dev/null +++ b/modules/local/blobtk/depth.nf @@ -0,0 +1,34 @@ +process BLOBTK_DEPTH { + tag "${meta.id}" + label 'process_single' + + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + exit 1, "BLOBTOOLKIT_DEPTH module does not support Conda. Please use Docker / Singularity / Podman instead." 
+ } + container "docker.io/genomehubs/blobtk:0.5.1" + + input: + tuple val(meta), path(bam), path(bai) + + output: + tuple val(meta), path('*.regions.bed.gz') , emit: bed + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + blobtk depth \\ + -b ${bam} \\ + $args \\ + -O ${prefix}.regions.bed.gz \\ + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + blobtk: \$(blobtk --version | cut -d' ' -f2) + END_VERSIONS + """ +} diff --git a/modules/local/blobtoolkit/images.nf b/modules/local/blobtk/images.nf similarity index 86% rename from modules/local/blobtoolkit/images.nf rename to modules/local/blobtk/images.nf index 11bdd485..1b6e8087 100644 --- a/modules/local/blobtoolkit/images.nf +++ b/modules/local/blobtk/images.nf @@ -1,11 +1,11 @@ -process BLOBTOOLKIT_IMAGES { +process BLOBTK_IMAGES { tag "${meta.id}_${plot}" label 'process_single' if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { exit 1, "BLOBTOOLKIT_IMAGES module does not support Conda. Please use Docker / Singularity / Podman instead." } - container "genomehubs/blobtk:0.3.3" + container "docker.io/genomehubs/blobtk:0.5.1" input: tuple val(meta), path(blobdir) @@ -32,7 +32,7 @@ process BLOBTOOLKIT_IMAGES { cat <<-END_VERSIONS > versions.yml "${task.process}": - blobtoolkit: \$(btk --version | cut -d' ' -f2 | sed 's/v//') + blobtk: \$(blobtk --version | cut -d' ' -f2) END_VERSIONS """ } diff --git a/modules/local/blobtoolkit/chunk.nf b/modules/local/blobtoolkit/chunk.nf new file mode 100644 index 00000000..38bc37fe --- /dev/null +++ b/modules/local/blobtoolkit/chunk.nf @@ -0,0 +1,37 @@ +process BLOBTOOLKIT_CHUNK { + tag "$meta.id" + label 'process_single' + + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + exit 1, "BLOBTOOLKIT_CHUNK module does not support Conda. Please use Docker / Singularity / Podman instead." + } + container "docker.io/genomehubs/blobtoolkit:4.3.2" + + input: + tuple val(meta) , path(fasta) + tuple val(meta2), path(busco_table) + + output: + tuple val(meta), path("*.chunks.fasta"), emit: chunks + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def busco = busco_table ? "--busco ${busco_table}" : "--busco None" + """ + btk pipeline chunk-fasta \\ + --in ${fasta} \\ + ${busco} \\ + --out ${prefix}.chunks.fasta \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + blobtoolkit: \$(btk --version | cut -d' ' -f2 | sed 's/v//') + END_VERSIONS + """ +} diff --git a/modules/local/blobtoolkit/config.nf b/modules/local/blobtoolkit/config.nf index ce1e3adc..0a9c2f58 100644 --- a/modules/local/blobtoolkit/config.nf +++ b/modules/local/blobtoolkit/config.nf @@ -5,10 +5,11 @@ process BLOBTOOLKIT_CONFIG { if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { exit 1, "GENERATE_CONFIG module does not support Conda. Please use Docker / Singularity / Podman instead." 
} - container "genomehubs/blobtoolkit:4.1.5" + container "docker.io/genomehubs/blobtoolkit:4.3.2" input: - tuple val(meta), path(fasta) + tuple val(meta), val(reads) + tuple val(meta), val(fasta) output: tuple val(meta), path("${meta.id}/*.yaml"), emit: yaml @@ -18,9 +19,15 @@ process BLOBTOOLKIT_CONFIG { task.ext.when == null || task.ext.when script: + def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" + def input_reads = reads.collect{"--reads $it"}.join(' ') """ - blobtoolkit-pipeline generate-config ${meta.id} + btk pipeline \\ + generate-config \\ + ${prefix} \\ + $args \\ + ${input_reads} cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/blobtoolkit/countbuscos.nf b/modules/local/blobtoolkit/countbuscos.nf index 1379cbac..e151cde8 100644 --- a/modules/local/blobtoolkit/countbuscos.nf +++ b/modules/local/blobtoolkit/countbuscos.nf @@ -5,7 +5,7 @@ process BLOBTOOLKIT_COUNTBUSCOS { if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { exit 1, "BLOBTOOLKIT_COUNTBUSCOS module does not support Conda. Please use Docker / Singularity / Podman instead." } - container "genomehubs/blobtoolkit:4.1.5" + container "docker.io/genomehubs/blobtoolkit:4.3.2" input: tuple val(meta), path(table, stageAs: 'dir??/*') diff --git a/modules/local/blobtoolkit/blobdir.nf b/modules/local/blobtoolkit/createblobdir.nf similarity index 86% rename from modules/local/blobtoolkit/blobdir.nf rename to modules/local/blobtoolkit/createblobdir.nf index 3f064bce..54810650 100644 --- a/modules/local/blobtoolkit/blobdir.nf +++ b/modules/local/blobtoolkit/createblobdir.nf @@ -1,11 +1,11 @@ -process BLOBTOOLKIT_BLOBDIR { +process BLOBTOOLKIT_CREATEBLOBDIR { tag "$meta.id" label 'process_medium' if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { exit 1, "BLOBTOOLKIT_BLOBDIR module does not support Conda. Please use Docker / Singularity / Podman instead." } - container "genomehubs/blobtoolkit:4.1.5" + container "docker.io/genomehubs/blobtoolkit:4.3.2" input: tuple val(meta), path(window, stageAs: 'windowstats/*') @@ -24,7 +24,7 @@ process BLOBTOOLKIT_BLOBDIR { script: def args = task.ext.args ?: '' prefix = task.ext.prefix ?: "${meta.id}" - def hits = blastp ? "--hits ${blastp}" : "" + def hits_blastp = blastp ? "--hits ${blastp}" : "" """ blobtools replace \\ --bedtsvdir windowstats \\ @@ -32,7 +32,7 @@ process BLOBTOOLKIT_BLOBDIR { --taxdump ${taxdump} \\ --taxrule buscogenes \\ --busco ${busco} \\ - ${hits} \\ + ${hits_blastp} \\ --threads ${task.cpus} \\ $args \\ ${prefix} diff --git a/modules/local/blobtoolkit/extractbuscos.nf b/modules/local/blobtoolkit/extractbuscos.nf index fd5c368e..e34bfd93 100644 --- a/modules/local/blobtoolkit/extractbuscos.nf +++ b/modules/local/blobtoolkit/extractbuscos.nf @@ -5,13 +5,11 @@ process BLOBTOOLKIT_EXTRACTBUSCOS { if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { exit 1, "BLOBTOOLKIT_EXTRACTBUSCOS module does not support Conda. Please use Docker / Singularity / Podman instead." 
} - container "genomehubs/blobtoolkit:4.1.5" + container "docker.io/genomehubs/blobtoolkit:4.3.2" input: tuple val(meta), path(fasta) - tuple val(meta1), path(seq1, stageAs: "lineage1/*") - tuple val(meta2), path(seq2, stageAs: "lineage2/*") - tuple val(meta3), path(seq3, stageAs: "lineage3/*") + tuple val(metaseq), path(seq, stageAs: "lineage??/*") output: tuple val(meta), path("*_buscogenes.fasta"), emit: genes @@ -23,11 +21,10 @@ process BLOBTOOLKIT_EXTRACTBUSCOS { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" + def seq_args = seq.collect { "--busco " + it } .join(' ') """ btk pipeline extract-busco-genes \\ - --busco $seq1 \\ - --busco $seq2 \\ - --busco $seq3 \\ + $seq_args \\ --out ${prefix}_buscogenes.fasta cat <<-END_VERSIONS > versions.yml diff --git a/modules/local/blobtoolkit/metadata.nf b/modules/local/blobtoolkit/metadata.nf index 32339c48..8e2d585d 100644 --- a/modules/local/blobtoolkit/metadata.nf +++ b/modules/local/blobtoolkit/metadata.nf @@ -5,7 +5,7 @@ process BLOBTOOLKIT_METADATA { if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { exit 1, "BLOBTOOLKIT_METADATA module does not support Conda. Please use Docker / Singularity / Podman instead." } - container "genomehubs/blobtoolkit:4.1.5" + container "docker.io/genomehubs/blobtoolkit:4.3.2" input: tuple val(meta), path(yaml) diff --git a/modules/local/blobtoolkit/summary.nf b/modules/local/blobtoolkit/summary.nf index d1059d8a..ac92a3b3 100644 --- a/modules/local/blobtoolkit/summary.nf +++ b/modules/local/blobtoolkit/summary.nf @@ -5,7 +5,7 @@ process BLOBTOOLKIT_SUMMARY { if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { exit 1, "BLOBTOOLKIT_SUMMARY module does not support Conda. Please use Docker / Singularity / Podman instead." } - container "genomehubs/blobtoolkit:4.1.5" + container "docker.io/genomehubs/blobtoolkit:4.3.2" input: tuple val(meta), path(blobdir) diff --git a/modules/local/blobtoolkit/unchunk.nf b/modules/local/blobtoolkit/unchunk.nf new file mode 100644 index 00000000..b544bf1f --- /dev/null +++ b/modules/local/blobtoolkit/unchunk.nf @@ -0,0 +1,34 @@ +process BLOBTOOLKIT_UNCHUNK { + tag "$meta.id" + label 'process_single' + + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + exit 1, "BLOBTOOLKIT_UNCHUNK module does not support Conda. Please use Docker / Singularity / Podman instead." + } + container "docker.io/genomehubs/blobtoolkit:4.3.2" + + input: + tuple val(meta), path(blast_table) + + output: + tuple val(meta), path("*.out"), emit: blast_out + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${blast_table}" + """ + btk pipeline unchunk-blast \\ + --in ${blast_table} \\ + --out ${prefix}.out \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + blobtoolkit: \$(btk --version | cut -d' ' -f2 | sed 's/v//') + END_VERSIONS + """ +} diff --git a/modules/local/blobtoolkit/updateblobdir.nf b/modules/local/blobtoolkit/updateblobdir.nf new file mode 100644 index 00000000..7a677828 --- /dev/null +++ b/modules/local/blobtoolkit/updateblobdir.nf @@ -0,0 +1,43 @@ +process BLOBTOOLKIT_UPDATEBLOBDIR { + tag "$meta.id" + label 'process_medium' + + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + exit 1, "BLOBTOOLKIT_BLOBDIR module does not support Conda. Please use Docker / Singularity / Podman instead." 
+ } + container "docker.io/genomehubs/blobtoolkit:4.3.2" + + input: + tuple val(meta), path(input) + tuple val(meta1), path(blastx, stageAs: "blastx.txt") + tuple val(meta2), path(blastn, stageAs: "blastn.txt") + path(taxdump) + + output: + tuple val(meta), path(prefix), emit: blobdir + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def hits_blastx = blastx ? "--hits ${blastx}" : "" + def hits_blastn = blastn ? "--hits ${blastn}" : "" + """ + blobtools replace \\ + --taxdump ${taxdump} \\ + --taxrule bestdistorder=buscoregions \\ + ${hits_blastx} \\ + ${hits_blastn} \\ + --threads ${task.cpus} \\ + $args \\ + ${input} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + blobtoolkit: \$(btk --version | cut -d' ' -f2 | sed 's/v//') + END_VERSIONS + """ +} diff --git a/modules/local/blobtoolkit/updatemeta.nf b/modules/local/blobtoolkit/updatemeta.nf new file mode 100644 index 00000000..45df4209 --- /dev/null +++ b/modules/local/blobtoolkit/updatemeta.nf @@ -0,0 +1,36 @@ +process BLOBTOOLKIT_UPDATEMETA { + tag "$meta.id" + label 'process_single' + + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + exit 1, "BLOBTOOLKIT_UPDATEMETA module does not support Conda. Please use Docker / Singularity / Podman instead." + } + container "docker.io/pacificbiosciences/pyyaml:5.3.1" + + input: + tuple val(meta), path(input) + path versions + + output: + tuple val(meta), path(prefix), emit: blobdir + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + update_versions.py \\ + ${args} \\ + --meta ${input}/meta.json \\ + --software ${versions} \\ + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + update_versions.py: \$(update_versions.py --version | cut -d' ' -f2) + END_VERSIONS + """ + +} diff --git a/modules/local/blobtoolkit/windowstats.nf b/modules/local/blobtoolkit/windowstats.nf index 0517535f..dde880e6 100644 --- a/modules/local/blobtoolkit/windowstats.nf +++ b/modules/local/blobtoolkit/windowstats.nf @@ -5,7 +5,7 @@ process BLOBTOOLKIT_WINDOWSTATS { if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { exit 1, "GET_WINDOW_STATS module does not support Conda. Please use Docker / Singularity / Podman instead." } - container "genomehubs/blobtoolkit:4.1.5" + container "docker.io/genomehubs/blobtoolkit:4.3.2" input: tuple val(meta), path(tsv) diff --git a/modules/local/create_bed.nf b/modules/local/create_bed.nf index 034ab1e6..3158a732 100644 --- a/modules/local/create_bed.nf +++ b/modules/local/create_bed.nf @@ -5,7 +5,7 @@ process CREATE_BED { conda "conda-forge::gawk=5.1.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/gawk:5.1.0' : - 'quay.io/biocontainers/gawk:5.1.0' }" + 'biocontainers/gawk:5.1.0' }" input: tuple val(meta), path(tsv) //path to tsv output from fasta windows diff --git a/modules/local/nohit_list.nf b/modules/local/nohit_list.nf new file mode 100644 index 00000000..efefe59a --- /dev/null +++ b/modules/local/nohit_list.nf @@ -0,0 +1,32 @@ +process NOHIT_LIST { + tag "$meta.id" + label 'process_single' + + conda "conda-forge::gawk=5.1.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/gawk:5.1.0' : + 'biocontainers/gawk:5.1.0' }" + + input: + tuple val(meta), path(blast) //path to blast output table in txt format + tuple val(meta), path(fasta) //path to genome fasta file + + output: + tuple val(meta), path ('*.nohit.txt') , emit: nohitlist + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: // This script is bundled with the pipeline, in sanger-tol/blobtoolkit/bin/ + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + nohitlist.sh ${fasta} ${blast} ${prefix} $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + nohit_list: 1.0 + END_VERSIONS + """ +} diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf index 5798da0e..760f3e44 100644 --- a/modules/local/samplesheet_check.nf +++ b/modules/local/samplesheet_check.nf @@ -5,7 +5,7 @@ process SAMPLESHEET_CHECK { conda "conda-forge::python=3.9.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/python:3.9--1' : - 'quay.io/biocontainers/python:3.9--1' }" + 'biocontainers/python:3.9--1' }" input: path samplesheet diff --git a/modules/local/windowstats_input.nf b/modules/local/windowstats_input.nf index f366025d..6fe537b1 100644 --- a/modules/local/windowstats_input.nf +++ b/modules/local/windowstats_input.nf @@ -5,12 +5,12 @@ process WINDOWSTATS_INPUT { conda "conda-forge::pandas=1.5.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/pandas:1.5.2': - 'quay.io/biocontainers/pandas:1.5.2' }" + 'biocontainers/pandas:1.5.2' }" input: tuple val(meta), path(freq) tuple val(meta), path(mononuc) - tuple val(meta), path(mosdepth) + tuple val(meta), path(depth) tuple val(meta), path(countbusco) output: @@ -27,7 +27,7 @@ process WINDOWSTATS_INPUT { windowstats_input.py \\ --freq ${freq} \\ --mononuc ${mononuc} \\ - --mosdepth ${mosdepth} \\ + --depth ${depth} \\ --countbusco ${countbusco} \\ --output ${prefix}.tsv \\ ${args} diff --git a/modules/nf-core/blast/blastn/blast-blastn.diff b/modules/nf-core/blast/blastn/blast-blastn.diff new file mode 100644 index 00000000..dc3f108f --- /dev/null +++ b/modules/nf-core/blast/blastn/blast-blastn.diff @@ -0,0 +1,30 @@ +Changes in module 'nf-core/blast/blastn' +--- modules/nf-core/blast/blastn/main.nf ++++ modules/nf-core/blast/blastn/main.nf +@@ -10,6 +10,7 @@ + input: + tuple val(meta) , path(fasta) + tuple val(meta2), path(db) ++ val taxid + + output: + tuple val(meta), path('*.txt'), emit: txt +@@ -23,7 +24,7 @@ + def prefix = task.ext.prefix ?: "${meta.id}" + def is_compressed = fasta.getExtension() == "gz" ? true : false + def fasta_name = is_compressed ? fasta.getBaseName() : fasta +- ++ def exclude_taxon = taxid ? 
"-negative_taxids ${taxid}" : '' + """ + if [ "${is_compressed}" == "true" ]; then + gzip -c -d ${fasta} > ${fasta_name} +@@ -34,6 +35,7 @@ + -num_threads ${task.cpus} \\ + -db \$DB \\ + -query ${fasta_name} \\ ++ ${exclude_taxon} \\ + ${args} \\ + -out ${prefix}.txt + + +************************************************************ diff --git a/modules/nf-core/blast/blastn/environment.yml b/modules/nf-core/blast/blastn/environment.yml new file mode 100644 index 00000000..cb9b15dd --- /dev/null +++ b/modules/nf-core/blast/blastn/environment.yml @@ -0,0 +1,7 @@ +name: blast_blastn +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::blast=2.14.1 diff --git a/modules/nf-core/blast/blastn/main.nf b/modules/nf-core/blast/blastn/main.nf new file mode 100644 index 00000000..44b581a9 --- /dev/null +++ b/modules/nf-core/blast/blastn/main.nf @@ -0,0 +1,59 @@ +process BLAST_BLASTN { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/blast:2.14.1--pl5321h6f7f691_0': + 'biocontainers/blast:2.14.1--pl5321h6f7f691_0' }" + + input: + tuple val(meta) , path(fasta) + tuple val(meta2), path(db) + val taxid + + output: + tuple val(meta), path('*.txt'), emit: txt + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def is_compressed = fasta.getExtension() == "gz" ? true : false + def fasta_name = is_compressed ? fasta.getBaseName() : fasta + def exclude_taxon = taxid ? "-negative_taxids ${taxid}" : '' + """ + if [ "${is_compressed}" == "true" ]; then + gzip -c -d ${fasta} > ${fasta_name} + fi + + DB=`find -L ./ -name "*.nin" | sed 's/\\.nin\$//'` + blastn \\ + -num_threads ${task.cpus} \\ + -db \$DB \\ + -query ${fasta_name} \\ + ${exclude_taxon} \\ + ${args} \\ + -out ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + blast: \$(blastn -version 2>&1 | sed 's/^.*blastn: //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + blast: \$(blastn -version 2>&1 | sed 's/^.*blastn: //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/blast/blastn/meta.yml b/modules/nf-core/blast/blastn/meta.yml new file mode 100644 index 00000000..a0d64dd6 --- /dev/null +++ b/modules/nf-core/blast/blastn/meta.yml @@ -0,0 +1,55 @@ +name: blast_blastn +description: Queries a BLAST DNA database +keywords: + - fasta + - blast + - blastn + - DNA sequence +tools: + - blast: + description: | + BLAST finds regions of similarity between biological sequences. + homepage: https://blast.ncbi.nlm.nih.gov/Blast.cgi + documentation: https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=Blastdocs + doi: 10.1016/S0022-2836(05)80360-2 + licence: ["US-Government-Work"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Input fasta file containing queries sequences + pattern: "*.{fa,fasta,fa.gz,fasta.gz}" + - meta2: + type: map + description: | + Groovy Map containing db information + e.g. 
diff --git a/modules/nf-core/blast/blastn/meta.yml b/modules/nf-core/blast/blastn/meta.yml
new file mode 100644
index 00000000..a0d64dd6
--- /dev/null
+++ b/modules/nf-core/blast/blastn/meta.yml
@@ -0,0 +1,55 @@
+name: blast_blastn
+description: Queries a BLAST DNA database
+keywords:
+  - fasta
+  - blast
+  - blastn
+  - DNA sequence
+tools:
+  - blast:
+      description: |
+        BLAST finds regions of similarity between biological sequences.
+      homepage: https://blast.ncbi.nlm.nih.gov/Blast.cgi
+      documentation: https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=Blastdocs
+      doi: 10.1016/S0022-2836(05)80360-2
+      licence: ["US-Government-Work"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - fasta:
+      type: file
+      description: Input fasta file containing query sequences
+      pattern: "*.{fa,fasta,fa.gz,fasta.gz}"
+  - meta2:
+      type: map
+      description: |
+        Groovy Map containing db information
+        e.g. [ id:'test2', single_end:false ]
+  - db:
+      type: directory
+      description: Directory containing the blast database
+      pattern: "*"
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - txt:
+      type: file
+      description: File containing blastn hits
+      pattern: "*.txt"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@joseespinosa"
+  - "@drpatelh"
+maintainers:
+  - "@joseespinosa"
+  - "@drpatelh"
+  - "@vagkaratzas"
diff --git a/modules/nf-core/blast/blastn/tests/main.nf.test b/modules/nf-core/blast/blastn/tests/main.nf.test
new file mode 100644
index 00000000..0e909a7e
--- /dev/null
+++ b/modules/nf-core/blast/blastn/tests/main.nf.test
@@ -0,0 +1,71 @@
+nextflow_process {
+
+    name "Test Process BLAST_BLASTN"
+    script "../main.nf"
+    process "BLAST_BLASTN"
+    config "./nextflow.config"
+    tag "modules"
+    tag "modules_nfcore"
+    tag "blast"
+    tag "blast/blastn"
+
+    setup {
+        run("BLAST_MAKEBLASTDB") {
+            script "../../makeblastdb/main.nf"
+            process {
+                """
+                input[0] = [ [id:'test2'], file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) ]
+                """
+            }
+        }
+    }
+
+    test("Should search for nucleotide hits against a blast db") {
+
+        when {
+            params {
+                outdir = "$outputDir"
+            }
+            process {
+                """
+                input[0] = [ [id:'test'], file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) ]
+                input[1] = BLAST_MAKEBLASTDB.out.db
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert path(process.out.txt[0][1]).getText().contains("Query= MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate") },
+                { assert snapshot(process.out.versions).match("versions") }
+            )
+        }
+
+    }
+
+    test("Should search for zipped nucleotide hits against a blast db") {
+
+        when {
+            params {
+                outdir = "$outputDir"
+            }
+            process {
+                """
+                input[0] = [ [id:'test'], file(params.test_data['sarscov2']['genome']['genome_fasta_gz'], checkIfExists: true) ]
+                input[1] = BLAST_MAKEBLASTDB.out.db
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert path(process.out.txt[0][1]).getText().contains("Query= MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate") },
+                { assert snapshot(process.out.versions).match("versions_zipped") }
+            )
+        }
+
+    }
+
+}
diff --git a/modules/nf-core/blast/blastn/tests/main.nf.test.snap b/modules/nf-core/blast/blastn/tests/main.nf.test.snap
new file mode 100644
index 00000000..d1b5f3f2
--- /dev/null
+++ b/modules/nf-core/blast/blastn/tests/main.nf.test.snap
@@ -0,0 +1,18 @@
+{
+    "versions": {
+        "content": [
+            [
+                "versions.yml:md5,2d5ffadc7035672f6a9e00b01d1751ea"
+            ]
+        ],
+        "timestamp": "2023-12-11T07:20:03.54997013"
+    },
+    "versions_zipped": {
+        "content": [
+            [
+                "versions.yml:md5,2d5ffadc7035672f6a9e00b01d1751ea"
+            ]
+        ],
+        "timestamp": "2023-12-11T07:20:12.925782708"
+    }
+}
\ No newline at end of file
diff --git a/modules/nf-core/blast/blastn/tests/nextflow.config b/modules/nf-core/blast/blastn/tests/nextflow.config
new file mode 100644
index 00000000..0899289b
--- /dev/null
+++ b/modules/nf-core/blast/blastn/tests/nextflow.config
@@ -0,0 +1,5 @@
+process {
+    withName: BLAST_MAKEBLASTDB {
+        ext.args = '-dbtype nucl'
+    }
+}
diff --git a/modules/nf-core/blast/blastn/tests/tags.yml b/modules/nf-core/blast/blastn/tests/tags.yml
new file mode 100644
index 00000000..b4588ab8
--- /dev/null
+++ b/modules/nf-core/blast/blastn/tests/tags.yml
@@ -0,0 +1,2 @@
+blast/blastn:
+  - modules/nf-core/blast/blastn/**
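Editor's note: the per-test nextflow.config above is how nf-core modules receive tool flags: anything bound to ext.args in a withName scope surfaces inside the module as task.ext.args. A generic sketch of the same mechanism; the flags and the prefix closure are illustrative, not from this patch:

    // In a config file: scoped, per-process arguments.
    process {
        withName: 'BLAST_BLASTN' {
            ext.args   = '-outfmt 6 -evalue 1e-25'   // illustrative flags
            ext.prefix = { "${meta.id}.blastn" }     // closure may use the task's meta
        }
    }

Inside the module, `def args = task.ext.args ?: ''` then splices those flags into the command line.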
diff --git a/modules/nf-core/busco/busco.diff b/modules/nf-core/busco/busco.diff
index 2aa7184a..0a402c4c 100644
--- a/modules/nf-core/busco/busco.diff
+++ b/modules/nf-core/busco/busco.diff
@@ -7,25 +7,15 @@ Changes in module 'nf-core/busco'
 +    tag "${meta.id}_${lineage}"
      label 'process_medium'
 
 -    conda "bioconda::busco=5.4.3"
-@@ -14,11 +14,13 @@
-     path config_file // Optional: busco configuration file
- 
-     output:
--    tuple val(meta), path("*-busco.batch_summary.txt"), emit: batch_summary
--    tuple val(meta), path("short_summary.*.txt")      , emit: short_summaries_txt, optional: true
--    tuple val(meta), path("short_summary.*.json")     , emit: short_summaries_json, optional: true
--    tuple val(meta), path("*-busco")                  , emit: busco_dir
--    path "versions.yml"                               , emit: versions
-+    tuple val(meta), path("*-busco.batch_summary.txt")      , emit: batch_summary
-+    tuple val(meta), path("short_summary.*.txt")            , emit: short_summaries_txt, optional: true
-+    tuple val(meta), path("short_summary.*.json")           , emit: short_summaries_json, optional: true
-+    tuple val(meta), path("*-busco")                        , emit: busco_dir
-+    tuple val(meta), path("*-busco/*/run_*/full_table.tsv") , emit: full_table, optional: true
-+    tuple val(meta), path("*-busco/*/run_*/busco_sequences"), emit: seq_dir, optional: true
-+    path "versions.yml"                                     , emit: versions
- 
-     when:
-     task.ext.when == null || task.ext.when
 +    conda "${moduleDir}/environment.yml"
+@@ -37,7 +37,7 @@
+     def prefix = task.ext.prefix ?: "${meta.id}-${lineage}"
+     def busco_config = config_file ? "--config $config_file" : ''
+     def busco_lineage = lineage.equals('auto') ? '--auto-lineage' : "--lineage_dataset ${lineage}"
+-    def busco_lineage_dir = busco_lineages_path ? "--download_path ${busco_lineages_path}" : ''
++    def busco_lineage_dir = busco_lineages_path ? "--download_path ${busco_lineages_path} --offline" : ''
+     """
+     # Nextflow changes the container --entrypoint to /bin/bash (container default entrypoint: /usr/local/env-execute)
+     # Check for container variable initialisation script and source it.
 
************************************************************
diff --git a/modules/nf-core/busco/environment.yml b/modules/nf-core/busco/environment.yml
new file mode 100644
index 00000000..f872d057
--- /dev/null
+++ b/modules/nf-core/busco/environment.yml
@@ -0,0 +1,7 @@
+name: busco
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::busco=5.5.0
diff --git a/modules/nf-core/busco/main.nf b/modules/nf-core/busco/main.nf
index 254ee9fd..867238cf 100644
--- a/modules/nf-core/busco/main.nf
+++ b/modules/nf-core/busco/main.nf
@@ -2,35 +2,42 @@ process BUSCO {
     tag "${meta.id}_${lineage}"
     label 'process_medium'
 
-    conda "bioconda::busco=5.4.3"
+    conda "${moduleDir}/environment.yml"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/busco:5.4.3--pyhdfd78af_0':
-        'biocontainers/busco:5.4.3--pyhdfd78af_0' }"
+        'https://depot.galaxyproject.org/singularity/busco:5.5.0--pyhdfd78af_0':
+        'biocontainers/busco:5.5.0--pyhdfd78af_0' }"
 
     input:
     tuple val(meta), path('tmp_input/*')
+    val mode                              // Required: One of genome, proteins, or transcriptome
     val lineage                           // Required: lineage to check against, "auto" enables --auto-lineage instead
     path busco_lineages_path              // Recommended: path to busco lineages - downloads if not set
     path config_file                      // Optional: busco configuration file
 
     output:
-    tuple val(meta), path("*-busco.batch_summary.txt")      , emit: batch_summary
-    tuple val(meta), path("short_summary.*.txt")            , emit: short_summaries_txt, optional: true
-    tuple val(meta), path("short_summary.*.json")           , emit: short_summaries_json, optional: true
-    tuple val(meta), path("*-busco")                        , emit: busco_dir
-    tuple val(meta), path("*-busco/*/run_*/full_table.tsv") , emit: full_table, optional: true
-    tuple val(meta), path("*-busco/*/run_*/busco_sequences"), emit: seq_dir, optional: true
-    path "versions.yml"                                     , emit: versions
+    tuple val(meta), path("*-busco.batch_summary.txt")               , emit: batch_summary
+    tuple val(meta), path("short_summary.*.txt")                     , emit: short_summaries_txt, optional: true
+    tuple val(meta), path("short_summary.*.json")                    , emit: short_summaries_json, optional: true
+    tuple val(meta), path("*-busco/*/run_*/full_table.tsv")          , emit: full_table, optional: true
+    tuple val(meta), path("*-busco/*/run_*/missing_busco_list.tsv")  , emit: missing_busco_list, optional: true
+    tuple val(meta), path("*-busco/*/run_*/single_copy_proteins.faa"), emit: single_copy_proteins, optional: true
+    tuple val(meta), path("*-busco/*/run_*/busco_sequences")         , emit: seq_dir
+    tuple val(meta), path("*-busco/*/translated_proteins")           , emit: translated_dir, optional: true
+    tuple val(meta), path("*-busco")                                 , emit: busco_dir
+    path "versions.yml"                                              , emit: versions
 
     when:
     task.ext.when == null || task.ext.when
 
     script:
+    if ( mode !in [ 'genome', 'proteins', 'transcriptome' ] ) {
+        error "Mode must be one of 'genome', 'proteins', or 'transcriptome'."
+    }
     def args = task.ext.args ?: ''
     def prefix = task.ext.prefix ?: "${meta.id}-${lineage}"
     def busco_config = config_file ? "--config $config_file" : ''
     def busco_lineage = lineage.equals('auto') ? '--auto-lineage' : "--lineage_dataset ${lineage}"
-    def busco_lineage_dir = busco_lineages_path ? "--offline --download_path ${busco_lineages_path}" : ''
+    def busco_lineage_dir = busco_lineages_path ? "--download_path ${busco_lineages_path} --offline" : ''
     """
     # Nextflow changes the container --entrypoint to /bin/bash (container default entrypoint: /usr/local/env-execute)
     # Check for container variable initialisation script and source it.
@@ -66,6 +73,7 @@ process BUSCO {
         --cpu $task.cpus \\
         --in "\$INPUT_SEQS" \\
         --out ${prefix}-busco \\
+        --mode $mode \\
         $busco_lineage \\
         $busco_lineage_dir \\
         $busco_config \\
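Editor's note: the patched BUSCO module validates `mode` in Groovy before the task script is ever rendered, so a bad value fails fast instead of producing a cryptic BUSCO error. A minimal call sketch; the lineage, channel contents, and workflow name are illustrative:

    include { BUSCO } from '../modules/nf-core/busco/main'

    workflow DEMO {
        ch_fasta = Channel.of( [ [id:'test'], file('assembly.fasta') ] )

        // 'genome' passes the guard; anything else raises
        // "Mode must be one of 'genome', 'proteins', or 'transcriptome'."
        BUSCO ( ch_fasta, 'genome', 'bacteria_odb10', [], [] )
    }

Note also that when busco_lineages_path is supplied, the module now always pairs --download_path with --offline, so pre-staged lineages are never re-downloaded.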
diff --git a/modules/nf-core/busco/meta.yml b/modules/nf-core/busco/meta.yml
index ef8c5245..90b30d4d 100644
--- a/modules/nf-core/busco/meta.yml
+++ b/modules/nf-core/busco/meta.yml
@@ -13,7 +13,6 @@ tools:
       tool_dev_url: https://gitlab.com/ezlab/busco
       doi: "10.1007/978-1-4939-9173-0_14"
       licence: ["MIT"]
-
 input:
   - meta:
       type: map
@@ -24,8 +23,12 @@ input:
       type: file
      description: Nucleic or amino acid sequence file in FASTA format.
       pattern: "*.{fasta,fna,fa,fasta.gz,fna.gz,fa.gz}"
+  - mode:
+      type: string
+      description: The mode to run Busco in. One of genome, proteins, or transcriptome
+      pattern: "{genome,proteins,transcriptome}"
   - lineage:
-      type: value
+      type: string
       description: The BUSCO lineage to use, or "auto" to automatically select lineage
   - busco_lineages_path:
       type: directory
@@ -33,7 +36,6 @@ input:
   - config_file:
       type: file
       description: Path to BUSCO config file.
-
 output:
   - meta:
       type: map
@@ -56,14 +58,39 @@ output:
       type: directory
       description: BUSCO lineage specific output
       pattern: "*-busco"
+  - full_table:
+      type: file
+      description: Full BUSCO results table
+      pattern: "full_table.tsv"
+  - missing_busco_list:
+      type: file
+      description: List of missing BUSCOs
+      pattern: "missing_busco_list.tsv"
+  - single_copy_proteins:
+      type: file
+      description: Fasta file of single copy proteins (transcriptome mode)
+      pattern: "single_copy_proteins.faa"
+  - seq_dir:
+      type: directory
+      description: BUSCO sequence directory
+      pattern: "busco_sequences"
+  - translated_proteins:
+      type: directory
+      description: Six frame translations of each transcript made by the transcriptome mode
+      pattern: "translated_proteins"
   - versions:
       type: file
       description: File containing software versions
       pattern: "versions.yml"
-
 authors:
   - "@priyanka-surana"
   - "@charles-plessy"
   - "@mahesh-panchal"
   - "@muffato"
   - "@jvhagey"
+maintainers:
+  - "@priyanka-surana"
+  - "@charles-plessy"
+  - "@mahesh-panchal"
+  - "@muffato"
+  - "@jvhagey"
diff --git a/modules/nf-core/custom/dumpsoftwareversions/environment.yml b/modules/nf-core/custom/dumpsoftwareversions/environment.yml
new file mode 100644
index 00000000..f0c63f69
--- /dev/null
+++ b/modules/nf-core/custom/dumpsoftwareversions/environment.yml
@@ -0,0 +1,7 @@
+name: custom_dumpsoftwareversions
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::multiqc=1.17
diff --git a/modules/nf-core/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf
index ebc87273..7685b33c 100644
--- a/modules/nf-core/custom/dumpsoftwareversions/main.nf
+++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf
@@ -2,10 +2,10 @@ process CUSTOM_DUMPSOFTWAREVERSIONS {
     label 'process_single'
 
     // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container
-    conda "bioconda::multiqc=1.14"
+    conda "${moduleDir}/environment.yml"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/multiqc:1.14--pyhdfd78af_0' :
-        'biocontainers/multiqc:1.14--pyhdfd78af_0' }"
+        'https://depot.galaxyproject.org/singularity/multiqc:1.17--pyhdfd78af_0' :
+        'biocontainers/multiqc:1.17--pyhdfd78af_0' }"
 
     input:
     path versions
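Editor's note, for context rather than part of the patch: in the nf-core template every process emits a versions.yml, the workflow mixes them into one channel, and CUSTOM_DUMPSOFTWAREVERSIONS collates that into the file MultiQC picks up. A sketch of the standard pattern, using module names from this changeset:

    ch_versions = Channel.empty()
    ch_versions = ch_versions.mix ( BLAST_BLASTN.out.versions.first() )
    ch_versions = ch_versions.mix ( BUSCO.out.versions.first() )

    CUSTOM_DUMPSOFTWAREVERSIONS (
        ch_versions.unique().collectFile ( name: 'collated_versions.yml' )
    )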
diff --git a/modules/nf-core/custom/dumpsoftwareversions/meta.yml b/modules/nf-core/custom/dumpsoftwareversions/meta.yml
index c32657de..5f15a5fd 100644
--- a/modules/nf-core/custom/dumpsoftwareversions/meta.yml
+++ b/modules/nf-core/custom/dumpsoftwareversions/meta.yml
@@ -1,4 +1,4 @@
-# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
 name: custom_dumpsoftwareversions
 description: Custom module used to dump software versions within the nf-core pipeline template
 keywords:
@@ -16,7 +16,6 @@ input:
       type: file
       description: YML file containing software versions
       pattern: "*.yml"
-
 output:
   - yml:
       type: file
@@ -30,7 +29,9 @@ output:
       type: file
       description: File containing software versions
       pattern: "versions.yml"
-
 authors:
   - "@drpatelh"
   - "@grst"
+maintainers:
+  - "@drpatelh"
+  - "@grst"
diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test
new file mode 100644
index 00000000..eec1db10
--- /dev/null
+++ b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test
@@ -0,0 +1,38 @@
+nextflow_process {
+
+    name "Test Process CUSTOM_DUMPSOFTWAREVERSIONS"
+    script "../main.nf"
+    process "CUSTOM_DUMPSOFTWAREVERSIONS"
+    tag "modules"
+    tag "modules_nfcore"
+    tag "custom"
+    tag "dumpsoftwareversions"
+    tag "custom/dumpsoftwareversions"
+
+    test("Should run without failures") {
+        when {
+            process {
+                """
+                def tool1_version = '''
+                    TOOL1:
+                        tool1: 0.11.9
+                '''.stripIndent()
+
+                def tool2_version = '''
+                    TOOL2:
+                        tool2: 1.9
+                '''.stripIndent()
+
+                input[0] = Channel.of(tool1_version, tool2_version).collectFile()
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+    }
+}
diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap
new file mode 100644
index 00000000..4274ed57
--- /dev/null
+++ b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap
@@ -0,0 +1,27 @@
+{
+    "Should run without failures": {
+        "content": [
+            {
+                "0": [
+                    "software_versions.yml:md5,1c851188476409cda5752ce971b20b58"
+                ],
+                "1": [
+                    "software_versions_mqc.yml:md5,2570f4ba271ad08357b0d3d32a9cf84d"
+                ],
+                "2": [
+                    "versions.yml:md5,3843ac526e762117eedf8825b40683df"
+                ],
+                "mqc_yml": [
+                    "software_versions_mqc.yml:md5,2570f4ba271ad08357b0d3d32a9cf84d"
+                ],
+                "versions": [
+                    "versions.yml:md5,3843ac526e762117eedf8825b40683df"
+                ],
+                "yml": [
+                    "software_versions.yml:md5,1c851188476409cda5752ce971b20b58"
+                ]
+            }
+        ],
+        "timestamp": "2023-11-03T14:43:22.157011"
+    }
+}
diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/tags.yml b/modules/nf-core/custom/dumpsoftwareversions/tests/tags.yml
new file mode 100644
index 00000000..405aa24a
--- /dev/null
+++ b/modules/nf-core/custom/dumpsoftwareversions/tests/tags.yml
@@ -0,0 +1,2 @@
+custom/dumpsoftwareversions:
+  - modules/nf-core/custom/dumpsoftwareversions/**
diff --git a/modules/nf-core/diamond/blastp/environment.yml b/modules/nf-core/diamond/blastp/environment.yml
new file mode 100644
index 00000000..922ea7ed
--- /dev/null
+++ b/modules/nf-core/diamond/blastp/environment.yml
@@ -0,0 +1,7 @@
+name: diamond_blastp
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::diamond=2.1.8
diff --git a/modules/nf-core/diamond/blastp/main.nf b/modules/nf-core/diamond/blastp/main.nf
index 02af8886..dc01cdcc 100644
--- a/modules/nf-core/diamond/blastp/main.nf
+++ b/modules/nf-core/diamond/blastp/main.nf
@@ -2,14 +2,14 @@ process DIAMOND_BLASTP {
     tag "$meta.id"
     label 'process_medium'
 
-    conda "bioconda::diamond=2.0.15"
+    conda "${moduleDir}/environment.yml"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/diamond:2.0.15--hb97b32f_0' :
-        'biocontainers/diamond:2.0.15--hb97b32f_0' }"
+        'https://depot.galaxyproject.org/singularity/diamond:2.1.8--h43eeafb_0' :
+        'biocontainers/diamond:2.1.8--h43eeafb_0' }"
 
     input:
-    tuple val(meta), path(fasta)
-    path db
+    tuple val(meta) , path(fasta)
+    tuple val(meta2), path(db)
     val out_ext
     val blast_columns
 
@@ -21,7 +21,7 @@ process DIAMOND_BLASTP {
     tuple val(meta), path('*.sam') , optional: true, emit: sam
     tuple val(meta), path('*.tsv') , optional: true, emit: tsv
     tuple val(meta), path('*.paf') , optional: true, emit: paf
-    path "versions.yml" , emit: versions
+    path "versions.yml"            , emit: versions
 
     when:
     task.ext.when == null || task.ext.when
@@ -29,6 +29,8 @@ process DIAMOND_BLASTP {
     script:
     def args = task.ext.args ?: ''
     def prefix = task.ext.prefix ?: "${meta.id}"
+    def is_compressed = fasta.getExtension() == "gz" ? true : false
+    def fasta_name = is_compressed ? fasta.getBaseName() : fasta
     def columns = blast_columns ? "${blast_columns}" : ''
     switch ( out_ext ) {
         case "blast": outfmt = 0; break
@@ -45,15 +47,19 @@ process DIAMOND_BLASTP {
             break
     }
     """
+    if [ "${is_compressed}" == "true" ]; then
+        gzip -c -d ${fasta} > ${fasta_name}
+    fi
+
     DB=`find -L ./ -name "*.dmnd" | sed 's/\\.dmnd\$//'`
 
     diamond \\
         blastp \\
-        --threads $task.cpus \\
+        --threads ${task.cpus} \\
         --db \$DB \\
-        --query $fasta \\
+        --query ${fasta_name} \\
         --outfmt ${outfmt} ${columns} \\
-        $args \\
+        ${args} \\
         --out ${prefix}.${out_ext}
 
     cat <<-END_VERSIONS > versions.yml
@@ -61,4 +67,31 @@ process DIAMOND_BLASTP {
         diamond: \$(diamond --version 2>&1 | tail -n 1 | sed 's/^diamond version //')
     END_VERSIONS
     """
+
+    stub:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    switch ( out_ext ) {
+        case "blast": outfmt = 0; break
+        case "xml": outfmt = 5; break
+        case "txt": outfmt = 6; break
+        case "daa": outfmt = 100; break
+        case "sam": outfmt = 101; break
+        case "tsv": outfmt = 102; break
+        case "paf": outfmt = 103; break
+        default:
+            outfmt = '6';
+            out_ext = 'txt';
+            log.warn("Unknown output file format provided (${out_ext}): selecting DIAMOND default of tabular BLAST output (txt)");
+            break
+    }
+
+    """
+    touch ${prefix}.${out_ext}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        diamond: \$(diamond --version 2>&1 | tail -n 1 | sed 's/^diamond version //')
+    END_VERSIONS
+    """
 }
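Editor's note: the out_ext value drives the Groovy switch that picks DIAMOND's --outfmt code, and blast_columns narrows the tabular columns when out_ext is 'txt'. A minimal call sketch; channel contents and the column list are illustrative:

    include { DIAMOND_BLASTP } from '../modules/nf-core/diamond/blastp/main'

    workflow DEMO {
        ch_proteins = Channel.of( [ [id:'test'],    file('proteins.fasta.gz') ] )
        ch_db       = Channel.of( [ [id:'uniprot'], file('uniprot.dmnd') ] )

        // 'txt' selects --outfmt 6; an unrecognised extension falls back
        // to txt with a log.warn, as the default case above shows.
        DIAMOND_BLASTP ( ch_proteins, ch_db, 'txt', 'qseqid sseqid pident evalue' )
    }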
diff --git a/modules/nf-core/diamond/blastp/meta.yml b/modules/nf-core/diamond/blastp/meta.yml
index 5bf35791..bab6801e 100644
--- a/modules/nf-core/diamond/blastp/meta.yml
+++ b/modules/nf-core/diamond/blastp/meta.yml
@@ -13,7 +13,6 @@ tools:
       tool_dev_url: https://github.com/bbuchfink/diamond
       doi: "10.1038/s41592-021-01101-x"
       licence: ["GPL v3.0"]
-
 input:
   - meta:
       type: map
@@ -23,11 +22,16 @@ input:
   - fasta:
       type: file
       description: Input fasta file containing query sequences
-      pattern: "*.{fa,fasta}"
+      pattern: "*.{fa,fasta,fa.gz,fasta.gz}"
+  - meta2:
+      type: map
+      description: |
+        Groovy Map containing db information
+        e.g. [ id:'test2', single_end:false ]
   - db:
-      type: directory
-      description: Directory containing the protein blast database
-      pattern: "*"
+      type: file
+      description: File of the indexed DIAMOND database
+      pattern: "*.dmnd"
   - out_ext:
       type: string
       description: |
@@ -40,10 +44,14 @@ input:
       type: string
       description: |
        Optional space separated list of DIAMOND tabular BLAST output keywords
-        used for in conjunction with the 'txt' out_ext option (--outfmt 6). See
-        DIAMOND documnetation for more information.
-
+        used in conjunction with the 'txt' out_ext option (--outfmt 6). Options:
+        qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore
 output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
   - blast:
       type: file
       description: File containing blastp hits
@@ -76,7 +84,10 @@ output:
       type: file
       description: File containing software versions
       pattern: "versions.yml"
-
 authors:
   - "@spficklin"
   - "@jfy133"
+maintainers:
+  - "@spficklin"
+  - "@jfy133"
+  - "@vagkaratzas"
diff --git a/modules/nf-core/diamond/blastp/tests/main.nf.test b/modules/nf-core/diamond/blastp/tests/main.nf.test
new file mode 100644
index 00000000..672bf050
--- /dev/null
+++ b/modules/nf-core/diamond/blastp/tests/main.nf.test
@@ -0,0 +1,103 @@
+nextflow_process {
+
+    name "Test Process DIAMOND_BLASTP"
+    script "../main.nf"
+    process "DIAMOND_BLASTP"
+    tag "modules"
+    tag "modules_nfcore"
+    tag "diamond"
+    tag "diamond/blastp"
+
+    setup {
+        run("DIAMOND_MAKEDB") {
+            script "../../makedb/main.nf"
+            process {
+                """
+                input[0] = [ [id:'test2'], [ file(params.test_data['sarscov2']['genome']['proteome_fasta'], checkIfExists: true) ] ]
+                input[1] = []
+                input[2] = []
+                input[3] = []
+                """
+            }
+        }
+    }
+
+    test("Should search for protein hits against a DIAMOND db and return a tab separated output file of hits") {
+
+        when {
+            params {
+                outdir = "$outputDir"
+            }
+            process {
+                """
+                input[0] = [ [id:'test'], file(params.test_data['sarscov2']['genome']['proteome_fasta'], checkIfExists: true) ]
+                input[1] = DIAMOND_MAKEDB.out.db
+                input[2] = 'txt'
+                input[3] = 'qseqid qlen'
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out.txt).match("txt") },
+                { assert process.out.versions }
+            )
+        }
+
+    }
+
+    test("Should search for zipped protein hits against a DIAMOND db and return a tab separated output file of hits") {
+
+        when {
+            params {
+                outdir = "$outputDir"
+            }
+            process {
+                """
+                input[0] = [ [id:'test'], file(params.test_data['sarscov2']['genome']['proteome_fasta_gz'], checkIfExists: true) ]
+                input[1] = DIAMOND_MAKEDB.out.db
+                input[2] = 'txt'
+                input[3] = 'qseqid qlen'
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out.txt).match("gz_txt") },
+                { assert process.out.versions }
+            )
+        }
+
+    }
+
+    test("Should search for protein hits against a DIAMOND db and return a daa format file of hits") {
+
+        when {
+            params {
+                outdir = "$outputDir"
+            }
+            process {
+                """
+                input[0] = [ [id:'test'], file(params.test_data['sarscov2']['genome']['proteome_fasta'], checkIfExists: true) ]
+                input[1] = DIAMOND_MAKEDB.out.db
+                input[2] = 'daa'
+                input[3] = []
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert process.out.daa },
+                { assert process.out.versions }
+            )
+        }
+
+    }
+
+}
diff --git a/modules/nf-core/diamond/blastp/tests/main.nf.test.snap b/modules/nf-core/diamond/blastp/tests/main.nf.test.snap
new file mode 100644
index 00000000..83575bc1
--- /dev/null
+++ b/modules/nf-core/diamond/blastp/tests/main.nf.test.snap
@@ -0,0 +1,28 @@
+{
+    "txt": {
+        "content": [
+            [
+                [
+                    {
+                        "id": "test"
+                    },
+                    "test.txt:md5,8131b1afd717f3d5f2f2417c5b562e6e"
+                ]
+            ]
+        ],
+        "timestamp": "2023-11-07T10:27:02.453987512"
+    },
+    "gz_txt": {
+        "content": [
+            [
+                [
+                    {
+                        "id": "test"
+                    },
+                    "test.txt:md5,8131b1afd717f3d5f2f2417c5b562e6e"
+                ]
+            ]
+        ],
+        "timestamp": "2023-11-07T09:41:13.934994026"
+    }
+}
\ No newline at end of file
diff --git a/modules/nf-core/diamond/blastp/tests/tags.yml b/modules/nf-core/diamond/blastp/tests/tags.yml
new file mode 100644
index 00000000..d894dab6
--- /dev/null
+++ b/modules/nf-core/diamond/blastp/tests/tags.yml
@@ -0,0 +1,3 @@
+diamond/blastp:
+  - modules/nf-core/diamond/blastp/**
+  - modules/nf-core/diamond/makedb/**
diff --git a/modules/nf-core/diamond/blastx/environment.yml b/modules/nf-core/diamond/blastx/environment.yml
new file mode 100644
index 00000000..70d6857d
--- /dev/null
+++ b/modules/nf-core/diamond/blastx/environment.yml
@@ -0,0 +1,7 @@
+name: diamond_blastx
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::diamond=2.1.8
diff --git a/modules/nf-core/diamond/blastx/main.nf b/modules/nf-core/diamond/blastx/main.nf
new file mode 100644
index 00000000..bf3f623c
--- /dev/null
+++ b/modules/nf-core/diamond/blastx/main.nf
@@ -0,0 +1,102 @@
+process DIAMOND_BLASTX {
+    tag "$meta.id"
+    label 'process_high'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/diamond:2.1.8--h43eeafb_0' :
+        'biocontainers/diamond:2.1.8--h43eeafb_0' }"
+
+    input:
+    tuple val(meta) , path(fasta)
+    tuple val(meta2), path(db)
+    val out_ext
+    val blast_columns
+
+    output:
+    tuple val(meta), path('*.blast'), optional: true, emit: blast
+    tuple val(meta), path('*.xml')  , optional: true, emit: xml
+    tuple val(meta), path('*.txt')  , optional: true, emit: txt
+    tuple val(meta), path('*.daa')  , optional: true, emit: daa
+    tuple val(meta), path('*.sam')  , optional: true, emit: sam
+    tuple val(meta), path('*.tsv')  , optional: true, emit: tsv
+    tuple val(meta), path('*.paf')  , optional: true, emit: paf
+    tuple val(meta), path("*.log")  , emit: log
+    path "versions.yml"             , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def is_compressed = fasta.getExtension() == "gz" ? true : false
+    def fasta_name = is_compressed ? fasta.getBaseName() : fasta
"${blast_columns}" : '' + switch ( out_ext ) { + case "blast": outfmt = 0; break + case "xml": outfmt = 5; break + case "txt": outfmt = 6; break + case "daa": outfmt = 100; break + case "sam": outfmt = 101; break + case "tsv": outfmt = 102; break + case "paf": outfmt = 103; break + default: + outfmt = '6'; + out_ext = 'txt'; + log.warn("Unknown output file format provided (${out_ext}): selecting DIAMOND default of tabular BLAST output (txt)"); + break + } + """ + if [ "${is_compressed}" == "true" ]; then + gzip -c -d ${fasta} > ${fasta_name} + fi + + DB=`find -L ./ -name "*.dmnd" | sed 's/\\.dmnd\$//'` + + diamond \\ + blastx \\ + --threads ${task.cpus} \\ + --db \$DB \\ + --query ${fasta_name} \\ + --outfmt ${outfmt} ${columns} \\ + ${args} \\ + --out ${prefix}.${out_ext} \\ + --log + + mv diamond.log ${prefix}.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + diamond: \$(diamond --version 2>&1 | tail -n 1 | sed 's/^diamond version //') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + switch ( out_ext ) { + case "blast": outfmt = 0; break + case "xml": outfmt = 5; break + case "txt": outfmt = 6; break + case "daa": outfmt = 100; break + case "sam": outfmt = 101; break + case "tsv": outfmt = 102; break + case "paf": outfmt = 103; break + default: + outfmt = '6'; + out_ext = 'txt'; + log.warn("Unknown output file format provided (${out_ext}): selecting DIAMOND default of tabular BLAST output (txt)"); + break + } + + """ + touch ${prefix}.${out_ext} + touch ${prefix}.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + diamond: \$(diamond --version 2>&1 | tail -n 1 | sed 's/^diamond version //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/diamond/blastx/meta.yml b/modules/nf-core/diamond/blastx/meta.yml new file mode 100644 index 00000000..17106548 --- /dev/null +++ b/modules/nf-core/diamond/blastx/meta.yml @@ -0,0 +1,99 @@ +name: diamond_blastx +description: Queries a DIAMOND database using blastx mode +keywords: + - fasta + - diamond + - blastx + - DNA sequence +tools: + - diamond: + description: Accelerated BLAST compatible local sequence aligner + homepage: https://github.com/bbuchfink/diamond + documentation: https://github.com/bbuchfink/diamond/wiki + tool_dev_url: https://github.com/bbuchfink/diamond + doi: "10.1038/s41592-021-01101-x" + licence: ["GPL v3.0"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Input fasta file containing query sequences + pattern: "*.{fa,fasta,fa.gz,fasta.gz}" + - meta2: + type: map + description: | + Groovy Map containing db information + e.g. [ id:'test2', single_end:false ] + - db: + type: file + description: File of the indexed DIAMOND database + pattern: "*.dmnd" + - out_ext: + type: string + description: | + Specify the type of output file to be generated. `blast` corresponds to + BLAST pairwise format. `xml` corresponds to BLAST xml format. + `txt` corresponds to to BLAST tabular format. `tsv` corresponds to + taxonomic classification format. + pattern: "blast|xml|txt|daa|sam|tsv|paf" + - blast_columns: + type: string + description: | + Optional space separated list of DIAMOND tabular BLAST output keywords + used for in conjunction with the 'txt' out_ext option (--outfmt 6). 
diff --git a/modules/nf-core/diamond/blastx/meta.yml b/modules/nf-core/diamond/blastx/meta.yml
new file mode 100644
index 00000000..17106548
--- /dev/null
+++ b/modules/nf-core/diamond/blastx/meta.yml
@@ -0,0 +1,99 @@
+name: diamond_blastx
+description: Queries a DIAMOND database using blastx mode
+keywords:
+  - fasta
+  - diamond
+  - blastx
+  - DNA sequence
+tools:
+  - diamond:
+      description: Accelerated BLAST compatible local sequence aligner
+      homepage: https://github.com/bbuchfink/diamond
+      documentation: https://github.com/bbuchfink/diamond/wiki
+      tool_dev_url: https://github.com/bbuchfink/diamond
+      doi: "10.1038/s41592-021-01101-x"
+      licence: ["GPL v3.0"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - fasta:
+      type: file
+      description: Input fasta file containing query sequences
+      pattern: "*.{fa,fasta,fa.gz,fasta.gz}"
+  - meta2:
+      type: map
+      description: |
+        Groovy Map containing db information
+        e.g. [ id:'test2', single_end:false ]
+  - db:
+      type: file
+      description: File of the indexed DIAMOND database
+      pattern: "*.dmnd"
+  - out_ext:
+      type: string
+      description: |
+        Specify the type of output file to be generated. `blast` corresponds to
+        BLAST pairwise format. `xml` corresponds to BLAST xml format.
+        `txt` corresponds to BLAST tabular format. `tsv` corresponds to
+        taxonomic classification format.
+      pattern: "blast|xml|txt|daa|sam|tsv|paf"
+  - blast_columns:
+      type: string
+      description: |
+        Optional space separated list of DIAMOND tabular BLAST output keywords
+        used in conjunction with the 'txt' out_ext option (--outfmt 6). Options:
+        qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - blast:
+      type: file
+      description: File containing blastx hits
+      pattern: "*.{blast}"
+  - xml:
+      type: file
+      description: File containing blastx hits
+      pattern: "*.{xml}"
+  - txt:
+      type: file
+      description: File containing hits in tabular BLAST format.
+      pattern: "*.{txt}"
+  - daa:
+      type: file
+      description: File containing hits in DAA format
+      pattern: "*.{daa}"
+  - sam:
+      type: file
+      description: File containing aligned reads in SAM format
+      pattern: "*.{sam}"
+  - tsv:
+      type: file
+      description: Tab separated file containing taxonomic classification of hits
+      pattern: "*.{tsv}"
+  - paf:
+      type: file
+      description: File containing aligned reads in pairwise mapping format (PAF)
+      pattern: "*.{paf}"
+  - log:
+      type: file
+      description: Log file containing stdout information
+      pattern: "*.{log}"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@spficklin"
+  - "@jfy133"
+  - "@mjamy"
+maintainers:
+  - "@spficklin"
+  - "@jfy133"
+  - "@mjamy"
+  - "@vagkaratzas"
diff --git a/modules/nf-core/diamond/blastx/tests/main.nf.test b/modules/nf-core/diamond/blastx/tests/main.nf.test
new file mode 100644
index 00000000..a367f883
--- /dev/null
+++ b/modules/nf-core/diamond/blastx/tests/main.nf.test
@@ -0,0 +1,78 @@
+nextflow_process {
+
+    name "Test Process DIAMOND_BLASTX"
+    script "../main.nf"
+    process "DIAMOND_BLASTX"
+    tag "modules"
+    tag "modules_nfcore"
+    tag "diamond"
+    tag "diamond/blastx"
+
+    setup {
+        run("DIAMOND_MAKEDB") {
+            script "../../makedb/main.nf"
+            process {
+                """
+                input[0] = [ [id:'test2'], [ file(params.test_data['sarscov2']['genome']['proteome_fasta'], checkIfExists: true) ] ]
+                input[1] = []
+                input[2] = []
+                input[3] = []
+                """
+            }
+        }
+    }
+
+    test("Should search for transcriptome hits against a DIAMOND db and return the default tab separated output file of hits") {
+
+        when {
+            params {
+                outdir = "$outputDir"
+            }
+            process {
+                """
+                input[0] = [ [id:'test'], file(params.test_data['sarscov2']['genome']['transcriptome_fasta'], checkIfExists: true) ]
+                input[1] = DIAMOND_MAKEDB.out.db
+                input[2] = 'tfdfdt' // Nonsense file extension to check default case.
+                input[3] = 'qseqid qlen'
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out.txt).match("txt") },
+                { assert path(process.out.log.get(0).get(1)).readLines().contains("11 queries aligned.") },
+                { assert process.out.versions }
+            )
+        }
+
+    }
+
+    test("Should search for transcriptome hits against a DIAMOND db and return the daa format output file of hits") {
+
+        when {
+            params {
+                outdir = "$outputDir"
+            }
+            process {
+                """
+                input[0] = [ [id:'test'], file(params.test_data['sarscov2']['genome']['transcriptome_fasta'], checkIfExists: true) ]
+                input[1] = DIAMOND_MAKEDB.out.db
+                input[2] = 'daa'
+                input[3] = []
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert process.out.daa },
+                { assert path(process.out.log.get(0).get(1)).readLines().contains("11 queries aligned.") },
+                { assert process.out.versions }
+            )
+        }
+
+    }
+}
diff --git a/modules/nf-core/diamond/blastx/tests/main.nf.test.snap b/modules/nf-core/diamond/blastx/tests/main.nf.test.snap
new file mode 100644
index 00000000..27fb0a31
--- /dev/null
+++ b/modules/nf-core/diamond/blastx/tests/main.nf.test.snap
@@ -0,0 +1,15 @@
+{
+    "txt": {
+        "content": [
+            [
+                [
+                    {
+                        "id": "test"
+                    },
+                    "test.txt:md5,33dc682dabfa44c7089abbc8fe8b84e4"
+                ]
+            ]
+        ],
+        "timestamp": "2023-11-07T09:42:36.646074348"
+    }
+}
\ No newline at end of file
diff --git a/modules/nf-core/diamond/blastx/tests/tags.yml b/modules/nf-core/diamond/blastx/tests/tags.yml
new file mode 100644
index 00000000..a0e2964b
--- /dev/null
+++ b/modules/nf-core/diamond/blastx/tests/tags.yml
@@ -0,0 +1,3 @@
+diamond/blastx:
+  - modules/nf-core/diamond/blastx/**
+  - modules/nf-core/diamond/makedb/**
diff --git a/modules/nf-core/fastawindows/environment.yml b/modules/nf-core/fastawindows/environment.yml
new file mode 100644
index 00000000..ce557158
--- /dev/null
+++ b/modules/nf-core/fastawindows/environment.yml
@@ -0,0 +1,7 @@
+name: fastawindows
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::fasta_windows=0.2.4
diff --git a/modules/nf-core/fastawindows/main.nf b/modules/nf-core/fastawindows/main.nf
index c65a051e..03cc8c57 100644
--- a/modules/nf-core/fastawindows/main.nf
+++ b/modules/nf-core/fastawindows/main.nf
@@ -2,7 +2,7 @@ process FASTAWINDOWS {
     tag "$meta.id"
     label 'process_low'
 
-    conda "bioconda::fasta_windows=0.2.4"
+    conda "${moduleDir}/environment.yml"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/fasta_windows:0.2.4--hec16e2b_0':
         'biocontainers/fasta_windows:0.2.4--hec16e2b_0' }"
diff --git a/modules/nf-core/fastawindows/meta.yml b/modules/nf-core/fastawindows/meta.yml
index 9342af96..494cc1b6 100644
--- a/modules/nf-core/fastawindows/meta.yml
+++ b/modules/nf-core/fastawindows/meta.yml
@@ -9,9 +9,7 @@ tools:
   - "fastawindows":
      description: "fasta_windows is a tool written for Darwin Tree of Life chromosomal level genome assemblies. The executable takes a fasta formatted file and calculates some statistics of interest in windows"
       homepage: "https://github.com/tolkit/fasta_windows"
-      licence: "['MIT']"
-
 input:
   - meta:
       type: map
@@ -22,7 +20,6 @@ input:
       type: file
       description: FASTA file
       pattern: "*.{fa,fasta,fna}"
-
 output:
   - meta:
       type: map
@@ -55,3 +52,5 @@ output:
       pattern: "*.{tsv}"
 authors:
   - "@muffato"
+maintainers:
+  - "@muffato"
diff --git a/modules/nf-core/goat/taxonsearch/environment.yml b/modules/nf-core/goat/taxonsearch/environment.yml
new file mode 100644
index 00000000..e56e71f1
--- /dev/null
+++ b/modules/nf-core/goat/taxonsearch/environment.yml
@@ -0,0 +1,7 @@
+name: goat_taxonsearch
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::goat=0.2.5
diff --git a/modules/nf-core/goat/taxonsearch/main.nf b/modules/nf-core/goat/taxonsearch/main.nf
index 1b0e8ba3..62c12baa 100644
--- a/modules/nf-core/goat/taxonsearch/main.nf
+++ b/modules/nf-core/goat/taxonsearch/main.nf
@@ -2,10 +2,10 @@ process GOAT_TAXONSEARCH {
     tag "$meta.id"
     label 'process_single'
 
-    conda "bioconda::goat=0.2.0"
+    conda "${moduleDir}/environment.yml"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/goat:0.2.0--h92d785c_0':
-        'biocontainers/goat:0.2.0--h92d785c_0' }"
+        'https://depot.galaxyproject.org/singularity/goat:0.2.5--h9d3141d_2':
+        'biocontainers/goat:0.2.5--h9d3141d_2' }"
 
     input:
     tuple val(meta), val(taxon), path(taxa_file)
diff --git a/modules/nf-core/goat/taxonsearch/meta.yml b/modules/nf-core/goat/taxonsearch/meta.yml
index 06c374f0..1bb19e30 100644
--- a/modules/nf-core/goat/taxonsearch/meta.yml
+++ b/modules/nf-core/goat/taxonsearch/meta.yml
@@ -12,9 +12,7 @@ tools:
       homepage: https://github.com/genomehubs/goat-cli
       documentation: https://github.com/genomehubs/goat-cli/wiki
       tool_dev_url: https://genomehubs.github.io/goat-cli/goat_cli/
-      licence: ["MIT"]
-
 input:
   - meta:
       type: map
@@ -22,7 +20,7 @@ input:
        Groovy Map containing sample information
        e.g. [ id:'test']
   - taxon:
-      type: val
+      type: string
       description: |
        The taxon to search. An NCBI taxon ID, or the name of a taxon at any rank.
   - taxa_file:
@@ -31,7 +29,6 @@ input:
        A file of NCBI taxonomy ID's (tips) and/or binomial names. Each line
        should contain a single entry.File size is limited to 500 entries.
       pattern: "*.txt"
-
 output:
   - meta:
       type: map
@@ -46,6 +43,8 @@ output:
       type: file
       description: TSV file containing search results.
       pattern: "*.tsv"
-
 authors:
   - "@alxndrdiaz"
+maintainers:
+  - "@alxndrdiaz"
+  - "@muffato"
diff --git a/modules/nf-core/gunzip/environment.yml b/modules/nf-core/gunzip/environment.yml
new file mode 100644
index 00000000..25910b34
--- /dev/null
+++ b/modules/nf-core/gunzip/environment.yml
@@ -0,0 +1,7 @@
+name: gunzip
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - conda-forge::sed=4.7
diff --git a/modules/nf-core/gunzip/main.nf b/modules/nf-core/gunzip/main.nf
index e7189d2f..468a6f28 100644
--- a/modules/nf-core/gunzip/main.nf
+++ b/modules/nf-core/gunzip/main.nf
@@ -2,7 +2,7 @@ process GUNZIP {
     tag "$archive"
     label 'process_single'
 
-    conda "conda-forge::sed=4.7"
+    conda "${moduleDir}/environment.yml"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/ubuntu:20.04' :
        'nf-core/ubuntu:20.04' }"
 
@@ -21,10 +21,14 @@ process GUNZIP {
     def args = task.ext.args ?: ''
     gunzip = archive.toString() - '.gz'
     """
-    gunzip \\
-        -f \\
+    # Not calling gunzip itself because it creates files
+    # with the original group ownership rather than the
+    # default one for that user / the work directory
+    gzip \\
+        -cd \\
         $args \\
-        $archive
+        $archive \\
+        > $gunzip
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
diff --git a/modules/nf-core/gunzip/meta.yml b/modules/nf-core/gunzip/meta.yml
index 4cdcdf4c..231034f2 100644
--- a/modules/nf-core/gunzip/meta.yml
+++ b/modules/nf-core/gunzip/meta.yml
@@ -33,3 +33,7 @@ authors:
   - "@joseespinosa"
   - "@drpatelh"
   - "@jfy133"
+maintainers:
+  - "@joseespinosa"
+  - "@drpatelh"
+  - "@jfy133"
diff --git a/modules/nf-core/gunzip/tests/main.nf.test b/modules/nf-core/gunzip/tests/main.nf.test
new file mode 100644
index 00000000..d0317922
--- /dev/null
+++ b/modules/nf-core/gunzip/tests/main.nf.test
@@ -0,0 +1,35 @@
+nextflow_process {
+
+    name "Test Process GUNZIP"
+    script "../main.nf"
+    process "GUNZIP"
+    tag "gunzip"
+    tag "modules_nfcore"
+    tag "modules"
+
+    test("Should run without failures") {
+
+        when {
+            params {
+                outdir = "$outputDir"
+            }
+            process {
+                """
+                input[0] = [
+                    [],
+                    file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true)
+                ]
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+
+    }
+
+}
diff --git a/modules/nf-core/gunzip/tests/main.nf.test.snap b/modules/nf-core/gunzip/tests/main.nf.test.snap
new file mode 100644
index 00000000..720fd9ff
--- /dev/null
+++ b/modules/nf-core/gunzip/tests/main.nf.test.snap
@@ -0,0 +1,31 @@
+{
+    "Should run without failures": {
+        "content": [
+            {
+                "0": [
+                    [
+                        [
+
+                        ],
+                        "test_1.fastq:md5,4161df271f9bfcd25d5845a1e220dbec"
+                    ]
+                ],
+                "1": [
+                    "versions.yml:md5,54376d32aca20e937a4ec26dac228e84"
+                ],
+                "gunzip": [
+                    [
+                        [
+
+                        ],
+                        "test_1.fastq:md5,4161df271f9bfcd25d5845a1e220dbec"
+                    ]
+                ],
+                "versions": [
+                    "versions.yml:md5,54376d32aca20e937a4ec26dac228e84"
+                ]
+            }
+        ],
+        "timestamp": "2023-10-17T15:35:37.690477896"
+    }
+}
\ No newline at end of file
diff --git a/modules/nf-core/gunzip/tests/tags.yml b/modules/nf-core/gunzip/tests/tags.yml
new file mode 100644
index 00000000..fd3f6915
--- /dev/null
+++ b/modules/nf-core/gunzip/tests/tags.yml
@@ -0,0 +1,2 @@
+gunzip:
+  - modules/nf-core/gunzip/**
diff --git a/modules/nf-core/minimap2/align/environment.yml b/modules/nf-core/minimap2/align/environment.yml
new file mode 100644
index 00000000..de1f3811
--- /dev/null
+++ b/modules/nf-core/minimap2/align/environment.yml
@@ -0,0 +1,8 @@
+name: minimap2_align
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::minimap2=2.24
+  - bioconda::samtools=1.18
diff --git a/modules/nf-core/minimap2/align/main.nf b/modules/nf-core/minimap2/align/main.nf
new file mode 100644
index 00000000..47cd420c
--- /dev/null
+++ b/modules/nf-core/minimap2/align/main.nf
@@ -0,0 +1,48 @@
+process MINIMAP2_ALIGN {
+    tag "$meta.id"
+    label 'process_medium'
+
+    // Note: the versions here need to match the versions used in the mulled container below and minimap2/index
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:365b17b986c1a60c1b82c6066a9345f38317b763-0' :
+        'biocontainers/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:365b17b986c1a60c1b82c6066a9345f38317b763-0' }"
+
+    input:
+    tuple val(meta), path(reads)
+    tuple val(meta2), path(reference)
+    val bam_format
+    val cigar_paf_format
+    val cigar_bam
+
+    output:
+    tuple val(meta), path("*.paf"), optional: true, emit: paf
+    tuple val(meta), path("*.bam"), optional: true, emit: bam
+    path "versions.yml"           , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def bam_output = bam_format ? "-a | samtools sort | samtools view -@ ${task.cpus} -b -h -o ${prefix}.bam" : "-o ${prefix}.paf"
+    def cigar_paf = cigar_paf_format && !bam_format ? "-c" : ''
+    def set_cigar_bam = cigar_bam && bam_format ? "-L" : ''
+    """
+    minimap2 \\
+        $args \\
+        -t $task.cpus \\
+        "${reference ?: reads}" \\
+        "$reads" \\
+        $cigar_paf \\
+        $set_cigar_bam \\
+        $bam_output
+
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        minimap2: \$(minimap2 --version 2>&1)
+    END_VERSIONS
+    """
+}
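Editor's note: the bam_output ternary above means the module either pipes minimap2's SAM through samtools sort into a coordinate-sorted BAM, or writes a plain PAF. A minimal call sketch mirroring the tests below; channel contents are illustrative:

    include { MINIMAP2_ALIGN } from '../modules/nf-core/minimap2/align/main'

    workflow DEMO {
        ch_reads = Channel.of( [ [id:'test', single_end:true], file('reads.fastq.gz') ] )
        ch_ref   = Channel.of( [ [id:'ref'], file('assembly.fasta') ] )

        // bam_format=true -> sorted BAM on the bam emit;
        // bam_format=false -> PAF on the paf emit instead.
        MINIMAP2_ALIGN ( ch_reads, ch_ref, true, false, false )
    }

Passing an empty reference (as one test does) makes "${reference ?: reads}" fall back to the reads, i.e. an all-vs-all self-alignment.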
diff --git a/modules/nf-core/minimap2/align/meta.yml b/modules/nf-core/minimap2/align/meta.yml
new file mode 100644
index 00000000..408522d5
--- /dev/null
+++ b/modules/nf-core/minimap2/align/meta.yml
@@ -0,0 +1,75 @@
+name: minimap2_align
+description: A versatile pairwise aligner for genomic and spliced nucleotide sequences
+keywords:
+  - align
+  - fasta
+  - fastq
+  - genome
+  - paf
+  - reference
+tools:
+  - minimap2:
+      description: |
+        A versatile pairwise aligner for genomic and spliced nucleotide sequences.
+      homepage: https://github.com/lh3/minimap2
+      documentation: https://github.com/lh3/minimap2#uguide
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - reads:
+      type: file
+      description: |
+        List of input FASTA or FASTQ files of size 1 and 2 for single-end
+        and paired-end data, respectively.
+  - meta2:
+      type: map
+      description: |
+        Groovy Map containing reference information
+        e.g. [ id:'test_ref']
+  - reference:
+      type: file
+      description: |
+        Reference database in FASTA format.
+  - bam_format:
+      type: boolean
+      description: Specify that output should be in BAM format
+  - cigar_paf_format:
+      type: boolean
+      description: Specify that output CIGAR should be in PAF format
+  - cigar_bam:
+      type: boolean
+      description: |
+        Write CIGAR with >65535 ops at the CG tag. This is recommended when
+        alignments may exceed 65535 CIGAR operations (https://github.com/lh3/minimap2#working-with-65535-cigar-operations)
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - paf:
+      type: file
+      description: Alignment in PAF format
+      pattern: "*.paf"
+  - bam:
+      type: file
+      description: Alignment in BAM format
+      pattern: "*.bam"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@heuermh"
+  - "@sofstam"
+  - "@sateeshperi"
+  - "@jfy133"
+maintainers:
+  - "@heuermh"
+  - "@sofstam"
+  - "@sateeshperi"
+  - "@jfy133"
diff --git a/modules/nf-core/minimap2/align/tests/main.nf.test b/modules/nf-core/minimap2/align/tests/main.nf.test
new file mode 100644
index 00000000..b634468b
--- /dev/null
+++ b/modules/nf-core/minimap2/align/tests/main.nf.test
@@ -0,0 +1,145 @@
+nextflow_process {
+
+    name "Test Process MINIMAP2_ALIGN"
+    script "../main.nf"
+    process "MINIMAP2_ALIGN"
+
+    tag "modules"
+    tag "modules_nfcore"
+    tag "minimap2"
+    tag "minimap2/align"
+
+    test("sarscov2 - fastq, fasta, true, false, false") {
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id:'test', single_end:true ], // meta map
+                    file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true)
+                ]
+                input[1] = [
+                    [ id:'test_ref' ], // meta map
+                    file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)
+                ]
+                input[2] = true
+                input[3] = false
+                input[4] = false
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(
+                    file(process.out.bam[0][1]).name,
+                    process.out.versions
+                ).match() }
+            )
+        }
+
+    }
+
+    test("sarscov2 - [fastq1, fastq2], fasta, true, false, false") {
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id:'test', single_end:false ], // meta map
+                    [
+                        file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true),
+                        file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true)
+                    ]
+                ]
+                input[1] = [
+                    [ id:'test_ref' ], // meta map
+                    file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)
+                ]
+                input[2] = true
+                input[3] = false
+                input[4] = false
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(
+                    file(process.out.bam[0][1]).name,
+                    process.out.versions
+                ).match() }
+            )
+        }
+
+    }
+
+    test("sarscov2 - fastq, [], true, false, false") {
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id:'test', single_end:true ], // meta map
+                    file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true)
+                ]
+                input[1] = [
+                    [ id:'test_ref' ], // meta map
+                    []
+                ]
+                input[2] = true
+                input[3] = false
+                input[4] = false
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(
+                    file(process.out.bam[0][1]).name,
+                    process.out.versions
+                ).match() }
+            )
+        }
+
+    }
+
+    test("sarscov2 - fastq, fasta, true, false, false - stub") {
+
+        options "-stub"
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id:'test', single_end:true ], // meta map
+                    file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true)
+                ]
+                input[1] = [
+                    [ id:'test_ref' ], // meta map
+                    file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)
+                ]
+                input[2] = true
+                input[3] = false
+                input[4] = false
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(
+                    file(process.out.bam[0][1]).name,
+                    process.out.versions
+                ).match() }
+            )
+        }
+
+    }
+
+}
diff --git a/modules/nf-core/minimap2/align/tests/main.nf.test.snap b/modules/nf-core/minimap2/align/tests/main.nf.test.snap
new file mode 100644
index 00000000..a39a1697
--- /dev/null
+++ b/modules/nf-core/minimap2/align/tests/main.nf.test.snap
@@ -0,0 +1,38 @@
+{
+    "sarscov2 - fastq, fasta, true, false, false": {
+        "content": [
+            "test.bam",
+            [
+                "versions.yml:md5,9e9eeae0002d466d580a9d6e0d003eb1"
+            ]
+        ],
+        "timestamp": "2023-12-04T12:07:06.01315354"
+    },
+    "sarscov2 - fastq, fasta, true, false, false - stub": {
+        "content": [
+            "test.bam",
+            [
+                "versions.yml:md5,9e9eeae0002d466d580a9d6e0d003eb1"
+            ]
+        ],
+        "timestamp": "2023-12-04T12:07:24.487175659"
+    },
+    "sarscov2 - [fastq1, fastq2], fasta, true, false, false": {
+        "content": [
+            "test.bam",
+            [
+                "versions.yml:md5,9e9eeae0002d466d580a9d6e0d003eb1"
+            ]
+        ],
+        "timestamp": "2023-12-04T12:07:12.50816279"
+    },
+    "sarscov2 - fastq, [], true, false, false": {
+        "content": [
+            "test.bam",
+            [
+                "versions.yml:md5,9e9eeae0002d466d580a9d6e0d003eb1"
+            ]
+        ],
+        "timestamp": "2023-12-04T12:07:18.414974788"
+    }
+}
\ No newline at end of file
diff --git a/modules/nf-core/minimap2/align/tests/tags.yml b/modules/nf-core/minimap2/align/tests/tags.yml
new file mode 100644
index 00000000..39dba374
--- /dev/null
+++ b/modules/nf-core/minimap2/align/tests/tags.yml
@@ -0,0 +1,2 @@
+minimap2/align:
+  - "modules/nf-core/minimap2/align/**"
diff --git a/modules/nf-core/mosdepth/main.nf b/modules/nf-core/mosdepth/main.nf
deleted file mode 100644
index c17e4e65..00000000
--- a/modules/nf-core/mosdepth/main.nf
+++ /dev/null
@@ -1,80 +0,0 @@
-process MOSDEPTH {
-    tag "$meta.id"
-    label 'process_medium'
-
-    conda "bioconda::mosdepth=0.3.3"
-    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/mosdepth:0.3.3--hdfd78af_1' :
-        'biocontainers/mosdepth:0.3.3--hdfd78af_1'}"
-
-    input:
-    tuple val(meta), path(bam), path(bai), path(bed)
-    tuple val(meta2), path(fasta)
-
-    output:
-    tuple val(meta), path('*.global.dist.txt')      , emit: global_txt
-    tuple val(meta), path('*.summary.txt')          , emit: summary_txt
-    tuple val(meta), path('*.region.dist.txt')      , optional:true, emit: regions_txt
-    tuple val(meta), path('*.per-base.d4')          , optional:true, emit: per_base_d4
-    tuple val(meta), path('*.per-base.bed.gz')      , optional:true, emit: per_base_bed
-    tuple val(meta), path('*.per-base.bed.gz.csi')  , optional:true, emit: per_base_csi
-    tuple val(meta), path('*.regions.bed.gz')       , optional:true, emit: regions_bed
-    tuple val(meta), path('*.regions.bed.gz.csi')   , optional:true, emit: regions_csi
-    tuple val(meta), path('*.quantized.bed.gz')     , optional:true, emit: quantized_bed
-    tuple val(meta), path('*.quantized.bed.gz.csi') , optional:true, emit: quantized_csi
-    tuple val(meta), path('*.thresholds.bed.gz')    , optional:true, emit: thresholds_bed
-    tuple val(meta), path('*.thresholds.bed.gz.csi'), optional:true, emit: thresholds_csi
-    path "versions.yml"                             , emit: versions
-
-    when:
-    task.ext.when == null || task.ext.when
-
-    script:
-    def args = task.ext.args ?: ''
-    def prefix = task.ext.prefix ?: "${meta.id}"
-    def reference = fasta ? "--fasta ${fasta}" : ""
-    def interval = bed ? "--by ${bed}" : ""
-    if (bed && args.contains("--by")) {
-        exit 1, "'--by' can only be specified once when running mosdepth! Either remove input BED file definition or remove '--by' from 'ext.args' definition"
-    }
-    if (!bed && args.contains("--thresholds")) {
-        exit 1, "'--thresholds' can only be specified in conjunction with '--by'"
-    }
-
-    """
-    mosdepth \\
-        --threads $task.cpus \\
-        $interval \\
-        $reference \\
-        $args \\
-        $prefix \\
-        $bam
-
-    cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        mosdepth: \$(mosdepth --version 2>&1 | sed 's/^.*mosdepth //; s/ .*\$//')
-    END_VERSIONS
-    """
-
-    stub:
-    def prefix = task.ext.prefix ?: "${meta.id}"
-    """
-    touch ${prefix}.global.dist.txt
-    touch ${prefix}.region.dist.txt
-    touch ${prefix}.summary.txt
-    touch ${prefix}.per-base.d4
-    touch ${prefix}.per-base.bed.gz
-    touch ${prefix}.per-base.bed.gz.csi
-    touch ${prefix}.regions.bed.gz
-    touch ${prefix}.regions.bed.gz.csi
-    touch ${prefix}.quantized.bed.gz
-    touch ${prefix}.quantized.bed.gz.csi
-    touch ${prefix}.thresholds.bed.gz
-    touch ${prefix}.thresholds.bed.gz.csi
-
-    cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        mosdepth: \$(mosdepth --version 2>&1 | sed 's/^.*mosdepth //; s/ .*\$//')
-    END_VERSIONS
-    """
-}
diff --git a/modules/nf-core/mosdepth/meta.yml b/modules/nf-core/mosdepth/meta.yml
deleted file mode 100644
index adf3893f..00000000
--- a/modules/nf-core/mosdepth/meta.yml
+++ /dev/null
@@ -1,109 +0,0 @@
-name: mosdepth
-description: Calculates genome-wide sequencing coverage.
-keywords:
-  - mosdepth
-  - bam
-  - cram
-  - coverage
-tools:
-  - mosdepth:
-      description: |
-        Fast BAM/CRAM depth calculation for WGS, exome, or targeted sequencing.
-      documentation: https://github.com/brentp/mosdepth
-      doi: 10.1093/bioinformatics/btx699
-      licence: ["MIT"]
-input:
-  - meta:
-      type: map
-      description: |
-        Groovy Map containing sample information
-        e.g. [ id:'test', single_end:false ]
-  - bam:
-      type: file
-      description: Input BAM/CRAM file
-      pattern: "*.{bam,cram}"
-  - bai:
-      type: file
-      description: Index for BAM/CRAM file
-      pattern: "*.{bai,crai}"
-  - meta2:
-      type: map
-      description: |
-        Groovy Map containing bed information
-        e.g. [ id:'test' ]
-  - bed:
-      type: file
-      description: BED file with intersected intervals
-      pattern: "*.{bed}"
-  - meta3:
-      type: map
-      description: |
-        Groovy Map containing reference information
-        e.g. [ id:'test' ]
-  - fasta:
-      type: file
-      description: Reference genome FASTA file
-      pattern: "*.{fa,fasta}"
-output:
-  - meta:
-      type: map
-      description: |
-        Groovy Map containing sample information
-        e.g. [ id:'test', single_end:false ]
-  - global_txt:
-      type: file
-      description: Text file with global cumulative coverage distribution
-      pattern: "*.{global.dist.txt}"
-  - regions_txt:
-      type: file
-      description: Text file with region cumulative coverage distribution
-      pattern: "*.{region.dist.txt}"
-  - summary_txt:
-      type: file
-      description: Text file with summary mean depths per chromosome and regions
-      pattern: "*.{summary.txt}"
-  - per_base_bed:
-      type: file
-      description: BED file with per-base coverage
-      pattern: "*.{per-base.bed.gz}"
-  - per_base_csi:
-      type: file
-      description: Index file for BED file with per-base coverage
-      pattern: "*.{per-base.bed.gz.csi}"
-  - per_base_d4:
-      type: file
-      description: D4 file with per-base coverage
-      pattern: "*.{per-base.d4}"
-  - regions_bed:
-      type: file
-      description: BED file with per-region coverage
-      pattern: "*.{regions.bed.gz}"
-  - regions_csi:
-      type: file
-      description: Index file for BED file with per-region coverage
-      pattern: "*.{regions.bed.gz.csi}"
-  - quantized_bed:
-      type: file
-      description: BED file with binned coverage
-      pattern: "*.{quantized.bed.gz}"
-  - quantized_csi:
-      type: file
-      description: Index file for BED file with binned coverage
-      pattern: "*.{quantized.bed.gz.csi}"
-  - thresholds_bed:
-      type: file
-      description: BED file with the number of bases in each region that are covered at or above each threshold
-      pattern: "*.{thresholds.bed.gz}"
-  - thresholds_csi:
-      type: file
-      description: Index file for BED file with threshold coverage
-      pattern: "*.{thresholds.bed.gz.csi}"
-  - versions:
-      type: file
-      description: File containing software versions
-      pattern: "versions.yml"
-authors:
-  - "@joseespinosa"
-  - "@drpatelh"
-  - "@ramprasadn"
-  - "@matthdsm"
diff --git a/modules/nf-core/multiqc/environment.yml b/modules/nf-core/multiqc/environment.yml
new file mode 100644
index 00000000..bc0bdb5b
--- /dev/null
+++ b/modules/nf-core/multiqc/environment.yml
@@ -0,0 +1,7 @@
+name: multiqc
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::multiqc=1.18
diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf
index 1fc387be..00cc48d2 100644
--- a/modules/nf-core/multiqc/main.nf
+++ b/modules/nf-core/multiqc/main.nf
@@ -1,10 +1,10 @@
 process MULTIQC {
     label 'process_single'
 
-    conda "bioconda::multiqc=1.14"
+    conda "${moduleDir}/environment.yml"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/multiqc:1.14--pyhdfd78af_0' :
-        'biocontainers/multiqc:1.14--pyhdfd78af_0' }"
+        'https://depot.galaxyproject.org/singularity/multiqc:1.18--pyhdfd78af_0' :
+        'biocontainers/multiqc:1.18--pyhdfd78af_0' }"
 
     input:
     path multiqc_files, stageAs: "?/*"
@@ -25,12 +25,14 @@ process MULTIQC {
     def args = task.ext.args ?: ''
     def config = multiqc_config ? "--config $multiqc_config" : ''
     def extra_config = extra_multiqc_config ? "--config $extra_multiqc_config" : ''
+    def logo = multiqc_logo ? /--cl-config 'custom_logo: "${multiqc_logo}"'/ : ''
     """
     multiqc \\
        --force \\
        $args \\
        $config \\
        $extra_config \\
+       $logo \\
        .
 
     cat <<-END_VERSIONS > versions.yml
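Editor's note: the new $logo definition uses a Groovy slashy string so the nested single and double quotes of --cl-config 'custom_logo: "..."' survive intact. A sketch of how a pipeline might feed the four inputs; all channel names here are assumptions, not taken from this changeset:

    ch_multiqc_files  = ch_versions_mqc.mix ( ch_module_reports )   // assumed upstream channels
    ch_multiqc_config = Channel.fromPath ( "$projectDir/assets/multiqc_config.yml", checkIfExists: true )
    ch_multiqc_logo   = Channel.fromPath ( "$projectDir/assets/logo.png" )          // optional

    MULTIQC (
        ch_multiqc_files.collect(),
        ch_multiqc_config.toList(),
        [],                         // extra config not used in this sketch
        ch_multiqc_logo.toList()    // empty list would leave $logo as ''
    )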
diff --git a/modules/nf-core/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml
index f93b5ee5..f1aa660e 100644
--- a/modules/nf-core/multiqc/meta.yml
+++ b/modules/nf-core/multiqc/meta.yml
@@ -1,5 +1,5 @@
-# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json
-name: MultiQC
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
+name: multiqc
 description: Aggregate results from bioinformatics analyses across many samples into a single report
 keywords:
   - QC
@@ -13,7 +13,6 @@ tools:
       homepage: https://multiqc.info/
       documentation: https://multiqc.info/docs/
       licence: ["GPL-3.0-or-later"]
-
 input:
   - multiqc_files:
       type: file
@@ -31,7 +30,6 @@ input:
       type: file
       description: Optional logo file for MultiQC
       pattern: "*.{png}"
-
 output:
   - report:
       type: file
@@ -54,3 +52,8 @@ authors:
   - "@bunop"
   - "@drpatelh"
   - "@jfy133"
+maintainers:
+  - "@abhi18av"
+  - "@bunop"
+  - "@drpatelh"
+  - "@jfy133"
diff --git a/modules/nf-core/multiqc/tests/main.nf.test b/modules/nf-core/multiqc/tests/main.nf.test
new file mode 100644
index 00000000..c2dad217
--- /dev/null
+++ b/modules/nf-core/multiqc/tests/main.nf.test
@@ -0,0 +1,63 @@
+nextflow_process {
+
+    name "Test Process MULTIQC"
+    script "../main.nf"
+    process "MULTIQC"
+    tag "modules"
+    tag "modules_nfcore"
+    tag "multiqc"
+
+    test("MULTIQC: FASTQC") {
+
+        when {
+            params {
+                outdir = "$outputDir"
+            }
+            process {
+                """
+                input[0] = Channel.of([file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz_fastqc_zip'], checkIfExists: true)])
+                input[1] = []
+                input[2] = []
+                input[3] = []
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert path(process.out.report.get(0)).exists() },
+                { assert path(process.out.data.get(0)).exists() },
+                { assert path(process.out.versions.get(0)).getText().contains("multiqc") }
+            )
+        }
+
+    }
+
+    test("MULTIQC: FASTQC and a config file") {
+
+        when {
+            params {
+                outdir = "$outputDir"
+            }
+            process {
+                """
+                input[0] = Channel.of([file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz_fastqc_zip'], checkIfExists: true)])
+                input[1] = Channel.of(file("https://github.com/nf-core/tools/raw/dev/nf_core/pipeline-template/assets/multiqc_config.yml", checkIfExists: true))
+                input[2] = []
+                input[3] = []
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert path(process.out.report.get(0)).exists() },
+                { assert path(process.out.data.get(0)).exists() },
+                { assert path(process.out.versions.get(0)).getText().contains("multiqc") }
+            )
+        }
+
+    }
+}
diff --git a/modules/nf-core/multiqc/tests/tags.yml b/modules/nf-core/multiqc/tests/tags.yml
new file mode 100644
index 00000000..bea6c0d3
--- /dev/null
+++ b/modules/nf-core/multiqc/tests/tags.yml
@@ -0,0 +1,2 @@
+multiqc:
+  - modules/nf-core/multiqc/**
diff --git a/modules/nf-core/samtools/fasta/environment.yml b/modules/nf-core/samtools/fasta/environment.yml
new file mode 100644
index 00000000..05cb8a8e
--- /dev/null
+++ b/modules/nf-core/samtools/fasta/environment.yml
@@ -0,0 +1,7 @@
+name: samtools_fasta
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::samtools=1.18
diff --git a/modules/nf-core/samtools/fasta/main.nf b/modules/nf-core/samtools/fasta/main.nf
new file mode 100644
index 00000000..63e2852e
--- /dev/null
+++ b/modules/nf-core/samtools/fasta/main.nf
@@ -0,0 +1,44 @@
+process SAMTOOLS_FASTA {
+    tag "$meta.id"
+    label 'process_low'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/samtools:1.18--h50ea8bc_1' :
+        'biocontainers/samtools:1.18--h50ea8bc_1' }"
+
+    input:
+    tuple val(meta), path(input)
+    val(interleave)
+
+    output:
+    tuple val(meta), path("*_{1,2}.fasta.gz")      , optional:true, emit: fasta
+    tuple val(meta), path("*_interleaved.fasta.gz"), optional:true, emit: interleaved
+    tuple val(meta), path("*_singleton.fasta.gz")  , optional:true, emit: singleton
+    tuple val(meta), path("*_other.fasta.gz")      , optional:true, emit: other
+    path "versions.yml"                            , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def output = ( interleave && ! meta.single_end ) ? "> ${prefix}_interleaved.fasta.gz" :
+        meta.single_end ? "-1 ${prefix}_1.fasta.gz -s ${prefix}_singleton.fasta.gz" :
+        "-1 ${prefix}_1.fasta.gz -2 ${prefix}_2.fasta.gz -s ${prefix}_singleton.fasta.gz"
+    """
+    samtools \\
+        fasta \\
+        $args \\
+        --threads ${task.cpus-1} \\
+        -0 ${prefix}_other.fasta.gz \\
+        $input \\
+        $output
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+    END_VERSIONS
+    """
+}
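Editor's note: which FASTA files SAMTOOLS_FASTA produces is decided entirely by the `output` ternary above, keyed on the interleave flag and meta.single_end. A minimal call sketch; the CRAM file is illustrative:

    include { SAMTOOLS_FASTA } from '../modules/nf-core/samtools/fasta/main'

    workflow DEMO {
        ch_cram = Channel.of( [ [id:'test', single_end:false], file('sample.cram') ] )

        // interleave=true with paired data -> one *_interleaved.fasta.gz;
        // interleave=false -> *_1/*_2 pairs plus singleton and other files.
        SAMTOOLS_FASTA ( ch_cram, true )
        SAMTOOLS_FASTA.out.interleaved.view()
    }

As the meta.yml below notes, the interleaved output needs a collated (not coordinate-sorted) input file.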
conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.18--h50ea8bc_1' : + 'biocontainers/samtools:1.18--h50ea8bc_1' }" + + input: + tuple val(meta), path(input) + val(interleave) + + output: + tuple val(meta), path("*_{1,2}.fasta.gz") , optional:true, emit: fasta + tuple val(meta), path("*_interleaved.fasta.gz"), optional:true, emit: interleaved + tuple val(meta), path("*_singleton.fasta.gz") , optional:true, emit: singleton + tuple val(meta), path("*_other.fasta.gz") , optional:true, emit: other + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def output = ( interleave && ! meta.single_end ) ? "> ${prefix}_interleaved.fasta.gz" : + meta.single_end ? "-1 ${prefix}_1.fasta.gz -s ${prefix}_singleton.fasta.gz" : + "-1 ${prefix}_1.fasta.gz -2 ${prefix}_2.fasta.gz -s ${prefix}_singleton.fasta.gz" + """ + samtools \\ + fasta \\ + $args \\ + --threads ${task.cpus-1} \\ + -0 ${prefix}_other.fasta.gz \\ + $input \\ + $output + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/fasta/meta.yml b/modules/nf-core/samtools/fasta/meta.yml new file mode 100644 index 00000000..eae26f01 --- /dev/null +++ b/modules/nf-core/samtools/fasta/meta.yml @@ -0,0 +1,60 @@ +name: "samtools_fasta" +description: Converts a SAM/BAM/CRAM file to FASTA +keywords: + - bam + - sam + - cram + - fasta +tools: + - "samtools": + description: "Tools for dealing with SAM, BAM and CRAM files" + homepage: "http://www.htslib.org" + documentation: "https://www.htslib.org/doc/samtools-fasta.html" + tool_dev_url: "https://github.com/samtools/samtools" + doi: "10.1093/bioinformatics/btp352" + licence: ["MIT"] +input: + # Only when we have meta + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - interleave: + type: boolean + description: Set true for interleaved fasta files +output: + #Only when we have meta + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - fasta: + type: file + description: Compressed FASTA file(s) with reads with either the READ1 or READ2 flag set in separate files. + pattern: "*_{1,2}.fasta.gz" + - interleaved: + type: file + description: Compressed FASTA file with reads with either the READ1 or READ2 flag set in a combined file. Needs collated input file. 
+ pattern: "*_interleaved.fasta.gz" + - singleton: + type: file + description: Compressed FASTA file with singleton reads + pattern: "*_singleton.fasta.gz" + - other: + type: file + description: Compressed FASTA file with reads with either both READ1 and READ2 flags set or unset + pattern: "*_other.fasta.gz" +authors: + - "@priyanka-surana" +maintainers: + - "@priyanka-surana" diff --git a/modules/nf-core/samtools/index/environment.yml b/modules/nf-core/samtools/index/environment.yml new file mode 100644 index 00000000..296ed99e --- /dev/null +++ b/modules/nf-core/samtools/index/environment.yml @@ -0,0 +1,7 @@ +name: samtools_index +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.18 diff --git a/modules/nf-core/samtools/index/main.nf b/modules/nf-core/samtools/index/main.nf new file mode 100644 index 00000000..8ad18fdc --- /dev/null +++ b/modules/nf-core/samtools/index/main.nf @@ -0,0 +1,48 @@ +process SAMTOOLS_INDEX { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.18--h50ea8bc_1' : + 'biocontainers/samtools:1.18--h50ea8bc_1' }" + + input: + tuple val(meta), path(input) + + output: + tuple val(meta), path("*.bai") , optional:true, emit: bai + tuple val(meta), path("*.csi") , optional:true, emit: csi + tuple val(meta), path("*.crai"), optional:true, emit: crai + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + samtools \\ + index \\ + -@ ${task.cpus-1} \\ + $args \\ + $input + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + """ + touch ${input}.bai + touch ${input}.crai + touch ${input}.csi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/index/meta.yml b/modules/nf-core/samtools/index/meta.yml new file mode 100644 index 00000000..01a4ee03 --- /dev/null +++ b/modules/nf-core/samtools/index/meta.yml @@ -0,0 +1,57 @@ +name: samtools_index +description: Index SAM/BAM/CRAM file +keywords: + - index + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - bai: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + - crai: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + - csi: + type: file + description: CSI index file + pattern: "*.{csi}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@ewels" + - "@maxulysse" +maintainers: + - "@drpatelh" + - "@ewels" + - "@maxulysse" diff --git a/modules/nf-core/samtools/index/tests/csi.nextflow.config b/modules/nf-core/samtools/index/tests/csi.nextflow.config new file mode 100644 index 00000000..0ed260ef --- /dev/null +++ b/modules/nf-core/samtools/index/tests/csi.nextflow.config @@ -0,0 +1,7 @@ +process { + + withName: SAMTOOLS_INDEX { + ext.args = '-c' + } + +} diff --git a/modules/nf-core/samtools/index/tests/main.nf.test b/modules/nf-core/samtools/index/tests/main.nf.test new file mode 100644 index 00000000..c76a9169 --- /dev/null +++ b/modules/nf-core/samtools/index/tests/main.nf.test @@ -0,0 +1,87 @@ +nextflow_process { + + name "Test Process SAMTOOLS_INDEX" + script "../main.nf" + process "SAMTOOLS_INDEX" + tag "modules" + tag "modules_nfcore" + tag "samtools" + tag "samtools/index" + + test("sarscov2 [BAI]") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.bai).match("bai") }, + { assert path(process.out.versions.get(0)).getText().contains("samtools") } + ) + } + } + + test("homo_sapiens [CRAI]") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['homo_sapiens']['illumina']['test_paired_end_recalibrated_sorted_cram'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.crai).match("crai") }, + { assert path(process.out.versions.get(0)).getText().contains("samtools") } + ) + } + } + + test("homo_sapiens [CSI]") { + + config "./csi.nextflow.config" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert path(process.out.csi.get(0).get(1)).exists() }, + { assert path(process.out.versions.get(0)).getText().contains("samtools") } + ) + } + } +} diff --git a/modules/nf-core/samtools/index/tests/main.nf.test.snap b/modules/nf-core/samtools/index/tests/main.nf.test.snap new file mode 100644 index 00000000..b3baee7f --- /dev/null +++ b/modules/nf-core/samtools/index/tests/main.nf.test.snap @@ -0,0 +1,28 @@ +{ + "crai": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.paired_end.recalibrated.sorted.cram.crai:md5,14bc3bd5c89cacc8f4541f9062429029" + ] + ] + ], + "timestamp": "2023-11-15T15:17:37.30801" + }, + "bai": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.paired_end.sorted.bam.bai:md5,704c10dd1326482448ca3073fdebc2f4" + ] + ] + ], + "timestamp": "2023-11-15T15:17:30.869234" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/index/tests/tags.yml b/modules/nf-core/samtools/index/tests/tags.yml new file mode 100644 index 
00000000..e0f58a7a --- /dev/null +++ b/modules/nf-core/samtools/index/tests/tags.yml @@ -0,0 +1,2 @@ +samtools/index: + - modules/nf-core/samtools/index/** diff --git a/modules/nf-core/samtools/view/environment.yml b/modules/nf-core/samtools/view/environment.yml new file mode 100644 index 00000000..99aa69d0 --- /dev/null +++ b/modules/nf-core/samtools/view/environment.yml @@ -0,0 +1,7 @@ +name: samtools_view +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.18 diff --git a/modules/nf-core/samtools/view/main.nf b/modules/nf-core/samtools/view/main.nf index b87369e5..0b5a2912 100644 --- a/modules/nf-core/samtools/view/main.nf +++ b/modules/nf-core/samtools/view/main.nf @@ -2,14 +2,14 @@ process SAMTOOLS_VIEW { tag "$meta.id" label 'process_low' - conda "bioconda::samtools=1.17" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : - 'biocontainers/samtools:1.17--h00cdaf9_0' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.18--h50ea8bc_1' : + 'biocontainers/samtools:1.18--h50ea8bc_1' }" input: tuple val(meta), path(input), path(index) - path fasta + tuple val(meta2), path(fasta) path qname output: @@ -53,10 +53,19 @@ process SAMTOOLS_VIEW { """ stub: + def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" + def file_type = args.contains("--output-fmt sam") ? "sam" : + args.contains("--output-fmt bam") ? "bam" : + args.contains("--output-fmt cram") ? "cram" : + input.getExtension() + if ("$input" == "${prefix}.${file_type}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + + def index = args.contains("--write-index") ? "touch ${prefix}.csi" : "" + """ - touch ${prefix}.bam - touch ${prefix}.cram + touch ${prefix}.${file_type} + ${index} cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/samtools/view/meta.yml b/modules/nf-core/samtools/view/meta.yml index 76916033..3dadafae 100644 --- a/modules/nf-core/samtools/view/meta.yml +++ b/modules/nf-core/samtools/view/meta.yml @@ -26,12 +26,17 @@ input: description: BAM/CRAM/SAM file pattern: "*.{bam,cram,sam}" - index: - type: optional file - description: BAM.BAI/BAM.CSI/CRAM.CRAI file + type: file + description: BAM.BAI/BAM.CSI/CRAM.CRAI file (optional) pattern: "*.{.bai,.csi,.crai}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. 
[ id:'test' ] - fasta: - type: optional file - description: Reference file the CRAM was created with + type: file + description: Reference file the CRAM was created with (optional) pattern: "*.{fasta,fa}" - qname: type: file @@ -77,3 +82,8 @@ authors: - "@joseespinosa" - "@FriederikeHanssen" - "@priyanka-surana" +maintainers: + - "@drpatelh" + - "@joseespinosa" + - "@FriederikeHanssen" + - "@priyanka-surana" diff --git a/modules/nf-core/samtools/view/tests/bam.config b/modules/nf-core/samtools/view/tests/bam.config new file mode 100644 index 00000000..c10d1081 --- /dev/null +++ b/modules/nf-core/samtools/view/tests/bam.config @@ -0,0 +1,3 @@ +process { + ext.args = "--output-fmt bam" +} \ No newline at end of file diff --git a/modules/nf-core/samtools/view/tests/bam_index.config b/modules/nf-core/samtools/view/tests/bam_index.config new file mode 100644 index 00000000..771ae033 --- /dev/null +++ b/modules/nf-core/samtools/view/tests/bam_index.config @@ -0,0 +1,3 @@ +process { + ext.args = "--output-fmt bam --write-index" +} \ No newline at end of file diff --git a/modules/nf-core/samtools/view/tests/main.nf.test b/modules/nf-core/samtools/view/tests/main.nf.test new file mode 100644 index 00000000..89ed3555 --- /dev/null +++ b/modules/nf-core/samtools/view/tests/main.nf.test @@ -0,0 +1,231 @@ +nextflow_process { + + name "Test Process SAMTOOLS_VIEW" + script "../main.nf" + process "SAMTOOLS_VIEW" + + tag "modules" + tag "modules_nfcore" + tag "samtools" + tag "samtools/view" + + test("sarscov2 - [bam, []], [], []") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['test_paired_end_bam'], checkIfExists: true), + [] + ] + input[1] = [[],[]] + input[2] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.cram, + process.out.sam, + process.out.bai, + process.out.crai, + process.out.csi, + process.out.versions + ).match() } + ) + } + + } + + test("homo_sapiens - [cram, crai], fasta, []") { + + when { + process { + """ + input[0] = [ + [ id: 'test' ], // meta map + file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_cram'], checkIfExists: true), + file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_cram_crai'], checkIfExists: true) + ] + input[1] = [ + [ id:'genome' ], + file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) + ] + input[2] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.cram[0][1]).name, + process.out.bam, + process.out.sam, + process.out.bai, + process.out.crai, + process.out.csi, + process.out.versions + ).match() } + ) + } + + } + + test("homo_sapiens - [cram, []], fasta, [] - bam output") { + + config "./bam.config" + + when { + process { + """ + input[0] = [ + [ id: 'test' ], // meta map + file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_cram'], checkIfExists: true), + [] + ] + input[1] = [ + [ id:'genome' ], + file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) + ] + input[2] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.cram, + process.out.sam, + process.out.bai, + process.out.crai, + process.out.csi, + process.out.versions + ).match() } + ) + } + + } + + test("homo_sapiens - [cram, []], fasta, [] - 
bam & index output") { + + config "./bam_index.config" + + when { + process { + """ + input[0] = [ + [ id: 'test' ], // meta map + file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_cram'], checkIfExists: true), + [] + ] + input[1] = [ + [ id:'genome' ], + file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) + ] + input[2] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.cram, + process.out.sam, + file(process.out.csi[0][1]).name, + process.out.crai, + process.out.bai, + process.out.versions + ).match() } + ) + } + + } + + test("homo_sapiens - [cram, []], fasta, qname - bam & index output") { + + config "./bam_index.config" + + when { + process { + """ + input[0] = [ + [ id: 'test' ], // meta map + file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_cram'], checkIfExists: true), + [] + ] + input[1] = [ + [ id:'genome' ], + file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) + ] + input[2] = Channel.of("testN:2817", "testN:2814").collectFile(name: "readnames.list", newLine: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.cram, + process.out.sam, + file(process.out.csi[0][1]).name, + process.out.crai, + process.out.bai, + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - [bam, []], [], [] - stub") { + + options "-stub" + config "./bam_index.config" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['test_paired_end_bam'], checkIfExists: true), + [] + ] + input[1] = [[],[]] + input[2] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.cram, + process.out.sam, + file(process.out.csi[0][1]).name, + process.out.crai, + process.out.bai, + process.out.versions + ).match() } + ) + } + + } + +} diff --git a/modules/nf-core/samtools/view/tests/main.nf.test.snap b/modules/nf-core/samtools/view/tests/main.nf.test.snap new file mode 100644 index 00000000..83427491 --- /dev/null +++ b/modules/nf-core/samtools/view/tests/main.nf.test.snap @@ -0,0 +1,140 @@ +{ + "homo_sapiens - [cram, []], fasta, [] - bam output": { + "content": [ + "test.bam", + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + "versions.yml:md5,06b9049228b111e7bed5c52fe8a98d9b" + ] + ], + "timestamp": "2023-12-04T17:41:17.563069206" + }, + "sarscov2 - [bam, []], [], []": { + "content": [ + "test.bam", + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + "versions.yml:md5,06b9049228b111e7bed5c52fe8a98d9b" + ] + ], + "timestamp": "2023-12-04T17:41:03.206994564" + }, + "homo_sapiens - [cram, []], fasta, qname - bam & index output": { + "content": [ + "test.bam", + [ + + ], + [ + + ], + "test.bam.csi", + [ + + ], + [ + + ], + [ + "versions.yml:md5,06b9049228b111e7bed5c52fe8a98d9b" + ] + ], + "timestamp": "2023-12-04T17:44:39.165289759" + }, + "homo_sapiens - [cram, []], fasta, [] - bam & index output": { + "content": [ + "test.bam", + [ + + ], + [ + + ], + "test.bam.csi", + [ + + ], + [ + + ], + [ + "versions.yml:md5,06b9049228b111e7bed5c52fe8a98d9b" + ] + ], + "timestamp": "2023-12-04T17:44:32.25731224" + }, + "sarscov2 - [bam, []], [], [] - stub": { + "content": [ + "test.bam", + [ + + ], + [ + + ], + "test.csi", + [ + + ], + [ 
+ + ], + [ + "versions.yml:md5,06b9049228b111e7bed5c52fe8a98d9b" + ] + ], + "timestamp": "2023-12-04T17:44:45.81037195" + }, + "homo_sapiens - [cram, crai], fasta, []": { + "content": [ + "test.cram", + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + "versions.yml:md5,06b9049228b111e7bed5c52fe8a98d9b" + ] + ], + "timestamp": "2023-12-04T17:41:10.730011823" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/view/tests/tags.yml b/modules/nf-core/samtools/view/tests/tags.yml new file mode 100644 index 00000000..4fdf1dd1 --- /dev/null +++ b/modules/nf-core/samtools/view/tests/tags.yml @@ -0,0 +1,2 @@ +samtools/view: + - "modules/nf-core/samtools/view/**" diff --git a/modules/nf-core/seqtk/subseq/environment.yml b/modules/nf-core/seqtk/subseq/environment.yml new file mode 100644 index 00000000..42c4e8af --- /dev/null +++ b/modules/nf-core/seqtk/subseq/environment.yml @@ -0,0 +1,7 @@ +name: seqtk_subseq +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::seqtk=1.3 diff --git a/modules/nf-core/seqtk/subseq/main.nf b/modules/nf-core/seqtk/subseq/main.nf new file mode 100644 index 00000000..91d2dff3 --- /dev/null +++ b/modules/nf-core/seqtk/subseq/main.nf @@ -0,0 +1,41 @@ +process SEQTK_SUBSEQ { + tag "$sequences" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/seqtk:1.3--h5bf99c6_3' : + 'biocontainers/seqtk:1.3--h5bf99c6_3' }" + + input: + path sequences + path filter_list + + output: + path "*.gz" , emit: sequences + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: '' + def ext = "fa" + if ("$sequences" ==~ /.+\.fq|.+\.fq.gz|.+\.fastq|.+\.fastq.gz/) { + ext = "fq" + } + """ + seqtk \\ + subseq \\ + $args \\ + $sequences \\ + $filter_list | \\ + gzip --no-name > ${sequences}${prefix}.${ext}.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqtk: \$(echo \$(seqtk 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/seqtk/subseq/meta.yml b/modules/nf-core/seqtk/subseq/meta.yml new file mode 100644 index 00000000..2cb8858d --- /dev/null +++ b/modules/nf-core/seqtk/subseq/meta.yml @@ -0,0 +1,33 @@ +name: seqtk_subseq +description: Select only sequences that match the filtering condition +keywords: + - filtering,selection +tools: + - seqtk: + description: Seqtk is a fast and lightweight tool for processing sequences in the FASTA or FASTQ format + homepage: https://github.com/lh3/seqtk + documentation: https://docs.csc.fi/apps/seqtk/ + tool_dev_url: https://github.com/lh3/seqtk + licence: ["MIT"] +input: + - sequences: + type: file + description: FASTQ/FASTA file + pattern: "*.{fq,fq.gz,fa,fa.gz}" + - filter_list: + type: file + description: BED file or a text file with a list of sequence names + pattern: "*.{bed,lst}" +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - sequences: + type: file + description: FASTQ/FASTA file + pattern: "*.{fq.gz,fa.gz}" +authors: + - "@sidorov-si" +maintainers: + - "@sidorov-si" diff --git a/modules/nf-core/windowmasker/mkcounts/environment.yml b/modules/nf-core/windowmasker/mkcounts/environment.yml new file mode 100644 index 00000000..15887425 --- /dev/null +++ 
b/modules/nf-core/windowmasker/mkcounts/environment.yml @@ -0,0 +1,7 @@ +name: windowmasker_mkcounts +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::blast=2.14.0 diff --git a/modules/nf-core/windowmasker/mkcounts/main.nf b/modules/nf-core/windowmasker/mkcounts/main.nf new file mode 100644 index 00000000..6bfd175e --- /dev/null +++ b/modules/nf-core/windowmasker/mkcounts/main.nf @@ -0,0 +1,55 @@ +process WINDOWMASKER_MKCOUNTS { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/blast:2.14.0--h7d5a4b4_1': + 'biocontainers/blast:2.14.0--h7d5a4b4_1' }" + + input: + tuple val(meta), path(ref) + + output: + tuple val(meta), path("*.txt") , emit: counts + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: "" + def prefix = task.ext.prefix ?: "${meta.id}" + + def memory = 3072 + if (!task.memory) { + log.info '[WINDOWMASKER: MK_COUNTS] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + memory = (task.memory.toMega()).intValue() + } + + """ + windowmasker -mk_counts \\ + $args \\ + -mem ${memory} \\ + -in ${ref} \\ + -out ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + windowmasker: \$(windowmasker -version-full | head -n 1 | sed 's/^.*windowmasker: //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + touch ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + windowmasker: \$(windowmasker -version-full | head -n 1 | sed 's/^.*windowmasker: //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/windowmasker/mkcounts/meta.yml b/modules/nf-core/windowmasker/mkcounts/meta.yml new file mode 100644 index 00000000..436ed7a5 --- /dev/null +++ b/modules/nf-core/windowmasker/mkcounts/meta.yml @@ -0,0 +1,42 @@ +name: windowmasker_mkcounts +description: A program to generate frequency counts of repetitive units. +keywords: + - fasta + - interval + - windowmasker +tools: + - windowmasker: + description: | + A program to mask highly repetitive and low complexity DNA sequences within a genome. + homepage: https://github.com/ncbi/ncbi-cxx-toolkit-public + documentation: https://ncbi.github.io/cxx-toolkit/ + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - ref: + type: file + description: An input nucleotide fasta file. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - intervals: + type: file + description: | + An output file containing genomic locations of low + complexity and highly repetitive regions + pattern: "${prefix}.txt" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@DLBPointon" +maintainers: + - "@DLBPointon" diff --git a/modules/nf-core/windowmasker/ustat/environment.yml b/modules/nf-core/windowmasker/ustat/environment.yml new file mode 100644 index 00000000..a97fdd9d --- /dev/null +++ b/modules/nf-core/windowmasker/ustat/environment.yml @@ -0,0 +1,7 @@ +name: windowmasker_ustat +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::blast=2.14.0 diff --git a/modules/nf-core/windowmasker/ustat/main.nf b/modules/nf-core/windowmasker/ustat/main.nf new file mode 100644 index 00000000..2cc3df63 --- /dev/null +++ b/modules/nf-core/windowmasker/ustat/main.nf @@ -0,0 +1,69 @@ +process WINDOWMASKER_USTAT { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/blast:2.14.0--h7d5a4b4_1': + 'biocontainers/blast:2.14.0--h7d5a4b4_1' }" + + input: + tuple val(meta) , path(counts) + tuple val(meta2), path(ref) + + output: + tuple val(meta), path("${output}") , emit: intervals + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: "" + def prefix = task.ext.prefix ?: "${meta.id}" + def outfmt = args.contains('-outfmt fasta') ? 'fasta' : + args.contains('-outfmt maskinfo_asn1_bin') ? 'maskinfo_asn1_bin' : + args.contains('-outfmt maskinfo_asn1_text') ? 'maskinfo_asn1_text' : + args.contains('-outfmt maskinfo_xml') ? 'maskinfo_xml' : + args.contains('-outfmt seqloc_asn1_bin') ? 'seqloc_asn1_bin' : + args.contains('-outfmt seqloc_asn1_text') ? 'seqloc_asn1_text' : + args.contains('-outfmt seqloc_xml') ? 'seqloc_xml' : + 'interval' + + output = "${prefix}.${outfmt}" + + """ + windowmasker -ustat \\ + ${counts} \\ + $args \\ + -in ${ref} \\ + -out ${output} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + windowmasker: \$(windowmasker -version-full | head -n 1 | sed 's/^.*windowmasker: //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: "" + def prefix = task.ext.prefix ?: "${meta.id}" + def outfmt = args.contains('-outfmt fasta') ? 'fasta' : + args.contains('-outfmt maskinfo_asn1_bin') ? 'maskinfo_asn1_bin' : + args.contains('-outfmt maskinfo_asn1_text') ? 'maskinfo_asn1_text' : + args.contains('-outfmt maskinfo_xml') ? 'maskinfo_xml' : + args.contains('-outfmt seqloc_asn1_bin') ? 'seqloc_asn1_bin' : + args.contains('-outfmt seqloc_asn1_text') ? 'seqloc_asn1_text' : + args.contains('-outfmt seqloc_xml') ? 'seqloc_xml' : + 'interval' + + output = "${prefix}.${outfmt}" + """ + touch ${output} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + windowmasker: \$(windowmasker -version-full | head -n 1 | sed 's/^.*windowmasker: //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/windowmasker/ustat/meta.yml b/modules/nf-core/windowmasker/ustat/meta.yml new file mode 100644 index 00000000..6a07c935 --- /dev/null +++ b/modules/nf-core/windowmasker/ustat/meta.yml @@ -0,0 +1,50 @@ +name: windowmasker_ustat +description: A program to take a counts file and creates a file of genomic co-ordinates to be masked. 
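Together with the `mkcounts` module above, this `ustat` module completes WindowMasker's two-pass flow: the first pass tabulates unit frequencies, and the second pass uses that table to emit masking coordinates (or a masked FASTA when `-outfmt fasta` is passed through `ext.args`). A minimal sketch of how the two processes chain, assuming a hypothetical input channel; the PREPARE_GENOME subworkflow added later in this diff wires them up the same way:

    // Illustrative wiring only: counts from pass one feed pass two
    // alongside the same genome channel.
    ch_genome = Channel.of( [ [ id:'sample' ], file('assembly.fasta') ] )  // assumed input
    WINDOWMASKER_MKCOUNTS ( ch_genome )
    WINDOWMASKER_USTAT ( WINDOWMASKER_MKCOUNTS.out.counts, ch_genome )
    ch_masked = WINDOWMASKER_USTAT.out.intervals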
+keywords: + - fasta + - interval + - windowmasker +tools: + - windowmasker: + description: | + A program to mask highly repetitive and low complexity DNA sequences within a genome. + homepage: https://github.com/ncbi/ncbi-cxx-toolkit-public + documentation: https://ncbi.github.io/cxx-toolkit/ + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - counts: + type: file + description: Contains count data of repetitive regions. + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - ref: + type: file + description: An input nucleotide fasta file. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - wm_intervals: + type: file + description: | + An output file containing genomic locations of low + complexity and highly repetitive regions + pattern: "${output}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@DLBPointon" +maintainers: + - "@DLBPointon" diff --git a/nextflow.config b/nextflow.config index 988b2a1c..98b0398c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,6 +13,8 @@ params { // Input options input = null yaml = null + align = false + mask = false // Reference options fasta = null @@ -23,9 +25,13 @@ params { // Databases and related options taxdump = null busco = null - uniprot = null + blastp = null + blastx = null + blastn = null blastp_outext = 'txt' blastp_cols = 'qseqid staxids bitscore qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore' + blastx_outext = 'txt' + blastx_cols = 'qseqid staxids bitscore qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore' // MultiQC options multiqc_config = null @@ -36,7 +42,6 @@ params { // Boilerplate options outdir = 'results' - tracedir = "${params.outdir}/blobtoolkit_info" publish_dir_mode = 'copy' email = null email_on_fail = null @@ -45,17 +50,14 @@ params { hook_url = null help = false version = false - validate_params = true - show_hidden_params = false - schema_ignore_params = 'genomes' // Config options + config_profile_name = null + config_profile_description = null custom_config_version = 'master' custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" - config_profile_description = null config_profile_contact = null config_profile_url = null - config_profile_name = null // Max resource options // Defaults only, expecting to be overwritten @@ -63,6 +65,13 @@ params { max_cpus = 16 max_time = '240.h' + // Schema validation default options + validationFailUnrecognisedParams = false + validationLenientMode = false + validationSchemaIgnoreParams = 'genomes,igenomes_base' + validationShowHiddenParams = false + validate_params = true + } // Load base.config by default for all pipelines @@ -82,13 +91,12 @@ try { // } catch (Exception e) { // System.err.println("WARNING: Could not load nf-core/config/blobtoolkit profiles: ${params.custom_config_base}/pipeline/blobtoolkit.config") // } - - profiles { debug { dumpHashes = true process.beforeScript = 'echo $HOSTNAME' - cleanup = false + cleanup = false + nextflow.enable.configProcessNamesValidation = true } conda { conda.enabled = true @@ -111,17 +119,16 @@ profiles { } docker { docker.enabled = true - docker.registry = 'quay.io' - docker.userEmulation = true conda.enabled = 
false singularity.enabled = false podman.enabled = false shifter.enabled = false charliecloud.enabled = false apptainer.enabled = false + docker.runOptions = '-u $(id -u):$(id -g)' } arm { - docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' + docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' } singularity { singularity.enabled = true @@ -135,7 +142,6 @@ profiles { } podman { podman.enabled = true - podman.registry = 'quay.io' conda.enabled = false docker.enabled = false singularity.enabled = false @@ -163,6 +169,7 @@ profiles { } apptainer { apptainer.enabled = true + apptainer.autoMounts = true conda.enabled = false docker.enabled = false singularity.enabled = false @@ -172,15 +179,27 @@ profiles { } gitpod { executor.name = 'local' - executor.cpus = 16 - executor.memory = 60.GB + executor.cpus = 4 + executor.memory = 8.GB } cleanup { cleanup = true } test { includeConfig 'conf/test.config' } + test_raw { includeConfig 'conf/test_raw.config' } test_full { includeConfig 'conf/test_full.config' } } +// Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile +// Will not be used unless Apptainer / Docker / Podman / Singularity are enabled +// Set to your registry if you have a mirror of containers +apptainer.registry = 'quay.io' +docker.registry = 'quay.io' +podman.registry = 'quay.io' +singularity.registry = 'quay.io' +// Nextflow plugins +plugins { + id 'nf-validation@1.1.3' // Validation of pipeline parameters and creation of an input channel from a sample sheet +} // Export these variables to prevent local Python/R libraries from conflicting with those in the container // The JULIA depot path has been adjusted to a fixed path `/usr/local/share/julia` that needs to be used for packages in the container. @@ -196,22 +215,25 @@ env { // Capture exit codes from upstream processes when piping process.shell = ['/bin/bash', '-euo', 'pipefail'] +// Disable process selector warnings by default. Use debug profile to enable warnings. 
+nextflow.enable.configProcessNamesValidation = false + def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') timeline { enabled = true - file = "${params.tracedir}/execution_timeline_${trace_timestamp}.html" + file = "${params.outdir}/pipeline_info/blobtoolkit/execution_timeline_${trace_timestamp}.html" } report { enabled = true - file = "${params.tracedir}/execution_report_${trace_timestamp}.html" + file = "${params.outdir}/pipeline_info/blobtoolkit/execution_report_${trace_timestamp}.html" } trace { enabled = true - file = "${params.tracedir}/execution_trace_${trace_timestamp}.txt" + file = "${params.outdir}/pipeline_info/blobtoolkit/execution_trace_${trace_timestamp}.txt" } dag { enabled = true - file = "${params.tracedir}/pipeline_dag_${trace_timestamp}.html" + file = "${params.outdir}/pipeline_info/blobtoolkit/pipeline_dag_${trace_timestamp}.html" } manifest { @@ -220,7 +242,7 @@ manifest { homePage = 'https://github.com/sanger-tol/blobtoolkit' description = """Quality assessment of genome assemblies""" mainScript = 'main.nf' - nextflowVersion = '!>=23.04.1' + nextflowVersion = '!>=23.04.0' version = '0.2.0' doi = '10.5281/zenodo.7949058' } diff --git a/nextflow_schema.json b/nextflow_schema.json index a960bee2..37c8a567 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -15,13 +15,23 @@ "input": { "type": "string", "format": "file-path", + "exists": true, "mimetype": "text/csv", "pattern": "^\\S+\\.csv$", - "schema": "assets/schema_input.json", "description": "Path to comma-separated file containing information about the samples in the experiment.", "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. 
It has to be a comma-separated file with 3 columns, and a header row.", "fa_icon": "fas fa-file-csv" }, + "align": { + "type": "boolean", + "description": "Turn on optional alignment before running the rest of the pipeline.", + "fa_icon": "fas fa-toggle-off" + }, + "mask": { + "type": "boolean", + "description": "Turn on optional genome masking if needed.", + "fa_icon": "fas fa-toggle-off" + }, "yaml": { "type": "string", "format": "file-path", @@ -79,7 +89,7 @@ "type": "object", "fa_icon": "fas fa-database", "description": "Define the location and parameters to work with databases.", - "required": ["uniprot", "taxdump"], + "required": ["blastp", "blastx", "blastn", "taxdump"], "properties": { "taxa_file": { "type": "string", @@ -106,13 +116,38 @@ "fa_icon": "fas fa-file-circle-question", "default": "txt" }, - "uniprot": { + "blastx_cols": { + "type": "string", + "description": "When blastx_outext is 'txt', this is the list of columns that Diamond BLAST should print.", + "default": "qseqid staxids bitscore qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore" + }, + "blastx_outext": { + "type": "string", + "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"], + "description": "Extension (file format) of the output file from Diamond BLAST.", + "fa_icon": "fas fa-file-circle-question", + "default": "txt" + }, + "blastp": { "type": "string", "format": "file-path", "pattern": "^\\S+\\.dmnd$", "description": "Path to the Diamond species-specific buscogenes database", "fa_icon": "fas fa-file-archive" }, + "blastx": { + "type": "string", + "format": "file-path", + "pattern": "^\\S+\\.dmnd$", + "description": "Path to the Diamond species-specific buscoregions database", + "fa_icon": "fas fa-file-archive" + }, + "blastn": { + "type": "string", + "format": "directory-path", + "description": "Path to the nucleotide BLAST database", + "fa_icon": "fas fa-file-archive" + }, "taxdump": { "type": "string", "format": "directory-path", @@ -198,7 +233,7 @@ "description": "Maximum amount of time that can be requested for any single job.", "default": "240.h", "fa_icon": "far fa-clock", - "pattern": "^(\\d+\\.?\\s*(s|m|h|day)\\s*)+$", + "pattern": "^(\\d+\\.?\\s*(s|m|h|d|day)\\s*)+$", "hidden": true, "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`" } @@ -269,6 +304,7 @@ }, "multiqc_config": { "type": "string", + "format": "file-path", "description": "Custom config file to supply to MultiQC.", "fa_icon": "fas fa-cog", "hidden": true @@ -284,13 +320,6 @@ "description": "Custom MultiQC yaml file containing HTML including a methods description.", "fa_icon": "fas fa-cog" }, - "tracedir": { - "type": "string", - "description": "Directory to keep pipeline Nextflow logs and reports.", - "default": "${params.outdir}/blobtoolkit_info", - "fa_icon": "fas fa-cogs", - "hidden": true - }, "validate_params": { "type": "boolean", "description": "Boolean whether to validate parameters against the schema at runtime", @@ -298,12 +327,26 @@ "fa_icon": "fas fa-check-square", "hidden": true }, - "show_hidden_params": { + "validationShowHiddenParams": { "type": "boolean", "fa_icon": "far fa-eye-slash", "description": "Show all params when using `--help`", "hidden": true, "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." 
+        },
+        "validationFailUnrecognisedParams": {
+            "type": "boolean",
+            "fa_icon": "far fa-check-circle",
+            "description": "Validation of parameters fails when an unrecognised parameter is found.",
+            "hidden": true,
+            "help_text": "By default, when an unrecognised parameter is found, it returns a warning."
+        },
+        "validationLenientMode": {
+            "type": "boolean",
+            "fa_icon": "far fa-check-circle",
+            "description": "Validation of parameters in lenient mode.",
+            "hidden": true,
+            "help_text": "Allows string values that are parseable as numbers or booleans. For further information see [JSONSchema docs](https://github.com/everit-org/json-schema#lenient-mode)."
+        }
      }
    }
}
diff --git a/pipeline_template.yml b/pipeline_template.yml
deleted file mode 100644
index 0aa7398f..00000000
--- a/pipeline_template.yml
+++ /dev/null
@@ -1,3 +0,0 @@
-prefix: sanger-tol
-skip:
-  - igenomes
diff --git a/subworkflows/local/blobtools.nf b/subworkflows/local/blobtools.nf
index 730e1334..8411ad24 100644
--- a/subworkflows/local/blobtools.nf
+++ b/subworkflows/local/blobtools.nf
@@ -2,8 +2,9 @@
 //
 // Create BlobTools dataset
 //
 
-include { BLOBTOOLKIT_METADATA } from '../../modules/local/blobtoolkit/metadata'
-include { BLOBTOOLKIT_BLOBDIR } from '../../modules/local/blobtoolkit/blobdir'
+include { BLOBTOOLKIT_METADATA      } from '../../modules/local/blobtoolkit/metadata'
+include { BLOBTOOLKIT_CREATEBLOBDIR } from '../../modules/local/blobtoolkit/createblobdir'
+include { BLOBTOOLKIT_UPDATEBLOBDIR } from '../../modules/local/blobtoolkit/updateblobdir'
 
 workflow BLOBTOOLS {
     take:
@@ -11,6 +12,8 @@ workflow BLOBTOOLS {
     windowstats // channel: [ val(meta), path(window_stats_tsvs) ]
     busco       // channel: [ val(meta), path(full_table) ]
     blastp      // channel: [ val(meta), path(txt) ]
+    blastx      // channel: [ val(meta), path(txt) ]
+    blastn      // channel: [ val(meta), path(txt) ]
     taxdump     // channel: path(taxdump_db)
@@ -28,12 +31,19 @@ workflow BLOBTOOLS {
     //
     // Create Blobtools dataset files
     //
-    BLOBTOOLKIT_BLOBDIR ( windowstats, busco, blastp, BLOBTOOLKIT_METADATA.out.yaml, taxdump )
-    ch_versions = ch_versions.mix ( BLOBTOOLKIT_BLOBDIR.out.versions.first() )
+    BLOBTOOLKIT_CREATEBLOBDIR ( windowstats, busco, blastp, BLOBTOOLKIT_METADATA.out.yaml, taxdump )
+    ch_versions = ch_versions.mix ( BLOBTOOLKIT_CREATEBLOBDIR.out.versions.first() )
+
+
+    //
+    // Update Blobtools dataset files
+    //
+    BLOBTOOLKIT_UPDATEBLOBDIR ( BLOBTOOLKIT_CREATEBLOBDIR.out.blobdir, blastx, blastn, taxdump )
+    ch_versions = ch_versions.mix ( BLOBTOOLKIT_UPDATEBLOBDIR.out.versions.first() )
 
     emit:
-    metadata = BLOBTOOLKIT_METADATA.out.yaml // channel: [ val(meta), path(yaml) ]
-    blobdir = BLOBTOOLKIT_BLOBDIR.out.blobdir // channel: [ val(meta), path(dir) ]
-    versions = ch_versions // channel: [ versions.yml ]
+    metadata = BLOBTOOLKIT_METADATA.out.yaml         // channel: [ val(meta), path(yaml) ]
+    blobdir  = BLOBTOOLKIT_UPDATEBLOBDIR.out.blobdir // channel: [ val(meta), path(dir) ]
+    versions = ch_versions                           // channel: [ versions.yml ]
 }
diff --git a/subworkflows/local/busco_diamond_blastp.nf b/subworkflows/local/busco_diamond_blastp.nf
index 44fe8b6c..6037de19 100644
--- a/subworkflows/local/busco_diamond_blastp.nf
+++ b/subworkflows/local/busco_diamond_blastp.nf
@@ -27,6 +27,18 @@ workflow BUSCO_DIAMOND {
     //
     GOAT_TAXONSEARCH ( taxon_taxa )
     ch_versions = ch_versions.mix ( GOAT_TAXONSEARCH.out.versions.first() )
+
+
+    //
+    // Get NCBI species ID
+    //
+    GOAT_TAXONSEARCH.out.taxonsearch
+    | map { meta, csv -> csv.splitCsv(header:true, sep:'\t', strip:true) }
+    | map { row -> [
row.taxon_rank, row.taxon_id ] } + | transpose() + | filter { rank,id -> rank =~ /species/ } + | map { rank, id -> id} + | set { ch_taxid } // @@ -35,11 +47,17 @@ workflow BUSCO_DIAMOND { GOAT_TAXONSEARCH.out.taxonsearch | map { meta, csv -> csv.splitCsv(header:true, sep:'\t', strip:true) } | map { row -> row.odb10_lineage.findAll { it != "" } } - | map { lineages -> [ lineages + [ "bacteria_odb10", "archaea_odb10" ] ] } + | set { ch_ancestral_lineages } + + + // Add the basal lineages to the list (excluding duplicates) + basal_lineages = [ "archaea_odb10", "bacteria_odb10", "eukaryota_odb10" ] + ch_ancestral_lineages + | map { lineages -> (lineages + basal_lineages).unique() } | flatten () | set { ch_lineages } - BUSCO ( fasta, ch_lineages, busco_db.collect().ifEmpty([]), [] ) + BUSCO ( fasta, "genome", ch_lineages, busco_db.collect().ifEmpty([]), [] ) ch_versions = ch_versions.mix ( BUSCO.out.versions.first() ) @@ -47,18 +65,13 @@ workflow BUSCO_DIAMOND { // Select input for BLOBTOOLKIT_EXTRACTBUSCOS // BUSCO.out.seq_dir - | map { meta, seq -> [ [ "id": seq.parent.baseName ], seq ] } - | branch { - meta, seq -> - archaea : meta.id == "run_archaea_odb10" - bacteria : meta.id == "run_bacteria_odb10" - eukaryota : meta.id == "run_eukaryota_odb10" - } - | set { ch_busco } - - - // Extract BUSCO genes from the 3 kingdoms - BLOBTOOLKIT_EXTRACTBUSCOS ( fasta, ch_busco.archaea, ch_busco.bacteria, ch_busco.eukaryota ) + | filter { meta, seq -> basal_lineages.contains(seq.parent.baseName.minus("run_")) } + | groupTuple() + | set { ch_basal_buscos } + + + // Extract BUSCO genes from the basal lineages + BLOBTOOLKIT_EXTRACTBUSCOS ( fasta, ch_basal_buscos ) ch_versions = ch_versions.mix ( BLOBTOOLKIT_EXTRACTBUSCOS.out.versions.first() ) @@ -91,6 +104,7 @@ workflow BUSCO_DIAMOND { first_table = ch_first_table // channel: [ val(meta), path(full_table) ] full_table = BUSCO.out.full_table // channel: [ val(meta), path(full_tables) ] blastp_txt = DIAMOND_BLASTP.out.txt // channel: [ val(meta), path(txt) ] + taxon_id = ch_taxid // channel: taxon_id multiqc // channel: [ meta, summary ] versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/collate_stats.nf b/subworkflows/local/collate_stats.nf index ac567621..21baf44a 100644 --- a/subworkflows/local/collate_stats.nf +++ b/subworkflows/local/collate_stats.nf @@ -28,7 +28,7 @@ workflow COLLATE_STATS { ch_versions = ch_versions.mix ( BLOBTOOLKIT_COUNTBUSCOS.out.versions.first() ) - // Combine outputs from Fasta windows, mosdepth, and count BUSCO genes + // Combine outputs from Fasta windows, blobtk depth, and count BUSCO genes WINDOWSTATS_INPUT ( freq, mononuc, cov, BLOBTOOLKIT_COUNTBUSCOS.out.tsv ) ch_versions = ch_versions.mix ( WINDOWSTATS_INPUT.out.versions.first() ) diff --git a/subworkflows/local/coverage_stats.nf b/subworkflows/local/coverage_stats.nf index 0d13824b..79b39a8a 100644 --- a/subworkflows/local/coverage_stats.nf +++ b/subworkflows/local/coverage_stats.nf @@ -2,33 +2,50 @@ // Calculate genome coverage and statistics // -include { SAMTOOLS_VIEW } from '../../modules/nf-core/samtools/view/main' -include { MOSDEPTH } from '../../modules/nf-core/mosdepth/main' -include { FASTAWINDOWS } from '../../modules/nf-core/fastawindows/main' -include { CREATE_BED } from '../../modules/local/create_bed' +include { SAMTOOLS_VIEW } from '../../modules/nf-core/samtools/view/main' +include { SAMTOOLS_INDEX } from '../../modules/nf-core/samtools/index/main' +include { BLOBTK_DEPTH } from '../../modules/local/blobtk/depth' 
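The rewritten COVERAGE_STATS subworkflow below accepts BAM alongside CRAM and branches on the file extension, so CRAM goes through `samtools view` (which needs the reference) while BAM goes straight to indexing. As a toy illustration of the `branch` semantics used there, with made-up sample names:

    // Each branch can reshape its tuple; CRAM entries gain an empty
    // index slot to match the SAMTOOLS_VIEW input signature.
    Channel.of( [ [ id:'s1' ], file('s1.bam') ],
                [ [ id:'s2' ], file('s2.cram') ] )
    | branch { meta, aln ->
        bam  : aln.toString().endsWith("bam")
               return [ meta, aln ]
        cram : aln.toString().endsWith("cram")
               return [ meta, aln, [] ]
    }
    | set { ch_aln_idx }  // yields ch_aln_idx.bam and ch_aln_idx.cram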
+include { FASTAWINDOWS } from '../../modules/nf-core/fastawindows/main' +include { CREATE_BED } from '../../modules/local/create_bed' workflow COVERAGE_STATS { take: - cram // channel: [ val(meta), path(cram) ] - fasta // channel: [ val(meta), path(fasta) ] + input // channel: [ val(meta), path(aln) ] + fasta // channel: [ val(meta), path(fasta) ] main: ch_versions = Channel.empty() - // Convert from CRAM to BAM - cram - | map { meta, cram -> [ meta, cram, [] ] } - | set { ch_cram_crai} + // Create aligned BAM and index CSI channel + input + | branch { meta, aln -> + bam : aln.toString().endsWith("bam") == true + return [ meta, aln ] + cram : aln.toString().endsWith("cram") == true + return [ meta, aln, [] ] + } + | set { ch_aln_idx} - fasta - | map { meta, fasta -> fasta } - | set { ch_fasta } + SAMTOOLS_VIEW ( ch_aln_idx.cram, fasta, [] ) + ch_versions = ch_versions.mix ( SAMTOOLS_VIEW.out.versions.first() ) - SAMTOOLS_VIEW ( ch_cram_crai, ch_fasta, [] ) - ch_versions = ch_versions.mix ( SAMTOOLS_VIEW.out.versions.first() ) + SAMTOOLS_VIEW.out.bam + | join ( SAMTOOLS_VIEW.out.csi ) + | set { ch_view } + + SAMTOOLS_INDEX ( ch_aln_idx.bam ) + ch_versions = ch_versions.mix ( SAMTOOLS_INDEX.out.versions.first() ) + + ch_aln_idx.bam + | join ( SAMTOOLS_INDEX.out.csi ) + | set { ch_index } + + ch_view + | mix ( ch_index ) + | set { ch_bam_csi } // Calculate genome statistics @@ -42,35 +59,22 @@ workflow COVERAGE_STATS { // Calculate coverage - SAMTOOLS_VIEW.out.bam - | join ( SAMTOOLS_VIEW.out.csi ) - | combine ( CREATE_BED.out.bed ) - | map { meta, bam, csi, meta2, bed -> [ meta, bam, csi, bed ] } - | set { ch_bam_csi_bed } - - MOSDEPTH ( ch_bam_csi_bed, fasta ) - ch_versions = ch_versions.mix ( MOSDEPTH.out.versions.first() ) + BLOBTK_DEPTH ( ch_bam_csi ) + ch_versions = ch_versions.mix ( BLOBTK_DEPTH.out.versions.first() ) - // Combining mosdepth regions_bed in single channel - MOSDEPTH.out.regions_bed + // Combining regions_bed in single channel + BLOBTK_DEPTH.out.bed | combine ( fasta ) | map { meta, bed, meta2, fasta -> [ meta2, bed ] } | groupTuple () | set { ch_coverage } - // Mosdepth results for MULTIQC - MOSDEPTH.out.regions_txt - | ifEmpty ( MOSDEPTH.out.global_txt ) - | set { multiqc } - - emit: freq = FASTAWINDOWS.out.freq // channel: [ val(meta), path(freq) ] mononuc = FASTAWINDOWS.out.mononuc // channel: [ val(meta), path(mononuc) ] bed = CREATE_BED.out.bed // channel: [ val(meta), path(bed) ] cov = ch_coverage // channel: [ val(meta), path(regions.bed.gz) ] - multiqc // channel: [ val(meta), path(dist.txt) ] versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 0b02604b..01849bd1 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -2,23 +2,46 @@ // Check input samplesheet and get aligned read channels // -include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' +include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' +include { BLOBTOOLKIT_CONFIG } from '../../modules/local/blobtoolkit/config' workflow INPUT_CHECK { take: samplesheet // file: /path/to/samplesheet.csv + fasta // channel: [ meta, path(fasta) ] + yaml // channel: [ meta, path(config ] main: + ch_versions = Channel.empty() + + SAMPLESHEET_CHECK ( samplesheet ) .csv .splitCsv ( header:true, sep:',' ) .map { create_data_channels(it) } .set { aln } + ch_versions = ch_versions.mix ( SAMPLESHEET_CHECK.out.versions.first() ) + + if ( !params.yaml ) { + aln + 
| map { meta, data -> meta.id.split("_")[0..-2].join("_") } + | combine ( fasta ) + | map { sample, meta, fasta -> [ meta, sample ] } + | groupTuple() + | set { reads } + + BLOBTOOLKIT_CONFIG ( reads, fasta ) + ch_versions = ch_versions.mix ( BLOBTOOLKIT_CONFIG.out.versions.first() ) + ch_config = BLOBTOOLKIT_CONFIG.out.yaml + } else { + ch_config = yaml + } emit: - aln // channel: [ val(meta), path(datafile) ] - versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ] + aln // channel: [ val(meta), path(datafile) ] + config = ch_config // channel: [ val(meta), path(yaml) ] + versions = ch_versions // channel: [ versions.yml ] } // Function to get list of [ meta, datafile ] diff --git a/subworkflows/local/minimap_alignment.nf b/subworkflows/local/minimap_alignment.nf new file mode 100644 index 00000000..b9a4409e --- /dev/null +++ b/subworkflows/local/minimap_alignment.nf @@ -0,0 +1,71 @@ +// +// Optional alignment subworkflow using Minimap2 +// + +include { SAMTOOLS_FASTA } from '../../modules/nf-core/samtools/fasta/main' +include { MINIMAP2_ALIGN as MINIMAP2_HIC } from '../../modules/nf-core/minimap2/align/main' +include { MINIMAP2_ALIGN as MINIMAP2_ILMN } from '../../modules/nf-core/minimap2/align/main' +include { MINIMAP2_ALIGN as MINIMAP2_CCS } from '../../modules/nf-core/minimap2/align/main' +include { MINIMAP2_ALIGN as MINIMAP2_CLR } from '../../modules/nf-core/minimap2/align/main' +include { MINIMAP2_ALIGN as MINIMAP2_ONT } from '../../modules/nf-core/minimap2/align/main' + + +workflow MINIMAP2_ALIGNMENT { + take: + input // channel: [ val(meta), path(datafile) ] + fasta // channel: [ val(meta), path(fasta) ] + + + main: + ch_versions = Channel.empty() + + + // Convert reads to FASTA + SAMTOOLS_FASTA ( input, true ) + ch_versions = ch_versions.mix(SAMTOOLS_FASTA.out.versions.first()) + + + // Branch input by sequencing type + SAMTOOLS_FASTA.out.interleaved + | branch { + meta, reads -> + hic: meta.datatype == "hic" + illumina : meta.datatype == "illumina" + pacbio : meta.datatype == "pacbio" + clr : meta.datatype == "pacbio_clr" + ont : meta.datatype == "ont" + } + | set { ch_input } + + + // Align with Minimap2 + MINIMAP2_HIC ( ch_input.hic, fasta, true, false, false ) + ch_versions = ch_versions.mix(MINIMAP2_HIC.out.versions.first()) + + MINIMAP2_ILMN ( ch_input.illumina, fasta, true, false, false ) + ch_versions = ch_versions.mix(MINIMAP2_ILMN.out.versions.first()) + + MINIMAP2_CCS ( ch_input.pacbio, fasta, true, false, false ) + ch_versions = ch_versions.mix(MINIMAP2_CCS.out.versions.first()) + + MINIMAP2_CLR ( ch_input.clr, fasta, true, false, false ) + ch_versions = ch_versions.mix(MINIMAP2_CLR.out.versions.first()) + + MINIMAP2_ONT ( ch_input.ont, fasta, true, false, false ) + ch_versions = ch_versions.mix(MINIMAP2_ONT.out.versions.first()) + + + // Combine aligned reads + Channel.empty() + | mix ( MINIMAP2_HIC.out.bam ) + | mix ( MINIMAP2_ILMN.out.bam ) + | mix ( MINIMAP2_CCS.out.bam ) + | mix ( MINIMAP2_CLR.out.bam ) + | mix ( MINIMAP2_ONT.out.bam ) + | set { ch_aligned } + + + emit: + aln = ch_aligned // channel: [ val(meta), bam ] + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf new file mode 100644 index 00000000..d1e31a72 --- /dev/null +++ b/subworkflows/local/prepare_genome.nf @@ -0,0 +1,49 @@ +// +// Prepare genome for downstream processing +// + +include { GUNZIP } from '../../modules/nf-core/gunzip/main' +include { WINDOWMASKER_MKCOUNTS } from 
'../../modules/nf-core/windowmasker/mkcounts/main'
+include { WINDOWMASKER_USTAT    } from '../../modules/nf-core/windowmasker/ustat/main'
+
+
+workflow PREPARE_GENOME {
+    take:
+    fasta // channel: [ meta, path(genome) ]
+
+
+    main:
+    ch_versions = Channel.empty()
+
+
+    //
+    // MODULE: Decompress FASTA file if needed
+    //
+    if ( params.fasta.endsWith('.gz') ) {
+        ch_genome   = GUNZIP ( fasta ).gunzip
+        ch_versions = ch_versions.mix ( GUNZIP.out.versions )
+    } else {
+        ch_genome = fasta
+    }
+
+
+    //
+    // MODULES: Mask the genome if needed
+    //
+    if ( params.mask ) {
+        WINDOWMASKER_MKCOUNTS ( ch_genome )
+        ch_versions = ch_versions.mix ( WINDOWMASKER_MKCOUNTS.out.versions )
+
+        WINDOWMASKER_USTAT ( WINDOWMASKER_MKCOUNTS.out.counts, ch_genome )
+        ch_versions = ch_versions.mix ( WINDOWMASKER_USTAT.out.versions )
+
+        ch_fasta = WINDOWMASKER_USTAT.out.intervals
+    } else {
+        ch_fasta = ch_genome
+    }
+
+
+    emit:
+    genome   = ch_fasta    // channel: [ meta, path(genome) ]
+    versions = ch_versions // channel: [ versions.yml ]
+}
\ No newline at end of file
diff --git a/subworkflows/local/run_blastn.nf b/subworkflows/local/run_blastn.nf
new file mode 100644
index 00000000..87cb0a88
--- /dev/null
+++ b/subworkflows/local/run_blastn.nf
@@ -0,0 +1,81 @@
+//
+// BLASTN search of assembly contigs with no diamond blastx match against the nucleotide database
+//
+
+
+include { NOHIT_LIST } from '../../modules/local/nohit_list'
+include { SEQTK_SUBSEQ } from '../../modules/nf-core/seqtk/subseq/main'
+include { GUNZIP } from '../../modules/nf-core/gunzip/main'
+include { BLOBTOOLKIT_CHUNK } from '../../modules/local/blobtoolkit/chunk'
+include { BLAST_BLASTN as BLASTN_TAXON } from '../../modules/nf-core/blast/blastn/main'
+include { BLAST_BLASTN } from '../../modules/nf-core/blast/blastn/main'
+include { BLOBTOOLKIT_UNCHUNK } from '../../modules/local/blobtoolkit/unchunk'
+
+
+workflow RUN_BLASTN {
+    take:
+    blast_table // channel: [ val(meta), path(blast_table) ]
+    fasta       // channel: [ val(meta), path(fasta) ]
+    blastn      // channel: [ val(meta), path(blastn_db) ]
+    taxon_id    // channel: val(taxon_id)
+
+
+    main:
+    ch_versions = Channel.empty()
+
+
+    // Extract no hits fasta
+    // Get list of sequence ids with no hits in diamond blastx search
+    NOHIT_LIST ( blast_table, fasta )
+    ch_versions = ch_versions.mix ( NOHIT_LIST.out.versions.first() )
+
+    // Subset of sequences with no hits (meta is not propagated in this step)
+    SEQTK_SUBSEQ (
+        fasta.map { meta, genome -> genome },
+        NOHIT_LIST.out.nohitlist.map { meta, nohit -> nohit }
+    )
+    ch_versions = ch_versions.mix ( SEQTK_SUBSEQ.out.versions.first() )
+
+
+    // Split long contigs into chunks
+    // add meta to fasta subset channel: [ val(meta), path(compressed_fasta) ]
+    ch_gz = fasta.combine(SEQTK_SUBSEQ.out.sequences).map { meta, genome, seq -> [ meta, seq ] }
+
+    // uncompress fasta
+    GUNZIP ( ch_gz )
+
+    // create chunks
+    BLOBTOOLKIT_CHUNK ( GUNZIP.out.gunzip, [[],[]] )
+    ch_versions = ch_versions.mix ( BLOBTOOLKIT_CHUNK.out.versions.first() )
+
+
+    // Run blastn search
+    // run blastn excluding taxon_id
+    BLASTN_TAXON ( BLOBTOOLKIT_CHUNK.out.chunks, blastn, taxon_id )
+
+    // A plain `if` on a channel object is always truthy, so the emptiness test
+    // has to stay inside the dataflow: split the results by whether the
+    // taxon-excluded search produced any hits
+    BLASTN_TAXON.out.txt
+    | branch { meta, txt ->
+        nohit : txt.isEmpty()
+        hit   : true
+    }
+    | set { ch_blastn_taxon }
+
+    // repeat the blastn search without excluding taxon_id, but only for chunks
+    // whose first search returned an empty table
+    ch_blastn_taxon.nohit
+    | map { meta, txt -> [ meta ] }
+    | join ( BLOBTOOLKIT_CHUNK.out.chunks )
+    | set { ch_nohit_chunks }
+
+    BLAST_BLASTN ( ch_nohit_chunks, blastn, [] )
+
+    ch_blastn_taxon.hit
+    | mix ( BLAST_BLASTN.out.txt )
+    | set { ch_blastn_txt }
+
+    ch_versions = ch_versions.mix (
diff --git a/subworkflows/local/run_blastn.nf b/subworkflows/local/run_blastn.nf
new file mode 100644
index 00000000..87cb0a88
--- /dev/null
+++ b/subworkflows/local/run_blastn.nf
@@ -0,0 +1,81 @@
+//
+// BLASTN search of assembly contigs with no Diamond blastx match against the nucleotide database
+//
+include { NOHIT_LIST                   } from '../../modules/local/nohit_list'
+include { SEQTK_SUBSEQ                 } from '../../modules/nf-core/seqtk/subseq/main'
+include { GUNZIP                       } from '../../modules/nf-core/gunzip/main'
+include { BLOBTOOLKIT_CHUNK            } from '../../modules/local/blobtoolkit/chunk'
+include { BLAST_BLASTN as BLASTN_TAXON } from '../../modules/nf-core/blast/blastn/main'
+include { BLAST_BLASTN                 } from '../../modules/nf-core/blast/blastn/main'
+include { BLOBTOOLKIT_UNCHUNK          } from '../../modules/local/blobtoolkit/unchunk'
+
+workflow RUN_BLASTN {
+    take:
+    blast_table // channel: [ val(meta), path(blast_table) ]
+    fasta       // channel: [ val(meta), path(fasta) ]
+    blastn      // channel: [ val(meta), path(blastn_db) ]
+    taxon_id    // channel: val(taxon_id)
+
+    main:
+    ch_versions = Channel.empty()
+
+    // Get list of sequence ids with no hits in the Diamond blastx search
+    NOHIT_LIST ( blast_table, fasta )
+    ch_versions = ch_versions.mix ( NOHIT_LIST.out.versions.first() )
+
+    // Subset of sequences with no hits (meta is not propagated in this step)
+    SEQTK_SUBSEQ (
+        fasta.map { meta, genome -> genome },
+        NOHIT_LIST.out.nohitlist.map { meta, nohit -> nohit }
+    )
+    ch_versions = ch_versions.mix ( SEQTK_SUBSEQ.out.versions.first() )
+
+    // Restore meta on the fasta subset: [ val(meta), path(compressed_fasta) ]
+    ch_gz = fasta.combine ( SEQTK_SUBSEQ.out.sequences ).map { meta, genome, seq -> [ meta, seq ] }
+
+    // Uncompress the fasta subset
+    GUNZIP ( ch_gz )
+    ch_versions = ch_versions.mix ( GUNZIP.out.versions.first() )
+
+    // Split long contigs into chunks
+    BLOBTOOLKIT_CHUNK ( GUNZIP.out.gunzip, [[],[]] )
+    ch_versions = ch_versions.mix ( BLOBTOOLKIT_CHUNK.out.versions.first() )
+
+    // Run blastn search, excluding taxon_id
+    BLASTN_TAXON ( BLOBTOOLKIT_CHUNK.out.chunks, blastn, taxon_id )
+    ch_versions = ch_versions.mix ( BLASTN_TAXON.out.versions.first() )
+
+    // Route each sample by whether its blastn output table is empty.
+    // Note: a plain `if` on a channel is always truthy, so the routing
+    // has to happen per item with branch().
+    BLASTN_TAXON.out.txt
+    | branch { meta, txt ->
+        empty : txt.isEmpty()
+        hits  : true
+    }
+    | set { ch_blastn_taxon }
+
+    // Repeat the blastn search without excluding taxon_id for the
+    // samples with no hits, re-pairing them with their chunks
+    ch_blastn_taxon.empty
+    | join ( BLOBTOOLKIT_CHUNK.out.chunks )
+    | map { meta, txt, chunks -> [ meta, chunks ] }
+    | set { ch_no_hits }
+
+    BLAST_BLASTN ( ch_no_hits, blastn, [] )
+    ch_versions = ch_versions.mix ( BLAST_BLASTN.out.versions.first() )
+
+    ch_blastn_taxon.hits
+    | mix ( BLAST_BLASTN.out.txt )
+    | set { ch_blastn_txt }
+
+    // Unchunk chunked blastn results
+    BLOBTOOLKIT_UNCHUNK ( ch_blastn_txt )
+    ch_versions = ch_versions.mix ( BLOBTOOLKIT_UNCHUNK.out.versions.first() )
+
+    emit:
+    blastn_out = BLOBTOOLKIT_UNCHUNK.out.blast_out // channel: [ val(meta), path(blastn_out) ]
+    versions   = ch_versions                       // channel: [ versions.yml ]
+}
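Editor's note: the draft of this subworkflow gated the fallback search with `if ( is_txt_empty )` after mapping the channel to booleans. A Groovy `if` tests the channel object itself, which is always truthy, and whenever the else branch was intended, `BLAST_BLASTN.out` would have been referenced without the process ever running. The version above therefore routes each item with branch() and joins the empty results back to their chunks. The standalone toy below, with no pipeline dependencies, demonstrates the same routing pattern:

```nextflow
workflow {
    // [ id, blastn_table_contents ]: an empty string stands in for an empty file
    Channel.of( [ 'chunk1', '' ], [ 'chunk2', 'hit\tdata' ] )
    | branch { id, txt ->
        empty : !txt
        hits  : true
    }
    | set { ch_results }

    ch_results.empty.view { id, txt -> "${id}: rerun without taxon exclusion" }
    ch_results.hits.view  { id, txt -> "${id}: keep taxon-excluded hits" }
}
```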
diff --git a/subworkflows/local/run_blastx.nf b/subworkflows/local/run_blastx.nf
new file mode 100644
index 00000000..1bad6f6d
--- /dev/null
+++ b/subworkflows/local/run_blastx.nf
@@ -0,0 +1,46 @@
+//
+// Diamond blastx search of assembly contigs against the UniProt reference proteomes
+//
+include { BLOBTOOLKIT_CHUNK   } from '../../modules/local/blobtoolkit/chunk'
+include { BLOBTOOLKIT_UNCHUNK } from '../../modules/local/blobtoolkit/unchunk'
+include { DIAMOND_BLASTX      } from '../../modules/nf-core/diamond/blastx/main'
+
+workflow RUN_BLASTX {
+    take:
+    fasta  // channel: [ val(meta), path(fasta) ]
+    table  // channel: [ val(meta), path(busco_table) ]
+    blastx // channel: [ val(meta), path(blastx_db) ]
+    outext // channel: val(out_format)
+    cols   // channel: val(column_names)
+
+    main:
+    ch_versions = Channel.empty()
+
+    //
+    // Split the assembly into chunks, guided by the BUSCO table
+    //
+    BLOBTOOLKIT_CHUNK ( fasta, table )
+    ch_versions = ch_versions.mix ( BLOBTOOLKIT_CHUNK.out.versions.first() )
+
+    //
+    // Run diamond_blastx
+    //
+    DIAMOND_BLASTX ( BLOBTOOLKIT_CHUNK.out.chunks, blastx, outext, cols )
+    ch_versions = ch_versions.mix ( DIAMOND_BLASTX.out.versions.first() )
+
+    //
+    // Unchunk chunked blastx results
+    //
+    BLOBTOOLKIT_UNCHUNK ( DIAMOND_BLASTX.out.txt )
+    ch_versions = ch_versions.mix ( BLOBTOOLKIT_UNCHUNK.out.versions.first() )
+
+    emit:
+    blastx_out = BLOBTOOLKIT_UNCHUNK.out.blast_out // channel: [ val(meta), path(blastx_out) ]
+    versions   = ch_versions                       // channel: [ versions.yml ]
+}
diff --git a/subworkflows/local/view.nf b/subworkflows/local/view.nf
index f1bf89d6..505d6c36 100644
--- a/subworkflows/local/view.nf
+++ b/subworkflows/local/view.nf
@@ -3,7 +3,7 @@
 //
 
 include { BLOBTOOLKIT_SUMMARY } from '../../modules/local/blobtoolkit/summary'
-include { BLOBTOOLKIT_IMAGES  } from '../../modules/local/blobtoolkit/images'
+include { BLOBTK_IMAGES       } from '../../modules/local/blobtk/images'
 
 workflow VIEW {
     take:
@@ -24,14 +24,14 @@ workflow VIEW {
     //
     // Generate static plots in png format
     //
-    plots = [ "snail", "blob", "cumulative" ]
+    plots = [ "blob", "cumulative", "snail" ]
 
-    BLOBTOOLKIT_IMAGES ( blobdir, plots )
-    ch_versions = ch_versions.mix( BLOBTOOLKIT_IMAGES.out.versions )
+    BLOBTK_IMAGES ( blobdir, plots )
+    ch_versions = ch_versions.mix( BLOBTK_IMAGES.out.versions )
 
     emit:
     summary  = BLOBTOOLKIT_SUMMARY.out.json // channel: [ val(meta), path(json) ]
-    images   = BLOBTOOLKIT_IMAGES.out.png   // channel: [ val(meta), path(png) ]
+    images   = BLOBTK_IMAGES.out.png        // channel: [ val(meta), path(png) ]
     versions = ch_versions                  // channel: [ versions.yml ]
 }
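Editor's note: RUN_BLASTX wraps the chunk, search, unchunk pattern so long contigs are searched in pieces and the hits are mapped back to contig coordinates afterwards. A hypothetical invocation is sketched below; the file paths are placeholders, and the output format and column strings are guesses standing in for params.blastx_outext and params.blastx_cols (check nextflow.config for the real defaults):

```nextflow
include { RUN_BLASTX } from './subworkflows/local/run_blastx'

workflow {
    fasta  = Channel.value( [ [ id: 'test' ], file('genome.fna') ] )
    table  = Channel.value( [ [ id: 'test' ], file('busco_first_table.tsv') ] )
    blastx = Channel.value( [ [ id: 'test' ], file('reference_proteomes.dmnd') ] )

    // 'txt' and the column list are illustrative values only
    RUN_BLASTX ( fasta, table, blastx, 'txt', 'qseqid staxids bitscore' )
    RUN_BLASTX.out.blastx_out.view()
}
```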
diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf
index c8dad117..0d452a54 100644
--- a/workflows/blobtoolkit.nf
+++ b/workflows/blobtoolkit.nf
@@ -1,24 +1,32 @@
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    VALIDATE INPUTS
+    PRINT PARAMS SUMMARY
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
 
-def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params)
+include { paramsSummaryLog; paramsSummaryMap } from 'plugin/nf-validation'
+
+def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs)
+def citation = '\n' + WorkflowMain.citation(workflow) + '\n'
+def summary_params = paramsSummaryMap(workflow)
+
+// Print parameter summary log to screen
+log.info logo + paramsSummaryLog(workflow) + citation
 
-// Validate input parameters
 WorkflowBlobtoolkit.initialise(params, log)
 
 // Add all file path parameters for the pipeline to the list below
 // Check input path parameters to see if they exist
-def checkPathParamList = [ params.input, params.multiqc_config, params.fasta, params.taxa_file, params.taxdump, params.busco, params.uniprot ]
+def checkPathParamList = [ params.input, params.multiqc_config, params.fasta, params.taxa_file, params.taxdump, params.busco, params.blastp, params.blastx ]
 for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } }
 
 // Check mandatory parameters
 if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' }
-if (params.fasta && params.accession) { ch_fasta = Channel.of([ [ 'id': params.accession ], params.fasta ]).collect() } else { exit 1, 'Genome fasta file and accession must be specified!' }
+if (params.fasta && params.accession) { ch_fasta = Channel.of([ [ 'id': params.accession ], params.fasta ]).first() } else { exit 1, 'Genome fasta file and accession must be specified!' }
 if (params.taxon) { ch_taxon = Channel.of(params.taxon) } else { exit 1, 'NCBI Taxon ID not specified!' }
-if (params.uniprot) { ch_uniprot = file(params.uniprot) } else { exit 1, 'Diamond BLASTp database not specified!' }
+if (params.blastp && params.accession) { ch_blastp = Channel.of([ [ 'id': params.accession ], params.blastp ]).first() } else { exit 1, 'Diamond BLASTp database and accession must be specified!' }
+if (params.blastx && params.accession) { ch_blastx = Channel.of([ [ 'id': params.accession ], params.blastx ]).first() } else { exit 1, 'Diamond BLASTx database and accession must be specified!' }
+if (params.blastn && params.accession) { ch_blastn = Channel.of([ [ 'id': params.accession ], params.blastn ]).first() } else { exit 1, 'BLASTn database and accession must be specified!' }
 if (params.taxdump) { ch_taxdump = file(params.taxdump) } else { exit 1, 'NCBI Taxonomy database not specified!' }
 
 // Create channel for optional parameters
@@ -45,17 +53,22 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? fil
 //
 // MODULE: Loaded from modules/local/
 //
-include { BLOBTOOLKIT_CONFIG } from '../modules/local/blobtoolkit/config'
+include { BLOBTOOLKIT_CONFIG     } from '../modules/local/blobtoolkit/config'
+include { BLOBTOOLKIT_UPDATEMETA } from '../modules/local/blobtoolkit/updatemeta'
 
 //
 // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules
 //
-include { INPUT_CHECK    } from '../subworkflows/local/input_check'
-include { COVERAGE_STATS } from '../subworkflows/local/coverage_stats'
-include { BUSCO_DIAMOND  } from '../subworkflows/local/busco_diamond_blastp'
-include { COLLATE_STATS  } from '../subworkflows/local/collate_stats'
-include { BLOBTOOLS      } from '../subworkflows/local/blobtools'
-include { VIEW           } from '../subworkflows/local/view'
+include { PREPARE_GENOME     } from '../subworkflows/local/prepare_genome'
+include { MINIMAP2_ALIGNMENT } from '../subworkflows/local/minimap_alignment'
+include { INPUT_CHECK        } from '../subworkflows/local/input_check'
+include { COVERAGE_STATS     } from '../subworkflows/local/coverage_stats'
+include { BUSCO_DIAMOND      } from '../subworkflows/local/busco_diamond_blastp'
+include { RUN_BLASTX         } from '../subworkflows/local/run_blastx'
+include { RUN_BLASTN         } from '../subworkflows/local/run_blastn'
+include { COLLATE_STATS      } from '../subworkflows/local/collate_stats'
+include { BLOBTOOLS          } from '../subworkflows/local/blobtools'
+include { VIEW               } from '../subworkflows/local/view'
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -66,7 +79,6 @@ include { VIEW } from '../subworkflows/local/view'
 //
 // MODULE: Installed directly from nf-core/modules
 //
-include { GUNZIP                      } from '../modules/nf-core/gunzip/main'
 include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main'
 include { MULTIQC                     } from '../modules/nf-core/multiqc/main'
 
@@ -84,25 +96,32 @@ workflow BLOBTOOLKIT {
     ch_versions = Channel.empty()
 
     //
-    // MODULE: Decompress FASTA file if needed
+    // SUBWORKFLOW: Prepare genome for downstream processing
     //
-    if ( params.fasta.endsWith('.gz') ) {
-        ch_genome   = GUNZIP ( ch_fasta ).gunzip
-        ch_versions = ch_versions.mix ( GUNZIP.out.versions.first() )
-    } else {
-        ch_genome = ch_fasta
-    }
+    PREPARE_GENOME ( ch_fasta )
+    ch_versions = ch_versions.mix ( PREPARE_GENOME.out.versions )
 
     //
     // SUBWORKFLOW: Check samplesheet and create channels for downstream analysis
     //
-    INPUT_CHECK ( ch_input )
+    INPUT_CHECK ( ch_input, ch_fasta, ch_yaml )
     ch_versions = ch_versions.mix ( INPUT_CHECK.out.versions )
 
+    //
+    // SUBWORKFLOW: Optional read alignment
+    //
+    if ( params.align ) {
+        MINIMAP2_ALIGNMENT ( INPUT_CHECK.out.aln, PREPARE_GENOME.out.genome )
+        ch_versions = ch_versions.mix ( MINIMAP2_ALIGNMENT.out.versions )
+        ch_aligned  = MINIMAP2_ALIGNMENT.out.aln
+    } else {
+        ch_aligned = INPUT_CHECK.out.aln
+    }
+
     //
     // SUBWORKFLOW: Calculate genome coverage and statistics
     //
-    COVERAGE_STATS ( INPUT_CHECK.out.aln, ch_genome )
+    COVERAGE_STATS ( ch_aligned, PREPARE_GENOME.out.genome )
     ch_versions = ch_versions.mix ( COVERAGE_STATS.out.versions )
 
     //
@@ -115,27 +134,63 @@ workflow BLOBTOOLKIT {
         ch_taxon_taxa = ch_fasta.combine(ch_taxon).map { meta, fasta, taxon -> [ meta, taxon, [] ] }
     }
 
-    BUSCO_DIAMOND ( ch_genome, ch_taxon_taxa, ch_busco_db, ch_uniprot, params.blastp_outext, params.blastp_cols )
+    BUSCO_DIAMOND (
+        PREPARE_GENOME.out.genome,
+        ch_taxon_taxa,
+        ch_busco_db,
+        ch_blastp,
+        params.blastp_outext,
+        params.blastp_cols
+    )
     ch_versions = ch_versions.mix ( BUSCO_DIAMOND.out.versions )
+
+    //
+    // SUBWORKFLOW: Diamond blastx search of assembly contigs against the UniProt reference proteomes
+    //
+    RUN_BLASTX (
+        PREPARE_GENOME.out.genome,
+        BUSCO_DIAMOND.out.first_table,
+        ch_blastx,
+        params.blastx_outext,
+        params.blastx_cols
+    )
+    ch_versions = ch_versions.mix ( RUN_BLASTX.out.versions )
+
+    //
+    // SUBWORKFLOW: Run blastn search on sequences that had no blastx hits
+    //
+    RUN_BLASTN (
+        RUN_BLASTX.out.blastx_out,
+        PREPARE_GENOME.out.genome,
+        ch_blastn,
+        BUSCO_DIAMOND.out.taxon_id
+    )
+    ch_versions = ch_versions.mix ( RUN_BLASTN.out.versions )
 
     //
     // SUBWORKFLOW: Collate genome statistics by various window sizes
     //
-    COLLATE_STATS ( BUSCO_DIAMOND.out.full_table, COVERAGE_STATS.out.bed, COVERAGE_STATS.out.freq, COVERAGE_STATS.out.mononuc, COVERAGE_STATS.out.cov )
+    COLLATE_STATS (
+        BUSCO_DIAMOND.out.full_table,
+        COVERAGE_STATS.out.bed,
+        COVERAGE_STATS.out.freq,
+        COVERAGE_STATS.out.mononuc,
+        COVERAGE_STATS.out.cov
+    )
     ch_versions = ch_versions.mix ( COLLATE_STATS.out.versions )
 
     //
     // SUBWORKFLOW: Create BlobTools dataset
     //
-    if ( !params.yaml ) {
-        BLOBTOOLKIT_CONFIG ( ch_genome )
-        ch_config   = BLOBTOOLKIT_CONFIG.out.yaml
-        ch_versions = ch_versions.mix ( BLOBTOOLKIT_CONFIG.out.versions.first() )
-    } else {
-        ch_config = ch_yaml
-    }
-
-    BLOBTOOLS ( ch_config, COLLATE_STATS.out.window_tsv, BUSCO_DIAMOND.out.first_table, BUSCO_DIAMOND.out.blastp_txt.ifEmpty([[],[]]), ch_taxdump )
+    BLOBTOOLS (
+        INPUT_CHECK.out.config,
+        COLLATE_STATS.out.window_tsv,
+        BUSCO_DIAMOND.out.first_table,
+        BUSCO_DIAMOND.out.blastp_txt.ifEmpty([[],[]]),
+        RUN_BLASTX.out.blastx_out.ifEmpty([[],[]]),
+        RUN_BLASTN.out.blastn_out.ifEmpty([[],[]]),
+        ch_taxdump
+    )
     ch_versions = ch_versions.mix ( BLOBTOOLS.out.versions )
 
     //
@@ -151,20 +206,25 @@ workflow BLOBTOOLKIT {
         ch_versions.unique().collectFile(name: 'collated_versions.yml')
     )
 
+    //
+    // MODULE: Update meta json file
+    //
+    BLOBTOOLKIT_UPDATEMETA ( BLOBTOOLS.out.blobdir, CUSTOM_DUMPSOFTWAREVERSIONS.out.yml )
+    ch_versions = ch_versions.mix ( BLOBTOOLKIT_UPDATEMETA.out.versions )
+
     //
     // MODULE: MultiQC
     //
     workflow_summary    = WorkflowBlobtoolkit.paramsSummaryMultiqc(workflow, summary_params)
     ch_workflow_summary = Channel.value(workflow_summary)
 
-    methods_description    = WorkflowBlobtoolkit.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description)
+    methods_description    = WorkflowBlobtoolkit.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description, params)
     ch_methods_description = Channel.value(methods_description)
 
     ch_multiqc_files = Channel.empty()
     ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml'))
     ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml'))
     ch_multiqc_files = ch_multiqc_files.mix(BUSCO_DIAMOND.out.multiqc.collect{it[1]}.ifEmpty([]))
-    ch_multiqc_files = ch_multiqc_files.mix(COVERAGE_STATS.out.multiqc.collect{it[1]}.ifEmpty([]))
     ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect())
 
     MULTIQC (
@@ -186,6 +246,7 @@ workflow.onComplete {
     if (params.email || params.email_on_fail) {
         NfcoreTemplate.email(workflow, params, summary_params, projectDir, log, multiqc_report)
     }
+    NfcoreTemplate.dump_parameters(workflow, params)
     NfcoreTemplate.summary(workflow, params, log)
     if (params.hook_url) {
         NfcoreTemplate.IM_notification(workflow, params, summary_params, projectDir, log)
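Editor's note: the BLOBTOOLS call above feeds optional upstream results through `.ifEmpty([[],[]])`, so a step that produced nothing still supplies a well-formed, empty [ meta, file ] pair instead of leaving the process without an input. A toy illustration of that convention, independent of any pipeline module:

```nextflow
workflow {
    ch_blastx_results = Channel.empty()   // e.g. no contigs needed a blastx search

    ch_blastx_results
    | ifEmpty ( [ [], [] ] )              // empty-list placeholder for [ meta, file ]
    | view { meta, f -> meta ? "real result: ${f}" : "placeholder" }
}
```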