diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 00000000..ea27a584 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,27 @@ +{ + "name": "nfcore", + "image": "nfcore/gitpod:latest", + "remoteUser": "gitpod", + + // Configure tool-specific properties. + "customizations": { + // Configure properties specific to VS Code. + "vscode": { + // Set *default* container specific settings.json values on container create. + "settings": { + "python.defaultInterpreterPath": "/opt/conda/bin/python", + "python.linting.enabled": true, + "python.linting.pylintEnabled": true, + "python.formatting.autopep8Path": "/opt/conda/bin/autopep8", + "python.formatting.yapfPath": "/opt/conda/bin/yapf", + "python.linting.flake8Path": "/opt/conda/bin/flake8", + "python.linting.pycodestylePath": "/opt/conda/bin/pycodestyle", + "python.linting.pydocstylePath": "/opt/conda/bin/pydocstyle", + "python.linting.pylintPath": "/opt/conda/bin/pylint" + }, + + // Add the IDs of extensions you want installed when the container is created. + "extensions": ["ms-python.python", "ms-python.vscode-pylance", "nf-core.nf-core-extensionpack"] + } + } +} diff --git a/.editorconfig b/.editorconfig index b6b31907..a30ae1e1 100644 --- a/.editorconfig +++ b/.editorconfig @@ -22,3 +22,12 @@ indent_size = unset [/assets/email*] indent_size = unset + +# To prevent errors for these test diamond databases +[/assets/test*/*.dmnd] +charset = unset +end_of_line = unset +insert_final_newline = unset +trim_trailing_whitespace = unset +indent_style = unset +indent_size = unset diff --git a/.gitattributes b/.gitattributes index 050bb120..7a2dabc2 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,3 +1,4 @@ *.config linguist-language=nextflow +*.nf.test linguist-language=nextflow modules/nf-core/** linguist-generated subworkflows/nf-core/** linguist-generated diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 26f3f9ab..19adb352 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -1,22 +1,20 @@ -# nf-core/blobtoolkit: Contributing Guidelines +# sanger-tol/blobtoolkit: Contributing Guidelines Hi there! -Many thanks for taking an interest in improving nf-core/blobtoolkit. +Many thanks for taking an interest in improving sanger-tol/blobtoolkit. -We try to manage the required tasks for nf-core/blobtoolkit using GitHub issues, you probably came to this page when creating one. +We try to manage the required tasks for sanger-tol/blobtoolkit using GitHub issues, you probably came to this page when creating one. Please use the pre-filled template to save time. However, don't be put off by this template - other more general issues and suggestions are welcome! Contributions to the code are even more welcome ;) -> If you need help using or modifying nf-core/blobtoolkit then the best place to ask is on the nf-core Slack [#blobtoolkit](https://nfcore.slack.com/channels/blobtoolkit) channel ([join our Slack here](https://nf-co.re/join/slack)). - ## Contribution workflow -If you'd like to write some code for nf-core/blobtoolkit, the standard workflow is as follows: +If you'd like to write some code for sanger-tol/blobtoolkit, the standard workflow is as follows: -1. Check that there isn't already an issue about your idea in the [nf-core/blobtoolkit issues](https://github.com/nf-core/blobtoolkit/issues) to avoid duplicating work. If there isn't one already, please create one so that others know you're working on this -2. 
[Fork](https://help.github.com/en/github/getting-started-with-github/fork-a-repo) the [nf-core/blobtoolkit repository](https://github.com/nf-core/blobtoolkit) to your GitHub account +1. Check that there isn't already an issue about your idea in the [sanger-tol/blobtoolkit issues](https://github.com/sanger-tol/blobtoolkit/issues) to avoid duplicating work. If there isn't one already, please create one so that others know you're working on this +2. [Fork](https://help.github.com/en/github/getting-started-with-github/fork-a-repo) the [sanger-tol/blobtoolkit repository](https://github.com/sanger-tol/blobtoolkit) to your GitHub account 3. Make the necessary changes / additions within your forked repository following [Pipeline conventions](#pipeline-contribution-conventions) 4. Use `nf-core schema build` and add any new parameters to the pipeline JSON schema (requires [nf-core tools](https://github.com/nf-core/tools) >= 1.10). 5. Submit a Pull Request against the `dev` branch and wait for the code to be reviewed and merged @@ -52,13 +50,9 @@ These tests are run both with the latest available version of `Nextflow` and also the minimum required version that is stated in the pipeline code. - Fix the bug, and bump version (X.Y.Z+1). - A PR should be made on `master` from `patch` to fix this particular bug directly. -## Getting help - -For further information/help, please consult the [nf-core/blobtoolkit documentation](https://nf-co.re/blobtoolkit/usage) and don't hesitate to get in touch on the nf-core Slack [#blobtoolkit](https://nfcore.slack.com/channels/blobtoolkit) channel ([join our Slack here](https://nf-co.re/join/slack)). - ## Pipeline contribution conventions -To make the nf-core/blobtoolkit code and processing logic more understandable for new contributors and to ensure quality, we semi-standardise the way the code and other contributions are written. +To make the sanger-tol/blobtoolkit code and processing logic more understandable for new contributors and to ensure quality, we semi-standardise the way the code and other contributions are written. ### Adding a new step @@ -101,3 +95,19 @@ If you are using a new feature from core Nextflow, you may bump the minimum requi ### Images and figures For overview images and other documents we follow the nf-core [style guidelines and examples](https://nf-co.re/developers/design_guidelines). + +## GitHub Codespaces + +This repo includes a devcontainer configuration which will create a GitHub Codespace for Nextflow development! This is an online developer environment that runs in your browser, complete with VSCode and a terminal. + +To get started: + +- Open the repo in [Codespaces](https://github.com/sanger-tol/blobtoolkit/codespaces) +- Tools installed + - nf-core + - Nextflow + +Devcontainer specs: + +- [DevContainer config](.devcontainer/devcontainer.json) +- [Dockerfile](.devcontainer/Dockerfile) diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 4c493eb4..89aae5cc 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -2,14 +2,6 @@ name: Bug report description: Report something that is broken or incorrect labels: bug body: - - type: markdown - attributes: - value: | - Before you post this issue, please check the documentation: - - - [nf-core website: troubleshooting](https://nf-co.re/usage/troubleshooting) - - [nf-core/blobtoolkit pipeline documentation](https://nf-co.re/blobtoolkit/usage) - - type: textarea id: description attributes: label: Description of the bug description: A clear and concise description of what the bug is.
validations: required: true - - type: textarea id: command_used attributes: label: Command used and terminal output - description: Steps to reproduce the behaviour. Please paste the command you used to launch the pipeline and the output from your terminal. + description: Steps to reproduce the behaviour. Please paste the command you used + to launch the pipeline and the output from your terminal. render: console - placeholder: | - $ nextflow run ... + placeholder: "$ nextflow run ... + Some output where something broke + " - type: textarea id: files attributes: label: Relevant files - description: | - Please drag and drop the relevant files here. Create a `.zip` archive if the extension is not allowed. - Your verbose log file `.nextflow.log` is often useful _(this is a hidden file in the directory where you launched the pipeline)_ as well as custom Nextflow configuration files. + description: "Please drag and drop the relevant files here. Create a `.zip` archive + if the extension is not allowed. + + Your verbose log file `.nextflow.log` is often useful _(this is a hidden file + in the directory where you launched the pipeline)_ as well as custom Nextflow + configuration files. + " - type: textarea id: system attributes: label: System information - description: | - * Nextflow version _(eg. 21.10.3)_ + description: "* Nextflow version _(eg. 22.10.1)_ + * Hardware _(eg. HPC, Desktop, Cloud)_ + * Executor _(eg. slurm, local, awsbatch)_ - * Container engine: _(e.g. Docker, Singularity, Conda, Podman, Shifter or Charliecloud)_ + + * Container engine: _(e.g. Docker, Singularity, Conda, Podman, Shifter, Charliecloud, + or Apptainer)_ + * OS _(eg. CentOS Linux, macOS, Linux Mint)_ - * Version of nf-core/blobtoolkit _(eg. 1.1, 1.5, 1.8.2)_ + + * Version of sanger-tol/blobtoolkit _(eg. 1.1, 1.5, 1.8.2)_ + + " diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml index 9c04675e..9a0926a8 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.yml +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -1,5 +1,5 @@ name: Feature request -description: Suggest an idea for the nf-core/blobtoolkit pipeline +description: Suggest an idea for the sanger-tol/blobtoolkit pipeline labels: enhancement body: - type: textarea diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 6ca1c72b..fef3064b 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,22 +1,21 @@ ## PR checklist - [ ] This comment contains a description of changes (with reason). - [ ] If you've fixed a bug or added code that should be tested, add tests! - - [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/nf-core/blobtoolkit/tree/master/.github/CONTRIBUTING.md) - - [ ] If necessary, also make a PR on the nf-core/blobtoolkit _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository. +- [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/sanger-tol/blobtoolkit/tree/master/.github/CONTRIBUTING.md) - [ ] Make sure your code lints (`nf-core lint`). - [ ] Ensure the test suite passes (`nextflow run . -profile test,docker --outdir `). - [ ] Usage Documentation in `docs/usage.md` is updated. 
diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml deleted file mode 100644 index 2e477bfc..00000000 --- a/.github/workflows/awsfulltest.yml +++ /dev/null @@ -1,30 +0,0 @@ -name: nf-core AWS full size tests -# This workflow is triggered on published releases. -# It can be additionally triggered manually with GitHub actions workflow dispatch button. -# It runs the -profile 'test_full' on AWS batch - -on: - release: - types: [published] - workflow_dispatch: -jobs: - run-tower: - name: Run AWS full tests - if: github.repository == 'nf-core/blobtoolkit' - runs-on: ubuntu-latest - steps: - - name: Launch workflow via tower - uses: nf-core/tower-action@v3 - # TODO nf-core: You can customise AWS full pipeline tests as required - # Add full size test data (but still relatively small datasets for few samples) - # on the `test_full.config` test runs with only one set of parameters - with: - workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} - access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} - compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} - workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/blobtoolkit/work-${{ github.sha }} - parameters: | - { - "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/blobtoolkit/results-${{ github.sha }}" - } - profiles: test_full,aws_tower diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml deleted file mode 100644 index e1255e92..00000000 --- a/.github/workflows/awstest.yml +++ /dev/null @@ -1,25 +0,0 @@ -name: nf-core AWS test -# This workflow can be triggered manually with the GitHub actions workflow dispatch button. -# It runs the -profile 'test' on AWS batch - -on: - workflow_dispatch: -jobs: - run-tower: - name: Run AWS tests - if: github.repository == 'nf-core/blobtoolkit' - runs-on: ubuntu-latest - steps: - # Launch workflow using Tower CLI tool action - - name: Launch workflow via tower - uses: nf-core/tower-action@v3 - with: - workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} - access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} - compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} - workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/blobtoolkit/work-${{ github.sha }} - parameters: | - { - "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/blobtoolkit/results-test-${{ github.sha }}" - } - profiles: test,aws_tower diff --git a/.github/workflows/branch.yml b/.github/workflows/branch.yml index ec820ed2..b30ada17 100644 --- a/.github/workflows/branch.yml +++ b/.github/workflows/branch.yml @@ -11,9 +11,9 @@ jobs: steps: # PRs to the nf-core repo master branch are only ok if coming from the nf-core repo `dev` or any `patch` branches - name: Check PRs - if: github.repository == 'nf-core/blobtoolkit' + if: github.repository == 'sanger-tol/blobtoolkit' run: | - { [[ ${{github.event.pull_request.head.repo.full_name }} == nf-core/blobtoolkit ]] && [[ $GITHUB_HEAD_REF = "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]] + { [[ ${{github.event.pull_request.head.repo.full_name }} == sanger-tol/blobtoolkit ]] && [[ $GITHUB_HEAD_REF == "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]] # If the above check failed, post a comment on the PR explaining the failure # NOTE - this doesn't currently work if the PR is coming from a fork, due to limitations in GitHub actions secrets diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c0f84172..2cfe15c7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,49 +1,37 @@ name: nf-core CI # This workflow runs the pipeline with the minimal test dataset to check that it completes 
without any syntax errors on: - push: - branches: - - dev - pull_request: - release: - types: [published] + workflow_dispatch: env: NXF_ANSI_LOG: false - CAPSULE_LOG: none + +concurrency: + group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}" + cancel-in-progress: true jobs: test: name: Run pipeline with test data - # Only run on push if this is the nf-core dev branch (merged PRs) - if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/blobtoolkit') }}" + # Only run on push if this is the dev branch (merged PRs) + if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'sanger-tol/blobtoolkit') }}" runs-on: ubuntu-latest strategy: matrix: - # Nextflow versions - include: - # Test pipeline minimum Nextflow version - - NXF_VER: "21.10.3" - NXF_EDGE: "" - # Test latest edge release of Nextflow - - NXF_VER: "" - NXF_EDGE: "1" + NXF_VER: + - "22.10.1" + - "latest-everything" steps: - name: Check out pipeline code - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Install Nextflow - env: - NXF_VER: ${{ matrix.NXF_VER }} - # Uncomment only if the edge release is more recent than the latest stable release - # See https://github.com/nextflow-io/nextflow/issues/2467 - # NXF_EDGE: ${{ matrix.NXF_EDGE }} - run: | - wget -qO- get.nextflow.io | bash - sudo mv nextflow /usr/local/bin/ + uses: nf-core/setup-nextflow@v1 + with: + version: "${{ matrix.NXF_VER }}" - name: Run pipeline with test data - # TODO nf-core: You can customise CI pipeline run tests as required + # You can customise CI pipeline run tests as required # For example: adding multiple test runs with different parameters # Remember that you can parallelise this by using strategy.matrix run: | diff --git a/.github/workflows/clean-up.yml b/.github/workflows/clean-up.yml new file mode 100644 index 00000000..694e90ec --- /dev/null +++ b/.github/workflows/clean-up.yml @@ -0,0 +1,24 @@ +name: "Close user-tagged issues and PRs" +on: + schedule: + - cron: "0 0 * * 0" # Once a week + +jobs: + clean-up: + runs-on: ubuntu-latest + permissions: + issues: write + pull-requests: write + steps: + - uses: actions/stale@v7 + with: + stale-issue-message: "This issue has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor. Remove stale label or add a comment otherwise this issue will be closed in 20 days." + stale-pr-message: "This PR has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor. Remove stale label or add a comment if it is still useful." + close-issue-message: "This issue was closed because it has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor and then staled for 20 days with no activity." 
+ days-before-stale: 30 + days-before-close: 20 + days-before-pr-close: -1 + any-of-labels: "awaiting-changes,awaiting-feedback" + exempt-issue-labels: "WIP" + exempt-pr-labels: "WIP" + repo-token: "${{ secrets.GITHUB_TOKEN }}" diff --git a/.github/workflows/fix-linting.yml b/.github/workflows/fix-linting.yml index 4c061cce..b7779fad 100644 --- a/.github/workflows/fix-linting.yml +++ b/.github/workflows/fix-linting.yml @@ -9,7 +9,7 @@ jobs: if: > contains(github.event.comment.html_url, '/pull/') && contains(github.event.comment.body, '@nf-core-bot fix linting') && - github.repository == 'nf-core/blobtoolkit' + github.repository == 'sanger-tol/blobtoolkit' runs-on: ubuntu-latest steps: # Use the @nf-core-bot token to check out so we can push later @@ -24,7 +24,7 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.nf_core_bot_auth_token }} - - uses: actions/setup-node@v2 + - uses: actions/setup-node@v3 - name: Install Prettier run: npm install -g prettier @prettier/plugin-php @@ -34,9 +34,9 @@ jobs: id: prettier_status run: | if prettier --check ${GITHUB_WORKSPACE}; then - echo "::set-output name=result::pass" + echo "result=pass" >> $GITHUB_OUTPUT else - echo "::set-output name=result::fail" + echo "result=fail" >> $GITHUB_OUTPUT fi - name: Run 'prettier --write' diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 77358dee..888cb4bc 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -4,6 +4,8 @@ name: nf-core linting # that the code meets the nf-core guidelines. on: push: + branches: + - dev pull_request: release: types: [published] @@ -12,9 +14,9 @@ jobs: EditorConfig: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - - uses: actions/setup-node@v2 + - uses: actions/setup-node@v3 - name: Install editorconfig-checker run: npm install -g editorconfig-checker @@ -25,9 +27,9 @@ jobs: Prettier: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - - uses: actions/setup-node@v2 + - uses: actions/setup-node@v3 - name: Install Prettier run: npm install -g prettier @@ -35,22 +37,48 @@ jobs: - name: Run Prettier --check run: prettier --check ${GITHUB_WORKSPACE} + PythonBlack: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Check code lints with Black + uses: psf/black@stable + + # If the above check failed, post a comment on the PR explaining the failure + - name: Post PR comment + if: failure() + uses: mshick/add-pr-comment@v1 + with: + message: | + ## Python linting (`black`) is failing + + To keep the code consistent with lots of contributors, we run automated code consistency checks. + To fix this CI test, please run: + + * Install [`black`](https://black.readthedocs.io/en/stable/): `pip install black` + * Fix formatting errors in your pipeline: `black .` + + Once you push these changes the test should pass, and you can hide this comment :+1: + + We highly recommend setting up Black in your code editor so that this formatting is done automatically on save. Ask about it on Slack for help! + + Thanks again for your contribution! 
+ repo-token: ${{ secrets.GITHUB_TOKEN }} + allow-repeats: false + nf-core: runs-on: ubuntu-latest steps: - name: Check out pipeline code - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Install Nextflow - env: - CAPSULE_LOG: none - run: | - wget -qO- get.nextflow.io | bash - sudo mv nextflow /usr/local/bin/ + uses: nf-core/setup-nextflow@v1 - - uses: actions/setup-python@v3 + - uses: actions/setup-python@v4 with: - python-version: "3.6" + python-version: "3.8" architecture: "x64" - name: Install dependencies @@ -71,7 +99,7 @@ jobs: - name: Upload linting log file artifact if: ${{ always() }} - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: name: linting-logs path: | diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml index 04758f61..0bbcd30f 100644 --- a/.github/workflows/linting_comment.yml +++ b/.github/workflows/linting_comment.yml @@ -18,7 +18,7 @@ jobs: - name: Get PR number id: pr_number - run: echo "::set-output name=pr_number::$(cat linting-logs/PR_number.txt)" + run: echo "pr_number=$(cat linting-logs/PR_number.txt)" >> $GITHUB_OUTPUT - name: Post PR comment uses: marocchino/sticky-pull-request-comment@v2 diff --git a/.github/workflows/sangerfulltest.yml b/.github/workflows/sangerfulltest.yml new file mode 100644 index 00000000..addef9bc --- /dev/null +++ b/.github/workflows/sangerfulltest.yml @@ -0,0 +1,33 @@ +name: nf-core Sanger LSF full size tests + +on: + workflow_dispatch: +jobs: + run-tower: + name: Run LSF full size tests + runs-on: ubuntu-latest + steps: + - name: Sets env vars for push + run: | + echo "REVISION=${{ github.sha }}" >> $GITHUB_ENV + if: github.event_name == 'push' + + - name: Sets env vars for workflow_dispatch + run: | + echo "REVISION=${{ github.sha }}" >> $GITHUB_ENV + if: github.event_name == 'workflow_dispatch' + + - name: Launch workflow via tower + uses: nf-core/tower-action@v2 + with: + workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} + access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} + compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} + pipeline: ${{ github.repository }} + revision: ${{ env.REVISION }} + workdir: ${{ secrets.TOWER_WORKDIR_PARENT }}/work/${{ github.repository }}/work-${{ env.REVISION }} + parameters: | + { + "outdir": "${{ secrets.TOWER_WORKDIR_PARENT }}/results/${{ github.repository }}/results-${{ env.REVISION }}", + } + profiles: test_full,sanger,singularity,cleanup diff --git a/.github/workflows/sangertest.yml b/.github/workflows/sangertest.yml new file mode 100644 index 00000000..95479500 --- /dev/null +++ b/.github/workflows/sangertest.yml @@ -0,0 +1,28 @@ +name: nf-core Sanger LSF tests + +on: + workflow_dispatch: +jobs: + run-tower: + name: Run LSF tests + runs-on: ubuntu-latest + steps: + - name: Sets env vars for workflow_dispatch + run: | + echo "REVISION=${{ github.sha }}" >> $GITHUB_ENV + if: github.event_name == 'workflow_dispatch' + + - name: Launch workflow via tower + uses: nf-core/tower-action@v2 + with: + workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} + access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} + compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} + pipeline: ${{ github.repository }} + revision: ${{ env.REVISION }} + workdir: ${{ secrets.TOWER_WORKDIR_PARENT }}/work/${{ github.repository }}/work-${{ env.REVISION }} + parameters: | + { + "outdir": "${{ secrets.TOWER_WORKDIR_PARENT }}/results/${{ github.repository }}/results-${{ env.REVISION }}", + } + profiles: test,sanger,singularity,cleanup diff --git a/.nf-core.yml b/.nf-core.yml index 
3805dc81..f2175469 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1 +1,18 @@ repository_type: pipeline +lint: + files_exist: + - assets/nf-core-blobtoolkit_logo_light.png + - docs/images/nf-core-blobtoolkit_logo_light.png + - docs/images/nf-core-blobtoolkit_logo_dark.png + files_unchanged: + - LICENSE + - .github/ISSUE_TEMPLATE/bug_report.yml + - assets/sendmail_template.txt + - lib/NfcoreTemplate.groovy + - .prettierignore + nextflow_config: + - manifest.name + - manifest.homePage + multiqc_config: + - report_comment + actions_ci: false diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..0c31cdb9 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,5 @@ +repos: + - repo: https://github.com/pre-commit/mirrors-prettier + rev: "v2.7.1" + hooks: + - id: prettier diff --git a/.prettierignore b/.prettierignore index d0e7ae58..7b59d0aa 100644 --- a/.prettierignore +++ b/.prettierignore @@ -1,9 +1,11 @@ email_template.html +adaptivecard.json +slackreport.json .nextflow* work/ -data/ results/ .DS_Store -testing/ -testing* *.pyc +bin/ +assets/test/*yaml +assets/test_full/*yaml diff --git a/CHANGELOG.md b/CHANGELOG.md index 15713bbc..02ab2ff8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,16 +1,50 @@ -# nf-core/blobtoolkit: Changelog +# sanger-tol/blobtoolkit: Changelog The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## v1.0dev - [date] +## [[0.1.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.1.0)] – Vaporeon – [2023-05-18] -Initial release of nf-core/blobtoolkit, created with the [nf-core](https://nf-co.re/) template. +### Enhancements & fixes -### `Added` +Initial release of sanger-tol/blobtoolkit :tada: -### `Fixed` +This release marks the point where the pipeline was moved from Snakemake at [blobtoolkit/blobtoolkit](https://github.com/blobtoolkit/blobtoolkit) over to Nextflow DSL2 at [sanger-tol/blobtoolkit](https://github.com/sanger-tol/blobtoolkit). There are two subworkflows in the Snakemake version that are still being implemented in Nextflow – `diamond_blastx` and `blastn`. -### `Dependencies` +### Parameters -### `Deprecated` +| Old parameter | New parameter | +| ------------- | --------------- | +| | --input | +| | --fasta | +| | --accession | +| | --taxon | +| | --taxa_file | +| | --yaml | +| | --blastp_outext | +| | --blastp_cols | +| | --taxdump | +| | --busco | +| | --uniprot | + +> **NB:** Parameter has been **updated** if both old and new parameter information is present.
**NB:** Parameter has been **added** if just the new parameter information is present.
**NB:** Parameter has been **removed** if new parameter information isn't present. + +### Software dependencies + +Note, since the pipeline is using Nextflow DSL2, each process will be run with its own [Biocontainer](https://biocontainers.pro/#/registry). This means that on occasion it is entirely possible for the pipeline to be using different versions of the same tool. However, the overall software dependency changes compared to the last release have been listed below for reference. Only `Docker` or `Singularity` containers are supported, `conda` is not supported. + +| Dependency | Old version | New version | +| ------------- | ----------- | ----------- | +| blobtoolkit | | 4.1.4 | +| busco | | 5.4.3 | +| fasta_windows | | 0.2.4 | +| goat | | 0.2.0 | +| gunzip | | 1.10 | +| mosdepth | | 0.3.3 | +| nextflow | | 22.10.6 | +| python | | 3.10.6 | +| samtools | | 1.15.1 | +| tar | | 1.30 | +| yaml | | 6.0 | + +> **NB:** Dependency has been **updated** if both old and new version information is present.
**NB:** Dependency has been **added** if just the new version information is present.
**NB:** Dependency has been **removed** if version information isn't present. diff --git a/CITATIONS.md b/CITATIONS.md index e4d37d37..41f43458 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -1,19 +1,44 @@ -# nf-core/blobtoolkit: Citations +# sanger-tol/blobtoolkit: Citations -## [nf-core](https://pubmed.ncbi.nlm.nih.gov/32055031/) +## [nf-core](https://nf-co.re) -> Ewels PA, Peltzer A, Fillinger S, Patel H, Alneberg J, Wilm A, Garcia MU, Di Tommaso P, Nahnsen S. The nf-core framework for community-curated bioinformatics pipelines. Nat Biotechnol. 2020 Mar;38(3):276-278. doi: 10.1038/s41587-020-0439-x. PubMed PMID: 32055031. +> Ewels, Philip A., et al. “The Nf-Core Framework for Community-Curated Bioinformatics Pipelines.” Nature Biotechnology, vol. 38, no. 3, Feb. 2020, pp. 276–78, https://doi.org/10.1038/s41587-020-0439-x. -## [Nextflow](https://pubmed.ncbi.nlm.nih.gov/28398311/) +## [Nextflow](https://www.nextflow.io) -> Di Tommaso P, Chatzou M, Floden EW, Barja PP, Palumbo E, Notredame C. Nextflow enables reproducible computational workflows. Nat Biotechnol. 2017 Apr 11;35(4):316-319. doi: 10.1038/nbt.3820. PubMed PMID: 28398311. +> Di Tommaso, Paolo, et al. “Nextflow Enables Reproducible Computational Workflows.” Nature Biotechnology, vol. 35, no. 4, Apr. 2017, pp. 316–19, https://doi.org/10.1038/nbt.3820. ## Pipeline tools -- [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) +- [BlobToolKit](https://github.com/blobtoolkit/blobtoolkit) -- [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) - > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. + > Challis, Richard, et al. “BlobToolKit – Interactive Quality Assessment of Genome Assemblies.” G3 Genes|Genomes|Genetics, vol. 10, no. 4, Apr. 2020, pp. 1361–74, https://doi.org/10.1534/g3.119.400908. + +- [BUSCO](https://gitlab.com/ezlab/busco) + + > Manni, Mosè, et al. “BUSCO: Assessing Genomic Data Quality and Beyond.” Current Protocols, vol. 1, no. 12, Dec. 2021, https://doi.org/10.1002/cpz1.323. + +- [Diamond](https://github.com/bbuchfink/diamond) + + > Buchfink, Benjamin, et al. “Sensitive Protein Alignments at Tree-of-Life Scale Using DIAMOND.” Nature Methods, vol. 18, no. 4, Apr. 2021, pp. 366–68, https://doi.org/10.1038/s41592-021-01101-x. + +- [Fasta_windows](https://github.com/tolkit/fasta_windows) + +- [GoaT](https://goat.genomehubs.org) + + > Challis, Richard, et al. “Genomes on a Tree (GoaT): A versatile, scalable search engine for genomic and sequencing project metadata across the eukaryotic tree of life.” Wellcome Open Research, vol. 8, no. 24, 2023, https://doi.org/10.12688/wellcomeopenres.18658.1. + +- [Mosdepth](https://github.com/brentp/mosdepth) + + > Pedersen, Brent S., and Aaron R. Quinlan. “Mosdepth: Quick Coverage Calculation for Genomes and Exomes.” Bioinformatics, edited by John Hancock, vol. 34, no. 5, Oct. 2017, pp. 867–68, https://doi.org/10.1093/bioinformatics/btx699. + +- [MultiQC](https://multiqc.info) + + > Ewels, Philip, et al. “MultiQC: Summarize Analysis Results for Multiple Tools and Samples in a Single Report.” Bioinformatics, vol. 32, no. 19, 2016, pp. 3047–3048., https://doi.org/10.1093/bioinformatics/btw354. + +- [Samtools](https://www.htslib.org) + + > Danecek, Petr, et al. “Twelve Years of SAMtools and BCFtools.” GigaScience, vol. 10, no. 2, Jan. 
2021, https://doi.org/10.1093/gigascience/giab008. ## Software packaging/containerisation tools @@ -21,15 +46,18 @@ > Anaconda Software Distribution. Computer software. Vers. 2-2.4.0. Anaconda, Nov. 2016. Web. -- [Bioconda](https://pubmed.ncbi.nlm.nih.gov/29967506/) +- [Bioconda](https://bioconda.github.io) + + > Grüning, Björn, et al. “Bioconda: sustainable and comprehensive software distribution for the life sciences.", Nature Methods, vol. 15, Jul. 2018, pp. 475-6, https://doi.org/10.1038/s41592-018-0046-7. + +- [BioContainers](https://biocontainers.pro) - > Grüning B, Dale R, Sjödin A, Chapman BA, Rowe J, Tomkins-Tinch CH, Valieris R, Köster J; Bioconda Team. Bioconda: sustainable and comprehensive software distribution for the life sciences. Nat Methods. 2018 Jul;15(7):475-476. doi: 10.1038/s41592-018-0046-7. PubMed PMID: 29967506. + > da Veiga, Felipe, et al. “BioContainers: an open-source and community-driven framework for software standardization.", Bioinformatics, vol. 33, no. 16, Aug. 2017, pp. 2580-2, https://doi.org/10.1093/bioinformatics/btx192. -- [BioContainers](https://pubmed.ncbi.nlm.nih.gov/28379341/) +- [Docker](https://www.docker.com) - > da Veiga Leprevost F, Grüning B, Aflitos SA, Röst HL, Uszkoreit J, Barsnes H, Vaudel M, Moreno P, Gatto L, Weber J, Bai M, Jimenez RC, Sachsenberg T, Pfeuffer J, Alvarez RV, Griss J, Nesvizhskii AI, Perez-Riverol Y. BioContainers: an open-source and community-driven framework for software standardization. Bioinformatics. 2017 Aug 15;33(16):2580-2582. doi: 10.1093/bioinformatics/btx192. PubMed PMID: 28379341; PubMed Central PMCID: PMC5870671. + > Merkel, Dirk, et al. “Docker: Lightweight Linux Containers for Consistent Development and Deployment.", Association for Computing Machinery, vol. 2014, no. 239, Mar. 2014. -- [Docker](https://dl.acm.org/doi/10.5555/2600239.2600241) +- [Singularity](https://docs.sylabs.io/guides/latest/user-guide/) -- [Singularity](https://pubmed.ncbi.nlm.nih.gov/28494014/) - > Kurtzer GM, Sochat V, Bauer MW. Singularity: Scientific containers for mobility of compute. PLoS One. 2017 May 11;12(5):e0177459. doi: 10.1371/journal.pone.0177459. eCollection 2017. PubMed PMID: 28494014; PubMed Central PMCID: PMC5426675. + > Kurtzer, Gregory M., et al. “Singularity: Scientific containers for mobility of compute.", PLOS ONE, vol. 12, no. 5, May 2017, pp. e0177459, https://doi.org/10.1371/journal.pone.0177459. diff --git a/LICENSE b/LICENSE index a8e2a454..dbbab5cd 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) Alexander Ramos +Copyright (c) 2022-2023 Genome Research Ltd. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 556bc711..42f5fecc 100644 --- a/README.md +++ b/README.md @@ -1,94 +1,119 @@ -# ![nf-core/blobtoolkit](docs/images/nf-core-blobtoolkit_logo_light.png#gh-light-mode-only) ![nf-core/blobtoolkit](docs/images/nf-core-blobtoolkit_logo_dark.png#gh-dark-mode-only) +# ![sanger-tol/blobtoolkit](docs/images/sanger-tol-blobtoolkit_logo.png) -[![GitHub Actions CI Status](https://github.com/nf-core/blobtoolkit/workflows/nf-core%20CI/badge.svg)](https://github.com/nf-core/blobtoolkit/actions?query=workflow%3A%22nf-core+CI%22) -[![GitHub Actions Linting Status](https://github.com/nf-core/blobtoolkit/workflows/nf-core%20linting/badge.svg)](https://github.com/nf-core/blobtoolkit/actions?query=workflow%3A%22nf-core+linting%22) -[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?logo=Amazon%20AWS)](https://nf-co.re/blobtoolkit/results) -[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8)](https://doi.org/10.5281/zenodo.XXXXXXX) + -[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A521.10.3-23aa62.svg)](https://www.nextflow.io/) -[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?logo=anaconda)](https://docs.conda.io/en/latest/) -[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?logo=docker)](https://www.docker.com/) -[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg)](https://sylabs.io/docs/) -[![Launch on Nextflow Tower](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Nextflow%20Tower-%234256e7)](https://tower.nf/launch?pipeline=https://github.com/nf-core/blobtoolkit) +[![GitHub Actions Linting Status](https://github.com/sanger-tol/blobtoolkit/workflows/nf-core%20linting/badge.svg)](https://github.com/sanger-tol/blobtoolkit/actions?query=workflow%3A%22nf-core+linting%22) +[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX) -[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23blobtoolkit-4A154B?logo=slack)](https://nfcore.slack.com/channels/blobtoolkit) -[![Follow on Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?logo=twitter)](https://twitter.com/nf_core) -[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?logo=youtube)](https://www.youtube.com/c/nf-core) +[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A522.10.1-23aa62.svg)](https://www.nextflow.io/) +[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) +[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) +[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) +[![Launch on Nextflow Tower](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Nextflow%20Tower-%234256e7)](https://tower.nf/launch?pipeline=https://github.com/sanger-tol/blobtoolkit) ## Introduction - +**sanger-tol/blobtoolkit** is a bioinformatics pipeline that can be used to identify and analyse non-target DNA for eukaryotic genomes. 
It takes a samplesheet and aligned CRAM files as input, calculates genome statistics, coverage, and completeness information, combines them by window size into a TSV file, and uses that to create a BlobDir dataset and static plots. -**nf-core/blobtoolkit** is a bioinformatics best-practice analysis pipeline for BlobToolKit Nextflow Pipeline.. -The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community! -On release, automated continuous integration tests run the pipeline on a full-sized dataset on the AWS cloud infrastructure. This ensures that the pipeline runs on AWS, has sensible resource allocation defaults set to run on real-world datasets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources. The results obtained from the full-sized test can be viewed on the [nf-core website](https://nf-co.re/blobtoolkit/results). -## Pipeline summary +1. Calculate genome statistics in windows ([`fastawindows`](https://github.com/tolkit/fasta_windows)) +2. Calculate coverage ([`mosdepth`](https://github.com/brentp/mosdepth)) +3. Fetch associated BUSCO lineages ([`goat/taxonsearch`](https://github.com/genomehubs/goat-cli)) +4. Run BUSCO ([`busco`](https://busco.ezlab.org/)) +5. Extract BUSCO genes (blobtoolkit/extractbuscos) +6. Run Diamond BLASTp against extracted BUSCO genes ([`diamond/blastp`](https://github.com/bbuchfink/diamond)) +7. Count BUSCO genes (blobtoolkit/countbuscos) +8. Generate combined sequence stats across various window sizes (blobtoolkit/windowstats) +9. Import analysis results into a BlobDir dataset (blobtoolkit/blobdir) +10. Create static plot images (blobtoolkit/images) +## Usage -1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) -2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) +> **Note** +> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how +> to set up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) +> with `-profile test` before running the workflow on actual data. -## Quick Start -1. Install [`Nextflow`](https://www.nextflow.io/docs/latest/getstarted.html#installation) (`>=21.10.3`) +First, prepare a samplesheet with your input data that looks as follows: -2. Install any of [`Docker`](https://docs.docker.com/engine/installation/), [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/) (you can follow [this tutorial](https://singularity-tutorial.github.io/01-installation/)), [`Podman`](https://podman.io/), [`Shifter`](https://nersc.gitlab.io/development/shifter/how-to-use/) or [`Charliecloud`](https://hpc.github.io/charliecloud/) for full pipeline reproducibility _(you can use [`Conda`](https://conda.io/miniconda.html) both to install Nextflow itself and also to manage software within pipelines.
Please only use it within pipelines as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles))_. +`samplesheet.csv`: -3. Download the pipeline and test it on a minimal dataset with a single command: +```csv +sample,datatype,datafile +mMelMel3,hic,GCA_922984935.2.hic.mMelMel3.cram +mMelMel1,illumina,GCA_922984935.2.illumina.mMelMel1.cram +mMelMel3,ont,GCA_922984935.2.ont.mMelMel3.cram +``` - ```console - nextflow run nf-core/blobtoolkit -profile test,YOURPROFILE --outdir - ``` +Each row represents an aligned file. Rows with the same sample identifier are considered technical replicates. The datatype refers to the sequencing technology used to generate the underlying raw data and follows a controlled vocabulary (ont, hic, pacbio, illumina). The aligned read files can be generated using the [sanger-tol/readmapping](https://github.com/sanger-tol/readmapping) pipeline. - Note that some form of configuration will be needed so that Nextflow knows how to fetch the required software. This is usually done in the form of a config profile (`YOURPROFILE` in the example command above). You can chain multiple config profiles in a comma-separated string. +Now, you can run the pipeline using: - > - The pipeline comes with config profiles called `docker`, `singularity`, `podman`, `shifter`, `charliecloud` and `conda` which instruct the pipeline to use the named tool for software management. For example, `-profile test,docker`. - > - Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile ` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment. - > - If you are using `singularity`, please use the [`nf-core download`](https://nf-co.re/tools/#downloading-pipelines-for-offline-use) command to download images first, before running the pipeline. Setting the [`NXF_SINGULARITY_CACHEDIR` or `singularity.cacheDir`](https://www.nextflow.io/docs/latest/singularity.html?#singularity-docker-hub) Nextflow options enables you to store and re-use the images from a central location for future pipeline runs. - > - If you are using `conda`, it is highly recommended to use the [`NXF_CONDA_CACHEDIR` or `conda.cacheDir`](https://www.nextflow.io/docs/latest/conda.html) settings to store the environments in a central location for future pipeline runs. + -4. Start running your own analysis! +```bash +nextflow run sanger-tol/blobtoolkit \ + -profile \ + --input samplesheet.csv \ + --outdir \ + --fasta genome.fasta \ + --accession GCA_XXXXXXXXX.X \ + --taxon XXXX \ + --taxdump /path/to/taxdump/database \ + --uniprot /path/to/diamond/database +``` - +> **Warning:** +> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those +> provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; +> see [docs](https://nf-co.re/usage/configuration#custom-configuration-files). - ```console - nextflow run nf-core/blobtoolkit --input samplesheet.csv --outdir --genome GRCh37 -profile - ``` +For more details, please refer to the [usage documentation](https://pipelines.tol.sanger.ac.uk/blobtoolkit/usage) and the [parameter documentation](https://pipelines.tol.sanger.ac.uk/blobtoolkit/parameters). 
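These row rules are enforced by `assets/schema_input.json` and `bin/check_samplesheet.py`, both updated later in this diff. As a quick pre-flight check you can reproduce the same three constraints in a few lines — a minimal sketch only, with an illustrative function name and `assert`-based error handling rather than the pipeline's actual code:

```python
import csv

# Allowed sequencing technologies, mirroring assets/schema_input.json
VALID_DATATYPES = {"hic", "illumina", "ont", "pacbio"}

def validate_samplesheet(path):
    """Check the sample/datatype/datafile columns before launching the pipeline."""
    with open(path, newline="") as handle:
        reader = csv.DictReader(handle)
        assert set(reader.fieldnames or []) == {"sample", "datatype", "datafile"}, "unexpected header"
        for lineno, row in enumerate(reader, start=2):
            assert row["sample"] and " " not in row["sample"], f"line {lineno}: sample cannot contain spaces"
            assert row["datatype"] in VALID_DATATYPES, f"line {lineno}: datatype must be one of {sorted(VALID_DATATYPES)}"
            assert row["datafile"].endswith(".cram"), f"line {lineno}: datafile must have extension '.cram'"

validate_samplesheet("samplesheet.csv")
```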
-## Documentation +## Pipeline output -The nf-core/blobtoolkit pipeline comes with documentation about the pipeline [usage](https://nf-co.re/blobtoolkit/usage), [parameters](https://nf-co.re/blobtoolkit/parameters) and [output](https://nf-co.re/blobtoolkit/output). + For more details about the output files and reports, please refer to the [output documentation](https://pipelines.tol.sanger.ac.uk/blobtoolkit/output). ## Credits -nf-core/blobtoolkit was originally written by Alexander Ramos. +sanger-tol/blobtoolkit was written in Nextflow by [Alexander Ramos Diaz](https://github.com/alxndrdiaz), [Zaynab Butt](https://github.com/zb32), [Matthieu Muffato](https://github.com/muffato), and [Priyanka Surana](https://github.com/priyanka-surana). The original design and coding for [BlobToolKit software and Snakemake pipeline](https://github.com/blobtoolkit/blobtoolkit) was done by [Richard Challis](https://github.com/rjchallis) and [Sujai Kumar](https://github.com/sujaikumar). -We thank the following people for their extensive assistance in the development of this pipeline: +We thank the following people for their assistance in the development of this pipeline: +- [Guoying Qi](https://github.com/gq1) ## Contributions and Support If you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md). -For further information or help, don't hesitate to get in touch on the [Slack `#blobtoolkit` channel](https://nfcore.slack.com/channels/blobtoolkit) (you can join with [this invite](https://nf-co.re/join/slack)). ## Citations An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. -You can cite the `nf-core` publication as follows: +This pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/master/LICENSE). > **The nf-core framework for community-curated bioinformatics pipelines.** > diff --git a/assets/adaptivecard.json b/assets/adaptivecard.json new file mode 100644 index 00000000..e546325c --- /dev/null +++ b/assets/adaptivecard.json @@ -0,0 +1,67 @@ +{ + "type": "message", + "attachments": [ + { + "contentType": "application/vnd.microsoft.card.adaptive", + "contentUrl": null, + "content": { + "\$schema": "http://adaptivecards.io/schemas/adaptive-card.json", + "msteams": { + "width": "Full" + }, + "type": "AdaptiveCard", + "version": "1.2", + "body": [ + { + "type": "TextBlock", + "size": "Large", + "weight": "Bolder", + "color": "<% if (success) { %>Good<% } else { %>Attention<%} %>", + "text": "sanger-tol/blobtoolkit v${version} - ${runName}", + "wrap": true + }, + { + "type": "TextBlock", + "spacing": "None", + "text": "Completed at ${dateComplete} (duration: ${duration})", + "isSubtle": true, + "wrap": true + }, + { + "type": "TextBlock", + "text": "<% if (success) { %>Pipeline completed successfully!<% } else { %>Pipeline completed with errors.
The full error message was: ${errorReport}.<% } %>", + "wrap": true + }, + { + "type": "TextBlock", + "text": "The command used to launch the workflow was as follows:", + "wrap": true + }, + { + "type": "TextBlock", + "text": "${commandLine}", + "isSubtle": true, + "wrap": true + } + ], + "actions": [ + { + "type": "Action.ShowCard", + "title": "Pipeline Configuration", + "card": { + "type": "AdaptiveCard", + "\$schema": "http://adaptivecards.io/schemas/adaptive-card.json", + "body": [ + { + "type": "FactSet", + "facts": [<% out << summary.collect{ k,v -> "{\"title\": \"$k\", \"value\" : \"$v\"}"}.join(",\n") %> + ] + } + ] + } + } + ] + } + } + ] +} diff --git a/assets/email_template.html b/assets/email_template.html index 2f1b4eaf..add38111 100644 --- a/assets/email_template.html +++ b/assets/email_template.html @@ -4,21 +4,21 @@ - - nf-core/blobtoolkit Pipeline Report + + sanger-tol/blobtoolkit Pipeline Report
-    nf-core/blobtoolkit v${version}
+    sanger-tol/blobtoolkit v${version}
     Run Name: $runName
 <% if (!success){ out << """
-    nf-core/blobtoolkit execution completed unsuccessfully!
+    sanger-tol/blobtoolkit execution completed unsuccessfully!
     The exit status of the task that caused the workflow execution to fail was: $exitStatus.
     The full error message was:
     ${errorReport}
@@ -27,7 +27,7 @@ nf-core/blobtoolkit execution complete
 """ } else { out << """
-    nf-core/blobtoolkit execution completed successfully!
+    sanger-tol/blobtoolkit execution completed successfully!
 """ }
@@ -44,8 +44,8 @@
     Pipeline Configuration:
-    nf-core/blobtoolkit
-    https://github.com/nf-core/blobtoolkit
+    sanger-tol/blobtoolkit
+    https://github.com/sanger-tol/blobtoolkit
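Both email templates are rendered by Nextflow's Groovy engine and sent as one multipart message: a plain-text part, an HTML alternative, and a base64-encoded inline logo (see `assets/sendmail_template.txt` further down). A rough Python sketch of that MIME structure, for illustration only — the addresses and body strings here are made up, and the pipeline itself does the sending from `lib/NfcoreTemplate.groovy`:

```python
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

msg = MIMEMultipart("alternative")
msg["Subject"] = "sanger-tol/blobtoolkit pipeline report"
msg["From"] = "pipeline@example.org"  # hypothetical addresses
msg["To"] = "you@example.org"

# Plain-text fallback first, then the preferred HTML alternative
msg.attach(MIMEText("sanger-tol/blobtoolkit execution completed successfully!", "plain"))
msg.attach(MIMEText("<h1>sanger-tol/blobtoolkit v0.1.0</h1><p>Run Name: example_run</p>", "html"))

print(msg.as_string())  # boundary-delimited MIME, as in assets/sendmail_template.txt
```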
diff --git a/assets/email_template.txt b/assets/email_template.txt index dc6b35f5..e9fdd4ac 100644 --- a/assets/email_template.txt +++ b/assets/email_template.txt @@ -1,19 +1,10 @@ ---------------------------------------------------- - ,--./,-. - ___ __ __ __ ___ /,-._.--~\\ - |\\ | |__ __ / ` / \\ |__) |__ } { - | \\| | \\__, \\__/ | \\ |___ \\`-._,-`-, - `._,._,' - nf-core/blobtoolkit v${version} ---------------------------------------------------- - Run Name: $runName <% if (success){ - out << "## nf-core/blobtoolkit execution completed successfully! ##" + out << "## sanger-tol/blobtoolkit execution completed successfully! ##" } else { out << """#################################################### -## nf-core/blobtoolkit execution completed unsuccessfully! ## +## sanger-tol/blobtoolkit execution completed unsuccessfully! ## #################################################### The exit status of the task that caused the workflow execution to fail was: $exitStatus. The full error message was: @@ -36,5 +27,5 @@ Pipeline Configuration: <% out << summary.collect{ k,v -> " - $k: $v" }.join("\n") %> -- -nf-core/blobtoolkit -https://github.com/nf-core/blobtoolkit +sanger-tol/blobtoolkit +https://github.com/sanger-tol/blobtoolkit diff --git a/assets/methods_description_template.yml b/assets/methods_description_template.yml new file mode 100644 index 00000000..051e4609 --- /dev/null +++ b/assets/methods_description_template.yml @@ -0,0 +1,25 @@ +id: "sanger-tol-blobtoolkit-methods-description" +description: "Suggested text and references to use when describing pipeline usage within the methods section of a publication." +section_name: "sanger-tol/blobtoolkit Methods Description" +section_href: "https://github.com/sanger-tol/blobtoolkit" +plot_type: "html" +## Update the HTML below to your preferred methods description, e.g. add publication citation for this pipeline +## You inject any metadata in the Nextflow '${workflow}' object +data: |
+  <h4>Methods</h4>
+  <p>Data was processed using sanger-tol/blobtoolkit v${workflow.manifest.version} ${doi_text} of the sanger-tol collection of workflows, created using nf-core (Ewels et al., 2020).</p>
+  <p>The pipeline was executed with Nextflow v${workflow.nextflow.version} (Di Tommaso et al., 2017) with the following command:</p>
+  <pre><code>${workflow.commandLine}</code></pre>
+  <h4>References</h4>
+  <ul>
+    <li>Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., & Notredame, C. (2017). Nextflow enables reproducible computational workflows. Nature Biotechnology, 35(4), 316-319. https://doi.org/10.1038/nbt.3820</li>
+    <li>Ewels, P. A., Peltzer, A., Fillinger, S., Patel, H., Alneberg, J., Wilm, A., Garcia, M. U., Di Tommaso, P., & Nahnsen, S. (2020). The nf-core framework for community-curated bioinformatics pipelines. Nature Biotechnology, 38(3), 276-278. https://doi.org/10.1038/s41587-020-0439-x</li>
+  </ul>
+  <div class="alert alert-info">
+    <h5>Notes:</h5>
+    <ul>
+      ${nodoi_text}
+      <li>The command above does not include parameters contained in any configs or profiles that may have been used. Ensure the config file is also uploaded with your publication!</li>
+      <li>You should also cite all software used within this run. Check the "Software Versions" of this report to get version information.</li>
+    </ul>
+  </div>
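The `data:` block above is filled in by Nextflow's Groovy template engine, which can resolve nested attributes such as `${workflow.manifest.version}`, before MultiQC embeds the HTML in the report. Python's `string.Template` uses the same `${...}` placeholder syntax, so the substitution step can be sketched as follows — flat keys and example values stand in for the real `workflow` object:

```python
from string import Template

methods = Template(
    "Data was processed using sanger-tol/blobtoolkit v${version} of the sanger-tol "
    "collection of workflows, created using nf-core (Ewels et al., 2020). The pipeline "
    "was executed with Nextflow v${nextflow_version} with the following command: ${command_line}"
)

print(methods.substitute(
    version="0.1.0",
    nextflow_version="22.10.6",
    command_line="nextflow run sanger-tol/blobtoolkit -profile test,docker --outdir results",
))
```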
diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 2824d072..dec36a50 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -1,11 +1,12 @@ report_comment: > - This report has been generated by the nf-core/blobtoolkit - analysis pipeline. For information about how to interpret these results, please see the - documentation. + This report has been generated by the sanger-tol/blobtoolkit + analysis pipeline. report_section_order: - software_versions: + "sanger-tol-blobtoolkit-methods-description": order: -1000 - "nf-core-blobtoolkit-summary": + software_versions: order: -1001 + "sanger-tol-blobtoolkit-summary": + order: -1002 export_plots: true diff --git a/assets/nf-core-blobtoolkit_logo_light.png b/assets/nf-core-blobtoolkit_logo_light.png deleted file mode 100644 index 7d5582f8..00000000 Binary files a/assets/nf-core-blobtoolkit_logo_light.png and /dev/null differ diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv deleted file mode 100644 index 5f653ab7..00000000 --- a/assets/samplesheet.csv +++ /dev/null @@ -1,3 +0,0 @@ -sample,fastq_1,fastq_2 -SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz -SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz, diff --git a/assets/schema_input.json b/assets/schema_input.json index e00057b3..c315cedb 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -1,7 +1,7 @@ { "$schema": "http://json-schema.org/draft-07/schema", - "$id": "https://raw.githubusercontent.com/nf-core/blobtoolkit/master/assets/schema_input.json", - "title": "nf-core/blobtoolkit pipeline - params.input schema", + "$id": "https://raw.githubusercontent.com/sanger-tol/blobtoolkit/master/assets/schema_input.json", + "title": "sanger-tol/blobtoolkit pipeline - params.input schema", "description": "Schema for the file provided with params.input", "type": "array", "items": { @@ -9,28 +9,22 @@ "properties": { "sample": { "type": "string", + "description": "Sample Name", "pattern": "^\\S+$", "errorMessage": "Sample name must be provided and cannot contain spaces" }, - "fastq_1": { + "datatype": { "type": "string", - "pattern": "^\\S+\\.f(ast)?q\\.gz$", - "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" + "pattern": "^\\S+$", + "enum": ["hic", "illumina", "ont", "pacbio"], + "errorMessage": "Data type, and must be one of: 'hic' or 'illumina' or 'ont' or 'pacbio'" }, - "fastq_2": { - "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'", - "anyOf": [ - { - "type": "string", - "pattern": "^\\S+\\.f(ast)?q\\.gz$" - }, - { - "type": "string", - "maxLength": 0 - } - ] + "datafile": { + "type": "string", + "pattern": "^\\S+\\.cram$", + "errorMessage": "Data file for reads cannot contain spaces and must have extension 'cram'" } }, - "required": ["sample", "fastq_1"] + "required": ["datafile", "datatype", "sample"] } } diff --git a/assets/sendmail_template.txt b/assets/sendmail_template.txt index dd00b99d..619b2eb5 100644 --- a/assets/sendmail_template.txt +++ b/assets/sendmail_template.txt @@ -9,12 +9,12 @@ Content-Type: text/html; charset=utf-8 $email_html --nfcoremimeboundary -Content-Type: image/png;name="nf-core-blobtoolkit_logo.png" +Content-Type: image/png;name="sanger-tol-blobtoolkit_logo.png" Content-Transfer-Encoding: base64 Content-ID: -Content-Disposition: inline; 
filename="nf-core-blobtoolkit_logo_light.png" +Content-Disposition: inline; filename="sanger-tol-blobtoolkit_logo.png" -<% out << new File("$projectDir/assets/nf-core-blobtoolkit_logo_light.png"). +<% out << new File("$projectDir/docs/images/nf-core-blobtoolkit_logo.png"). bytes. encodeBase64(). toString(). diff --git a/assets/slackreport.json b/assets/slackreport.json new file mode 100644 index 00000000..6db5d148 --- /dev/null +++ b/assets/slackreport.json @@ -0,0 +1,34 @@ +{ + "attachments": [ + { + "fallback": "Plain-text summary of the attachment.", + "color": "<% if (success) { %>good<% } else { %>danger<%} %>", + "author_name": "sanger-tol/blobtoolkit v${version} - ${runName}", + "author_icon": "https://www.nextflow.io/docs/latest/_static/favicon.ico", + "text": "<% if (success) { %>Pipeline completed successfully!<% } else { %>Pipeline completed with errors<% } %>", + "fields": [ + { + "title": "Command used to launch the workflow", + "value": "```${commandLine}```", + "short": false + } + <% + if (!success) { %> + , + { + "title": "Full error message", + "value": "```${errorReport}```", + "short": false + }, + { + "title": "Pipeline configuration", + "value": "<% out << summary.collect{ k,v -> k == "hook_url" ? "_${k}_: (_hidden_)" : ( ( v.class.toString().contains('Path') || ( v.class.toString().contains('String') && v.contains('/') ) ) ? "_${k}_: `${v}`" : (v.class.toString().contains('DateTime') ? ("_${k}_: " + v.format(java.time.format.DateTimeFormatter.ofLocalizedDateTime(java.time.format.FormatStyle.MEDIUM))) : "_${k}_: ${v}") ) }.join(",\n") %>", + "short": false + } + <% } + %> + ], + "footer": "Completed at <% out << dateComplete.format(java.time.format.DateTimeFormatter.ofLocalizedDateTime(java.time.format.FormatStyle.MEDIUM)) %> (duration: ${duration})" + } + ] +} diff --git a/assets/test/GCA_922984935.2.yaml b/assets/test/GCA_922984935.2.yaml new file mode 100644 index 00000000..4911d14a --- /dev/null +++ b/assets/test/GCA_922984935.2.yaml @@ -0,0 +1,79 @@ +assembly: + accession: GCA_922984935.2 + alias: mMelMel3.2 paternal haplotype + bioproject: PRJEB49353 + biosample: SAMEA7524400 + file: ./GCA_922984935.2/assembly/GCA_922984935.2.fasta.gz + level: chromosome + prefix: CAKLPM02 + scaffold-count: 538 + span: 2738694574 + url: ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/922/984/935/GCA_922984935.2_mMelMel3.2_paternal_haplotype/GCA_922984935.2_mMelMel3.2_paternal_haplotype_genomic.fna.gz +busco: + basal_lineages: + - eukaryota_odb10 + - bacteria_odb10 + - archaea_odb10 + download_dir: ./busco + lineages: + - carnivora_odb10 + - laurasiatheria_odb10 + - eutheria_odb10 + - mammalia_odb10 + - tetrapoda_odb10 + - vertebrata_odb10 + - metazoa_odb10 + - eukaryota_odb10 + - bacteria_odb10 + - archaea_odb10 +fields: + categories: + file: ./GCA_922984935.2/assembly/GCA_922984935.2.categories.tsv + synonyms: + file: ./GCA_922984935.2/assembly/GCA_922984935.2.synonyms.tsv + prefix: insdc +reads: + paired: [] + single: [] +revision: 0 +settings: + blast_chunk: 100000 + blast_max_chunks: 10 + blast_min_length: 1000 + blast_overlap: 0 + stats_chunk: 1000 + stats_windows: + - 0.1 + - 0.01 + - 100000 + - 1000000 + taxdump: ./taxdump + tmp: /tmp +similarity: + blastn: + name: nt + path: ./nt + defaults: + evalue: 1.0e-10 + import_evalue: 1.0e-25 + max_target_seqs: 10 + taxrule: buscogenes + diamond_blastp: + import_max_target_seqs: 100000 + name: reference_proteomes + path: ./uniprot + taxrule: blastp=buscogenes + diamond_blastx: + name: reference_proteomes + path: ./uniprot +taxon: + 
class: Mammalia + family: Mustelidae + genus: Meles + kingdom: Metazoa + name: Meles meles + order: Carnivora + phylum: Chordata + superkingdom: Eukaryota + taxid: '9662' +version: 1 diff --git a/assets/test/mCerEla1.1.buscogenes.dmnd b/assets/test/mCerEla1.1.buscogenes.dmnd new file mode 100644 index 00000000..bccca41d Binary files /dev/null and b/assets/test/mCerEla1.1.buscogenes.dmnd differ diff --git a/assets/test/samplesheet.csv b/assets/test/samplesheet.csv new file mode 100644 index 00000000..5f8d4463 --- /dev/null +++ b/assets/test/samplesheet.csv @@ -0,0 +1,5 @@ +sample,datatype,datafile +mMelMel3,hic,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/hic/GCA_922984935.2.subset.unmasked.hic.mMelMel3.cram +mMelMel1,illumina,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/illumina/GCA_922984935.2.subset.unmasked.illumina.mMelMel1.cram +mMelMel2,illumina,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/illumina/GCA_922984935.2.subset.unmasked.illumina.mMelMel2.cram +mMelMel3,ont,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/ont/GCA_922984935.2.subset.unmasked.ont.mMelMel3.cram diff --git a/assets/test/samplesheet_s3.csv b/assets/test/samplesheet_s3.csv new file mode 100644 index 00000000..dbb34181 --- /dev/null +++ b/assets/test/samplesheet_s3.csv @@ -0,0 +1,5 @@ +sample,datatype,datafile +mMelMel3,hic,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/hic/GCA_922984935.2.subset.unmasked.hic.mMelMel3.cram +mMelMel1,illumina,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/illumina/GCA_922984935.2.subset.unmasked.illumina.mMelMel1.cram +mMelMel2,illumina,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/illumina/GCA_922984935.2.subset.unmasked.illumina.mMelMel2.cram +mMelMel3,ont,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/ont/GCA_922984935.2.subset.unmasked.ont.mMelMel3.cram diff --git a/assets/test_full/full_samplesheet.csv b/assets/test_full/full_samplesheet.csv new file mode 100644 index 00000000..88fc7462 --- /dev/null +++ b/assets/test_full/full_samplesheet.csv @@ -0,0 +1,3 @@ +sample,datatype,datafile +gfLaeSulp1,hic,/lustre/scratch123/tol/projects/.sandbox/data/fungi/Laetiporus_sulphureus/analysis/gfLaeSulp1.1/read_mapping/hic/GCA_927399515.1.unmasked.hic.gfLaeSulp1.cram +gfLaeSulp1,pacbio,/lustre/scratch123/tol/projects/.sandbox/data/fungi/Laetiporus_sulphureus/analysis/gfLaeSulp1.1/read_mapping/pacbio/GCA_927399515.1.unmasked.pacbio.gfLaeSulp1.cram diff --git a/assets/test_full/gfLaeSulp1.1.buscogenes.dmnd b/assets/test_full/gfLaeSulp1.1.buscogenes.dmnd new file mode 100644 index 00000000..a0d0e1d2 Binary files /dev/null and b/assets/test_full/gfLaeSulp1.1.buscogenes.dmnd differ diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 3652c63c..72e3f485 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -11,7 +11,6 @@ from collections import Counter from pathlib import Path - logger = logging.getLogger() @@ -25,17 +24,20 @@ class RowChecker: """ - VALID_FORMATS = ( - ".fq.gz", - ".fastq.gz", + VALID_FORMATS = (".cram",) + + VALID_DATATYPES = ( + "hic", + 
"illumina", + "pacbio", + "ont", ) def __init__( self, sample_col="sample", - first_col="fastq_1", - second_col="fastq_2", - single_col="single_end", + type_col="datatype", + file_col="datafile", **kwargs, ): """ @@ -44,26 +46,21 @@ def __init__( Args: sample_col (str): The name of the column that contains the sample name (default "sample"). - first_col (str): The name of the column that contains the first (or only) - FASTQ file path (default "fastq_1"). - second_col (str): The name of the column that contains the second (if any) - FASTQ file path (default "fastq_2"). - single_col (str): The name of the new column that will be inserted and - records whether the sample contains single- or paired-end sequencing - reads (default "single_end"). - + type_col (str): The name of the column that contains the dataype for + the read data (default "datatype"). + file_col (str): The name of the column that contains the file path for + the read data (default "datafile"). """ super().__init__(**kwargs) self._sample_col = sample_col - self._first_col = first_col - self._second_col = second_col - self._single_col = single_col + self._type_col = type_col + self._file_col = file_col self._seen = set() self.modified = [] def validate_and_transform(self, row): """ - Perform all validations on the given row and insert the read pairing status. + Perform all validations on the given row. Args: row (dict): A mapping from column headers (keys) to elements of that row @@ -71,62 +68,55 @@ def validate_and_transform(self, row): """ self._validate_sample(row) - self._validate_first(row) - self._validate_second(row) - self._validate_pair(row) - self._seen.add((row[self._sample_col], row[self._first_col])) + self._validate_type(row) + self._validate_file(row) + self._seen.add((row[self._sample_col], row[self._file_col])) self.modified.append(row) def _validate_sample(self, row): """Assert that the sample name exists and convert spaces to underscores.""" - assert len(row[self._sample_col]) > 0, "Sample input is required." + if len(row[self._sample_col]) <= 0: + raise AssertionError("Sample input is required.") # Sanitize samples slightly. row[self._sample_col] = row[self._sample_col].replace(" ", "_") - def _validate_first(self, row): - """Assert that the first FASTQ entry is non-empty and has the right format.""" - assert len(row[self._first_col]) > 0, "At least the first FASTQ file is required." - self._validate_fastq_format(row[self._first_col]) - - def _validate_second(self, row): - """Assert that the second FASTQ entry has the right format if it exists.""" - if len(row[self._second_col]) > 0: - self._validate_fastq_format(row[self._second_col]) - - def _validate_pair(self, row): - """Assert that read pairs have the same file extension. Report pair status.""" - if row[self._first_col] and row[self._second_col]: - row[self._single_col] = False - assert ( - Path(row[self._first_col]).suffixes[-2:] == Path(row[self._second_col]).suffixes[-2:] - ), "FASTQ pairs must have the same file extensions." 
- else: - row[self._single_col] = True - - def _validate_fastq_format(self, filename): + def _validate_type(self, row): + """Assert that the data type matches expected values.""" + if not any(row[self._type_col] for datatype in self.VALID_DATATYPES): + raise AssertionError( + f"The datatype is unrecognized: {row[self._type_col]}\n" + f"It should be one of: {', '.join(self.VALID_DATATYPES)}" + ) + + def _validate_file(self, row): + """Assert that the datafile is non-empty and has the right format.""" + if len(row[self._file_col]) <= 0: + raise AssertionError("Data file is required.") + self._validate_data_format(row[self._file_col]) + + def _validate_data_format(self, filename): """Assert that a given filename has one of the expected FASTQ extensions.""" - assert any(filename.endswith(extension) for extension in self.VALID_FORMATS), ( - f"The FASTQ file has an unrecognized extension: {filename}\n" - f"It should be one of: {', '.join(self.VALID_FORMATS)}" - ) + if not any(filename.endswith(extension) for extension in self.VALID_FORMATS): + raise AssertionError( + f"The data file has an unrecognized extension: {filename}\n" + f"It should be one of: {', '.join(self.VALID_FORMATS)}" + ) def validate_unique_samples(self): """ - Assert that the combination of sample name and FASTQ filename is unique. + Assert that the combination of sample name and aligned filename is unique. - In addition to the validation, also rename the sample if more than one sample, - FASTQ file combination exists. + In addition to the validation, also rename all samples to have a suffix of _T{n}, where n is the + number of times the same sample exist, but with different FASTQ files, e.g., multiple runs per experiment. """ - assert len(self._seen) == len(self.modified), "The pair of sample name and FASTQ must be unique." - if len({pair[0] for pair in self._seen}) < len(self._seen): - counts = Counter(pair[0] for pair in self._seen) - seen = Counter() - for row in self.modified: - sample = row[self._sample_col] - seen[sample] += 1 - if counts[sample] > 1: - row[self._sample_col] = f"{sample}_T{seen[sample]}" + if len(self._seen) != len(self.modified): + raise AssertionError("The pair of sample and file name must be unique.") + seen = Counter() + for row in self.modified: + sample = row[self._sample_col] + seen[sample] += 1 + row[self._sample_col] = f"{sample}_T{seen[sample]}" def read_head(handle, num_lines=10): @@ -157,20 +147,15 @@ def sniff_format(handle): peek = read_head(handle) handle.seek(0) sniffer = csv.Sniffer() - if not sniffer.has_header(peek): - logger.critical(f"The given sample sheet does not appear to contain a header.") - sys.exit(1) dialect = sniffer.sniff(peek) return dialect def check_samplesheet(file_in, file_out): """ - Check that the tabular samplesheet has the structure expected by nf-core pipelines. + Check that the tabular samplesheet has the structure expected by sanger-tol pipelines. Validate the general shape of the table, expected columns, and each row. Also add - an additional column which records whether one or two FASTQ reads were found. - Args: file_in (pathlib.Path): The given tabular samplesheet. The format can be either CSV, TSV, or any other format automatically recognized by ``csv.Sniffer``. 
@@ -179,24 +164,25 @@ def check_samplesheet(file_in, file_out): Example: This function checks that the samplesheet follows the following structure, - see also the `viral recon samplesheet`_:: + see also the `blobtoolkit samplesheet`_:: - sample,fastq_1,fastq_2 - SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz - SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz - SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz, + sample,datatype,datafile + sample1,hic,/path/to/file1.cram + sample1,pacbio,/path/to/file2.cram + sample1,ont,/path/to/file3.cram - .. _viral recon samplesheet: - https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv + .. _blobtoolkit samplesheet: + https://raw.githubusercontent.com/sanger-tol/blobtoolkit/main/assets/test/samplesheet.csv """ - required_columns = {"sample", "fastq_1", "fastq_2"} + required_columns = {"sample", "datatype", "datafile"} # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. with file_in.open(newline="") as in_handle: reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle)) # Validate the existence of the expected header columns. if not required_columns.issubset(reader.fieldnames): - logger.critical(f"The sample sheet **must** contain the column headers: {', '.join(required_columns)}.") + req_cols = ", ".join(required_columns) + logger.critical(f"The sample sheet **must** contain these column headers: {req_cols}.") sys.exit(1) # Validate each row. checker = RowChecker() @@ -208,7 +194,6 @@ def check_samplesheet(file_in, file_out): sys.exit(1) checker.validate_unique_samples() header = list(reader.fieldnames) - header.insert(1, "single_end") # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. with file_out.open(mode="w", newline="") as out_handle: writer = csv.DictWriter(out_handle, header, delimiter=",") @@ -242,6 +227,12 @@ def parse_args(argv=None): choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"), default="WARNING", ) + parser.add_argument( + "-v", + "--version", + action="version", + version="%(prog)s 1.0.0", + ) return parser.parse_args(argv) diff --git a/bin/windowstats_input.py b/bin/windowstats_input.py new file mode 100755 index 00000000..dcd1bedb --- /dev/null +++ b/bin/windowstats_input.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 + +import argparse +import os +import sys +import pandas as pd + + +def parse_args(args=None): + Description = "Combine BED files to create window stats input file." 
+ + parser = argparse.ArgumentParser(description=Description) + parser.add_argument("--freq", help="Frequence fasta windows input file", required=True) + parser.add_argument("--mononuc", help="Mononucleotide fasta windows input file", required=True) + parser.add_argument("--mosdepth", help="Mosdepth coverage input file", nargs="+", required=True) + parser.add_argument("--countbusco", help="BUSCO gene counts by region", required=True) + parser.add_argument("--output", help="Output TSV file.", required=True) + parser.add_argument("--version", action="version", version="%(prog)s 1.0.0") + return parser.parse_args(args) + + +def make_dir(path): + if len(path) > 0: + os.makedirs(path, exist_ok=True) + + +def merge_all(freq, mononuc, mosdepth, countbusco): + freq_fw = pd.read_csv(freq, sep="\t") + mononuc_fw = pd.read_csv(mononuc, sep="\t") + combo_fw = freq_fw.merge(mononuc_fw).rename( + columns={"ID": "sequence", "GC_prop": "gc", "Prop_Ns": "n", "N": "ncount"} + ) + + count_df = pd.read_csv(countbusco, sep="\t").rename(columns={"ID": "sequence"}) + for f in mosdepth: + tag = os.path.basename(f).replace(".regions.bed.gz", "") + cov_df = pd.read_csv( + f, + compression="gzip", + sep="\t", + names=["sequence", "start", "end", tag + "_cov"], + ) + count_df = count_df.merge(cov_df) + + combo_all = combo_fw.merge(count_df) + return combo_all + + +def main(args=None): + args = parse_args(args) + + out_dir = os.path.dirname(args.output) + make_dir(out_dir) + + merge_all(args.freq, args.mononuc, args.mosdepth, args.countbusco).to_csv(args.output, sep="\t", index=False) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/conf/base.config b/conf/base.config index 5d2a880c..4d5e9045 100644 --- a/conf/base.config +++ b/conf/base.config @@ -1,6 +1,6 @@ /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - nf-core/blobtoolkit Nextflow base config file + sanger-tol/blobtoolkit Nextflow base config file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ A 'blank slate' config file, appropriate for general use on most high performance compute environments. Assumes that all software is installed and available on @@ -10,12 +10,12 @@ process { - // TODO nf-core: Check the defaults for all processes + // Check the defaults for all processes cpus = { check_max( 1 * task.attempt, 'cpus' ) } memory = { check_max( 6.GB * task.attempt, 'memory' ) } time = { check_max( 4.h * task.attempt, 'time' ) } - errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' } + errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } maxRetries = 1 maxErrors = '-1' @@ -24,8 +24,13 @@ process { // These labels are used and recognised by default in DSL2 files hosted on nf-core/modules. // If possible, it would be nice to keep the same label naming convention when // adding in your local modules too. - // TODO nf-core: Customise requirements for specific processes. + // Customise requirements for specific processes. 
// See https://www.nextflow.io/docs/latest/config.html#config-process-selectors + withLabel:process_single { + cpus = { check_max( 1 , 'cpus' ) } + memory = { check_max( 6.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } + } withLabel:process_low { cpus = { check_max( 2 * task.attempt, 'cpus' ) } memory = { check_max( 12.GB * task.attempt, 'memory' ) } diff --git a/conf/igenomes.config b/conf/igenomes.config deleted file mode 100644 index 7a1b3ac6..00000000 --- a/conf/igenomes.config +++ /dev/null @@ -1,432 +0,0 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Nextflow config file for iGenomes paths -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Defines reference genomes using iGenome paths. - Can be used by any config that customises the base path using: - $params.igenomes_base / --igenomes_base ----------------------------------------------------------------------------------------- -*/ - -params { - // illumina iGenomes reference file paths - genomes { - 'GRCh37' { - fasta = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/README.txt" - mito_name = "MT" - macs_gsize = "2.7e9" - blacklist = "${projectDir}/assets/blacklists/GRCh37-blacklist.bed" - } - 'GRCh38' { - fasta = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Annotation/Genes/genes.bed" - mito_name = "chrM" - macs_gsize = "2.7e9" - blacklist = "${projectDir}/assets/blacklists/hg38-blacklist.bed" - } - 'GRCm38' { - fasta = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/README.txt" - mito_name = "MT" - macs_gsize = "1.87e9" - blacklist = "${projectDir}/assets/blacklists/GRCm38-blacklist.bed" - } - 'TAIR10' 
{ - fasta = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Annotation/README.txt" - mito_name = "Mt" - } - 'EB2' { - fasta = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Annotation/README.txt" - } - 'UMD3.1' { - fasta = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Annotation/README.txt" - mito_name = "MT" - } - 'WBcel235' { - fasta = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/Genes/genes.bed" - mito_name = "MtDNA" - macs_gsize = "9e7" - } - 'CanFam3.1' { - fasta = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/STARIndex/" - bismark = 
"${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Annotation/README.txt" - mito_name = "MT" - } - 'GRCz10' { - fasta = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Annotation/Genes/genes.bed" - mito_name = "MT" - } - 'BDGP6' { - fasta = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Annotation/Genes/genes.bed" - mito_name = "M" - macs_gsize = "1.2e8" - } - 'EquCab2' { - fasta = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Annotation/README.txt" - mito_name = "MT" - } - 'EB1' { - fasta = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Annotation/README.txt" - } - 'Galgal4' { - fasta = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/WholeGenomeFasta/genome.fa" - 
bwa = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Annotation/Genes/genes.bed" - mito_name = "MT" - } - 'Gm01' { - fasta = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Annotation/README.txt" - } - 'Mmul_1' { - fasta = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Annotation/README.txt" - mito_name = "MT" - } - 'IRGSP-1.0' { - fasta = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Annotation/Genes/genes.bed" - mito_name = "Mt" - } - 'CHIMP2.1.4' { - fasta = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Annotation/Genes/genes.bed" - readme = 
"${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Annotation/README.txt" - mito_name = "MT" - } - 'Rnor_5.0' { - fasta = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Annotation/Genes/genes.bed" - mito_name = "MT" - } - 'Rnor_6.0' { - fasta = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Annotation/Genes/genes.bed" - mito_name = "MT" - } - 'R64-1-1' { - fasta = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Annotation/Genes/genes.bed" - mito_name = "MT" - macs_gsize = "1.2e7" - } - 'EF2' { - fasta = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Annotation/README.txt" - mito_name = "MT" - macs_gsize = "1.21e7" - } - 'Sbi1' { - fasta = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/Bowtie2Index/" - star = 
"${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Annotation/README.txt" - } - 'Sscrofa10.2' { - fasta = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Annotation/README.txt" - mito_name = "MT" - } - 'AGPv3' { - fasta = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Annotation/Genes/genes.bed" - mito_name = "Mt" - } - 'hg38' { - fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.bed" - mito_name = "chrM" - macs_gsize = "2.7e9" - blacklist = "${projectDir}/assets/blacklists/hg38-blacklist.bed" - } - 'hg19' { - fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Annotation/README.txt" - mito_name = "chrM" - macs_gsize = "2.7e9" - blacklist = "${projectDir}/assets/blacklists/hg19-blacklist.bed" - } - 'mm10' { - fasta = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/WholeGenomeFasta/genome.fa" - bwa = 
"${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Annotation/README.txt" - mito_name = "chrM" - macs_gsize = "1.87e9" - blacklist = "${projectDir}/assets/blacklists/mm10-blacklist.bed" - } - 'bosTau8' { - fasta = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Annotation/Genes/genes.bed" - mito_name = "chrM" - } - 'ce10' { - fasta = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Annotation/README.txt" - mito_name = "chrM" - macs_gsize = "9e7" - } - 'canFam3' { - fasta = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Annotation/README.txt" - mito_name = "chrM" - } - 'danRer10' { - fasta = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Annotation/Genes/genes.gtf" - bed12 = 
"${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Annotation/Genes/genes.bed" - mito_name = "chrM" - macs_gsize = "1.37e9" - } - 'dm6' { - fasta = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Annotation/Genes/genes.bed" - mito_name = "chrM" - macs_gsize = "1.2e8" - } - 'equCab2' { - fasta = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Annotation/README.txt" - mito_name = "chrM" - } - 'galGal4' { - fasta = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Annotation/README.txt" - mito_name = "chrM" - } - 'panTro4' { - fasta = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Annotation/README.txt" - mito_name = "chrM" - } - 'rn6' { - fasta = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/STARIndex/" - bismark = 
"${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Annotation/Genes/genes.bed" - mito_name = "chrM" - } - 'sacCer3' { - fasta = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/BismarkIndex/" - readme = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Annotation/README.txt" - mito_name = "chrM" - macs_gsize = "1.2e7" - } - 'susScr3' { - fasta = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Annotation/README.txt" - mito_name = "chrM" - } - } -} diff --git a/conf/modules.config b/conf/modules.config index da58a5d8..ebf62694 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -12,29 +12,74 @@ process { - publishDir = [ - path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] + withName: "SAMPLESHEET_CHECK" { + publishDir = [ + path: { "${params.outdir}/blobtoolkit_info" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals("versions.yml") ? null : filename } + ] + } + + withName: "GOAT_TAXONSEARCH" { + ext.args = "-l -b" + } + + withName: "SAMTOOLS_VIEW" { + ext.args = "--output-fmt bam --write-index" + } - withName: SAMPLESHEET_CHECK { + withName: "BUSCO" { + scratch = true + ext.args = "--mode genome --force" + } + + withName: "DIAMOND_BLASTP" { + ext.args = "--evalue 1.0e-25 --max-target-seqs 10 --max-hsps 1" + } + + withName: "BLOBTOOLKIT_WINDOWSTATS" { + ext.args = "--window 0.1 --window 0.01 --window 1 --window 100000 --window 1000000" + } + + withName: "BLOBTOOLKIT_BLOBDIR" { + ext.args = "--evalue 1.0e-25 --hit-count 10" publishDir = [ - path: { "${params.outdir}/pipeline_info" }, + path: { "${params.outdir}/" }, mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + saveAs: { filename -> filename.equals("versions.yml") ? null : filename } ] } - withName: FASTQC { - ext.args = '--quiet' + withName: "BLOBTOOLKIT_SUMMARY" { + publishDir = [ + path: { "${params.outdir}/${blobdir.name}/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals("versions.yml") ? 
null : filename } + ] + } + + withName: "BLOBTOOLKIT_IMAGES" { + publishDir = [ + path: { "${params.outdir}/${blobdir.name}/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals("versions.yml") ? null : filename } + ] } - withName: CUSTOM_DUMPSOFTWAREVERSIONS { + withName: "CUSTOM_DUMPSOFTWAREVERSIONS" { publishDir = [ - path: { "${params.outdir}/pipeline_info" }, + path: { "${params.outdir}/blobtoolkit_info" }, mode: params.publish_dir_mode, - pattern: '*_versions.yml' + pattern: "*_versions.yml" + ] + } + + withName: MULTIQC { + ext.args = params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' + publishDir = [ + path: { "${params.outdir}/multiqc" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } diff --git a/conf/test.config b/conf/test.config index 52630ab2..165bfff6 100644 --- a/conf/test.config +++ b/conf/test.config @@ -5,7 +5,7 @@ Defines input files and everything required to run a fast and simple pipeline test. Use as follows: - nextflow run nf-core/blobtoolkit -profile test, --outdir + nextflow run sanger-tol/blobtoolkit -profile test, --outdir ---------------------------------------------------------------------------------------- */ @@ -19,11 +19,18 @@ params { max_memory = '6.GB' max_time = '6.h' - // Input data - // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv' + // Input test data + // Specify the paths to your test data + // Give any required params for the test so that command line flags are not needed + input = "${projectDir}/assets/test/samplesheet.csv" - // Genome references - genome = 'R64-1-1' + // Fasta references + fasta = "/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.fasta.gz" + accession = "GCA_922984935.2" + taxon = "Meles meles" + + // Databases + taxdump = "/lustre/scratch123/tol/teams/grit/geval_pipeline/btk_databases/taxdump" + busco = "/lustre/scratch123/tol/resources/nextflow/busco_2021_06_reduced/" + uniprot = "${projectDir}/assets/test/mCerEla1.1.buscogenes.dmnd" } diff --git a/conf/test_full.config b/conf/test_full.config index 37c14d31..ee22dba2 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -5,20 +5,29 @@ Defines input files and everything required to run a full size pipeline test. Use as follows: - nextflow run nf-core/blobtoolkit -profile test_full, --outdir + nextflow run sanger-tol/blobtoolkit -profile test_full, --outdir ---------------------------------------------------------------------------------------- */ +// cleanup = true + params { config_profile_name = 'Full test profile' config_profile_description = 'Full test dataset to check pipeline function' // Input data for full size test - // TODO nf-core: Specify the paths to your full test data ( on nf-core/test-datasets or directly in repositories, e.g. 
SRA) - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_full_illumina_amplicon.csv' + // Specify the paths to your full test data + // Give any required params for the test so that command line flags are not needed + input = "${projectDir}/assets/test_full/full_samplesheet.csv" + + // Fasta references + fasta = "/lustre/scratch124/tol/projects/darwin/data/fungi/Laetiporus_sulphureus/assembly/release/gfLaeSulp1.1/insdc/GCA_927399515.1.fasta.gz" + accession = "GCA_927399515.1" + taxon = "Laetiporus sulphureus" - // Genome references - genome = 'R64-1-1' + // Databases + taxdump = "/lustre/scratch123/tol/teams/grit/geval_pipeline/btk_databases/taxdump" + busco = "/lustre/scratch123/tol/resources/busco/v5/" + uniprot = "${projectDir}/assets/test_full/gfLaeSulp1.1.buscogenes.dmnd" } diff --git a/docs/README.md b/docs/README.md index eba2215b..58471f91 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,10 +1,8 @@ -# nf-core/blobtoolkit: Documentation +# sanger-tol/blobtoolkit: Documentation -The nf-core/blobtoolkit documentation is split into the following pages: +The sanger-tol/blobtoolkit documentation is split into the following pages: - [Usage](usage.md) - An overview of how the pipeline works, how to run it and a description of all of the different command-line flags. - [Output](output.md) - An overview of the different results produced by the pipeline and how to interpret them. - -You can find a lot more documentation about installing, configuring and running nf-core pipelines on the website: [https://nf-co.re](https://nf-co.re) diff --git a/docs/decision-records/README.md b/docs/decision-records/README.md new file mode 100644 index 00000000..bd17babb --- /dev/null +++ b/docs/decision-records/README.md @@ -0,0 +1,25 @@ +Design decisions about the pipeline are indexed and recorded as individual files in this directory. + +To add a new decision, please create a pull request that adds a new markdown file named `XX-short-summary.md` to this directory. When replacing a previous decision, change the status of the latter to "Superseded" and add this to the title of the file `superseded-XX-short-summary.md`. The new file should have the following structure: + +## Title – Decision Statement + +## Status – Either Proposed, Rejected, Current, Deprecated or Superseded + +If this issue has been superseded, please add a line saying 'Superseded by '. + +## Context + +Explain why a decision is needed (problem statement) and provide details of the different options considered when making this decision. + +## Decision + +State what option was selected and why was it picked over other choices. + +## Consequences + +Reflect on how this decision will impact other planned work, or what new work needs to be planned to implement the decision. + +## Discussion Notes and Linked Issues or Pull Requests + +Add any offline discussion notes here, along with associated issue(s) and pull request links. 
diff --git a/docs/images/mqc_fastqc_adapter.png b/docs/images/mqc_fastqc_adapter.png deleted file mode 100755 index 361d0e47..00000000 Binary files a/docs/images/mqc_fastqc_adapter.png and /dev/null differ diff --git a/docs/images/mqc_fastqc_counts.png b/docs/images/mqc_fastqc_counts.png deleted file mode 100755 index cb39ebb8..00000000 Binary files a/docs/images/mqc_fastqc_counts.png and /dev/null differ diff --git a/docs/images/mqc_fastqc_quality.png b/docs/images/mqc_fastqc_quality.png deleted file mode 100755 index a4b89bf5..00000000 Binary files a/docs/images/mqc_fastqc_quality.png and /dev/null differ diff --git a/docs/images/nf-core-blobtoolkit_logo_dark.png b/docs/images/nf-core-blobtoolkit_logo_dark.png deleted file mode 100644 index ecb2183b..00000000 Binary files a/docs/images/nf-core-blobtoolkit_logo_dark.png and /dev/null differ diff --git a/docs/images/nf-core-blobtoolkit_logo_light.png b/docs/images/nf-core-blobtoolkit_logo_light.png deleted file mode 100644 index d6ec116d..00000000 Binary files a/docs/images/nf-core-blobtoolkit_logo_light.png and /dev/null differ diff --git a/docs/images/sanger-tol-blobtoolkit_diagram.svg b/docs/images/sanger-tol-blobtoolkit_diagram.svg new file mode 100644 index 00000000..892a2496 --- /dev/null +++ b/docs/images/sanger-tol-blobtoolkit_diagram.svg @@ -0,0 +1,767 @@ + + + +FASTAWINDOWSSAMTOOLSVIEWparams.taxonparams.busco_lineages_pathparams.ncbi_taxdumpparams.busco_diamondblastp_dboptional : params.yamloptional: params.taxa_fileCREATE BEDMOSDEPTHDIAMONDBLASTPGOAT TAXONSEARCHBUSCOEXTRACT BUSCO GENESCOUNT BUSCO GENESCOVERAGE TSVGENERATE IMAGESGENERATE SUMMARYGET WINDOW STATSCREATE BLOBDIRGENERATECONFIGADD SUMMARY TO METADATAPNGJSON diff --git a/docs/images/sanger-tol-blobtoolkit_logo.png b/docs/images/sanger-tol-blobtoolkit_logo.png new file mode 100644 index 00000000..925705ef Binary files /dev/null and b/docs/images/sanger-tol-blobtoolkit_logo.png differ diff --git a/docs/images/sanger-tol-blobtoolkit_workflow.png b/docs/images/sanger-tol-blobtoolkit_workflow.png new file mode 100644 index 00000000..0336febd Binary files /dev/null and b/docs/images/sanger-tol-blobtoolkit_workflow.png differ diff --git a/docs/output.md b/docs/output.md index 2a5c86af..437c6df7 100644 --- a/docs/output.md +++ b/docs/output.md @@ -1,4 +1,4 @@ -# nf-core/blobtoolkit: Output +# sanger-tol/blobtoolkit: Output ## Introduction @@ -6,37 +6,31 @@ This document describes the output produced by the pipeline. Most of the plots a The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. - +The directories comply with Tree of Life's canonical directory structure. + + ## Pipeline overview The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: -- [FastQC](#fastqc) - Raw read QC -- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline +- [BlobDir](#blobdir) - Output files from `blobtools` and `view` subworkflow +- [MultiQC](#multiqc) - Aggregate report describing results from the whole pipeline - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution -### FastQC +### BlobDir + +The files in the BlobDir dataset which is used to create the online interactive assessments.
Output files -- `fastqc/` - - `*_fastqc.html`: FastQC report containing quality metrics. - - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images. +- `/` + - `*.json`: files generated from genome and alignment coverage statistics + - `*.png`: static plot images
-[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). - -![MultiQC - FastQC sequence counts plot](images/mqc_fastqc_counts.png) - -![MultiQC - FastQC mean quality scores plot](images/mqc_fastqc_quality.png) - -![MultiQC - FastQC adapter content plot](images/mqc_fastqc_adapter.png) - -> **NB:** The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. - ### MultiQC
@@ -49,16 +43,16 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
-[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory.
+[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Some of the pipeline QC results are visualised in the report and further statistics are available in the report data directory.

-Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see <http://multiqc.info>.
+Results generated by MultiQC collate pipeline QC from supported tools. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see <http://multiqc.info>.

### Pipeline information
Output files

-- `pipeline_info/`
+- `blobtoolkit_info/`
  - Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`.
  - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameters are used when running the pipeline.
  - Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`.
diff --git a/docs/usage.md b/docs/usage.md
index e8c455a9..71b07d05 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -1,18 +1,18 @@
-# nf-core/blobtoolkit: Usage
+# sanger-tol/blobtoolkit: Usage

-## :warning: Please read this documentation on the nf-core website: [https://nf-co.re/blobtoolkit/usage](https://nf-co.re/blobtoolkit/usage)
+## :warning: Please read this documentation on the sanger-tol pipelines website: [https://pipelines.tol.sanger.ac.uk/blobtoolkit/usage](https://pipelines.tol.sanger.ac.uk/blobtoolkit/usage)

> _Documentation of pipeline parameters is generated automatically from the pipeline schema and can no longer be found in markdown files._

## Introduction

-
+

## Samplesheet input

You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below.

-```console
+```bash
--input '[path to samplesheet file]'
```

@@ -21,71 +21,293 @@ You will need to create a samplesheet with information about the samples you wou

The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example where the same sample (`sample2`) has been sequenced twice:

```console
-sample,fastq_1,fastq_2
-CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
-CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz
-CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz
+sample,datatype,datafile
+sample1,hic,hic.cram
+sample2,illumina,illumina.cram
+sample2,illumina,illumina.cram
```

### Full samplesheet

-The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below.
+The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below.

-A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice.
+A final samplesheet file may look something like the one below.
```console -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz -CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz -TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz, -TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz, +sample,datatype,datafile +sample1,hic,hic.cram +sample2,illumina,illumina.cram +sample3,ont,ont.cram +``` + +| Column | Description | +| ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (\_). | +| `datatype` | Type of sequencing data. Must be one of `hic`, `illumina`, `pacbio`, or `ont`. | +| `datafile` | Full path to read data file. | + +An [example samplesheet](https://raw.githubusercontent.com/sanger-tol/blobtoolkit/main/assets/test/samplesheet.csv) has been provided with the pipeline. + +## Getting databases ready for the pipeline + +The BlobToolKit pipeline can be run in many different ways. The default way requires access to several databases: + +1. [NCBI taxdump database](https://www.ncbi.nlm.nih.gov/taxonomy) +2. [NCBI nucleotide BLAST database](https://blast.ncbi.nlm.nih.gov/doc/blast-help/downloadblastdata.html#databases) +3. [UniProt reference proteomes database](https://www.uniprot.org) +4. [BUSCO database](https://busco.ezlab.org) + +It is a good idea to put a date suffix for each database location so you know at a glance whether you are using the latest version. We are using the `YYYY_MM` format as we do not expect the databases to be updated more frequently than once a month. However, feel free to use `DATE=YYYY_MM_DD` or a different format if you prefer. + +### 1. NCBI taxdump database + +Create the database directory and move into the directory: + +```bash +DATE=2023_03 +TAXDUMP=/path/to/databases/taxdump_${DATE} +mkdir -p $TAXDUMP +cd $TAXDUMP +``` + +Retrieve and decompress the NCBI taxdump: + +```bash +curl -L ftp://ftp.ncbi.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz | tar xzf - +``` + +### 2. NCBI nucleotide BLAST database + +Create the database directory and move into the directory: + +```bash +DATE=2023_03 +NT=/path/to/databases/nt_${DATE} +mkdir -p $NT +cd $NT +``` + +Retrieve the NCBI blast nt database (version 5) files and tar gunzip them. We are using the `&&` syntax to ensure that each command completes without error before the next one is run: + +```bash +wget "ftp://ftp.ncbi.nlm.nih.gov/blast/db/v5/nt.??.tar.gz" -P $NT/ && +for file in $NT/*.tar.gz; do + tar xf $file -C $NT && rm $file; +done +``` + +### 3. UniProt reference proteomes database + +You need [diamond blast](https://github.com/bbuchfink/diamond) installed for this step. The easiest way is probably using [conda](https://anaconda.org/bioconda/diamond). Make sure you have the latest version of Diamond (>2.x.x) otherwise the `--taxonnames` argument may not work. 
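For the conda route mentioned above, a minimal sketch (the environment name is arbitrary; the version pin simply enforces the >2.x requirement):

```bash
# create a throwaway environment with a recent DIAMOND from bioconda
conda create -y -n diamond -c conda-forge -c bioconda "diamond>=2.0"
conda activate diamond
diamond version   # confirm a 2.x release before building the database
```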
+
+Create the database directory and move into the directory:
+
+```bash
+DATE=2023_03
+UNIPROT=/path/to/databases/uniprot_${DATE}
+mkdir -p $UNIPROT
+cd $UNIPROT
+```
+
+The UniProt `Reference_Proteomes_YYYY_MM.tar.gz` file is very large (>160 GB) and will take a long time to download. The command below looks complex because it needs to get around the problem of using wildcards with wget and curl.
+
+```bash
+wget -q -O $UNIPROT/reference_proteomes.tar.gz \
+  ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/$(curl \
+    -vs ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/ 2>&1 | \
+    awk '/tar.gz/ {print $9}')
+tar xf reference_proteomes.tar.gz
+
+# Create a single fasta file with all the fasta files from each subdirectory:
+touch reference_proteomes.fasta.gz
+find . -mindepth 2 | grep "fasta.gz" | grep -v 'DNA' | grep -v 'additional' | xargs cat >> reference_proteomes.fasta.gz
+
+# create the accession-to-taxid map for all reference proteome sequences:
+printf "accession\taccession.version\ttaxid\tgi\n" > reference_proteomes.taxid_map
+zcat */*/*.idmapping.gz | grep "NCBI_TaxID" | awk '{print $1 "\t" $1 "\t" $3 "\t" 0}' >> reference_proteomes.taxid_map
+
+# create the taxon aware diamond blast database
+diamond makedb -p 16 --in reference_proteomes.fasta.gz --taxonmap reference_proteomes.taxid_map --taxonnodes $TAXDUMP/nodes.dmp --taxonnames $TAXDUMP/names.dmp -d reference_proteomes.dmnd
+```
+
+### 4. BUSCO databases
+
+Create the database directory and move into the directory:
+
+```bash
+DATE=2023_03
+BUSCO=/path/to/databases/busco_${DATE}
+mkdir -p $BUSCO
+cd $BUSCO
+```
+
+Download BUSCO data and lineages to allow BUSCO to run in offline mode:
+
+```bash
+wget -r -nH https://busco-data.ezlab.org/v5/data/
+# the trailing slash after data is important. Otherwise wget doesn't get the subdirectories
+
+# tar gunzip all folders that have been stored as tar.gz, in the same parent directories as where they were stored:
+find v5/data -name "*.tar.gz" | while read -r TAR; do tar -C `dirname $TAR` -xzf $TAR; done
+```
+
+If you have [GNU parallel](https://www.gnu.org/software/parallel/) installed, you can also use the command below which will run faster as it will run the decompression commands in parallel:
+
+```bash
+find v5/data -name "*.tar.gz" | parallel "cd {//}; tar -xzf {/}"
+```
+
+## YAML File and Nextflow configuration
+
+As in the Snakemake version, [a YAML configuration file](https://github.com/blobtoolkit/blobtoolkit/tree/main/src/blobtoolkit-pipeline/src#configuration) is needed to generate the metadata summary. This YAML config file can be generated from a genome accession value for released assemblies (for example, GCA_XXXXXXXXX.X), or passed in explicitly for draft assemblies (for example, [GCA_922984935.2.yaml](https://raw.githubusercontent.com/sanger-tol/blobtoolkit/main/assets/test/GCA_922984935.2.yaml)) using the `--yaml` parameter. Even for draft assemblies, a placeholder value should be passed with the `--accession` parameter.
+
+The data in the YAML is currently ignored in the Nextflow pipeline version. The YAML file is retained only to allow compatibility with the BlobDir dataset generated by the [Snakemake version](https://github.com/blobtoolkit/blobtoolkit/tree/main/src/blobtoolkit-pipeline/src). The taxonomic information in the YAML file can be obtained from [NCBI Taxonomy](https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/).
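For orientation, the YAML for a draft assembly might look like the minimal sketch below. The field names mirror the linked `GCA_922984935.2.yaml` example and the values are placeholders; copy the real structure from that file rather than from this sketch:

```yaml
# illustrative sketch only - see the linked example YAML for the full structure
assembly:
  accession: GCA_922984935.2   # placeholder tag for a draft assembly
taxon:
  name: Meles meles
  taxid: "9662"                # from NCBI Taxonomy
```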
+
+## Changes from Snakemake to Nextflow
+
+The current version of the Nextflow pipeline is not compatible with the public version of the [GenomeHubs BlobToolKit portal](https://blobtoolkit.genomehubs.org).
+
+### Commands
+
+Snakemake
+
+```bash
+# Public Assemblies
+run_btk_pipeline.sh GCA_ACCESSION
+
+# Draft Assemblies
+blobtoolkit-pipeline run --config YAML --threads INT --workdir DIR
```

-| Column | Description |
-| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
-| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". |
-| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". |
+Nextflow
+
+```bash
+# Public Assemblies
+nextflow run sanger-tol/blobtoolkit --input SAMPLESHEET --fasta GENOME --accession GCA_ACCESSION --taxon TAXON_ID --taxdump TAXDUMP_DB --uniprot DMND_DB

-An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
+# Draft Assemblies
+nextflow run sanger-tol/blobtoolkit --input SAMPLESHEET --fasta GENOME --accession TAG --taxon TAXON_ID --yaml CONFIG --taxdump TAXDUMP_DB --uniprot DMND_DB
+```
+
+### Subworkflows
+
+Here is a full list of Snakemake subworkflows and their Nextflow counterparts:
+
+- **`minimap.smk`**
+  - Not implemented yet.
+  - Alignment is done using the [sanger-tol/readmapping](https://github.com/sanger-tol/readmapping) pipeline.
+- **`windowmasker.smk`**
+  - Not implemented yet.
+  - Genomes downloaded by [sanger-tol/insdcdownload](https://github.com/sanger-tol/insdcdownload) are already masked.
+- **`chunk_stats.smk`**
+  - Subworkflow has been modified.
+  - BED file and additional statistics calculated using [`fasta_windows`](https://github.com/tolkit/fasta_windows).
+- **`busco.smk`**
+  - Implemented as [`busco_diamond_blastp.nf`](https://github.com/sanger-tol/blobtoolkit/blob/main/subworkflows/local/busco_diamond_blastp.nf).
+- **`cov_stats.smk`**
+  - The coverage calculations are done using [`mosdepth`](https://github.com/brentp/mosdepth) in subworkflow [`coverage_stats.nf`](https://github.com/sanger-tol/blobtoolkit/blob/main/subworkflows/local/coverage_stats.nf).
+  - Combining the various TSV files is done in subworkflow [`collate_stats.nf`](https://github.com/sanger-tol/blobtoolkit/blob/main/subworkflows/local/collate_stats.nf).
+- **`window_stats.smk`**
+  - The `window_stats` process is implemented in subworkflow [`collate_stats.nf`](https://github.com/sanger-tol/blobtoolkit/blob/main/subworkflows/local/collate_stats.nf).
+- **`diamond_blastp.smk`**
+  - Implemented within [`busco_diamond_blastp.nf`](https://github.com/sanger-tol/blobtoolkit/blob/main/subworkflows/local/busco_diamond_blastp.nf).
+- **`diamond.smk`**
+  - Will be implemented as `diamond_blastx.nf`.
+- **`blastn.smk`**
+  - Will be implemented as `blastn.nf`.
+- **`blobtools.smk`**
+  - Implemented as [`blobtools.nf`](https://github.com/sanger-tol/blobtoolkit/blob/main/subworkflows/local/blobtools.nf).
+- **`view.smk`**
+  - Implemented as [`view.nf`](https://github.com/sanger-tol/blobtoolkit/blob/main/subworkflows/local/view.nf).
+
+### Software dependencies
+
+The list of tools used for any given dataset can be fetched from the API, for example https://blobtoolkit.genomehubs.org/api/v1/dataset/id/CAJEUD01.1/settings/software_versions.
+
+| Dependency        | Snakemake | Nextflow |
+| ----------------- | --------- | -------- |
+| blobtoolkit       | 4.1.5     | 4.1.5    |
+| blast             | 2.12.0    |          |
+| blobtk            | 0.2.4     |          |
+| busco             | 5.3.2     | 5.4.3    |
+| diamond           | 2.0.15    |          |
+| fasta_windows     |           | 0.2.4    |
+| goat              |           | 0.2.0    |
+| minimap2          | 2.24      |          |
+| mosdepth          |           | 0.3.3    |
+| ncbi-datasets-cli | 14.1.0    |          |
+| nextflow          |           | 22.10.6  |
+| python            | 3.9.13    | 3.10.6   |
+| samtools          | 1.15.1    | 1.15.1   |
+| seqtk             | 1.3       |          |
+| snakemake         | 7.19.1    |          |
+| windowmasker      | 2.12.0    |          |
+
+> **NB:** A dependency has been **added** if only the Nextflow version information is present.
+> **NB:** A dependency has been **removed** if only the Snakemake version information is present.
+> **NB:** A dependency has been **updated** if both the Snakemake and Nextflow version information is present.

## Running the pipeline

The typical command for running the pipeline is as follows:

-```console
-nextflow run nf-core/blobtoolkit --input samplesheet.csv --outdir <OUTDIR> --genome GRCh37 -profile docker
+```bash
+nextflow run sanger-tol/blobtoolkit --input samplesheet.csv --outdir <OUTDIR> --fasta genome.fasta -profile docker --accession GCA_922984935.2 --taxon "Meles meles" --taxdump /path/to/taxdump --uniprot /path/to/buscogenes.dmnd
```

This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles.

Note that the pipeline will create the following files in your working directory:

-```console
+```bash
work                # Directory containing the nextflow working files
-<OUTIDR>            # Finished results in specified location (defined with --outdir)
+<OUTDIR>            # Finished results in specified location (defined with --outdir)
.nextflow_log       # Log file from Nextflow
# Other nextflow hidden files, eg. history of pipeline runs and old logs.
```

+If you wish to repeatedly use the same parameters for multiple runs, rather than specifying each flag in the command, you can specify these in a params file.
+
+Pipeline settings can be provided in a `yaml` or `json` file via `-params-file <file>`.
+
+> ⚠️ Do not use `-c <file>` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args).
+
+The above pipeline run specified with a params file in yaml format:
+
+```bash
+nextflow run sanger-tol/blobtoolkit -profile docker -params-file params.yaml
+```
+
+with `params.yaml` containing:
+
+```yaml
+input: './samplesheet.csv'
+outdir: './results/'
+fasta: './genome.fasta'
+<...>
+```
+
+You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch).
+
### Updating the pipeline

When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since.
To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline:

-```console
-nextflow pull nf-core/blobtoolkit
+```bash
+nextflow pull sanger-tol/blobtoolkit
```

### Reproducibility

It is a good idea to specify a pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since.

-First, go to the [nf-core/blobtoolkit releases page](https://github.com/nf-core/blobtoolkit/releases) and find the latest version number - numeric only (eg. `1.3.1`). Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.3.1`.
+First, go to the [sanger-tol/blobtoolkit releases page](https://github.com/sanger-tol/blobtoolkit/releases) and find the latest pipeline version - numeric only (eg. `1.3.1`). Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.3.1`. Of course, you can switch to another version by changing the number after the `-r` flag.
+
+This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future. For example, at the bottom of the MultiQC reports.

-This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future.
+To further assist in reproducibility, you can share and re-use [parameter files](#running-the-pipeline) to repeat pipeline runs with the same settings without having to write out a command with every single parameter.
+
+> 💡 If you wish to share such a profile (for example, as supplementary material for an academic publication), make sure NOT to include cluster-specific paths to files, nor institution-specific profiles.

## Core Nextflow arguments

@@ -95,7 +317,7 @@ This version number will be logged in reports when you run the pipeline, so that

Use this parameter to choose a configuration profile. Profiles can give configuration presets for different compute environments.

-Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Conda) - see below. When using Biocontainers, most of these software packaging methods pull Docker containers from quay.io e.g [FastQC](https://quay.io/repository/biocontainers/fastqc) except for Singularity which directly downloads Singularity images via https hosted by the [Galaxy project](https://depot.galaxyproject.org/singularity/) and Conda which downloads and installs software locally from [Bioconda](https://bioconda.github.io/).
+Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Apptainer, Conda) - see below.

> We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported.

@@ -104,8 +326,11 @@ The pipeline also dynamically loads configurations from [https://github.com/nf-c

Note that multiple profiles can be loaded, for example: `-profile test,docker` - the order of arguments is important! They are loaded in sequence, so later profiles can overwrite earlier profiles.
-If `-profile` is not specified, the pipeline will run locally and expect all software to be installed and available on the `PATH`. This is _not_ recommended.
+If `-profile` is not specified, the pipeline will run locally and expect all software to be installed and available on the `PATH`. This is _not_ recommended, since it can lead to different results on different machines dependent on the computer environment.

+- `test`
+  - A profile with a complete configuration for automated testing
+  - Includes links to test data so needs no other parameters
- `docker`
  - A generic configuration profile to be used with [Docker](https://docker.com/)
- `singularity`
  - A generic configuration profile to be used with [Singularity](https://sylabs.io/docs/)
- `podman`
  - A generic configuration profile to be used with [Podman](https://podman.io/)
- `shifter`
  - A generic configuration profile to be used with [Shifter](https://nersc.gitlab.io/development/shifter/how-to-use/)
- `charliecloud`
  - A generic configuration profile to be used with [Charliecloud](https://hpc.github.io/charliecloud/)
+- `apptainer`
+  - A generic configuration profile to be used with [Apptainer](https://apptainer.org/)
- `conda`
-  - A generic configuration profile to be used with [Conda](https://conda.io/docs/). Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter or Charliecloud.
-- `test`
-  - A profile with a complete configuration for automated testing
-  - Includes links to test data so needs no other parameters
+  - A generic configuration profile to be used with [Conda](https://conda.io/docs/). Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter, Charliecloud, or Apptainer.

### `-resume`

@@ -136,98 +360,21 @@ Specify the path to a specific config file (this is a core Nextflow command). Se

### Resource requests

-Whilst the default requirements set within the pipeline will hopefully work for most people and with most input data, you may find that you want to customise the compute resources that the pipeline requests. Each step in the pipeline has a default set of requirements for number of CPUs, memory and time. For most of the steps in the pipeline, if the job exits with any of the error codes specified [here](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/base.config#L18) it will automatically be resubmitted with higher requests (2 x original, then 3 x original). If it still fails after the third attempt then the pipeline execution is stopped.
-
-For example, if the nf-core/rnaseq pipeline is failing after multiple re-submissions of the `STAR_ALIGN` process due to an exit code of `137` this would indicate that there is an out of memory issue:
-
-```console
-[62/149eb0] NOTE: Process `NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN (WT_REP1)` terminated with an error exit status (137) -- Execution is retried (1)
-Error executing process > 'NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN (WT_REP1)'
-
-Caused by:
-    Process `NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN (WT_REP1)` terminated with an error exit status (137)
-
-Command executed:
-    STAR \
-        --genomeDir star \
-        --readFilesIn WT_REP1_trimmed.fq.gz \
-        --runThreadN 2 \
-        --outFileNamePrefix WT_REP1. \
-        <TRUNCATED>
-
-Command exit status:
-    137
-
-Command output:
-    (empty)
-
-Command error:
-    .command.sh: line 9:  30 Killed    STAR --genomeDir star --readFilesIn WT_REP1_trimmed.fq.gz --runThreadN 2 --outFileNamePrefix WT_REP1.
-Work dir: - /home/pipelinetest/work/9d/172ca5881234073e8d76f2a19c88fb +Whilst the default requirements set within the pipeline will hopefully work for most people and with most input data, you may find that you want to customise the compute resources that the pipeline requests. Each step in the pipeline has a default set of requirements for number of CPUs, memory and time. For most of the steps in the pipeline, if the job exits with any of the error codes specified [here](https://github.com/sanger-tol/blobtoolkit/blob/56906ffb5737e4b985797bb5fb4b9c94cfe69600/conf/base.config#L18) it will automatically be resubmitted with higher requests (2 x original, then 3 x original). If it still fails after the third attempt then the pipeline execution is stopped. -Tip: you can replicate the issue by changing to the process work dir and entering the command `bash .command.run` -``` - -To bypass this error you would need to find exactly which resources are set by the `STAR_ALIGN` process. The quickest way is to search for `process STAR_ALIGN` in the [nf-core/rnaseq Github repo](https://github.com/nf-core/rnaseq/search?q=process+STAR_ALIGN). -We have standardised the structure of Nextflow DSL2 pipelines such that all module files will be present in the `modules/` directory and so, based on the search results, the file we want is `modules/nf-core/software/star/align/main.nf`. -If you click on the link to that file you will notice that there is a `label` directive at the top of the module that is set to [`label process_high`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/modules/nf-core/software/star/align/main.nf#L9). -The [Nextflow `label`](https://www.nextflow.io/docs/latest/process.html#label) directive allows us to organise workflow processes in separate groups which can be referenced in a configuration file to select and configure subset of processes having similar computing requirements. -The default values for the `process_high` label are set in the pipeline's [`base.config`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/base.config#L33-L37) which in this case is defined as 72GB. -Providing you haven't set any other standard nf-core parameters to **cap** the [maximum resources](https://nf-co.re/usage/configuration#max-resources) used by the pipeline then we can try and bypass the `STAR_ALIGN` process failure by creating a custom config file that sets at least 72GB of memory, in this case increased to 100GB. -The custom config below can then be provided to the pipeline via the [`-c`](#-c) parameter as highlighted in previous sections. - -```nextflow -process { - withName: 'NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN' { - memory = 100.GB - } -} -``` - -> **NB:** We specify the full process name i.e. `NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN` in the config file because this takes priority over the short name (`STAR_ALIGN`) and allows existing configuration using the full process name to be correctly overridden. -> -> If you get a warning suggesting that the process selector isn't recognised check that the process name has been specified correctly. - -### Updating containers - -The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. 
If for some reason you need to use a different version of a particular tool with the pipeline then you just need to identify the `process` name and override the Nextflow `container` definition for that process using the `withName` declaration. For example, in the [nf-core/viralrecon](https://nf-co.re/viralrecon) pipeline a tool called [Pangolin](https://github.com/cov-lineages/pangolin) has been used during the COVID-19 pandemic to assign lineages to SARS-CoV-2 genome sequenced samples. Given that the lineage assignments change quite frequently it doesn't make sense to re-release the nf-core/viralrecon everytime a new version of Pangolin has been released. However, you can override the default container used by the pipeline by creating a custom config file and passing it as a command-line argument via `-c custom.config`.
+To change the resource requests, please see the [max resources](https://nf-co.re/docs/usage/configuration#max-resources) and [tuning workflow resources](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources) section of the nf-core website.

-1. Check the default version used by the pipeline in the module file for [Pangolin](https://github.com/nf-core/viralrecon/blob/a85d5969f9025409e3618d6c280ef15ce417df65/modules/nf-core/software/pangolin/main.nf#L14-L19)
-2. Find the latest version of the Biocontainer available on [Quay.io](https://quay.io/repository/biocontainers/pangolin?tag=latest&tab=tags)
-3. Create the custom config accordingly:

+### Custom Containers

-    - For Docker:
+In some cases you may wish to change which container or conda environment a step of the pipeline uses for a particular tool. By default nf-core pipelines use containers and software from the [biocontainers](https://biocontainers.pro/) or [bioconda](https://bioconda.github.io/) projects. However, in some cases the version specified by the pipeline may be out of date.

-      ```nextflow
-      process {
-          withName: PANGOLIN {
-              container = 'quay.io/biocontainers/pangolin:3.0.5--pyhdfd78af_0'
-          }
-      }
-      ```
+To use a different container from the default container or conda environment specified in a pipeline, please see the [updating tool versions](https://nf-co.re/docs/usage/configuration#updating-tool-versions) section of the nf-core website.

-    - For Singularity:
+### Custom Tool Arguments

-      ```nextflow
-      process {
-          withName: PANGOLIN {
-              container = 'https://depot.galaxyproject.org/singularity/pangolin:3.0.5--pyhdfd78af_0'
-          }
-      }
-      ```
+A pipeline might not always support every possible argument or option of a particular tool used in the pipeline. Fortunately, nf-core pipelines provide some freedom to users to insert additional parameters that the pipeline does not include by default.

-    - For Conda:
-
-      ```nextflow
-      process {
-          withName: PANGOLIN {
-              conda = 'bioconda::pangolin=3.0.5'
-          }
-      }
-      ```
-
-> **NB:** If you wish to periodically update individual tool-specific results (e.g. Pangolin) generated by the pipeline then you must ensure to keep the `work/` directory otherwise the `-resume` ability of the pipeline will be compromised and it will restart from scratch.
+To learn how to provide additional arguments to a particular tool of the pipeline, please see the [customising tool arguments](https://nf-co.re/docs/usage/configuration#customising-tool-arguments) section of the nf-core website.
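In practice this usually means setting a process's `ext.args` in a custom config passed with `-c`. A minimal sketch; the process selector and the extra option are hypothetical and should be replaced with the real process name and a flag the tool actually accepts:

```nextflow
process {
    withName: 'BUSCO' {
        // hypothetical extra option appended to the tool's command line
        ext.args = '--long'
    }
}
```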
### nf-core/configs

@@ -237,6 +384,14 @@ See the main [Nextflow documentation](https://www.nextflow.io/docs/latest/config

If you have any questions or issues please send us a message on [Slack](https://nf-co.re/join/slack) on the [`#configs` channel](https://nfcore.slack.com/channels/configs).

+## Azure Resource Requests
+
+To be used with the `azurebatch` profile by specifying the `-profile azurebatch`.
+We recommend providing a compute `params.vm_type` of `Standard_D16_v3` VMs by default but these options can be changed if required.
+
+Note that the choice of VM size depends on your quota and the overall workload during the analysis.
+For a thorough list, please refer to the [Azure Sizes for virtual machines in Azure](https://docs.microsoft.com/en-us/azure/virtual-machines/sizes).
+
## Running in the background

Nextflow handles job submissions and supervises the running jobs. The Nextflow process must run until the pipeline is finished.

@@ -251,6 +406,6 @@ Some HPC setups also allow you to run nextflow within a cluster job submitted yo

In some cases, the Nextflow Java virtual machines can start to request a large amount of memory.
We recommend adding the following line to your environment to limit this (typically in `~/.bashrc` or `~/.bash_profile`):

-```console
+```bash
NXF_OPTS='-Xms1g -Xmx4g'
```
diff --git a/lib/NfcoreSchema.groovy b/lib/NfcoreSchema.groovy
index b3d092f8..9b34804d 100755
--- a/lib/NfcoreSchema.groovy
+++ b/lib/NfcoreSchema.groovy
@@ -2,6 +2,7 @@
 // This file holds several functions used to perform JSON parameter validation, help and summary rendering for the nf-core pipeline template.
 //
+import nextflow.Nextflow
 import org.everit.json.schema.Schema
 import org.everit.json.schema.loader.SchemaLoader
 import org.everit.json.schema.ValidationException
@@ -46,7 +47,6 @@ class NfcoreSchema {
        'quiet',
        'syslog',
        'v',
-        'version',

        // Options for `nextflow run` command
        'ansi',
@@ -84,6 +84,7 @@ class NfcoreSchema {
        'stub-run',
        'test',
        'w',
+        'with-apptainer',
        'with-charliecloud',
        'with-conda',
        'with-dag',
@@ -178,7 +179,7 @@ class NfcoreSchema {
        }

        if (has_error) {
-            System.exit(1)
+            Nextflow.error('Exiting!')
        }
    }
diff --git a/lib/NfcoreTemplate.groovy b/lib/NfcoreTemplate.groovy
index 2fc0a9b9..2777ae2b 100755
--- a/lib/NfcoreTemplate.groovy
+++ b/lib/NfcoreTemplate.groovy
@@ -32,6 +32,25 @@ class NfcoreTemplate {
        }
    }

+    //
+    // Generate version string
+    //
+    public static String version(workflow) {
+        String version_string = ""
+
+        if (workflow.manifest.version) {
+            def prefix_v = workflow.manifest.version[0] != 'v' ?
'v' : '' + version_string += "${prefix_v}${workflow.manifest.version}" + } + + if (workflow.commitId) { + def git_shortsha = workflow.commitId.substring(0, 7) + version_string += "-g${git_shortsha}" + } + + return version_string + } + // // Construct and send completion email // @@ -61,7 +80,7 @@ class NfcoreTemplate { misc_fields['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp def email_fields = [:] - email_fields['version'] = workflow.manifest.version + email_fields['version'] = NfcoreTemplate.version(workflow) email_fields['runName'] = workflow.runName email_fields['success'] = workflow.success email_fields['dateComplete'] = workflow.complete @@ -135,7 +154,7 @@ class NfcoreTemplate { } // Write summary e-mail HTML to a file - def output_d = new File("${params.outdir}/pipeline_info/") + def output_d = new File("${params.outdir}/blobtoolkit_info/") if (!output_d.exists()) { output_d.mkdirs() } @@ -145,6 +164,64 @@ class NfcoreTemplate { output_tf.withWriter { w -> w << email_txt } } + // + // Construct and send a notification to a web server as JSON + // e.g. Microsoft Teams and Slack + // + public static void IM_notification(workflow, params, summary_params, projectDir, log) { + def hook_url = params.hook_url + + def summary = [:] + for (group in summary_params.keySet()) { + summary << summary_params[group] + } + + def misc_fields = [:] + misc_fields['start'] = workflow.start + misc_fields['complete'] = workflow.complete + misc_fields['scriptfile'] = workflow.scriptFile + misc_fields['scriptid'] = workflow.scriptId + if (workflow.repository) misc_fields['repository'] = workflow.repository + if (workflow.commitId) misc_fields['commitid'] = workflow.commitId + if (workflow.revision) misc_fields['revision'] = workflow.revision + misc_fields['nxf_version'] = workflow.nextflow.version + misc_fields['nxf_build'] = workflow.nextflow.build + misc_fields['nxf_timestamp'] = workflow.nextflow.timestamp + + def msg_fields = [:] + msg_fields['version'] = NfcoreTemplate.version(workflow) + msg_fields['runName'] = workflow.runName + msg_fields['success'] = workflow.success + msg_fields['dateComplete'] = workflow.complete + msg_fields['duration'] = workflow.duration + msg_fields['exitStatus'] = workflow.exitStatus + msg_fields['errorMessage'] = (workflow.errorMessage ?: 'None') + msg_fields['errorReport'] = (workflow.errorReport ?: 'None') + msg_fields['commandLine'] = workflow.commandLine.replaceFirst(/ +--hook_url +[^ ]+/, "") + msg_fields['projectDir'] = workflow.projectDir + msg_fields['summary'] = summary << misc_fields + + // Render the JSON template + def engine = new groovy.text.GStringTemplateEngine() + // Different JSON depending on the service provider + // Defaults to "Adaptive Cards" (https://adaptivecards.io), except Slack which has its own format + def json_path = hook_url.contains("hooks.slack.com") ? "slackreport.json" : "adaptivecard.json" + def hf = new File("$projectDir/assets/${json_path}") + def json_template = engine.createTemplate(hf).make(msg_fields) + def json_message = json_template.toString() + + // POST + def post = new URL(hook_url).openConnection(); + post.setRequestMethod("POST") + post.setDoOutput(true) + post.setRequestProperty("Content-Type", "application/json") + post.getOutputStream().write(json_message.getBytes("UTF-8")); + def postRC = post.getResponseCode(); + if (! 
postRC.equals(200)) { + log.warn(post.getErrorStream().getText()); + } + } + // // Print pipeline summary on completion // @@ -154,7 +231,7 @@ class NfcoreTemplate { if (workflow.stats.ignoredCount == 0) { log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Pipeline completed successfully${colors.reset}-" } else { - log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} Pipeline completed successfully, but with errored process(es) ${colors.reset}-" + log.info "-${colors.purple}[$workflow.manifest.name]${colors.yellow} Pipeline completed successfully, but with errored process(es) ${colors.reset}-" } } else { log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} Pipeline completed with errors${colors.reset}-" @@ -238,19 +315,23 @@ class NfcoreTemplate { } // - // nf-core logo + // sanger-tol logo // public static String logo(workflow, monochrome_logs) { Map colors = logColours(monochrome_logs) + String workflow_version = NfcoreTemplate.version(workflow) String.format( """\n ${dashedLine(monochrome_logs)} - ${colors.green},--.${colors.black}/${colors.green},-.${colors.reset} - ${colors.blue} ___ __ __ __ ___ ${colors.green}/,-._.--~\'${colors.reset} - ${colors.blue} |\\ | |__ __ / ` / \\ |__) |__ ${colors.yellow}} {${colors.reset} - ${colors.blue} | \\| | \\__, \\__/ | \\ |___ ${colors.green}\\`-._,-`-,${colors.reset} - ${colors.green}`._,._,\'${colors.reset} - ${colors.purple} ${workflow.manifest.name} v${workflow.manifest.version}${colors.reset} + ${colors.blue} _____ ${colors.green} _______ ${colors.red} _${colors.reset} + ${colors.blue} / ____| ${colors.green}|__ __| ${colors.red}| |${colors.reset} + ${colors.blue} | (___ __ _ _ __ __ _ ___ _ __ ${colors.reset} ___ ${colors.green}| |${colors.yellow} ___ ${colors.red}| |${colors.reset} + ${colors.blue} \\___ \\ / _` | '_ \\ / _` |/ _ \\ '__|${colors.reset}|___|${colors.green}| |${colors.yellow}/ _ \\${colors.red}| |${colors.reset} + ${colors.blue} ____) | (_| | | | | (_| | __/ | ${colors.green}| |${colors.yellow} (_) ${colors.red}| |____${colors.reset} + ${colors.blue} |_____/ \\__,_|_| |_|\\__, |\\___|_| ${colors.green}|_|${colors.yellow}\\___/${colors.red}|______|${colors.reset} + ${colors.blue} __/ |${colors.reset} + ${colors.blue} |___/${colors.reset} + ${colors.purple} ${workflow.manifest.name} ${workflow_version}${colors.reset} ${dashedLine(monochrome_logs)} """.stripIndent() ) diff --git a/lib/Utils.groovy b/lib/Utils.groovy index 28567bd7..8d030f4e 100755 --- a/lib/Utils.groovy +++ b/lib/Utils.groovy @@ -21,19 +21,26 @@ class Utils { } // Check that all channels are present - def required_channels = ['conda-forge', 'bioconda', 'defaults'] - def conda_check_failed = !required_channels.every { ch -> ch in channels } + // This channel list is ordered by required channel priority. 
+ def required_channels_in_order = ['conda-forge', 'bioconda', 'defaults'] + def channels_missing = ((required_channels_in_order as Set) - (channels as Set)) as Boolean // Check that they are in the right order - conda_check_failed |= !(channels.indexOf('conda-forge') < channels.indexOf('bioconda')) - conda_check_failed |= !(channels.indexOf('bioconda') < channels.indexOf('defaults')) + def channel_priority_violation = false + def n = required_channels_in_order.size() + for (int i = 0; i < n - 1; i++) { + channel_priority_violation |= !(channels.indexOf(required_channels_in_order[i]) < channels.indexOf(required_channels_in_order[i+1])) + } - if (conda_check_failed) { + if (channels_missing | channel_priority_violation) { log.warn "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + " There is a problem with your Conda configuration!\n\n" + " You will need to set-up the conda-forge and bioconda channels correctly.\n" + - " Please refer to https://bioconda.github.io/user/install.html#set-up-channels\n" + - " NB: The order of the channels matters!\n" + + " Please refer to https://bioconda.github.io/\n" + + " The observed channel order is \n" + + " ${channels}\n" + + " but the following channel order is required:\n" + + " ${required_channels_in_order}\n" + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" } } diff --git a/lib/WorkflowBlobtoolkit.groovy b/lib/WorkflowBlobtoolkit.groovy index 5a9ac418..8be37902 100755 --- a/lib/WorkflowBlobtoolkit.groovy +++ b/lib/WorkflowBlobtoolkit.groovy @@ -1,18 +1,19 @@ // -// This file holds several functions specific to the workflow/blobtoolkit.nf in the nf-core/blobtoolkit pipeline +// This file holds several functions specific to the workflow/blobtoolkit.nf in the sanger-tol/blobtoolkit pipeline // +import nextflow.Nextflow +import groovy.text.SimpleTemplateEngine + class WorkflowBlobtoolkit { // // Check and validate parameters // public static void initialise(params, log) { - genomeExistsError(params, log) if (!params.fasta) { - log.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file." - System.exit(1) + Nextflow.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file." } } @@ -43,17 +44,20 @@ class WorkflowBlobtoolkit { return yaml_file_text } - // - // Exit pipeline if incorrect --genome key provided - // - private static void genomeExistsError(params, log) { - if (params.genomes && params.genome && !params.genomes.containsKey(params.genome)) { - log.error "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + - " Genome '${params.genome}' not found in any config files provided to the pipeline.\n" + - " Currently, the available genome keys are:\n" + - " ${params.genomes.keySet().join(", ")}\n" + - "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - System.exit(1) - } + public static String methodsDescriptionText(run_workflow, mqc_methods_yaml) { + // Convert to a named map so can be used as with familar NXF ${workflow} variable syntax in the MultiQC YML file + def meta = [:] + meta.workflow = run_workflow.toMap() + meta["manifest_map"] = run_workflow.manifest.toMap() + + meta["doi_text"] = meta.manifest_map.doi ? "(doi: ${meta.manifest_map.doi})" : "" + meta["nodoi_text"] = meta.manifest_map.doi ? "": "
<li>If available, make sure to update the text to include the Zenodo DOI of version of the pipeline used. </li>"
+
+        def methods_text = mqc_methods_yaml.text
+
+        def engine = new SimpleTemplateEngine()
+        def description_html = engine.createTemplate(methods_text).make(meta)
+
+        return description_html
    }
}
diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy
index bd5f99de..c3b0de0e 100755
--- a/lib/WorkflowMain.groovy
+++ b/lib/WorkflowMain.groovy
@@ -1,7 +1,9 @@
//
-// This file holds several functions specific to the main.nf workflow in the nf-core/blobtoolkit pipeline
+// This file holds several functions specific to the main.nf workflow in the sanger-tol/blobtoolkit pipeline
//
+import nextflow.Nextflow
+
class WorkflowMain {

    //
@@ -9,9 +11,9 @@ class WorkflowMain {
    //
    public static String citation(workflow) {
        return "If you use ${workflow.manifest.name} for your analysis please cite:\n\n" +
-            // TODO nf-core: Add Zenodo DOI for pipeline after first release
-            //"* The pipeline\n" +
-            //"  https://doi.org/10.5281/zenodo.XXXXXXX\n\n" +
+            // Add Zenodo DOI for pipeline after first release
+            "* The pipeline\n" +
+            "  https://doi.org/10.5281/zenodo.XXXXXXX\n\n" +
            "* The nf-core framework\n" +
            "  https://doi.org/10.1038/s41587-020-0439-x\n\n" +
            "* Software dependencies\n" +
@@ -19,10 +21,10 @@
    }

    //
-    // Print help to screen if required
+    // Generate help string
    //
-    public static String help(workflow, params, log) {
-        def command = "nextflow run ${workflow.manifest.name} --input samplesheet.csv --genome GRCh37 -profile docker"
+    public static String help(workflow, params) {
+        def command = "nextflow run ${workflow.manifest.name} --input samplesheet.csv --fasta reference.fa -profile docker"
        def help_string = ''
        help_string += NfcoreTemplate.logo(workflow, params.monochrome_logs)
        help_string += NfcoreSchema.paramsHelp(workflow, params, command)
@@ -32,9 +34,9 @@
    }

    //
-    // Print parameter summary log to screen
+    // Generate parameter summary log string
    //
-    public static String paramsSummaryLog(workflow, params, log) {
+    public static String paramsSummaryLog(workflow, params) {
        def summary_log = ''
        summary_log += NfcoreTemplate.logo(workflow, params.monochrome_logs)
        summary_log += NfcoreSchema.paramsSummaryLog(workflow, params)
@@ -49,23 +51,30 @@
    public static void initialise(workflow, params, log) {
        // Print help to screen if required
        if (params.help) {
-            log.info help(workflow, params, log)
+            log.info help(workflow, params)
+            System.exit(0)
+        }
+
+        // Print workflow version and exit on --version
+        if (params.version) {
+            String workflow_version = NfcoreTemplate.version(workflow)
+            log.info "${workflow.manifest.name} ${workflow_version}"
            System.exit(0)
        }

+        // Print parameter summary log to screen
+        log.info paramsSummaryLog(workflow, params)
+
        // Validate workflow parameters via the JSON schema
        if (params.validate_params) {
            NfcoreSchema.validateParameters(workflow, params, log)
        }

-        // Print parameter summary log to screen
-        log.info paramsSummaryLog(workflow, params, log)
-
        // Check that a -profile or Nextflow config has been provided to run the pipeline
        NfcoreTemplate.checkConfigProvided(workflow, log)

        // Check that conda channels are set-up correctly
-        if (params.enable_conda) {
+        if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
            Utils.checkCondaChannels(log)
        }

@@ -74,21 +83,7 @@
        // Check input has been provided
        if (!params.input) {
-            log.error "Please provide an input samplesheet to the pipeline e.g.
'--input samplesheet.csv'" - System.exit(1) - } - } - - // - // Get attribute from genome config file e.g. fasta - // - public static String getGenomeAttribute(params, attribute) { - def val = '' - if (params.genomes && params.genome && params.genomes.containsKey(params.genome)) { - if (params.genomes[ params.genome ].containsKey(attribute)) { - val = params.genomes[ params.genome ][ attribute ] - } + Nextflow.error("Please provide an input samplesheet to the pipeline e.g. '--input samplesheet.csv'") } - return val } } diff --git a/main.nf b/main.nf index 29ea0bfa..5618f04c 100644 --- a/main.nf +++ b/main.nf @@ -1,24 +1,15 @@ #!/usr/bin/env nextflow /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - nf-core/blobtoolkit + sanger-tol/blobtoolkit ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Github : https://github.com/nf-core/blobtoolkit - Website: https://nf-co.re/blobtoolkit - Slack : https://nfcore.slack.com/channels/blobtoolkit + Github : https://github.com/sanger-tol/blobtoolkit + Website: https://pipelines.tol.sanger.ac.uk/blobtoolkit ---------------------------------------------------------------------------------------- */ nextflow.enable.dsl = 2 -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - GENOME PARAMETER VALUES -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -params.fasta = WorkflowMain.getGenomeAttribute(params, 'fasta') - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ VALIDATE & PRINT PARAMETER SUMMARY @@ -36,9 +27,9 @@ WorkflowMain.initialise(workflow, params, log) include { BLOBTOOLKIT } from './workflows/blobtoolkit' // -// WORKFLOW: Run main nf-core/blobtoolkit analysis pipeline +// WORKFLOW: Run main sanger-tol/blobtoolkit analysis pipeline // -workflow NFCORE_BLOBTOOLKIT { +workflow SANGERTOL_BLOBTOOLKIT { BLOBTOOLKIT () } @@ -53,7 +44,7 @@ workflow NFCORE_BLOBTOOLKIT { // See: https://github.com/nf-core/rnaseq/issues/619 // workflow { - NFCORE_BLOBTOOLKIT () + SANGERTOL_BLOBTOOLKIT () } /* diff --git a/modules.json b/modules.json index dd1ca986..cd615550 100644 --- a/modules.json +++ b/modules.json @@ -1,13 +1,60 @@ { - "name": "nf-core/blobtoolkit", - "homePage": "https://github.com/nf-core/blobtoolkit", + "name": "sanger-tol/blobtoolkit", + "homePage": "https://github.com/sanger-tol/blobtoolkit", "repos": { - "nf-core/modules": { - "custom/dumpsoftwareversions": { - "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" + "https://github.com/nf-core/modules.git": { + "modules": { + "nf-core": { + "busco": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"], + "patch": "modules/nf-core/busco/busco.diff" + }, + "custom/dumpsoftwareversions": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "diamond/blastp": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "fastawindows": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "goat/taxonsearch": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "gunzip": { + "branch": "master", + "git_sha": "5c460c5a4736974abde2843294f35307ee2b0e5e", + "installed_by": ["modules"] + }, + 
"mosdepth": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "multiqc": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "samtools/view": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + } + } }, - "multiqc": { - "git_sha": "5138acca0985ca01c38a1c4fba917d83772b1106" + "subworkflows": { + "nf-core": {} } } } diff --git a/modules/local/blobtoolkit/blobdir.nf b/modules/local/blobtoolkit/blobdir.nf new file mode 100644 index 00000000..3f064bce --- /dev/null +++ b/modules/local/blobtoolkit/blobdir.nf @@ -0,0 +1,45 @@ +process BLOBTOOLKIT_BLOBDIR { + tag "$meta.id" + label 'process_medium' + + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + exit 1, "BLOBTOOLKIT_BLOBDIR module does not support Conda. Please use Docker / Singularity / Podman instead." + } + container "genomehubs/blobtoolkit:4.1.5" + + input: + tuple val(meta), path(window, stageAs: 'windowstats/*') + tuple val(meta1), path(busco) + tuple val(meta2), path(blastp) + tuple val(meta3), path(yaml) + path(taxdump) + + output: + tuple val(meta), path(prefix), emit: blobdir + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def hits = blastp ? "--hits ${blastp}" : "" + """ + blobtools replace \\ + --bedtsvdir windowstats \\ + --meta ${yaml} \\ + --taxdump ${taxdump} \\ + --taxrule buscogenes \\ + --busco ${busco} \\ + ${hits} \\ + --threads ${task.cpus} \\ + $args \\ + ${prefix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + blobtoolkit: \$(btk --version | cut -d' ' -f2 | sed 's/v//') + END_VERSIONS + """ +} diff --git a/modules/local/blobtoolkit/config.nf b/modules/local/blobtoolkit/config.nf new file mode 100644 index 00000000..ce1e3adc --- /dev/null +++ b/modules/local/blobtoolkit/config.nf @@ -0,0 +1,30 @@ +process BLOBTOOLKIT_CONFIG { + tag "$meta.id" + label 'process_single' + + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + exit 1, "GENERATE_CONFIG module does not support Conda. Please use Docker / Singularity / Podman instead." + } + container "genomehubs/blobtoolkit:4.1.5" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path("${meta.id}/*.yaml"), emit: yaml + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + blobtoolkit-pipeline generate-config ${meta.id} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + blobtoolkit: \$(btk --version | cut -d' ' -f2 | sed 's/v//') + END_VERSIONS + """ +} diff --git a/modules/local/blobtoolkit/countbuscos.nf b/modules/local/blobtoolkit/countbuscos.nf new file mode 100644 index 00000000..1379cbac --- /dev/null +++ b/modules/local/blobtoolkit/countbuscos.nf @@ -0,0 +1,36 @@ +process BLOBTOOLKIT_COUNTBUSCOS { + tag "$meta.id" + label 'process_single' + + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + exit 1, "BLOBTOOLKIT_COUNTBUSCOS module does not support Conda. Please use Docker / Singularity / Podman instead." 
+ } + container "genomehubs/blobtoolkit:4.1.5" + + input: + tuple val(meta), path(table, stageAs: 'dir??/*') + tuple val(meta), path(bed) + + output: + tuple val(meta), path("*_buscogenes.tsv"), emit: tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def busco_inputs = table.collect{"--in $it"}.join(' ') + """ + btk pipeline count-busco-genes \\ + $busco_inputs \\ + --mask ${bed} \\ + --out ${prefix}_buscogenes.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + blobtoolkit: \$(btk --version | cut -d' ' -f2 | sed 's/v//') + END_VERSIONS + """ +} diff --git a/modules/local/blobtoolkit/extractbuscos.nf b/modules/local/blobtoolkit/extractbuscos.nf new file mode 100644 index 00000000..fd5c368e --- /dev/null +++ b/modules/local/blobtoolkit/extractbuscos.nf @@ -0,0 +1,38 @@ +process BLOBTOOLKIT_EXTRACTBUSCOS { + tag "$meta.id" + label 'process_single' + + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + exit 1, "BLOBTOOLKIT_EXTRACTBUSCOS module does not support Conda. Please use Docker / Singularity / Podman instead." + } + container "genomehubs/blobtoolkit:4.1.5" + + input: + tuple val(meta), path(fasta) + tuple val(meta1), path(seq1, stageAs: "lineage1/*") + tuple val(meta2), path(seq2, stageAs: "lineage2/*") + tuple val(meta3), path(seq3, stageAs: "lineage3/*") + + output: + tuple val(meta), path("*_buscogenes.fasta"), emit: genes + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + btk pipeline extract-busco-genes \\ + --busco $seq1 \\ + --busco $seq2 \\ + --busco $seq3 \\ + --out ${prefix}_buscogenes.fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + blobtoolkit: \$(btk --version | cut -d' ' -f2 | sed 's/v//') + END_VERSIONS + """ +} diff --git a/modules/local/blobtoolkit/images.nf b/modules/local/blobtoolkit/images.nf new file mode 100644 index 00000000..11bdd485 --- /dev/null +++ b/modules/local/blobtoolkit/images.nf @@ -0,0 +1,38 @@ +process BLOBTOOLKIT_IMAGES { + tag "${meta.id}_${plot}" + label 'process_single' + + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + exit 1, "BLOBTOOLKIT_IMAGES module does not support Conda. Please use Docker / Singularity / Podman instead." + } + container "genomehubs/blobtk:0.3.3" + + input: + tuple val(meta), path(blobdir) + each plot + + output: + tuple val(meta), path('*.png') , emit: png + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def legend = plot.equals("snail") ? 
"" : "--legend full" + """ + blobtk plot \\ + -v ${plot} \\ + -d ${blobdir} \\ + -o ${prefix}.${plot}.png \\ + ${legend} \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + blobtoolkit: \$(btk --version | cut -d' ' -f2 | sed 's/v//') + END_VERSIONS + """ +} diff --git a/modules/local/blobtoolkit/metadata.nf b/modules/local/blobtoolkit/metadata.nf new file mode 100644 index 00000000..32339c48 --- /dev/null +++ b/modules/local/blobtoolkit/metadata.nf @@ -0,0 +1,33 @@ +process BLOBTOOLKIT_METADATA { + tag "$meta.id" + label 'process_single' + + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + exit 1, "BLOBTOOLKIT_METADATA module does not support Conda. Please use Docker / Singularity / Podman instead." + } + container "genomehubs/blobtoolkit:4.1.5" + + input: + tuple val(meta), path(yaml) + + output: + tuple val(meta), path("*.metadata.yaml"), emit: yaml + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + btk pipeline add-summary-to-metadata \\ + --config ${yaml} \\ + --out ${prefix}.metadata.yaml + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + blobtoolkit: \$(btk --version | cut -d' ' -f2 | sed 's/v//') + END_VERSIONS + """ +} diff --git a/modules/local/blobtoolkit/summary.nf b/modules/local/blobtoolkit/summary.nf new file mode 100644 index 00000000..d1059d8a --- /dev/null +++ b/modules/local/blobtoolkit/summary.nf @@ -0,0 +1,33 @@ +process BLOBTOOLKIT_SUMMARY { + tag "$meta.id" + label 'process_single' + + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + exit 1, "BLOBTOOLKIT_SUMMARY module does not support Conda. Please use Docker / Singularity / Podman instead." + } + container "genomehubs/blobtoolkit:4.1.5" + + input: + tuple val(meta), path(blobdir) + + output: + tuple val(meta), path("*.json"), emit: json + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + blobtools filter \\ + ${args} \\ + --summary ${prefix}.summary.json ${blobdir} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + blobtoolkit: \$(btk --version | cut -d' ' -f2 | sed 's/v//') + END_VERSIONS + """ +} diff --git a/modules/local/blobtoolkit/windowstats.nf b/modules/local/blobtoolkit/windowstats.nf new file mode 100644 index 00000000..0517535f --- /dev/null +++ b/modules/local/blobtoolkit/windowstats.nf @@ -0,0 +1,34 @@ +process BLOBTOOLKIT_WINDOWSTATS { + tag "$meta.id" + label 'process_single' + + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + exit 1, "GET_WINDOW_STATS module does not support Conda. Please use Docker / Singularity / Podman instead." 
+ } + container "genomehubs/blobtoolkit:4.1.5" + + input: + tuple val(meta), path(tsv) + + output: + tuple val(meta), path('*_window_stats*.tsv') , emit: tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + btk pipeline window-stats \\ + --in ${tsv} \\ + $args \\ + --out ${prefix}_window_stats.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + blobtoolkit: \$(btk --version | cut -d' ' -f2 | sed 's/v//') + END_VERSIONS + """ +} diff --git a/modules/local/create_bed.nf b/modules/local/create_bed.nf new file mode 100644 index 00000000..034ab1e6 --- /dev/null +++ b/modules/local/create_bed.nf @@ -0,0 +1,30 @@ +process CREATE_BED { + tag "$meta.id" + label 'process_single' + + conda "conda-forge::gawk=5.1.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gawk:5.1.0' : + 'quay.io/biocontainers/gawk:5.1.0' }" + + input: + tuple val(meta), path(tsv) //path to tsv output from fasta windows + + output: + tuple val(meta), path ('*.bed') , emit: bed + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + cut -f 1,2,3 $tsv | sed '1d' > ${prefix}.bed + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + create_bed: 1.03 + END_VERSIONS + """ +} diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf index b67e4a8d..5798da0e 100644 --- a/modules/local/samplesheet_check.nf +++ b/modules/local/samplesheet_check.nf @@ -1,10 +1,11 @@ process SAMPLESHEET_CHECK { tag "$samplesheet" + label 'process_single' - conda (params.enable_conda ? "conda-forge::python=3.8.3" : null) + conda "conda-forge::python=3.9.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/python:3.8.3' : - 'quay.io/biocontainers/python:3.8.3' }" + 'https://depot.galaxyproject.org/singularity/python:3.9--1' : + 'quay.io/biocontainers/python:3.9--1' }" input: path samplesheet @@ -13,7 +14,10 @@ process SAMPLESHEET_CHECK { path '*.csv' , emit: csv path "versions.yml", emit: versions - script: // This script is bundled with the pipeline, in nf-core/blobtoolkit/bin/ + when: + task.ext.when == null || task.ext.when + + script: // This script is bundled with the pipeline, in sanger-tol/blobtoolkit/bin/ """ check_samplesheet.py \\ $samplesheet \\ @@ -21,6 +25,7 @@ process SAMPLESHEET_CHECK { cat <<-END_VERSIONS > versions.yml "${task.process}": + check_samplesheet.py: \$(check_samplesheet.py --version | cut -d' ' -f2) python: \$(python --version | sed 's/Python //g') END_VERSIONS """ diff --git a/modules/local/windowstats_input.nf b/modules/local/windowstats_input.nf new file mode 100644 index 00000000..f366025d --- /dev/null +++ b/modules/local/windowstats_input.nf @@ -0,0 +1,40 @@ +process WINDOWSTATS_INPUT { + tag "$meta.id" + label 'process_single' + + conda "conda-forge::pandas=1.5.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/pandas:1.5.2': + 'quay.io/biocontainers/pandas:1.5.2' }" + + input: + tuple val(meta), path(freq) + tuple val(meta), path(mononuc) + tuple val(meta), path(mosdepth) + tuple val(meta), path(countbusco) + + output: + tuple val(meta), path("*.tsv"), emit: tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + windowstats_input.py \\ + --freq ${freq} \\ + --mononuc ${mononuc} \\ + --mosdepth ${mosdepth} \\ + --countbusco ${countbusco} \\ + --output ${prefix}.tsv \\ + ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + windowstats_input.py: \$(windowstats_input.py --version | cut -d' ' -f2) + END_VERSIONS + """ +} diff --git a/modules/nf-core/busco/busco.diff b/modules/nf-core/busco/busco.diff new file mode 100644 index 00000000..2aa7184a --- /dev/null +++ b/modules/nf-core/busco/busco.diff @@ -0,0 +1,31 @@ +Changes in module 'nf-core/busco' +--- modules/nf-core/busco/main.nf ++++ modules/nf-core/busco/main.nf +@@ -1,5 +1,5 @@ + process BUSCO { +- tag "$meta.id" ++ tag "${meta.id}_${lineage}" + label 'process_medium' + + conda "bioconda::busco=5.4.3" +@@ -14,11 +14,13 @@ + path config_file // Optional: busco configuration file + + output: +- tuple val(meta), path("*-busco.batch_summary.txt"), emit: batch_summary +- tuple val(meta), path("short_summary.*.txt") , emit: short_summaries_txt, optional: true +- tuple val(meta), path("short_summary.*.json") , emit: short_summaries_json, optional: true +- tuple val(meta), path("*-busco") , emit: busco_dir +- path "versions.yml" , emit: versions ++ tuple val(meta), path("*-busco.batch_summary.txt") , emit: batch_summary ++ tuple val(meta), path("short_summary.*.txt") , emit: short_summaries_txt, optional: true ++ tuple val(meta), path("short_summary.*.json") , emit: short_summaries_json, optional: true ++ tuple val(meta), path("*-busco") , emit: busco_dir ++ tuple val(meta), path("*-busco/*/run_*/full_table.tsv") , emit: full_table, optional: true ++ tuple val(meta), path("*-busco/*/run_*/busco_sequences"), emit: seq_dir, optional: true ++ path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + +************************************************************ diff --git a/modules/nf-core/busco/main.nf b/modules/nf-core/busco/main.nf new file mode 100644 index 00000000..254ee9fd --- /dev/null +++ b/modules/nf-core/busco/main.nf @@ -0,0 +1,86 @@ +process BUSCO { + tag "${meta.id}_${lineage}" + label 'process_medium' + + conda "bioconda::busco=5.4.3" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/busco:5.4.3--pyhdfd78af_0': + 'biocontainers/busco:5.4.3--pyhdfd78af_0' }" + + input: + tuple val(meta), path('tmp_input/*') + val lineage // Required: lineage to check against, "auto" enables --auto-lineage instead + path busco_lineages_path // Recommended: path to busco lineages - downloads if not set + path config_file // Optional: busco configuration file + + output: + tuple val(meta), path("*-busco.batch_summary.txt") , emit: batch_summary + tuple val(meta), path("short_summary.*.txt") , emit: short_summaries_txt, optional: true + tuple val(meta), path("short_summary.*.json") , emit: short_summaries_json, optional: true + tuple val(meta), path("*-busco") , emit: busco_dir + tuple val(meta), path("*-busco/*/run_*/full_table.tsv") , emit: full_table, optional: true + tuple val(meta), path("*-busco/*/run_*/busco_sequences"), emit: seq_dir, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}-${lineage}" + def busco_config = config_file ? "--config $config_file" : '' + def busco_lineage = lineage.equals('auto') ? '--auto-lineage' : "--lineage_dataset ${lineage}" + def busco_lineage_dir = busco_lineages_path ? "--offline --download_path ${busco_lineages_path}" : '' + """ + # Nextflow changes the container --entrypoint to /bin/bash (container default entrypoint: /usr/local/env-execute) + # Check for container variable initialisation script and source it. + if [ -f "/usr/local/env-activate.sh" ]; then + set +u # Otherwise, errors out because of various unbound variables + . "/usr/local/env-activate.sh" + set -u + fi + + # If the augustus config directory is not writable, then copy to writeable area + if [ ! -w "\${AUGUSTUS_CONFIG_PATH}" ]; then + # Create writable tmp directory for augustus + AUG_CONF_DIR=\$( mktemp -d -p \$PWD ) + cp -r \$AUGUSTUS_CONFIG_PATH/* \$AUG_CONF_DIR + export AUGUSTUS_CONFIG_PATH=\$AUG_CONF_DIR + echo "New AUGUSTUS_CONFIG_PATH=\${AUGUSTUS_CONFIG_PATH}" + fi + + # Ensure the input is uncompressed + INPUT_SEQS=input_seqs + mkdir "\$INPUT_SEQS" + cd "\$INPUT_SEQS" + for FASTA in ../tmp_input/*; do + if [ "\${FASTA##*.}" == 'gz' ]; then + gzip -cdf "\$FASTA" > \$( basename "\$FASTA" .gz ) + else + ln -s "\$FASTA" . + fi + done + cd .. + + busco \\ + --cpu $task.cpus \\ + --in "\$INPUT_SEQS" \\ + --out ${prefix}-busco \\ + $busco_lineage \\ + $busco_lineage_dir \\ + $busco_config \\ + $args + + # clean up + rm -rf "\$INPUT_SEQS" + + # Move files to avoid staging/publishing issues + mv ${prefix}-busco/batch_summary.txt ${prefix}-busco.batch_summary.txt + mv ${prefix}-busco/*/short_summary.*.{json,txt} . || echo "Short summaries were not available: No genes were found." 
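+    # Illustrative note: in batch mode BUSCO writes one sub-directory per input
+    # sequence file, so the globs above collect summaries for every staged FASTA;
+    # the "|| echo" keeps the task from failing when a lineage yields no genes.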
+ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + busco: \$( busco --version 2>&1 | sed 's/^BUSCO //' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/busco/meta.yml b/modules/nf-core/busco/meta.yml new file mode 100644 index 00000000..ef8c5245 --- /dev/null +++ b/modules/nf-core/busco/meta.yml @@ -0,0 +1,69 @@ +name: busco +description: Benchmarking Universal Single Copy Orthologs +keywords: + - quality control + - genome + - transcriptome + - proteome +tools: + - busco: + description: BUSCO provides measures for quantitative assessment of genome assembly, gene set, and transcriptome completeness based on evolutionarily informed expectations of gene content from near-universal single-copy orthologs selected from OrthoDB. + homepage: https://busco.ezlab.org/ + documentation: https://busco.ezlab.org/busco_userguide.html + tool_dev_url: https://gitlab.com/ezlab/busco + doi: "10.1007/978-1-4939-9173-0_14" + licence: ["MIT"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Nucleic or amino acid sequence file in FASTA format. + pattern: "*.{fasta,fna,fa,fasta.gz,fna.gz,fa.gz}" + - lineage: + type: value + description: The BUSCO lineage to use, or "auto" to automatically select lineage + - busco_lineages_path: + type: directory + description: Path to local BUSCO lineages directory. + - config_file: + type: file + description: Path to BUSCO config file. + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - batch_summary: + type: file + description: Summary of all sequence files analyzed + pattern: "*-busco.batch_summary.txt" + - short_summaries_txt: + type: file + description: Short Busco summary in plain text format + pattern: "short_summary.*.txt" + - short_summaries_json: + type: file + description: Short Busco summary in JSON format + pattern: "short_summary.*.json" + - busco_dir: + type: directory + description: BUSCO lineage specific output + pattern: "*-busco" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@priyanka-surana" + - "@charles-plessy" + - "@mahesh-panchal" + - "@muffato" + - "@jvhagey" diff --git a/modules/nf-core/modules/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf similarity index 79% rename from modules/nf-core/modules/custom/dumpsoftwareversions/main.nf rename to modules/nf-core/custom/dumpsoftwareversions/main.nf index 327d5100..ebc87273 100644 --- a/modules/nf-core/modules/custom/dumpsoftwareversions/main.nf +++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf @@ -1,11 +1,11 @@ process CUSTOM_DUMPSOFTWAREVERSIONS { - label 'process_low' + label 'process_single' // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container - conda (params.enable_conda ? "bioconda::multiqc=1.11" : null) + conda "bioconda::multiqc=1.14" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
-        'https://depot.galaxyproject.org/singularity/multiqc:1.11--pyhdfd78af_0' :
-        'quay.io/biocontainers/multiqc:1.11--pyhdfd78af_0' }"
+        'https://depot.galaxyproject.org/singularity/multiqc:1.14--pyhdfd78af_0' :
+        'biocontainers/multiqc:1.14--pyhdfd78af_0' }"

     input:
     path versions
diff --git a/modules/nf-core/modules/custom/dumpsoftwareversions/meta.yml b/modules/nf-core/custom/dumpsoftwareversions/meta.yml
similarity index 88%
rename from modules/nf-core/modules/custom/dumpsoftwareversions/meta.yml
rename to modules/nf-core/custom/dumpsoftwareversions/meta.yml
index 60b546a0..c32657de 100644
--- a/modules/nf-core/modules/custom/dumpsoftwareversions/meta.yml
+++ b/modules/nf-core/custom/dumpsoftwareversions/meta.yml
@@ -1,7 +1,9 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json
 name: custom_dumpsoftwareversions
 description: Custom module used to dump software versions within the nf-core pipeline template
 keywords:
   - custom
+  - dump
   - version
 tools:
   - custom:
diff --git a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py
new file mode 100755
index 00000000..da033408
--- /dev/null
+++ b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python
+
+
+"""Provide functions to merge multiple versions.yml files."""
+
+
+import yaml
+import platform
+from textwrap import dedent
+
+
+def _make_versions_html(versions):
+    """Generate a tabular HTML output of all versions for MultiQC."""
+    html = [
+        dedent(
+            """\\
+            <style>
+            #nf-core-versions tbody:nth-child(even) {
+                background-color: #f2f2f2;
+            }
+            </style>
+            <table class="table" style="width:100%" id="nf-core-versions">
+                <thead>
+                    <tr>
+                        <th> Process Name </th>
+                        <th> Software </th>
+                        <th> Version </th>
+                    </tr>
+                </thead>
+            """
+        )
+    ]
+    for process, tmp_versions in sorted(versions.items()):
+        html.append("<tbody>")
+        for i, (tool, version) in enumerate(sorted(tmp_versions.items())):
+            html.append(
+                dedent(
+                    f"""\\
+                    <tr>
+                        <td><samp>{process if (i == 0) else ''}</samp></td>
+                        <td><samp>{tool}</samp></td>
+                        <td><samp>{version}</samp></td>
+                    </tr>
+                    """
+                )
+            )
+        html.append("</tbody>")
+    html.append("</table>")
+    return "\\n".join(html)
+
+
+def main():
+    """Load all version files and generate merged output."""
+    versions_this_module = {}
+    versions_this_module["${task.process}"] = {
+        "python": platform.python_version(),
+        "yaml": yaml.__version__,
+    }
+
+    with open("$versions") as f:
+        versions_by_process = yaml.load(f, Loader=yaml.BaseLoader) | versions_this_module
+
+    # aggregate versions by the module name (derived from fully-qualified process name)
+    versions_by_module = {}
+    for process, process_versions in versions_by_process.items():
+        module = process.split(":")[-1]
+        try:
+            if versions_by_module[module] != process_versions:
+                raise AssertionError(
+                    "We assume that software versions are the same between all modules. "
+                    "If you see this error-message it means you discovered an edge-case "
+                    "and should open an issue in nf-core/tools. "
+                )
+        except KeyError:
+            versions_by_module[module] = process_versions
+
+    versions_by_module["Workflow"] = {
+        "Nextflow": "$workflow.nextflow.version",
+        "$workflow.manifest.name": "$workflow.manifest.version",
+    }
+
+    versions_mqc = {
+        "id": "software_versions",
+        "section_name": "${workflow.manifest.name} Software Versions",
+        "section_href": "https://github.com/${workflow.manifest.name}",
+        "plot_type": "html",
+        "description": "are collected at run time from the software output.",
+        "data": _make_versions_html(versions_by_module),
+    }
+
+    with open("software_versions.yml", "w") as f:
+        yaml.dump(versions_by_module, f, default_flow_style=False)
+    with open("software_versions_mqc.yml", "w") as f:
+        yaml.dump(versions_mqc, f, default_flow_style=False)
+
+    with open("versions.yml", "w") as f:
+        yaml.dump(versions_this_module, f, default_flow_style=False)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/modules/nf-core/diamond/blastp/main.nf b/modules/nf-core/diamond/blastp/main.nf
new file mode 100644
index 00000000..02af8886
--- /dev/null
+++ b/modules/nf-core/diamond/blastp/main.nf
@@ -0,0 +1,64 @@
+process DIAMOND_BLASTP {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "bioconda::diamond=2.0.15"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/diamond:2.0.15--hb97b32f_0' :
+        'biocontainers/diamond:2.0.15--hb97b32f_0' }"
+
+    input:
+    tuple val(meta), path(fasta)
+    path db
+    val out_ext
+    val blast_columns
+
+    output:
+    tuple val(meta), path('*.blast'), optional: true, emit: blast
+    tuple val(meta), path('*.xml')  , optional: true, emit: xml
+    tuple val(meta), path('*.txt')  , optional: true, emit: txt
+    tuple val(meta), path('*.daa')  , optional: true, emit: daa
+    tuple val(meta), path('*.sam')  , optional: true, emit: sam
+    tuple val(meta), path('*.tsv')  , optional: true, emit: tsv
+    tuple val(meta), path('*.paf')  , optional: true, emit: paf
+    path "versions.yml"             , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def columns = blast_columns ? "${blast_columns}" : ''
"${blast_columns}" : '' + switch ( out_ext ) { + case "blast": outfmt = 0; break + case "xml": outfmt = 5; break + case "txt": outfmt = 6; break + case "daa": outfmt = 100; break + case "sam": outfmt = 101; break + case "tsv": outfmt = 102; break + case "paf": outfmt = 103; break + default: + outfmt = '6'; + out_ext = 'txt'; + log.warn("Unknown output file format provided (${out_ext}): selecting DIAMOND default of tabular BLAST output (txt)"); + break + } + """ + DB=`find -L ./ -name "*.dmnd" | sed 's/\\.dmnd\$//'` + + diamond \\ + blastp \\ + --threads $task.cpus \\ + --db \$DB \\ + --query $fasta \\ + --outfmt ${outfmt} ${columns} \\ + $args \\ + --out ${prefix}.${out_ext} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + diamond: \$(diamond --version 2>&1 | tail -n 1 | sed 's/^diamond version //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/diamond/blastp/meta.yml b/modules/nf-core/diamond/blastp/meta.yml new file mode 100644 index 00000000..5bf35791 --- /dev/null +++ b/modules/nf-core/diamond/blastp/meta.yml @@ -0,0 +1,82 @@ +name: diamond_blastp +description: Queries a DIAMOND database using blastp mode +keywords: + - fasta + - diamond + - blastp + - DNA sequence +tools: + - diamond: + description: Accelerated BLAST compatible local sequence aligner + homepage: https://github.com/bbuchfink/diamond + documentation: https://github.com/bbuchfink/diamond/wiki + tool_dev_url: https://github.com/bbuchfink/diamond + doi: "10.1038/s41592-021-01101-x" + licence: ["GPL v3.0"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Input fasta file containing query sequences + pattern: "*.{fa,fasta}" + - db: + type: directory + description: Directory containing the protein blast database + pattern: "*" + - out_ext: + type: string + description: | + Specify the type of output file to be generated. `blast` corresponds to + BLAST pairwise format. `xml` corresponds to BLAST xml format. + `txt` corresponds to to BLAST tabular format. `tsv` corresponds to + taxonomic classification format. + pattern: "blast|xml|txt|daa|sam|tsv|paf" + - blast_columns: + type: string + description: | + Optional space separated list of DIAMOND tabular BLAST output keywords + used for in conjunction with the 'txt' out_ext option (--outfmt 6). See + DIAMOND documnetation for more information. + +output: + - blast: + type: file + description: File containing blastp hits + pattern: "*.{blast}" + - xml: + type: file + description: File containing blastp hits + pattern: "*.{xml}" + - txt: + type: file + description: File containing hits in tabular BLAST format. 
+ pattern: "*.{txt}" + - daa: + type: file + description: File containing hits DAA format + pattern: "*.{daa}" + - sam: + type: file + description: File containing aligned reads in SAM format + pattern: "*.{sam}" + - tsv: + type: file + description: Tab separated file containing taxonomic classification of hits + pattern: "*.{tsv}" + - paf: + type: file + description: File containing aligned reads in pairwise mapping format format + pattern: "*.{paf}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@spficklin" + - "@jfy133" diff --git a/modules/nf-core/fastawindows/main.nf b/modules/nf-core/fastawindows/main.nf new file mode 100644 index 00000000..c65a051e --- /dev/null +++ b/modules/nf-core/fastawindows/main.nf @@ -0,0 +1,40 @@ +process FASTAWINDOWS { + tag "$meta.id" + label 'process_low' + + conda "bioconda::fasta_windows=0.2.4" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/fasta_windows:0.2.4--hec16e2b_0': + 'biocontainers/fasta_windows:0.2.4--hec16e2b_0' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path("fw_out/*_freq_windows.tsv") , emit: freq + tuple val(meta), path("fw_out/*_mononuc_windows.tsv") , emit: mononuc + tuple val(meta), path("fw_out/*_dinuc_windows.tsv") , emit: dinuc + tuple val(meta), path("fw_out/*_trinuc_windows.tsv") , emit: trinuc + tuple val(meta), path("fw_out/*_tetranuc_windows.tsv"), emit: tetranuc + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + rm -rf fw_out + env RAYON_NUM_THREADS=$task.cpus \\ + fasta_windows \\ + $args \\ + --fasta $fasta \\ + --output ${prefix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fasta_windows: \$(fasta_windows --version | cut -d' ' -f3) + END_VERSIONS + """ +} diff --git a/modules/nf-core/fastawindows/meta.yml b/modules/nf-core/fastawindows/meta.yml new file mode 100644 index 00000000..9342af96 --- /dev/null +++ b/modules/nf-core/fastawindows/meta.yml @@ -0,0 +1,57 @@ +name: "fastawindows" +description: Quickly compute statistics over a fasta file in windows. +keywords: + - genome + - fasta + - tsv + - bed +tools: + - "fastawindows": + description: "fasta_windows is a tool written for Darwin Tree of Life chromosomal level genome assemblies. The executable takes a fasta formatted file and calculates some statistics of interest in windows" + homepage: "https://github.com/tolkit/fasta_windows" + + licence: "['MIT']" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: FASTA file + pattern: "*.{fa,fasta,fna}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - freq:
+      type: file
+      description: TSV file with frequencies and statistics
+      pattern: "*.{tsv}"
+  - mononuc:
+      type: file
+      description: TSV file with mononucleotide counts
+      pattern: "*.{tsv}"
+  - dinuc:
+      type: file
+      description: TSV file with dinucleotide counts
+      pattern: "*.{tsv}"
+  - trinuc:
+      type: file
+      description: TSV file with trinucleotide counts
+      pattern: "*.{tsv}"
+  - tetranuc:
+      type: file
+      description: TSV file with tetranucleotide counts
+      pattern: "*.{tsv}"
+authors:
+  - "@muffato"
diff --git a/modules/nf-core/goat/taxonsearch/main.nf b/modules/nf-core/goat/taxonsearch/main.nf
new file mode 100644
index 00000000..1b0e8ba3
--- /dev/null
+++ b/modules/nf-core/goat/taxonsearch/main.nf
@@ -0,0 +1,36 @@
+process GOAT_TAXONSEARCH {
+    tag "$meta.id"
+    label 'process_single'
+
+    conda "bioconda::goat=0.2.0"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/goat:0.2.0--h92d785c_0':
+        'biocontainers/goat:0.2.0--h92d785c_0' }"
+
+    input:
+    tuple val(meta), val(taxon), path(taxa_file)
+
+    output:
+    tuple val(meta), path("*.tsv"), emit: taxonsearch
+    path "versions.yml"           , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    input = taxa_file ? "-f ${taxa_file}" : "-t \"${taxon}\""
+    if (!taxon && !taxa_file) error "No input. Valid input: single taxon identifier or a .txt file with identifiers"
+    if (taxon && taxa_file ) error "Only one input is required: a single taxon identifier or a .txt file with identifiers"
+    """
+    goat-cli taxon search \\
+        $args \\
+        $input > ${prefix}.tsv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        goat: \$(goat-cli --version | cut -d' ' -f2)
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/goat/taxonsearch/meta.yml b/modules/nf-core/goat/taxonsearch/meta.yml
new file mode 100644
index 00000000..06c374f0
--- /dev/null
+++ b/modules/nf-core/goat/taxonsearch/meta.yml
@@ -0,0 +1,51 @@
+name: "goat_taxonsearch"
+description: Query metadata for any taxon across the tree of life.
+keywords:
+  - public datasets
+  - ncbi
+  - genomes on a tree
+tools:
+  - goat:
+      description: |
+        goat-cli is a command line interface to query the
+        Genomes on a Tree Open API.
+      homepage: https://github.com/genomehubs/goat-cli
+      documentation: https://github.com/genomehubs/goat-cli/wiki
+      tool_dev_url: https://genomehubs.github.io/goat-cli/goat_cli/
+
+      licence: ["MIT"]
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test']
+  - taxon:
+      type: val
+      description: |
+        The taxon to search. An NCBI taxon ID, or the name of a taxon at any rank.
+  - taxa_file:
+      type: file
+      description: |
+        A file of NCBI taxonomy IDs (tips) and/or binomial names. Each line
+        should contain a single entry. File size is limited to 500 entries.
+      pattern: "*.txt"
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - taxonsearch:
+      type: file
+      description: TSV file containing search results.
+ pattern: "*.tsv" + +authors: + - "@alxndrdiaz" diff --git a/modules/nf-core/gunzip/main.nf b/modules/nf-core/gunzip/main.nf new file mode 100644 index 00000000..e7189d2f --- /dev/null +++ b/modules/nf-core/gunzip/main.nf @@ -0,0 +1,44 @@ +process GUNZIP { + tag "$archive" + label 'process_single' + + conda "conda-forge::sed=4.7" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + tuple val(meta), path(archive) + + output: + tuple val(meta), path("$gunzip"), emit: gunzip + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + gunzip = archive.toString() - '.gz' + """ + gunzip \\ + -f \\ + $args \\ + $archive + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gunzip: \$(echo \$(gunzip --version 2>&1) | sed 's/^.*(gzip) //; s/ Copyright.*\$//') + END_VERSIONS + """ + + stub: + gunzip = archive.toString() - '.gz' + """ + touch $gunzip + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gunzip: \$(echo \$(gunzip --version 2>&1) | sed 's/^.*(gzip) //; s/ Copyright.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gunzip/meta.yml b/modules/nf-core/gunzip/meta.yml new file mode 100644 index 00000000..4cdcdf4c --- /dev/null +++ b/modules/nf-core/gunzip/meta.yml @@ -0,0 +1,35 @@ +name: gunzip +description: Compresses and decompresses files. +keywords: + - gunzip + - compression + - decompression +tools: + - gunzip: + description: | + gzip is a file format and a software application used for file compression and decompression. + documentation: https://www.gnu.org/software/gzip/manual/gzip.html + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Optional groovy Map containing meta information + e.g. [ id:'test', single_end:false ] + - archive: + type: file + description: File to be compressed/uncompressed + pattern: "*.*" +output: + - gunzip: + type: file + description: Compressed/uncompressed file + pattern: "*.*" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" + - "@jfy133" diff --git a/modules/nf-core/modules/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py b/modules/nf-core/modules/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py deleted file mode 100644 index d1390392..00000000 --- a/modules/nf-core/modules/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env python - -import yaml -import platform -from textwrap import dedent - - -def _make_versions_html(versions): - html = [ - dedent( - """\\ - - - - - - - - - - """ - ) - ] - for process, tmp_versions in sorted(versions.items()): - html.append("") - for i, (tool, version) in enumerate(sorted(tmp_versions.items())): - html.append( - dedent( - f"""\\ - - - - - - """ - ) - ) - html.append("") - html.append("
    Process Name Software Version
    {process if (i == 0) else ''}{tool}{version}
    ") - return "\\n".join(html) - - -versions_this_module = {} -versions_this_module["${task.process}"] = { - "python": platform.python_version(), - "yaml": yaml.__version__, -} - -with open("$versions") as f: - versions_by_process = yaml.load(f, Loader=yaml.BaseLoader) | versions_this_module - -# aggregate versions by the module name (derived from fully-qualified process name) -versions_by_module = {} -for process, process_versions in versions_by_process.items(): - module = process.split(":")[-1] - try: - assert versions_by_module[module] == process_versions, ( - "We assume that software versions are the same between all modules. " - "If you see this error-message it means you discovered an edge-case " - "and should open an issue in nf-core/tools. " - ) - except KeyError: - versions_by_module[module] = process_versions - -versions_by_module["Workflow"] = { - "Nextflow": "$workflow.nextflow.version", - "$workflow.manifest.name": "$workflow.manifest.version", -} - -versions_mqc = { - "id": "software_versions", - "section_name": "${workflow.manifest.name} Software Versions", - "section_href": "https://github.com/${workflow.manifest.name}", - "plot_type": "html", - "description": "are collected at run time from the software output.", - "data": _make_versions_html(versions_by_module), -} - -with open("software_versions.yml", "w") as f: - yaml.dump(versions_by_module, f, default_flow_style=False) -with open("software_versions_mqc.yml", "w") as f: - yaml.dump(versions_mqc, f, default_flow_style=False) - -with open("versions.yml", "w") as f: - yaml.dump(versions_this_module, f, default_flow_style=False) diff --git a/modules/nf-core/mosdepth/main.nf b/modules/nf-core/mosdepth/main.nf new file mode 100644 index 00000000..c17e4e65 --- /dev/null +++ b/modules/nf-core/mosdepth/main.nf @@ -0,0 +1,80 @@ +process MOSDEPTH { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::mosdepth=0.3.3" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mosdepth:0.3.3--hdfd78af_1' : + 'biocontainers/mosdepth:0.3.3--hdfd78af_1'}" + + input: + tuple val(meta), path(bam), path(bai), path(bed) + tuple val(meta2), path(fasta) + + output: + tuple val(meta), path('*.global.dist.txt') , emit: global_txt + tuple val(meta), path('*.summary.txt') , emit: summary_txt + tuple val(meta), path('*.region.dist.txt') , optional:true, emit: regions_txt + tuple val(meta), path('*.per-base.d4') , optional:true, emit: per_base_d4 + tuple val(meta), path('*.per-base.bed.gz') , optional:true, emit: per_base_bed + tuple val(meta), path('*.per-base.bed.gz.csi') , optional:true, emit: per_base_csi + tuple val(meta), path('*.regions.bed.gz') , optional:true, emit: regions_bed + tuple val(meta), path('*.regions.bed.gz.csi') , optional:true, emit: regions_csi + tuple val(meta), path('*.quantized.bed.gz') , optional:true, emit: quantized_bed + tuple val(meta), path('*.quantized.bed.gz.csi') , optional:true, emit: quantized_csi + tuple val(meta), path('*.thresholds.bed.gz') , optional:true, emit: thresholds_bed + tuple val(meta), path('*.thresholds.bed.gz.csi'), optional:true, emit: thresholds_csi + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reference = fasta ? "--fasta ${fasta}" : "" + def interval = bed ? 
"--by ${bed}" : "" + if (bed && args.contains("--by")) { + exit 1, "'--by' can only be specified once when running mosdepth! Either remove input BED file definition or remove '--by' from 'ext.args' definition" + } + if (!bed && args.contains("--thresholds")) { + exit 1, "'--thresholds' can only be specified in conjunction with '--by'" + } + + """ + mosdepth \\ + --threads $task.cpus \\ + $interval \\ + $reference \\ + $args \\ + $prefix \\ + $bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + mosdepth: \$(mosdepth --version 2>&1 | sed 's/^.*mosdepth //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.global.dist.txt + touch ${prefix}.region.dist.txt + touch ${prefix}.summary.txt + touch ${prefix}.per-base.d4 + touch ${prefix}.per-base.bed.gz + touch ${prefix}.per-base.bed.gz.csi + touch ${prefix}.regions.bed.gz + touch ${prefix}.regions.bed.gz.csi + touch ${prefix}.quantized.bed.gz + touch ${prefix}.quantized.bed.gz.csi + touch ${prefix}.thresholds.bed.gz + touch ${prefix}.thresholds.bed.gz.csi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + mosdepth: \$(mosdepth --version 2>&1 | sed 's/^.*mosdepth //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/mosdepth/meta.yml b/modules/nf-core/mosdepth/meta.yml new file mode 100644 index 00000000..adf3893f --- /dev/null +++ b/modules/nf-core/mosdepth/meta.yml @@ -0,0 +1,109 @@ +name: mosdepth +description: Calculates genome-wide sequencing coverage. +keywords: + - mosdepth + - bam + - cram + - coverage +tools: + - mosdepth: + description: | + Fast BAM/CRAM depth calculation for WGS, exome, or targeted sequencing. + documentation: https://github.com/brentp/mosdepth + doi: 10.1093/bioinformatics/btx699 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: Input BAM/CRAM file + pattern: "*.{bam,cram}" + - bai: + type: file + description: Index for BAM/CRAM file + pattern: "*.{bai,crai}" + - meta2: + type: map + description: | + Groovy Map containing bed information + e.g. [ id:'test' ] + - bed: + type: file + description: BED file with intersected intervals + pattern: "*.{bed}" + - meta3: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - fasta: + type: file + description: Reference genome FASTA file + pattern: "*.{fa,fasta}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - global_txt: + type: file + description: Text file with global cumulative coverage distribution + pattern: "*.{global.dist.txt}" + - regions_txt: + type: file + description: Text file with region cumulative coverage distribution + pattern: "*.{region.dist.txt}" + - summary_txt: + type: file + description: Text file with summary mean depths per chromosome and regions + pattern: "*.{summary.txt}" + - per_base_bed: + type: file + description: BED file with per-base coverage + pattern: "*.{per-base.bed.gz}" + - per_base_csi: + type: file + description: Index file for BED file with per-base coverage + pattern: "*.{per-base.bed.gz.csi}" + - per_base_d4: + type: file + description: D4 file with per-base coverage + pattern: "*.{per-base.d4}" + - regions_bed: + type: file + description: BED file with per-region coverage + pattern: "*.{regions.bed.gz}" + - regions_csi: + type: file + description: Index file for BED file with per-region coverage + pattern: "*.{regions.bed.gz.csi}" + - quantized_bed: + type: file + description: BED file with binned coverage + pattern: "*.{quantized.bed.gz}" + - quantized_csi: + type: file + description: Index file for BED file with binned coverage + pattern: "*.{quantized.bed.gz.csi}" + - thresholds_bed: + type: file + description: BED file with the number of bases in each region that are covered at or above each threshold + pattern: "*.{thresholds.bed.gz}" + - thresholds_csi: + type: file + description: Index file for BED file with threshold coverage + pattern: "*.{thresholds.bed.gz.csi}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" + - "@ramprasadn" + - "@matthdsm" diff --git a/modules/nf-core/modules/multiqc/main.nf b/modules/nf-core/multiqc/main.nf similarity index 77% rename from modules/nf-core/modules/multiqc/main.nf rename to modules/nf-core/multiqc/main.nf index 1e7d6afe..1fc387be 100644 --- a/modules/nf-core/modules/multiqc/main.nf +++ b/modules/nf-core/multiqc/main.nf @@ -1,14 +1,16 @@ process MULTIQC { - label 'process_medium' + label 'process_single' - conda (params.enable_conda ? 'bioconda::multiqc=1.13a' : null) + conda "bioconda::multiqc=1.14" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.13a--pyhdfd78af_1' : - 'quay.io/biocontainers/multiqc:1.13a--pyhdfd78af_1' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.14--pyhdfd78af_0' : + 'biocontainers/multiqc:1.14--pyhdfd78af_0' }" input: path multiqc_files, stageAs: "?/*" - tuple path(multiqc_config), path(multiqc_logo) + path(multiqc_config) + path(extra_multiqc_config) + path(multiqc_logo) output: path "*multiqc_report.html", emit: report @@ -22,11 +24,13 @@ process MULTIQC { script: def args = task.ext.args ?: '' def config = multiqc_config ? "--config $multiqc_config" : '' + def extra_config = extra_multiqc_config ? "--config $extra_multiqc_config" : '' """ multiqc \\ --force \\ - $config \\ $args \\ + $config \\ + $extra_config \\ . 
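+    # MultiQC applies --config files in the order given, with later ones taking
+    # precedence, so $extra_config can override sections of the base $config
+    # (see the updated meta.yml below).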
cat <<-END_VERSIONS > versions.yml diff --git a/modules/nf-core/modules/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml similarity index 74% rename from modules/nf-core/modules/multiqc/meta.yml rename to modules/nf-core/multiqc/meta.yml index bf3a27fe..f93b5ee5 100644 --- a/modules/nf-core/modules/multiqc/meta.yml +++ b/modules/nf-core/multiqc/meta.yml @@ -1,3 +1,4 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json name: MultiQC description: Aggregate results from bioinformatics analyses across many samples into a single report keywords: @@ -12,6 +13,7 @@ tools: homepage: https://multiqc.info/ documentation: https://multiqc.info/docs/ licence: ["GPL-3.0-or-later"] + input: - multiqc_files: type: file @@ -19,19 +21,24 @@ input: List of reports / files recognised by MultiQC, for example the html and zip output of FastQC - multiqc_config: type: file - description: Config yml for MultiQC + description: Optional config yml for MultiQC + pattern: "*.{yml,yaml}" + - extra_multiqc_config: + type: file + description: Second optional config yml for MultiQC. Will override common sections in multiqc_config. pattern: "*.{yml,yaml}" - multiqc_logo: type: file - description: Logo file for MultiQC + description: Optional logo file for MultiQC pattern: "*.{png}" + output: - report: type: file description: MultiQC report file pattern: "multiqc_report.html" - data: - type: dir + type: directory description: MultiQC data dir pattern: "multiqc_data" - plots: @@ -46,3 +53,4 @@ authors: - "@abhi18av" - "@bunop" - "@drpatelh" + - "@jfy133" diff --git a/modules/nf-core/samtools/view/main.nf b/modules/nf-core/samtools/view/main.nf new file mode 100644 index 00000000..b87369e5 --- /dev/null +++ b/modules/nf-core/samtools/view/main.nf @@ -0,0 +1,66 @@ +process SAMTOOLS_VIEW { + tag "$meta.id" + label 'process_low' + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input), path(index) + path fasta + path qname + + output: + tuple val(meta), path("*.bam"), emit: bam, optional: true + tuple val(meta), path("*.cram"), emit: cram, optional: true + tuple val(meta), path("*.sam"), emit: sam, optional: true + tuple val(meta), path("*.bai"), emit: bai, optional: true + tuple val(meta), path("*.csi"), emit: csi, optional: true + tuple val(meta), path("*.crai"), emit: crai, optional: true + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reference = fasta ? "--reference ${fasta}" : "" + def readnames = qname ? "--qname-file ${qname}": "" + def file_type = args.contains("--output-fmt sam") ? "sam" : + args.contains("--output-fmt bam") ? "bam" : + args.contains("--output-fmt cram") ? "cram" : + input.getExtension() + if ("$input" == "${prefix}.${file_type}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
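+    // file_type resolution, as sketched above: an explicit "--output-fmt" in
+    // ext.args wins, otherwise the output keeps the input's extension, so the
+    // module only converts formats when asked to; the guard then stops the
+    // output from overwriting its own input.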
+ """ + samtools \\ + view \\ + --threads ${task.cpus-1} \\ + ${reference} \\ + ${readnames} \\ + $args \\ + -o ${prefix}.${file_type} \\ + $input \\ + $args2 + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bam + touch ${prefix}.cram + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/view/meta.yml b/modules/nf-core/samtools/view/meta.yml new file mode 100644 index 00000000..76916033 --- /dev/null +++ b/modules/nf-core/samtools/view/meta.yml @@ -0,0 +1,79 @@ +name: samtools_view +description: filter/convert SAM/BAM/CRAM file +keywords: + - view + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - index: + type: optional file + description: BAM.BAI/BAM.CSI/CRAM.CRAI file + pattern: "*.{.bai,.csi,.crai}" + - fasta: + type: optional file + description: Reference file the CRAM was created with + pattern: "*.{fasta,fa}" + - qname: + type: file + description: Optional file with read names to output only select alignments + pattern: "*.{txt,list}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ]
+  - bam:
+      type: file
+      description: optional filtered/converted BAM file
+      pattern: "*.{bam}"
+  - cram:
+      type: file
+      description: optional filtered/converted CRAM file
+      pattern: "*.{cram}"
+  - sam:
+      type: file
+      description: optional filtered/converted SAM file
+      pattern: "*.{sam}"
+  # bai, csi, and crai are created with `--write-index`
+  - bai:
+      type: file
+      description: optional BAM file index
+      pattern: "*.{bai}"
+  - csi:
+      type: file
+      description: optional tabix BAM file index
+      pattern: "*.{csi}"
+  - crai:
+      type: file
+      description: optional CRAM file index
+      pattern: "*.{crai}"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@drpatelh"
+  - "@joseespinosa"
+  - "@FriederikeHanssen"
+  - "@priyanka-surana"
diff --git a/nextflow.config b/nextflow.config
index d471fa70..fa9e2065 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -1,6 +1,6 @@
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    nf-core/blobtoolkit Nextflow config file
+    sanger-tol/blobtoolkit Nextflow config file
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     Default config options for all compute environments
 ----------------------------------------------------------------------------------------
@@ -9,33 +9,45 @@
 // Global default params, used in configs
 params {

-    // TODO nf-core: Specify your pipeline's command line flags
+    // Specify your pipeline's command line flags
     // Input options
     input                       = null
+    yaml                        = null

-    // References
-    genome                      = null
-    igenomes_base               = 's3://ngi-igenomes/igenomes'
-    igenomes_ignore             = false
+    // Reference options
+    fasta                       = null
+    accession                   = null
+    taxon                       = null
+    taxa_file                   = null
+
+    // Databases and related options
+    taxdump                     = null
+    busco                       = null
+    uniprot                     = null
+    blastp_outext               = 'txt'
+    blastp_cols                 = 'qseqid staxids bitscore qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore'

     // MultiQC options
     multiqc_config              = null
     multiqc_title               = null
+    multiqc_logo                = null
     max_multiqc_email_size      = '25.MB'
+    multiqc_methods_description = null

     // Boilerplate options
-    outdir                      = null
-    tracedir                    = "${params.outdir}/pipeline_info"
+    outdir                      = 'results'
+    tracedir                    = "${params.outdir}/blobtoolkit_info"
     publish_dir_mode            = 'copy'
     email                       = null
     email_on_fail               = null
     plaintext_email             = false
     monochrome_logs             = false
+    hook_url                    = null
     help                        = false
+    version                     = false
     validate_params             = true
     show_hidden_params          = false
     schema_ignore_params        = 'genomes'
-    enable_conda                = false

     // Config options
     custom_config_version       = 'master'
@@ -63,7 +75,7 @@ try {
     System.err.println("WARNING: Could not load nf-core/config profiles: ${params.custom_config_base}/nfcore_custom.config")
 }

-// Load nf-core/blobtoolkit custom profiles from different institutions.
+// Load sanger-tol/blobtoolkit custom profiles from different institutions.
 // Warning: Uncomment only if a pipeline-specific institutional config already exists on nf-core/configs!
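// For reference, a minimal invocation under the defaults above might look like
// this (hypothetical paths and identifiers; see nextflow_schema.json for the full
// parameter list):
//   nextflow run sanger-tol/blobtoolkit -profile docker \
//     --input samplesheet.csv --fasta genome.fa.gz \
//     --accession GCA_XXXXXXXXX.X --taxon <NCBI_taxon_ID> \
//     --taxdump /path/to/taxdump --uniprot /path/to/buscogenes.dmnd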
// try { // includeConfig "${params.custom_config_base}/pipeline/blobtoolkit.config" @@ -73,62 +85,102 @@ try { profiles { - debug { process.beforeScript = 'echo $HOSTNAME' } + debug { + dumpHashes = true + process.beforeScript = 'echo $HOSTNAME' + cleanup = false + } conda { - params.enable_conda = true + conda.enabled = true docker.enabled = false singularity.enabled = false podman.enabled = false shifter.enabled = false charliecloud.enabled = false + apptainer.enabled = false + } + mamba { + conda.enabled = true + conda.useMamba = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + apptainer.enabled = false } docker { docker.enabled = true + docker.registry = 'quay.io' docker.userEmulation = true + conda.enabled = false singularity.enabled = false podman.enabled = false shifter.enabled = false charliecloud.enabled = false + apptainer.enabled = false + } + arm { + docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' } singularity { singularity.enabled = true singularity.autoMounts = true + conda.enabled = false docker.enabled = false podman.enabled = false shifter.enabled = false charliecloud.enabled = false + apptainer.enabled = false } podman { podman.enabled = true + podman.registry = 'quay.io' + conda.enabled = false docker.enabled = false singularity.enabled = false shifter.enabled = false charliecloud.enabled = false + apptainer.enabled = false } shifter { shifter.enabled = true + conda.enabled = false docker.enabled = false singularity.enabled = false podman.enabled = false charliecloud.enabled = false + apptainer.enabled = false } charliecloud { charliecloud.enabled = true + conda.enabled = false docker.enabled = false singularity.enabled = false podman.enabled = false shifter.enabled = false + apptainer.enabled = false } + apptainer { + apptainer.enabled = true + conda.enabled = false + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + gitpod { + executor.name = 'local' + executor.cpus = 16 + executor.memory = 60.GB + } + cleanup { cleanup = true } test { includeConfig 'conf/test.config' } test_full { includeConfig 'conf/test_full.config' } } -// Load igenomes.config if required -if (!params.igenomes_ignore) { - includeConfig 'conf/igenomes.config' -} else { - params.genomes = [:] -} + // Export these variables to prevent local Python/R libraries from conflicting with those in the container // The JULIA depot path has been adjusted to a fixed path `/usr/local/share/julia` that needs to be used for packages in the container. @@ -163,13 +215,14 @@ dag { } manifest { - name = 'nf-core/blobtoolkit' - author = 'Alexander Ramos' - homePage = 'https://github.com/nf-core/blobtoolkit' - description = 'BlobToolKit Nextflow Pipeline.' 
+ name = 'sanger-tol/blobtoolkit' + author = """@zb32, @rjchallis, @sujaikumar, @muffato, @gq1, @alxndrdiaz, @priyanka-surana""" + homePage = 'https://github.com/sanger-tol/blobtoolkit' + description = """Quality assessment of genome assemblies""" mainScript = 'main.nf' - nextflowVersion = '!>=21.10.3' - version = '1.0dev' + nextflowVersion = '!>=22.10.1' + version = '0.1.0' + doi = '10.5281/zenodo.XXXXXXX' } // Load modules.config for DSL2 module specific options diff --git a/nextflow_schema.json b/nextflow_schema.json index a6387784..a960bee2 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1,8 +1,8 @@ { "$schema": "http://json-schema.org/draft-07/schema", - "$id": "https://raw.githubusercontent.com/nf-core/blobtoolkit/master/nextflow_schema.json", - "title": "nf-core/blobtoolkit pipeline parameters", - "description": "BlobToolKit Nextflow Pipeline.", + "$id": "https://raw.githubusercontent.com/sanger-tol/blobtoolkit/master/nextflow_schema.json", + "title": "sanger-tol/blobtoolkit pipeline parameters", + "description": "Quality assessment of genome assemblies", "type": "object", "definitions": { "input_output_options": { @@ -19,21 +19,28 @@ "pattern": "^\\S+\\.csv$", "schema": "assets/schema_input.json", "description": "Path to comma-separated file containing information about the samples in the experiment.", - "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/blobtoolkit/usage#samplesheet-input).", + "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row.", "fa_icon": "fas fa-file-csv" }, + "yaml": { + "type": "string", + "format": "file-path", + "description": "Custom config file for draft assembly", + "fa_icon": "fas fa-file-alt" + }, "outdir": { "type": "string", "format": "directory-path", "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", - "fa_icon": "fas fa-folder-open" + "fa_icon": "fas fa-folder-open", + "default": "results" }, "email": { "type": "string", + "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$", "description": "Email address for completion summary.", - "fa_icon": "fas fa-envelope", "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. 
If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", - "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" + "fa_icon": "fas fa-envelope" }, "multiqc_title": { "type": "string", @@ -47,12 +54,15 @@ "type": "object", "fa_icon": "fas fa-dna", "description": "Reference genome related files and options required for the workflow.", + "required": ["taxon", "accession", "fasta"], "properties": { - "genome": { + "taxon": { + "type": "string", + "description": "NCBI taxonomy ID for the genome species" + }, + "accession": { "type": "string", - "description": "Name of iGenomes reference.", - "fa_icon": "fas fa-book", - "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." + "description": "Genome accession where available or an identifier for draft assemblies" }, "fasta": { "type": "string", @@ -60,23 +70,54 @@ "mimetype": "text/plain", "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$", "description": "Path to FASTA genome file.", - "help_text": "This parameter is *mandatory* if `--genome` is not specified. If you don't have a BWA index available this will be generated for you automatically. Combine with `--save_reference` to save BWA index for future runs.", "fa_icon": "far fa-file-code" + } + } + }, + "databases": { + "title": "Databases", + "type": "object", + "fa_icon": "fas fa-database", + "description": "Define the location and parameters to work with databases.", + "required": ["uniprot", "taxdump"], + "properties": { + "taxa_file": { + "type": "string", + "format": "file-path", + "description": "Path to file containing the BUSCO lineages for the genome species", + "help_text": "If this file is not included, the relevant BUSCO lineages are automatically calculated using the taxon parameter.", + "fa_icon": "fas fa-file-alt" }, - "igenomes_base": { + "busco": { "type": "string", "format": "directory-path", - "description": "Directory / URL base for iGenomes references.", - "default": "s3://ngi-igenomes/igenomes", - "fa_icon": "fas fa-cloud-download-alt", - "hidden": true + "description": "Local directory where clade-specific BUSCO lineage datasets are stored", + "fa_icon": "fas fa-folder-open" }, - "igenomes_ignore": { - "type": "boolean", - "description": "Do not load the iGenomes reference config.", - "fa_icon": "fas fa-ban", - "hidden": true, - "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`." 
+ "blastp_cols": { + "type": "string", + "description": "When blastp_outext is 'txt', this is the list of columns that Diamond BLAST should print.", + "default": "qseqid staxids bitscore qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore" + }, + "blastp_outext": { + "type": "string", + "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"], + "description": "Extension (file format) of the output file from Diamond BLAST.", + "fa_icon": "fas fa-file-circle-question", + "default": "txt" + }, + "uniprot": { + "type": "string", + "format": "file-path", + "pattern": "^\\S+\\.dmnd$", + "description": "Path to the Diamond species-specific buscogenes database", + "fa_icon": "fas fa-file-archive" + }, + "taxdump": { + "type": "string", + "format": "directory-path", + "description": "Path to the new NCBI tax dump database", + "fa_icon": "fas fa-folder-open" } } }, @@ -176,6 +217,12 @@ "fa_icon": "fas fa-question-circle", "hidden": true }, + "version": { + "type": "boolean", + "description": "Display version and exit.", + "fa_icon": "fas fa-question-circle", + "hidden": true + }, "publish_dir_mode": { "type": "string", "default": "copy", @@ -213,16 +260,34 @@ "fa_icon": "fas fa-palette", "hidden": true }, + "hook_url": { + "type": "string", + "description": "Incoming hook URL for messaging service", + "fa_icon": "fas fa-people-group", + "help_text": "Incoming hook URL for messaging service. Currently, MS Teams and Slack are supported.", + "hidden": true + }, "multiqc_config": { "type": "string", "description": "Custom config file to supply to MultiQC.", "fa_icon": "fas fa-cog", "hidden": true }, + "multiqc_logo": { + "type": "string", + "description": "Custom logo file to supply to MultiQC. File name must also be set in the MultiQC config file", + "fa_icon": "fas fa-image", + "hidden": true + }, + "multiqc_methods_description": { + "type": "string", + "description": "Custom MultiQC yaml file containing HTML including a methods description.", + "fa_icon": "fas fa-cog" + }, "tracedir": { "type": "string", "description": "Directory to keep pipeline Nextflow logs and reports.", - "default": "${params.outdir}/pipeline_info", + "default": "${params.outdir}/blobtoolkit_info", "fa_icon": "fas fa-cogs", "hidden": true }, @@ -239,12 +304,6 @@ "description": "Show all params when using `--help`", "hidden": true, "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." - }, - "enable_conda": { - "type": "boolean", - "description": "Run this workflow with Conda. You can also use '-profile conda' instead of providing this parameter.", - "hidden": true, - "fa_icon": "fas fa-bacon" } } } @@ -256,6 +315,9 @@ { "$ref": "#/definitions/reference_genome_options" }, + { + "$ref": "#/definitions/databases" + }, { "$ref": "#/definitions/institutional_config_options" }, diff --git a/pipeline_template.yml b/pipeline_template.yml new file mode 100644 index 00000000..0aa7398f --- /dev/null +++ b/pipeline_template.yml @@ -0,0 +1,3 @@ +prefix: sanger-tol +skip: + - igenomes diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..0d62beb6 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,10 @@ +# Config file for Python. Mostly used to configure linting of bin/check_samplesheet.py with Black. +# Should be kept the same as nf-core/tools to avoid fighting with template synchronisation. 
+[tool.black] +line-length = 120 +target_version = ["py37", "py38", "py39", "py310"] + +[tool.isort] +profile = "black" +known_first_party = ["nf_core"] +multi_line_output = 3 diff --git a/subworkflows/local/blobtools.nf b/subworkflows/local/blobtools.nf new file mode 100644 index 00000000..730e1334 --- /dev/null +++ b/subworkflows/local/blobtools.nf @@ -0,0 +1,39 @@ +// +// Create BlobTools dataset +// + +include { BLOBTOOLKIT_METADATA } from '../../modules/local/blobtoolkit/metadata' +include { BLOBTOOLKIT_BLOBDIR } from '../../modules/local/blobtoolkit/blobdir' + +workflow BLOBTOOLS { + take: + config // channel: [ val(meta), path(config) ] + windowstats // channel: [ val(meta), path(window_stats_tsvs) ] + busco // channel: [ val(meta), path(full_table) ] + blastp // channel: [ val(meta), path(txt) ] + taxdump // channel: path(taxdump_db) + + + main: + ch_versions = Channel.empty() + + + // + // Create metadata summary file + // + BLOBTOOLKIT_METADATA ( config ) + ch_versions = ch_versions.mix ( BLOBTOOLKIT_METADATA.out.versions.first() ) + + + // + // Create Blobtools dataset files + // + BLOBTOOLKIT_BLOBDIR ( windowstats, busco, blastp, BLOBTOOLKIT_METADATA.out.yaml, taxdump ) + ch_versions = ch_versions.mix ( BLOBTOOLKIT_BLOBDIR.out.versions.first() ) + + + emit: + metadata = BLOBTOOLKIT_METADATA.out.yaml // channel: [ val(meta), path(yaml) ] + blobdir = BLOBTOOLKIT_BLOBDIR.out.blobdir // channel: [ val(meta), path(dir) ] + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/busco_diamond_blastp.nf b/subworkflows/local/busco_diamond_blastp.nf new file mode 100644 index 00000000..44fe8b6c --- /dev/null +++ b/subworkflows/local/busco_diamond_blastp.nf @@ -0,0 +1,96 @@ +// +// Run BUSCO for a genome from GOAT and runs diamond_blastp +// + +include { GOAT_TAXONSEARCH } from '../../modules/nf-core/goat/taxonsearch/main' +include { BUSCO } from '../../modules/nf-core/busco/main' +include { BLOBTOOLKIT_EXTRACTBUSCOS } from '../../modules/local/blobtoolkit/extractbuscos' +include { DIAMOND_BLASTP } from '../../modules/nf-core/diamond/blastp/main' + + +workflow BUSCO_DIAMOND { + take: + fasta // channel: [ val(meta), path(fasta) ] + taxon_taxa // channel: [ val(meta, val(taxon), path(taxa) ] + busco_db // channel: path(busco_db) + blastp // channel: path(blastp_db) + outext // channel: val(out_format) + cols // channel: val(column_names) + + + main: + ch_versions = Channel.empty() + + + // + // Fetch BUSCO lineages for taxon (or taxa) + // + GOAT_TAXONSEARCH ( taxon_taxa ) + ch_versions = ch_versions.mix ( GOAT_TAXONSEARCH.out.versions.first() ) + + + // + // Run BUSCO search + // + GOAT_TAXONSEARCH.out.taxonsearch + | map { meta, csv -> csv.splitCsv(header:true, sep:'\t', strip:true) } + | map { row -> row.odb10_lineage.findAll { it != "" } } + | map { lineages -> [ lineages + [ "bacteria_odb10", "archaea_odb10" ] ] } + | flatten () + | set { ch_lineages } + + BUSCO ( fasta, ch_lineages, busco_db.collect().ifEmpty([]), [] ) + ch_versions = ch_versions.mix ( BUSCO.out.versions.first() ) + + + // + // Select input for BLOBTOOLKIT_EXTRACTBUSCOS + // + BUSCO.out.seq_dir + | map { meta, seq -> [ [ "id": seq.parent.baseName ], seq ] } + | branch { + meta, seq -> + archaea : meta.id == "run_archaea_odb10" + bacteria : meta.id == "run_bacteria_odb10" + eukaryota : meta.id == "run_eukaryota_odb10" + } + | set { ch_busco } + + + // Extract BUSCO genes from the 3 kingdoms + BLOBTOOLKIT_EXTRACTBUSCOS ( fasta, ch_busco.archaea, ch_busco.bacteria, ch_busco.eukaryota ) + 
ch_versions = ch_versions.mix ( BLOBTOOLKIT_EXTRACTBUSCOS.out.versions.first() ) + + + // + // Align BUSCO genes against the BLASTp database + // + BLOBTOOLKIT_EXTRACTBUSCOS.out.genes + | filter { it[1].size() > 140 } + | set { ch_busco_genes } + + DIAMOND_BLASTP ( ch_busco_genes, blastp, outext, cols ) + ch_versions = ch_versions.mix ( DIAMOND_BLASTP.out.versions.first() ) + + + // Select BUSCO results for taxonomically closest database + BUSCO.out.full_table + | combine ( ch_lineages.toList().map { it[0] } ) + | filter { meta, table, lineage -> table =~ /$lineage/ } + | map { meta, table, lineage -> [ meta, table ] } + | set { ch_first_table } + + + // BUSCO results for MULTIQC + BUSCO.out.short_summaries_txt + | ifEmpty ( [ [], [] ] ) + | set { multiqc } + + + emit: + first_table = ch_first_table // channel: [ val(meta), path(full_table) ] + full_table = BUSCO.out.full_table // channel: [ val(meta), path(full_tables) ] + blastp_txt = DIAMOND_BLASTP.out.txt // channel: [ val(meta), path(txt) ] + multiqc // channel: [ meta, summary ] + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/collate_stats.nf b/subworkflows/local/collate_stats.nf new file mode 100644 index 00000000..ac567621 --- /dev/null +++ b/subworkflows/local/collate_stats.nf @@ -0,0 +1,44 @@ +// +// Collate genome statistics by various window sizes +// + +include { BLOBTOOLKIT_COUNTBUSCOS } from '../../modules/local/blobtoolkit/countbuscos' +include { WINDOWSTATS_INPUT } from '../../modules/local/windowstats_input' +include { BLOBTOOLKIT_WINDOWSTATS } from '../../modules/local/blobtoolkit/windowstats' + + +workflow COLLATE_STATS { + take: + busco_table // channel: [ val(meta), path(full_table) ] + bed // channel: [ val(meta), path(bed) ] + freq // channel: [ val(meta), path(freq) ] + mononuc // channel: [ val(meta), path(mononuc) ] + cov // channel: [ val(meta), path(regions.bed.gz) ] + + main: + ch_versions = Channel.empty() + + + // Count BUSCO genes in a region + busco_table + | groupTuple() + | set { ch_busco } + + BLOBTOOLKIT_COUNTBUSCOS ( ch_busco, bed ) + ch_versions = ch_versions.mix ( BLOBTOOLKIT_COUNTBUSCOS.out.versions.first() ) + + + // Combine outputs from Fasta windows, mosdepth, and count BUSCO genes + WINDOWSTATS_INPUT ( freq, mononuc, cov, BLOBTOOLKIT_COUNTBUSCOS.out.tsv ) + ch_versions = ch_versions.mix ( WINDOWSTATS_INPUT.out.versions.first() ) + + + // Genome statistics by different window sizes + BLOBTOOLKIT_WINDOWSTATS ( WINDOWSTATS_INPUT.out.tsv ) + ch_versions = ch_versions.mix ( BLOBTOOLKIT_WINDOWSTATS.out.versions.first() ) + + + emit: + window_tsv = BLOBTOOLKIT_WINDOWSTATS.out.tsv // channel: [ val(meta), path(window_stats_tsvs) ] + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/coverage_stats.nf b/subworkflows/local/coverage_stats.nf new file mode 100644 index 00000000..0d13824b --- /dev/null +++ b/subworkflows/local/coverage_stats.nf @@ -0,0 +1,76 @@ +// +// Calculate genome coverage and statistics +// + +include { SAMTOOLS_VIEW } from '../../modules/nf-core/samtools/view/main' +include { MOSDEPTH } from '../../modules/nf-core/mosdepth/main' +include { FASTAWINDOWS } from '../../modules/nf-core/fastawindows/main' +include { CREATE_BED } from '../../modules/local/create_bed' + + +workflow COVERAGE_STATS { + take: + cram // channel: [ val(meta), path(cram) ] + fasta // channel: [ val(meta), path(fasta) ] + + + main: + ch_versions = Channel.empty() + + + // Convert from CRAM to BAM + cram + | map { meta, cram -> [ meta, 
cram, [] ] } + | set { ch_cram_crai} + + fasta + | map { meta, fasta -> fasta } + | set { ch_fasta } + + SAMTOOLS_VIEW ( ch_cram_crai, ch_fasta, [] ) + ch_versions = ch_versions.mix ( SAMTOOLS_VIEW.out.versions.first() ) + + + // Calculate genome statistics + FASTAWINDOWS ( fasta ) + ch_versions = ch_versions.mix ( FASTAWINDOWS.out.versions.first() ) + + + // Create genome windows file in BED format + CREATE_BED ( FASTAWINDOWS.out.mononuc ) + ch_versions = ch_versions.mix ( CREATE_BED.out.versions.first() ) + + + // Calculate coverage + SAMTOOLS_VIEW.out.bam + | join ( SAMTOOLS_VIEW.out.csi ) + | combine ( CREATE_BED.out.bed ) + | map { meta, bam, csi, meta2, bed -> [ meta, bam, csi, bed ] } + | set { ch_bam_csi_bed } + + MOSDEPTH ( ch_bam_csi_bed, fasta ) + ch_versions = ch_versions.mix ( MOSDEPTH.out.versions.first() ) + + + // Combining mosdepth regions_bed in single channel + MOSDEPTH.out.regions_bed + | combine ( fasta ) + | map { meta, bed, meta2, fasta -> [ meta2, bed ] } + | groupTuple () + | set { ch_coverage } + + + // Mosdepth results for MULTIQC + MOSDEPTH.out.regions_txt + | ifEmpty ( MOSDEPTH.out.global_txt ) + | set { multiqc } + + + emit: + freq = FASTAWINDOWS.out.freq // channel: [ val(meta), path(freq) ] + mononuc = FASTAWINDOWS.out.mononuc // channel: [ val(meta), path(mononuc) ] + bed = CREATE_BED.out.bed // channel: [ val(meta), path(bed) ] + cov = ch_coverage // channel: [ val(meta), path(regions.bed.gz) ] + multiqc // channel: [ val(meta), path(dist.txt) ] + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 0aecf87f..0b02604b 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -1,5 +1,5 @@ // -// Check input samplesheet and get read channels +// Check input samplesheet and get aligned read channels // include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' @@ -12,33 +12,31 @@ workflow INPUT_CHECK { SAMPLESHEET_CHECK ( samplesheet ) .csv .splitCsv ( header:true, sep:',' ) - .map { create_fastq_channel(it) } - .set { reads } + .map { create_data_channels(it) } + .set { aln } + emit: - reads // channel: [ val(meta), [ reads ] ] - versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ] + aln // channel: [ val(meta), path(datafile) ] + versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ] } -// Function to get list of [ meta, [ fastq_1, fastq_2 ] ] -def create_fastq_channel(LinkedHashMap row) { +// Function to get list of [ meta, datafile ] +def create_data_channels(LinkedHashMap row) { // create meta map def meta = [:] meta.id = row.sample - meta.single_end = row.single_end.toBoolean() + meta.datatype = row.datatype - // add path(s) of the fastq file(s) to the meta map - def fastq_meta = [] - if (!file(row.fastq_1).exists()) { - exit 1, "ERROR: Please check input samplesheet -> Read 1 FastQ file does not exist!\n${row.fastq_1}" - } - if (meta.single_end) { - fastq_meta = [ meta, [ file(row.fastq_1) ] ] + + // add path(s) of the read file(s) to the meta map + def data_meta = [] + + if ( !file(row.datafile).exists() ) { + exit 1, "ERROR: Please check input samplesheet -> Data file does not exist!\n${row.datafile}" } else { - if (!file(row.fastq_2).exists()) { - exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}" - } - fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ] + data_meta = [ meta, file(row.datafile) ] } - return fastq_meta 
+ + return data_meta } diff --git a/subworkflows/local/view.nf b/subworkflows/local/view.nf new file mode 100644 index 00000000..f1bf89d6 --- /dev/null +++ b/subworkflows/local/view.nf @@ -0,0 +1,37 @@ +// +// Generate summary and static plots from blobdir +// + +include { BLOBTOOLKIT_SUMMARY } from '../../modules/local/blobtoolkit/summary' +include { BLOBTOOLKIT_IMAGES } from '../../modules/local/blobtoolkit/images' + +workflow VIEW { + take: + blobdir // channel: [ val(meta), path(blobdir) ] + + + main: + ch_versions = Channel.empty() + + + // + // Generate summary file + // + BLOBTOOLKIT_SUMMARY ( blobdir ) + ch_versions = ch_versions.mix ( BLOBTOOLKIT_SUMMARY.out.versions.first() ) + + + // + // Generate static plots in png format + // + plots = [ "snail", "blob", "cumulative" ] + + BLOBTOOLKIT_IMAGES ( blobdir, plots ) + ch_versions = ch_versions.mix( BLOBTOOLKIT_IMAGES.out.versions ) + + + emit: + summary = BLOBTOOLKIT_SUMMARY.out.json // channel: [ val(meta), path(json) ] + images = BLOBTOOLKIT_IMAGES.out.png // channel: [ val(meta), path(png) ] + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/tower.yml b/tower.yml new file mode 100644 index 00000000..787aedfe --- /dev/null +++ b/tower.yml @@ -0,0 +1,5 @@ +reports: + multiqc_report.html: + display: "MultiQC HTML report" + samplesheet.csv: + display: "Auto-created samplesheet with collated metadata and FASTQ paths" diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf index db5b4d95..c8dad117 100644 --- a/workflows/blobtoolkit.nf +++ b/workflows/blobtoolkit.nf @@ -9,13 +9,21 @@ def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params) // Validate input parameters WorkflowBlobtoolkit.initialise(params, log) -// TODO nf-core: Add all file path parameters for the pipeline to the list below +// Add all file path parameters for the pipeline to the list below // Check input path parameters to see if they exist -def checkPathParamList = [ params.input, params.multiqc_config, params.fasta ] +def checkPathParamList = [ params.input, params.multiqc_config, params.fasta, params.taxa_file, params.taxdump, params.busco, params.uniprot ] for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } // Check mandatory parameters if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } +if (params.fasta && params.accession) { ch_fasta = Channel.of([ [ 'id': params.accession ], params.fasta ]).collect() } else { exit 1, 'Genome fasta file and accession must be specified!' } +if (params.taxon) { ch_taxon = Channel.of(params.taxon) } else { exit 1, 'NCBI Taxon ID not specified!' } +if (params.uniprot) { ch_uniprot = file(params.uniprot) } else { exit 1, 'Diamond BLASTp database not specified!' } +if (params.taxdump) { ch_taxdump = file(params.taxdump) } else { exit 1, 'NCBI Taxonomy database not specified!' 
} + +// Create channel for optional parameters +if (params.busco) { ch_busco_db = Channel.fromPath(params.busco) } else { ch_busco_db = Channel.empty() } +if (params.yaml && params.accession) { ch_yaml = Channel.of([ [ 'id': params.accession ], params.yaml ]) } else { ch_yaml = Channel.empty() } /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -23,8 +31,10 @@ if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input sample ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -ch_multiqc_config = file("$projectDir/assets/multiqc_config.yml", checkIfExists: true) -ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config) : Channel.empty() +ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true) +ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath( params.multiqc_config, checkIfExists: true ) : Channel.empty() +ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.multiqc_logo, checkIfExists: true ) : Channel.empty() +ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -32,10 +42,20 @@ ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multi ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ +// +// MODULE: Loaded from modules/local/ +// +include { BLOBTOOLKIT_CONFIG } from '../modules/local/blobtoolkit/config' + // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -include { INPUT_CHECK } from '../subworkflows/local/input_check' +include { INPUT_CHECK } from '../subworkflows/local/input_check' +include { COVERAGE_STATS } from '../subworkflows/local/coverage_stats' +include { BUSCO_DIAMOND } from '../subworkflows/local/busco_diamond_blastp' +include { COLLATE_STATS } from '../subworkflows/local/collate_stats' +include { BLOBTOOLS } from '../subworkflows/local/blobtools' +include { VIEW } from '../subworkflows/local/view' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -46,9 +66,9 @@ include { INPUT_CHECK } from '../subworkflows/local/input_check' // // MODULE: Installed directly from nf-core/modules // -include { FASTQC } from '../modules/nf-core/modules/fastqc/main' -include { MULTIQC } from '../modules/nf-core/modules/multiqc/main' -include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main' +include { GUNZIP } from '../modules/nf-core/gunzip/main' +include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' +include { MULTIQC } from '../modules/nf-core/multiqc/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -64,21 +84,69 @@ workflow BLOBTOOLKIT { ch_versions = Channel.empty() // - // SUBWORKFLOW: Read in samplesheet, validate and stage input files + // MODULE: Decompress FASTA file if needed // - INPUT_CHECK ( - ch_input - ) - ch_versions = ch_versions.mix(INPUT_CHECK.out.versions) + if ( params.fasta.endsWith('.gz') ) { + ch_genome = GUNZIP ( ch_fasta ).gunzip + ch_versions = ch_versions.mix ( GUNZIP.out.versions.first() ) + } else { + ch_genome = ch_fasta + } // - 
// MODULE: Run FastQC + // SUBWORKFLOW: Check samplesheet and create channels for downstream analysis // - FASTQC ( - INPUT_CHECK.out.reads - ) - ch_versions = ch_versions.mix(FASTQC.out.versions.first()) + INPUT_CHECK ( ch_input ) + ch_versions = ch_versions.mix ( INPUT_CHECK.out.versions ) + + // + // SUBWORKFLOW: Calculate genome coverage and statistics + // + COVERAGE_STATS ( INPUT_CHECK.out.aln, ch_genome ) + ch_versions = ch_versions.mix ( COVERAGE_STATS.out.versions ) + + // + // SUBWORKFLOW: Run BUSCO using lineages fetched from GOAT, then run diamond_blastp + // + if (params.taxa_file) { + ch_taxa = Channel.from(params.taxa_file) + ch_taxon_taxa = ch_fasta.combine(ch_taxon).combine(ch_taxa).map { meta, fasta, taxon, taxa -> [ meta, taxon, taxa ] } + } else { + ch_taxon_taxa = ch_fasta.combine(ch_taxon).map { meta, fasta, taxon -> [ meta, taxon, [] ] } + } + + BUSCO_DIAMOND ( ch_genome, ch_taxon_taxa, ch_busco_db, ch_uniprot, params.blastp_outext, params.blastp_cols ) + ch_versions = ch_versions.mix ( BUSCO_DIAMOND.out.versions ) + // + // SUBWORKFLOW: Collate genome statistics by various window sizes + // + COLLATE_STATS ( BUSCO_DIAMOND.out.full_table, COVERAGE_STATS.out.bed, COVERAGE_STATS.out.freq, COVERAGE_STATS.out.mononuc, COVERAGE_STATS.out.cov ) + ch_versions = ch_versions.mix ( COLLATE_STATS.out.versions ) + + // + // SUBWORKFLOW: Create BlobTools dataset + // + if ( !params.yaml ) { + BLOBTOOLKIT_CONFIG ( ch_genome ) + ch_config = BLOBTOOLKIT_CONFIG.out.yaml + ch_versions = ch_versions.mix ( BLOBTOOLKIT_CONFIG.out.versions.first() ) + } else { + ch_config = ch_yaml + } + + BLOBTOOLS ( ch_config, COLLATE_STATS.out.window_tsv, BUSCO_DIAMOND.out.first_table, BUSCO_DIAMOND.out.blastp_txt.ifEmpty([[],[]]), ch_taxdump ) + ch_versions = ch_versions.mix ( BLOBTOOLS.out.versions ) + + // + // SUBWORKFLOW: Generate summary and static images + // + VIEW ( BLOBTOOLS.out.blobdir ) + ch_versions = ch_versions.mix(VIEW.out.versions) + + // + // MODULE: Combine different versions.yml + // CUSTOM_DUMPSOFTWAREVERSIONS ( ch_versions.unique().collectFile(name: 'collated_versions.yml') ) @@ -89,18 +157,23 @@ workflow BLOBTOOLKIT { workflow_summary = WorkflowBlobtoolkit.paramsSummaryMultiqc(workflow, summary_params) ch_workflow_summary = Channel.value(workflow_summary) + methods_description = WorkflowBlobtoolkit.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description) + ch_methods_description = Channel.value(methods_description) + ch_multiqc_files = Channel.empty() - ch_multiqc_files = ch_multiqc_files.mix(Channel.from(ch_multiqc_config)) - ch_multiqc_files = ch_multiqc_files.mix(ch_multiqc_custom_config.collect().ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) + ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) + ch_multiqc_files = ch_multiqc_files.mix(BUSCO_DIAMOND.out.multiqc.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(COVERAGE_STATS.out.multiqc.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) - ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) MULTIQC ( - ch_multiqc_files.collect() + ch_multiqc_files.collect(), + ch_multiqc_config.toList(), + ch_multiqc_custom_config.toList(), + ch_multiqc_logo.toList() ) multiqc_report = MULTIQC.out.report.toList() - ch_versions = ch_versions.mix(MULTIQC.out.versions) } /* @@ 
-114,6 +187,9 @@ workflow.onComplete { NfcoreTemplate.email(workflow, params, summary_params, projectDir, log, multiqc_report) } NfcoreTemplate.summary(workflow, params, log) + if (params.hook_url) { + NfcoreTemplate.IM_notification(workflow, params, summary_params, projectDir, log) + } } /*