diff --git a/.editorconfig b/.editorconfig index b78de6e..b6b3190 100644 --- a/.editorconfig +++ b/.editorconfig @@ -8,7 +8,7 @@ trim_trailing_whitespace = true indent_size = 4 indent_style = space -[*.{md,yml,yaml,html,css,scss,js,cff}] +[*.{md,yml,yaml,html,css,scss,js}] indent_size = 2 # These files are edited and tested upstream in nf-core/modules diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index fa96bcf..1df5965 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -44,7 +44,8 @@ body: * Executor _(eg. slurm, local, awsbatch)_ - * Container engine: _(e.g. Docker, Singularity, Conda, Podman, Shifter or Charliecloud)_ + * Container engine: _(e.g. Docker, Singularity, Conda, Podman, Shifter, Charliecloud, + or Apptainer)_ * OS _(eg. CentOS Linux, macOS, Linux Mint)_ diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index dbc6e02..567ac13 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -41,3 +41,7 @@ jobs: # Remember that you can parallelise this by using strategy.matrix run: | nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results + + - name: Run pipeline with unaligned test data + run: | + nextflow run ${GITHUB_WORKSPACE} -profile test_align,docker --outdir ./results --align diff --git a/.github/workflows/clean-up.yml b/.github/workflows/clean-up.yml new file mode 100644 index 0000000..694e90e --- /dev/null +++ b/.github/workflows/clean-up.yml @@ -0,0 +1,24 @@ +name: "Close user-tagged issues and PRs" +on: + schedule: + - cron: "0 0 * * 0" # Once a week + +jobs: + clean-up: + runs-on: ubuntu-latest + permissions: + issues: write + pull-requests: write + steps: + - uses: actions/stale@v7 + with: + stale-issue-message: "This issue has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor. Remove stale label or add a comment otherwise this issue will be closed in 20 days." + stale-pr-message: "This PR has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor. Remove stale label or add a comment if it is still useful." + close-issue-message: "This issue was closed because it has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor and then staled for 20 days with no activity." 
+ days-before-stale: 30 + days-before-close: 20 + days-before-pr-close: -1 + any-of-labels: "awaiting-changes,awaiting-feedback" + exempt-issue-labels: "WIP" + exempt-pr-labels: "WIP" + repo-token: "${{ secrets.GITHUB_TOKEN }}" diff --git a/.github/workflows/fix-linting.yml b/.github/workflows/fix-linting.yml index 063af6e..010b9f1 100644 --- a/.github/workflows/fix-linting.yml +++ b/.github/workflows/fix-linting.yml @@ -8,21 +8,21 @@ jobs: # Only run if comment is on a PR with the main repo, and if it contains the magic keywords if: > contains(github.event.comment.html_url, '/pull/') && - contains(github.event.comment.body, '@nf-core-bot fix linting') && + contains(github.event.comment.body, '@sanger-tolsoft fix linting') && github.repository == 'sanger-tol/variantcalling' runs-on: ubuntu-latest steps: - # Use the @nf-core-bot token to check out so we can push later + # Use the @sanger-tolsoft token to check out so we can push later - uses: actions/checkout@v3 with: - token: ${{ secrets.nf_core_bot_auth_token }} + token: ${{ secrets.sangertolsoft_access_token }} # Action runs on the issue comment, so we don't get the PR by default # Use the gh cli to check out the PR - name: Checkout Pull Request run: gh pr checkout ${{ github.event.issue.number }} env: - GITHUB_TOKEN: ${{ secrets.nf_core_bot_auth_token }} + GITHUB_TOKEN: ${{ secrets.sangertolsoft_access_token }} - uses: actions/setup-node@v3 @@ -46,8 +46,8 @@ jobs: - name: Commit & push changes if: steps.prettier_status.outputs.result == 'fail' run: | - git config user.email "core@nf-co.re" - git config user.name "nf-core-bot" + git config user.email "105875386+sanger-tolsoft@users.noreply.github.com" + git config user.name "sanger-tolsoft" git config push.default upstream git add . git status diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 858d622..916c43a 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -22,7 +22,7 @@ jobs: run: npm install -g editorconfig-checker - name: Run ECLint check - run: editorconfig-checker -exclude README.md $(find .* -type f | grep -v '.git\|.py\|.md\|json\|yml\|yaml\|html\|css\|work\|.nextflow\|build\|nf_core.egg-info\|log.txt\|Makefile') + run: editorconfig-checker -exclude README.md $(find .* -type f | grep -v '.git\|.py\|.cff\|.md\|json\|yml\|yaml\|html\|css\|work\|.nextflow\|build\|nf_core.egg-info\|log.txt\|Makefile') Prettier: runs-on: ubuntu-latest @@ -78,13 +78,13 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: "3.7" + python-version: "3.8" architecture: "x64" - name: Install dependencies run: | python -m pip install --upgrade pip - pip install nf-core + pip install nf-core==2.8.0 - name: Run nf-core lint env: diff --git a/.github/workflows/sangertest.yml b/.github/workflows/sanger_test.yml similarity index 68% rename from .github/workflows/sangertest.yml rename to .github/workflows/sanger_test.yml index 7e8af27..a4e5426 100644 --- a/.github/workflows/sangertest.yml +++ b/.github/workflows/sanger_test.yml @@ -9,16 +9,23 @@ jobs: runs-on: ubuntu-latest steps: - name: Launch workflow via tower - uses: nf-core/tower-action@v2 + uses: seqeralabs/action-tower-launch@v2 with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} - pipeline: ${{ github.repository }} revision: ${{ github.sha }} workdir: ${{ secrets.TOWER_WORKDIR_PARENT }}/work/${{ github.repository }}/work-${{ github.sha }} parameters: | { "outdir": "${{ 
secrets.TOWER_WORKDIR_PARENT }}/results/${{ github.repository }}/results-${{ github.sha }}",
+            "align": true
          }
-          profiles: test,sanger,singularity,cleanup
+          profiles: test_align,sanger,singularity,cleanup
+
+      - uses: actions/upload-artifact@v3
+        with:
+          name: Tower debug log file
+          path: |
+            tower_action_*.log
+            tower_action_*.json
diff --git a/.github/workflows/sangerfulltest.yml b/.github/workflows/sanger_test_full.yml
similarity index 77%
rename from .github/workflows/sangerfulltest.yml
rename to .github/workflows/sanger_test_full.yml
index 850d4ae..a552c63 100644
--- a/.github/workflows/sangerfulltest.yml
+++ b/.github/workflows/sanger_test_full.yml
@@ -22,16 +22,23 @@
        if: github.event_name == 'workflow_dispatch'

      - name: Launch workflow via tower
-        uses: nf-core/tower-action@v2
+        uses: seqeralabs/action-tower-launch@v2
        with:
          workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
          access_token: ${{ secrets.TOWER_ACCESS_TOKEN }}
          compute_env: ${{ secrets.TOWER_COMPUTE_ENV }}
-          pipeline: ${{ github.repository }}
          revision: ${{ env.REVISION }}
          workdir: ${{ secrets.TOWER_WORKDIR_PARENT }}/work/${{ github.repository }}/work-${{ env.REVISION }}
          parameters: |
            {
              "outdir": "${{ secrets.TOWER_WORKDIR_PARENT }}/results/${{ github.repository }}/results-${{ env.REVISION }}",
+              "align": true
            }
-          profiles: test_full,sanger,singularity,cleanup
+          profiles: test_full_align,sanger,singularity,cleanup
+
+      - uses: actions/upload-artifact@v3
+        with:
+          name: Tower debug log file
+          path: |
+            tower_action_*.log
+            tower_action_*.json
diff --git a/.nf-core.yml b/.nf-core.yml
index f58cf3f..4e011f7 100644
--- a/.nf-core.yml
+++ b/.nf-core.yml
@@ -16,6 +16,7 @@ lint:
    - lib/NfcoreTemplate.groovy
    - .github/PULL_REQUEST_TEMPLATE.md
    - .github/workflows/branch.yml
+    - .github/workflows/linting.yml
    - LICENSE
    - assets/email_template.html
    - .github/ISSUE_TEMPLATE/bug_report.yml
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..0c31cdb
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,5 @@
+repos:
+  - repo: https://github.com/pre-commit/mirrors-prettier
+    rev: "v2.7.1"
+    hooks:
+      - id: prettier
diff --git a/CHANGELOG.md b/CHANGELOG.md
index c2e0db5..bbd01fb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,7 +3,53 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

-## [[1.0.0](https://github.com/sanger-tol/variantcalling/releases/tag/1.0.0)] - [2023-05-03]
+## [[1.1.0](https://github.com/sanger-tol/variantcalling/releases/tag/1.1.0)] - Shang Tang - [2023-12-20]
+
+### Enhancements & fixes
+
+- Updated the CI procedure to use "sanger-tol" rather than "nf-core" names.
+- Renamed Sanger-related GitHub CI test workflows.
+- nf-core template was updated from 2.7 to 2.8.
+- Removed BAM/CRAM index files from the sample sheets.
+- Made the fasta index file an optional input.
+- Imported PacBio read-mapping sub-workflows from the [sanger-tol/readmapping pipeline](https://github.com/sanger-tol/readmapping/), so the pipeline can now run on unaligned BAM/CRAM samples (see the example below).
+- Use VCFtools to calculate per-site nucleotide diversity.
+- Use VCFtools to calculate heterozygosity.
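+
+For reference, a launch in the new unaligned-reads mode might look like the following (a sketch only; the samplesheet, reference, and profile are placeholders):
+
+```bash
+# Hypothetical launch on unaligned PacBio reads; all paths are placeholders.
+nextflow run sanger-tol/variantcalling \
+    -profile singularity \
+    --input samplesheet.csv \
+    --fasta genome.fasta.gz \
+    --align \
+    --outdir ./results
+```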
+
+### Parameters
+
+This release comes with the following parameter changes:
+
+| Old parameter | New parameter       |
+| ------------- | ------------------- |
+| --gzi         |                     |
+|               | --vector_db         |
+|               | --align             |
+|               | --include_positions |
+|               | --exclude_positions |
+
+> **NB:** Parameter has been **updated** if both old and new parameter information is present. <br> **NB:** Parameter has been **added** if just the new parameter information is present. <br> **NB:** Parameter has been **removed** if new parameter information isn't present.
+
+### Software dependencies
+
+Note, since the pipeline is using Nextflow DSL2, each process will be run with its own [Biocontainer](https://biocontainers.pro/#/registry). This means that on occasion it is entirely possible for the pipeline to be using different versions of the same tool. However, the overall software dependency changes compared to the last release have been listed below for reference. Only `Docker` or `Singularity` containers are supported, `conda` is not supported.
+
+| Dependency  | Old version | New version |
+| ----------- | ----------- | ----------- |
+| DeepVariant | 1.4.0       | 1.5.0       |
+| samtools    | 1.16.1      | 1.17        |
+| bcftools    | 1.16.1      | 1.17        |
+| python      | 3.11.0      | 3.11.4      |
+| vcftools    |             | 0.1.16      |
+| blast       |             | 2.14.1+     |
+| gunzip      |             | 1.10        |
+| minimap2    |             | 2.24-r1122  |
+| awk         |             | 5.1.0       |
+| untar       |             | 1.30        |
+
+> **NB:** Dependency has been **updated** if both old and new version information is present. <br> **NB:** Dependency has been **added** if just the new version information is present. <br> **NB:** Dependency has been **removed** if version information isn't present.
+
+## [[1.0.0](https://github.com/sanger-tol/variantcalling/releases/tag/1.0.0)] - Xia Yu - [2023-05-03]

 Initial release of sanger-tol/variantcalling, created with the [nf-core](https://nf-co.re/) tools.
diff --git a/CITATION.cff b/CITATION.cff
new file mode 100644
index 0000000..5b0fb48
--- /dev/null
+++ b/CITATION.cff
@@ -0,0 +1,30 @@
+# This CITATION.cff file was generated with cffinit.
+# Visit https://bit.ly/cffinit to generate yours today!
+
+cff-version: 1.2.0
+title: sanger-tol/variantcalling v1.1.0
+message: >-
+  If you use this software, please cite it using the
+  metadata from this file.
+type: software
+authors:
+  - given-names: Guoying
+    family-names: Qi
+    affiliation: Wellcome Sanger Institute
+    orcid: "https://orcid.org/0000-0003-1262-8973"
+  - given-names: Priyanka
+    family-names: Surana
+    affiliation: Wellcome Sanger Institute
+    orcid: "https://orcid.org/0000-0002-7167-0875"
+  - given-names: Matthieu
+    family-names: Muffato
+    affiliation: Wellcome Sanger Institute
+    orcid: "https://orcid.org/0000-0002-7860-3560"
+identifiers:
+  - type: doi
+    value: 10.5281/zenodo.7890528
+repository-code: "https://github.com/sanger-tol/variantcalling"
+license: MIT
+commit: TODO
+version: 1.1.0
+date-released: "2023-12-20"
diff --git a/CITATIONS.md b/CITATIONS.md
index 923ff06..a492677 100644
--- a/CITATIONS.md
+++ b/CITATIONS.md
@@ -22,6 +22,22 @@

   > Danecek, Petr, et al. “Twelve Years of SAMtools and BCFtools.” GigaScience, vol. 10, no. 2, Jan. 2021, https://doi.org/10.1093/gigascience/giab008.

+- [Blast](https://pubmed.ncbi.nlm.nih.gov/20003500/)
+
+  > Camacho C, Coulouris G, Avagyan V, Ma N, Papadopoulos J, Bealer K, Madden TL. BLAST+: architecture and applications. BMC Bioinformatics. 2009 Dec 15;10:421. doi: 10.1186/1471-2105-10-421. PMID: 20003500; PMCID: PMC2803857.
+
+- [BWA-MEM2](https://ieeexplore.ieee.org/document/8820962)
+
+  > Vasimuddin Md, Misra S, Li H, Aluru S. Efficient Architecture-Aware Acceleration of BWA-MEM for Multicore Systems. 2019 IEEE International Parallel and Distributed Processing Symposium. 2019 May;314–24. doi: 10.1109/IPDPS.2019.00041.
+
+- [Minimap2](https://pubmed.ncbi.nlm.nih.gov/34623391/)
+
+  > Li H. New strategies to improve minimap2 alignment accuracy. Bioinformatics. 2021 Oct 8;37(23):4572–4. doi: 10.1093/bioinformatics/btab705. Epub ahead of print. PMID: 34623391; PMCID: PMC8652018.
+
+- [VCFTools](https://pubmed.ncbi.nlm.nih.gov/21653522/)
+
+  > Danecek P, Auton A, Abecasis G, et al.: The variant call format and VCFtools. Bioinformatics. 2011 Aug 1;27(15):2156-8. doi: 10.1093/bioinformatics/btr330. Epub 2011 Jun 7. PubMed PMID: 21653522; PubMed Central PMCID: PMC3137218.
+
 ## Software packaging/containerisation tools

 - [Anaconda](https://anaconda.com)
diff --git a/README.md b/README.md
index 62da4b2..3a344df 100644
--- a/README.md
+++ b/README.md
@@ -18,14 +18,19 @@ On release, automated continuous integration tests run the pipeline on a full-si

 ## Pipeline summary

-The pipleline takes aligned PacBio sample reads (CRAM/BAM files and their index files) from a CSV file and the reference file in FASTA format, and then uses DeepVariant tool to make variant calling.
+The pipeline takes aligned or unaligned PacBio sample reads (CRAM/BAM files) from a CSV file and the reference file in FASTA format, and then uses the DeepVariant tool to call variants.

 Steps involved:

 - Split fasta file into smaller files, normally one sequence per file unless the sequences are too small.
+- Align the reads if not aligned. +- Merge input BAM/CRAM files together if they have the same sample names. - Filter out reads using the `-F 0x900` option to only retain the primary alignments. - Run DeepVariant using filtered BAM/CRAM files against each of split fasta files. - Merge all VCF and GVCF files generated by DeepVariant by sample together for each input BAM/CRAM file. +- Run VCFtools to calculate heterozygosity and per site nucleotide diversity. + + ## Quick Start @@ -36,7 +41,12 @@ Steps involved: 3. Download the pipeline and test it on a minimal dataset with a single command: ```bash + # for aligned reads nextflow run sanger-tol/variantcalling -profile test,YOURPROFILE --outdir + + # for unaligned reads + nextflow run sanger-tol/variantcalling -profile test_align,YOURPROFILE --align --outdir + ``` Note that some form of configuration will be needed so that Nextflow knows how to fetch the required software. This is usually done in the form of a config profile (`YOURPROFILE` in the example command above). You can chain multiple config profiles in a comma-separated string. @@ -49,7 +59,7 @@ Steps involved: 4. Start running your own analysis! ```bash - nextflow run sanger-tol/variantcalling --input samplesheet.csv --outdir --fasta genome.fasta.gz --fai genome.fasta.gz.fai --gzi genome.fasta.gz.gzi -profile + nextflow run sanger-tol/variantcalling --input samplesheet.csv --outdir --fasta genome.fasta.gz -profile ``` ## Credits diff --git a/assets/email_template.html b/assets/email_template.html index 8c07878..cf7fc79 100644 --- a/assets/email_template.html +++ b/assets/email_template.html @@ -3,8 +3,7 @@ - - + sanger-tol/variantcalling Pipeline Report diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 2ea95db..9de2e5b 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,4 +1,5 @@ -sample,datatype,datafile,indexfile -sample1,pacbio,/path/to/data/file/file1.bam,/path/to/index/file/file1.bam.bai -sample2,pacbio,/path/to/data/file/file2.cram,/path/to/index/file/file2.cram.crai -sample3,pacbio,/path/to/data/file/file3.bam,/path/to/index/file/file3.bam.csi +sample,datatype,datafile +sample1,pacbio,/path/to/data/file/file1.bam +sample2,pacbio,/path/to/data/file/file2.cram +sample3,pacbio,/path/to/data/file/file3-1.bam +sample3,pacbio,/path/to/data/file/file3-2.cram diff --git a/assets/samplesheet_test.csv b/assets/samplesheet_test.csv index cf5546a..6eb03e5 100644 --- a/assets/samplesheet_test.csv +++ b/assets/samplesheet_test.csv @@ -1,4 +1,5 @@ -sample,datatype,datafile,indexfile -icCanRufa1_crai,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.cram,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.cram.crai -icCanRufa1_bai,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam.bai -icCanRufa1_csi,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam.csi 
+sample,datatype,datafile +icCanRufa1_cram,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.cram +icCanRufa1_bam,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam +icCanRufa1,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.cram +icCanRufa1,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam diff --git a/assets/samplesheet_test_align.csv b/assets/samplesheet_test_align.csv new file mode 100644 index 0000000..4b5a9b2 --- /dev/null +++ b/assets/samplesheet_test_align.csv @@ -0,0 +1,4 @@ +sample,datatype,datafile +icCanRufa1,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/genomic_data/icCanRufa1/pacbio/m64094_200730_174533.ccs.bc1010_BAK8A_OA--bc1010_BAK8A_OA_0_03.bam +icCanRufa1,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/genomic_data/icCanRufa1/pacbio/m64094_200730_174533.ccs.bc1010_BAK8A_OA--bc1010_BAK8A_OA_0_02.bam +icCanRufa1XXXXX,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/genomic_data/icCanRufa1/pacbio/m64094_200730_174533.ccs.bc1010_BAK8A_OA--bc1010_BAK8A_OA_0_02.bam diff --git a/assets/samplesheet_test_full.csv b/assets/samplesheet_test_full.csv index 599c5b7..1e40e2b 100644 --- a/assets/samplesheet_test_full.csv +++ b/assets/samplesheet_test_full.csv @@ -1,2 +1,2 @@ -sample,datatype,datafile,indexfile -icCanRufa1,pacbio,/lustre/scratch123/tol/projects/.sandbox/data/insects/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1.cram,/lustre/scratch123/tol/projects/.sandbox/data/insects/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1.cram.crai +sample,datatype,datafile +icCanRufa1,pacbio,/lustre/scratch123/tol/resources/nextflow/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1.cram diff --git a/assets/samplesheet_test_full_align.csv b/assets/samplesheet_test_full_align.csv new file mode 100644 index 0000000..d5b0ac4 --- /dev/null +++ b/assets/samplesheet_test_full_align.csv @@ -0,0 +1,2 @@ +sample,datatype,datafile +ilPolIcar1,pacbio,/lustre/scratch124/tol/projects/darwin/data/insects/Polyommatus_icarus/genomic_data/ilPolIcar1/pacbio/m64016_191206_183623.ccs.bc1019_BAK8B_OA--bc1019_BAK8B_OA.bam diff --git a/assets/schema_input.json b/assets/schema_input.json index 43497e9..f264cf6 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -21,13 +21,8 @@ "type": "string", "pattern": "^\\S+\\.(bam|cram)$", "errorMessage": "Data file for reads cannot contain spaces and must have extension 'cram' or 'bam'" - }, - "indexfile": { - "type": "string", - "pattern": "^\\S+\\.(bai|csi|crai)$", - "errorMessage": "Data index file for reads cannot contain spaces and must have extension 'bai', 'csi' or 'crai'" } }, - "required": ["sample", "datatype", "datafile", "indexfile"] + "required": ["sample", "datatype", "datafile"] } } diff --git a/assets/slackreport.json b/assets/slackreport.json index 043d02f..837b798 100644 --- a/assets/slackreport.json +++ b/assets/slackreport.json @@ -3,7 +3,7 @@ { "fallback": "Plain-text summary of the attachment.", "color": "<% if (success) { %>good<% 
} else { %>danger<%} %>", - "author_name": "sanger-tol/readmapping v${version} - ${runName}", + "author_name": "sanger-tol/variantcalling v${version} - ${runName}", "author_icon": "https://www.nextflow.io/docs/latest/_static/favicon.ico", "text": "<% if (success) { %>Pipeline completed successfully!<% } else { %>Pipeline completed with errors<% } %>", "fields": [ diff --git a/assets/vectorDB.tar.gz b/assets/vectorDB.tar.gz new file mode 100644 index 0000000..f9b08d5 Binary files /dev/null and b/assets/vectorDB.tar.gz differ diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 6bbd806..3a6b9d7 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -"""Provide a command line tool to validate and transform tabular samplesheets.""" +"""Provide a command line tool to validate tabular samplesheets.""" import argparse @@ -35,7 +35,6 @@ def __init__( sample_col="sample", type_col="datatype", file_col="datafile", - index_col="indexfile", **kwargs, ): """ @@ -48,8 +47,6 @@ def __init__( the read data (default "datatype"). file_col (str): The name of the column that contains the file path for the read data (default "datafile"). - index_col (str): The name of the column that contains the index file - for the data (default "indexfile"). """ super().__init__(**kwargs) @@ -57,11 +54,10 @@ def __init__( self._sample_col = sample_col self._type_col = type_col self._file_col = file_col - self._index_col = index_col self._seen = set() - self.modified = [] + self.validated = [] - def validate_and_transform(self, row): + def validate(self, row): """ Perform all validations on the given row. @@ -73,9 +69,8 @@ def validate_and_transform(self, row): self._validate_sample(row) self._validate_type(row) self._validate_data_file(row) - self._validate_index_file(row) self._seen.add((row[self._sample_col], row[self._file_col])) - self.modified.append(row) + self.validated.append(row) def _validate_sample(self, row): """Assert that the sample name exists and convert spaces to underscores.""" @@ -98,17 +93,6 @@ def _validate_data_file(self, row): raise AssertionError("Data file is required.") self._validate_data_format(row[self._file_col]) - def _validate_index_file(self, row): - """Assert that the indexfile is non-empty and has the right format.""" - if len(row[self._index_col]) <= 0: - raise AssertionError("Data index file is required.") - if row[self._file_col].endswith("bam") and not ( - row[self._index_col].endswith("bai") or row[self._index_col].endswith("csi") - ): - raise AssertionError("bai or csi index file should be given for bam file.") - if row[self._file_col].endswith("cram") and not row[self._index_col].endswith("crai"): - raise AssertionError("crai index file shuld be given for cram file.") - def _validate_data_format(self, filename): """Assert that a given filename has one of the expected read data file extensions.""" if not any(filename.endswith(extension) for extension in self.DATA_VALID_FORMATS): @@ -121,14 +105,11 @@ def validate_unique_samples(self): """ Assert that the combination of sample name and data filename is unique. - In addition to the validation, also rename all samples to have a suffix of _T{n}, where n is the - number of times the same sample exist, but with different files, e.g., multiple runs per experiment. 
- """ - if len(self._seen) != len(self.modified): + if len(self._seen) != len(self.validated): raise AssertionError("The combination of sample name and data file must be unique.") seen = Counter() - for row in self.modified: + for row in self.validated: sample = row[self._sample_col] seen[sample] += 1 row[self._sample_col] = f"{sample}_T{seen[sample]}" @@ -162,7 +143,7 @@ def sniff_format(handle): peek = read_head(handle) handle.seek(0) sniffer = csv.Sniffer() - # same input file could retrun random true or false + # same input file could return random true or false # disable it now # the following validation should be enough # if not sniffer.has_header(peek): @@ -188,16 +169,17 @@ def check_samplesheet(file_in, file_out): This function checks that the samplesheet follows the following structure, see also the `variantcalling samplesheet`_:: - sample,datatype,datafile,indexfile - sample1,pacbio,/path/to/data/file/file1.bam,/path/to/index/file/file1.bam.bai - sample2,pacbio,/path/to/data/file/file2.cram,/path/to/index/file/file2.cram.crai - sample3,pacbio,/path/to/data/file/file3.bam,/path/to/index/file/file3.bam.csi + sample,datatype,datafile + sample1,pacbio,/path/to/data/file/file1.bam + sample2,pacbio,/path/to/data/file/file2.cram + sample3,pacbio,/path/to/data/file/file3-1.bam + sample3,pacbio,/path/to/data/file/file3-2.cram .. _variantcalling samplesheet: https://raw.githubusercontent.com/sanger-tol/variantcalling/main/assets/samplesheet.csv """ - required_columns = {"sample", "datatype", "datafile", "indexfile"} + required_columns = {"sample", "datatype", "datafile"} # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. with file_in.open(newline="") as in_handle: reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle)) @@ -210,7 +192,7 @@ def check_samplesheet(file_in, file_out): checker = RowChecker() for i, row in enumerate(reader): try: - checker.validate_and_transform(row) + checker.validate(row) except AssertionError as error: logger.critical(f"{str(error)} On line {i + 2}.") sys.exit(1) @@ -220,7 +202,7 @@ def check_samplesheet(file_in, file_out): with file_out.open(mode="w", newline="") as out_handle: writer = csv.DictWriter(out_handle, header, delimiter=",") writer.writeheader() - for row in checker.modified: + for row in checker.validated: writer.writerow(row) diff --git a/bin/pacbio_filter.sh b/bin/pacbio_filter.sh new file mode 100755 index 0000000..73d7caa --- /dev/null +++ b/bin/pacbio_filter.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +input=$1 +output=$2 + +grep -v 'MG551957' $input | awk -v OFS='\t' '{if (($2 ~ /NGB00972/ && $3 >= 97 && $4 >= 44) || ($2 ~ /NGB00973/ && $3 >= 97 && $4 >= 34) || ($2 ~ /^bc/ && $3 >= 99 && $4 >= 16)) print $1}' | sort -u > $output diff --git a/conf/base.config b/conf/base.config index b00ad1c..e08d741 100644 --- a/conf/base.config +++ b/conf/base.config @@ -15,8 +15,8 @@ process { memory = { check_max( 6.GB * task.attempt, 'memory' ) } time = { check_max( 4.h * task.attempt, 'time' ) } - errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' } - maxRetries = 1 + errorStrategy = { task.exitStatus in ((130..145) + 104) ? 
'retry' : 'finish' } + maxRetries = 5 maxErrors = '-1' // Process-specific resource requirements diff --git a/conf/modules.config b/conf/modules.config index 0f4e51c..acc63f5 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -14,14 +14,77 @@ process { withName: SAMPLESHEET_CHECK { publishDir = [ - path: { "${params.outdir}/variantcalling_info" }, + path: { "${params.outdir}/pipeline_info/variantcalling" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } + if( params.align ) { + + withName: '.*:ALIGN_PACBIO:FILTER_PACBIO:SAMTOOLS_CONVERT' { + ext.args = "-e '[rq]>=0.99' -x fi -x fp -x ri -x rp --write-index --output-fmt bam" + } + + withName: '.*:ALIGN_PACBIO:FILTER_PACBIO:SAMTOOLS_COLLATE' { + ext.prefix = { "${meta.id}.collate" } + } + + withName: '.*:ALIGN_PACBIO:FILTER_PACBIO:BLAST_BLASTN' { + ext.args = '-task blastn -reward 1 -penalty -5 -gapopen 3 -gapextend 3 -dust yes -soft_masking true -evalue .01 -searchsp 1750000000000 -outfmt 6' + } + + withName: '.*:ALIGN_PACBIO:FILTER_PACBIO:SAMTOOLS_FILTER' { + ext.prefix = { "${meta.id}.filter" } + } + + withName: '.*:ALIGN_PACBIO:FILTER_PACBIO:SAMTOOLS_FASTQ' { + ext.args = '-F 0x200 -nt' + } + + withName: '.*:.*:ALIGN_PACBIO:MINIMAP2_ALIGN' { + ext.args = { "-ax map-hifi --cs=short -R ${meta.read_group}" } + } + + withName: '.*:.*:ALIGN_PACBIO:SAMTOOLS_MERGE' { + ext.args = { "-c -p" } + ext.prefix = { "${meta.id}.merge" } + } + + withName: '.*:CONVERT_STATS:SAMTOOLS_VIEW' { + ext.prefix = { "${meta2.id}.${meta.datatype}.${meta.id}" } + ext.args = '--output-fmt cram --write-index' + } + + withName: '.*:CONVERT_STATS:SAMTOOLS_STATS' { + ext.prefix = { "${input.baseName}" } + } + + withName: '.*:CONVERT_STATS:SAMTOOLS_FLAGSTAT' { + ext.prefix = { "${bam.baseName}" } + } + + withName: '.*:CONVERT_STATS:SAMTOOLS_IDXSTATS' { + ext.prefix = { "${bam.baseName}" } + } + + withName: '.*:ALIGN_PACBIO:CONVERT_STATS:.*' { + publishDir = [ + path: { "${params.outdir}/variant_calling" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } else{ + + withName: '.*:INPUT_MERGE:SAMTOOLS_MERGE' { + ext.args = '--write-index' + } + } + withName: '.*:INPUT_FILTER_SPLIT:SAMTOOLS_VIEW' { ext.args = '--output-fmt cram --write-index -F 0x900' + ext.prefix = { "${meta.id}_filtered" } } withName: '.*:DEEPVARIANT_CALLER:DEEPVARIANT' { @@ -46,9 +109,32 @@ process { ] } + withName: '.*:PROCESS_VCF:VCFTOOLS_SITE_PI' { + ext.args = '--site-pi' + if( params.include_positions ){ + ext.args += ' --positions' + } else if ( params.exclude_positions ){ + ext.args += ' --exclude-positions' + } + publishDir = [ + path: { "${params.outdir}/variant_calling" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*:PROCESS_VCF:VCFTOOLS_HET' { + ext.args = '--het' + publishDir = [ + path: { "${params.outdir}/variant_calling" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }
+        ]
+    }
+
     withName: CUSTOM_DUMPSOFTWAREVERSIONS {
        publishDir = [
-            path: { "${params.outdir}/variantcalling_info" },
+            path: { "${params.outdir}/pipeline_info/variantcalling" },
            mode: params.publish_dir_mode,
            pattern: '*_versions.yml'
        ]
    }
diff --git a/conf/test.config b/conf/test.config
index be32ebe..01515e9 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -25,9 +25,9 @@ params {
    // Fasta references
    fasta = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz'

-    // Reference index file
-    fai = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz.fai'
-    gzi = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz.gzi'
+    // Reference index files (optional)
+    // fai = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz.fai'
+    // gzi = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz.gzi'

    // Interval bed file
    interval = 'https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bed'
diff --git a/conf/test_align.config b/conf/test_align.config
new file mode 100644
index 0000000..8da6b65
--- /dev/null
+++ b/conf/test_align.config
@@ -0,0 +1,27 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+
+    Use as follows:
+        nextflow run sanger-tol/variantcalling -profile test_align,<docker/singularity> --outdir <OUTDIR> --align
+
+----------------------------------------------------------------------------------------
+*/
+
+params {
+    config_profile_name        = 'Test profile with alignment'
+    config_profile_description = 'Minimal unaligned test dataset to check pipeline function'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 2
+    max_memory = '6.GB'
+    max_time   = '6.h'
+
+    // Input data
+    input = "${projectDir}/assets/samplesheet_test_align.csv"
+
+    // Fasta references
+    fasta = "https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz"
+}
diff --git a/conf/test_full.config b/conf/test_full.config
index 12dc8ef..3a2d38e 100644
--- a/conf/test_full.config
+++ b/conf/test_full.config
@@ -10,6 +10,8 @@
 ----------------------------------------------------------------------------------------
 */

+cleanup = true
+
 params {
    config_profile_name        = 'Full test profile'
    config_profile_description = 'Full test dataset to check pipeline function'
@@ -21,6 +23,5 @@ params {
    fasta = '/lustre/scratch124/tol/projects/darwin/data/insects/Cantharis_rufa/assembly/release/icCanRufa1.1/insdc/GCA_947369205.1.fasta.gz'

    // Reference index file
-    fai = '/lustre/scratch124/tol/projects/darwin/data/insects/Cantharis_rufa/assembly/release/icCanRufa1.1/insdc/GCA_947369205.1.fasta.gz.fai'
-    gzi = '/lustre/scratch124/tol/projects/darwin/data/insects/Cantharis_rufa/assembly/release/icCanRufa1.1/insdc/GCA_947369205.1.fasta.gz.gzi'
+    fai = '/lustre/scratch124/tol/projects/darwin/data/insects/Cantharis_rufa/assembly/release/icCanRufa1.1/insdc/GCA_947369205.1.fasta.gz.fai'
 }
diff --git a/conf/test_full_align.config b/conf/test_full_align.config
new file mode 100644
index 0000000..79b9fd7
--- /dev/null
+++ b/conf/test_full_align.config
@@ -0,0 +1,25 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running full-size tests with alignment
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a full size pipeline test.
+ + Use as follows: + nextflow run sanger-tol/variantcalling -profile test_full_align, --outdir --align + +---------------------------------------------------------------------------------------- +*/ + +cleanup = true + +params { + config_profile_name = 'Full test profile with alignment' + config_profile_description = 'Full non-aligned test dataset to check pipeline function' + + // Input data for full size test + input = "${projectDir}/assets/samplesheet_test_full_align.csv" + + // Fasta references + fasta = "/lustre/scratch124/tol/projects/darwin/data/insects/Polyommatus_icarus/assembly/release/ilPolIcar1.1/insdc/GCA_937595015.1.fasta.gz" + +} diff --git a/docs/images/mermaid-diagram.mmd b/docs/images/mermaid-diagram.mmd new file mode 100644 index 0000000..b447cae --- /dev/null +++ b/docs/images/mermaid-diagram.mmd @@ -0,0 +1,57 @@ +--- +title: Sanger-tol/Variantcalling Workflow +--- + +%%{ init: { +'gitGraph': {'mainBranchName': 'BAM/CRAM' +}, +'themeVariables': { +'commitLabelFontSize': '18px' +} +} +}%% +gitGraph TB: +commit id: "START" +branch Fasta order: 4 +commit id: "SAMTOOLS_FAIDX" +checkout BAM/CRAM +commit id: "SAMPLESHEET_CHECK" +branch AlignedReads order: 3 +branch UnAlignedReads order: 2 +commit id: "SAMTOOLS_COLLATE" +commit id: "SAMTOOLS_FASTA" +commit id: "BLAST_BLASTN" +commit id: "PACBIO_FILTER" +commit id: "PACBIO_SAMTOOLS_FILTER" +commit id: "SAMTOOLS_FASTQ" +commit id: "MINIMAP2_ALIGN" +commit id: "SAMTOOLS_MERGE_BY_SAMPLE 1" type: HIGHLIGHT +commit id: "SAMTOOLS_STATS" type: HIGHLIGHT +commit id: "SAMTOOLS_FLAGSTAT" type: HIGHLIGHT +commit id: "SAMTOOLS_IDXSTATS" type: HIGHLIGHT +checkout BAM/CRAM +merge UnAlignedReads +checkout Fasta +branch SplitFasta order: 5 +commit id: "FASTA_SPLIT" +branch DeepVariant order: 6 +checkout AlignedReads +merge Fasta +commit id: "SAMTOOLS_SORT" +commit id: "SAMTOOLS_MERGE_BY_SAMPLE 2" +checkout BAM/CRAM +merge AlignedReads +commit id: "SAMTOOLS_FILTER" +checkout DeepVariant +merge BAM/CRAM +commit id: "DEEPVARIANT" +commit id: "BCFTOOLS_CONCAT_VCF" type: HIGHLIGHT +branch "VCFtools" order: 7 +commit id: "VCFTOOLS_SITE_PI" type: HIGHLIGHT +commit id: "VCFTOOLS_HET" type: HIGHLIGHT +checkout DeepVariant +commit id: "BCFTOOLS_CONCAT_GVCF" type: HIGHLIGHT +checkout BAM/CRAM +merge VCFtools +merge DeepVariant +commit id: "DUMPSOFTWAREVERSIONS" type: HIGHLIGHT diff --git a/docs/images/mermaid-diagram.png b/docs/images/mermaid-diagram.png new file mode 100644 index 0000000..b0dee45 Binary files /dev/null and b/docs/images/mermaid-diagram.png differ diff --git a/docs/output.md b/docs/output.md index 946a1c1..fa2258c 100644 --- a/docs/output.md +++ b/docs/output.md @@ -10,9 +10,49 @@ The directories listed below will be created in the results directory after the The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: +- [Readmapping Alignments](#readmapping-alignments) - Optional aligned CRAM files generated by minimap2 +- [Alignments Statistics](#alignments-statistics) Optional statistics files generated by samtools +- [VCFtools Processing](#vcftools-processing) Heterozygosity and per site nucleotide diversity calculated by VCFtools - [PacBio Variant Calling](#pacbio-variant-calling) - VCF and GVCF compressed files generated by DeepVariant - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution +### Readmapping Alignments + +The unaligned PacBio read data is being filtered and aligned using `minimap2`. 
The CRAM files from the same sample will be merged. + +
+Output files + +- `readmapping` + - Aligned CRAM files: `.pacbio..cram`. + - Aligned CRAM index files: `.pacbio..cram.crai`. + +
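+
+For illustration, the alignment stage is roughly equivalent to the following stand-alone commands (a sketch only; the authoritative settings are the module arguments in `conf/modules.config`, and all file names and thread counts here are placeholders):
+
+```bash
+# Convert the filtered reads to FASTQ, align with minimap2 in HiFi mode,
+# and produce a coordinate-sorted, indexed BAM. Placeholder file names.
+samtools fastq -F 0x200 -nt filtered.bam |
+    minimap2 -ax map-hifi --cs=short -t 8 genome.fasta.gz - |
+    samtools sort -@ 8 -o sample.pacbio.bam -
+samtools index sample.pacbio.bam
+```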
+ +### Alignments Statistics + +The statistics for the aligned CRAM files will be calculated using `samtools`. + +
+Output files + +- `statistics` + - Comprehensive statistics from alignment file: `.pacbio..stats`. + - Number of alignments for each FLAG type: `.pacbio..flagstats`. + - Alignment summary statistics: `.pacbio..idxstats`. + +
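+
+These reports correspond to the standard samtools commands; roughly (placeholder file names, and CRAM access may additionally need `--reference genome.fasta.gz`):
+
+```bash
+# Standard samtools reports for an aligned, indexed CRAM.
+samtools stats sample.pacbio.cram > sample.pacbio.stats
+samtools flagstat sample.pacbio.cram > sample.pacbio.flagstats
+samtools idxstats sample.pacbio.cram > sample.pacbio.idxstats
+```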
+ +### VCFtools Processing + +
+Output files + +- Heterozygosity generated by VCFtools: `.pacbio._deepvariant.vcf.het`. +- Per site nucleotide diversity calculated by VCFtools: `.pacbio._deepvariant.vcf.sites.pi`. + +
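+
+The two reports map onto VCFtools' `--het` and `--site-pi` analyses; a stand-alone equivalent could look like this (placeholder file names; the pipeline's `--include_positions`/`--exclude_positions` parameters correspond to VCFtools' `--positions`/`--exclude-positions` options):
+
+```bash
+# Per-site nucleotide diversity; writes <prefix>.sites.pi
+vcftools --gzvcf sample_deepvariant.vcf.gz --site-pi --out sample_deepvariant.vcf
+# Per-sample heterozygosity; writes <prefix>.het
+vcftools --gzvcf sample_deepvariant.vcf.gz --het --out sample_deepvariant.vcf
+```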
+ ### PacBio Variant Calling The aligned PacBio read data is used to call variants with DeepVariant. This is done by splitting the genome fasta file for speed efficiency. `BCFTOOLS` is used to combine the split `VCF` and `GVCF` files generated by `DEEPVARIANT`. @@ -21,8 +61,8 @@ The aligned PacBio read data is used to call variants with DeepVariant. This is Output files - `variant_calling` - - Compressed VCF files: `_deepvariant.vcf.gz`. - - Compressed GVCF files: `_deepvariant.g.vcf.gz`. + - Compressed VCF files: `.pacbio._deepvariant.vcf.gz`. + - Compressed GVCF files: `.pacbio._deepvariant.g.vcf.gz`. @@ -31,7 +71,7 @@ The aligned PacBio read data is used to call variants with DeepVariant. This is
Output files

-- `variantcalling_info/`
+- `pipeline_info/variantcalling/`
   - Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`.
   - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameter's are used when running the pipeline.
   - Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`.
diff --git a/docs/usage.md b/docs/usage.md
index 416dfdb..4fbed55 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -2,11 +2,11 @@

 ## Introduction

-The pipleline takes aligned sample reads (CRAM/BAM files and their index files) from a CSV file and a reference file in FASTA format, and then use DeepVariant to call variants.
+The pipeline takes aligned or unaligned sample reads (CRAM/BAM files) from a CSV file and a reference file in FASTA format, and then uses DeepVariant to call variants.

 ## Samplesheet input

-You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use the `input` parameter to specify the samplesheet location. It has to be a comma-separated file with at least 4 columns, and a header row as shown in the examples below.
+You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use the `input` parameter to specify the samplesheet location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below.

 ```bash
 --input '[path to samplesheet file]'
 ```

 ### Multiple runs of the same sample

 The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. Below is an example for the same sample sequenced across 3 lanes:

 ```console
-sample,datatype,datafile,indexfile
-sample1,pacbio,sample1_1.cram,sample1_1.cram.crai
-sample1,pacbio,sample1_2.cram,sample1_3.cram.crai
-sample1,pacbio,sample1_3.cram,sample1_3.cram.crai
+sample,datatype,datafile
+sample1,pacbio,sample1_1.cram
+sample1,pacbio,sample1_2.cram
+sample1,pacbio,sample1_3.cram
 ```

 ### Full samplesheet

 A final samplesheet file consisting of both BAM or CRAM will look like this. Currently this pipeline only supports Pacbio aligned data.

 ```console
-sample,datatype,datafile,indexfile
-sample1,pacbio,/path/to/data/file/file1.bam,/path/to/index/file/file1.bam.bai
-sample2,pacbio,/path/to/data/file/file2.cram,/path/to/index/file/file2.cram.crai
-sample3,pacbio,/path/to/data/file/file3.bam,/path/to/index/file/file3.bam.csi
+sample,datatype,datafile
+sample1,pacbio,/path/to/data/file/file1.bam
+sample2,pacbio,/path/to/data/file/file2.cram
+sample3,pacbio,/path/to/data/file/file3.bam
 ```

-| Column      | Description                                                                                                                                                                            |
-| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `sample`    | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
-| `datatype`  | Sequencing data type. Must be `pacbio`.                                                                                                                                                |
-| `datafile`  | The location for either BAM or CRAM file.                                                                                                                                              |
-| `indexfile` | The location for BAM or CRAM index file – BAI, CSI or CRAI.                                                                                                                            |
+| Column     | Description                                                                                                                                                                            |
+| ---------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `sample`   | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
+| `datatype` | Sequencing data type. Must be `pacbio`.                                                                                                                                                |
+| `datafile` | The location of the BAM or CRAM file.                                                                                                                                                  |

 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.

@@ -48,7 +47,7 @@

 The typical command for running the pipeline is as follows:

 ```bash
-nextflow run sanger-tol/variantcalling --input samplesheet.csv --outdir <OUTDIR> --fasta genome.fasta.gz --fai genome.fasta.gz.fai --gzi genome.fasta.gz.gzi -profile docker
+nextflow run sanger-tol/variantcalling --input samplesheet.csv --outdir <OUTDIR> --fasta genome.fasta.gz -profile docker
 ```

 This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles.

@@ -62,7 +61,9 @@ work                # Directory containing the nextflow working files
 # Other nextflow hidden files, eg. history of pipeline runs and old logs.
 ```

-The pipeline will split the intput fasta file into smaller files to run DeepVariant parallel. You can set the minimum split fasta file size from the command line. For example to set the minimum size as 10K using `--split_fasta_cutoff 10000`.
+The pipeline will split the input fasta file into smaller files to run DeepVariant in parallel. You can set the minimum split fasta file size from the command line, e.g. use `--split_fasta_cutoff 10000` to set the minimum size to 10 kb.
+
+If the input BAM/CRAM files are not aligned, please add `--align` to your command. Please don't use this flag if the input files are already aligned, because the current workflow will not re-align already-aligned PacBio reads.

 ### Updating the pipeline

@@ -80,6 +81,10 @@ First, go to the [sanger-tol/variantcalling releases page](https://github.com/sa

 This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future.

+To further assist in reproducibility, you can share and re-use [parameter files](#running-the-pipeline) to repeat pipeline runs with the same settings without having to write out a command with every single parameter.
+
+> 💡 If you wish to share such a profile (e.g. to upload as supplementary material for an academic publication), make sure NOT to include cluster-specific paths to files, nor institution-specific profiles.
+
 ## Core Nextflow arguments

 > **NB:** These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen).

 ### `-profile`

 Use this parameter to choose a configuration profile. Profiles can give configuration presets for different compute environments.

-Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Conda) - see below.
+Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Apptainer, Conda) - see below. > We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported. @@ -112,8 +117,10 @@ If `-profile` is not specified, the pipeline will run locally and expect all sof - A generic configuration profile to be used with [Shifter](https://nersc.gitlab.io/development/shifter/how-to-use/) - `charliecloud` - A generic configuration profile to be used with [Charliecloud](https://hpc.github.io/charliecloud/) +- `apptainer` + - A generic configuration profile to be used with [Apptainer](https://apptainer.org/) - `conda` - - A generic configuration profile to be used with [Conda](https://conda.io/docs/). Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter or Charliecloud. + - A generic configuration profile to be used with [Conda](https://conda.io/docs/). Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter, Charliecloud, or Apptainer. ### `-resume` @@ -131,102 +138,19 @@ Specify the path to a specific config file (this is a core Nextflow command). Se Whilst the default requirements set within the pipeline will hopefully work for most people and with most input data, you may find that you want to customise the compute resources that the pipeline requests. Each step in the pipeline has a default set of requirements for number of CPUs, memory and time. For most of the steps in the pipeline, if the job exits with any of the error codes specified [here](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/base.config#L18) it will automatically be resubmitted with higher requests (2 x original, then 3 x original). If it still fails after the third attempt then the pipeline execution is stopped. -For example, if the nf-core/rnaseq pipeline is failing after multiple re-submissions of the `STAR_ALIGN` process due to an exit code of `137` this would indicate that there is an out of memory issue: - -```console -[62/149eb0] NOTE: Process `NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN (WT_REP1)` terminated with an error exit status (137) -- Execution is retried (1) -Error executing process > 'NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN (WT_REP1)' - -Caused by: - Process `NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN (WT_REP1)` terminated with an error exit status (137) - -Command executed: - STAR \ - --genomeDir star \ - --readFilesIn WT_REP1_trimmed.fq.gz \ - --runThreadN 2 \ - --outFileNamePrefix WT_REP1. \ - - -Command exit status: - 137 - -Command output: - (empty) - -Command error: - .command.sh: line 9: 30 Killed STAR --genomeDir star --readFilesIn WT_REP1_trimmed.fq.gz --runThreadN 2 --outFileNamePrefix WT_REP1. -Work dir: - /home/pipelinetest/work/9d/172ca5881234073e8d76f2a19c88fb - -Tip: you can replicate the issue by changing to the process work dir and entering the command `bash .command.run` -``` - -#### For beginners - -A first step to bypass this error, you could try to increase the amount of CPUs, memory, and time for the whole pipeline. Therefor you can try to increase the resource for the parameters `--max_cpus`, `--max_memory`, and `--max_time`. Based on the error above, you have to increase the amount of memory. 
Therefore you can go to the [parameter documentation of rnaseq](https://nf-co.re/rnaseq/3.9/parameters) and scroll down to the `show hidden parameter` button to get the default value for `--max_memory`. In this case 128GB, you than can try to run your pipeline again with `--max_memory 200GB -resume` to skip all process, that were already calculated. If you can not increase the resource of the complete pipeline, you can try to adapt the resource for a single process as mentioned below. - -#### Advanced option on process level - -To bypass this error you would need to find exactly which resources are set by the `STAR_ALIGN` process. The quickest way is to search for `process STAR_ALIGN` in the [nf-core/rnaseq Github repo](https://github.com/nf-core/rnaseq/search?q=process+STAR_ALIGN). -We have standardised the structure of Nextflow DSL2 pipelines such that all module files will be present in the `modules/` directory and so, based on the search results, the file we want is `modules/nf-core/star/align/main.nf`. -If you click on the link to that file you will notice that there is a `label` directive at the top of the module that is set to [`label process_high`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/modules/nf-core/software/star/align/main.nf#L9). -The [Nextflow `label`](https://www.nextflow.io/docs/latest/process.html#label) directive allows us to organise workflow processes in separate groups which can be referenced in a configuration file to select and configure subset of processes having similar computing requirements. -The default values for the `process_high` label are set in the pipeline's [`base.config`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/base.config#L33-L37) which in this case is defined as 72GB. -Providing you haven't set any other standard nf-core parameters to **cap** the [maximum resources](https://nf-co.re/usage/configuration#max-resources) used by the pipeline then we can try and bypass the `STAR_ALIGN` process failure by creating a custom config file that sets at least 72GB of memory, in this case increased to 100GB. -The custom config below can then be provided to the pipeline via the [`-c`](#-c) parameter as highlighted in previous sections. - -```nextflow -process { - withName: 'NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN' { - memory = 100.GB - } -} -``` - -> **NB:** We specify the full process name i.e. `NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN` in the config file because this takes priority over the short name (`STAR_ALIGN`) and allows existing configuration using the full process name to be correctly overridden. -> -> If you get a warning suggesting that the process selector isn't recognised check that the process name has been specified correctly. - -### Updating containers (advanced users) - -The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. If for some reason you need to use a different version of a particular tool with the pipeline then you just need to identify the `process` name and override the Nextflow `container` definition for that process using the `withName` declaration. For example, in the [nf-core/viralrecon](https://nf-co.re/viralrecon) pipeline a tool called [Pangolin](https://github.com/cov-lineages/pangolin) has been used during the COVID-19 pandemic to assign lineages to SARS-CoV-2 genome sequenced samples. 
Given that the lineage assignments change quite frequently it doesn't make sense to re-release the nf-core/viralrecon everytime a new version of Pangolin has been released. However, you can override the default container used by the pipeline by creating a custom config file and passing it as a command-line argument via `-c custom.config`. - -1. Check the default version used by the pipeline in the module file for [Pangolin](https://github.com/nf-core/viralrecon/blob/a85d5969f9025409e3618d6c280ef15ce417df65/modules/nf-core/software/pangolin/main.nf#L14-L19) -2. Find the latest version of the Biocontainer available on [Quay.io](https://quay.io/repository/biocontainers/pangolin?tag=latest&tab=tags) -3. Create the custom config accordingly: - - - For Docker: +To change the resource requests, please see the [max resources](https://nf-co.re/docs/usage/configuration#max-resources) and [tuning workflow resources](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources) section of the nf-core website. - ```nextflow - process { - withName: PANGOLIN { - container = 'quay.io/biocontainers/pangolin:3.0.5--pyhdfd78af_0' - } - } - ``` +### Custom Containers - - For Singularity: +In some cases you may wish to change which container or conda environment a step of the pipeline uses for a particular tool. By default nf-core pipelines use containers and software from the [biocontainers](https://biocontainers.pro/) or [bioconda](https://bioconda.github.io/) projects. However in some cases the pipeline specified version maybe out of date. - ```nextflow - process { - withName: PANGOLIN { - container = 'https://depot.galaxyproject.org/singularity/pangolin:3.0.5--pyhdfd78af_0' - } - } - ``` +To use a different container from the default container or conda environment specified in a pipeline, please see the [updating tool versions](https://nf-co.re/docs/usage/configuration#updating-tool-versions) section of the nf-core website. - - For Conda: +### Custom Tool Arguments - ```nextflow - process { - withName: PANGOLIN { - conda = 'bioconda::pangolin=3.0.5' - } - } - ``` +A pipeline might not always support every possible argument or option of a particular tool used in pipeline. Fortunately, nf-core pipelines provide some freedom to users to insert additional parameters that the pipeline does not include by default. -> **NB:** If you wish to periodically update individual tool-specific results (e.g. Pangolin) generated by the pipeline then you must ensure to keep the `work/` directory otherwise the `-resume` ability of the pipeline will be compromised and it will restart from scratch. +To learn how to provide additional arguments to a particular tool of the pipeline, please see the [customising tool arguments](https://nf-co.re/docs/usage/configuration#customising-tool-arguments) section of the nf-core website. ### nf-core/configs diff --git a/lib/NfcoreSchema.groovy b/lib/NfcoreSchema.groovy index 33cd4f6..9b34804 100755 --- a/lib/NfcoreSchema.groovy +++ b/lib/NfcoreSchema.groovy @@ -2,6 +2,7 @@ // This file holds several functions used to perform JSON parameter validation, help and summary rendering for the nf-core pipeline template. 
// +import nextflow.Nextflow import org.everit.json.schema.Schema import org.everit.json.schema.loader.SchemaLoader import org.everit.json.schema.ValidationException @@ -83,6 +84,7 @@ class NfcoreSchema { 'stub-run', 'test', 'w', + 'with-apptainer', 'with-charliecloud', 'with-conda', 'with-dag', @@ -177,7 +179,7 @@ class NfcoreSchema { } if (has_error) { - System.exit(1) + Nextflow.error('Exiting!') } } diff --git a/lib/NfcoreTemplate.groovy b/lib/NfcoreTemplate.groovy index 79cfdb4..bbd2fc1 100755 --- a/lib/NfcoreTemplate.groovy +++ b/lib/NfcoreTemplate.groovy @@ -120,7 +120,7 @@ class NfcoreTemplate { } // Write summary e-mail HTML to a file - def output_d = new File("${params.outdir}/pipeline_info/") + def output_d = new File("${params.outdir}/pipeline_info/variantcalling") if (!output_d.exists()) { output_d.mkdirs() } diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index a5850eb..0309eb4 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -2,6 +2,8 @@ // This file holds several functions specific to the main.nf workflow in the sanger-tol/variantcalling pipeline // +import nextflow.Nextflow + class WorkflowMain { // @@ -20,7 +22,7 @@ class WorkflowMain { // // Generate help string // - public static String help(workflow, params, log) { + public static String help(workflow, params) { def command = "nextflow run ${workflow.manifest.name} --input samplesheet.csv --fasta reference.fa --fai reference.fai --outdir results -profile docker" def help_string = '' help_string += NfcoreTemplate.logo(workflow, params.monochrome_logs) @@ -33,7 +35,7 @@ class WorkflowMain { // // Generate parameter summary log string // - public static String paramsSummaryLog(workflow, params, log) { + public static String paramsSummaryLog(workflow, params) { def summary_log = '' summary_log += NfcoreTemplate.logo(workflow, params.monochrome_logs) summary_log += NfcoreSchema.paramsSummaryLog(workflow, params) @@ -48,7 +50,7 @@ class WorkflowMain { public static void initialise(workflow, params, log) { // Print help to screen if required if (params.help) { - log.info help(workflow, params, log) + log.info help(workflow, params) System.exit(0) } @@ -60,7 +62,7 @@ class WorkflowMain { } // Print parameter summary log to screen - log.info paramsSummaryLog(workflow, params, log) + log.info paramsSummaryLog(workflow, params) // Validate workflow parameters via the JSON schema if (params.validate_params) { @@ -77,8 +79,7 @@ class WorkflowMain { // Check input has been provided if (!params.input) { - log.error "Please provide an input samplesheet to the pipeline e.g. '--input samplesheet.csv'" - System.exit(1) + Nextflow.error("Please provide an input samplesheet to the pipeline e.g. '--input samplesheet.csv'") } } } diff --git a/lib/WorkflowVariantcalling.groovy b/lib/WorkflowVariantcalling.groovy index dad2c91..523a779 100755 --- a/lib/WorkflowVariantcalling.groovy +++ b/lib/WorkflowVariantcalling.groovy @@ -2,6 +2,7 @@ // This file holds several functions specific to the workflow/variantcalling.nf in the sanger-tol/variantcalling pipeline // +import nextflow.Nextflow import groovy.text.SimpleTemplateEngine class WorkflowVariantcalling { @@ -12,9 +13,7 @@ class WorkflowVariantcalling { public static void initialise(params, log) { if (!params.fasta) { - log.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file." - System.exit(1) + Nextflow.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file." 
} } - } diff --git a/modules.json b/modules.json index 5ab5a99..f5561b1 100644 --- a/modules.json +++ b/modules.json @@ -7,34 +7,101 @@ "nf-core": { "bcftools/concat": { "branch": "master", - "git_sha": "fa12afdf5874c1d11e4a20efe81c97935e8eea24", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "blast/blastn": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "cat/cat": { "branch": "master", - "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "custom/dumpsoftwareversions": { "branch": "master", - "git_sha": "7101db4432d3268b7fcb5b8f75fa0a022dc5561b", + "git_sha": "05c280924b6c768d484c7c443dad5e605c4ff4b4", "installed_by": ["modules"] }, "deepvariant": { "branch": "master", - "git_sha": "58b5e78506e66f7ecd610fa825890ed9fb98b793", - "installed_by": ["modules"], - "patch": "modules/nf-core/deepvariant/deepvariant.diff" + "git_sha": "516189e968feb4ebdd9921806988b4c12b4ac2dc", + "installed_by": ["modules"] + }, + "gunzip": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "minimap2/align": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "samtools/collate": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] }, "samtools/faidx": { "branch": "master", - "git_sha": "371eff7748d769c2ddc8bd593773523a364a52fe", + "git_sha": "fd742419940e01ba1c5ecb172c3e32ec840662fe", + "installed_by": ["modules"] + }, + "samtools/fasta": { + "branch": "master", + "git_sha": "6f4299292ef2c5b66e6829527b2647c301b77cc9", + "installed_by": ["modules"] + }, + "samtools/fastq": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "samtools/flagstat": { + "branch": "master", + "git_sha": "63e817de8c617131447192ab2c4e70b4ed4071f7", + "installed_by": ["modules"] + }, + "samtools/idxstats": { + "branch": "master", + "git_sha": "63e817de8c617131447192ab2c4e70b4ed4071f7", + "installed_by": ["modules"] + }, + "samtools/merge": { + "branch": "master", + "git_sha": "e7ce60acc8a33fa17429e966364657a63016e870", + "installed_by": ["modules"], + "patch": "modules/nf-core/samtools/merge/samtools-merge.diff" + }, + "samtools/sort": { + "branch": "master", + "git_sha": "a0f7be95788366c1923171e358da7d049eb440f9", + "installed_by": ["modules"] + }, + "samtools/stats": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "samtools/view": { "branch": "master", - "git_sha": "371eff7748d769c2ddc8bd593773523a364a52fe", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"], + "patch": "modules/nf-core/samtools/view/samtools-view.diff" + }, + "untar": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] + }, + "vcftools": { + "branch": "master", + "git_sha": "485558b40040fc3ace093d9084210125d8ba4c97", + "installed_by": ["modules"], + "patch": "modules/nf-core/vcftools/vcftools.diff" } } } diff --git a/modules/local/pacbio_filter.nf b/modules/local/pacbio_filter.nf new file mode 100644 index 0000000..18dd11c --- /dev/null +++ b/modules/local/pacbio_filter.nf @@ -0,0 +1,30 @@ +process PACBIO_FILTER { + tag "$meta.id" + label 'process_single' + + conda 
"conda-forge::gawk=5.1.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gawk:5.1.0' : + 'quay.io/biocontainers/gawk:5.1.0' }" + + input: + tuple val(meta), path(txt) + + output: + path("*.blocklist"), emit: list + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + pacbio_filter.sh $txt ${prefix}.blocklist + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + GNU Awk: \$(echo \$(awk --version 2>&1) | grep -i awk | sed 's/GNU Awk //; s/,.*//') + END_VERSIONS + """ +} diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf index 264bee0..0505378 100644 --- a/modules/local/samplesheet_check.nf +++ b/modules/local/samplesheet_check.nf @@ -5,7 +5,7 @@ process SAMPLESHEET_CHECK { conda "conda-forge::python=3.8.3" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/python:3.8.3' : - 'quay.io/biocontainers/python:3.8.3' }" + 'biocontainers/python:3.8.3' }" input: path samplesheet diff --git a/modules/nf-core/bcftools/concat/main.nf b/modules/nf-core/bcftools/concat/main.nf index de9ba67..244a42c 100644 --- a/modules/nf-core/bcftools/concat/main.nf +++ b/modules/nf-core/bcftools/concat/main.nf @@ -5,7 +5,7 @@ process BCFTOOLS_CONCAT { conda "bioconda::bcftools=1.17" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/bcftools:1.17--haef29d1_0': - 'quay.io/biocontainers/bcftools:1.17--haef29d1_0' }" + 'biocontainers/bcftools:1.17--haef29d1_0' }" input: tuple val(meta), path(vcfs), path(tbi) diff --git a/modules/nf-core/blast/blastn/environment.yml b/modules/nf-core/blast/blastn/environment.yml new file mode 100644 index 0000000..cb9b15d --- /dev/null +++ b/modules/nf-core/blast/blastn/environment.yml @@ -0,0 +1,7 @@ +name: blast_blastn +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::blast=2.14.1 diff --git a/modules/nf-core/blast/blastn/main.nf b/modules/nf-core/blast/blastn/main.nf new file mode 100644 index 0000000..e8b96ad --- /dev/null +++ b/modules/nf-core/blast/blastn/main.nf @@ -0,0 +1,57 @@ +process BLAST_BLASTN { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/blast:2.14.1--pl5321h6f7f691_0': + 'biocontainers/blast:2.14.1--pl5321h6f7f691_0' }" + + input: + tuple val(meta) , path(fasta) + tuple val(meta2), path(db) + + output: + tuple val(meta), path('*.txt'), emit: txt + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def is_compressed = fasta.getExtension() == "gz" ? true : false + def fasta_name = is_compressed ? 
fasta.getBaseName() : fasta + + """ + if [ "${is_compressed}" == "true" ]; then + gzip -c -d ${fasta} > ${fasta_name} + fi + + DB=`find -L ./ -name "*.nin" | sed 's/\\.nin\$//'` + blastn \\ + -num_threads ${task.cpus} \\ + -db \$DB \\ + -query ${fasta_name} \\ + ${args} \\ + -out ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + blast: \$(blastn -version 2>&1 | sed 's/^.*blastn: //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + blast: \$(blastn -version 2>&1 | sed 's/^.*blastn: //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/blast/blastn/meta.yml b/modules/nf-core/blast/blastn/meta.yml new file mode 100644 index 0000000..a0d64dd --- /dev/null +++ b/modules/nf-core/blast/blastn/meta.yml @@ -0,0 +1,55 @@ +name: blast_blastn +description: Queries a BLAST DNA database +keywords: + - fasta + - blast + - blastn + - DNA sequence +tools: + - blast: + description: | + BLAST finds regions of similarity between biological sequences. + homepage: https://blast.ncbi.nlm.nih.gov/Blast.cgi + documentation: https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=Blastdocs + doi: 10.1016/S0022-2836(05)80360-2 + licence: ["US-Government-Work"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Input fasta file containing queries sequences + pattern: "*.{fa,fasta,fa.gz,fasta.gz}" + - meta2: + type: map + description: | + Groovy Map containing db information + e.g. [ id:'test2', single_end:false ] + - db: + type: directory + description: Directory containing the blast database + pattern: "*" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - txt: + type: file + description: File containing blastn hits + pattern: "*.txt" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" +maintainers: + - "@joseespinosa" + - "@drpatelh" + - "@vagkaratzas" diff --git a/modules/nf-core/blast/blastn/tests/main.nf.test b/modules/nf-core/blast/blastn/tests/main.nf.test new file mode 100644 index 0000000..1058c81 --- /dev/null +++ b/modules/nf-core/blast/blastn/tests/main.nf.test @@ -0,0 +1,71 @@ +nextflow_process { + + name "Test Process BLAST_BLASTN" + script "../main.nf" + process "BLAST_BLASTN" + config "./nextflow.config" + tag "modules" + tag "modules_nfcore" + tag "blast" + tag "blast/blastn" + + setup { + run("BLAST_MAKEBLASTDB") { + script "../../makeblastdb/main.nf" + process { + """ + input[0] = [ [id:'test2'], file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) ] + """ + } + } + } + + test("Should search for nucleotide hits against a blast db") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [id:'test'], file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) ] + input[1] = BLAST_MAKEBLASTDB.out.db + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.txt.get(0).get(1)).getText().contains("Query= MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate") }, + { assert process.out.versions } + ) + } + + } + + test("Should search for zipped nucleotide hits against a blast db") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [id:'test'], file(params.test_data['sarscov2']['genome']['genome_fasta_gz'], checkIfExists: true) ] + input[1] = BLAST_MAKEBLASTDB.out.db + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.txt.get(0).get(1)).getText().contains("Query= MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate") }, + { assert process.out.versions } + ) + } + + } + +} diff --git a/modules/nf-core/blast/blastn/tests/nextflow.config b/modules/nf-core/blast/blastn/tests/nextflow.config new file mode 100644 index 0000000..0899289 --- /dev/null +++ b/modules/nf-core/blast/blastn/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: BLAST_MAKEBLASTDB { + ext.args = '-dbtype nucl' + } +} diff --git a/modules/nf-core/blast/blastn/tests/tags.yml b/modules/nf-core/blast/blastn/tests/tags.yml new file mode 100644 index 0000000..b4588ab --- /dev/null +++ b/modules/nf-core/blast/blastn/tests/tags.yml @@ -0,0 +1,2 @@ +blast/blastn: + - modules/nf-core/blast/blastn/** diff --git a/modules/nf-core/cat/cat/main.nf b/modules/nf-core/cat/cat/main.nf index 840af4b..9f06221 100644 --- a/modules/nf-core/cat/cat/main.nf +++ b/modules/nf-core/cat/cat/main.nf @@ -5,7 +5,7 @@ process CAT_CAT { conda "conda-forge::pigz=2.3.4" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/pigz:2.3.4' : - 'quay.io/biocontainers/pigz:2.3.4' }" + 'biocontainers/pigz:2.3.4' }" input: tuple val(meta), path(files_in) diff --git a/modules/nf-core/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf index 800a609..c9d014b 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/main.nf +++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf @@ -2,10 +2,10 @@ process CUSTOM_DUMPSOFTWAREVERSIONS { label 'process_single' // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container - conda "bioconda::multiqc=1.14" + conda "bioconda::multiqc=1.15" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.14--pyhdfd78af_0' : - 'quay.io/biocontainers/multiqc:1.14--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.15--pyhdfd78af_0' : + 'biocontainers/multiqc:1.15--pyhdfd78af_0' }" input: path versions diff --git a/modules/nf-core/custom/dumpsoftwareversions/meta.yml b/modules/nf-core/custom/dumpsoftwareversions/meta.yml index 60b546a..c32657d 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/meta.yml +++ b/modules/nf-core/custom/dumpsoftwareversions/meta.yml @@ -1,7 +1,9 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json name: custom_dumpsoftwareversions description: Custom module used to dump software versions within the nf-core pipeline template keywords: - custom + - dump - version tools: - custom: diff --git a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py old mode 100644 new mode 100755 index da03340..e55b8d4 --- a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py +++ b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py @@ -4,10 +4,11 @@ """Provide functions to merge multiple versions.yml files.""" -import yaml import platform from textwrap import dedent +import yaml + def _make_versions_html(versions): """Generate a tabular HTML output of all versions for MultiQC.""" diff --git a/modules/nf-core/deepvariant/environment.yml b/modules/nf-core/deepvariant/environment.yml new file mode 100644 index 0000000..bcd3a8b --- /dev/null +++ b/modules/nf-core/deepvariant/environment.yml @@ -0,0 +1,4 @@ +channels: + - conda-forge + - bioconda + - defaults diff --git a/modules/nf-core/deepvariant/main.nf b/modules/nf-core/deepvariant/main.nf index 434fcc0..2d5c480 100644 --- a/modules/nf-core/deepvariant/main.nf +++ b/modules/nf-core/deepvariant/main.nf @@ -1,32 +1,33 @@ process DEEPVARIANT { tag "$meta.id" - label 'process_medium' + label 'process_high' - container "google/deepvariant:1.4.0" - - // Exit if running this module with -profile conda / -profile mamba - if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { - exit 1, "DEEPVARIANT module does not support Conda. Please use Docker / Singularity / Podman instead." 
- } + container "nf-core/deepvariant:1.5.0" input: tuple val(meta), path(input), path(index), path(intervals) - path(fasta) - path(fai) - path(gzi) + tuple val(meta2), path(fasta) + tuple val(meta3), path(fai) + tuple val(meta4), path(gzi) output: - tuple val(meta), path("${prefix}.vcf.gz") , emit: vcf - tuple val(meta), path("${prefix}.g.vcf.gz"), emit: gvcf - path "versions.yml" , emit: versions + tuple val(meta), path("${prefix}.vcf.gz") , emit: vcf + tuple val(meta), path("${prefix}.vcf.gz.tbi") , emit: vcf_tbi + tuple val(meta), path("${prefix}.g.vcf.gz") , emit: gvcf + tuple val(meta), path("${prefix}.g.vcf.gz.tbi"), emit: gvcf_tbi + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "DEEPVARIANT module does not support Conda. Please use Docker / Singularity / Podman instead." + } def args = task.ext.args ?: '' prefix = task.ext.prefix ?: "${meta.id}" - def regions = intervals ? "--regions ${intervals}" : "" + def regions = intervals ? "--regions=${intervals}" : "" """ /opt/deepvariant/bin/run_deepvariant \\ @@ -36,6 +37,7 @@ process DEEPVARIANT { --output_gvcf=${prefix}.g.vcf.gz \\ ${args} \\ ${regions} \\ + --intermediate_results_dir=. \\ --num_shards=${task.cpus} cat <<-END_VERSIONS > versions.yml @@ -45,10 +47,16 @@ process DEEPVARIANT { """ stub: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "DEEPVARIANT module does not support Conda. Please use Docker / Singularity / Podman instead." + } prefix = task.ext.prefix ?: "${meta.id}" """ touch ${prefix}.vcf.gz + touch ${prefix}.vcf.gz.tbi touch ${prefix}.g.vcf.gz + touch ${prefix}.g.vcf.gz.tbi cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/deepvariant/meta.yml b/modules/nf-core/deepvariant/meta.yml index 63868b2..a50dc57 100644 --- a/modules/nf-core/deepvariant/meta.yml +++ b/modules/nf-core/deepvariant/meta.yml @@ -3,6 +3,7 @@ description: DeepVariant is an analysis pipeline that uses a deep neural network keywords: - variant calling - machine learning + - neural network tools: - deepvariant: description: DeepVariant is an analysis pipeline that uses a deep neural network to call genetic variants from next-generation DNA sequencing data @@ -11,7 +12,6 @@ tools: tool_dev_url: https://github.com/google/deepvariant doi: "10.1038/nbt.4235" licence: ["BSD-3-clause"] - input: - meta: type: map @@ -30,19 +30,33 @@ input: type: file description: Interval file for targeted regions pattern: "*.bed" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] - fasta: type: file description: The reference fasta file pattern: "*.fasta" + - meta3: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] - fai: type: file description: Index of reference fasta file pattern: "*.fai" + - meta4: + type: map + description: | + Groovy Map containing reference information + e.g. 
[ id:'genome' ] - gzi: type: file description: GZI index of reference fasta file pattern: "*.gzi" - output: - meta: type: map @@ -61,6 +75,9 @@ output: type: file description: File containing software version pattern: "*.{version.txt}" - authors: - "@abhi18av" + - "@ramprasadn" +maintainers: + - "@abhi18av" + - "@ramprasadn" diff --git a/modules/nf-core/gunzip/environment.yml b/modules/nf-core/gunzip/environment.yml new file mode 100644 index 0000000..25910b3 --- /dev/null +++ b/modules/nf-core/gunzip/environment.yml @@ -0,0 +1,7 @@ +name: gunzip +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::sed=4.7 diff --git a/modules/nf-core/gunzip/main.nf b/modules/nf-core/gunzip/main.nf new file mode 100644 index 0000000..468a6f2 --- /dev/null +++ b/modules/nf-core/gunzip/main.nf @@ -0,0 +1,48 @@ +process GUNZIP { + tag "$archive" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + tuple val(meta), path(archive) + + output: + tuple val(meta), path("$gunzip"), emit: gunzip + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + gunzip = archive.toString() - '.gz' + """ + # Not calling gunzip itself because it creates files + # with the original group ownership rather than the + # default one for that user / the work directory + gzip \\ + -cd \\ + $args \\ + $archive \\ + > $gunzip + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gunzip: \$(echo \$(gunzip --version 2>&1) | sed 's/^.*(gzip) //; s/ Copyright.*\$//') + END_VERSIONS + """ + + stub: + gunzip = archive.toString() - '.gz' + """ + touch $gunzip + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gunzip: \$(echo \$(gunzip --version 2>&1) | sed 's/^.*(gzip) //; s/ Copyright.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gunzip/meta.yml b/modules/nf-core/gunzip/meta.yml new file mode 100644 index 0000000..231034f --- /dev/null +++ b/modules/nf-core/gunzip/meta.yml @@ -0,0 +1,39 @@ +name: gunzip +description: Compresses and decompresses files. +keywords: + - gunzip + - compression + - decompression +tools: + - gunzip: + description: | + gzip is a file format and a software application used for file compression and decompression. + documentation: https://www.gnu.org/software/gzip/manual/gzip.html + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Optional groovy Map containing meta information + e.g. 
[ id:'test', single_end:false ] + - archive: + type: file + description: File to be compressed/uncompressed + pattern: "*.*" +output: + - gunzip: + type: file + description: Compressed/uncompressed file + pattern: "*.*" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" + - "@jfy133" +maintainers: + - "@joseespinosa" + - "@drpatelh" + - "@jfy133" diff --git a/modules/nf-core/gunzip/tests/main.nf.test b/modules/nf-core/gunzip/tests/main.nf.test new file mode 100644 index 0000000..d031792 --- /dev/null +++ b/modules/nf-core/gunzip/tests/main.nf.test @@ -0,0 +1,35 @@ +nextflow_process { + + name "Test Process GUNZIP" + script "../main.nf" + process "GUNZIP" + tag "gunzip" + tag "modules_nfcore" + tag "modules" + + test("Should run without failures") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [], + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/gunzip/tests/main.nf.test.snap b/modules/nf-core/gunzip/tests/main.nf.test.snap new file mode 100644 index 0000000..720fd9f --- /dev/null +++ b/modules/nf-core/gunzip/tests/main.nf.test.snap @@ -0,0 +1,31 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + [ + [ + + ], + "test_1.fastq:md5,4161df271f9bfcd25d5845a1e220dbec" + ] + ], + "1": [ + "versions.yml:md5,54376d32aca20e937a4ec26dac228e84" + ], + "gunzip": [ + [ + [ + + ], + "test_1.fastq:md5,4161df271f9bfcd25d5845a1e220dbec" + ] + ], + "versions": [ + "versions.yml:md5,54376d32aca20e937a4ec26dac228e84" + ] + } + ], + "timestamp": "2023-10-17T15:35:37.690477896" + } +} \ No newline at end of file diff --git a/modules/nf-core/gunzip/tests/tags.yml b/modules/nf-core/gunzip/tests/tags.yml new file mode 100644 index 0000000..fd3f691 --- /dev/null +++ b/modules/nf-core/gunzip/tests/tags.yml @@ -0,0 +1,2 @@ +gunzip: + - modules/nf-core/gunzip/** diff --git a/modules/nf-core/minimap2/align/environment.yml b/modules/nf-core/minimap2/align/environment.yml new file mode 100644 index 0000000..60b9a8b --- /dev/null +++ b/modules/nf-core/minimap2/align/environment.yml @@ -0,0 +1,8 @@ +name: minimap2_align +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::minimap2=2.24 + - bioconda::samtools=1.14 diff --git a/modules/nf-core/minimap2/align/main.nf b/modules/nf-core/minimap2/align/main.nf new file mode 100644 index 0000000..fa3ae50 --- /dev/null +++ b/modules/nf-core/minimap2/align/main.nf @@ -0,0 +1,48 @@ +process MINIMAP2_ALIGN { + tag "$meta.id" + label 'process_medium' + + // Note: the versions here need to match the versions used in the mulled container below and minimap2/index + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0' : + 'biocontainers/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0' }" + + input: + tuple val(meta), path(reads) + tuple val(meta2), path(reference) + val bam_format + val cigar_paf_format + val cigar_bam + + output: + tuple val(meta), path("*.paf"), optional: true, emit: paf + tuple val(meta), path("*.bam"), optional: true, emit: bam + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def bam_output = bam_format ? "-a | samtools sort | samtools view -@ ${task.cpus} -b -h -o ${prefix}.bam" : "-o ${prefix}.paf" + def cigar_paf = cigar_paf_format && !bam_format ? "-c" : '' + def set_cigar_bam = cigar_bam && bam_format ? "-L" : '' + """ + minimap2 \\ + $args \\ + -t $task.cpus \\ + "${reference ?: reads}" \\ + "$reads" \\ + $cigar_paf \\ + $set_cigar_bam \\ + $bam_output + + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + minimap2: \$(minimap2 --version 2>&1) + END_VERSIONS + """ +} diff --git a/modules/nf-core/minimap2/align/meta.yml b/modules/nf-core/minimap2/align/meta.yml new file mode 100644 index 0000000..408522d --- /dev/null +++ b/modules/nf-core/minimap2/align/meta.yml @@ -0,0 +1,75 @@ +name: minimap2_align +description: A versatile pairwise aligner for genomic and spliced nucleotide sequences +keywords: + - align + - fasta + - fastq + - genome + - paf + - reference +tools: + - minimap2: + description: | + A versatile pairwise aligner for genomic and spliced nucleotide sequences. + homepage: https://github.com/lh3/minimap2 + documentation: https://github.com/lh3/minimap2#uguide + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FASTA or FASTQ files of size 1 and 2 for single-end + and paired-end data, respectively. + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test_ref'] + - reference: + type: file + description: | + Reference database in FASTA format. + - bam_format: + type: boolean + description: Specify that output should be in BAM format + - cigar_paf_format: + type: boolean + description: Specify that output CIGAR should be in PAF format + - cigar_bam: + type: boolean + description: | + Write CIGAR with >65535 ops at the CG tag. This is recommended when + doing XYZ (https://github.com/lh3/minimap2#working-with-65535-cigar-operations) +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - paf: + type: file + description: Alignment in PAF format + pattern: "*.paf" + - bam: + type: file + description: Alignment in BAM format + pattern: "*.bam" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@heuermh" + - "@sofstam" + - "@sateeshperi" + - "@jfy133" +maintainers: + - "@heuermh" + - "@sofstam" + - "@sateeshperi" + - "@jfy133" diff --git a/modules/nf-core/samtools/collate/environment.yml b/modules/nf-core/samtools/collate/environment.yml new file mode 100644 index 0000000..0fb861b --- /dev/null +++ b/modules/nf-core/samtools/collate/environment.yml @@ -0,0 +1,7 @@ +name: samtools_collate +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.17 diff --git a/modules/nf-core/samtools/collate/main.nf b/modules/nf-core/samtools/collate/main.nf new file mode 100644 index 0000000..38a4daf --- /dev/null +++ b/modules/nf-core/samtools/collate/main.nf @@ -0,0 +1,46 @@ +process SAMTOOLS_COLLATE { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0': + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input) + path fasta + + output: + tuple val(meta), path("*.bam"), emit: bam, optional: true + tuple val(meta), path("*.cram"), emit: cram, optional: true + tuple val(meta), path("*.sam"), emit: sam, optional: true + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reference = fasta ? "--reference ${fasta}" : "" + def extension = args.contains("--output-fmt sam") ? "sam" : + args.contains("--output-fmt bam") ? "bam" : + args.contains("--output-fmt cram") ? "cram" : + "bam" + if ("$input" == "${prefix}.${extension}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + samtools \\ + collate \\ + $args \\ + ${reference} \\ + -@ $task.cpus \\ + -o ${prefix}.${extension} \\ + $input + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/collate/meta.yml b/modules/nf-core/samtools/collate/meta.yml new file mode 100644 index 0000000..3c79927 --- /dev/null +++ b/modules/nf-core/samtools/collate/meta.yml @@ -0,0 +1,43 @@ +name: "samtools_collate" +description: shuffles and groups reads together by their names +keywords: + - collate + - bam +tools: + - "samtools": + description: "Tools for dealing with SAM, BAM and CRAM files" + homepage: "http://www.htslib.org" + documentation: "https://www.htslib.org/doc/samtools-collate.html" + tool_dev_url: "https://github.com/samtools/samtools" + doi: "10.1093/bioinformatics/btp352" + licence: "['MIT']" +input: + # Only when we have meta + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" +output: + #Only when we have meta + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - output: + type: file + description: Collated BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" +authors: + - "@priyanka-surana" +maintainers: + - "@priyanka-surana" diff --git a/modules/nf-core/samtools/faidx/main.nf b/modules/nf-core/samtools/faidx/main.nf index 21be8ba..59ed308 100644 --- a/modules/nf-core/samtools/faidx/main.nf +++ b/modules/nf-core/samtools/faidx/main.nf @@ -5,15 +5,17 @@ process SAMTOOLS_FAIDX { conda "bioconda::samtools=1.17" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : - 'quay.io/biocontainers/samtools:1.17--h00cdaf9_0' }" + 'biocontainers/samtools:1.17--h00cdaf9_0' }" input: tuple val(meta), path(fasta) + tuple val(meta2), path(fai) output: - tuple val(meta), path ("*.fai"), emit: fai - tuple val(meta), path ("*.gzi"), emit: gzi, optional: true - path "versions.yml" , emit: versions + tuple val(meta), path ("*.{fa,fasta}") , emit: fa , optional: true + tuple val(meta), path ("*.fai") , emit: fai, optional: true + tuple val(meta), path ("*.gzi") , emit: gzi, optional: true + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -23,8 +25,8 @@ process SAMTOOLS_FAIDX { """ samtools \\ faidx \\ - $args \\ - $fasta + $fasta \\ + $args cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -33,8 +35,12 @@ process SAMTOOLS_FAIDX { """ stub: + def match = (task.ext.args =~ /-o(?:utput)?\s(.*)\s?/).findAll() + def fastacmd = match[0] ? "touch ${match[0][1]}" : '' """ + ${fastacmd} touch ${fasta}.fai + cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/samtools/faidx/meta.yml b/modules/nf-core/samtools/faidx/meta.yml index fe2fe9a..957b25e 100644 --- a/modules/nf-core/samtools/faidx/meta.yml +++ b/modules/nf-core/samtools/faidx/meta.yml @@ -3,6 +3,7 @@ description: Index FASTA file keywords: - index - fasta + - faidx tools: - samtools: description: | @@ -17,12 +18,21 @@ input: - meta: type: map description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] + Groovy Map containing reference information + e.g. [ id:'test' ] - fasta: type: file description: FASTA file pattern: "*.{fa,fasta}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - fai: + type: file + description: FASTA index file + pattern: "*.{fai}" output: - meta: type: map diff --git a/modules/nf-core/samtools/fasta/environment.yml b/modules/nf-core/samtools/fasta/environment.yml new file mode 100644 index 0000000..8a82f9e --- /dev/null +++ b/modules/nf-core/samtools/fasta/environment.yml @@ -0,0 +1,7 @@ +name: samtools_fasta +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.17 diff --git a/modules/nf-core/samtools/fasta/main.nf b/modules/nf-core/samtools/fasta/main.nf new file mode 100644 index 0000000..dc4ad98 --- /dev/null +++ b/modules/nf-core/samtools/fasta/main.nf @@ -0,0 +1,44 @@ +process SAMTOOLS_FASTA { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input) + val(interleave) + + output: + tuple val(meta), path("*_{1,2}.fasta.gz") , optional:true, emit: fasta + tuple val(meta), path("*_interleaved.fasta.gz"), optional:true, emit: interleaved + tuple val(meta), path("*_singleton.fasta.gz") , optional:true, emit: singleton + tuple val(meta), path("*_other.fasta.gz") , optional:true, emit: other + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def output = ( interleave && ! meta.single_end ) ? "> ${prefix}_interleaved.fasta.gz" : + meta.single_end ? "-1 ${prefix}_1.fasta.gz -s ${prefix}_singleton.fasta.gz" : + "-1 ${prefix}_1.fasta.gz -2 ${prefix}_2.fasta.gz -s ${prefix}_singleton.fasta.gz" + """ + samtools \\ + fasta \\ + $args \\ + --threads ${task.cpus-1} \\ + -0 ${prefix}_other.fasta.gz \\ + $input \\ + $output + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/fasta/meta.yml b/modules/nf-core/samtools/fasta/meta.yml new file mode 100644 index 0000000..eae26f0 --- /dev/null +++ b/modules/nf-core/samtools/fasta/meta.yml @@ -0,0 +1,60 @@ +name: "samtools_fasta" +description: Converts a SAM/BAM/CRAM file to FASTA +keywords: + - bam + - sam + - cram + - fasta +tools: + - "samtools": + description: "Tools for dealing with SAM, BAM and CRAM files" + homepage: "http://www.htslib.org" + documentation: "https://www.htslib.org/doc/samtools-fasta.html" + tool_dev_url: "https://github.com/samtools/samtools" + doi: "10.1093/bioinformatics/btp352" + licence: ["MIT"] +input: + # Only when we have meta + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - interleave: + type: boolean + description: Set true for interleaved fasta files +output: + #Only when we have meta + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - fasta: + type: file + description: Compressed FASTA file(s) with reads with either the READ1 or READ2 flag set in separate files. + pattern: "*_{1,2}.fasta.gz" + - interleaved: + type: file + description: Compressed FASTA file with reads with either the READ1 or READ2 flag set in a combined file. Needs collated input file. 
+ pattern: "*_interleaved.fasta.gz" + - singleton: + type: file + description: Compressed FASTA file with singleton reads + pattern: "*_singleton.fasta.gz" + - other: + type: file + description: Compressed FASTA file with reads with either both READ1 and READ2 flags set or unset + pattern: "*_other.fasta.gz" +authors: + - "@priyanka-surana" +maintainers: + - "@priyanka-surana" diff --git a/modules/nf-core/samtools/fastq/environment.yml b/modules/nf-core/samtools/fastq/environment.yml new file mode 100644 index 0000000..1b7124d --- /dev/null +++ b/modules/nf-core/samtools/fastq/environment.yml @@ -0,0 +1,7 @@ +name: samtools_fastq +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.17 diff --git a/modules/nf-core/samtools/fastq/main.nf b/modules/nf-core/samtools/fastq/main.nf new file mode 100644 index 0000000..ed8d755 --- /dev/null +++ b/modules/nf-core/samtools/fastq/main.nf @@ -0,0 +1,44 @@ +process SAMTOOLS_FASTQ { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input) + val(interleave) + + output: + tuple val(meta), path("*_{1,2}.fastq.gz") , optional:true, emit: fastq + tuple val(meta), path("*_interleaved.fastq.gz"), optional:true, emit: interleaved + tuple val(meta), path("*_singleton.fastq.gz") , optional:true, emit: singleton + tuple val(meta), path("*_other.fastq.gz") , optional:true, emit: other + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def output = ( interleave && ! meta.single_end ) ? "> ${prefix}_interleaved.fastq.gz" : + meta.single_end ? "-1 ${prefix}_1.fastq.gz -s ${prefix}_singleton.fastq.gz" : + "-1 ${prefix}_1.fastq.gz -2 ${prefix}_2.fastq.gz -s ${prefix}_singleton.fastq.gz" + """ + samtools \\ + fastq \\ + $args \\ + --threads ${task.cpus-1} \\ + -0 ${prefix}_other.fastq.gz \\ + $input \\ + $output + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/fastq/meta.yml b/modules/nf-core/samtools/fastq/meta.yml new file mode 100644 index 0000000..c4002a4 --- /dev/null +++ b/modules/nf-core/samtools/fastq/meta.yml @@ -0,0 +1,62 @@ +name: samtools_fastq +description: Converts a SAM/BAM/CRAM file to FASTQ +keywords: + - bam + - sam + - cram + - fastq +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - interleave: + type: boolean + description: Set true for interleaved fastq file +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - fastq: + type: file + description: Compressed FASTQ file(s) with reads with either the READ1 or READ2 flag set in separate files. + pattern: "*_{1,2}.fastq.gz" + - interleaved: + type: file + description: Compressed FASTQ file with reads with either the READ1 or READ2 flag set in a combined file. Needs collated input file. + pattern: "*_interleaved.fastq.gz" + - singleton: + type: file + description: Compressed FASTQ file with singleton reads + pattern: "*_singleton.fastq.gz" + - other: + type: file + description: Compressed FASTQ file with reads with either both READ1 and READ2 flags set or unset + pattern: "*_other.fastq.gz" +authors: + - "@priyanka-surana" + - "@suzannejin" +maintainers: + - "@priyanka-surana" + - "@suzannejin" diff --git a/modules/nf-core/samtools/flagstat/environment.yml b/modules/nf-core/samtools/flagstat/environment.yml new file mode 100644 index 0000000..22bdb5c --- /dev/null +++ b/modules/nf-core/samtools/flagstat/environment.yml @@ -0,0 +1,7 @@ +name: samtools_flagstat +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.17 diff --git a/modules/nf-core/samtools/flagstat/main.nf b/modules/nf-core/samtools/flagstat/main.nf new file mode 100644 index 0000000..9dee35a --- /dev/null +++ b/modules/nf-core/samtools/flagstat/main.nf @@ -0,0 +1,46 @@ +process SAMTOOLS_FLAGSTAT { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(bam), path(bai) + + output: + tuple val(meta), path("*.flagstat"), emit: flagstat + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + samtools \\ + flagstat \\ + --threads ${task.cpus} \\ + $bam \\ + > ${prefix}.flagstat + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.flagstat + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/flagstat/meta.yml b/modules/nf-core/samtools/flagstat/meta.yml new file mode 100644 index 0000000..9799135 --- /dev/null +++ b/modules/nf-core/samtools/flagstat/meta.yml @@ -0,0 +1,51 @@ +name: samtools_flagstat +description: Counts the number of alignments in a BAM/CRAM/SAM file for each FLAG type +keywords: + - stats + - mapping + - counts + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. 
+ These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - bai: + type: file + description: Index for BAM/CRAM/SAM file + pattern: "*.{bai,crai,sai}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - flagstat: + type: file + description: File containing samtools flagstat output + pattern: "*.{flagstat}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" +maintainers: + - "@drpatelh" diff --git a/modules/nf-core/samtools/flagstat/tests/main.nf.test b/modules/nf-core/samtools/flagstat/tests/main.nf.test new file mode 100644 index 0000000..c618de7 --- /dev/null +++ b/modules/nf-core/samtools/flagstat/tests/main.nf.test @@ -0,0 +1,35 @@ +nextflow_process { + + name "Test Process SAMTOOLS_FLAGSTAT" + script "../main.nf" + process "SAMTOOLS_FLAGSTAT" + tag "modules" + tag "modules_nfcore" + tag "samtools/flagstat" + + test("BAM") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam_bai'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.flagstat).match() }, + { assert path(process.out.versions.get(0)).getText().contains("samtools") } + ) + } + } +} diff --git a/modules/nf-core/samtools/flagstat/tests/main.nf.test.snap b/modules/nf-core/samtools/flagstat/tests/main.nf.test.snap new file mode 100644 index 0000000..880019f --- /dev/null +++ b/modules/nf-core/samtools/flagstat/tests/main.nf.test.snap @@ -0,0 +1,16 @@ +{ + "BAM": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.flagstat:md5,4f7ffd1e6a5e85524d443209ac97d783" + ] + ] + ], + "timestamp": "2023-11-14T15:49:22.577133" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/flagstat/tests/tags.yml b/modules/nf-core/samtools/flagstat/tests/tags.yml new file mode 100644 index 0000000..2d2b725 --- /dev/null +++ b/modules/nf-core/samtools/flagstat/tests/tags.yml @@ -0,0 +1,2 @@ +samtools/flagstat: + - modules/nf-core/samtools/flagstat/** diff --git a/modules/nf-core/samtools/idxstats/environment.yml b/modules/nf-core/samtools/idxstats/environment.yml new file mode 100644 index 0000000..89bd272 --- /dev/null +++ b/modules/nf-core/samtools/idxstats/environment.yml @@ -0,0 +1,7 @@ +name: samtools_idxstats +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.17 diff --git a/modules/nf-core/samtools/idxstats/main.nf b/modules/nf-core/samtools/idxstats/main.nf new file mode 100644 index 0000000..b22d084 --- /dev/null +++ b/modules/nf-core/samtools/idxstats/main.nf @@ -0,0 +1,48 @@ +process SAMTOOLS_IDXSTATS { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(bam), path(bai) + + output: + tuple val(meta), path("*.idxstats"), emit: idxstats + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + samtools \\ + idxstats \\ + --threads ${task.cpus-1} \\ + $bam \\ + > ${prefix}.idxstats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + touch ${prefix}.idxstats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/idxstats/meta.yml b/modules/nf-core/samtools/idxstats/meta.yml new file mode 100644 index 0000000..344e92a --- /dev/null +++ b/modules/nf-core/samtools/idxstats/meta.yml @@ -0,0 +1,52 @@ +name: samtools_idxstats +description: Reports alignment summary statistics for a BAM/CRAM/SAM file +keywords: + - stats + - mapping + - counts + - chromosome + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - bai: + type: file + description: Index for BAM/CRAM/SAM file + pattern: "*.{bai,crai,sai}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - idxstats: + type: file + description: File containing samtools idxstats output + pattern: "*.{idxstats}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" +maintainers: + - "@drpatelh" diff --git a/modules/nf-core/samtools/idxstats/tests/main.nf.test b/modules/nf-core/samtools/idxstats/tests/main.nf.test new file mode 100644 index 0000000..0174a9e --- /dev/null +++ b/modules/nf-core/samtools/idxstats/tests/main.nf.test @@ -0,0 +1,35 @@ +nextflow_process { + + name "Test Process SAMTOOLS_IDXSTATS" + script "../main.nf" + process "SAMTOOLS_IDXSTATS" + tag "modules" + tag "modules_nfcore" + tag "samtools/idxstats" + + test("BAM") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam_bai'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.idxstats).match() }, + { assert path(process.out.versions.get(0)).getText().contains("samtools") } + ) + } + } +} diff --git a/modules/nf-core/samtools/idxstats/tests/main.nf.test.snap b/modules/nf-core/samtools/idxstats/tests/main.nf.test.snap new file mode 100644 index 0000000..4c6c12b --- /dev/null +++ b/modules/nf-core/samtools/idxstats/tests/main.nf.test.snap @@ -0,0 +1,16 @@ +{ + "BAM": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.idxstats:md5,df60a8c8d6621100d05178c93fb053a2" + ] + ] + ], + "timestamp": "2023-11-14T15:52:19.875194" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/idxstats/tests/tags.yml b/modules/nf-core/samtools/idxstats/tests/tags.yml new file mode 100644 index 0000000..d3057c6 --- /dev/null +++ b/modules/nf-core/samtools/idxstats/tests/tags.yml @@ -0,0 +1,2 @@ +samtools/idxstats: + - modules/nf-core/samtools/idxstats/** diff --git a/modules/nf-core/samtools/merge/environment.yml b/modules/nf-core/samtools/merge/environment.yml new file mode 100644 index 0000000..04c82f1 --- /dev/null +++ b/modules/nf-core/samtools/merge/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.17 diff --git a/modules/nf-core/samtools/merge/main.nf b/modules/nf-core/samtools/merge/main.nf new file mode 100644 index 0000000..21f785c --- /dev/null +++ b/modules/nf-core/samtools/merge/main.nf @@ -0,0 +1,57 @@ +process SAMTOOLS_MERGE { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input_files, stageAs: "?/*") + tuple val(meta2), path(fasta) + tuple val(meta3), path(fai) + + output: + tuple val(meta), path("${prefix}.bam") , optional:true, emit: bam + tuple val(meta), path("${prefix}.cram"), optional:true, emit: cram + tuple val(meta), path("*.csi") , optional:true, emit: csi + tuple val(meta), path("*.crai") , optional:true, emit: crai + path "versions.yml" , emit: versions + + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def file_type = input_files instanceof List ? input_files[0].getExtension() : input_files.getExtension() + def reference = fasta ? "--reference ${fasta}" : "" + """ + samtools \\ + merge \\ + --threads ${task.cpus-1} \\ + $args \\ + ${reference} \\ + ${prefix}.${file_type} \\ + $input_files + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + prefix = task.ext.suffix ? "${meta.id}${task.ext.suffix}" : "${meta.id}" + def file_type = input_files instanceof List ? input_files[0].getExtension() : input_files.getExtension() + """ + touch ${prefix}.${file_type} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/merge/meta.yml b/modules/nf-core/samtools/merge/meta.yml new file mode 100644 index 0000000..2e8f3db --- /dev/null +++ b/modules/nf-core/samtools/merge/meta.yml @@ -0,0 +1,83 @@ +name: samtools_merge +description: Merge BAM or CRAM file +keywords: + - merge + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input_files: + type: file + description: BAM/CRAM file + pattern: "*.{bam,cram,sam}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fasta: + type: file + description: Reference file the CRAM was created with (optional) + pattern: "*.{fasta,fa}" + - meta3: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fai: + type: file + description: Index of the reference file the CRAM was created with (optional) + pattern: "*.fai" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - bam: + type: file + description: BAM file + pattern: "*.{bam}" + - cram: + type: file + description: CRAM file + pattern: "*.{cram}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - csi: + type: file + description: BAM index file (optional) + pattern: "*.csi" + - crai: + type: file + description: CRAM index file (optional) + pattern: "*.crai" +authors: + - "@drpatelh" + - "@yuukiiwa " + - "@maxulysse" + - "@FriederikeHanssen" + - "@ramprasadn" +maintainers: + - "@drpatelh" + - "@yuukiiwa " + - "@maxulysse" + - "@FriederikeHanssen" + - "@ramprasadn" diff --git a/modules/nf-core/samtools/sort/main.nf b/modules/nf-core/samtools/sort/main.nf new file mode 100644 index 0000000..2b7753f --- /dev/null +++ b/modules/nf-core/samtools/sort/main.nf @@ -0,0 +1,49 @@ +process SAMTOOLS_SORT { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(bam) + + output: + tuple val(meta), path("*.bam"), emit: bam + tuple val(meta), path("*.csi"), emit: csi, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + samtools sort \\ + $args \\ + -@ $task.cpus \\ + -o ${prefix}.bam \\ + -T $prefix \\ + $bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/sort/meta.yml b/modules/nf-core/samtools/sort/meta.yml new file mode 100644 index 0000000..0732843 --- /dev/null +++ b/modules/nf-core/samtools/sort/meta.yml @@ -0,0 +1,48 @@ +name: samtools_sort +description: Sort SAM/BAM/CRAM file +keywords: + - sort + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - bam: + type: file + description: Sorted BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - csi: + type: file + description: BAM index file (optional) + pattern: "*.csi" +authors: + - "@drpatelh" + - "@ewels" diff --git a/modules/nf-core/samtools/stats/environment.yml b/modules/nf-core/samtools/stats/environment.yml new file mode 100644 index 0000000..ed4e896 --- /dev/null +++ b/modules/nf-core/samtools/stats/environment.yml @@ -0,0 +1,7 @@ +name: samtools_stats +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.17 diff --git a/modules/nf-core/samtools/stats/main.nf b/modules/nf-core/samtools/stats/main.nf new file mode 100644 index 0000000..07286ef --- /dev/null +++ b/modules/nf-core/samtools/stats/main.nf @@ -0,0 +1,49 @@ +process SAMTOOLS_STATS { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input), path(input_index) + tuple val(meta2), path(fasta) + + output: + tuple val(meta), path("*.stats"), emit: stats + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reference = fasta ? "--reference ${fasta}" : "" + """ + samtools \\ + stats \\ + --threads ${task.cpus} \\ + ${reference} \\ + ${input} \\ + > ${prefix}.stats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.stats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/stats/meta.yml b/modules/nf-core/samtools/stats/meta.yml new file mode 100644 index 0000000..735ff81 --- /dev/null +++ b/modules/nf-core/samtools/stats/meta.yml @@ -0,0 +1,63 @@ +name: samtools_stats +description: Produces comprehensive statistics from SAM/BAM/CRAM file +keywords: + - statistics + - counts + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM file from alignment + pattern: "*.{bam,cram}" + - input_index: + type: file + description: BAI/CRAI file from alignment + pattern: "*.{bai,crai}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. 
[ id:'genome' ] + - fasta: + type: file + description: Reference file the CRAM was created with (optional) + pattern: "*.{fasta,fa}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - stats: + type: file + description: File containing samtools stats output + pattern: "*.{stats}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@FriederikeHanssen" + - "@ramprasadn" +maintainers: + - "@drpatelh" + - "@FriederikeHanssen" + - "@ramprasadn" diff --git a/modules/nf-core/samtools/stats/tests/main.nf.test b/modules/nf-core/samtools/stats/tests/main.nf.test new file mode 100644 index 0000000..e037132 --- /dev/null +++ b/modules/nf-core/samtools/stats/tests/main.nf.test @@ -0,0 +1,78 @@ +nextflow_process { + + name "Test Process SAMTOOLS_STATS" + script "../main.nf" + process "SAMTOOLS_STATS" + tag "modules" + tag "modules/nf-core" + tag "samtools" + tag "samtools/stats" + + test("SAMTOOLS STATS Should run without failures") { + + when { + params { + + outdir = "$outputDir" + } + process { + """ + // define inputs of the process here. + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam_bai'], checkIfExists: true) + + ] + input[1] = [[],[]] + """ + + } + } + + then { + assertAll( + {assert process.success}, + {assert snapshot(process.out).match()} + ) + } + + } + + test("SAMTOOLS CRAM Should run without failures") { + + when { + params { + + outdir = "$outputDir" + } + process { + """ + // define inputs of the process here + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['homo_sapiens']['illumina']['test_paired_end_recalibrated_sorted_cram'], checkIfExists: true), + file(params.test_data['homo_sapiens']['illumina']['test_paired_end_recalibrated_sorted_cram_crai'], checkIfExists: true) + + ] + input[1] = [ + [ id:'genome' ], + file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + + + } + + then { + assertAll( + {assert process.success}, + {assert snapshot(process.out).match()} + ) + } + + } + + +} diff --git a/modules/nf-core/samtools/stats/tests/main.nf.test.snap b/modules/nf-core/samtools/stats/tests/main.nf.test.snap new file mode 100644 index 0000000..516b2b0 --- /dev/null +++ b/modules/nf-core/samtools/stats/tests/main.nf.test.snap @@ -0,0 +1,64 @@ +{ + "SAMTOOLS STATS Should run without failures": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,6e768486d5df0257351c5419a79f9c9b" + ] + ], + "1": [ + "versions.yml:md5,08035f3409d934d47a416150884bb0df" + ], + "stats": [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,6e768486d5df0257351c5419a79f9c9b" + ] + ], + "versions": [ + "versions.yml:md5,08035f3409d934d47a416150884bb0df" + ] + } + ], + "timestamp": "2023-10-18T12:12:42.998746" + }, + "SAMTOOLS CRAM Should run without failures": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,7c9ee5747793cceb9d6f4d733345641a" + ] + ], + "1": [ + "versions.yml:md5,08035f3409d934d47a416150884bb0df" + ], + "stats": [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,7c9ee5747793cceb9d6f4d733345641a" + ] + ], + "versions": [ + 
"versions.yml:md5,08035f3409d934d47a416150884bb0df" + ] + } + ], + "timestamp": "2023-10-18T12:13:30.747222" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/stats/tests/tags.yml b/modules/nf-core/samtools/stats/tests/tags.yml new file mode 100644 index 0000000..7c28e30 --- /dev/null +++ b/modules/nf-core/samtools/stats/tests/tags.yml @@ -0,0 +1,2 @@ +samtools/stats: + - modules/nf-core/samtools/stats/** diff --git a/modules/nf-core/samtools/view/environment.yml b/modules/nf-core/samtools/view/environment.yml new file mode 100644 index 0000000..141e7bd --- /dev/null +++ b/modules/nf-core/samtools/view/environment.yml @@ -0,0 +1,7 @@ +name: samtools_view +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.17 diff --git a/modules/nf-core/samtools/view/main.nf b/modules/nf-core/samtools/view/main.nf index d7b2a0d..613c6e7 100644 --- a/modules/nf-core/samtools/view/main.nf +++ b/modules/nf-core/samtools/view/main.nf @@ -2,14 +2,14 @@ process SAMTOOLS_VIEW { tag "$meta.id" label 'process_low' - conda "bioconda::samtools=1.17" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : - 'quay.io/biocontainers/samtools:1.17--h00cdaf9_0' }" + 'biocontainers/samtools:1.17--h00cdaf9_0' }" input: tuple val(meta), path(input), path(index) - path fasta + tuple val(meta2), path(fasta) path qname output: @@ -19,6 +19,7 @@ process SAMTOOLS_VIEW { tuple val(meta), path("*.bai"), emit: bai, optional: true tuple val(meta), path("*.csi"), emit: csi, optional: true tuple val(meta), path("*.crai"), emit: crai, optional: true + tuple val(meta), path("*.unoutput"), emit: unoutput, optional: true path "versions.yml", emit: versions when: @@ -29,7 +30,7 @@ process SAMTOOLS_VIEW { def args2 = task.ext.args2 ?: '' def prefix = task.ext.prefix ?: "${meta.id}" def reference = fasta ? "--reference ${fasta}" : "" - def readnames = qname ? "--qname-file ${qname}": "" + def readnames = qname ? "--qname-file ${qname} --unoutput ${prefix}.unoutput": "" def file_type = args.contains("--output-fmt sam") ? "sam" : args.contains("--output-fmt bam") ? "bam" : args.contains("--output-fmt cram") ? "cram" : diff --git a/modules/nf-core/samtools/view/meta.yml b/modules/nf-core/samtools/view/meta.yml index 7691603..3dadafa 100644 --- a/modules/nf-core/samtools/view/meta.yml +++ b/modules/nf-core/samtools/view/meta.yml @@ -26,12 +26,17 @@ input: description: BAM/CRAM/SAM file pattern: "*.{bam,cram,sam}" - index: - type: optional file - description: BAM.BAI/BAM.CSI/CRAM.CRAI file + type: file + description: BAM.BAI/BAM.CSI/CRAM.CRAI file (optional) pattern: "*.{.bai,.csi,.crai}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. 
[ id:'test' ] - fasta: - type: optional file - description: Reference file the CRAM was created with + type: file + description: Reference file the CRAM was created with (optional) pattern: "*.{fasta,fa}" - qname: type: file @@ -77,3 +82,8 @@ authors: - "@joseespinosa" - "@FriederikeHanssen" - "@priyanka-surana" +maintainers: + - "@drpatelh" + - "@joseespinosa" + - "@FriederikeHanssen" + - "@priyanka-surana" diff --git a/modules/nf-core/samtools/view/samtools-view.diff b/modules/nf-core/samtools/view/samtools-view.diff new file mode 100644 index 0000000..1fa860a --- /dev/null +++ b/modules/nf-core/samtools/view/samtools-view.diff @@ -0,0 +1,22 @@ +Changes in module 'nf-core/samtools/view' +--- modules/nf-core/samtools/view/main.nf ++++ modules/nf-core/samtools/view/main.nf +@@ -19,6 +19,7 @@ + tuple val(meta), path("*.bai"), emit: bai, optional: true + tuple val(meta), path("*.csi"), emit: csi, optional: true + tuple val(meta), path("*.crai"), emit: crai, optional: true ++ tuple val(meta), path("*.unoutput"), emit: unoutput, optional: true + path "versions.yml", emit: versions + + when: +@@ -29,7 +30,7 @@ + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reference = fasta ? "--reference ${fasta}" : "" +- def readnames = qname ? "--qname-file ${qname}": "" ++ def readnames = qname ? "--qname-file ${qname} --unoutput ${prefix}.unoutput": "" + def file_type = args.contains("--output-fmt sam") ? "sam" : + args.contains("--output-fmt bam") ? "bam" : + args.contains("--output-fmt cram") ? "cram" : + +************************************************************ diff --git a/modules/nf-core/untar/environment.yml b/modules/nf-core/untar/environment.yml new file mode 100644 index 0000000..d6917da --- /dev/null +++ b/modules/nf-core/untar/environment.yml @@ -0,0 +1,9 @@ +name: untar +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::sed=4.7 + - conda-forge::grep=3.11 + - conda-forge::tar=1.34 diff --git a/modules/nf-core/untar/main.nf b/modules/nf-core/untar/main.nf new file mode 100644 index 0000000..8a75bb9 --- /dev/null +++ b/modules/nf-core/untar/main.nf @@ -0,0 +1,63 @@ +process UNTAR { + tag "$archive" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + tuple val(meta), path(archive) + + output: + tuple val(meta), path("$prefix"), emit: untar + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + prefix = task.ext.prefix ?: ( meta.id ? "${meta.id}" : archive.baseName.toString().replaceFirst(/\.tar$/, "")) + + """ + mkdir $prefix + + ## Ensures --strip-components only applied when top level of tar contents is a directory + ## If just files or multiple directories, place all in prefix + if [[ \$(tar -taf ${archive} | grep -o -P "^.*?\\/" | uniq | wc -l) -eq 1 ]]; then + tar \\ + -C $prefix --strip-components 1 \\ + -xavf \\ + $args \\ + $archive \\ + $args2 + else + tar \\ + -C $prefix \\ + -xavf \\ + $args \\ + $archive \\ + $args2 + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + untar: \$(echo \$(tar --version 2>&1) | sed 's/^.*(GNU tar) //; s/ Copyright.*\$//') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: ( meta.id ? 
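(An illustration of the branch logic in the script block above; the archive layouts are assumed from the tests below.)
    // db.tar.gz holding a single top-level "db/" directory:
    //   `tar -taf` lists one common "db/" prefix, so the first branch extracts with
    //   --strip-components 1 and $prefix directly contains hash.k2d, opts.k2d, taxo.k2d.
    // files.tar.gz holding bare files such as hello.txt:
    //   no single top-level directory, so the second branch extracts everything
    //   under $prefix unchanged.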
"${meta.id}" : archive.toString().replaceFirst(/\.[^\.]+(.gz)?$/, "")) + """ + mkdir $prefix + touch ${prefix}/file.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + untar: \$(echo \$(tar --version 2>&1) | sed 's/^.*(GNU tar) //; s/ Copyright.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/untar/meta.yml b/modules/nf-core/untar/meta.yml new file mode 100644 index 0000000..a9a2110 --- /dev/null +++ b/modules/nf-core/untar/meta.yml @@ -0,0 +1,46 @@ +name: untar +description: Extract files. +keywords: + - untar + - uncompress + - extract +tools: + - untar: + description: | + Extract tar.gz files. + documentation: https://www.gnu.org/software/tar/manual/ + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - archive: + type: file + description: File to be untar + pattern: "*.{tar}.{gz}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - untar: + type: directory + description: Directory containing contents of archive + pattern: "*/" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" + - "@matthdsm" + - "@jfy133" +maintainers: + - "@joseespinosa" + - "@drpatelh" + - "@matthdsm" + - "@jfy133" diff --git a/modules/nf-core/untar/tests/main.nf.test b/modules/nf-core/untar/tests/main.nf.test new file mode 100644 index 0000000..d40db13 --- /dev/null +++ b/modules/nf-core/untar/tests/main.nf.test @@ -0,0 +1,77 @@ +nextflow_process { + + name "Test Process UNTAR" + script "../main.nf" + process "UNTAR" + + tag "modules" + tag "modules_nfcore" + tag "untar" + + test("test_untar") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [], file(params.test_data['sarscov2']['genome']['kraken2_tar_gz'], checkIfExists: true) ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.untar).match("test_untar") }, + ) + } + + } + + test("test_untar_different_output_path") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [], file(params.test_data['homo_sapiens']['illumina']['test_flowcell'], checkIfExists: true) ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.untar).match("test_untar_different_output_path") }, + ) + } + + } + + test("test_untar_onlyfiles") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [], file(params.test_data['generic']['tar']['tar_gz'], checkIfExists: true) ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.untar).match("test_untar_onlyfiles") }, + ) + } + + } + +} diff --git a/modules/nf-core/untar/tests/main.nf.test.snap b/modules/nf-core/untar/tests/main.nf.test.snap new file mode 100644 index 0000000..146c867 --- /dev/null +++ b/modules/nf-core/untar/tests/main.nf.test.snap @@ -0,0 +1,513 @@ +{ + "test_untar_different_output_path": { + "content": [ + [ + [ + [ + + ], + [ + [ + [ + [ + [ + [ + "s_1_1101.bcl:md5,ad01889e2ff43e2f194224e20bdb600c", + "s_1_1101.stats:md5,4bbbf103454b37fbc3138fadf1b4446b" + ], + [ + "s_1_1101.bcl:md5,565384bbe67a694dfd690bae6d1d30c2", + "s_1_1101.stats:md5,55e5abd8f129ff38ef169873547abdb8" + ], + [ + "s_1_1101.bcl:md5,650fa58a630a9148835ba79e323d4237", + 
"s_1_1101.stats:md5,77403669ca1b05340c390dff64425c1e" + ], + [ + "s_1_1101.bcl:md5,54471c9e97299cd141e202e204637702", + "s_1_1101.stats:md5,67b14c9a89b7f8556674a7524d5cfb2d" + ], + [ + "s_1_1101.bcl:md5,74e4f929fc7476c380fd9d741ddb6700", + "s_1_1101.stats:md5,5730a4c35463eaa12a06b6758710b98c" + ], + [ + "s_1_1101.bcl:md5,c785f472f4350c120c02c888c8189590", + "s_1_1101.stats:md5,fee4ec63895ea81007e06ee6a36ba5e0" + ], + [ + "s_1_1101.bcl:md5,b7ea50bb25f08d43c301741d77050a9b", + "s_1_1101.stats:md5,fa7c68f3122c74d14364e6f7b011af70" + ], + [ + "s_1_1101.bcl:md5,9d5087dc4bcae39d66486363d4f68ecf", + "s_1_1101.stats:md5,23cdceee4d82c4b8e7c60018b9276ace" + ], + [ + "s_1_1101.bcl:md5,581e0c5ee94e8f2de14b2b1d8e777530", + "s_1_1101.stats:md5,9a3536d573c97f66bb56b49463612607" + ], + [ + "s_1_1101.bcl:md5,296fc026bb34c67bbe2b44845fe0d1de", + "s_1_1101.stats:md5,a7f57a7770fb9c5ae2a0fb1ef403ec4f" + ], + [ + "s_1_1101.bcl:md5,2a3ca15531556c36d10d132a9e051de8", + "s_1_1101.stats:md5,2d0bcdb0a1b51d3d79e415db2ab2d3b1" + ], + [ + "s_1_1101.bcl:md5,1150d46a2ccd4ac58aee0585d3e4ffd7", + "s_1_1101.stats:md5,2e97550bd5b5864ffd0565bb7a3f6d40" + ], + [ + "s_1_1101.bcl:md5,0b85c4b3da0de95e7b862d849c5333ae", + "s_1_1101.stats:md5,6eab9746fbeb783b0cd70398f44e0c1a" + ], + [ + "s_1_1101.bcl:md5,e0e9c91f4698804d7a6d1058ef68b34f", + "s_1_1101.stats:md5,790022cdc7878a02b2ebd166e1ddf0a7" + ], + [ + "s_1_1101.bcl:md5,38cd0ad4de359e651c8ac0d5777ea625", + "s_1_1101.stats:md5,a1b1d5ea5371d326abb029774483c5e6" + ], + [ + "s_1_1101.bcl:md5,b0ddc05c4012ccba24e712a1cfec748f", + "s_1_1101.stats:md5,af3d232f839d720f76f40ba06caa2987" + ], + [ + "s_1_1101.bcl:md5,af32fcc5dc3b836cf7a5ba3db85a75dd", + "s_1_1101.stats:md5,f93f2c09bd4e486c74a5f6e2040f7296" + ], + [ + "s_1_1101.bcl:md5,54b7428e037ca87816107647d4a3d9db", + "s_1_1101.stats:md5,e5ac77a72cd7bed5e9bf03cccda0e48c" + ], + [ + "s_1_1101.bcl:md5,fc8b4eacd493bf3d0b20bc23998dc7ff", + "s_1_1101.stats:md5,190315e159e2f4bc4c057ded7470dc52" + ], + [ + "s_1_1101.bcl:md5,9484ecffda489927fce424ac6a44fa9d", + "s_1_1101.stats:md5,0825feeb457ecc9efcf6f8526ba32311" + ], + [ + "s_1_1101.bcl:md5,eec59e21036e31c95ce1e847bfb0a9c4", + "s_1_1101.stats:md5,9acc13f63c98e5a8445e7be70d49222b" + ], + [ + "s_1_1101.bcl:md5,a9fb24476f87cba4fba68e2b3c3f2c07", + "s_1_1101.stats:md5,dc0aa7db9790733291c3e6480ca2a0fc" + ], + [ + "s_1_1101.bcl:md5,ed950b3e82c500927c2e236c9df005c6", + "s_1_1101.stats:md5,dccb71ec47d1f9d33a192da6d5660a45" + ], + [ + "s_1_1101.bcl:md5,b3e992025e995ca56b5ea2820144ef47", + "s_1_1101.stats:md5,a6a829bf2cffb26ac5d9dc3012057699" + ], + [ + "s_1_1101.bcl:md5,89edc726a5a4e0b4ff8ca3899ed0232b", + "s_1_1101.stats:md5,5b9b4fd8110577a59b82d0c419519d29" + ], + [ + "s_1_1101.bcl:md5,4dc696149169f232c451225f563cb5cd", + "s_1_1101.stats:md5,d3514a71ea3adc60e2943c6b8f6e2598" + ], + [ + "s_1_1101.bcl:md5,35b992d0318afb7c825ceaa31b0755e6", + "s_1_1101.stats:md5,2826093acc175c16c3795de7c4ca8f07" + ], + [ + "s_1_1101.bcl:md5,7bc927f56a362e49c00b5d76ee048901", + "s_1_1101.stats:md5,e47d862b795fd6b88a31d7d482ab22f6" + ], + [ + "s_1_1101.bcl:md5,84742233ff2a651626fe9036f27f7cb2", + "s_1_1101.stats:md5,b78fad11d3c50bc76b722cdc03e3028b" + ], + [ + "s_1_1101.bcl:md5,3935341c86263a7938e8c49620ef39f8", + "s_1_1101.stats:md5,cc6585b2daac5354073d150874da9704" + ], + [ + "s_1_1101.bcl:md5,3627f4fd548bf6e64aaf08fba3a342be", + "s_1_1101.stats:md5,120ae4831ae004ff7d16728aef36e82f" + ], + [ + "s_1_1101.bcl:md5,07631014bc35124149fabd80ef19f933", + "s_1_1101.stats:md5,eadd63d91f47cc6db6b6f0a967a23927" + ], + [ + 
"s_1_1101.bcl:md5,a1149c80415dc2f34d768eeb397c43fb", + "s_1_1101.stats:md5,ca89a9def67611a9151c6ce685b7cce1" + ], + [ + "s_1_1101.bcl:md5,eb5f71d4741d2f40618756bc72eaf8b4", + "s_1_1101.stats:md5,90f48501e735e5915b843478e23d1ae2" + ], + [ + "s_1_1101.bcl:md5,9bf270fe3f6add1a591ebc24fff10078", + "s_1_1101.stats:md5,a4e429671d4098034293c638aa655e16" + ], + [ + "s_1_1101.bcl:md5,219bedcbd24bae54fe4cf05dae05282c", + "s_1_1101.stats:md5,dd97525b65b68207137d51fcf19132c7" + ], + [ + "s_1_1101.bcl:md5,5163bc00a68fd57ae50cae0b76350892", + "s_1_1101.stats:md5,b606a5368eff1f012f3ea5d11ccdf2e0" + ], + [ + "s_1_1101.bcl:md5,fc429195a5af59a59e0cc4c48e6c05ea", + "s_1_1101.stats:md5,d809aa19698053f90d639da4dcad8008" + ], + [ + "s_1_1101.bcl:md5,383340219a1dd77076a092a64a71a7e4", + "s_1_1101.stats:md5,b204a5cf256378679ffc906c15cc1bae" + ], + [ + "s_1_1101.bcl:md5,0c369540d3e24696cf1f9c55bab69315", + "s_1_1101.stats:md5,a2bc69a4031a22ce9621dcc623a0bf4b" + ], + [ + "s_1_1101.bcl:md5,3127abc8016ba8eb954f8f8015dff387", + "s_1_1101.stats:md5,5deafff31150b7bf757f814e49a53bc2" + ], + [ + "s_1_1101.bcl:md5,045f40c82de676bafec3d59f91376a7a", + "s_1_1101.stats:md5,890700edc20687c090ef52248c7884b1" + ], + [ + "s_1_1101.bcl:md5,78af269aa2b39a1d765703f0a4739a86", + "s_1_1101.stats:md5,303cf457aa1543a8208544f694cbc531" + ], + [ + "s_1_1101.bcl:md5,0ab8c781959b783b62888e9274364a46", + "s_1_1101.stats:md5,2605b0e8322f83aa4d0dae5da4ec7a7a" + ], + [ + "s_1_1101.bcl:md5,d0cf823ffe352e8b3f75d589544ab617", + "s_1_1101.stats:md5,efa3c0e01e3db71e12fd961cb2d03739" + ], + [ + "s_1_1101.bcl:md5,db4ca4ab7a01e03c246f9160c3758d82", + "s_1_1101.stats:md5,f61550d9e4a90df6b860e68f41f82f60" + ], + [ + "s_1_1101.bcl:md5,1af39a2c7e5ff20ece91cb8160b51d17", + "s_1_1101.stats:md5,d0e20879afcaf6dfcd88c73f1c5c78cf" + ], + [ + "s_1_1101.bcl:md5,4cf7123bb0fffcd79266df03aef01665", + "s_1_1101.stats:md5,29bff4075109a121b087116b58d7e927" + ], + [ + "s_1_1101.bcl:md5,aa9980428cb60cd6320f4b48f4dd0d74", + "s_1_1101.stats:md5,6b0e20bde93133117a8d1a6df3d6f37b" + ], + [ + "s_1_1101.bcl:md5,0f6e440374e15b9b491d52fb83a8adfe", + "s_1_1101.stats:md5,55cb5eb0ecdabd23dca39ab8c4607598" + ], + [ + "s_1_1101.bcl:md5,2c645d7bdaddaa403f6e304d36df9e4b", + "s_1_1101.stats:md5,53acf33d21f832779b400c2447386ce4" + ], + [ + "s_1_1101.bcl:md5,3bbf0863b423b770c879203644420206", + "s_1_1101.stats:md5,579bdc7293cac8c3d7407249cacf4c25" + ], + [ + "s_1_1101.bcl:md5,6658a08409e81d29cfeb2d096b491985", + "s_1_1101.stats:md5,bb559ffbea46d612f9933cefa84c4c03" + ], + [ + "s_1_1101.bcl:md5,1700d9a13d3d4f7643af2943ef838acb", + "s_1_1101.stats:md5,f01cb6050ebfb15da1e0399ebd791eb4" + ], + [ + "s_1_1101.bcl:md5,1ac7aa9ffae25eb103f755f33e4a39c6", + "s_1_1101.stats:md5,0b9d45d7929ccf336d5e5b95373ed3c2" + ], + [ + "s_1_1101.bcl:md5,812a97af2e983a53226e18c75190b06c", + "s_1_1101.stats:md5,d2410c7b0e506dab2972e77e2398de1e" + ], + [ + "s_1_1101.bcl:md5,c981e8e4dcc434956c2b86159da268bc", + "s_1_1101.stats:md5,e9c826e85361ce673f1f248786c9a611" + ], + [ + "s_1_1101.bcl:md5,88e09e99a0a4ef3357b203a41b22f77c", + "s_1_1101.stats:md5,ef06f2e5ad667bbd383f9ed6a05b7b42" + ], + [ + "s_1_1101.bcl:md5,461c8b146fc8a7938be38689978ecd09", + "s_1_1101.stats:md5,65115693935da66f9791b27136e22fb0" + ], + [ + "s_1_1101.bcl:md5,c7b827df5ce20e0f21916fe60860ca3f", + "s_1_1101.stats:md5,87be73613aeb507847f94d3cac5bb30a" + ], + [ + "s_1_1101.bcl:md5,7c4cc3dc9c8a1b0f15917b282dfb40ce", + "s_1_1101.stats:md5,bdd9181fa89debbfafe7b6ea3e064065" + ], + [ + "s_1_1101.bcl:md5,19f4debaf91e118aca8934517179ac33", + 
"s_1_1101.stats:md5,1143082719e136241d21b14a6b19b8a2" + ], + [ + "s_1_1101.bcl:md5,38aa256ad2d697d84b0b2c0e876a3eba", + "s_1_1101.stats:md5,64dd82f03df23f7f437eede2671ed4fe" + ], + [ + "s_1_1101.bcl:md5,b7929970378949571fed922c1b8cab32", + "s_1_1101.stats:md5,3d6d7985a41629fe196e4342d7fe36aa" + ], + [ + "s_1_1101.bcl:md5,fb2ed0bf6e89d79624ee78754e773491", + "s_1_1101.stats:md5,f34940810ff255aee79953496a12716d" + ], + [ + "s_1_1101.bcl:md5,4f8a8311f5f9c3a7629c1a973a7b280e", + "s_1_1101.stats:md5,4fd7cd28c09f4e152e7c2ad1ab541cd2" + ], + [ + "s_1_1101.bcl:md5,9eb46c903d0344e25af51f88cc311d60", + "s_1_1101.stats:md5,df3abd5f620d9e7f99496098d9fd3f7f" + ], + [ + "s_1_1101.bcl:md5,3ecbc17f3660e2014b58d7fe70ae62d5", + "s_1_1101.stats:md5,8e89a13c85a6d6ab3ccd251b66d1f165" + ], + [ + "s_1_1101.bcl:md5,5d59cc2499a77791233a64f73fe82894", + "s_1_1101.stats:md5,32ec99cd400f4b80cb26e2fa8e07ece0" + ], + [ + "s_1_1101.bcl:md5,1c052da47b9ae8554388f0fa3aade482", + "s_1_1101.stats:md5,d23f438772673688aa7bc92421dc6dce" + ], + [ + "s_1_1101.bcl:md5,1a52bd4f23130c0c96bc967ccd448a2b", + "s_1_1101.stats:md5,9b597e3388d59ef1f61aba30ac90ea79" + ], + [ + "s_1_1101.bcl:md5,8a1e84b79cf3f80794c20e3a0cc84688", + "s_1_1101.stats:md5,9561f7b6ef4b1849afc72b2bb49792bd" + ], + [ + "s_1_1101.bcl:md5,75c00111051f3fa95d04286823cb9109", + "s_1_1101.stats:md5,1fe786cdf8181767deafbd60b3c76610" + ], + [ + "s_1_1101.bcl:md5,529255d8deee0873ed5565e6d1a2ebda", + "s_1_1101.stats:md5,3fa7f467e97a75880f32d17b7429d316" + ], + [ + "s_1_1101.bcl:md5,ea4d960e3d9355d2149da71b88a21df4", + "s_1_1101.stats:md5,2540fe65586e8e800c1ddd8cddd1e8cd" + ], + [ + "s_1_1101.bcl:md5,0dfe1fd92a2dce2f23119aa483429744", + "s_1_1101.stats:md5,78257b2169fb9f0cf40966e06e847e86" + ], + [ + "s_1_1101.bcl:md5,f692ddc9aa3ab849271d07c666d0b3b9", + "s_1_1101.stats:md5,aa2ec6a3e3a9c116e34fe74a21e6459e" + ], + [ + "s_1_1101.bcl:md5,29cc4c239eae7c871c9a1adf92ebdb98", + "s_1_1101.stats:md5,263184813090acd740a5bf25304aed3a" + ], + [ + "s_1_1101.bcl:md5,e005af6a84925e326afbfe264241f047", + "s_1_1101.stats:md5,b6fb20868eebaffcc19daa694a449795" + ], + [ + "s_1_1101.bcl:md5,02f1a699b1ba9967accccf99a7af3d24", + "s_1_1101.stats:md5,4f007efacecaf26dc0e0231aede28754" + ], + [ + "s_1_1101.bcl:md5,df308c72a2dcc655cd95e98f5457187a", + "s_1_1101.stats:md5,130c4b07f4c14030bab012824cbe34da" + ], + [ + "s_1_1101.bcl:md5,f3ce10d8d2406b72355023bfa8c96822", + "s_1_1101.stats:md5,2638f4db393ed5b699ec2ce59ff0ec19" + ], + [ + "s_1_1101.bcl:md5,cc2f6d675ad1593ff96f734b172d249e", + "s_1_1101.stats:md5,f5b13f1e1ababc9e1a7a73b0b993cbf1" + ], + [ + "s_1_1101.bcl:md5,7938a0b21448305a951b023b1845b3a7", + "s_1_1101.stats:md5,fcd57511adabfc3ba1ac045165330006" + ], + [ + "s_1_1101.bcl:md5,44879bc6a38df1fee8def61868115041", + "s_1_1101.stats:md5,517e20e4b58a8023a37f9af62e0e2036" + ], + [ + "s_1_1101.bcl:md5,8749611e62406a7d2f34c610a55e56af", + "s_1_1101.stats:md5,8ccf24b3676ef84f2e513be8f2a9f3d1" + ], + [ + "s_1_1101.bcl:md5,a9846a037611cda3721958088f714c0e", + "s_1_1101.stats:md5,6438fa5a1892f328cab1605a95d80a3b" + ], + [ + "s_1_1101.bcl:md5,d6c4a2a726496476eb826532f974ed5f", + "s_1_1101.stats:md5,8c2c65b5e8b00dbf61ada65252aeb266" + ], + [ + "s_1_1101.bcl:md5,be3dde6cae7dd85855a6bf295ebfacfe", + "s_1_1101.stats:md5,93bc13f3b0749b2b8d8bcb0b1199f4f0" + ], + [ + "s_1_1101.bcl:md5,7c64514735a6cf1565b60647edd17d20", + "s_1_1101.stats:md5,4a0aa6c49b24f876415e5878cef7f805" + ], + [ + "s_1_1101.bcl:md5,3983b4043bc9df4b505202a5134ccf03", + "s_1_1101.stats:md5,1c9d9a8558adc1279ca27c96bc1b9758" + ], + [ + 
"s_1_1101.bcl:md5,a0b8d77f116ec95975f9253dcb768136", + "s_1_1101.stats:md5,c3992b786756e7ec42f65ef4b13b50d4" + ], + [ + "s_1_1101.bcl:md5,43c95ba35d06bb7c57fbd16f3d1cfd6c", + "s_1_1101.stats:md5,3cb69d04698c39f97f962e5bf1eea7f0" + ], + [ + "s_1_1101.bcl:md5,3dbeea0cad7052f19f53ff6f19dd4d90", + "s_1_1101.stats:md5,58bbc8254f0f5f4a244531e8e9c12a04" + ], + [ + "s_1_1101.bcl:md5,da56d088996376c898d855b6cd0a7dfc", + "s_1_1101.stats:md5,9f2d78af6908ce1576b89cdc059844ff" + ], + [ + "s_1_1101.bcl:md5,7b641a5565f095e9a6ffcad9e4305033", + "s_1_1101.stats:md5,3ada06c59b4fb41b83ab6abd0979e9fc" + ], + [ + "s_1_1101.bcl:md5,a3843d397a01d51657825bb652c191e5", + "s_1_1101.stats:md5,19341e52a4bfc7d9d48e9d2acc68c519" + ], + [ + "s_1_1101.bcl:md5,048e3ebfc8efeb8012def6b741c9060d", + "s_1_1101.stats:md5,88bd38deca1e87d700effab1fd099565" + ], + [ + "s_1_1101.bcl:md5,b340db0e07e829dd5da22371916a1a9e", + "s_1_1101.stats:md5,e44cfaddcc4ffb968e5b1a2f41ac48a5" + ], + [ + "s_1_1101.bcl:md5,e6011ec6eabbc2b8792deb283c621ce0", + "s_1_1101.stats:md5,090875dcd1a431af24bc631333f089c4" + ], + [ + "s_1_1101.bcl:md5,a08f216e3352345031ed100ec4245082", + "s_1_1101.stats:md5,97b949ef4b96219e1369f673cf5f8a6c" + ], + [ + "s_1_1101.bcl:md5,b43337c76fb037dfcf5f8f7bcb3618e5", + "s_1_1101.stats:md5,ddef585805e79951f69d23ab7354f69b" + ], + [ + "s_1_1101.bcl:md5,8c61fd004104397b360855e058bbf1bf", + "s_1_1101.stats:md5,0f8d253816d594dcfea3ccf48c826401" + ], + [ + "s_1_1101.bcl:md5,594d06310d328b188aa0b3edfff22cb2", + "s_1_1101.stats:md5,3160bf271b39aeb7590e4fd2984710ba" + ], + [ + "s_1_1101.bcl:md5,4c9eada67c9d55437211d83e111961d5", + "s_1_1101.stats:md5,2901b46ab16ec4863d30e4c84ec29c97" + ], + [ + "s_1_1101.bcl:md5,e03971ae5282f0accc0c1b7374d9ef1b", + "s_1_1101.stats:md5,60d2a19ce59bf70a21a28555484cead8" + ], + [ + "s_1_1101.bcl:md5,e1c6f7a06e63d149895d3e48e63df155", + "s_1_1101.stats:md5,44beb10af847ea3dddaf06dda7031126" + ], + [ + "s_1_1101.bcl:md5,960a99bf29a8f9d936e9b8582d46c9c6", + "s_1_1101.stats:md5,544cd1a7aaaa841914b40ece43399334" + ], + [ + "s_1_1101.bcl:md5,5706679f349fd4a6b6313bc2c41c7a42", + "s_1_1101.stats:md5,627eea844b26dae033848c2f9f69177b" + ], + [ + "s_1_1101.bcl:md5,21da5abc4b0402bbac14b5ab998b0b4f", + "s_1_1101.stats:md5,515bd140b095ad90473ca7a9a69877ab" + ], + "s_1_1101.control:md5,08a72e2198ae95150718e8adf011d105", + "s_1_1101.filter:md5,3a72bc73b323c8cb0ac5bfeb62d98989" + ] + ], + [ + "s_1_1101.locs:md5,0827ea802e5257cc5b20e757a33d4c98" + ], + "RTAConfiguration.xml:md5,c7d6e257bc374f142dc64b9d2281d4c9", + "config.xml:md5,9a4cc7ec01fefa2f1ce9bcb45bbad6e9" + ] + ], + [ + "ControlMetricsOut.bin:md5,6d77b38d0793a6e1ce1e85706e488953", + "CorrectedIntMetricsOut.bin:md5,2bbf84d3be72734addaa2fe794711434", + "ErrorMetricsOut.bin:md5,38c88def138e9bb832539911affdb286", + "ExtractionMetricsOut.bin:md5,7497c3178837eea8f09350b5cd252e99", + "IndexMetricsOut.bin:md5,d41d8cd98f00b204e9800998ecf8427e", + "QMetricsOut.bin:md5,7e9f198d53ebdfbb699a5f94cf1ed51c", + "TileMetricsOut.bin:md5,83891751ec1c91a425a524b476b6ca3c" + ], + "RunInfo.xml:md5,03038959f4dd181c86bc97ae71fe270a" + ] + ] + ] + ], + "timestamp": "2023-10-18T11:56:39.562418" + }, + "test_untar_onlyfiles": { + "content": [ + [ + [ + [ + + ], + [ + "hello.txt:md5,e59ff97941044f85df5297e1c302d260" + ] + ] + ] + ], + "timestamp": "2023-10-18T11:56:46.878844" + }, + "test_untar": { + "content": [ + [ + [ + [ + + ], + [ + "hash.k2d:md5,8b8598468f54a7087c203ad0190555d9", + "opts.k2d:md5,a033d00cf6759407010b21700938f543", + "taxo.k2d:md5,094d5891cdccf2f1468088855c214b2c" + ] + 
] + ] + ], + "timestamp": "2023-10-18T11:56:08.16574" + } +} \ No newline at end of file diff --git a/modules/nf-core/untar/tests/tags.yml b/modules/nf-core/untar/tests/tags.yml new file mode 100644 index 0000000..feb6f15 --- /dev/null +++ b/modules/nf-core/untar/tests/tags.yml @@ -0,0 +1,2 @@ +untar: + - modules/nf-core/untar/** diff --git a/modules/nf-core/vcftools/environment.yml b/modules/nf-core/vcftools/environment.yml new file mode 100644 index 0000000..503449e --- /dev/null +++ b/modules/nf-core/vcftools/environment.yml @@ -0,0 +1,7 @@ +name: vcftools +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::vcftools=0.1.16 diff --git a/modules/nf-core/vcftools/main.nf b/modules/nf-core/vcftools/main.nf new file mode 100644 index 0000000..0e61955 --- /dev/null +++ b/modules/nf-core/vcftools/main.nf @@ -0,0 +1,127 @@ +process VCFTOOLS { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/vcftools:0.1.16--he513fc3_4' : + 'biocontainers/vcftools:0.1.16--he513fc3_4' }" + + input: + // Owing to the nature of vcftools we here provide solutions to working with optional bed files and optional + // alternative variant files, for use with the 'diff' suite of tools. + // Other optional input files can be utilised in a similar way to below but we do not exhaustively iterate through all + // possible options. Instead we leave that to the user. + tuple val(meta), path(variant_file) + path bed + path diff_variant_file + + output: + tuple val(meta), path("*.vcf") , optional:true, emit: vcf + tuple val(meta), path("*.bcf") , optional:true, emit: bcf + tuple val(meta), path("*.frq") , optional:true, emit: frq + tuple val(meta), path("*.frq.count") , optional:true, emit: frq_count + tuple val(meta), path("*.idepth") , optional:true, emit: idepth + tuple val(meta), path("*.ldepth") , optional:true, emit: ldepth + tuple val(meta), path("*.ldepth.mean") , optional:true, emit: ldepth_mean + tuple val(meta), path("*.gdepth") , optional:true, emit: gdepth + tuple val(meta), path("*.hap.ld") , optional:true, emit: hap_ld + tuple val(meta), path("*.geno.ld") , optional:true, emit: geno_ld + tuple val(meta), path("*.geno.chisq") , optional:true, emit: geno_chisq + tuple val(meta), path("*.list.hap.ld") , optional:true, emit: list_hap_ld + tuple val(meta), path("*.list.geno.ld") , optional:true, emit: list_geno_ld + tuple val(meta), path("*.interchrom.hap.ld") , optional:true, emit: interchrom_hap_ld + tuple val(meta), path("*.interchrom.geno.ld") , optional:true, emit: interchrom_geno_ld + tuple val(meta), path("*.TsTv") , optional:true, emit: tstv + tuple val(meta), path("*.TsTv.summary") , optional:true, emit: tstv_summary + tuple val(meta), path("*.TsTv.count") , optional:true, emit: tstv_count + tuple val(meta), path("*.TsTv.qual") , optional:true, emit: tstv_qual + tuple val(meta), path("*.FILTER.summary") , optional:true, emit: filter_summary + tuple val(meta), path("*.sites.pi") , optional:true, emit: sites_pi + tuple val(meta), path("*.windowed.pi") , optional:true, emit: windowed_pi + tuple val(meta), path("*.weir.fst") , optional:true, emit: weir_fst + tuple val(meta), path("*.het") , optional:true, emit: heterozygosity + tuple val(meta), path("*.hwe") , optional:true, emit: hwe + tuple val(meta), path("*.Tajima.D") , optional:true, emit: tajima_d + tuple val(meta), path("*.ifreqburden") ,
optional:true, emit: freq_burden + tuple val(meta), path("*.LROH") , optional:true, emit: lroh + tuple val(meta), path("*.relatedness") , optional:true, emit: relatedness + tuple val(meta), path("*.relatedness2") , optional:true, emit: relatedness2 + tuple val(meta), path("*.lqual") , optional:true, emit: lqual + tuple val(meta), path("*.imiss") , optional:true, emit: missing_individual + tuple val(meta), path("*.lmiss") , optional:true, emit: missing_site + tuple val(meta), path("*.snpden") , optional:true, emit: snp_density + tuple val(meta), path("*.kept.sites") , optional:true, emit: kept_sites + tuple val(meta), path("*.removed.sites") , optional:true, emit: removed_sites + tuple val(meta), path("*.singletons") , optional:true, emit: singletons + tuple val(meta), path("*.indel.hist") , optional:true, emit: indel_hist + tuple val(meta), path("*.hapcount") , optional:true, emit: hapcount + tuple val(meta), path("*.mendel") , optional:true, emit: mendel + tuple val(meta), path("*.FORMAT") , optional:true, emit: format + tuple val(meta), path("*.INFO") , optional:true, emit: info + tuple val(meta), path("*.012") , optional:true, emit: genotypes_matrix + tuple val(meta), path("*.012.indv") , optional:true, emit: genotypes_matrix_individual + tuple val(meta), path("*.012.pos") , optional:true, emit: genotypes_matrix_position + tuple val(meta), path("*.impute.hap") , optional:true, emit: impute_hap + tuple val(meta), path("*.impute.hap.legend") , optional:true, emit: impute_hap_legend + tuple val(meta), path("*.impute.hap.indv") , optional:true, emit: impute_hap_indv + tuple val(meta), path("*.ldhat.sites") , optional:true, emit: ldhat_sites + tuple val(meta), path("*.ldhat.locs") , optional:true, emit: ldhat_locs + tuple val(meta), path("*.BEAGLE.GL") , optional:true, emit: beagle_gl + tuple val(meta), path("*.BEAGLE.PL") , optional:true, emit: beagle_pl + tuple val(meta), path("*.ped") , optional:true, emit: ped + tuple val(meta), path("*.map") , optional:true, emit: map_ + tuple val(meta), path("*.tped") , optional:true, emit: tped + tuple val(meta), path("*.tfam") , optional:true, emit: tfam + tuple val(meta), path("*.diff.sites_in_files") , optional:true, emit: diff_sites_in_files + tuple val(meta), path("*.diff.indv_in_files") , optional:true, emit: diff_indv_in_files + tuple val(meta), path("*.diff.sites") , optional:true, emit: diff_sites + tuple val(meta), path("*.diff.indv") , optional:true, emit: diff_indv + tuple val(meta), path("*.diff.discordance.matrix"), optional:true, emit: diff_discd_matrix + tuple val(meta), path("*.diff.switch") , optional:true, emit: diff_switch_error + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def args_list = args.tokenize() + + def bed_arg = (args.contains('--bed')) ? "--bed ${bed}" : + (args.contains('--exclude-bed')) ? "--exclude-bed ${bed}" : + (args.contains('--hapcount')) ? "--hapcount ${bed}" : + (args.contains('--positions')) ? "--positions ${bed}" : + (args.contains('--exclude-positions')) ? "--exclude-positions ${bed}" : '' + args_list.removeIf { it.contains('--bed') } + args_list.removeIf { it.contains('--exclude-bed') } + args_list.removeIf { it.contains('--hapcount') } + args_list.removeIf { it.contains('--positions') } + args_list.removeIf { it.contains('--exclude-positions') } + + def diff_variant_arg = (args.contains('--diff')) ? "--diff ${diff_variant_file}" : + (args.contains('--gzdiff')) ?
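(A worked example of the argument routing above, using hypothetical ext.args values from a modules.config: with ext.args = '--bed --site-pi' and a staged file regions.bed, bed_arg resolves to "--bed regions.bed", the bare '--bed' token is removed from args_list, and for an input sample.vcf the assembled command becomes:)
    vcftools --vcf sample.vcf --out sample --site-pi --bed regions.bed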
"--gzdiff ${diff_variant_file}" : + (args.contains('--diff-bcf')) ? "--diff-bcf ${diff_variant_file}" : '' + args_list.removeIf { it.contains('--diff') } + args_list.removeIf { it.contains('--gzdiff') } + args_list.removeIf { it.contains('--diff-bcf') } + + def input_file = ("$variant_file".endsWith(".vcf")) ? "--vcf ${variant_file}" : + ("$variant_file".endsWith(".vcf.gz")) ? "--gzvcf ${variant_file}" : + ("$variant_file".endsWith(".bcf")) ? "--bcf ${variant_file}" : '' + + """ + vcftools \\ + $input_file \\ + --out $prefix \\ + ${args_list.join(' ')} \\ + $bed_arg \\ + $diff_variant_arg + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + vcftools: \$(echo \$(vcftools --version 2>&1) | sed 's/^.*VCFtools (//;s/).*//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/vcftools/meta.yml b/modules/nf-core/vcftools/meta.yml new file mode 100644 index 0000000..09ad590 --- /dev/null +++ b/modules/nf-core/vcftools/meta.yml @@ -0,0 +1,293 @@ +name: vcftools +description: A set of tools written in Perl and C++ for working with VCF files +keywords: + - VCFtools + - VCF + - sort +tools: + - vcftools: + description: A set of tools written in Perl and C++ for working with VCF files. This package only contains the C++ libraries whereas the package perl-vcftools-vcf contains the perl libraries + homepage: http://vcftools.sourceforge.net/ + documentation: http://vcftools.sourceforge.net/man_latest.html + licence: ["LGPL"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - variant_file: + type: file + description: variant input file which can be vcf, vcf.gz, or bcf format. + - bed: + type: file + description: bed file which can be used with different arguments in vcftools (optional) + - diff_variant_file: + type: file + description: secondary variant file which can be used with the 'diff' suite of tools (optional) +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - vcf: + type: file + description: vcf file (optional) + pattern: "*.vcf" + - bcf: + type: file + description: bcf file (optional) + pattern: "*.bcf" + - frq: + type: file + description: Allele frequency for each site (optional) + pattern: "*.frq" + - frq_count: + type: file + description: Allele counts for each site (optional) + pattern: "*.frq.count" + - idepth: + type: file + description: mean depth per individual (optional) + pattern: "*.idepth" + - ldepth: + type: file + description: depth per site summed across individuals (optional) + pattern: "*.ldepth" + - ldepth_mean: + type: file + description: mean depth per site calculated across individuals (optional) + pattern: "*.ldepth.mean" + - gdepth: + type: file + description: depth for each genotype in vcf file (optional) + pattern: "*.gdepth" + - hap_ld: + type: file + description: r2, D, and D’ statistics using phased haplotypes (optional) + pattern: "*.hap.ld" + - geno_ld: + type: file + description: squared correlation coefficient between genotypes encoded as 0, 1 and 2 to represent the number of non-reference alleles in each individual (optional) + pattern: "*.geno.ld" + - geno_chisq: + type: file + description: test for genotype independence via the chi-squared statistic (optional) + pattern: "*.geno.chisq" + - list_hap_ld: + type: file + description: r2 statistics of the sites contained in the provided input file versus all other sites (optional) + pattern: "*.list.hap.ld" + - list_geno_ld: + type: file + description: r2 statistics of the sites contained in the provided input file versus all other sites (optional) + pattern: "*.list.geno.ld" + - interchrom_hap_ld: + type: file + description: r2 statistics for sites (haplotypes) on different chromosomes (optional) + pattern: "*.interchrom.hap.ld" + - interchrom_geno_ld: + type: file + description: r2 statistics for sites (genotypes) on different chromosomes (optional) + pattern: "*.interchrom.geno.ld" + - tstv: + type: file + description: Transition / Transversion ratio in bins of size defined in options (optional) + pattern: "*.TsTv" + - tstv_summary: + type: file + description: Summary of all Transitions and Transversions (optional) + pattern: "*.TsTv.summary" + - tstv_count: + type: file + description: Transition / Transversion ratio as a function of alternative allele count (optional) + pattern: "*.TsTv.count" + - tstv_qual: + type: file + description: Transition / Transversion ratio as a function of SNP quality threshold (optional) + pattern: "*.TsTv.qual" + - filter_summary: + type: file + description: Summary of the number of SNPs and Ts/Tv ratio for each FILTER category (optional) + pattern: "*.FILTER.summary" + - sites_pi: + type: file + description: Nucleotide divergence on a per-site basis (optional) + pattern: "*.sites.pi" + - windowed_pi: + type: file + description: Nucleotide diversity in windows, with window size determined by options (optional) + pattern: "*.windowed.pi" + - weir_fst: + type: file + description: Fst estimate from Weir and Cockerham’s 1984 paper (optional) + pattern: "*.weir.fst" + - heterozygosity: + type: file + description: Heterozygosity on a per-individual basis (optional) + pattern: "*.het" + - hwe: + type: file + description: Contains the Observed numbers of Homozygotes and Heterozygotes and the corresponding Expected numbers under HWE (optional) + pattern: "*.hwe" + - tajima_d: + type: file + description:
Tajima’s D statistic in bins of the size specified in options (optional) + pattern: "*.Tajima.D" + - freq_burden: + type: file + description: Number of variants within each individual at a frequency specified in options (optional) + pattern: "*.ifreqburden" + - lroh: + type: file + description: Long Runs of Homozygosity (optional) + pattern: "*.LROH" + - relatedness: + type: file + description: Relatedness statistic based on the method of Yang et al, Nature Genetics 2010 (doi:10.1038/ng.608) (optional) + pattern: "*.relatedness" + - relatedness2: + type: file + description: Relatedness statistic based on the method of Manichaikul et al., BIOINFORMATICS 2010 (doi:10.1093/bioinformatics/btq559) (optional) + pattern: "*.relatedness2" + - lqual: + type: file + description: per-site SNP quality (optional) + pattern: "*.lqual" + - missing_individual: + type: file + description: Missingness on a per-individual basis (optional) + pattern: "*.imiss" + - missing_site: + type: file + description: Missingness on a per-site basis (optional) + pattern: "*.lmiss" + - snp_density: + type: file + description: Number and density of SNPs in bins of size defined by option (optional) + pattern: "*.snpden" + - kept_sites: + type: file + description: All sites that have been kept after filtering (optional) + pattern: "*.kept.sites" + - removed_sites: + type: file + description: All sites that have been removed after filtering (optional) + pattern: "*.removed.sites" + - singletons: + type: file + description: Location of singletons, and the individual they occur in (optional) + pattern: "*.singletons" + - indel_hist: + type: file + description: Histogram file of the length of all indels (including SNPs) (optional) + pattern: "*.indel.hist" + - hapcount: + type: file + description: Unique haplotypes within user-specified bins (optional) + pattern: "*.hapcount" + - mendel: + type: file + description: Mendel errors identified in trios (optional) + pattern: "*.mendel" + - format: + type: file + description: Extracted information from the genotype fields in the VCF file relating to a specified FORMAT identifier (optional) + pattern: "*.FORMAT" + - info: + type: file + description: Extracted information from the INFO field in the VCF file (optional) + pattern: "*.INFO" + - genotypes_matrix: + type: file + description: | + Genotypes output as large matrix. + Genotypes of each individual on a separate line. + Genotypes are represented as 0, 1 and 2, where the number represents the number of non-reference alleles.
Missing genotypes are represented by -1 (optional) + pattern: "*.012" + - genotypes_matrix_individual: + type: file + description: Details the individuals included in the main genotypes_matrix file (optional) + pattern: "*.012.indv" + - genotypes_matrix_position: + type: file + description: Details the site locations included in the main genotypes_matrix file (optional) + pattern: "*.012.pos" + - impute_hap: + type: file + description: Phased haplotypes in IMPUTE reference-panel format (optional) + pattern: "*.impute.hap" + - impute_hap_legend: + type: file + description: Impute haplotype legend file (optional) + pattern: "*.impute.hap.legend" + - impute_hap_indv: + type: file + description: Impute haplotype individuals file (optional) + pattern: "*.impute.hap.indv" + - ldhat_sites: + type: file + description: Output data in LDhat format, sites (optional) + pattern: "*.ldhat.sites" + - ldhat_locs: + type: file + description: Output data in LDhat format, locations (optional) + pattern: "*.ldhat.locs" + - beagle_gl: + type: file + description: Genotype likelihoods for biallelic sites, derived from the GL FORMAT field (optional) + pattern: "*.BEAGLE.GL" + - beagle_pl: + type: file + description: Genotype likelihoods for biallelic sites, derived from the PL FORMAT field (optional) + pattern: "*.BEAGLE.PL" + - ped: + type: file + description: Output the genotype data in PLINK PED format (optional) + pattern: "*.ped" + - map_: + type: file + description: Output the site locations in PLINK MAP format, produced alongside the PED file (optional) + pattern: "*.map" + - tped: + type: file + description: Output the genotype data in PLINK transposed PED (TPED) format (optional) + pattern: "*.tped" + - tfam: + type: file + description: Output the sample information in PLINK TFAM format, produced alongside the TPED file (optional) + pattern: "*.tfam" + - diff_sites_in_files: + type: file + description: Sites that are common / unique to each file specified in optional inputs (optional) + pattern: "*.diff.sites_in_files" + - diff_indv_in_files: + type: file + description: Individuals that are common / unique to each file specified in optional inputs (optional) + pattern: "*.diff.indv_in_files" + - diff_sites: + type: file + description: Discordance on a site-by-site basis, for the files specified in optional inputs (optional) + pattern: "*.diff.sites" + - diff_indv: + type: file + description: Discordance on an individual-by-individual basis, for the files specified in optional inputs (optional) + pattern: "*.diff.indv" + - diff_discd_matrix: + type: file + description: Discordance matrix between files specified in optional inputs (optional) + pattern: "*.diff.discordance.matrix" + - diff_switch_error: + type: file + description: Switch errors found between sites (optional) + pattern: "*.diff.switch" +authors: + - "@Mark-S-Hill" +maintainers: + - "@Mark-S-Hill" diff --git a/nextflow.config b/nextflow.config index cb204b4..9cf2e35 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,14 +13,16 @@ params { input = null fasta = null fai = null - gzi = null + align = false interval = null + include_positions = null + exclude_positions = null split_fasta_cutoff = 100000 - + vector_db = "${projectDir}/assets/vectorDB.tar.gz" // Boilerplate options outdir = 'results' - tracedir = "${params.outdir}/variantcalling_info" + tracedir = "${params.outdir}/pipeline_info/variantcalling" publish_dir_mode = 'copy' email = null email_on_fail = null @@ -71,8 +73,15 @@ try { profiles { - debug { process.beforeScript = 'echo $HOSTNAME' } + cleanup { cleanup = true } + + debug { + dumpHashes = true + process.beforeScript = 'echo $HOSTNAME' + cleanup = false + } + + conda { conda.enabled = true docker.enabled =
false @@ -80,6 +89,7 @@ profiles { podman.enabled = false shifter.enabled = false charliecloud.enabled = false + apptainer.enabled = false } mamba { conda.enabled = true @@ -89,14 +99,18 @@ profiles { podman.enabled = false shifter.enabled = false charliecloud.enabled = false + apptainer.enabled = false } docker { docker.enabled = true + docker.registry = 'quay.io' docker.userEmulation = true + conda.enabled = false singularity.enabled = false podman.enabled = false shifter.enabled = false charliecloud.enabled = false + apptainer.enabled = false } arm { docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' @@ -104,39 +118,60 @@ profiles { singularity { singularity.enabled = true singularity.autoMounts = true + singularity.registry = 'quay.io' + conda.enabled = false docker.enabled = false podman.enabled = false shifter.enabled = false charliecloud.enabled = false + apptainer.enabled = false } podman { podman.enabled = true + podman.registry = 'quay.io' + conda.enabled = false docker.enabled = false singularity.enabled = false shifter.enabled = false charliecloud.enabled = false + apptainer.enabled = false } shifter { shifter.enabled = true + conda.enabled = false docker.enabled = false singularity.enabled = false podman.enabled = false charliecloud.enabled = false + apptainer.enabled = false } charliecloud { charliecloud.enabled = true + conda.enabled = false docker.enabled = false singularity.enabled = false podman.enabled = false shifter.enabled = false + apptainer.enabled = false + } + apptainer { + apptainer.enabled = true + conda.enabled = false + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false } gitpod { executor.name = 'local' executor.cpus = 16 executor.memory = 60.GB } - test { includeConfig 'conf/test.config' } - test_full { includeConfig 'conf/test_full.config' } + test { includeConfig 'conf/test.config' } + test_align { includeConfig 'conf/test_align.config' } + test_full { includeConfig 'conf/test_full.config' } + test_full_align { includeConfig 'conf/test_full_align.config' } } @@ -175,12 +210,12 @@ dag { manifest { name = 'sanger-tol/variantcalling' - author = '@muffato, @gq1, @priyanka-surana' + author = """@muffato, @gq1, @priyanka-surana""" homePage = 'https://github.com/sanger-tol/variantcalling' - description = 'Variant calling pipeline for PacBio data using DeepVariant' + description = """Variant calling pipeline for PacBio data using DeepVariant""" mainScript = 'main.nf' nextflowVersion = '!>=22.10.1' - version = '1.0.0' + version = '1.1.0' doi = 'https://doi.org/10.5281/zenodo.7890527' } diff --git a/nextflow_schema.json b/nextflow_schema.json index c3e2d8b..c857174 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1,8 +1,8 @@ { "$schema": "http://json-schema.org/draft-07/schema", - "$id": "https://raw.githubusercontent.com/sanger-tol/variantcalling/main/nextflow_schema.json", + "$id": "https://raw.githubusercontent.com/sanger-tol/variantcalling/master/nextflow_schema.json", "title": "sanger-tol/variantcalling pipeline parameters", - "description": "variant calling", + "description": "Variant calling pipeline for PacBio data using DeepVariant", "type": "object", "definitions": { "input_output_options": { @@ -28,6 +28,12 @@ "description": "The output directory where the results will be saved. 
You have to use absolute paths to storage on Cloud infrastructure.", "fa_icon": "fas fa-folder-open" }, + "vector_db": { + "type": "string", + "default": "${projectDir}/assets/vectorDB.tar.gz", + "description": "Path to directory or tar.gz archive for pre-built PacBio vector database.", + "format": "file-path" + }, "email": { "type": "string", "description": "Email address for completion summary.", @@ -45,15 +51,15 @@ "properties": { "fasta": { "type": "string", - "description": "Path to FASTA genome file, either fasta or fast.gz" + "description": "Path to FASTA genome file, either fasta or fasta.gz." }, "fai": { "type": "string", - "description": "Path to the index file of the FASTA genome file." + "description": "Path to the index file of the FASTA genome file, either fai or gzi." }, - "gzi": { - "type": "string", - "description": "Path to the gzi index file of the FASTA genome file. Required if fasta in gz format." + "align": { + "type": "boolean", + "description": "Align the input reads to the reference genome." }, "interval": { "type": "string", @@ -64,9 +70,17 @@ "default": 100000, "hidden": true, "description": "The minimum fasta file size when splitting the input fasta file by sequence." + }, + "include_positions": { + "type": "string", + "description": "Path to a file with a set of sites to include, given as a list of positions. Each line of the input file should contain a (tab-separated) chromosome and position." + }, + "exclude_positions": { + "type": "string", + "description": "Path to a file with a set of sites to exclude, given as a list of positions. Each line of the input file should contain a (tab-separated) chromosome and position." + } }, - "required": ["fasta", "fai"] + "required": ["fasta"] }, "institutional_config_options": { "title": "Institutional config options", @@ -209,7 +223,7 @@ "tracedir": { "type": "string", "description": "Directory to keep pipeline Nextflow logs and reports.", - "default": "${params.outdir}/pipeline_info", + "default": "${params.outdir}/pipeline_info/variantcalling", "fa_icon": "fas fa-cogs", "hidden": true }, diff --git a/pipeline_template.yml b/pipeline_template.yml new file mode 100644 index 0000000..0aa7398 --- /dev/null +++ b/pipeline_template.yml @@ -0,0 +1,3 @@ +prefix: sanger-tol +skip: + - igenomes diff --git a/subworkflows/local/align_pacbio.nf b/subworkflows/local/align_pacbio.nf new file mode 100644 index 0000000..75f4ac2 --- /dev/null +++ b/subworkflows/local/align_pacbio.nf @@ -0,0 +1,61 @@ +// +// Align PacBio read files against the genome +// + +include { FILTER_PACBIO } from '../../subworkflows/local/filter_pacbio' +include { MINIMAP2_ALIGN } from '../../modules/nf-core/minimap2/align/main' +include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' +include { SAMTOOLS_SORT } from '../../modules/nf-core/samtools/sort/main' +include { CONVERT_STATS } from '../../subworkflows/local/convert_stats' + + +workflow ALIGN_PACBIO { + take: + fasta // channel: [ val(meta), /path/to/fasta ] + reads // channel: [ val(meta), /path/to/datafile ] + db // channel: /path/to/vector_db + + + main: + ch_versions = Channel.empty() + + + // Filter BAM and output as FASTQ + FILTER_PACBIO ( reads, db ) + ch_versions = ch_versions.mix ( FILTER_PACBIO.out.versions ) + + + // Align Fastq to Genome + MINIMAP2_ALIGN ( FILTER_PACBIO.out.fastq, fasta, true, false, false ) + ch_versions = ch_versions.mix ( MINIMAP2_ALIGN.out.versions.first() ) + + + // Collect all alignment output by sample name + MINIMAP2_ALIGN.out.bam + | map { meta,
diff --git a/subworkflows/local/align_pacbio.nf b/subworkflows/local/align_pacbio.nf
new file mode 100644
index 0000000..75f4ac2
--- /dev/null
+++ b/subworkflows/local/align_pacbio.nf
@@ -0,0 +1,61 @@
+//
+// Align PacBio read files against the genome
+//
+
+include { FILTER_PACBIO  } from '../../subworkflows/local/filter_pacbio'
+include { MINIMAP2_ALIGN } from '../../modules/nf-core/minimap2/align/main'
+include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main'
+include { SAMTOOLS_SORT  } from '../../modules/nf-core/samtools/sort/main'
+include { CONVERT_STATS  } from '../../subworkflows/local/convert_stats'
+
+
+workflow ALIGN_PACBIO {
+    take:
+    fasta    // channel: [ val(meta), /path/to/fasta ]
+    reads    // channel: [ val(meta), /path/to/datafile ]
+    db       // channel: /path/to/vector_db
+
+
+    main:
+    ch_versions = Channel.empty()
+
+
+    // Filter BAM and output as FASTQ
+    FILTER_PACBIO ( reads, db )
+    ch_versions = ch_versions.mix ( FILTER_PACBIO.out.versions )
+
+
+    // Align FASTQ to genome
+    MINIMAP2_ALIGN ( FILTER_PACBIO.out.fastq, fasta, true, false, false )
+    ch_versions = ch_versions.mix ( MINIMAP2_ALIGN.out.versions.first() )
+
+
+    // Collect all alignment output by sample name
+    MINIMAP2_ALIGN.out.bam
+    | map { meta, bam -> [ ['id': meta.sample, 'datatype': meta.datatype, 'sample': meta.sample ], bam ] }
+    | groupTuple ( by: [0] )
+    | set { ch_bams }
+
+
+    // Merge
+    SAMTOOLS_MERGE ( ch_bams, [ [], [] ], [ [], [] ] )
+    ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() )
+
+
+    // Convert merged BAM to CRAM and calculate indices and statistics
+    SAMTOOLS_MERGE.out.bam
+    | map { meta, bam -> [ meta, bam, [] ] }
+    | set { ch_sort }
+
+    CONVERT_STATS ( ch_sort, fasta )
+    ch_versions = ch_versions.mix ( CONVERT_STATS.out.versions )
+
+
+    emit:
+    cram     = CONVERT_STATS.out.cram        // channel: [ val(meta), /path/to/cram ]
+    crai     = CONVERT_STATS.out.crai        // channel: [ val(meta), /path/to/crai ]
+    stats    = CONVERT_STATS.out.stats       // channel: [ val(meta), /path/to/stats ]
+    idxstats = CONVERT_STATS.out.idxstats    // channel: [ val(meta), /path/to/idxstats ]
+    flagstat = CONVERT_STATS.out.flagstat    // channel: [ val(meta), /path/to/flagstat ]
+    versions = ch_versions                   // channel: [ versions.yml ]
+}
diff --git a/subworkflows/local/convert_stats.nf b/subworkflows/local/convert_stats.nf
new file mode 100644
index 0000000..9118e8d
--- /dev/null
+++ b/subworkflows/local/convert_stats.nf
@@ -0,0 +1,53 @@
+//
+// Convert BAM to CRAM, create index and calculate statistics
+//
+
+include { SAMTOOLS_VIEW     } from '../../modules/nf-core/samtools/view/main'
+include { SAMTOOLS_STATS    } from '../../modules/nf-core/samtools/stats/main'
+include { SAMTOOLS_FLAGSTAT } from '../../modules/nf-core/samtools/flagstat/main'
+include { SAMTOOLS_IDXSTATS } from '../../modules/nf-core/samtools/idxstats/main'
+
+
+workflow CONVERT_STATS {
+    take:
+    bam      // channel: [ val(meta), /path/to/bam, /path/to/bai ]
+    fasta    // channel: [ val(meta), /path/to/fasta ]
+
+
+    main:
+    ch_versions = Channel.empty()
+
+    // Convert BAM to CRAM
+    SAMTOOLS_VIEW ( bam, fasta, [] )
+    ch_versions = ch_versions.mix ( SAMTOOLS_VIEW.out.versions.first() )
+
+
+    // Combine CRAM and CRAI into one channel
+    SAMTOOLS_VIEW.out.cram
+    | join ( SAMTOOLS_VIEW.out.crai )
+    | set { ch_cram_crai }
+
+
+    // Calculate statistics
+    SAMTOOLS_STATS ( ch_cram_crai, fasta )
+    ch_versions = ch_versions.mix ( SAMTOOLS_STATS.out.versions.first() )
+
+
+    // Calculate statistics based on flag values
+    SAMTOOLS_FLAGSTAT ( ch_cram_crai )
+    ch_versions = ch_versions.mix ( SAMTOOLS_FLAGSTAT.out.versions.first() )
+
+
+    // Calculate index statistics
+    SAMTOOLS_IDXSTATS ( ch_cram_crai )
+    ch_versions = ch_versions.mix ( SAMTOOLS_IDXSTATS.out.versions.first() )
+
+
+    emit:
+    cram     = SAMTOOLS_VIEW.out.cram            // channel: [ val(meta), /path/to/cram ]
+    crai     = SAMTOOLS_VIEW.out.crai            // channel: [ val(meta), /path/to/crai ]
+    stats    = SAMTOOLS_STATS.out.stats          // channel: [ val(meta), /path/to/stats ]
+    flagstat = SAMTOOLS_FLAGSTAT.out.flagstat    // channel: [ val(meta), /path/to/flagstat ]
+    idxstats = SAMTOOLS_IDXSTATS.out.idxstats    // channel: [ val(meta), /path/to/idxstats ]
+    versions = ch_versions                       // channel: [ versions.yml ]
+}
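The map + groupTuple step in ALIGN_PACBIO is what collapses per-file alignments into a single merge job per sample. A standalone sketch of that regrouping, with hypothetical meta values and file names:

    workflow {
        Channel.of(
            [ [id: 'sampleA_1', sample: 'sampleA', datatype: 'pacbio'], file('one.bam') ],
            [ [id: 'sampleA_2', sample: 'sampleA', datatype: 'pacbio'], file('two.bam') ] )
        // rebuild the meta around the sample name so both files share one key
        | map { meta, bam -> [ ['id': meta.sample, 'datatype': meta.datatype, 'sample': meta.sample], bam ] }
        | groupTuple ( by: [0] )
        | view ()
        // -> [ [id:sampleA, datatype:pacbio, sample:sampleA], [one.bam, two.bam] ]
    }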
diff --git a/subworkflows/local/deepvariant_caller.nf b/subworkflows/local/deepvariant_caller.nf
index 08adb0f..7e4adf0 100644
--- a/subworkflows/local/deepvariant_caller.nf
+++ b/subworkflows/local/deepvariant_caller.nf
@@ -14,7 +14,11 @@ workflow DEEPVARIANT_CALLER {
     ch_versions = Channel.empty()
 
     reads_fasta.map { meta, cram, crai, interval, fasta_file_name, fasta, fai ->
-        [ [ id: meta.id + "_" + fasta_file_name, sample: meta.id, type: meta.type ],
+        [ [ id: meta.id + "_" + fasta_file_name,
+            sample: meta.id,
+            type: meta.datatype,
+            fasta_file_name: fasta_file_name
+          ],
           cram, crai, interval
@@ -22,13 +26,21 @@ workflow DEEPVARIANT_CALLER {
         .set { cram_crai }
 
     // fasta
-    fasta = reads_fasta.map { meta, cram, crai, interval, fasta_file_name, fasta, fai -> [ fasta ] }
+    fasta = reads_fasta.map { meta, cram, crai, interval, fasta_file_name, fasta, fai ->
+        [ [ id: meta.id + "_" + fasta_file_name, sample: meta.id, type: meta.datatype ],
+          fasta
+        ]
+    }
 
     // fai
-    fai = reads_fasta.map{ meta, cram, crai, interval, fasta_file_name, fasta, fai -> [ fai ] }
+    fai = reads_fasta.map{ meta, cram, crai, interval, fasta_file_name, fasta, fai ->
+        [ [ id: meta.id + "_" + fasta_file_name, sample: meta.id, type: meta.datatype ],
+          fai
+        ]
+    }
 
     // split fasta in compressed format, no gzi index file needed
-    gzi = []
+    gzi = [ [], [] ]
 
     // call deepvariant
     DEEPVARIANT ( cram_crai, fasta, fai, gzi )
@@ -36,9 +48,15 @@ workflow DEEPVARIANT_CALLER {
 
     // group the vcf files together by sample
     DEEPVARIANT.out.vcf
-        .map { meta, vcf -> [ meta.sample, vcf ] }
+        .map { meta, vcf -> [
+            [ id: meta.fasta_file_name.tokenize(".")[0..-2].join(".")
+                + "." + meta.type
+                + "." + meta.sample
+            ],
+            vcf
+        ] }
         .groupTuple()
-        .map { sample, vcf -> [ [id: sample], vcf, [] ] }
+        .map { meta, vcf -> [ meta, vcf, [] ] }
        .set { vcf }
 
     // concat vcf files
@@ -47,9 +65,15 @@
 
     // group the gvcf files together by sample
     DEEPVARIANT.out.gvcf
-        .map { meta, gvcf -> [ meta.sample, gvcf ] }
+        .map { meta, gvcf -> [
+            [ id: meta.fasta_file_name.tokenize(".")[0..-2].join(".")
+                + "." + meta.type
+                + "." + meta.sample
+            ],
+            gvcf
+        ] }
         .groupTuple()
-        .map { sample, gvcf -> [ [ id: sample ], gvcf, [] ] }
+        .map { meta, gvcf -> [ meta, gvcf, [] ] }
         .set { g_vcf }
 
     // concat g vcf files
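The reworked grouping key is assembled from the split-fasta file name (minus its extension), the data type, and the sample, so outputs that share all three are concatenated together. A quick trace with hypothetical values:

    // hypothetical values, tracing the id construction above
    def fasta_file_name = 'genome.part001.fa'
    def type   = 'pacbio'
    def sample = 'sampleA'
    // tokenize('.') -> ['genome', 'part001', 'fa']; [0..-2] drops the extension
    assert fasta_file_name.tokenize('.')[0..-2].join('.') + '.' + type + '.' + sample == 'genome.part001.pacbio.sampleA'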
diff --git a/subworkflows/local/filter_pacbio.nf b/subworkflows/local/filter_pacbio.nf
new file mode 100644
index 0000000..2e306bf
--- /dev/null
+++ b/subworkflows/local/filter_pacbio.nf
@@ -0,0 +1,78 @@
+//
+// Filter PacBio reads
+// The filtering protocol is Shane's modified version of the original program, HiFiAdapterFilt
+//
+
+include { SAMTOOLS_VIEW as SAMTOOLS_CONVERT } from '../../modules/nf-core/samtools/view/main'
+include { SAMTOOLS_COLLATE                  } from '../../modules/nf-core/samtools/collate/main'
+include { SAMTOOLS_FASTA                    } from '../../modules/nf-core/samtools/fasta/main'
+include { GUNZIP                            } from '../../modules/nf-core/gunzip/main'
+include { BLAST_BLASTN                      } from '../../modules/nf-core/blast/blastn/main'
+include { PACBIO_FILTER                     } from '../../modules/local/pacbio_filter'
+include { SAMTOOLS_VIEW as SAMTOOLS_FILTER  } from '../../modules/nf-core/samtools/view/main'
+include { SAMTOOLS_FASTQ                    } from '../../modules/nf-core/samtools/fastq/main'
+
+
+workflow FILTER_PACBIO {
+    take:
+    reads    // channel: [ val(meta), /path/to/datafile ]
+    db       // channel: /path/to/vector_db
+
+
+    main:
+    ch_versions = Channel.empty()
+
+
+    // Convert from PacBio BAM to Samtools BAM
+    reads
+    | map { meta, bam -> [ meta, bam, [] ] }
+    | set { ch_pacbio }
+
+    SAMTOOLS_CONVERT ( ch_pacbio, [ [], [] ], [] )
+    ch_versions = ch_versions.mix ( SAMTOOLS_CONVERT.out.versions.first() )
+
+
+    // Collate BAM file to create interleaved FASTA
+    SAMTOOLS_COLLATE ( SAMTOOLS_CONVERT.out.bam, [] )
+    ch_versions = ch_versions.mix ( SAMTOOLS_COLLATE.out.versions.first() )
+
+
+    // Convert BAM to FASTA
+    SAMTOOLS_FASTA ( SAMTOOLS_COLLATE.out.bam, true )
+    ch_versions = ch_versions.mix ( SAMTOOLS_FASTA.out.versions.first() )
+
+
+    // Gunzip FASTA file for BLAST
+    GUNZIP ( SAMTOOLS_FASTA.out.other )
+    ch_versions = ch_versions.mix ( GUNZIP.out.versions.first() )
+
+
+    // Nucleotide BLAST
+    db.map { db -> [ [], db ] }.set { ch_db }
+    BLAST_BLASTN ( GUNZIP.out.gunzip, ch_db )
+    ch_versions = ch_versions.mix ( BLAST_BLASTN.out.versions.first() )
+
+
+    // Filter BLAST output
+    PACBIO_FILTER ( BLAST_BLASTN.out.txt )
+    ch_versions = ch_versions.mix ( PACBIO_FILTER.out.versions.first() )
+
+
+    // Create filtered BAM file
+    SAMTOOLS_CONVERT.out.bam
+    | join ( SAMTOOLS_CONVERT.out.csi )
+    | set { ch_reads }
+
+    SAMTOOLS_FILTER ( ch_reads, [ [], [] ], PACBIO_FILTER.out.list )
+    ch_versions = ch_versions.mix ( SAMTOOLS_FILTER.out.versions.first() )
+
+
+    // Convert BAM to FASTQ
+    SAMTOOLS_FASTQ ( SAMTOOLS_FILTER.out.unoutput, true )
+    ch_versions = ch_versions.mix ( SAMTOOLS_FASTQ.out.versions.first() )
+
+
+    emit:
+    fastq    = SAMTOOLS_FASTQ.out.other    // channel: [ meta, /path/to/fastq ]
+    versions = ch_versions                 // channel: [ versions.yml ]
+}
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index d2f72e9..b71f3fd 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -12,10 +12,33 @@ workflow INPUT_CHECK {
     SAMPLESHEET_CHECK ( samplesheet )
         .csv
         .splitCsv ( header:true, sep:',' )
-        .map { [ [id: it.sample, type: it.datatype], file(it.datafile), file(it.indexfile) ] }
+        .map { create_data_channel( it ) }
        .set { reads }
 
     emit:
-    reads                                     // channel: [ val(meta), data, index ]
+    reads                                     // channel: [ val(meta), data ]
     versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ]
 }
+
+// Function to get list of [ meta, reads ]
+def create_data_channel ( LinkedHashMap row ) {
+    // create meta map
+    def meta = [:]
+    meta.id       = row.sample
+    meta.sample   = row.sample.split('_')[0..-2].join('_')
+    meta.datatype = row.datatype
+
+    if ( meta.datatype == "pacbio" ) {
+        platform = "PACBIO"
+    }
+    meta.read_group = "\'@RG\\tID:" + row.datafile.split('/')[-1].split('\\.')[0..-2].join('.') + "\\tPL:" + platform + "\\tSM:" + meta.sample + "\'"
+
+    // add path(s) of the read file(s) to the meta map
+    def data_meta = []
+    if ( !file(row.datafile).exists() ) {
+        exit 1, "ERROR: Please check input samplesheet -> Data file does not exist!\n${row.datafile}"
+    } else {
+        data_meta = [ meta, file(row.datafile) ]
+    }
+    return data_meta
+}
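Given the conventions in create_data_channel, a concrete samplesheet row (hypothetical names and paths) makes the derived meta easier to see; the sample column carries a trailing _<n> suffix that is stripped to recover the sample name:

    sample,datatype,datafile
    sampleA_1,pacbio,/data/m64016_201215_185759.ccs.bam

    derived meta (illustrative):
      meta.id         = 'sampleA_1'
      meta.sample     = 'sampleA'
      meta.datatype   = 'pacbio'
      meta.read_group = "'@RG\tID:m64016_201215_185759.ccs\tPL:PACBIO\tSM:sampleA'"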
diff --git a/subworkflows/local/input_filter_split.nf b/subworkflows/local/input_filter_split.nf
index f2f26c5..dc0710f 100644
--- a/subworkflows/local/input_filter_split.nf
+++ b/subworkflows/local/input_filter_split.nf
@@ -9,8 +9,6 @@ include { CAT_CAT } from '../../modules/nf-core/cat/cat/main'
 workflow INPUT_FILTER_SPLIT {
     take:
     fasta              // file: /path/to/genome.fasta or /path/to/genome.fasta.gz
-    fai                // file: /path/to/genome.*.fai
-    gzi                // file: /path/to/genome.fasta.gz.gzi or null
     reads              // [ val(meta), data, index ]
     interval           // file: /path/to/intervals.bed
     split_fasta_cutoff // val(min_file_size)
@@ -19,8 +17,7 @@ workflow INPUT_FILTER_SPLIT {
     ch_versions = Channel.empty()
 
     // split the fasta file into files with one sequence each, group them by file size
-    Channel
-        .fromPath ( fasta )
+    fasta
         .splitFasta ( file:true )
         .branch {
             small: it.size() < split_fasta_cutoff
@@ -49,7 +46,7 @@ workflow INPUT_FILTER_SPLIT {
         .set { split_fasta }
 
     // index split fasta files
-    SAMTOOLS_FAIDX ( split_fasta )
+    SAMTOOLS_FAIDX ( split_fasta, [ [], [] ] )
     ch_versions = ch_versions.mix( SAMTOOLS_FAIDX.out.versions.first() )
 
     // join fasta with corresponding fai file
@@ -62,13 +59,15 @@ workflow INPUT_FILTER_SPLIT {
         .set { fasta_fai }
 
     // filter reads
-    SAMTOOLS_VIEW ( reads, fasta, [] )
+    ch_fasta = fasta.map { fasta -> [ [ 'id': fasta.baseName ], fasta ] }.first()
+
+    SAMTOOLS_VIEW ( reads, ch_fasta, [] )
     ch_versions = ch_versions.mix ( SAMTOOLS_VIEW.out.versions.first() )
 
     // combine reads with split references
     SAMTOOLS_VIEW.out.cram
         .join ( SAMTOOLS_VIEW.out.crai )
-        .map { filtered_reads -> filtered_reads + [interval ?: []] }
+        .combine ( interval.ifEmpty( [[]] ) )
         .combine ( fasta_fai )
         .set { cram_crai_fasta_fai }
diff --git a/subworkflows/local/input_merge.nf b/subworkflows/local/input_merge.nf
new file mode 100644
index 0000000..12f2653
--- /dev/null
+++ b/subworkflows/local/input_merge.nf
@@ -0,0 +1,67 @@
+//
+// Merge reads (BAM or CRAM files) by sample name
+//
+
+include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge'
+include { SAMTOOLS_SORT  } from '../../modules/nf-core/samtools/sort'
+
+workflow INPUT_MERGE {
+    take:
+    fasta    // channel: [ val(meta), /path/to/genome.fasta or /path/to/genome.fasta.gz ]
+    fai      // channel: [ val(meta), /path/to/genome.*.fai or /path/to/genome.fasta.gz.gzi ]
+    reads    // channel: [ val(meta), data ]
+
+    main:
+    ch_versions = Channel.empty()
+
+    // group input meta data together by sample name
+    reads
+        .map { meta, bam_cram -> [ meta.sample, meta ] }
+        .groupTuple()
+        .set { grouped_reads_meta }
+
+    // sort input reads
+    SAMTOOLS_SORT ( reads )
+    ch_versions = ch_versions.mix ( SAMTOOLS_SORT.out.versions )
+    sorted_reads = SAMTOOLS_SORT.out.bam
+
+    // group input read files by sample name
+    sorted_reads
+        .map { meta, bam_cram -> [ meta.sample, bam_cram ] }
+        .groupTuple()
+        .set { grouped_reads }
+
+    // join grouped reads and meta
+    // use the first meta data for the combined reads
+    grouped_reads_meta
+        .map { sample, meta_list -> [ sample, meta_list[0] ] }
+        .join ( grouped_reads )
+        .map { sample, meta, bam_cram_list -> [
+            [ id: sample,
+              datatype: meta.datatype
+            ],
+            bam_cram_list
+        ]}
+        .set { grouped_reads_with_meta }
+
+    // call samtools merge
+    SAMTOOLS_MERGE ( grouped_reads_with_meta,
+        fasta,
+        fai
+    )
+    ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions )
+
+    // concat merged bam or cram together along with their index files
+    SAMTOOLS_MERGE.out.bam
+        .join ( SAMTOOLS_MERGE.out.csi )
+        .concat (
+            SAMTOOLS_MERGE.out.cram
+                .join ( SAMTOOLS_MERGE.out.crai )
+        )
+        .set { indexed_merged_reads }
+
+    emit:
+    indexed_merged_reads = indexed_merged_reads
+    versions             = ch_versions // channel: [ versions.yml ]
+
+}
diff --git a/subworkflows/local/process_vcf.nf b/subworkflows/local/process_vcf.nf
new file mode 100644
index 0000000..432bbf9
--- /dev/null
+++ b/subworkflows/local/process_vcf.nf
@@ -0,0 +1,33 @@
+//
+// Call vcftools to process VCF files
+//
+
+include { VCFTOOLS as VCFTOOLS_SITE_PI } from '../../modules/nf-core/vcftools/main'
+include { VCFTOOLS as VCFTOOLS_HET     } from '../../modules/nf-core/vcftools/main'
+
+workflow PROCESS_VCF {
+    take:
+    vcf               // [ val(meta), vcf ]
+    site_pi_positions // path to positions file to include or exclude
+
+    main:
+    ch_versions = Channel.empty()
+
+    // call vcftools for per-site nucleotide diversity
+    VCFTOOLS_SITE_PI (
+        vcf, site_pi_positions, []
+    )
+    ch_versions = ch_versions.mix ( VCFTOOLS_SITE_PI.out.versions )
+
+    // call vcftools to calculate heterozygosity
+    VCFTOOLS_HET (
+        vcf, [], []
+    )
+    ch_versions = ch_versions.mix ( VCFTOOLS_HET.out.versions )
+
+    emit:
+    versions       = ch_versions                        // channel: [ versions.yml ]
+    site_pi        = VCFTOOLS_SITE_PI.out.sites_pi      // [ meta, site_pi ]
+    heterozygosity = VCFTOOLS_HET.out.heterozygosity    // [ meta, heterozygosity ]
+
+}
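PROCESS_VCF passes the positions file through to the vcftools module, so it follows the format given in the schema: one tab-separated chromosome and position per line. A hypothetical example (sequence names are illustrative):

    SUPER_1	11230
    SUPER_1	44867
    SUPER_2	903412

Only one of include_positions or exclude_positions may supply this file; the workflow enforces that below.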
--git a/tower.yml b/tower.yml
new file mode 100644
index 0000000..48611ab
--- /dev/null
+++ b/tower.yml
@@ -0,0 +1,3 @@
+reports:
+  samplesheet.csv:
+    display: "Auto-created samplesheet with collated metadata and aligned file paths."
diff --git a/workflows/variantcalling.nf b/workflows/variantcalling.nf
index 6c6ce09..f3e7450 100644
--- a/workflows/variantcalling.nf
+++ b/workflows/variantcalling.nf
@@ -10,32 +10,39 @@ def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params)
 WorkflowVariantcalling.initialise(params, log)
 
 // Check input path parameters to see if they exist
-def checkPathParamList = [ params.input, params.fasta, params.fai, params.gzi, params.interval ]
+def checkPathParamList = [ params.input, params.fasta, params.fai, params.interval, params.include_positions, params.exclude_positions ]
 for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } }
 
 // Check mandatory parameters
-if (params.input) { input_file = file(params.input) } else { exit 1, 'Input samplesheet not specified!' }
-if (params.fasta) { fasta_file = file(params.fasta) } else { exit 1, 'Reference fasta not specified!' }
-if (params.fai) { fai_file = file(params.fai) } else { exit 1, 'Reference fasta index not specified!' }
-
-// Check gzi being given if compressed fasta is provided
-if (params.gzi) {
-    gzi_file = file(params.gzi)
-} else if ( params.fasta.endsWith('fasta.gz') ) {
-    exit 1, 'Reference fasta index gzi file not specified for fasta.gz file!'
-} else {
-    gzi_file = null
-}
+if (params.input) { ch_input = Channel.fromPath(params.input) } else { exit 1, 'Input samplesheet not specified!' }
+if (params.fasta) { ch_fasta = Channel.fromPath(params.fasta) } else { exit 1, 'Reference fasta not specified!' }
 
 // Check optional parameters
-if (params.interval) { interval_file = file(params.interval) } else { interval_file = null }
+if (params.fai) {
+    if ( ( params.fasta.endsWith('.gz') && params.fai.endsWith('.fai') )
+         ||
+         ( !params.fasta.endsWith('.gz') && params.fai.endsWith('.gzi') )
+       ) {
+        exit 1, 'Reference fasta and its index file formats do not match!'
+    }
+    ch_fai = Channel.fromPath(params.fai)
+} else {
+    ch_fai = Channel.empty()
+}
+
+if (params.interval) { ch_interval = Channel.fromPath(params.interval) } else { ch_interval = Channel.empty() }
+
 if (params.split_fasta_cutoff ) { split_fasta_cutoff = params.split_fasta_cutoff } else { split_fasta_cutoff = 100000 }
 
-/*
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    CONFIG FILES
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
+if ( (params.include_positions) && (params.exclude_positions) ) {
+    exit 1, 'Only one positions file can be given, either to include or to exclude!'
+} else if (params.include_positions) {
+    ch_positions = Channel.fromPath(params.include_positions)
+} else if (params.exclude_positions) {
+    ch_positions = Channel.fromPath(params.exclude_positions)
+} else {
+    ch_positions = []
+}
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
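The replacement index check pairs each fasta flavour with exactly one index type: an uncompressed fasta takes a .fai, a compressed one takes a .gzi, and a mismatch exits early. Illustrative flag pairings (paths hypothetical):

    --fasta genome.fa    --fai genome.fa.fai       (plain fasta + fai)
    --fasta genome.fa.gz --fai genome.fa.gz.gzi    (compressed fasta + gzi)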
@@ -47,8 +54,11 @@ if (params.split_fasta_cutoff ) { split_fasta_cutoff = params.split_fasta_cutoff
 // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules
 //
 include { INPUT_CHECK        } from '../subworkflows/local/input_check'
+include { ALIGN_PACBIO       } from '../subworkflows/local/align_pacbio'
+include { INPUT_MERGE        } from '../subworkflows/local/input_merge'
 include { INPUT_FILTER_SPLIT } from '../subworkflows/local/input_filter_split'
 include { DEEPVARIANT_CALLER } from '../subworkflows/local/deepvariant_caller'
+include { PROCESS_VCF        } from '../subworkflows/local/process_vcf'
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -60,6 +70,8 @@ include { DEEPVARIANT_CALLER } from '../subworkflows/local/deepvariant_caller'
 // MODULE: Installed directly from nf-core/modules
 //
 include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main'
+include { SAMTOOLS_FAIDX              } from '../modules/nf-core/samtools/faidx/main'
+include { UNTAR                       } from '../modules/nf-core/untar/main'
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -72,35 +84,121 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoft
 workflow VARIANTCALLING {
 
     ch_versions = Channel.empty()
 
+    ch_fasta
+        .map { fasta -> [ [ 'id': fasta.baseName - '.fasta' - '.fa' ], fasta ] }
+        .first()
+        .set { ch_genome }
+
+    //
+    // check whether the reference fasta index is given
+    //
+    if ( params.fai == null ) {
+
+        SAMTOOLS_FAIDX ( ch_genome, [ [], [] ] )
+        ch_versions = ch_versions.mix( SAMTOOLS_FAIDX.out.versions )
+
+        if ( params.fasta.endsWith('.gz') ) {
+            ch_genome_index = SAMTOOLS_FAIDX.out.gzi
+        } else {
+            ch_genome_index = SAMTOOLS_FAIDX.out.fai
+        }
+
+    } else {
+        ch_fai
+            .map { fai -> [ [ 'id': fai.baseName ], fai ] }
+            .first()
+            .set { ch_genome_index }
+    }
 
     //
     // SUBWORKFLOW: Read in samplesheet, validate and stage input files
     //
     INPUT_CHECK (
-        input_file
+        ch_input
     )
-    ch_versions = ch_versions.mix(INPUT_CHECK.out.versions)
+    ch_versions = ch_versions.mix( INPUT_CHECK.out.versions )
+
+
+    //
+    // SUBWORKFLOW: align reads if required
+    //
+    if ( params.align ) {
+
+        if ( params.vector_db.endsWith( '.tar.gz' ) ) {
+
+            UNTAR ( [ [:], params.vector_db ] ).untar
+            | map { meta, file -> file }
+            | set { ch_vector_db }
+            ch_versions = ch_versions.mix ( UNTAR.out.versions )
+
+        } else {
+
+            Channel.fromPath ( params.vector_db )
+            | set { ch_vector_db }
+
+        }
+
+        ALIGN_PACBIO (
+            ch_genome,
+            INPUT_CHECK.out.reads,
+            ch_vector_db
+        )
+        ch_versions = ch_versions.mix( ALIGN_PACBIO.out.versions )
+
+        ALIGN_PACBIO.out.cram
+            .join( ALIGN_PACBIO.out.crai )
+            .set { ch_aligned_reads }
+
+    } else {
+
+        //
+        // SUBWORKFLOW: merge the input reads by sample name
+        //
+        INPUT_MERGE (
+            ch_genome,
+            ch_genome_index,
+            INPUT_CHECK.out.reads,
+        )
+        ch_versions = ch_versions.mix( INPUT_MERGE.out.versions )
+        ch_aligned_reads = INPUT_MERGE.out.indexed_merged_reads
+
+    }
 
     //
     // SUBWORKFLOW: split the input fasta file and filter input reads
     //
     INPUT_FILTER_SPLIT (
-        fasta_file,
-        fai_file,
-        gzi_file,
-        INPUT_CHECK.out.reads,
-        interval_file,
+        ch_fasta,
+        ch_aligned_reads,
+        ch_interval,
         split_fasta_cutoff
     )
-    ch_versions = ch_versions.mix(INPUT_FILTER_SPLIT.out.versions)
+    ch_versions = ch_versions.mix( INPUT_FILTER_SPLIT.out.versions )
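The genome id derived at the top of the workflow combines Nextflow's baseName with Groovy's String minus; a quick trace with hypothetical file names:

    // baseName drops the last extension, then the literal suffixes are subtracted
    assert file('/refs/genome.fasta.gz').baseName - '.fasta' - '.fa' == 'genome'
    assert file('/refs/genome.fa').baseName - '.fasta' - '.fa' == 'genome'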
+
+
     //
     // SUBWORKFLOW: call deepvariant
     //
     DEEPVARIANT_CALLER (
         INPUT_FILTER_SPLIT.out.reads_fasta
     )
-    ch_versions = ch_versions.mix(DEEPVARIANT_CALLER.out.versions)
+    ch_versions = ch_versions.mix( DEEPVARIANT_CALLER.out.versions )
+
+
+    //
+    // convert VCF channel meta id
+    //
+    DEEPVARIANT_CALLER.out.vcf
+        .map { meta, vcf -> [ [ id: vcf.baseName ], vcf ] }
+        .set { vcf }
+
+    //
+    // process VCF output files
+    //
+    PROCESS_VCF ( vcf, ch_positions )
+    ch_versions = ch_versions.mix( PROCESS_VCF.out.versions )
 
     //
     // MODULE: Combine different versions together