Skip to content

Commit

Permalink
Merge pull request #66 from sanger-tol/dev
Browse files Browse the repository at this point in the history
Release 1.1.0 to main branch
  • Loading branch information
gq1 authored Dec 20, 2023
2 parents 0a65618 + 43aea20 commit 15a1475
Show file tree
Hide file tree
Showing 125 changed files with 4,066 additions and 317 deletions.
2 changes: 1 addition & 1 deletion .editorconfig
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ trim_trailing_whitespace = true
indent_size = 4
indent_style = space

[*.{md,yml,yaml,html,css,scss,js,cff}]
[*.{md,yml,yaml,html,css,scss,js}]
indent_size = 2

# These files are edited and tested upstream in nf-core/modules
Expand Down
3 changes: 2 additions & 1 deletion .github/ISSUE_TEMPLATE/bug_report.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ body:
* Executor _(eg. slurm, local, awsbatch)_
* Container engine: _(e.g. Docker, Singularity, Conda, Podman, Shifter or Charliecloud)_
* Container engine: _(e.g. Docker, Singularity, Conda, Podman, Shifter, Charliecloud,
or Apptainer)_
* OS _(eg. CentOS Linux, macOS, Linux Mint)_
Expand Down
4 changes: 4 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,7 @@ jobs:
# Remember that you can parallelise this by using strategy.matrix
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results
- name: Run pipeline with unaligned test data
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test_align,docker --outdir ./results --align
24 changes: 24 additions & 0 deletions .github/workflows/clean-up.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
name: "Close user-tagged issues and PRs"
on:
schedule:
- cron: "0 0 * * 0" # Once a week

jobs:
clean-up:
runs-on: ubuntu-latest
permissions:
issues: write
pull-requests: write
steps:
- uses: actions/stale@v7
with:
stale-issue-message: "This issue has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor. Remove stale label or add a comment otherwise this issue will be closed in 20 days."
stale-pr-message: "This PR has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor. Remove stale label or add a comment if it is still useful."
close-issue-message: "This issue was closed because it has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor and then staled for 20 days with no activity."
days-before-stale: 30
days-before-close: 20
days-before-pr-close: -1
any-of-labels: "awaiting-changes,awaiting-feedback"
exempt-issue-labels: "WIP"
exempt-pr-labels: "WIP"
repo-token: "${{ secrets.GITHUB_TOKEN }}"
12 changes: 6 additions & 6 deletions .github/workflows/fix-linting.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,21 @@ jobs:
# Only run if comment is on a PR with the main repo, and if it contains the magic keywords
if: >
contains(github.event.comment.html_url, '/pull/') &&
contains(github.event.comment.body, '@nf-core-bot fix linting') &&
contains(github.event.comment.body, '@sanger-tolsoft fix linting') &&
github.repository == 'sanger-tol/variantcalling'
runs-on: ubuntu-latest
steps:
# Use the @nf-core-bot token to check out so we can push later
# Use the @sanger-tolsoft token to check out so we can push later
- uses: actions/checkout@v3
with:
token: ${{ secrets.nf_core_bot_auth_token }}
token: ${{ secrets.sangertolsoft_access_token }}

# Action runs on the issue comment, so we don't get the PR by default
# Use the gh cli to check out the PR
- name: Checkout Pull Request
run: gh pr checkout ${{ github.event.issue.number }}
env:
GITHUB_TOKEN: ${{ secrets.nf_core_bot_auth_token }}
GITHUB_TOKEN: ${{ secrets.sangertolsoft_access_token }}

- uses: actions/setup-node@v3

Expand All @@ -46,8 +46,8 @@ jobs:
- name: Commit & push changes
if: steps.prettier_status.outputs.result == 'fail'
run: |
git config user.email "[email protected]"
git config user.name "nf-core-bot"
git config user.email "[email protected]"
git config user.name "sanger-tolsoft"
git config push.default upstream
git add .
git status
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/linting.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ jobs:
run: npm install -g editorconfig-checker

- name: Run ECLint check
run: editorconfig-checker -exclude README.md $(find .* -type f | grep -v '.git\|.py\|.md\|json\|yml\|yaml\|html\|css\|work\|.nextflow\|build\|nf_core.egg-info\|log.txt\|Makefile')
run: editorconfig-checker -exclude README.md $(find .* -type f | grep -v '.git\|.py\|.cff\|.md\|json\|yml\|yaml\|html\|css\|work\|.nextflow\|build\|nf_core.egg-info\|log.txt\|Makefile')

Prettier:
runs-on: ubuntu-latest
Expand Down Expand Up @@ -78,13 +78,13 @@ jobs:

- uses: actions/setup-python@v4
with:
python-version: "3.7"
python-version: "3.8"
architecture: "x64"

- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install nf-core
pip install nf-core==2.8.0
- name: Run nf-core lint
env:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,23 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Launch workflow via tower
uses: nf-core/tower-action@v2
uses: seqeralabs/action-tower-launch@v2
with:
workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
access_token: ${{ secrets.TOWER_ACCESS_TOKEN }}
compute_env: ${{ secrets.TOWER_COMPUTE_ENV }}
pipeline: ${{ github.repository }}
revision: ${{ github.sha }}
workdir: ${{ secrets.TOWER_WORKDIR_PARENT }}/work/${{ github.repository }}/work-${{ github.sha }}
parameters: |
{
"outdir": "${{ secrets.TOWER_WORKDIR_PARENT }}/results/${{ github.repository }}/results-${{ github.sha }}",
"align": true
}
profiles: test,sanger,singularity,cleanup
profiles: test_align,sanger,singularity,cleanup

- uses: actions/upload-artifact@v3
with:
name: Tower debug log file
path: |
tower_action_*.log
tower_action_*.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,23 @@ jobs:
if: github.event_name == 'workflow_dispatch'

- name: Launch workflow via tower
uses: nf-core/tower-action@v2
uses: seqeralabs/action-tower-launch@v2
with:
workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
access_token: ${{ secrets.TOWER_ACCESS_TOKEN }}
compute_env: ${{ secrets.TOWER_COMPUTE_ENV }}
pipeline: ${{ github.repository }}
revision: ${{ env.REVISION }}
workdir: ${{ secrets.TOWER_WORKDIR_PARENT }}/work/${{ github.repository }}/work-${{ env.REVISION }}
parameters: |
{
"outdir": "${{ secrets.TOWER_WORKDIR_PARENT }}/results/${{ github.repository }}/results-${{ env.REVISION }}",
"align": true
}
profiles: test_full,sanger,singularity,cleanup
profiles: test_full_align,sanger,singularity,cleanup

- uses: actions/upload-artifact@v3
with:
name: Tower debug log file
path: |
tower_action_*.log
tower_action_*.json
1 change: 1 addition & 0 deletions .nf-core.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ lint:
- lib/NfcoreTemplate.groovy
- .github/PULL_REQUEST_TEMPLATE.md
- .github/workflows/branch.yml
- .github/workflows/linting.yml
- LICENSE
- assets/email_template.html
- .github/ISSUE_TEMPLATE/bug_report.yml
Expand Down
5 changes: 5 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
repos:
- repo: https://github.com/pre-commit/mirrors-prettier
rev: "v2.7.1"
hooks:
- id: prettier
48 changes: 47 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,53 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [[1.0.0](https://github.com/sanger-tol/variantcalling/releases/tag/1.0.0)] - [2023-05-03]
## [[1.1.0](https://github.com/sanger-tol/variantcalling/releases/tag/1.1.0)] - Shang Tang - [2023-12-20]

### Enhancements & fixes

- Updated the CI procedure to use "sanger-tol" rather than "nf-core" names.
- Renamed Sanger related GitHub CI test workflows.
- nf-core template was updated from 2.7 to 2.8.
- Removed BAM/CRAM index files from the sample sheets.
- Made the fasta index file an optional input.
- Imported PacBio readmapping sub-workflows from [sanger-tol/readmapping pipeline](https://github.com/sanger-tol/readmapping/). Therefore, the pipeline can run on unaligned BAM/CRAM samples now.
- Use VCFtools to calculate per site nucleotide diversity.
- Use VCFtools to calculate heterozygosity.

### Parameters

This release introduces the following parameter changes:

| Old parameter | New parameter |
| ------------- | ------------------- |
| --gzi | |
| | --vector_db |
| | --align |
| | --include_positions |
| | --exclude_positions |

> **NB:** Parameter has been **updated** if both old and new parameter information is present. </br> **NB:** Parameter has been **added** if just the new parameter information is present. </br> **NB:** Parameter has been **removed** if new parameter information isn't present.
### Software dependencies

Note, since the pipeline is using Nextflow DSL2, each process will be run with its own [Biocontainer](https://biocontainers.pro/#/registry). This means that on occasion it is entirely possible for the pipeline to be using different versions of the same tool. However, the overall software dependency changes compared to the last release have been listed below for reference. Only `Docker` or `Singularity` containers are supported, `conda` is not supported.

| Dependency | Old version | New version |
| ----------- | ----------- | ----------- |
| DeepVariant | 1.4.0 | 1.5.0 |
| samtools | 1.16.1 | 1.17 |
| bcftools | 1.16.1 | 1.17 |
| python | 3.11.0 | 3.11.4 |
| vcftools | | 0.1.16 |
| blast | | 2.14.1+ |
| gunzip | | 1.10 |
| minimap2 | | 2.24-r1122 |
| awk | | 5.1.0 |
| untar | | 1.30 |

> **NB:** Dependency has been **updated** if both old and new version information is present. </br> **NB:** Dependency has been **added** if just the new version information is present. </br> **NB:** Dependency has been **removed** if version information isn't present.
## [[1.0.0](https://github.com/sanger-tol/variantcalling/releases/tag/1.0.0)] - Xia Yu - [2023-05-03]

Initial release of sanger-tol/variantcalling, created with the [nf-core](https://nf-co.re/) tools.

Expand Down
30 changes: 30 additions & 0 deletions CITATION.cff
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# This CITATION.cff file was generated with cffinit.
# Visit https://bit.ly/cffinit to generate yours today!

cff-version: 1.2.0
title: sanger-tol/variantcalling v1.1.0
message: >-
If you use this software, please cite it using the
metadata from this file.
type: software
authors:
- given-names: Guoying
family-names: Qi
affiliation: Wellcome Sanger Institute
orcid: "https://orcid.org/0000-0003-1262-8973"
- given-names: Priyanka
family-names: Surana
affiliation: Wellcome Sanger Institute
orcid: "https://orcid.org/0000-0002-7167-0875"
- given-names: Matthieu
family-names: Muffato
affiliation: Wellcome Sanger Institute
orcid: "https://orcid.org/0000-0002-7860-3560"
identifiers:
- type: doi
value: 10.5281/zenodo.7890528
repository-code: "https://github.com/sanger-tol/variantcalling"
license: MIT
commit: TODO
version: 1.1.0
date-released: "2023-12-20"
16 changes: 16 additions & 0 deletions CITATIONS.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,22 @@

> Danecek, Petr, et al. “Twelve Years of SAMtools and BCFtools.” GigaScience, vol. 10, no. 2, Jan. 2021, https://doi.org/10.1093/gigascience/giab008.
- [Blast](https://pubmed.ncbi.nlm.nih.gov/20003500/)

> Camacho C, Coulouris G, Avagyan V, Ma N, Papadopoulos J, Bealer K, Madden TL. BLAST+: architecture and applications. BMC Bioinformatics. 2009 Dec 15;10:421. doi: 10.1186/1471-2105-10-421. PMID: 20003500; PMCID: PMC2803857.
- [BWA-MEM2](https://ieeexplore.ieee.org/document/8820962)

> Vasimuddin Md, Misra S, Li H, Aluru S. Efficient Architecture-Aware Acceleration of BWA-MEM for Multicore Systems. 2019 IEEE International Parallel and Distributed Processing Symposium. 2019 May;314–24. doi: 10.1109/IPDPS.2019.00041.
- [Minimap2](https://pubmed.ncbi.nlm.nih.gov/34623391/)

> Li H. New strategies to improve minimap2 alignment accuracy. Bioinformatics. 2021 Oct 8;37(23):4572–4. doi: 10.1093/bioinformatics/btab705. Epub ahead of print. PMID: 34623391; PMCID: PMC8652018.
- [VCFTools](https://pubmed.ncbi.nlm.nih.gov/21653522/)

> Danecek P, Auton A, Abecasis G, et al.: The variant call format and VCFtools. Bioinformatics. 2011 Aug 1;27(15):2156-8. doi: 10.1093/bioinformatics/btr330. Epub 2011 Jun 7. PubMed PMID: 21653522; PubMed Central PMCID: PMC3137218.
## Software packaging/containerisation tools

- [Anaconda](https://anaconda.com)
Expand Down
14 changes: 12 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,19 @@ On release, automated continuous integration tests run the pipeline on a full-si

## Pipeline summary

The pipleline takes aligned PacBio sample reads (CRAM/BAM files and their index files) from a CSV file and the reference file in FASTA format, and then uses DeepVariant tool to make variant calling.
The pipeline takes aligned or unaligned PacBio sample reads (CRAM/BAM files) from a CSV file and the reference file in FASTA format, and then uses the DeepVariant tool to call variants.

Steps involved:

- Split fasta file into smaller files, normally one sequence per file unless the sequences are too small.
- Align the reads if not aligned.
- Merge input BAM/CRAM files together if they have the same sample names.
- Filter out reads using the `-F 0x900` option to only retain the primary alignments.
- Run DeepVariant using filtered BAM/CRAM files against each of split fasta files.
- Merge all VCF and GVCF files generated by DeepVariant by sample together for each input BAM/CRAM file.
- Run VCFtools to calculate heterozygosity and per site nucleotide diversity.

<img src="docs/images/mermaid-diagram.png">

## Quick Start

Expand All @@ -36,7 +41,12 @@ Steps involved:
3. Download the pipeline and test it on a minimal dataset with a single command:

```bash
# for aligned reads
nextflow run sanger-tol/variantcalling -profile test,YOURPROFILE --outdir <OUTDIR>

# for unaligned reads
nextflow run sanger-tol/variantcalling -profile test_align,YOURPROFILE --align --outdir <OUTDIR>

```

Note that some form of configuration will be needed so that Nextflow knows how to fetch the required software. This is usually done in the form of a config profile (`YOURPROFILE` in the example command above). You can chain multiple config profiles in a comma-separated string.
Expand All @@ -49,7 +59,7 @@ Steps involved:
4. Start running your own analysis!

```bash
nextflow run sanger-tol/variantcalling --input samplesheet.csv --outdir <OUTDIR> --fasta genome.fasta.gz --fai genome.fasta.gz.fai --gzi genome.fasta.gz.gzi -profile <docker/singularity/podman/shifter/charliecloud/conda/institute>
nextflow run sanger-tol/variantcalling --input samplesheet.csv --outdir <OUTDIR> --fasta genome.fasta.gz -profile <docker/singularity/podman/shifter/charliecloud/conda/institute>
```

## Credits
Expand Down
3 changes: 1 addition & 2 deletions assets/email_template.html
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1">

<meta name="description" content="sanger-tol/variantcalling: Variant calling pipeline based on DeepVariant">
<meta name="description" content="sanger-tol/variantcalling: Variant calling pipeline for PacBio data using DeepVariant">
<title>sanger-tol/variantcalling Pipeline Report</title>
</head>
<body>
Expand Down
9 changes: 5 additions & 4 deletions assets/samplesheet.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
sample,datatype,datafile,indexfile
sample1,pacbio,/path/to/data/file/file1.bam,/path/to/index/file/file1.bam.bai
sample2,pacbio,/path/to/data/file/file2.cram,/path/to/index/file/file2.cram.crai
sample3,pacbio,/path/to/data/file/file3.bam,/path/to/index/file/file3.bam.csi
sample,datatype,datafile
sample1,pacbio,/path/to/data/file/file1.bam
sample2,pacbio,/path/to/data/file/file2.cram
sample3,pacbio,/path/to/data/file/file3-1.bam
sample3,pacbio,/path/to/data/file/file3-2.cram
9 changes: 5 additions & 4 deletions assets/samplesheet_test.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
sample,datatype,datafile,indexfile
icCanRufa1_crai,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.cram,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.cram.crai
icCanRufa1_bai,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam.bai
icCanRufa1_csi,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam.csi
sample,datatype,datafile
icCanRufa1_cram,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.cram
icCanRufa1_bam,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam
icCanRufa1,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.cram
icCanRufa1,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1_0_3.bam
4 changes: 4 additions & 0 deletions assets/samplesheet_test_align.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
sample,datatype,datafile
icCanRufa1,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/genomic_data/icCanRufa1/pacbio/m64094_200730_174533.ccs.bc1010_BAK8A_OA--bc1010_BAK8A_OA_0_03.bam
icCanRufa1,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/genomic_data/icCanRufa1/pacbio/m64094_200730_174533.ccs.bc1010_BAK8A_OA--bc1010_BAK8A_OA_0_02.bam
icCanRufa1XXXXX,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/genomic_data/icCanRufa1/pacbio/m64094_200730_174533.ccs.bc1010_BAK8A_OA--bc1010_BAK8A_OA_0_02.bam
4 changes: 2 additions & 2 deletions assets/samplesheet_test_full.csv
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
sample,datatype,datafile,indexfile
icCanRufa1,pacbio,/lustre/scratch123/tol/projects/.sandbox/data/insects/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1.cram,/lustre/scratch123/tol/projects/.sandbox/data/insects/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1.cram.crai
sample,datatype,datafile
icCanRufa1,pacbio,/lustre/scratch123/tol/resources/nextflow/test-data/Cantharis_rufa/analysis/icCanRufa1/read_mapping/pacbio/GCA_947369205.1.unmasked.pacbio.icCanRufa1.cram
2 changes: 2 additions & 0 deletions assets/samplesheet_test_full_align.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
sample,datatype,datafile
ilPolIcar1,pacbio,/lustre/scratch124/tol/projects/darwin/data/insects/Polyommatus_icarus/genomic_data/ilPolIcar1/pacbio/m64016_191206_183623.ccs.bc1019_BAK8B_OA--bc1019_BAK8B_OA.bam
7 changes: 1 addition & 6 deletions assets/schema_input.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,8 @@
"type": "string",
"pattern": "^\\S+\\.(bam|cram)$",
"errorMessage": "Data file for reads cannot contain spaces and must have extension 'cram' or 'bam'"
},
"indexfile": {
"type": "string",
"pattern": "^\\S+\\.(bai|csi|crai)$",
"errorMessage": "Data index file for reads cannot contain spaces and must have extension 'bai', 'csi' or 'crai'"
}
},
"required": ["sample", "datatype", "datafile", "indexfile"]
"required": ["sample", "datatype", "datafile"]
}
}
Loading

0 comments on commit 15a1475

Please sign in to comment.