Skip to content

Commit

Permalink
Merge pull request #83 from sanger-tol/alignment
Browse files Browse the repository at this point in the history
Optional alignment subworkflow
  • Loading branch information
priyanka-surana authored Oct 18, 2023
2 parents 5412568 + 378ab70 commit bb01d13
Show file tree
Hide file tree
Showing 58 changed files with 1,007 additions and 143 deletions.
8 changes: 0 additions & 8 deletions .editorconfig
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,3 @@ indent_size = unset
[/assets/email*]
indent_size = unset

# To prevent errors for these test diamond databases
[/assets/test*/*.dmnd]
charset = unset
end_of_line = unset
insert_final_newline = unset
trim_trailing_whitespace = unset
indent_style = unset
indent_size = unset
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/bug_report.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ body:
id: system
attributes:
label: System information
description: "* Nextflow version _(eg. 23.04.1)_
description: "* Nextflow version _(eg. 22.10.1)_
* Hardware _(eg. HPC, Desktop, Cloud)_
Expand Down
21 changes: 18 additions & 3 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
name: nf-core CI
# This workflow runs the pipeline with the minimal test dataset to check that it completes without any syntax errors
on:
workflow_dispatch:
push:
branches:
- dev
pull_request:
release:
types: [published]

env:
NXF_ANSI_LOG: false
Expand All @@ -19,7 +24,7 @@ jobs:
strategy:
matrix:
NXF_VER:
- "23.04.1"
- "22.10.1"
- "latest-everything"
steps:
- name: Check out pipeline code
Expand All @@ -30,9 +35,19 @@ jobs:
with:
version: "${{ matrix.NXF_VER }}"

- name: Download the NCBI taxdump database
run: |
mkdir ncbi_taxdump
curl -L https://ftp.ncbi.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz | tar -C ncbi_taxdump -xzf -
- name: Download the BUSCO lineage database
run: |
mkdir busco_database
curl -L https://tolit.cog.sanger.ac.uk/test-data/resources/busco/blobtoolkit.GCA_922984935.2.2023-08-03.lineages.tar.gz | tar -C busco_database -xzf -
- name: Run pipeline with test data
# You can customise CI pipeline run tests as required
# For example: adding multiple test runs with different parameters
# Remember that you can parallelise this by using strategy.matrix
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --taxdump $PWD/ncbi_taxdump --busco $PWD/busco_database --outdir ./results
12 changes: 6 additions & 6 deletions .github/workflows/fix-linting.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,21 @@ jobs:
# Only run if comment is on a PR with the main repo, and if it contains the magic keywords
if: >
contains(github.event.comment.html_url, '/pull/') &&
contains(github.event.comment.body, '@nf-core-bot fix linting') &&
contains(github.event.comment.body, '@sanger-tolsoft fix linting') &&
github.repository == 'sanger-tol/blobtoolkit'
runs-on: ubuntu-latest
steps:
# Use the @nf-core-bot token to check out so we can push later
# Use the @sanger-tolsoft token to check out so we can push later
- uses: actions/checkout@v3
with:
token: ${{ secrets.nf_core_bot_auth_token }}
token: ${{ secrets.sangertolsoft_access_token }}

# Action runs on the issue comment, so we don't get the PR by default
# Use the gh cli to check out the PR
- name: Checkout Pull Request
run: gh pr checkout ${{ github.event.issue.number }}
env:
GITHUB_TOKEN: ${{ secrets.nf_core_bot_auth_token }}
GITHUB_TOKEN: ${{ secrets.sangertolsoft_access_token }}

- uses: actions/setup-node@v3

Expand All @@ -46,8 +46,8 @@ jobs:
- name: Commit & push changes
if: steps.prettier_status.outputs.result == 'fail'
run: |
git config user.email "[email protected]"
git config user.name "nf-core-bot"
git config user.email "[email protected]"
git config user.name "sanger-tolsoft"
git config push.default upstream
git add .
git status
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: nf-core Sanger LSF tests
name: sanger-tol LSF tests

on:
workflow_dispatch:
Expand All @@ -13,16 +13,21 @@ jobs:
if: github.event_name == 'workflow_dispatch'

- name: Launch workflow via tower
uses: nf-core/tower-action@v2
uses: seqeralabs/action-tower-launch@v2
with:
workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
access_token: ${{ secrets.TOWER_ACCESS_TOKEN }}
compute_env: ${{ secrets.TOWER_COMPUTE_ENV }}
pipeline: ${{ github.repository }}
revision: ${{ env.REVISION }}
workdir: ${{ secrets.TOWER_WORKDIR_PARENT }}/work/${{ github.repository }}/work-${{ env.REVISION }}
parameters: |
{
"outdir": "${{ secrets.TOWER_WORKDIR_PARENT }}/results/${{ github.repository }}/results-${{ env.REVISION }}",
}
profiles: test,sanger,singularity,cleanup
- uses: actions/upload-artifact@v3
with:
name: Tower debug log file
path: |
tower_action_*.log
tower_action_*.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: nf-core Sanger LSF full size tests
name: sanger-tol LSF full size tests

on:
workflow_dispatch:
Expand All @@ -18,16 +18,21 @@ jobs:
if: github.event_name == 'workflow_dispatch'

- name: Launch workflow via tower
uses: nf-core/tower-action@v2
uses: seqeralabs/action-tower-launch@v2
with:
workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
access_token: ${{ secrets.TOWER_ACCESS_TOKEN }}
compute_env: ${{ secrets.TOWER_COMPUTE_ENV }}
pipeline: ${{ github.repository }}
revision: ${{ env.REVISION }}
workdir: ${{ secrets.TOWER_WORKDIR_PARENT }}/work/${{ github.repository }}/work-${{ env.REVISION }}
parameters: |
{
"outdir": "${{ secrets.TOWER_WORKDIR_PARENT }}/results/${{ github.repository }}/results-${{ env.REVISION }}",
}
profiles: test_full,sanger,singularity,cleanup
- uses: actions/upload-artifact@v3
with:
name: Tower debug log file
path: |
tower_action_*.log
tower_action_*.json
1 change: 1 addition & 0 deletions .nf-core.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ lint:
files_unchanged:
- LICENSE
- .github/ISSUE_TEMPLATE/bug_report.yml
- .github/workflows/linting.yml
- assets/sendmail_template.txt
- lib/NfcoreTemplate.groovy
- .prettierignore
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
[![GitHub Actions Linting Status](https://github.com/sanger-tol/blobtoolkit/workflows/nf-core%20linting/badge.svg)](https://github.com/sanger-tol/blobtoolkit/actions?query=workflow%3A%22nf-core+linting%22)
[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.7949058-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.7949058)

[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A523.04.1-23aa62.svg)](https://www.nextflow.io/)
[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A522.10.1-23aa62.svg)](https://www.nextflow.io/)
[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)
[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)
[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)
Expand Down Expand Up @@ -60,7 +60,7 @@ mMelMel1,illumina,GCA_922984935.2.illumina.mMelMel1.cram
mMelMel3,ont,GCA_922984935.2.ont.mMelMel3.cram
```

Each row represents an aligned file. Rows with the same sample identifier are considered technical replicates. The datatype refers to the sequencing technology used to generate the underlying raw data and follows a controlled vocabulary (ont, hic, pacbio, illumina). The aligned read files can be generated using the [sanger-tol/readmapping](https://github.com/sanger-tol/readmapping) pipeline.
Each row represents an aligned file. Rows with the same sample identifier are considered technical replicates. The datatype refers to the sequencing technology used to generate the underlying raw data and follows a controlled vocabulary (ont, hic, pacbio, pacbio_clr, illumina). The aligned read files can be generated using the [sanger-tol/readmapping](https://github.com/sanger-tol/readmapping) pipeline.

Now, you can run the pipeline using:

Expand Down
2 changes: 1 addition & 1 deletion assets/schema_input.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
"datatype": {
"type": "string",
"pattern": "^\\S+$",
"enum": ["hic", "illumina", "ont", "pacbio"],
"enum": ["hic", "illumina", "ont", "pacbio", "pacbio_clr"],
"errorMessage": "Data type, and must be one of: 'hic' or 'illumina' or 'ont' or 'pacbio' or 'pacbio_clr'"
},
"datafile": {
Expand Down
Binary file removed assets/test/mCerEla1.1.buscogenes.dmnd
Binary file not shown.
4 changes: 4 additions & 0 deletions assets/test/samplesheet_raw.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
sample,datatype,datafile
mMelMel1,illumina,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/genomic_data/mMelMel1/illumina/31231_3#1_subset.cram
mMelMel2,illumina,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/genomic_data/mMelMel2/illumina/31231_4#1_subset.cram
mMelMel3,hic,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/genomic_data/mMelMel3/hic-arima2/35528_2#1_subset.cram
4 changes: 2 additions & 2 deletions assets/test_full/full_samplesheet.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
sample,datatype,datafile
gfLaeSulp1,hic,/lustre/scratch123/tol/projects/.sandbox/data/fungi/Laetiporus_sulphureus/analysis/gfLaeSulp1.1/read_mapping/hic/GCA_927399515.1.unmasked.hic.gfLaeSulp1.cram
gfLaeSulp1,pacbio,/lustre/scratch123/tol/projects/.sandbox/data/fungi/Laetiporus_sulphureus/analysis/gfLaeSulp1.1/read_mapping/pacbio/GCA_927399515.1.unmasked.pacbio.gfLaeSulp1.cram
gfLaeSulp1,hic,/lustre/scratch123/tol/resources/nextflow/test-data/Laetiporus_sulphureus/analysis/gfLaeSulp1.1/read_mapping/hic/GCA_927399515.1.unmasked.hic.gfLaeSulp1.cram
gfLaeSulp1,pacbio,/lustre/scratch123/tol/resources/nextflow/test-data/Laetiporus_sulphureus/analysis/gfLaeSulp1.1/read_mapping/pacbio/GCA_927399515.1.unmasked.pacbio.gfLaeSulp1.cram
Binary file removed assets/test_full/gfLaeSulp1.1.buscogenes.dmnd
Binary file not shown.
6 changes: 5 additions & 1 deletion bin/check_samplesheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,16 @@ class RowChecker:
"""

VALID_FORMATS = (".cram",)
VALID_FORMATS = (
".cram",
".bam",
)

VALID_DATATYPES = (
"hic",
"illumina",
"pacbio",
"pacbio_clr",
"ont",
)

Expand Down
60 changes: 56 additions & 4 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -14,22 +14,55 @@ process {

withName: "SAMPLESHEET_CHECK" {
publishDir = [
path: { "${params.outdir}/blobtoolkit_info" },
path: { "${params.outdir}/pipeline_info/blobtoolkit" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals("versions.yml") ? null : filename }
]
}

withName: "GOAT_TAXONSEARCH" {
ext.args = "-l -b"
withName: "WINDOWMASKER_MKCOUNTS" {
ext.args = "-infmt fasta -sformat obinary"
}

withName: "WINDOWMASKER_USTAT" {
ext.args = "-infmt fasta -dust T -outfmt fasta"
}

withName: "MINIMAP2_HIC" {
ext.args = "-ax sr"
}

withName: "MINIMAP2_ILMN" {
ext.args = "-ax sr"
}

withName: "MINIMAP2_CCS" {
ext.args = "-ax map-hifi --cs=short"
}

withName: "MINIMAP2_CLR" {
ext.args = "-ax map-pb"
}

withName: "MINIMAP2_ONT" {
ext.args = "-ax map-ont"
}

withName: "SAMTOOLS_VIEW" {
ext.args = "--output-fmt bam --write-index"
}

withName: "SAMTOOLS_INDEX" {
ext.args = "-c"
}

withName: "GOAT_TAXONSEARCH" {
ext.args = "--lineage --busco"
}

withName: "BUSCO" {
scratch = true
// Overridden in the test profile, see at the end of this file
ext.args = "--mode genome --force"
}

Expand Down Expand Up @@ -68,7 +101,7 @@ process {

withName: "CUSTOM_DUMPSOFTWAREVERSIONS" {
publishDir = [
path: { "${params.outdir}/blobtoolkit_info" },
path: { "${params.outdir}/pipeline_info/blobtoolkit" },
mode: params.publish_dir_mode,
pattern: "*_versions.yml"
]
Expand All @@ -84,3 +117,22 @@ process {
}

}


/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Additional configuration to speed processes up during testing.
----------------------------------------------------------------------------------------
*/

profiles {
test {
process {
withName: BUSCO {
// Note: BUSCO *must* see the double-quotes around the parameters
ext.args = '--mode genome --force --metaeuk_parameters \'"-s=2"\' --metaeuk_rerun_parameters \'"-s=2"\''
}
}
}
}
10 changes: 5 additions & 5 deletions conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

params {
config_profile_name = 'Test profile'
config_profile_description = 'Minimal test dataset to check pipeline function'
config_profile_description = 'Minimal aligned test dataset to check pipeline function'

// Limit resources so that this can run on GitHub Actions
max_cpus = 2
Expand All @@ -22,15 +22,15 @@ params {
// Input test data
// Specify the paths to your test data
// Give any required params for the test so that command line flags are not needed
input = "${projectDir}/assets/test/samplesheet.csv"
input = "${projectDir}/assets/test/samplesheet_s3.csv"

// Fasta references
fasta = "/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.fasta.gz"
fasta = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.fasta.gz"
accession = "GCA_922984935.2"
taxon = "Meles meles"

// Databases
taxdump = "/lustre/scratch123/tol/teams/grit/geval_pipeline/btk_databases/taxdump"
busco = "/lustre/scratch123/tol/resources/nextflow/busco_2021_06_reduced/"
uniprot = "${projectDir}/assets/test/mCerEla1.1.buscogenes.dmnd"
busco = "/lustre/scratch123/tol/resources/nextflow/busco/blobtoolkit.GCA_922984935.2.2023-08-03"
uniprot = "https://tolit.cog.sanger.ac.uk/test-data/resources/diamond/mCerEla1.1.buscogenes.dmnd"
}
4 changes: 2 additions & 2 deletions conf/test_full.config
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,12 @@ params {
input = "${projectDir}/assets/test_full/full_samplesheet.csv"

// Fasta references
fasta = "/lustre/scratch124/tol/projects/darwin/data/fungi/Laetiporus_sulphureus/assembly/release/gfLaeSulp1.1/insdc/GCA_927399515.1.fasta.gz"
fasta = "https://tolit.cog.sanger.ac.uk/test-data/Laetiporus_sulphureus/assembly/release/gfLaeSulp1.1/insdc/GCA_927399515.1.fasta.gz"
accession = "GCA_927399515.1"
taxon = "Laetiporus sulphureus"

// Databases
taxdump = "/lustre/scratch123/tol/teams/grit/geval_pipeline/btk_databases/taxdump"
busco = "/lustre/scratch123/tol/resources/busco/v5/"
uniprot = "${projectDir}/assets/test_full/gfLaeSulp1.1.buscogenes.dmnd"
uniprot = "https://tolit.cog.sanger.ac.uk/test-data/resources/diamond/gfLaeSulp1.1.buscogenes.dmnd"
}
37 changes: 37 additions & 0 deletions conf/test_raw.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running minimal tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a fast and simple pipeline test.
Use as follows:
nextflow run sanger-tol/blobtoolkit -profile test_raw,<docker/singularity> --outdir <OUTDIR>
----------------------------------------------------------------------------------------
*/

// Parameter values supplied by this test configuration: a small raw-read
// dataset for Meles meles, run with the `align` option switched on.
params {
// Profile name and description — presumably shown in the run summary; confirm against the pipeline's logging code
config_profile_name = 'Raw test profile'
config_profile_description = 'Minimal raw test dataset to check pipeline function'

// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = '6.GB'
max_time = '6.h'

// Input test data
// Specify the paths to your test data
// Give any required params for the test so that command line flags are not needed
input = "${projectDir}/assets/test/samplesheet_raw.csv"
// Enables alignment for this run — presumably required because the samplesheet
// above lists raw (not yet aligned) read files; confirm against the workflow
align = true

// Fasta references
// The assembly is fetched over HTTPS; accession and taxon identify the same genome
fasta = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.fasta.gz"
accession = "GCA_922984935.2"
taxon = "Meles meles"

// Databases
// NOTE(review): taxdump and busco are absolute /lustre paths while uniprot is an
// HTTPS URL — presumably this configuration only runs where that filesystem is
// mounted, unless --taxdump and --busco are overridden on the command line; confirm
taxdump = "/lustre/scratch123/tol/teams/grit/geval_pipeline/btk_databases/taxdump"
busco = "/lustre/scratch123/tol/resources/nextflow/busco/blobtoolkit.GCA_922984935.2.2023-08-03"
uniprot = "https://tolit.cog.sanger.ac.uk/test-data/resources/diamond/mCerEla1.1.buscogenes.dmnd"
}
Loading

0 comments on commit bb01d13

Please sign in to comment.