Skip to content

Commit

Permalink
Merge pull request #63 from sanger-tol/pacbio_align
Browse files Browse the repository at this point in the history
Pacbio align
  • Loading branch information
gq1 authored Dec 8, 2023
2 parents 3b3a745 + 45a7ba4 commit 5f528f7
Show file tree
Hide file tree
Showing 78 changed files with 2,683 additions and 98 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,7 @@ jobs:
# Remember that you can parallelise this by using strategy.matrix
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results
- name: Run pipeline with unaligned test data
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test_align,docker --outdir ./results --align
3 changes: 2 additions & 1 deletion .github/workflows/sanger_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,9 @@ jobs:
parameters: |
{
"outdir": "${{ secrets.TOWER_WORKDIR_PARENT }}/results/${{ github.repository }}/results-${{ github.sha }}",
"align": true
}
profiles: test,sanger,singularity,cleanup
profiles: test_align,sanger,singularity,cleanup

- uses: actions/upload-artifact@v3
with:
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/sanger_test_full.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,9 @@ jobs:
parameters: |
{
"outdir": "${{ secrets.TOWER_WORKDIR_PARENT }}/results/${{ github.repository }}/results-${{ env.REVISION }}",
"align": true
}
profiles: test_full,sanger,singularity,cleanup
profiles: test_full_align,sanger,singularity,cleanup

- uses: actions/upload-artifact@v3
with:
Expand Down
4 changes: 4 additions & 0 deletions assets/samplesheet_test_align.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
sample,datatype,datafile
icCanRufa1,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/genomic_data/icCanRufa1/pacbio/m64094_200730_174533.ccs.bc1010_BAK8A_OA--bc1010_BAK8A_OA_0_03.bam
icCanRufa1,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/genomic_data/icCanRufa1/pacbio/m64094_200730_174533.ccs.bc1010_BAK8A_OA--bc1010_BAK8A_OA_0_02.bam
icCanRufa1XXXXX,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/genomic_data/icCanRufa1/pacbio/m64094_200730_174533.ccs.bc1010_BAK8A_OA--bc1010_BAK8A_OA_0_02.bam
2 changes: 2 additions & 0 deletions assets/samplesheet_test_full_align.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
sample,datatype,datafile
ilPolIcar1,pacbio,/lustre/scratch124/tol/projects/darwin/data/insects/Polyommatus_icarus/genomic_data/ilPolIcar1/pacbio/m64016_191206_183623.ccs.bc1019_BAK8B_OA--bc1019_BAK8B_OA.bam
Binary file added assets/vectorDB.tar.gz
Binary file not shown.
5 changes: 5 additions & 0 deletions bin/check_samplesheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,11 @@ def validate_unique_samples(self):
"""
if len(self._seen) != len(self.validated):
raise AssertionError("The combination of sample name and data file must be unique.")
seen = Counter()
for row in self.validated:
sample = row[self._sample_col]
seen[sample] += 1
row[self._sample_col] = f"{sample}_T{seen[sample]}"


def read_head(handle, num_lines=10):
Expand Down
6 changes: 6 additions & 0 deletions bin/pacbio_filter.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
#
# Build a blocklist of identifiers from a whitespace-delimited hit table.
#
# Usage: pacbio_filter.sh <input> <output>
#
# Lines containing 'MG551957' are discarded. From the remaining lines, field 1
# is kept when fields 2-4 satisfy one of:
#   - field 2 matches NGB00972,  field 3 >= 97, field 4 >= 44
#   - field 2 matches NGB00973,  field 3 >= 97, field 4 >= 34
#   - field 2 starts with "bc",  field 3 >= 99, field 4 >= 16
# The selected identifiers are sorted, de-duplicated and written to <output>.
#
# NOTE(review): the field layout looks like BLAST tabular output (-outfmt 6:
# query id, subject id, % identity, alignment length) - confirm with the caller.

if [ "$#" -ne 2 ]; then
    echo "Usage: $(basename "$0") <input> <output>" >&2
    exit 1
fi

input=$1
output=$2

# Both paths are quoted so that names containing spaces or glob characters are
# passed through intact. 'set -o pipefail' is deliberately NOT enabled: grep -v
# exits with status 1 when no line survives the filter, and the script has
# always produced an empty output file with a zero exit status in that case.
grep -v 'MG551957' "$input" | awk -v OFS='\t' '{if (($2 ~ /NGB00972/ && $3 >= 97 && $4 >= 44) || ($2 ~ /NGB00973/ && $3 >= 97 && $4 >= 34) || ($2 ~ /^bc/ && $3 >= 99 && $4 >= 16)) print $1}' | sort -u > "$output"
2 changes: 1 addition & 1 deletion conf/base.config
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ process {
time = { check_max( 4.h * task.attempt, 'time' ) }

errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' }
maxRetries = 1
maxRetries = 5
maxErrors = '-1'

// Process-specific resource requirements
Expand Down
66 changes: 62 additions & 4 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,73 @@ process {
]
}

if( params.align ) {

withName: '.*:ALIGN_PACBIO:FILTER_PACBIO:SAMTOOLS_CONVERT' {
ext.args = "-e '[rq]>=0.99' -x fi -x fp -x ri -x rp --write-index --output-fmt bam"
}

withName: '.*:ALIGN_PACBIO:FILTER_PACBIO:SAMTOOLS_COLLATE' {
ext.prefix = { "${meta.id}.collate" }
}

withName: '.*:ALIGN_PACBIO:FILTER_PACBIO:BLAST_BLASTN' {
ext.args = '-task blastn -reward 1 -penalty -5 -gapopen 3 -gapextend 3 -dust yes -soft_masking true -evalue .01 -searchsp 1750000000000 -outfmt 6'
}

withName: '.*:ALIGN_PACBIO:FILTER_PACBIO:SAMTOOLS_FILTER' {
ext.prefix = { "${meta.id}.filter" }
}

withName: '.*:ALIGN_PACBIO:FILTER_PACBIO:SAMTOOLS_FASTQ' {
ext.args = '-F 0x200 -nt'
}

withName: '.*:.*:ALIGN_PACBIO:MINIMAP2_ALIGN' {
ext.args = { "-ax map-hifi --cs=short -R ${meta.read_group}" }
}

withName: '.*:.*:ALIGN_PACBIO:SAMTOOLS_MERGE' {
ext.args = { "-c -p" }
ext.prefix = { "${meta.id}.merge" }
}

withName: '.*:CONVERT_STATS:SAMTOOLS_VIEW' {
ext.prefix = { "${meta2.id}.${meta.datatype}.${meta.id}" }
ext.args = '--output-fmt cram --write-index'
}

withName: '.*:CONVERT_STATS:SAMTOOLS_STATS' {
ext.prefix = { "${input.baseName}" }
}

withName: '.*:CONVERT_STATS:SAMTOOLS_FLAGSTAT' {
ext.prefix = { "${bam.baseName}" }
}

withName: '.*:CONVERT_STATS:SAMTOOLS_IDXSTATS' {
ext.prefix = { "${bam.baseName}" }
}

withName: '.*:ALIGN_PACBIO:CONVERT_STATS:.*' {
publishDir = [
path: { "${params.outdir}/variant_calling" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}
} else{

withName: '.*:INPUT_MERGE:SAMTOOLS_MERGE' {
ext.args = '--write-index'
}
}

withName: '.*:INPUT_FILTER_SPLIT:SAMTOOLS_VIEW' {
ext.args = '--output-fmt cram --write-index -F 0x900'
ext.prefix = { "${meta.id}_filtered" }
}

withName: '.*:INPUT_MERGE:SAMTOOLS_MERGE' {
ext.args = '--write-index'
}

withName: '.*:DEEPVARIANT_CALLER:DEEPVARIANT' {
ext.args = '--model_type=PACBIO'
}
Expand Down
27 changes: 27 additions & 0 deletions conf/test_align.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running minimal tests with alignment
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a fast and simple pipeline test.
Use as follows:
nextflow run sanger-tol/variantcalling -profile test_align,<docker/singularity> --outdir <OUTDIR> --align
----------------------------------------------------------------------------------------
*/

// Parameters for the minimal alignment test profile.
// Note: this block does not set `align`; the usage line in this file's header
// passes `--align` on the command line to switch the alignment path on.
params {
// Human-readable profile name and description
config_profile_name = 'Test profile with alignment'
config_profile_description = 'Minimal unaligned test dataset to check pipeline function'

// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = '6.GB'
max_time = '6.h'

// Input data
// Samplesheet shipped with the pipeline under assets/
input = "${projectDir}/assets/samplesheet_test_align.csv"

// Fasta references
// Gzipped reference FASTA fetched over HTTPS
fasta = "https://tolit.cog.sanger.ac.uk/test-data/Cantharis_rufa/assembly/GCA_947369205.1_OX376310.1_CANBKR010000003.1.fasta.gz"
}
25 changes: 25 additions & 0 deletions conf/test_full_align.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running full-size tests with alignment
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a full size pipeline test.
Use as follows:
nextflow run sanger-tol/variantcalling -profile test_full_align,<docker/singularity> --outdir <OUTDIR> --align
----------------------------------------------------------------------------------------
*/

// Nextflow `cleanup` setting enabled for this profile.
// NOTE(review): per the Nextflow docs this removes work-directory files after a
// successful run - confirm this is wanted, since it also limits `-resume`.
cleanup = true

// Parameters for the full-size alignment test profile.
// Note: this block does not set `align`; the usage line in this file's header
// passes `--align` on the command line to switch the alignment path on.
params {
// Human-readable profile name and description
config_profile_name = 'Full test profile with alignment'
config_profile_description = 'Full non-aligned test dataset to check pipeline function'

// Input data for full size test
// Samplesheet shipped with the pipeline under assets/
input = "${projectDir}/assets/samplesheet_test_full_align.csv"

// Fasta references
// Absolute /lustre path - presumably only resolvable on the Sanger cluster; verify before running elsewhere
fasta = "/lustre/scratch124/tol/projects/darwin/data/insects/Polyommatus_icarus/assembly/release/ilPolIcar1.1/insdc/GCA_937595015.1.fasta.gz"

}
55 changes: 53 additions & 2 deletions modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,11 @@
"git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
"installed_by": ["modules"]
},
"blast/blastn": {
"branch": "master",
"git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
"installed_by": ["modules"]
},
"cat/cat": {
"branch": "master",
"git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
Expand All @@ -25,11 +30,46 @@
"git_sha": "516189e968feb4ebdd9921806988b4c12b4ac2dc",
"installed_by": ["modules"]
},
"gunzip": {
"branch": "master",
"git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
"installed_by": ["modules"]
},
"minimap2/align": {
"branch": "master",
"git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
"installed_by": ["modules"]
},
"samtools/collate": {
"branch": "master",
"git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
"installed_by": ["modules"]
},
"samtools/faidx": {
"branch": "master",
"git_sha": "fd742419940e01ba1c5ecb172c3e32ec840662fe",
"installed_by": ["modules"]
},
"samtools/fasta": {
"branch": "master",
"git_sha": "6f4299292ef2c5b66e6829527b2647c301b77cc9",
"installed_by": ["modules"]
},
"samtools/fastq": {
"branch": "master",
"git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
"installed_by": ["modules"]
},
"samtools/flagstat": {
"branch": "master",
"git_sha": "63e817de8c617131447192ab2c4e70b4ed4071f7",
"installed_by": ["modules"]
},
"samtools/idxstats": {
"branch": "master",
"git_sha": "63e817de8c617131447192ab2c4e70b4ed4071f7",
"installed_by": ["modules"]
},
"samtools/merge": {
"branch": "master",
"git_sha": "e7ce60acc8a33fa17429e966364657a63016e870",
Expand All @@ -41,14 +81,25 @@
"git_sha": "a0f7be95788366c1923171e358da7d049eb440f9",
"installed_by": ["modules"]
},
"samtools/stats": {
"branch": "master",
"git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
"installed_by": ["modules"]
},
"samtools/view": {
"branch": "master",
"git_sha": "3ffae3598260a99e8db3207dead9f73f87f90d1f",
"git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
"installed_by": ["modules"],
"patch": "modules/nf-core/samtools/view/samtools-view.diff"
},
"untar": {
"branch": "master",
"git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
"installed_by": ["modules"]
},
"vcftools": {
"branch": "master",
"git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a",
"git_sha": "485558b40040fc3ace093d9084210125d8ba4c97",
"installed_by": ["modules"],
"patch": "modules/nf-core/vcftools/vcftools.diff"
}
Expand Down
30 changes: 30 additions & 0 deletions modules/local/pacbio_filter.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
// PACBIO_FILTER: runs the pacbio_filter.sh helper script on one input text file
// and produces a `<prefix>.blocklist` file plus a versions.yml recording the
// awk version available in the task environment.
process PACBIO_FILTER {
tag "$meta.id"
label 'process_single'

// Software environment: gawk 5.1.0 via conda, or a container image.
// The Singularity image URL is used only when the engine is singularity and
// task.ext.singularity_pull_docker_container is not set; otherwise the
// quay.io biocontainer is used.
conda "conda-forge::gawk=5.1.0"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/gawk:5.1.0' :
'quay.io/biocontainers/gawk:5.1.0' }"

input:
// meta: map providing at least `id` (used for the tag and the default prefix)
// txt:  text file handed to pacbio_filter.sh as its first argument
// NOTE(review): txt is presumably a tabular BLAST report - confirm against the caller
tuple val(meta), path(txt)

output:
// list:     the generated *.blocklist file (emitted without the meta map)
// versions: versions.yml written by the script block
path("*.blocklist"), emit: list
path "versions.yml", emit: versions

when:
// Runs unless task.ext.when is set to a false value
task.ext.when == null || task.ext.when

script:
// Output file prefix: task.ext.prefix when configured, otherwise meta.id
def prefix = task.ext.prefix ?: "${meta.id}"
"""
pacbio_filter.sh $txt ${prefix}.blocklist
cat <<-END_VERSIONS > versions.yml
"${task.process}":
GNU Awk: \$(echo \$(awk --version 2>&1) | grep -i awk | sed 's/GNU Awk //; s/,.*//')
END_VERSIONS
"""
}
7 changes: 7 additions & 0 deletions modules/nf-core/blast/blastn/environment.yml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

57 changes: 57 additions & 0 deletions modules/nf-core/blast/blastn/main.nf

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 5f528f7

Please sign in to comment.