From 286c1eab20721cd5bb87204d08973e3dbcde464b Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 26 Nov 2024 12:30:57 +0000 Subject: [PATCH] Updates to changelog and removing CHUNKFASTA, replaced with seqkit --- CHANGELOG.md | 72 ++++++++++++++++++---------------- modules/local/chunkfasta.nf | 48 ----------------------- subworkflows/local/selfcomp.nf | 13 +++--- 3 files changed, 44 insertions(+), 89 deletions(-) delete mode 100755 modules/local/chunkfasta.nf diff --git a/CHANGELOG.md b/CHANGELOG.md index 4c188317..6be0c8b6 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,9 @@ Our 3rd release for sanger-tol/treeval. - Adding arch specification to Pretext GitHub actions runner. Hopefully this will stop the spurious errors we see on there. - Addition of steps into schema. - Adds \*ktab as an output. +- Updated singularity containers +- Added `--metaeuk` to BUSCO_BUSCO, default was causing pipeline errors on Actions -- Needs more investigation. +- Replaced Pyfasta split (deprecated 6 years ago) with Seqkit split which is frequently updated and very fast. ### Parameters @@ -31,40 +34,41 @@ Our 3rd release for sanger-tol/treeval. Note, since the pipeline is using Nextflow DSL2, each process will be run with its own Biocontainer. This means that on occasion it is entirely possible for the pipeline to be using different versions of the same tool. However, the overall software dependency changes compared to the last release have been listed below for reference. 
-| Module | Old Version | New Versions | -| -------------------------------------- | ---------------- | ------------ | -| bamtobed_sort ( bedtools + samtools ) | 2.31.0 + 1.17 | | -| bedtools | 2.31.1 | - | -| busco\* | 5.5.0 | - | -| bwa-mem2 | 2.2.1 | | -| cat | 2.3.4 | | -| chunk_fasta ( pyfasta ) | 0.5.2-1 | | -| cooler | 0.9.2 | | -| cram_filter_align_bwamem2_fixmate_sort | - | | -| ^ ( samtools + bwamem2 ) ^ | 1.17 + 2.2.1 | | -| coreutils | 9.1 | | -| fastk | 1.0.1 | | -| gcc | 10.4.0 | | -| find_telomere_windows ( java-jdk ) | 8.0.112 | | -| generate_cram_csv ( samtools ) | 1.17 | | -| gnu-sort | 8.25 | 9.3 | -| juicer_tools_pre ( java-jdk ) | 8.0.112 | | -| perl | 5.26.2 | | -| merquryfk | 1.0.1 | | -| minimap2 + samtools | 2.24 + 1.14 | | -| minimap2_index | 2.24 | 2.28 | -| miniprot | 0.11--he4a0461_2 | | -| mummer | 3.23 | | -| paftools ( minimap2 + samtools ) | 2.24 + 1.14 | | -| pretextmap + samtools | 0.0.2 + 1.17 | 0.0.3 + 1.17 | -| python | 3.9 | - | -| - pandas | 1.5.2 | - | -| samtools | 1.18 | 1.21 | -| selfcomp_splitfasta ( perl-bioperl ) | 1.7.8-1 | | -| seqtk | 1.4 | | -| tabix | 1.11 | | -| ucsc | 377 | 447 | -| windowmasker (blast) | 2.14.0 | 2.15.0 | +| Module | Old Version | New Versions | +| -------------------------------------- | ---------------- | ----------------- | +| bamtobed_sort ( bedtools + samtools ) | 2.31.0 + 1.17 | | +| bedtools | 2.31.1 | - | +| busco\* | 5.5.0 | - | +| bwa-mem2 | 2.2.1 | | +| cat | 2.3.4 | | +| chunk_fasta ( pyfasta ) | 0.5.2-1 | REMOVED | +| cooler | 0.9.2 | | +| cram_filter_align_bwamem2_fixmate_sort | - | | +| ^ ( samtools + bwamem2 ) ^ | 1.17 + 2.2.1 | | +| coreutils | 9.1 | | +| fastk | 1.0.1 | | +| gcc | 10.4.0 | | +| find_telomere_windows ( java-jdk ) | 8.0.112 | | +| generate_cram_csv ( samtools ) | 1.17 | | +| gnu-sort | 8.25 | 9.3 | +| juicer_tools_pre ( java-jdk ) | 8.0.112 | | +| perl | 5.26.2 | | +| merquryfk | 1.0.1 | | +| minimap2 + samtools | 2.24 + 1.14 | | +| minimap2_index | 2.24 | 2.28 
| +| miniprot | 0.11--he4a0461_2 | | +| mummer | 3.23 | | +| paftools ( minimap2 + samtools ) | 2.24 + 1.14 | | +| pretextmap + samtools | 0.0.2 + 1.17 | 0.0.3 + 1.17 | +| python | 3.9 | - | +| - pandas | 1.5.2 | - | +| samtools | 1.18 | 1.21 | +| selfcomp_splitfasta ( perl-bioperl ) | 1.7.8-1 | | +| seqtk | 1.4 | | +| seqkit | ADDED | 2.9.0--h9ee0642_0 | +| tabix | 1.11 | | +| ucsc | 377 | 447 | +| windowmasker (blast) | 2.14.0 | 2.15.0 | - busco is currently pinned to v5.5.0 - Upgrading v5.7.1 would cause github actions to crash. Further investigation needed. diff --git a/modules/local/chunkfasta.nf b/modules/local/chunkfasta.nf deleted file mode 100755 index 0400df24..00000000 --- a/modules/local/chunkfasta.nf +++ /dev/null @@ -1,48 +0,0 @@ -process CHUNKFASTA { - tag "${meta.id}" - label 'process_low' - - conda "conda-forge::pyfasta=0.5.2-1" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/pyfasta:0.5.2--py_1' : - 'biocontainers/pyfasta:0.5.2--py_1' }" - - input: - tuple val(meta), path('input.fasta') - val(number_of_chunks) - - output: - tuple val(meta), path('*.fasta'), emit: fasta - path "versions.yml" , emit: versions - - script: - def VERSION = '0.5.2' // Tool does not report version - // This should be abstracted outside of the container to - // stop it spinning up in the first place, - // however dsl2 can't do comparisons with channels which makes it harder - """ - if [ $number_of_chunks -le 1 ]; then - mv input.fasta ${meta.id}_whole.fasta - else - pyfasta split -n $number_of_chunks input.fasta - fi - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version | sed 's/Python //g') - pyfasta: $VERSION - END_VERSIONS - """ - - stub: - def VERSION = '0.5.2' // Tool does not report version - """ - touch ${meta.id}.fa - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version | sed 's/Python 
//g') - pyfasta: $VERSION - END_VERSIONS - """ -} diff --git a/subworkflows/local/selfcomp.nf b/subworkflows/local/selfcomp.nf index 9d1e0e95..8e702635 100755 --- a/subworkflows/local/selfcomp.nf +++ b/subworkflows/local/selfcomp.nf @@ -12,7 +12,7 @@ include { BEDTOOLS_SORT } from '../../modules/nf-core/bedtools/ include { SELFCOMP_SPLITFASTA } from '../../modules/local/selfcomp_splitfasta' include { SELFCOMP_MUMMER2BED } from '../../modules/local/selfcomp_mummer2bed' include { SELFCOMP_MAPIDS } from '../../modules/local/selfcomp_mapids' -include { CHUNKFASTA } from '../../modules/local/chunkfasta' +include { SEQKIT_SPLIT } from '../../modules/local/seqkit/split/main' include { CAT_CAT } from '../../modules/nf-core/cat/cat/main' include { SELFCOMP_ALIGNMENTBLOCKS } from '../../modules/local/selfcomp_alignmentblocks' include { CONCATBLOCKS } from '../../modules/local/concatblocks' @@ -57,28 +57,28 @@ workflow SELFCOMP { // MODULE: SPLIT REFERENCE FILE INTO 1GB CHUNKS // THIS IS THE QUERY, AND REFERENCE IF GENOME.size() > 1GB // - CHUNKFASTA( + SEQKIT_SPLIT( SELFCOMP_SPLITFASTA.out.fa, chunk_number ) - ch_versions = ch_versions.mix(CHUNKFASTA.out.versions) + ch_versions = ch_versions.mix(SEQKIT_SPLIT.out.versions) // // LOGIC: STRIP META FROM QUERY, AND COMBINE WITH REFERENCE FILE // THIS LEAVES US WITH n=( REFERENCE + QUERY) IF GENOME.SIZE() < 1GB // OR n=((REFERENCE / 1E9) * (REFENCE / 1E9)) IF GENOME.SIZE() > 1GB // - CHUNKFASTA.out.fasta + SEQKIT_SPLIT.out.fasta .map{meta, query -> query } - .collect() // Collect any output from CHUNKFASTA + .collect() // Collect any output from SEQKIT_SPLIT .map {it -> tuple( [ len: it.size() ], // Calc length of list it ) } - .set {len_ch} // tap out to preserve length of CHUNKFASTA list + .set {len_ch} // tap out to preserve length of SEQKIT_SPLIT list len_ch // tap swapped with set as tap stops pipeline completion .map {meta, files -> @@ -216,4 +216,3 @@ workflow SELFCOMP { ch_bigbed = UCSC_BEDTOBIGBED.out.bigbed versions = 
ch_versions.ifEmpty(null) } -