From bb4b99efea0d50fa242640b4357605945be8f6ae Mon Sep 17 00:00:00 2001 From: Charles Plessy Date: Thu, 3 Oct 2024 15:39:39 +0900 Subject: [PATCH] pre-commit run --all-files --- README.md | 42 +++++++++++++++++------------------ modules.json | 44 ++++++++++--------------------------- modules/local/mergemasks.nf | 2 +- nextflow_schema.json | 14 ++---------- 4 files changed, 35 insertions(+), 67 deletions(-) diff --git a/README.md b/README.md index e3530f3..f2e74a2 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,4 @@ -Genome pre-processing pipeline -============================== +# Genome pre-processing pipeline This is a local pipeline to pre-process downloaded genomes before feeding them to . @@ -8,21 +7,22 @@ to . This pipeline takes genomes as inputs and soft-masks their repeats with the following software: - - tantan (our default choice from a long time because TRF used to be non-free). - - windowmasker - - repeatmasker +- tantan (our default choice from a long time because TRF used to be non-free). +- windowmasker +- repeatmasker The input of repeatmasker can be any of: - - repeatmodeller (default) - - DFAM - - a custom repeat library. -Repeatmasker and repeatmodeller are run from the same image as the standard _nf-core_ module. But it is possible to pass the URL to an alternative singularity image, for instance to use the latest [TE Tools container](https://github.com/Dfam-consortium/TETools?tab=readme-ov-file#dfam-te-tools-container) +- repeatmodeller (default) +- DFAM +- a custom repeat library. + +Repeatmasker and repeatmodeller are run from the same image as the standard _nf-core_ module. But it is possible to pass the URL to an alternative singularity image, for instance to use the latest [TE Tools container](https://github.com/Dfam-consortium/TETools?tab=readme-ov-file#dfam-te-tools-container) ## Disclaimer This is not an official pipeline. This pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) initative, and reused here under the [MIT license](https://github.com/nf-core/tools/blob/master/LICENSE). - + > The nf-core framework for community-curated bioinformatics pipelines. > > Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen. @@ -73,30 +73,30 @@ nextflow run ./main.nf \ ## Options - - Point `--repeatlib` to a FASTA file to have an extra RepeatMasker run using it as a library. - - Set `--taxon` to a taxon name to have an extra RepeatMasker run using the `-species` option set to that taxon. - - Point `--singularity_image` to a local file path like `/flash/LuscombeU/singularity.cacheDir/tetools_1.88.5.sif` or an URL to singularity image to replace the default one. - - Set the `--gzipped_input=false` parameter when the input is not compressed.. +- Point `--repeatlib` to a FASTA file to have an extra RepeatMasker run using it as a library. +- Set `--taxon` to a taxon name to have an extra RepeatMasker run using the `-species` option set to that taxon. +- Point `--singularity_image` to a local file path like `/flash/LuscombeU/singularity.cacheDir/tetools_1.88.5.sif` or an URL to singularity image to replace the default one. +- Set the `--gzipped_input=false` parameter when the input is not compressed.. ## Pipeline output ### `tantan`, `repeatmodeler`, `windowmasker`, `dfam` (optional), `extlib` (optional) - - Masked genome file (compressed). - - BED file representing the masked regions. - - Summary statistics of the softmasked genome. +- Masked genome file (compressed). +- BED file representing the masked regions. +- Summary statistics of the softmasked genome. ## Resource usage On a test run on haplotype-merged and diploid assemblies of _Oikopleura dioica_ (2n = 60 Mbp): - - CPU usage was ~50 % for most processes. RepeatModeller was allocated 24 cores and used ~10 on average. - - Memory usage was less than 1 GB for all processes except RepeatModeller (~6 GB, max 8 GB). - - All processes needed only 10 % of the allocated time, except for RepeatModeller, which took between 100 and 500 minutes. +- CPU usage was ~50 % for most processes. RepeatModeller was allocated 24 cores and used ~10 on average. +- Memory usage was less than 1 GB for all processes except RepeatModeller (~6 GB, max 8 GB). +- All processes needed only 10 % of the allocated time, except for RepeatModeller, which took between 100 and 500 minutes. ## Future directions - - It may be interesting to add TRF and ULTRA, and compare and combine their results to the ones of tantan. +- It may be interesting to add TRF and ULTRA, and compare and combine their results to the ones of tantan. ## Credits diff --git a/modules.json b/modules.json index a5ccdd1..3228bc5 100644 --- a/modules.json +++ b/modules.json @@ -8,58 +8,42 @@ "fastqc": { "branch": "master", "git_sha": "285a50500f9e02578d90b3ce6382ea3c30216acd", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "gfastats": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "gunzip": { "branch": "master", "git_sha": "06c8865e36741e05ad32ef70ab3fac127486af48", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "multiqc": { "branch": "master", "git_sha": "b7ebe95761cd389603f9cc0e0dc384c0f663815a", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "repeatmodeler/builddatabase": { "branch": "master", "git_sha": "1cbb5551b917aa423e414dcd69898d01520a309d", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "repeatmodeler/repeatmodeler": { "branch": "master", "git_sha": "84efd2f87d07deb22ee9378f065a9aa5f1434161", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "windowmasker/mkcounts": { "branch": "master", "git_sha": "32cac29d4a92220965dace68a1fb0bb2e3547cac", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "windowmasker/ustat": { "branch": "master", "git_sha": "32cac29d4a92220965dace68a1fb0bb2e3547cac", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] } } }, @@ -68,23 +52,17 @@ "utils_nextflow_pipeline": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] }, "utils_nfcore_pipeline": { "branch": "master", "git_sha": "92de218a329bfc9a9033116eb5f65fd270e72ba3", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] }, "utils_nfvalidation_plugin": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] } } } diff --git a/modules/local/mergemasks.nf b/modules/local/mergemasks.nf index 89ed9f0..f284f43 100644 --- a/modules/local/mergemasks.nf +++ b/modules/local/mergemasks.nf @@ -26,7 +26,7 @@ process MERGE_MASKS { bedtools jaccard -nonamecheck -a $tantan -b $windowmasker > ${prefix}_tantan_windowmasker_jaccard.txt bedtools jaccard -nonamecheck -a $tantan -b $repeatmasker > ${prefix}_tantan_repeatmasker_jaccard.txt bedtools jaccard -nonamecheck -a $repeatmasker -b $windowmasker > ${prefix}_repeatmasker_windowmasker_jaccard.txt - + zcat $tantan $windowmasker | sort -k1,1 -k2,2n | bedtools merge | gzip --best > ${prefix}_tantan_windowmasker.bed.gz zcat $tantan $repeatmasker | sort -k1,1 -k2,2n | bedtools merge | gzip --best > ${prefix}_tantan_repeatmasker.bed.gz zcat $windowmasker $repeatmasker | sort -k1,1 -k2,2n | bedtools merge | gzip --best > ${prefix}_windowmasker_repeatmasker.bed.gz diff --git a/nextflow_schema.json b/nextflow_schema.json index 64fc1d9..9c14430 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,10 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "outdir" - ], + "required": ["input", "outdir"], "properties": { "input": { "type": "string", @@ -184,14 +181,7 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, "email_on_fail": {