diff --git a/CHANGELOG.md b/CHANGELOG.md index fb8159fa..5203d463 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## v2.2.0dev - [31-Oct-2024] +## v2.2.0dev - [04-Nov-2024] ### `Added` @@ -16,6 +16,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 7. Added parameter `hic_samtools_ext_args` and set its default value to `-F 3852` [#159](https://github.com/Plant-Food-Research-Open/assemblyqc/issues/159) 8. Added the HiC QC report to the final report so that users don't have to navigate to the results folder [#162](https://github.com/Plant-Food-Research-Open/assemblyqc/issues/162) 9. Added the fastp log to the final report [#163](https://github.com/Plant-Food-Research-Open/assemblyqc/issues/163) +10. Updated the tube map along with the tool list [#166](https://github.com/Plant-Food-Research-Open/assemblyqc/issues/166) +11. Added Orthofinder [#167](https://github.com/Plant-Food-Research-Open/assemblyqc/issues/167) +12. Changed order of tool options in the `nextflow.config` file ### `Fixed` diff --git a/CITATIONS.md b/CITATIONS.md index 22ae2fb0..bf095d43 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -18,15 +18,15 @@ > Gremme G, Steinbiss S, Kurtz S. 2013. "GenomeTools: A Comprehensive Software Library for Efficient Processing of Structured Genome Annotations," in IEEE/ACM Transactions on Computational Biology and Bioinformatics, vol. 10, no. 3, pp. 645-656, May 2013, doi: -- SAMTOOLS, [MIT/Expat](https://github.com/samtools/samtools/blob/develop/LICENSE) +- samtools, [MIT/Expat](https://github.com/samtools/samtools/blob/develop/LICENSE) > Danecek P, Bonfield JK, Liddle J, Marshall J, Ohan V, Pollard MO, Whitwham A, Keane T, McCarthy SA, Davies RM, Li H. 2021. 
Twelve years of SAMtools and BCFtools, GigaScience, Volume 10, Issue 2, February 2021, giab008, -- NCBI/FCS, [License](https://github.com/ncbi/fcs/blob/main/LICENSE.txt) +- NCBI FCS, [License](https://github.com/ncbi/fcs/blob/main/LICENSE.txt) > Astashyn A, Tvedte ES, Sweeney D, Sapojnikov V, Bouk N, Joukov V, Mozes E, Strope PK, Sylla PM, Wagner L, Bidwell SL, Clark K, Davis EW, Smith-White B, Hlavina W, Pruitt KD, Schneider VA, Murphy TD. 2023. Rapid and sensitive detection of genome contamination at scale with FCS-GX. bioRxiv 2023.06.02.543519; doi: -- KRONA, [License](https://github.com/marbl/Krona/blob/master/KronaTools/LICENSE.txt) +- Krona, [License](https://github.com/marbl/Krona/blob/master/KronaTools/LICENSE.txt) > Ondov BD, Bergman NH, Phillippy AM. 2011. Interactive metagenomic visualization in a Web browser. BMC Bioinformatics. 2011 Sep 30;12:385. doi: @@ -36,7 +36,7 @@ > > Forked from: -- GFASTATS, [MIT](https://github.com/vgl-hub/gfastats/blob/main/LICENSE) +- gfastats, [MIT](https://github.com/vgl-hub/gfastats/blob/main/LICENSE) > Giulio Formenti, Linelle Abueg, Angelo Brajuka, Nadolina Brajuka, Cristóbal Gallardo-Alba, Alice Giani, Olivier Fedrigo, Erich D Jarvis, Gfastats: conversion, evaluation and manipulation of genome sequences using assembly graphs, Bioinformatics, Volume 38, Issue 17, September 2022, Pages 4214–4216, @@ -44,15 +44,15 @@ > Manni M, Berkeley MR, Seppey M, Simão FA, Zdobnov EM. 2021. BUSCO Update: Novel and Streamlined Workflows along with Broader and Deeper Phylogenetic Coverage for Scoring of Eukaryotic, Prokaryotic, and Viral Genomes, Molecular Biology and Evolution, Volume 38, Issue 10, October 2021, Pages 4647–4654, -- GFFREAD, [MIT](https://github.com/gpertea/gffread/blob/master/LICENSE) +- GffRead, [MIT](https://github.com/gpertea/gffread/blob/master/LICENSE) > Pertea G, Pertea M. GFF Utilities: GffRead and GffCompare. F1000Res. 2020 Apr 28;9:ISCB Comm J-304. doi: . PMID: 32489650; PMCID: PMC7222033. 
-- TIDK, [MIT](https://github.com/tolkit/telomeric-identifier/blob/main/LICENSE) +- tidk, [MIT](https://github.com/tolkit/telomeric-identifier/blob/main/LICENSE) > -- SEQKIT, [MIT](https://github.com/shenwei356/seqkit/blob/master/LICENSE) +- SeqKit, [MIT](https://github.com/shenwei356/seqkit/blob/master/LICENSE) > Shen W, Le S, Li Y, Hu F. 2016. SeqKit: A Cross-Platform and Ultrafast Toolkit for FASTA/Q File Manipulation. PLoS ONE 11(10): e0163962. @@ -72,19 +72,19 @@ > Shujun O, Ning J 2018. LTR_retriever: A Highly Accurate and Sensitive Program for Identification of Long Terminal Repeat Retrotransposons, Plant Physiology, 176, 2 (2018). -- KRAKEN2, [MIT](https://github.com/DerrickWood/kraken2/blob/master/LICENSE) +- Kraken 2, [MIT](https://github.com/DerrickWood/kraken2/blob/master/LICENSE) > Wood DE, Salzberg SL, Wood DE, Lu J, Langmead B. 2019. Improved metagenomic analysis with Kraken 2. Genome Biol 20, 257 (2019). -- JUICEBOX.JS, [MIT](https://github.com/igvteam/juicebox.js/blob/master/LICENSE) +- juicebox.js, [MIT](https://github.com/igvteam/juicebox.js/blob/master/LICENSE) > Robinson JT, Turner D, Durand NC, Thorvaldsdóttir H, Mesirov JP, Aiden EL. 2018. Juicebox.js Provides a Cloud-Based Visualization System for Hi-C Data. Cell Syst. 2018 Feb 28;6(2):256-258.e1. doi: . Epub 2018 Feb 7. PMID: 29428417; PMCID: PMC6047755. -- FASTP, [MIT](https://github.com/OpenGene/fastp/blob/master/LICENSE) +- fastp, [MIT](https://github.com/OpenGene/fastp/blob/master/LICENSE) > Chen S, Zhou Y, Chen Y, Gu J. 2018. fastp: an ultra-fast all-in-one FASTQ preprocessor, Bioinformatics, Volume 34, Issue 17, 01 September 2018, Pages i884–i890, -- FASTQC, [GPL v3](https://github.com/s-andrews/FastQC/blob/master/LICENSE.txt) +- FastQC, [GPL v3](https://github.com/s-andrews/FastQC/blob/master/LICENSE.txt) > @@ -92,7 +92,7 @@ > Dudchenko O, Batra SS, Omer AD, Nyquist SK, Hoeger M, Durand NC, Shamim MS, Machol I, Lander, Aiden AP, Aiden EL 2017. 
De novo assembly of the Aedes aegypti genome using Hi-C yields chromosome-length scaffolds.Science356, 92-95(2017). doi: . Available at: -- HIC_QC, [AGPL v3](https://github.com/phasegenomics/hic_qc/blob/master/LICENSE) +- hic_qc, [AGPL v3](https://github.com/phasegenomics/hic_qc/blob/master/LICENSE) > @@ -100,42 +100,46 @@ > -- BWA, [GPL v3](https://github.com/lh3/bwa/blob/master/COPYING) +- bwa-mem, [GPL v3](https://github.com/lh3/bwa/blob/master/COPYING) > Li H. 2013. Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM. -- MATLOCK, [AGPL v3](https://github.com/phasegenomics/matlock/blob/master/LICENSE) +- Matlock, [AGPL v3](https://github.com/phasegenomics/matlock/blob/master/LICENSE) > ; -- SAMBLASTER, [MIT](https://github.com/GregoryFaust/samblaster/blob/master/LICENSE.txt) +- samblaster, [MIT](https://github.com/GregoryFaust/samblaster/blob/master/LICENSE.txt) > Faust GG, Hall IM. 2014. SAMBLASTER: fast duplicate marking and structural variant read extraction, Bioinformatics, Volume 30, Issue 17, September 2014, Pages 2503–2505, -- CIRCOS, [GPL v3](https://www.gnu.org/licenses/gpl-3.0.txt) +- Circos, [GPL v3](https://www.gnu.org/licenses/gpl-3.0.txt) > Krzywinski M, Schein J, Birol I, Connors J, Gascoyne R. Horsman D, ... Marra MA. 2009. Circos: an information aesthetic for comparative genomics. Genome research, 19(9), 1639-1645. -- MUMMER, [Artistic 2.0](https://github.com/mummer4/mummer/blob/master/LICENSE.md) +- MUMmer, [Artistic 2.0](https://github.com/mummer4/mummer/blob/master/LICENSE.md) > Marçais G, Delcher AL, Phillippy AM, Coston R, Salzberg SL, Zimin A. 2018. MUMmer4: A fast and versatile genome alignment system. PLoS Comput Biol. 2018 Jan 26;14(1):e1005944. doi: . PMID: 29373581; PMCID: PMC5802927. -- PLOTSR, [MIT](https://github.com/schneebergerlab/plotsr/blob/master/LICENSE) +- Plotsr, [MIT](https://github.com/schneebergerlab/plotsr/blob/master/LICENSE) > Goel M, Schneeberger K. 2022. 
plotsr: visualizing structural similarities and rearrangements between multiple genomes. Bioinformatics. 2022 May 13;38(10):2922-2926. doi: . PMID: 35561173; PMCID: PMC9113368. -- SYRI, [MIT](https://github.com/schneebergerlab/syri/blob/master/LICENSE) +- Syri, [MIT](https://github.com/schneebergerlab/syri/blob/master/LICENSE) > Goel M, Sun H, Jiao WB, Schneeberger K. 2019. SyRI: finding genomic rearrangements and local sequence differences from whole-genome assemblies. Genome Biol. 2019 Dec 16;20(1):277. doi: . PMID: 31842948; PMCID: PMC6913012. -- MINIMAP2, [MIT](https://github.com/lh3/minimap2/blob/master/LICENSE.txt) +- Minimap2, [MIT](https://github.com/lh3/minimap2/blob/master/LICENSE.txt) > Li H. 2021. New strategies to improve minimap2 alignment accuracy, Bioinformatics, Volume 37, Issue 23, December 2021, Pages 4572–4574, doi: -- MERQURY, [United States Government Work](https://github.com/marbl/merqury?tab=License-1-ov-file#readme) +- Merqury, [United States Government Work](https://github.com/marbl/merqury?tab=License-1-ov-file#readme) > Rhie, A., Walenz, B.P., Koren, S. et al. 2020. Merqury: reference-free quality, completeness, and phasing assessment for genome assemblies. Genome Biol 21, 245. doi: +- OrthoFinder, [GPL v3](https://github.com/davidemms/OrthoFinder/blob/master/License.md) + + > Emms, D.M., Kelly, S. OrthoFinder: phylogenetic orthology inference for comparative genomics. Genome Biol 20, 238 (2019). doi: 10.1186/s13059-019-1832-y + ## Software packaging/containerisation tools - [Anaconda](https://anaconda.com) diff --git a/README.md b/README.md index 8d9b478e..aa63b68d 100644 --- a/README.md +++ b/README.md @@ -12,79 +12,43 @@ ## Introduction -**plant-food-research-open/assemblyqc** is a [Nextflow](https://www.nextflow.io/docs/latest/index.html) pipeline which evaluates assembly quality with multiple QC tools and presents the results in a unified html report. 
The tools are shown in the [Pipeline Flowchart](#pipeline-flowchart) and their references are listed in [CITATIONS.md](./CITATIONS.md). +**plant-food-research-open/assemblyqc** is a [Nextflow](https://www.nextflow.io/docs/latest/index.html) pipeline which evaluates assembly quality with multiple QC tools and presents the results in a unified html report. The tools are shown in the [Pipeline Flowchart](#pipeline-flowchart) and their references are listed in [CITATIONS.md](./CITATIONS.md). The pipeline includes skip flags to disable execution of various tools. ## Pipeline Flowchart -```mermaid -%%{init: { - 'theme': 'base', - 'themeVariables': { - 'fontSize': '52px", - 'primaryColor': '#9A6421', - 'primaryTextColor': '#ffffff', - 'primaryBorderColor': '#9A6421', - 'lineColor': '#B180A8', - 'secondaryColor': '#455C58', - 'tertiaryColor': '#ffffff' - } -}}%% -flowchart LR - forEachTag(Assembly) ==> VALIDATE_FORMAT[VALIDATE FORMAT] - - VALIDATE_FORMAT ==> ncbiFCS[NCBI FCS ADAPTOR] - ncbiFCS ==> Check{Check} - - VALIDATE_FORMAT ==> ncbiGX[NCBI FCS GX] - ncbiGX ==> Check - Check ==> |Clean|Run(Run) - - Check ==> |Contamination|Skip(Skip All) - Skip ==> REPORT - - VALIDATE_FORMAT ==> GFF_STATS[GENOMETOOLS GT STAT] - - Run ==> ASS_STATS[STATS] - Run ==> BUSCO - Run ==> TIDK - Run ==> LAI - Run ==> KRAKEN2 - Run ==> HIC_CONTACT_MAP[HIC CONTACT MAP] - Run ==> MUMMER - Run ==> MINIMAP2 - Run ==> MERQURY - - MUMMER ==> CIRCOS - MUMMER ==> DOTPLOT - - MINIMAP2 ==> PLOTSR - - ASS_STATS ==> REPORT - GFF_STATS ==> REPORT - BUSCO ==> REPORT - TIDK ==> REPORT - LAI ==> REPORT - KRAKEN2 ==> REPORT - HIC_CONTACT_MAP ==> REPORT - CIRCOS ==> REPORT - DOTPLOT ==> REPORT - PLOTSR ==> REPORT - MERQURY ==> REPORT -``` - -- [FASTA VALIDATOR](https://github.com/linsalrob/fasta_validator) + [SEQKIT RMDUP](https://github.com/shenwei356/seqkit): FASTA validation -- [GENOMETOOLS GT GFF3VALIDATOR](https://genometools.org/tools/gt_gff3validator.html): GFF3 validation -- [ASSEMBLATHON 
STATS](https://github.com/PlantandFoodResearch/assemblathon2-analysis/blob/a93cba25d847434f7eadc04e63b58c567c46a56d/assemblathon_stats.pl), [GFASTATS](https://github.com/vgl-hub/gfastats): Assembly statistics -- [GENOMETOOLS GT STAT](https://genometools.org/tools/gt_stat.html): Annotation statistics -- [NCBI FCS ADAPTOR](https://github.com/ncbi/fcs): Adaptor contamination pass/fail -- [NCBI FCS GX](https://github.com/ncbi/fcs): Foreign organism contamination pass/fail -- [BUSCO](https://gitlab.com/ezlab/busco): Gene-space completeness estimation -- [TIDK](https://github.com/tolkit/telomeric-identifier): Telomere repeat identification -- [LAI](https://github.com/oushujun/LTR_retriever/blob/master/LAI): Continuity of repetitive sequences -- [KRAKEN2](https://github.com/DerrickWood/kraken2): Taxonomy classification -- [HIC CONTACT MAP](https://github.com/igvteam/juicebox.js): Alignment and visualisation of HiC data -- [MUMMER](https://github.com/mummer4/mummer) → [CIRCOS](http://circos.ca/documentation/) + [DOTPLOT](https://plotly.com) & [MINIMAP2](https://github.com/lh3/minimap2) → [PLOTSR](https://github.com/schneebergerlab/plotsr): Synteny analysis -- [MERQURY](https://github.com/marbl/merqury): K-mer completeness, consensus quality and phasing assessment +

+ +- `Assembly` + - [fasta_validator](https://github.com/linsalrob/fasta_validator) + [SeqKit rmdup](https://github.com/shenwei356/seqkit): FASTA validation + - [assemblathon_stats](https://github.com/PlantandFoodResearch/assemblathon2-analysis/blob/a93cba25d847434f7eadc04e63b58c567c46a56d/assemblathon_stats.pl), [gfastats](https://github.com/vgl-hub/gfastats): Assembly statistics + - [NCBI FCS-adaptor](https://github.com/ncbi/fcs): Adaptor contamination pass/fail + - [NCBI FCS-GX](https://github.com/ncbi/fcs): Foreign organism contamination pass/fail + - [tidk](https://github.com/tolkit/telomeric-identifier): Telomere repeat identification + - [BUSCO](https://gitlab.com/ezlab/busco): Gene-space completeness estimation + - [LAI](https://github.com/oushujun/LTR_retriever/blob/master/LAI): Continuity of repetitive sequences + - [Kraken 2](https://github.com/DerrickWood/kraken2), [Krona](https://github.com/marbl/Krona): Taxonomy classification + - `Alignment and visualisation of HiC data` + - [sra-tools](https://github.com/ncbi/sra-tools): HiC data download from SRA or use of local FASTQ files + - [fastp](https://github.com/OpenGene/fastp), [FastQC](https://github.com/s-andrews/FastQC): Read QC and trimming + - [SeqKit sort](https://github.com/shenwei356/seqkit): Alphanumeric sorting of FASTA by sequence ID + - [bwa-mem](https://github.com/lh3/bwa): HiC read alignment + - [samblaster](https://github.com/GregoryFaust/samblaster): Duplicate marking + - [hic_qc](https://github.com/phasegenomics/hic_qc): HiC read and alignment statistics + - [Matlock](https://github.com/phasegenomics/matlock): BAM to juicer conversion + - [3d-dna/visualize](https://github.com/aidenlab/3d-dna/tree/master/visualize): `.hic` file creation + - [juicebox.js](https://github.com/igvteam/juicebox.js): HiC contact map visualisation + - `K-mer completeness, consensus quality and phasing assessment` + - [sra-tools](https://github.com/ncbi/sra-tools): Assembly, maternal and paternal data download 
from SRA or use of local FASTQ files + - [Merqury hapmers](https://github.com/marbl/merqury/blob/master/trio/hapmers.sh): Hapmer generation if parental data is available + - [Merqury](https://github.com/marbl/merqury): Completeness, consensus quality and phasing assessment + - `Synteny analysis` + - [MUMmer](https://github.com/mummer4/mummer) → [Circos](http://circos.ca/documentation/) + [dotplot](https://plotly.com): One-to-all and all-to-all synteny analysis at the contig level + - [Minimap2](https://github.com/lh3/minimap2) → [Syri](https://github.com/schneebergerlab/syri)/[Plotsr](https://github.com/schneebergerlab/plotsr): One-to-one synteny analysis at the chromosome level +- `Annotation` + - [GenomeTools gt gff3validator](https://genometools.org/tools/gt_gff3validator.html) + [FASTA/GFF correspondence](subworkflows/gallvp/gff3_gt_gff3_gff3validator_stat/main.nf): GFF3 validation + - [GenomeTools gt stat](https://genometools.org/tools/gt_stat.html): Annotation statistics + - [GffRead](https://github.com/gpertea/gffread), [BUSCO](https://gitlab.com/ezlab/busco): Gene-space completeness estimation in annotation proteins + - [OrthoFinder](https://github.com/davidemms/OrthoFinder): Phylogenetic orthology inference for comparative genomics ## Usage @@ -103,9 +67,9 @@ Now, you can run the pipeline using: ```bash nextflow run plant-food-research-open/assemblyqc \ -revision \ - -profile \ - --input assemblysheet.csv \ - --outdir + -profile \ + --input assemblysheet.csv \ + --outdir ``` > [!WARNING] @@ -140,31 +104,32 @@ The pipeline uses nf-core modules contributed by following authors: - - + + + + - + + - - + + + - - - diff --git a/bin/assemblyqc.py b/bin/assemblyqc.py index c81db840..c808e4de 100755 --- a/bin/assemblyqc.py +++ b/bin/assemblyqc.py @@ -29,6 +29,7 @@ from report_modules.parsers.hic_parser import parse_hic_folder from report_modules.parsers.synteny_parser import parse_synteny_folder from report_modules.parsers.merqury_parser import parse_merqury_folder 
+from report_modules.parsers.orthofinder_parser import parse_orthofinder_folder if __name__ == "__main__": params_dict, params_table = parse_params_json("params_json.json") @@ -57,6 +58,7 @@ data_from_tools = {**data_from_tools, **parse_hic_folder()} data_from_tools = {**data_from_tools, **parse_synteny_folder()} data_from_tools = {**data_from_tools, **parse_merqury_folder()} + data_from_tools = {**data_from_tools, **parse_orthofinder_folder()} with open("software_versions.yml", "r") as f: versions_from_ch_versions = yaml.safe_load(f)
diff --git a/bin/report_modules/parsers/orthofinder_parser.py b/bin/report_modules/parsers/orthofinder_parser.py
new file mode 100644
index 00000000..d31fdfe8
--- /dev/null
+++ b/bin/report_modules/parsers/orthofinder_parser.py
@@ -0,0 +1,117 @@
+import pandas as pd
+import base64
+import os
+import re
+
+import matplotlib.pyplot as plt
+from tabulate import tabulate
+from pathlib import Path
+from io import StringIO
+from Bio import Phylo
+
+
+def parse_orthofinder_folder(folder_name="orthofinder_outputs/assemblyqc"):
+    """Collect OrthoFinder results for the report.
+
+    Reads `Species_Tree/SpeciesTree_rooted.txt` and
+    `Comparative_Genomics_Statistics/Statistics_Overall.tsv` from
+    `folder_name`, resolved against the current working directory.
+
+    Returns `{}` if the folder does not exist; otherwise a dict with a single
+    "ORTHOFINDER" key holding the species tree as a base64 PNG data URI and
+    three statistics tables, each as a list of records and as an HTML table.
+    """
+    cwd = os.getcwd()
+    results_root_path = Path(f"{cwd}/{folder_name}")
+
+    if not results_root_path.exists():
+        return {}
+
+    data = {"ORTHOFINDER": {}}
+
+    # Species tree
+    tree = Phylo.read(
+        f"{results_root_path}/Species_Tree/SpeciesTree_rooted.txt", "newick"
+    )
+
+    fig = plt.figure(figsize=(6, 6))
+    ax = fig.add_subplot(1, 1, 1)
+    Phylo.draw(tree, do_show=False, axes=ax)
+
+    ax.spines["top"].set_visible(False)
+    ax.spines["right"].set_visible(False)
+
+    fig.savefig("speciestree_rooted.png", format="png", dpi=300)
+    # Release the figure so it does not linger in pyplot's global registry
+    plt.close(fig)
+
+    with open("speciestree_rooted.png", "rb") as f:
+        binary_fc = f.read()
+
+    base64_utf8_str = base64.b64encode(binary_fc).decode("utf-8")
+    # The MIME type of a PNG is image/png; "image/png+xml" is not a valid type
+    data["ORTHOFINDER"]["speciestree_rooted"] = (
+        f"data:image/png;base64,{base64_utf8_str}"
+    )
+
+    # Overall statistics
+    overall_statistics = Path(
+        f"{results_root_path}/Comparative_Genomics_Statistics/Statistics_Overall.tsv"
+    ).read_text()
+
+    ## General stats
+    general_stats = re.findall(
+        r"(Number of species.*)Orthogroups file", overall_statistics, flags=re.DOTALL
+    )[0]
+    # This section has no header row; without header=None pandas would consume
+    # the "Number of species" row as column names and drop it from the table
+    general_stats_pd = pd.read_csv(
+        StringIO(general_stats), sep="\t", header=None, names=["Stat", "Value"]
+    )
+
+    data["ORTHOFINDER"]["general_stats"] = general_stats_pd.to_dict("records")
+    data["ORTHOFINDER"]["general_stats_html"] = tabulate(
+        general_stats_pd,
+        headers=["Stat", "Value"],
+        tablefmt="html",
+        numalign="left",
+        showindex=False,
+    )
+
+    ## Genes per-species
+    genes_per_species = re.findall(
+        r"(Average number of genes per-species in orthogroup.*)Number of species in orthogroup",
+        overall_statistics,
+        flags=re.DOTALL,
+    )[0]
+    genes_per_species_pd = pd.read_csv(StringIO(genes_per_species), sep="\t", header=0)
+    data["ORTHOFINDER"]["genes_per_species"] = genes_per_species_pd.to_dict("records")
+    data["ORTHOFINDER"]["genes_per_species_html"] = tabulate(
+        genes_per_species_pd,
+        headers=genes_per_species_pd.columns.to_list(),
+        tablefmt="html",
+        numalign="left",
+        showindex=False,
+    )
+
+    ## Number of species in orthogroup
+    num_species_orthogroup = re.findall(
+        r"(Number of species in orthogroup.*)",
+        overall_statistics,
+        flags=re.DOTALL,
+    )[0]
+    num_species_orthogroup_pd = pd.read_csv(
+        StringIO(num_species_orthogroup), sep="\t", header=0
+    )
+    data["ORTHOFINDER"]["num_species_orthogroup"] = num_species_orthogroup_pd.to_dict(
+        "records"
+    )
+    data["ORTHOFINDER"]["num_species_orthogroup_html"] = tabulate(
+        num_species_orthogroup_pd,
+        headers=num_species_orthogroup_pd.columns.to_list(),
+        tablefmt="html",
+        numalign="left",
+        showindex=False,
+    )
+
+    return data
diff --git a/bin/report_modules/templates/base.html b/bin/report_modules/templates/base.html index c7c94f04..4da53369 100644 --- a/bin/report_modules/templates/base.html +++ b/bin/report_modules/templates/base.html @@ -79,6 +79,11 @@ {% if 'MERQURY' in all_stats_dicts %} {% endif %} + + {% if 'ORTHOFINDER' in all_stats_dicts %} + + {% endif %} + {% include
'params/params.html' %} @@ -151,6 +156,11 @@ {% if 'MERQURY' in all_stats_dicts %} {% include 'merqury/merqury.html' %} {% endif %} + + {% if 'ORTHOFINDER' in all_stats_dicts %} + {% include 'orthofinder/orthofinder.html' %} + {% endif %} + {% include 'js.html' %} diff --git a/bin/report_modules/templates/orthofinder/orthofinder.html b/bin/report_modules/templates/orthofinder/orthofinder.html new file mode 100644 index 00000000..35928c4d --- /dev/null +++ b/bin/report_modules/templates/orthofinder/orthofinder.html @@ -0,0 +1,14 @@ + diff --git a/bin/report_modules/templates/orthofinder/report_contents.html b/bin/report_modules/templates/orthofinder/report_contents.html new file mode 100644 index 00000000..8afa502a --- /dev/null +++ b/bin/report_modules/templates/orthofinder/report_contents.html @@ -0,0 +1,33 @@ +
+
+ +
+

Species tree (rooted)

+
+
+ +
+ +
+

General statistics

+
+
+
{{ all_stats_dicts['ORTHOFINDER']['general_stats_html'] }}
+
+ +
+

Genes per-species in orthogroup

+
+
+
{{ all_stats_dicts['ORTHOFINDER']['genes_per_species_html'] }}
+
+ +
+

Number of species in orthogroup

+
+
+
{{ all_stats_dicts['ORTHOFINDER']['num_species_orthogroup_html'] }}
+
+ +
+
diff --git a/conf/modules.config b/conf/modules.config index 0f05443c..090ebd3a 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -388,6 +388,18 @@ process { ] } + withName: '.*:ASSEMBLYQC:GFFREAD' { + ext.args = '-y -S' + } + + withName: '.*:ASSEMBLYQC:ORTHOFINDER' { + publishDir = [ + path: { "${params.outdir}/orthofinder" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + ] + } + withName: '.*:ASSEMBLYQC:CREATEREPORT' { publishDir = [ [ diff --git a/docs/images/assemblyqc.png b/docs/images/assemblyqc.png new file mode 100644 index 00000000..3fd97dd1 Binary files /dev/null and b/docs/images/assemblyqc.png differ diff --git a/docs/images/orthofinder.png b/docs/images/orthofinder.png new file mode 100644 index 00000000..4a53e3c0 Binary files /dev/null and b/docs/images/orthofinder.png differ diff --git a/docs/output.md b/docs/output.md index dd5dd998..68adc287 100644 --- a/docs/output.md +++ b/docs/output.md @@ -8,26 +8,27 @@ The directories listed below will be created in the results directory after the ## Pipeline overview -The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: +The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data to produce following outputs: -- [FASTA and GFF3 validation](#fasta-and-gff3-validation) +- [Format validation](#format-validation) - [Assemblathon stats](#assemblathon-stats) -- [Gfastats](#gfastats) -- [GenomeTools gt stat](#genometools-gt-stat) -- [NCBI FCS adaptor](#ncbi-fcs-adaptor) -- [NCBI FCS GX](#ncbi-fcs-gx) +- [gfastats](#gfastats) +- [NCBI FCS-adaptor](#ncbi-fcs-adaptor) +- [NCBI FCS-GX](#ncbi-fcs-gx) +- [tidk](#tidk) - [BUSCO](#busco) -- [TIDK](#tidk) - [LAI](#lai) -- [Kraken2](#kraken2) +- [Kraken 2](#kraken-2) - [HiC contact map](#hic-contact-map) -- [Synteny](#synteny) - [Merqury](#merqury) +- [Synteny](#synteny) +- [GenomeTools gt stat](#genometools-gt-stat) 
+- [OrthoFinder](#orthofinder) - [Pipeline information](#pipeline-information) -### FASTA and GFF3 validation +### Format validation The pipeline prints a warning in the pipeline log if FASTA or GFF3 validation fails. The error log from the validator is reported in the `report.html`. The remaining QC tools are skipped for the assembly with invalid fasta file. @@ -46,7 +47,7 @@ The pipeline prints a warning in the pipeline log if FASTA or GFF3 validation fa > [!WARNING] > Contig-related stats are based on the assumption that `assemblathon_stats_n_limit` is specified correctly. If you are not certain of the value of `assemblathon_stats_n_limit`, please ignore the contig-related stats. -### Gfastats +### gfastats
Output files @@ -56,35 +57,21 @@ The pipeline prints a warning in the pipeline log if FASTA or GFF3 validation fa
-Gfastats is a fast and exhaustive tool for summary statistics. +gfastats is a fast and exhaustive tool for summary statistics. -### GenomeTools gt stat - -
-Output files - -- `genometools_gt_stat/` - - `*.gt.stat.yml`: Assembly annotation stats in yaml format. - -
- -GenomeTools `gt stat` tool calculates a basic set of statistics about features contained in GFF3 files. - -
AssemblyQC - GenomeTools gt stat gene length distribution
AssemblyQC - GenomeTools gt stat gene length distribution
- -### NCBI FCS adaptor +### NCBI FCS-adaptor
Output files - `ncbi_fcs_adaptor/` - - `*_fcs_adaptor_report.tsv`: NCBI FCS adaptor report in CSV format. + - `*_fcs_adaptor_report.tsv`: NCBI FCS-adaptor report in TSV format.
-[FCS-adaptor detects](https://github.com/ncbi/fcs/wiki/FCS-adaptor#rules-for-action-assignment) adaptor and vector contamination in genome sequences. +[FCS-adaptor](https://github.com/ncbi/fcs/wiki/FCS-adaptor#rules-for-action-assignment) detects adaptor and vector contamination in genome sequences. -### NCBI FCS GX +### NCBI FCS-GX
Output files @@ -98,42 +85,47 @@ GenomeTools `gt stat` tool calculates a basic set of statistics about features c
-[FCS-GX detects](https://github.com/ncbi/fcs/wiki/FCS-GX#outputs) contamination from foreign organisms in genome sequences. +[FCS-GX](https://github.com/ncbi/fcs/wiki/FCS-GX#outputs) detects contamination from foreign organisms in genome sequences. -### BUSCO +### tidk
Output files -- `busco/` - - `busco_figure.png`: Summary figure created from all the BUSCO summaries. - - `tag` - - `short_summary.specific.*_odb10.tag_*.txt`: BUSCO summary for the assembly represented by `tag`. +- `tidk/` + - `*.apriori.tsv`: Frequencies for successive windows in forward and reverse directions for the pre-specified telomere-repeat sequence. + - `*.apriori.svg`: Plot of `*.apriori.tsv` + - `*.tidk.explore.tsv`: List of the most frequent repeat sequences. + - `*.top.sequence.txt`: The top sequence from `*.tidk.explore.tsv`. + - `*.aposteriori.tsv`: Frequencies for successive windows in forward and reverse directions for the top sequence from `*.top.sequence.txt`. + - `*.aposteriori.svg`: Plot of `*.aposteriori.tsv`.
-[BUSCO estimates](https://busco.ezlab.org/busco_userguide.html) the completeness and redundancy of processed genomic data based on universal single-copy orthologs. +tidk toolkit is designed to [identify and visualize](https://github.com/tolkit/telomeric-identifier) telomeric repeats for the Darwin Tree of Life genomes. -
AssemblyQC - BUSCO summary plot
AssemblyQC - BUSCO summary plot
+
AssemblyQC - tidk plot
AssemblyQC - tidk plot
-### TIDK +### BUSCO
Output files -- `tidk/` - - `*.apriori.tsv`: Frequencies for successive windows in forward and reverse directions for the pre-specified telomere-repeat sequence. - - `*.apriori.svg`: Plot of `*.apriori.tsv` - - `*.tidk.explore.tsv`: List of the most frequent repeat sequences. - - `*.top.sequence.txt`: The top sequence from `*.tidk.explore.tsv`. - - `*.aposteriori.tsv`: Frequencies for successive windows in forward and reverse directions for the top sequence from `*.top.sequence.txt`. - - `*.aposteriori.svg`: Plot of `*.aposteriori.tsv`. +- `busco/` + - `fasta` + - `busco_figure.png`: Summary figure created from all the BUSCO summaries. + - `tag` + - `short_summary.specific.*_odb10.tag_*.txt`: BUSCO summary for the assembly represented by `tag`. + - `gff` + - `busco_figure.png`: Summary figure created from all the BUSCO summaries. + - `tag` + - `short_summary.specific.*_odb10.tag_*.txt`: BUSCO summary for the annotation of the assembly represented by `tag`.
-TIDK toolkit is designed to [identify and visualize](https://github.com/tolkit/telomeric-identifier) telomeric repeats for the Darwin Tree of Life genomes. +[BUSCO](https://busco.ezlab.org/busco_userguide.html) estimates the completeness and redundancy of processed genomic data based on universal single-copy orthologs. -
AssemblyQC - TIDK plot
AssemblyQC - TIDK plot
+
AssemblyQC - BUSCO summary plot
AssemblyQC - BUSCO summary plot
### LAI @@ -154,22 +146,22 @@ LTR Assembly Index (LAI) is a reference-free genome metric that [evaluates assem > [!WARNING] > Soft masked regions are unmasked when calculating LAI. However, hard masked regions are left as is. The pipeline will fail to calculate LAI if all the LTRs are already hard masked. -### Kraken2 +### Kraken 2
Output files - `kraken2/` - - `*.kraken2.report`: [Kraken2 report](https://github.com/DerrickWood/kraken2/wiki/Manual#output-formats). - - `*.kraken2.cut`: [Kraken2 output](https://github.com/DerrickWood/kraken2/wiki/Manual#output-formats). + - `*.kraken2.report`: [Kraken 2 report](https://github.com/DerrickWood/kraken2/wiki/Manual#output-formats). + - `*.kraken2.cut`: [Kraken 2 output](https://github.com/DerrickWood/kraken2/wiki/Manual#output-formats). - `*.kraken2.krona.cut`: [Select columns](../modules/local/kraken2_krona_plot.nf) from `*.kraken2.cut` used for generation of a Krona taxonomy plot. - `*.kraken2.krona.html`: Interactive Krona taxonomy plot.
-Kraken2 [assigns taxonomic labels](https://ccb.jhu.edu/software/kraken2/) to sequencing reads for metagenomics projects. Further reading regarding performance of Kraken2: +Kraken 2 [assigns taxonomic labels](https://ccb.jhu.edu/software/kraken2/) to sequencing reads for metagenomics projects. Further reading regarding performance of Kraken 2: -
AssemblyQC - Interactive Krona plot from Kraken2 taxonomy
AssemblyQC - Interactive Krona plot from Kraken2 taxonomy
+
AssemblyQC - Interactive Krona plot from Kraken 2 taxonomy
AssemblyQC - Interactive Krona plot from Kraken 2 taxonomy
### HiC contact map @@ -178,17 +170,17 @@ Kraken2 [assigns taxonomic labels](https://ccb.jhu.edu/software/kraken2/) to seq - `hic/` - `fastqc_raw/` - - `*_1_fastqc.html/*_2_fastqc.html`: FASTQC html report for the raw reads - - `*_1_fastqc.zip/*_2_fastqc.zip`: FASTQC stats for the raw reads + - `*_1_fastqc.html/*_2_fastqc.html`: FastQC html report for the raw reads + - `*_1_fastqc.zip/*_2_fastqc.zip`: FastQC stats for the raw reads - `fastp/` - - `*.fastp.html`: FASTP HTML report - - `*.fastp.json`: FASTP statistics in JSON format - - `*.fastp.log`: FASTP log - - `*_1.fastp.fastq.gz/*_2.fastp.fastq.gz`: Reads passed by FASTP - - `*_1.fail.fastq.gz/*_2.fail.fastq.gz`: Reads failed by FASTP + - `*.fastp.html`: fastp HTML report + - `*.fastp.json`: fastp statistics in JSON format + - `*.fastp.log`: fastp log + - `*_1.fastp.fastq.gz/*_2.fastp.fastq.gz`: Reads passed by fastp + - `*_1.fail.fastq.gz/*_2.fail.fastq.gz`: Reads failed by fastp - `fastqc_trim/` - - `*_1_fastqc.html/*_2_fastqc.html`: FASTQC html report for the reads passed by FASTP. - - `*_1_fastqc.zip/*_2_fastqc.zip`: FASTQC stats for the reads passed by FASTP. + - `*_1_fastqc.html/*_2_fastqc.html`: FastQC html report for the reads passed by fastp. + - `*_1_fastqc.zip/*_2_fastqc.zip`: FastQC stats for the reads passed by fastp. - `hicqc` - `*.on.*_qc_report.pdf`: HiC QC report for reads mapped to an assembly. - `assembly/` @@ -206,6 +198,28 @@ Hi-C contact mapping experiments measure the frequency of physical contact betwe AssemblyQC - HiC results +### Merqury + +
+Output files + +- `merqury/` + - `tag1-and-tag2`: Results folder for haplotype `tag1` and `tag2`. + - `*.completeness.stats`: Assembly completeness statistics + - `*.qv`: Assembly consensus quality QV statistics + - `*.fl.png`: Spectra plots + - `*.hapmers.blob.png`: Hap-mer blob plot +
+ +[Merqury](https://github.com/marbl/merqury) is used for the k-mer analysis. + +
+AssemblyQC - Spectra-cn plot +AssemblyQC - Hap-mer blob plot +
+AssemblyQC - Merqury plots +
+ ### Synteny
@@ -225,37 +239,42 @@ Hi-C contact mapping experiments measure the frequency of physical contact betwe - `plotsr.png`: Plotsr synteny plot
-[Circos](https://circos.ca) and linear synteny plots are created from genome-wide alignments performed with [MUMMER](https://github.com/mummer4/mummer?tab=readme-ov-file) and [`dnadiff.pl`](https://github.com/mummer4/mummer/blob/master/scripts/dnadiff.pl). +[Circos](https://circos.ca) and dotplots are created from genome-wide alignments performed with [MUMmer](https://github.com/mummer4/mummer?tab=readme-ov-file). [Plotsr](https://github.com/schneebergerlab/plotsr) plots, in contrast, are created from genome-wide alignments performed with [Minimap2](https://github.com/lh3/minimap2).
AssemblyQC - Circos synteny plot AssemblyQC - Plotsr synteny plot -AssemblyQC - Dotplot synteny plot +AssemblyQC - dotplot synteny plot
AssemblyQC - Synteny plots
-### Merqury +### GenomeTools gt stat
Output files -- `merqury/` - - `tag1-and-tag2`: Results folder for haplotype `tag1` and `tag2`. - - `*.completeness.stats`: Assembly completeness statistics - - `*.qv`: Assembly consensus quality QV statistics - - `*.fl.png`: Spectra plots - - `*.hapmers.blob.png`: Hap-mer blob plot -
+- `genometools_gt_stat/` + - `*.gt.stat.yml`: Assembly annotation stats in yaml format. -[MERQURY](https://github.com/marbl/merqury) is used for the k-mer analysis. + -
-AssemblyQC - Spectra-cn plot -AssemblyQC - Plotsr synteny plot -
-AssemblyQC - Merqury plots -
+The GenomeTools `gt stat` tool calculates a basic set of statistics about features contained in GFF3 files. + +
AssemblyQC - GenomeTools gt stat gene length distribution
AssemblyQC - GenomeTools gt stat gene length distribution
+ +### OrthoFinder + +
+Output files + +- `orthofinder/assemblyqc`: OrthoFinder output folder. + +
+ +If more than one assembly is included along with their annotations, OrthoFinder is executed on the annotation proteins to perform a phylogenetic orthology inference for comparative genomics. + +
AssemblyQC - OrthoFinder species tree
AssemblyQC - OrthoFinder species tree
### Pipeline information diff --git a/docs/parameters.md b/docs/parameters.md index 90a39759..1c8fd3f6 100644 --- a/docs/parameters.md +++ b/docs/parameters.md @@ -34,6 +34,15 @@ A Nextflow pipeline which evaluates assembly quality with multiple QC tools and | `ncbi_fcs_gx_db_path` | Path to NCBI FCS GX database. See: https://github.com/ncbi/fcs/wiki/FCS-GX | `string` | | | | | `contamination_stops_pipeline` | Skip remaining QC steps for an assembly which has adaptor or GX contamination | `boolean` | True | | | +## tidk options + +| Parameter | Description | Type | Default | Required | Hidden | +| --------------------- | ---------------------------------------------------------------------------------------------------------- | --------- | ------- | -------- | ------ | +| `tidk_skip` | Skip telomere identification | `boolean` | True | | | +| `tidk_repeat_seq` | Telomere repeat sequence. Typical values for plant: TTTAGGG, fungus, vertebrates: TTAGGG and Insect: TTAGG | `string` | | | | +| `tidk_filter_by_size` | Filter assembly sequences smaller than the specified length | `boolean` | | | | +| `tidk_filter_size_bp` | Filter size in base-pairs | `integer` | 1000000 | | | + ## BUSCO options | Parameter | Description | Type | Default | Required | Hidden | @@ -43,22 +52,13 @@ A Nextflow pipeline which evaluates assembly quality with multiple QC tools and | `busco_lineage_datasets` | BUSCO lineages. It should be provided as a space-separated list of lineages: 'fungi_odb10 microsporidia_odb10' | `string` | | | | | `busco_download_path` | Download path for BUSCO | `string` | | | | -## TIDK options - -| Parameter | Description | Type | Default | Required | Hidden | -| --------------------- | ---------------------------------------------------------------------------------------------------------- | --------- | ------- | -------- | ------ | -| `tidk_skip` | Skip telomere identification | `boolean` | True | | | -| `tidk_repeat_seq` | Telomere repeat sequence. 
Typical values for plant: TTTAGGG, fungus, vertebrates: TTAGGG and Insect: TTAGG | `string` | | | | -| `tidk_filter_by_size` | Filter assembly sequences smaller than the specified length | `boolean` | | | | -| `tidk_filter_size_bp` | Filter size in base-pairs | `integer` | 1000000 | | | - ## LAI options | Parameter | Description | Type | Default | Required | Hidden | | ---------- | ------------------- | --------- | ------- | -------- | ------ | | `lai_skip` | Skip LAI estimation | `boolean` | True | | | -## Kraken2 options +## Kraken 2 options | Parameter | Description | Type | Default | Required | Hidden | | ----------------- | --------------------- | --------- | ------- | -------- | ------ | @@ -75,6 +75,13 @@ A Nextflow pipeline which evaluates assembly quality with multiple QC tools and | `hic_fastp_ext_args` | Additional parameters for fastp trimming | `string` | --qualified_quality_phred 20 --length_required 50 | | | | `hic_samtools_ext_args` | Additional parameters for samtools view command run after samblaster | `string` | -F 3852 | | | +## Merqury options + +| Parameter | Description | Type | Default | Required | Hidden | +| --------------------- | -------------------------------- | --------- | ------- | -------- | ------ | +| `merqury_skip` | Skip merqury analysis | `boolean` | True | | | +| `merqury_kmer_length` | kmer length for merqury analysis | `integer` | 21 | | | + ## Synteny options | Parameter | Description | Type | Default | Required | Hidden | @@ -93,12 +100,11 @@ A Nextflow pipeline which evaluates assembly quality with multiple QC tools and | `synteny_plotsr_seq_label` | Sequence label prefix for plotsr synteny | `string` | Chr | | | | `synteny_plotsr_assembly_order` | The order in which the assemblies should be compared, provided as space separated string of assembly tags. If absent, assemblies are ordered by their tags alphabetically. 
| `string` | | | | -## Merqury options +## OrthoFinder options -| Parameter | Description | Type | Default | Required | Hidden | -| --------------------- | -------------------------------- | --------- | ------- | -------- | ------ | -| `merqury_skip` | Skip merqury analysis | `boolean` | True | | | -| `merqury_kmer_length` | kmer length for merqury analysis | `integer` | 21 | | | +| Parameter | Description | Type | Default | Required | Hidden | +| ------------------ | ---------------- | --------- | ------- | -------- | ------ | +| `orthofinder_skip` | Skip orthofinder | `boolean` | True | | | ## Institutional config options diff --git a/docs/usage.md b/docs/usage.md index 3e8b314b..02b198b3 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -2,17 +2,17 @@ - [Assemblysheet input](#assemblysheet-input) - [External databases](#external-databases) - - [NCBI FCS GX database](#ncbi-fcs-gx-database) - - [Kraken2](#kraken2) + - [NCBI FCS-GX database](#ncbi-fcs-gx-database) - [BUSCO](#busco) + - [Kraken 2](#kraken-2) - [Other parameters](#other-parameters) - [Assemblathon stats](#assemblathon-stats) - - [NCBI FCS GX](#ncbi-fcs-gx) + - [NCBI FCS-GX](#ncbi-fcs-gx) + - [tidk](#tidk) - [BUSCO](#busco-1) - - [TIDK](#tidk) - [HiC](#hic) - - [Synteny analysis](#synteny-analysis) - [Merqury K-mer analysis](#merqury-k-mer-analysis) + - [Synteny analysis](#synteny-analysis) - [Minimum System Requirements](#minimum-system-requirements) - [Running the pipeline](#running-the-pipeline) - [Updating the pipeline](#updating-the-pipeline) @@ -43,9 +43,9 @@ See the [Merqury](#merqury-k-mer-analysis) section For description of assemblysh ## External databases -### NCBI FCS GX database +### NCBI FCS-GX database -If NCBI FCS GX foreign organism contamination check is executed by setting `ncbi_fcs_gx_skip` to `false`, the path to the GX database must be provided with option `ncbi_fcs_gx_db_path`. 
The user must ensure that the database is correctly downloaded and placed in a location accessible to the pipeline. Setup instructions are available at . The database path must contain following files: +If NCBI FCS-GX foreign organism contamination check is executed by setting `ncbi_fcs_gx_skip` to `false`, the path to the GX database must be provided with option `ncbi_fcs_gx_db_path`. The user must ensure that the database is correctly downloaded and placed in a location accessible to the pipeline. Setup instructions are available at . The database path must contain following files: ```bash all.assemblies.tsv @@ -59,14 +59,14 @@ all.seq_info.tsv.gz all.taxa.tsv ``` -### Kraken2 - -Path to Kraken2 database is provided by the `kraken2_db_path` parameter. This can be a URL to a public `.tar.gz` file such as `https://genome-idx.s3.amazonaws.com/kraken/k2_pluspfp_20240112.tar.gz`. The pipeline can download and extract the database. This is not the recommended practice owing to the size of the database. Rather, the database should be downloaded, extracted and stored in a read-only location. The path to that location can be provided by the `kraken2_db_path` parameter such as `/workspace/ComparativeDataSources/kraken2db/k2_pluspfp_20230314`. - ### BUSCO BUSCO lineage databases are downloaded and updated by the BUSCO tool itself. A persistent location for the database can be provided by specifying `busco_download_path` parameter. +### Kraken 2 + +Path to Kraken 2 database is provided by the `kraken2_db_path` parameter. This can be a URL to a public `.tar.gz` file such as `https://genome-idx.s3.amazonaws.com/kraken/k2_pluspfp_20240112.tar.gz`. The pipeline can download and extract the database. This is not the recommended practice owing to the size of the database. Rather, the database should be downloaded, extracted and stored in a read-only location. 
The path to that location can be provided by the `kraken2_db_path` parameter such as `/workspace/ComparativeDataSources/kraken2db/k2_pluspfp_20230314`. + ## Other parameters This section provides additional information for parameters. It does not list all the pipeline parameters. For an exhaustive list, see [parameters.md](./parameters.md). @@ -75,35 +75,22 @@ This section provides additional information for parameters. It does not list al `assemblathon_stats_n_limit` is the number of 'N's for the unknown gap size. This number is used to split the scaffolds into contigs to compute contig-related stats. NCBI's recommendation for unknown gap size is 100 . -### NCBI FCS GX +### NCBI FCS-GX - `ncbi_fcs_gx_tax_id` is the taxonomy ID for all the assemblies listed in the assemblysheet. A taxonomy ID can be obtained by searching a _Genus species_ at . -### BUSCO +### tidk -- `busco_lineage_datasets`: A space-separated list of BUSCO lineages. Any number of lineages can be specified such as "fungi_odb10 hypocreales_odb10". Each assembly is assessed against each of the listed lineage. To select a lineage, refer to . +- `tidk_repeat_seq`: The telomere search sequence. To select an appropriate sequence, see . Commonly used sequences are TTTAGGG (Plant), TTAGGG (Fungus, Vertebrates) and TTAGG (Insect). Further reading: -### TIDK +### BUSCO -- `tidk_repeat_seq`: The telomere search sequence. To select an appropriate sequence, see . Commonly used sequences are TTTAGGG (Plant), TTAGGG (Fungus, Vertebrates) and TTAGG (Insect). Further reading: +- `busco_lineage_datasets`: A space-separated list of BUSCO lineages. Any number of lineages can be specified such as "fungi_odb10 hypocreales_odb10". Each assembly is assessed against each of the listed lineage. To select a lineage, refer to . ### HiC - `hic`: Path to reads provided as a SRA ID or as a path to paired reads such as 'hic_reads{1,2}.fastq.gz'. These reads are applied to each assembly listed by `input`. 
-### Synteny analysis - -- `synteny_xref_assemblies`: Similar to `--input`, this parameter also provides a CSV sheet listing external reference assemblies which are included in the synteny analysis but are not analysed by other QC tools. See the [example xrefsheet](../assets/xrefsheet.csv) included with the pipeline. Its fields are: - - - `tag:` A unique tag which represents the reference assembly in the final report - - `fasta:` FASTA file - - `synteny_labels:` A two column tsv file listing fasta sequence ids (first column) and their labels for the synteny plots (second column) - -- `synteny_plotsr_assembly_order`: The order in which Minimap2 alignments are performed and, then, plotted by Plotsr. For assembly A, B and C; if the order is specified as 'B C A', then, two alignments are performed. First, C is aligned against B as reference. Second, A is aligned against C as reference. The order of these assemblies on the Plotsr figure is also 'B C A' so that B appears on top, C in the middle and A at the bottom. If this parameter is `null`, the assemblies are ordered alphabetically. All assemblies from `input` and `synteny_xref_assemblies` are included by default. If an assembly is missing from this list, that assembly is excluded from the analysis. - -> [!WARNING] -> PLOTSR performs a sequence-wise (preferably chromosome-wise) synteny analysis. The order of the sequences for each assembly is inferred from its `synteny_labels` file and the order of sequences in the FASTA file is ignored. As all the assemblies are included in a single plot and the number of sequences from each assembly should be same, sequences after the common minimum number are excluded. Afterwards, the sequences are marked sequentially as `Chr1`, `Chr2`, `Chr3`,... If a label other than `Chr` is desirable, it can be configured with the `synteny_plotsr_seq_label` parameter. 
- ### Merqury K-mer analysis Additional assemblysheet columns: @@ -124,6 +111,19 @@ See following assemblysheet examples for MERQURY analysis. The data for these examples comes from: [umd.edu](https://obj.umiacs.umd.edu/marbl_publications/triobinning/index.html) +### Synteny analysis + +- `synteny_xref_assemblies`: Similar to `--input`, this parameter also provides a CSV sheet listing external reference assemblies which are included in the synteny analysis but are not analysed by other QC tools. See the [example xrefsheet](../assets/xrefsheet.csv) included with the pipeline. Its fields are: + + - `tag:` A unique tag which represents the reference assembly in the final report + - `fasta:` FASTA file + - `synteny_labels:` A two column tsv file listing fasta sequence ids (first column) and their labels for the synteny plots (second column) + +- `synteny_plotsr_assembly_order`: The order in which Minimap2 alignments are performed and, then, plotted by Plotsr. For assembly A, B and C; if the order is specified as 'B C A', then, two alignments are performed. First, C is aligned against B as reference. Second, A is aligned against C as reference. The order of these assemblies on the Plotsr figure is also 'B C A' so that B appears on top, C in the middle and A at the bottom. If this parameter is `null`, the assemblies are ordered alphabetically. All assemblies from `input` and `synteny_xref_assemblies` are included by default. If an assembly is missing from this list, that assembly is excluded from the analysis. + +> [!WARNING] +> PLOTSR performs a sequence-wise (preferably chromosome-wise) synteny analysis. The order of the sequences for each assembly is inferred from its `synteny_labels` file and the order of sequences in the FASTA file is ignored. As all the assemblies are included in a single plot and the number of sequences from each assembly should be same, sequences after the common minimum number are excluded. 
Afterwards, the sequences are marked sequentially as `Chr1`, `Chr2`, `Chr3`,... If a label other than `Chr` is desirable, it can be configured with the `synteny_plotsr_seq_label` parameter. + ## Minimum System Requirements All the modules have been tested to work on a single machine with 10 CPUs + 32 GBs of memory, except NCBI FCS GX and Kraken2. Their minimum requirements are: diff --git a/modules.json b/modules.json index a1a83606..e15a72cd 100644 --- a/modules.json +++ b/modules.json @@ -180,6 +180,11 @@ "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", "installed_by": ["modules"] }, + "gffread": { + "branch": "master", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", + "installed_by": ["modules"] + }, "gunzip": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", @@ -210,6 +215,11 @@ "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", "installed_by": ["modules"] }, + "orthofinder": { + "branch": "master", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", + "installed_by": ["modules"] + }, "seqkit/rmdup": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", diff --git a/modules/local/createreport.nf b/modules/local/createreport.nf index a7c05566..319d2b05 100644 --- a/modules/local/createreport.nf +++ b/modules/local/createreport.nf @@ -20,6 +20,7 @@ process CREATEREPORT { path hic_outputs , stageAs: 'hic_outputs/*' path synteny_outputs , stageAs: 'synteny_outputs/*' path merqury_outputs , stageAs: 'merqury_outputs/*' + path orthofinder_outputs , stageAs: 'orthofinder_outputs/*' path versions val params_json val params_summary_json diff --git a/modules/nf-core/gffread/environment.yml b/modules/nf-core/gffread/environment.yml new file mode 100644 index 00000000..ee239841 --- /dev/null +++ b/modules/nf-core/gffread/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::gffread=0.12.7 diff --git a/modules/nf-core/gffread/main.nf 
b/modules/nf-core/gffread/main.nf new file mode 100644 index 00000000..da55cbab --- /dev/null +++ b/modules/nf-core/gffread/main.nf @@ -0,0 +1,60 @@ +process GFFREAD { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gffread:0.12.7--hdcf5f25_4' : + 'biocontainers/gffread:0.12.7--hdcf5f25_4' }" + + input: + tuple val(meta), path(gff) + path fasta + + output: + tuple val(meta), path("*.gtf") , emit: gtf , optional: true + tuple val(meta), path("*.gff3") , emit: gffread_gff , optional: true + tuple val(meta), path("*.fasta"), emit: gffread_fasta , optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def extension = args.contains("-T") ? 'gtf' : ( ( ['-w', '-x', '-y' ].any { args.contains(it) } ) ? 'fasta' : 'gff3' ) + def fasta_arg = fasta ? "-g $fasta" : '' + def output_name = "${prefix}.${extension}" + def output = extension == "fasta" ? "$output_name" : "-o $output_name" + def args_sorted = args.replaceAll(/(.*)(-[wxy])(.*)/) { all, pre, param, post -> "$pre $post $param" }.trim() + // args_sorted = Move '-w', '-x', and '-y' to the end of the args string as gffread expects the file name after these parameters + if ( "$output_name" in [ "$gff", "$fasta" ] ) error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + gffread \\ + $gff \\ + $fasta_arg \\ + $args_sorted \\ + $output + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gffread: \$(gffread --version 2>&1) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def extension = args.contains("-T") ? 'gtf' : ( ( ['-w', '-x', '-y' ].any { args.contains(it) } ) ? 
'fasta' : 'gff3' ) + def output_name = "${prefix}.${extension}" + if ( "$output_name" in [ "$gff", "$fasta" ] ) error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + touch $output_name + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gffread: \$(gffread --version 2>&1) + END_VERSIONS + """ +} diff --git a/modules/nf-core/gffread/meta.yml b/modules/nf-core/gffread/meta.yml new file mode 100644 index 00000000..bebe7f57 --- /dev/null +++ b/modules/nf-core/gffread/meta.yml @@ -0,0 +1,75 @@ +name: gffread +description: Validate, filter, convert and perform various other operations on GFF + files +keywords: + - gff + - conversion + - validation +tools: + - gffread: + description: GFF/GTF utility providing format conversions, region filtering, FASTA + sequence extraction and more. + homepage: http://ccb.jhu.edu/software/stringtie/gff.shtml#gffread + documentation: http://ccb.jhu.edu/software/stringtie/gff.shtml#gffread + tool_dev_url: https://github.com/gpertea/gffread + doi: 10.12688/f1000research.23297.1 + licence: ["MIT"] + identifier: biotools:gffread +input: + - - meta: + type: map + description: | + Groovy Map containing meta data + e.g. [ id:'test' ] + - gff: + type: file + description: A reference file in either the GFF3, GFF2 or GTF format. + pattern: "*.{gff, gtf}" + - - fasta: + type: file + description: A multi-fasta file with the genomic sequences + pattern: "*.{fasta,fa,faa,fas,fsa}" +output: + - gtf: + - meta: + type: map + description: | + Groovy Map containing meta data + e.g. [ id:'test' ] + - "*.gtf": + type: file + description: GTF file resulting from the conversion of the GFF input file if + '-T' argument is present + pattern: "*.{gtf}" + - gffread_gff: + - meta: + type: map + description: | + Groovy Map containing meta data + e.g. 
[ id:'test' ] + - "*.gff3": + type: file + description: GFF3 file resulting from the conversion of the GFF input file if + '-T' argument is absent + pattern: "*.gff3" + - gffread_fasta: + - meta: + type: map + description: | + Groovy Map containing meta data + e.g. [ id:'test' ] + - "*.fasta": + type: file + description: Fasta file produced when either of '-w', '-x', '-y' parameters + is present + pattern: "*.fasta" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@edmundmiller" +maintainers: + - "@edmundmiller" + - "@gallvp" diff --git a/modules/nf-core/gffread/tests/main.nf.test b/modules/nf-core/gffread/tests/main.nf.test new file mode 100644 index 00000000..4cd13dcd --- /dev/null +++ b/modules/nf-core/gffread/tests/main.nf.test @@ -0,0 +1,223 @@ +nextflow_process { + + name "Test Process GFFREAD" + script "../main.nf" + process "GFFREAD" + + tag "gffread" + tag "modules_nfcore" + tag "modules" + + test("sarscov2-gff3-gtf") { + + config "./nextflow.config" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [id: 'test'], + file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gff3", checkIfExists: true) + ] + input[1] = [] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert process.out.gffread_gff == [] }, + { assert process.out.gffread_fasta == [] } + ) + } + + } + + test("sarscov2-gff3-gtf-stub") { + + options '-stub' + config "./nextflow.config" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [id: 'test'], + file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gff3", checkIfExists: true) + ] + input[1] = [] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert process.out.gffread_gff == [] }, + { assert 
process.out.gffread_fasta == [] } + ) + } + + } + + test("sarscov2-gff3-gff3") { + + config "./nextflow-gff3.config" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [id: 'test'], + file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gff3", checkIfExists: true) + ] + input[1] = [] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert process.out.gtf == [] }, + { assert process.out.gffread_fasta == [] } + ) + } + + } + + test("sarscov2-gff3-gff3-stub") { + + options '-stub' + config "./nextflow-gff3.config" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [id: 'test'], + file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gff3", checkIfExists: true) + ] + input[1] = [] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert process.out.gtf == [] }, + { assert process.out.gffread_fasta == [] } + ) + } + + } + + test("sarscov2-gff3-fasta") { + + config "./nextflow-fasta.config" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [id: 'test'], + file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gff3", checkIfExists: true) + ] + input[1] = file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.fasta", checkIfExists: true) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert process.out.gtf == [] }, + { assert process.out.gffread_gff == [] } + ) + } + + } + + test("sarscov2-gff3-fasta-stub") { + + options '-stub' + config "./nextflow-fasta.config" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [id: 'test'], + file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gff3", checkIfExists: true) + ] + input[1] = 
file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.fasta", checkIfExists: true) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert process.out.gtf == [] }, + { assert process.out.gffread_gff == [] } + ) + } + + } + + test("sarscov2-gff3-fasta-fail-catch") { + + options '-stub' + config "./nextflow-fasta.config" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [id: 'genome'], + file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gff3", checkIfExists: true) + ] + input[1] = file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.fasta", checkIfExists: true) + """ + } + } + + then { + assertAll ( + { assert ! process.success }, + { assert process.stdout.toString().contains("Input and output names are the same") } + ) + } + + } + +} diff --git a/modules/nf-core/gffread/tests/main.nf.test.snap b/modules/nf-core/gffread/tests/main.nf.test.snap new file mode 100644 index 00000000..15262320 --- /dev/null +++ b/modules/nf-core/gffread/tests/main.nf.test.snap @@ -0,0 +1,272 @@ +{ + "sarscov2-gff3-gtf": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.gtf:md5,1ea0ae98d3388e0576407dc4a24ef428" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ], + "gffread_fasta": [ + + ], + "gffread_gff": [ + + ], + "gtf": [ + [ + { + "id": "test" + }, + "test.gtf:md5,1ea0ae98d3388e0576407dc4a24ef428" + ] + ], + "versions": [ + "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-09T10:48:56.496187" + }, + "sarscov2-gff3-gff3": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + { + "id": "test" + }, + "test.gff3:md5,c4e5da6267c6bee5899a2c204ae1ad91" + ] + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ], + 
"gffread_fasta": [ + + ], + "gffread_gff": [ + [ + { + "id": "test" + }, + "test.gff3:md5,c4e5da6267c6bee5899a2c204ae1ad91" + ] + ], + "gtf": [ + + ], + "versions": [ + "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-09T10:49:00.892782" + }, + "sarscov2-gff3-gtf-stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.gtf:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ], + "gffread_fasta": [ + + ], + "gffread_gff": [ + + ], + "gtf": [ + [ + { + "id": "test" + }, + "test.gtf:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-09T11:11:26.975666" + }, + "sarscov2-gff3-fasta-stub": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + [ + { + "id": "test" + }, + "test.fasta:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ], + "gffread_fasta": [ + [ + { + "id": "test" + }, + "test.fasta:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "gffread_gff": [ + + ], + "gtf": [ + + ], + "versions": [ + "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-09T11:11:44.34792" + }, + "sarscov2-gff3-gff3-stub": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + { + "id": "test" + }, + "test.gff3:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ], + "gffread_fasta": [ + + ], + "gffread_gff": [ + [ + { + "id": "test" + }, + "test.gff3:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "gtf": [ + + ], + "versions": [ + 
"versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-09T11:11:35.221671" + }, + "sarscov2-gff3-fasta": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + [ + { + "id": "test" + }, + "test.fasta:md5,5f8108fb51739a0588ccf0a251de919a" + ] + ], + "3": [ + "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ], + "gffread_fasta": [ + [ + { + "id": "test" + }, + "test.fasta:md5,5f8108fb51739a0588ccf0a251de919a" + ] + ], + "gffread_gff": [ + + ], + "gtf": [ + + ], + "versions": [ + "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-09T10:54:02.88143" + } +} \ No newline at end of file diff --git a/modules/nf-core/gffread/tests/nextflow-fasta.config b/modules/nf-core/gffread/tests/nextflow-fasta.config new file mode 100644 index 00000000..ac6cb148 --- /dev/null +++ b/modules/nf-core/gffread/tests/nextflow-fasta.config @@ -0,0 +1,5 @@ +process { + withName: GFFREAD { + ext.args = '-w -S' + } +} diff --git a/modules/nf-core/gffread/tests/nextflow-gff3.config b/modules/nf-core/gffread/tests/nextflow-gff3.config new file mode 100644 index 00000000..afe0830e --- /dev/null +++ b/modules/nf-core/gffread/tests/nextflow-gff3.config @@ -0,0 +1,5 @@ +process { + withName: GFFREAD { + ext.args = '' + } +} diff --git a/modules/nf-core/gffread/tests/nextflow.config b/modules/nf-core/gffread/tests/nextflow.config new file mode 100644 index 00000000..74b25094 --- /dev/null +++ b/modules/nf-core/gffread/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: GFFREAD { + ext.args = '-T' + } +} diff --git a/modules/nf-core/gffread/tests/tags.yml b/modules/nf-core/gffread/tests/tags.yml new file mode 100644 index 00000000..05576065 --- /dev/null +++ b/modules/nf-core/gffread/tests/tags.yml @@ -0,0 +1,2 @@ +gffread: + - modules/nf-core/gffread/** diff --git 
a/modules/nf-core/orthofinder/environment.yml b/modules/nf-core/orthofinder/environment.yml new file mode 100644 index 00000000..68c475f8 --- /dev/null +++ b/modules/nf-core/orthofinder/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::diamond=2.1.9 + - bioconda::orthofinder=2.5.5 diff --git a/modules/nf-core/orthofinder/main.nf b/modules/nf-core/orthofinder/main.nf new file mode 100644 index 00000000..a47c4dea --- /dev/null +++ b/modules/nf-core/orthofinder/main.nf @@ -0,0 +1,80 @@ +process ORTHOFINDER { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/orthofinder:2.5.5--hdfd78af_2': + 'biocontainers/orthofinder:2.5.5--hdfd78af_2' }" + + input: + tuple val(meta), path(fastas, stageAs: 'input/') + tuple val(meta2), path(prior_run) + + output: + tuple val(meta), path("$prefix") , emit: orthofinder + tuple val(meta), path("$prefix/WorkingDirectory") , emit: working + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def include_command = prior_run ? 
"-b $prior_run" : '' + + """ + mkdir temp_pickle + + orthofinder \\ + -t $task.cpus \\ + -a $task.cpus \\ + -p temp_pickle \\ + -f input \\ + -n $prefix \\ + $include_command \\ + $args + + if [ -e input/OrthoFinder/Results_$prefix ]; then + mv input/OrthoFinder/Results_$prefix $prefix + fi + + if [ -e ${prior_run}/OrthoFinder/Results_$prefix ]; then + mv ${prior_run}/OrthoFinder/Results_$prefix $prefix + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + orthofinder: \$(orthofinder -h | sed -n 's/.*version \\(.*\\) Copy.*/\\1/p') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def include_command = prior_run ? "-b $prior_run" : '' + + """ + mkdir -p $prefix/Comparative_Genomics_Statistics + mkdir $prefix/Gene_Duplication_Events + mkdir $prefix/Gene_Trees + mkdir $prefix/Orthogroup_Sequences + mkdir $prefix/Orthogroups + mkdir $prefix/Orthologues + mkdir $prefix/Phylogenetic_Hierarchical_Orthogroups + mkdir $prefix/Phylogenetically_Misplaced_Genes + mkdir $prefix/Putative_Xenologs + mkdir $prefix/Resolved_Gene_Trees + mkdir $prefix/Single_Copy_Orthologue_Sequences + mkdir $prefix/Species_Tree + mkdir $prefix/WorkingDirectory + + touch $prefix/Log.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + orthofinder: \$(orthofinder -h | sed -n 's/.*version \\(.*\\) Copy.*/\\1/p') + END_VERSIONS + """ +} diff --git a/modules/nf-core/orthofinder/meta.yml b/modules/nf-core/orthofinder/meta.yml new file mode 100644 index 00000000..4aeb46b3 --- /dev/null +++ b/modules/nf-core/orthofinder/meta.yml @@ -0,0 +1,71 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "orthofinder" +description: OrthoFinder is a fast, accurate and comprehensive platform for comparative + genomics. 
+keywords: + - genomics + - orthogroup + - orthologs + - gene + - duplication + - tree + - phylogeny +tools: + - "orthofinder": + description: "Accurate inference of orthogroups, orthologues, gene trees and rooted + species tree made easy!" + homepage: "https://github.com/davidemms/OrthoFinder" + documentation: "https://github.com/davidemms/OrthoFinder" + tool_dev_url: "https://github.com/davidemms/OrthoFinder" + doi: "10.1186/s13059-019-1832-y" + licence: ["GPL v3"] + identifier: biotools:OrthoFinder + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - fastas: + type: list + description: Input fasta files + pattern: "*.{fa,faa,fasta,fas,pep}" + - - meta2: + type: map + description: | + Groovy Map containing a name + e.g. `[ id:'folder1' ]` + - prior_run: + type: directory + description: | + A folder containing a previous WorkingDirectory from Orthofinder. +output: + - orthofinder: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - $prefix: + type: directory + description: Orthofinder output directory + - working: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1' ]` + - $prefix/WorkingDirectory: + type: directory + description: Orthofinder output WorkingDirectory (used for the orthofinder resume + function) + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@GallVp" +maintainers: + - "@GallVp" diff --git a/modules/nf-core/orthofinder/tests/main.nf.test b/modules/nf-core/orthofinder/tests/main.nf.test new file mode 100644 index 00000000..aa68d1d2 --- /dev/null +++ b/modules/nf-core/orthofinder/tests/main.nf.test @@ -0,0 +1,161 @@ +import groovy.io.FileType + +nextflow_process { + + name "Test Process ORTHOFINDER" + script "../main.nf" + process "ORTHOFINDER" + + tag "modules" + tag "modules_nfcore" + tag "orthofinder" + tag "untar" + + test("sarscov2 - candidatus_portiera_aleyrodidarum - proteome") { + + when { + process { + """ + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/proteome.fasta', checkIfExists: true) + .copyTo("${workDir}/sarscov2.fasta") + + file(params.modules_testdata_base_path + 'genomics/prokaryotes/candidatus_portiera_aleyrodidarum/genome/proteome.fasta', checkIfExists: true) + .copyTo("${workDir}/candidatus_portiera_aleyrodidarum.fasta") + + def file_a = file("${workDir}/sarscov2.fasta", checkIfExists:true) + def file_b = file("${workDir}/candidatus_portiera_aleyrodidarum.fasta", checkIfExists:true) + + input[0] = [ + [ id:'test', single_end:false ], + [ file_a, file_b ] + ] + input[1] = [ + [], + [] + ] + """ + } + } + + then { + assert process.success + + def all_files = [] + + file(process.out.orthofinder[0][1]).eachFileRecurse (FileType.FILES) { file -> + all_files << file + } + + def stable_file_names = [ + 'Statistics_PerSpecies.tsv', + 'SpeciesTree_Gene_Duplications_0.5_Support.txt', + 'SpeciesTree_rooted.txt' + ] + + def stable_files = all_files.findAll { it.name in stable_file_names } + + assert snapshot( + stable_files.toSorted(), + process.out.versions[0] + ).match() + } + + 
} + + + test("sarscov2 - candidatus_portiera_aleyrodidarum - proteome - resume") { + + + setup { + run("UNTAR") { + script "../../untar/main.nf" + process { + """ + input[0] = [ [ id:'test1' ], // meta map + file(params.modules_testdata_base_path + 'delete_me/orthofinder/WorkingDirectory.tar.gz', checkIfExists: true) + ] + """ + } + } + } + + when { + process { + """ + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/proteome.fasta', checkIfExists: true) + .copyTo("${workDir}/sarscov2.fasta") + + def file_a = file("https://raw.githubusercontent.com/nf-core/test-datasets/proteinfold/testdata/sequences/H1065.fasta") + def file_c = UNTAR.out.untar + input[0] = [ + [ id:'test_2', single_end:false ], + [ file_a ] + ] + input[1] = UNTAR.out.untar + """ + } + } + + then { + assert process.success + + def all_files = [] + + file(process.out.orthofinder[0][1]).eachFileRecurse (FileType.FILES) { file -> + all_files << file + } + + def stable_file_names = [ + 'Statistics_PerSpecies.tsv', + 'OrthologuesStats_Totals.tsv', + 'Duplications_per_Species_Tree_Node.tsv' + ] + + def stable_files = all_files.findAll { it.name in stable_file_names } + + assert snapshot( + stable_files.toSorted(), + process.out.versions[0] + ).match() + } + + } + + test("sarscov2 - candidatus_portiera_aleyrodidarum - proteome - stub") { + + options '-stub' + + when { + process { + """ + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/proteome.fasta', checkIfExists: true) + .copyTo("${workDir}/sarscov2.fasta") + + file(params.modules_testdata_base_path + 'genomics/prokaryotes/candidatus_portiera_aleyrodidarum/genome/proteome.fasta', checkIfExists: true) + .copyTo("${workDir}/candidatus_portiera_aleyrodidarum.fasta") + + def file_a = file("${workDir}/sarscov2.fasta", checkIfExists:true) + def file_b = file("${workDir}/candidatus_portiera_aleyrodidarum.fasta", checkIfExists:true) + + input[0] = [ + [ id:'test', single_end:false ], + [ file_a, file_b ] + ] + input[1] = [ 
+ [], + [] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/orthofinder/tests/main.nf.test.snap b/modules/nf-core/orthofinder/tests/main.nf.test.snap new file mode 100644 index 00000000..f2c7b916 --- /dev/null +++ b/modules/nf-core/orthofinder/tests/main.nf.test.snap @@ -0,0 +1,171 @@ +{ + "sarscov2 - candidatus_portiera_aleyrodidarum - proteome": { + "content": [ + [ + "Statistics_PerSpecies.tsv:md5,984b5011a34d54527fe17896bfa36a2d", + "SpeciesTree_Gene_Duplications_0.5_Support.txt:md5,8b7a673e2e8b6d1aeb697f2bb88afa18", + "SpeciesTree_rooted.txt:md5,4d5ea525feebe479fca0c0768271ba81" + ], + "versions.yml:md5,86b472c85626aac1840eec0769016f5c" + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-03T10:59:02.895708598" + }, + "sarscov2 - candidatus_portiera_aleyrodidarum - proteome - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + [ + [ + + ], + [ + + ], + [ + + ], + "Log.txt:md5,d41d8cd98f00b204e9800998ecf8427e", + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + + ] + ] + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + [ + + ] + ] + ], + "2": [ + "versions.yml:md5,86b472c85626aac1840eec0769016f5c" + ], + "orthofinder": [ + [ + { + "id": "test", + "single_end": false + }, + [ + [ + + ], + [ + + ], + [ + + ], + "Log.txt:md5,d41d8cd98f00b204e9800998ecf8427e", + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + + ] + ] + ] + ], + "versions": [ + "versions.yml:md5,86b472c85626aac1840eec0769016f5c" + ], + "working": [ + [ + { + "id": "test", + "single_end": false + }, + [ + + ] + ] + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-03T11:07:31.319665056" + }, + "sarscov2 - candidatus_portiera_aleyrodidarum 
- proteome - resume": { + "content": [ + [ + "Duplications_per_Species_Tree_Node.tsv:md5,addc6f5ceec40bd82b00038d1872a27c", + "OrthologuesStats_Totals.tsv:md5,20d243abef226051a43cb37e922fc3eb", + "Statistics_PerSpecies.tsv:md5,83174c383b6c6828d1cc9b3be1679890" + ], + "versions.yml:md5,86b472c85626aac1840eec0769016f5c" + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-03T11:04:10.916947006" + } +} \ No newline at end of file diff --git a/modules/nf-core/orthofinder/tests/tags.yml b/modules/nf-core/orthofinder/tests/tags.yml new file mode 100644 index 00000000..f386e259 --- /dev/null +++ b/modules/nf-core/orthofinder/tests/tags.yml @@ -0,0 +1,2 @@ +orthofinder: + - "modules/nf-core/orthofinder/**" diff --git a/nextflow.config b/nextflow.config index 91e2ddf2..0e7c24bb 100644 --- a/nextflow.config +++ b/nextflow.config @@ -29,22 +29,22 @@ params { contamination_stops_pipeline = true + // tidk options + tidk_skip = true + tidk_repeat_seq = null + tidk_filter_by_size = false + tidk_filter_size_bp = 1000000 + // BUSCO options busco_skip = true busco_mode = null busco_lineage_datasets = null busco_download_path = null - // TIDK options - tidk_skip = true - tidk_repeat_seq = null - tidk_filter_by_size = false - tidk_filter_size_bp = 1000000 - // LAI options lai_skip = true - // kraken2 options + // Kraken 2 options kraken2_skip = true kraken2_db_path = null @@ -55,6 +55,10 @@ params { hic_fastp_ext_args = '--qualified_quality_phred 20 --length_required 50' hic_samtools_ext_args = '-F 3852' + // Merqury options + merqury_skip = true + merqury_kmer_length = 21 + // Synteny options synteny_skip = true synteny_mummer_skip = true @@ -70,9 +74,8 @@ params { synteny_plotsr_seq_label = 'Chr' synteny_plotsr_assembly_order = null - // Merqury options - merqury_skip = true - merqury_kmer_length = 21 + // OrthoFinder options + orthofinder_skip = true // Output options outdir = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 
6b5b9d63..a457f73f 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -113,6 +113,37 @@ } } }, + "tidk_options": { + "title": "tidk options", + "type": "object", + "description": "", + "default": "", + "properties": { + "tidk_skip": { + "type": "boolean", + "description": "Skip telomere identification", + "default": true, + "fa_icon": "fas fa-forward" + }, + "tidk_repeat_seq": { + "type": "string", + "description": "Telomere repeat sequence. Typical values for plant: TTTAGGG, fungus, vertebrates: TTAGGG and Insect: TTAGG", + "pattern": "^[ACGT]+$", + "fa_icon": "fas fa-dna" + }, + "tidk_filter_by_size": { + "type": "boolean", + "description": "Filter assembly sequences smaller than the specified length", + "fa_icon": "fas fa-question-circle" + }, + "tidk_filter_size_bp": { + "type": "integer", + "default": 1000000, + "description": "Filter size in base-pairs", + "fa_icon": "fas fa-ruler-horizontal" + } + } + }, "busco_options": { "title": "BUSCO options", "type": "object", @@ -145,37 +176,6 @@ } } }, - "tidk_options": { - "title": "TIDK options", - "type": "object", - "description": "", - "default": "", - "properties": { - "tidk_skip": { - "type": "boolean", - "description": "Skip telomere identification", - "default": true, - "fa_icon": "fas fa-forward" - }, - "tidk_repeat_seq": { - "type": "string", - "description": "Telomere repeat sequence. 
Typical values for plant: TTTAGGG, fungus, vertebrates: TTAGGG and Insect: TTAGG", - "pattern": "^[ACGT]+$", - "fa_icon": "fas fa-dna" - }, - "tidk_filter_by_size": { - "type": "boolean", - "description": "Filter assembly sequences smaller than the specified length", - "fa_icon": "fas fa-question-circle" - }, - "tidk_filter_size_bp": { - "type": "integer", - "default": 1000000, - "description": "Filter size in base-pairs", - "fa_icon": "fas fa-ruler-horizontal" - } - } - }, "lai_options": { "title": "LAI options", "type": "object", @@ -191,7 +191,7 @@ } }, "kraken2_options": { - "title": "Kraken2 options", + "title": "Kraken 2 options", "type": "object", "description": "", "default": "", @@ -246,6 +246,27 @@ } } }, + "merqury_options": { + "title": "Merqury options", + "type": "object", + "description": "", + "default": "", + "properties": { + "merqury_skip": { + "type": "boolean", + "default": true, + "description": "Skip merqury analysis", + "fa_icon": "fas fa-forward" + }, + "merqury_kmer_length": { + "type": "integer", + "default": 21, + "description": "kmer length for merqury analysis", + "minimum": 3, + "fa_icon": "fas fa-ruler-horizontal" + } + } + }, "synteny_options": { "title": "Synteny options", "type": "object", @@ -333,24 +354,17 @@ } } }, - "merqury_options": { - "title": "Merqury options", + "orthofinder_options": { + "title": "OrthoFinder options", "type": "object", "description": "", "default": "", "properties": { - "merqury_skip": { + "orthofinder_skip": { "type": "boolean", "default": true, - "description": "Skip merqury analysis", - "fa_icon": "fas fa-forward" - }, - "merqury_kmer_length": { - "type": "integer", - "default": 21, - "description": "kmer length for merqury analysis", - "minimum": 3, - "fa_icon": "fas fa-ruler-horizontal" + "fa_icon": "fas fa-forward", + "description": "Skip orthofinder" } } }, @@ -452,10 +466,10 @@ "$ref": "#/$defs/ncbi_fcs_options" }, { - "$ref": "#/$defs/busco_options" + "$ref": "#/$defs/tidk_options" }, { - 
"$ref": "#/$defs/tidk_options" + "$ref": "#/$defs/busco_options" }, { "$ref": "#/$defs/lai_options" @@ -466,11 +480,14 @@ { "$ref": "#/$defs/hic_options" }, + { + "$ref": "#/$defs/merqury_options" + }, { "$ref": "#/$defs/synteny_options" }, { - "$ref": "#/$defs/merqury_options" + "$ref": "#/$defs/orthofinder_options" }, { "$ref": "#/$defs/institutional_config_options" diff --git a/tests/nextflow.config b/tests/nextflow.config index 1d5de542..ed1a8053 100644 --- a/tests/nextflow.config +++ b/tests/nextflow.config @@ -5,7 +5,7 @@ */ params { - modules_testdata_base_path = 's3://ngi-igenomes/testdata/nf-core/modules/' + modules_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/' } timeline { enabled = false } diff --git a/tests/orthofinder/assemblysheet.csv b/tests/orthofinder/assemblysheet.csv new file mode 100644 index 00000000..64892681 --- /dev/null +++ b/tests/orthofinder/assemblysheet.csv @@ -0,0 +1,5 @@ +tag,fasta,gff3 +agalactiae,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/063/605/GCF_000063605.1_ASM6360v1/GCF_000063605.1_ASM6360v1_genomic.fna.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/063/605/GCF_000063605.1_ASM6360v1/GCF_000063605.1_ASM6360v1_genomic.gff.gz +gallisepticum,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/900/476/085/GCF_900476085.1_50569_G01/GCF_900476085.1_50569_G01_genomic.fna.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/900/476/085/GCF_900476085.1_50569_G01/GCF_900476085.1_50569_G01_genomic.gff.gz +genitalium,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/027/325/GCF_000027325.1_ASM2732v1/GCF_000027325.1_ASM2732v1_genomic.fna.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/027/325/GCF_000027325.1_ASM2732v1/GCF_000027325.1_ASM2732v1_genomic.gff.gz 
+hyopneumoniae,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/008/205/GCF_000008205.1_ASM820v1/GCF_000008205.1_ASM820v1_genomic.fna.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/008/205/GCF_000008205.1_ASM820v1/GCF_000008205.1_ASM820v1_genomic.gff.gz diff --git a/tests/orthofinder/main.nf.test b/tests/orthofinder/main.nf.test new file mode 100644 index 00000000..8e6bfab7 --- /dev/null +++ b/tests/orthofinder/main.nf.test @@ -0,0 +1,36 @@ +nextflow_pipeline { + + name "Test orthofinder" + script "main.nf" + + test("orthofinder") { + + when { + params { + input = "$baseDir/tests/orthofinder/assemblysheet.csv" + orthofinder_skip = false + outdir = "$outputDir" + } + } + + then { + def stable_path = getAllFilesFromDir(params.outdir, false, ['pipeline_info/*.{html,json,txt,yml}', 'report.{html,json}', 'orthofinder/**'], null, ['**']) + def report_json = (Map) new groovy.json.JsonSlurper().parseText(file("$outputDir/report.json").text) + def orthofinder_stats = report_json['ORTHOFINDER']['num_species_orthogroup'] + + assertAll( + { assert workflow.success}, + { assert snapshot( + [ + 'successful tasks': workflow.trace.succeeded().size(), + 'versions': removeNextflowVersion("$outputDir/pipeline_info/software_versions.yml"), + 'stable paths': stable_path, + 'orthofinder stats': orthofinder_stats + ] + ).match() } + ) + } + + } + +} diff --git a/tests/orthofinder/main.nf.test.snap b/tests/orthofinder/main.nf.test.snap new file mode 100644 index 00000000..633b6c30 --- /dev/null +++ b/tests/orthofinder/main.nf.test.snap @@ -0,0 +1,83 @@ +{ + "orthofinder": { + "content": [ + { + "successful tasks": 46, + "versions": { + "ASSEMBLATHON_STATS": { + "assemblathon_stats": "github/PlantandFoodResearch/assemblathon2-analysis/a93cba2" + }, + "FASTAVALIDATOR": { + "py_fasta_validator": 0.6 + }, + "GFFREAD": { + "gffread": "0.12.7" + }, + "GT_GFF3": { + "genometools": "1.6.5" + }, + "GT_GFF3VALIDATOR": { + "genometools": "1.6.5" + }, + "GT_STAT": { + "genometools": "1.6.5" + }, +
"GUNZIP_FASTA": { + "gunzip": 1.1 + }, + "GUNZIP_GFF3": { + "gunzip": 1.1 + }, + "ORTHOFINDER": { + "orthofinder": "2.5.5" + }, + "SAMTOOLS_FAIDX": { + "samtools": 1.21 + }, + "SEQKIT_RMDUP": { + "seqkit": "v2.8.0" + }, + "TAG_ASSEMBLY": { + "pigz": "2.3.4" + }, + "Workflow": { + "plant-food-research-open/assemblyqc": "v2.2.0dev" + } + }, + "stable paths": [ + "agalactiae_stats.csv:md5,4f4ce28e8975f9ded73cf86ff5eaa507", + "gallisepticum_stats.csv:md5,cedd23a5778c76bf17053f5a0aa6eaf8", + "genitalium_stats.csv:md5,04ef67d681fba6ca05df4171f888424a", + "hyopneumoniae_stats.csv:md5,d59081ff1b5bb0e9f1be494b8463f3b8", + "agalactiae.gt.stat.yml:md5,74fe2e9753fdebaa31840016201eaf17", + "gallisepticum.gt.stat.yml:md5,0ec946739663876b25fca0efd47fd467", + "genitalium.gt.stat.yml:md5,ce9917e5b197e130c0d1be90d88e3177", + "hyopneumoniae.gt.stat.yml:md5,856c5eb7511f53a40056987922e649c7" + ], + "orthofinder stats": [ + { + "Number of species in orthogroup": 1, + "Number of orthogroups": 79 + }, + { + "Number of species in orthogroup": 2, + "Number of orthogroups": 166 + }, + { + "Number of species in orthogroup": 3, + "Number of orthogroups": 70 + }, + { + "Number of species in orthogroup": 4, + "Number of orthogroups": 277 + } + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-11-01T14:11:21.865104" + } +} \ No newline at end of file diff --git a/workflows/assemblyqc.nf b/workflows/assemblyqc.nf index 9116a5ea..21496aaa 100644 --- a/workflows/assemblyqc.nf +++ b/workflows/assemblyqc.nf @@ -30,6 +30,8 @@ include { MERYL_COUNT as PAT_MERYL_COUNT } from '../modules/nf-core/meryl/cou include { MERYL_UNIONSUM as PAT_UNIONSUM } from '../modules/nf-core/meryl/unionsum/main' include { MERQURY_HAPMERS } from '../modules/nf-core/merqury/hapmers/main' include { MERQURY_MERQURY } from '../modules/nf-core/merqury/merqury/main' +include { GFFREAD } from '../modules/nf-core/gffread/main' +include { ORTHOFINDER } from 
'../modules/nf-core/orthofinder/main' include { CREATEREPORT } from '../modules/local/createreport' include { FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS as FETCHNGS } from '../subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/main' @@ -805,6 +807,38 @@ workflow ASSEMBLYQC { | flatMap { meta, data -> data } ch_versions = ch_versions.mix(MERQURY_MERQURY.out.versions.first()) + // MODULE: GFFREAD + ch_gffread_inputs = params.orthofinder_skip + ? Channel.empty() + : ch_valid_gff3 + | join( + ch_clean_assembly + | map { tag, fasta -> [ [ id: tag ], fasta ] } + ) + | map { [ it ] } + | collect + | filter { it.size() > 1 } + | flatten + | buffer ( size: 3 ) + + GFFREAD( + ch_gffread_inputs.map { meta, gff, fasta -> [ meta, gff ] }, + ch_gffread_inputs.map { meta, gff, fasta -> fasta } + ) + + ch_proteins_fasta = GFFREAD.out.gffread_fasta + ch_versions = ch_versions.mix(GFFREAD.out.versions.first()) + + // MODULE: ORTHOFINDER + ORTHOFINDER( + ch_proteins_fasta.map { meta, fasta -> fasta }.collect().map { fastas -> [ [ id: 'assemblyqc' ], fastas ] }, + [ [], [] ] + ) + + ch_orthofinder_outputs = ORTHOFINDER.out.orthofinder + | map { meta, dir -> dir } + ch_versions = ch_versions.mix(ORTHOFINDER.out.versions) + // Collate and save software versions ch_versions = ch_versions | unique @@ -838,6 +872,7 @@ workflow ASSEMBLYQC { ch_hic_report_files .collect().ifEmpty([]), ch_synteny_outputs .collect().ifEmpty([]), ch_merqury_outputs .collect().ifEmpty([]), + ch_orthofinder_outputs .collect().ifEmpty([]), ch_versions_yml, ch_params_as_json, ch_summary_params_as_json