diff --git a/CHANGELOG.md b/CHANGELOG.md index 0ef7601e..436cacf8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Added` +1. Added Gfastats [#126](https://github.com/Plant-Food-Research-Open/assemblyqc/issues/126) + ### `Fixed` ### `Dependencies` @@ -16,8 +18,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Deprecated` -1. Reduced the GenomeTools stats figures to 300 DPI -2. Now `synteny_mummer_min_bundle_size` is set to `1000000` by default +1. Reduced the GenomeTools stats figures to 300 DPI [#142](https://github.com/Plant-Food-Research-Open/assemblyqc/issues/142) +2. Now `synteny_mummer_min_bundle_size` is set to `1000000` by default [#142](https://github.com/Plant-Food-Research-Open/assemblyqc/issues/142) ## v2.1.1 - [20-Sep-2024] diff --git a/CITATIONS.md b/CITATIONS.md index fbd8036d..22ae2fb0 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -36,6 +36,10 @@ > > Forked from: +- GFASTATS, [MIT](https://github.com/vgl-hub/gfastats/blob/main/LICENSE) + + > Giulio Formenti, Linelle Abueg, Angelo Brajuka, Nadolina Brajuka, Cristóbal Gallardo-Alba, Alice Giani, Olivier Fedrigo, Erich D Jarvis, Gfastats: conversion, evaluation and manipulation of genome sequences using assembly graphs, Bioinformatics, Volume 38, Issue 17, September 2022, Pages 4214–4216, + - BUSCO, [MIT](https://gitlab.com/ezlab/busco/-/blob/master/LICENSE) > Manni M, Berkeley MR, Seppey M, Simão FA, Zdobnov EM. 2021. 
BUSCO Update: Novel and Streamlined Workflows along with Broader and Deeper Phylogenetic Coverage for Scoring of Eukaryotic, Prokaryotic, and Viral Genomes, Molecular Biology and Evolution, Volume 38, Issue 10, October 2021, Pages 4647–4654, diff --git a/README.md b/README.md index de13e09d..7adcae92 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ flowchart LR VALIDATE_FORMAT ==> GFF_STATS[GENOMETOOLS GT STAT] - Run ==> ASS_STATS[ASSEMBLATHON STATS] + Run ==> ASS_STATS[STATS] Run ==> BUSCO Run ==> TIDK Run ==> LAI @@ -72,7 +72,7 @@ flowchart LR - [FASTA VALIDATOR](https://github.com/linsalrob/fasta_validator) + [SEQKIT RMDUP](https://github.com/shenwei356/seqkit): FASTA validation - [GENOMETOOLS GT GFF3VALIDATOR](https://genometools.org/tools/gt_gff3validator.html): GFF3 validation -- [ASSEMBLATHON STATS](https://github.com/PlantandFoodResearch/assemblathon2-analysis/blob/a93cba25d847434f7eadc04e63b58c567c46a56d/assemblathon_stats.pl): Assembly statistics +- [ASSEMBLATHON STATS](https://github.com/PlantandFoodResearch/assemblathon2-analysis/blob/a93cba25d847434f7eadc04e63b58c567c46a56d/assemblathon_stats.pl), [GFASTATS](https://github.com/vgl-hub/gfastats): Assembly statistics - [GENOMETOOLS GT STAT](https://genometools.org/tools/gt_stat.html): Annotation statistics - [NCBI FCS ADAPTOR](https://github.com/ncbi/fcs): Adaptor contamination pass/fail - [NCBI FCS GX](https://github.com/ncbi/fcs): Foreign organism contamination pass/fail diff --git a/bin/assemblyqc.py b/bin/assemblyqc.py index b5c73f56..c81db840 100755 --- a/bin/assemblyqc.py +++ b/bin/assemblyqc.py @@ -16,6 +16,9 @@ from report_modules.parsers.assemblathon_stats_parser import ( parse_assemblathon_stats_folder, ) +from report_modules.parsers.gfastats_parser import ( + parse_gfastats_folder, +) from report_modules.parsers.genometools_gt_stat_parser import ( parse_genometools_gt_stat_folder, ) @@ -41,6 +44,7 @@ data_from_tools = {**data_from_tools, **parse_ncbi_fcs_adaptor_folder()} 
data_from_tools = {**data_from_tools, **parse_ncbi_fcs_gx_folder()} data_from_tools = {**data_from_tools, **parse_assemblathon_stats_folder()} + data_from_tools = {**data_from_tools, **parse_gfastats_folder()} data_from_tools = {**data_from_tools, **parse_genometools_gt_stat_folder()} data_from_tools = {**data_from_tools, **parse_busco_folder()} data_from_tools = { diff --git a/bin/report_modules/parsers/gfastats_parser.py b/bin/report_modules/parsers/gfastats_parser.py new file mode 100644 index 00000000..6cfe2104 --- /dev/null +++ b/bin/report_modules/parsers/gfastats_parser.py @@ -0,0 +1,46 @@ +import os +from pathlib import Path +import pandas as pd +from tabulate import tabulate +import re + +from report_modules.parsers.parsing_commons import sort_list_of_results + + +def parse_gfastats_folder(folder_name="gfastats"): + dir = os.getcwdb().decode() + reports_folder_path = Path(f"{dir}/{folder_name}") + + if not os.path.exists(reports_folder_path): + return {} + + list_of_report_files = reports_folder_path.glob("*.assembly_summary") + + data = {"GFASTATS": []} + + for report_path in list_of_report_files: + report_table = pd.read_csv(report_path, sep="\t") + report_table.columns = ['Stat', 'Value'] + + file_tokens = re.findall( + r"([\w]+).assembly_summary", + os.path.basename(str(report_path)), + )[0] + + data["GFASTATS"].append( + { + "hap": file_tokens, + "report_table": report_table.to_dict("records"), + "report_table_html": tabulate( + report_table, + headers=["Stat", "Value"], + tablefmt="html", + numalign="left", + showindex=False, + ), + } + ) + + return { + "GFASTATS": sort_list_of_results(data["GFASTATS"], "hap") + } diff --git a/bin/report_modules/templates/base.html b/bin/report_modules/templates/base.html index 11fe7d92..c7c94f04 100644 --- a/bin/report_modules/templates/base.html +++ b/bin/report_modules/templates/base.html @@ -32,6 +32,10 @@ {% endif %} + {% if 'GFASTATS' in all_stats_dicts %} + + {% endif %} + {% if 'GENOMETOOLS_GT_STAT' in 
all_stats_dicts %} {% endif %} @@ -100,6 +104,10 @@ {% include 'assemblathon_stats/assemblathon_stats.html' %} {% endif %} + {% if 'GFASTATS' in all_stats_dicts %} + {% include 'gfastats/gfastats.html' %} + {% endif %} + {% if 'GENOMETOOLS_GT_STAT' in all_stats_dicts %} {% include 'genometools_gt_stat/genometools_gt_stat.html' %} {% endif %} diff --git a/bin/report_modules/templates/gfastats/dropdown.html b/bin/report_modules/templates/gfastats/dropdown.html new file mode 100644 index 00000000..e2e76f80 --- /dev/null +++ b/bin/report_modules/templates/gfastats/dropdown.html @@ -0,0 +1,10 @@ + diff --git a/bin/report_modules/templates/gfastats/gfastats.html b/bin/report_modules/templates/gfastats/gfastats.html new file mode 100644 index 00000000..eb94d1c4 --- /dev/null +++ b/bin/report_modules/templates/gfastats/gfastats.html @@ -0,0 +1,16 @@ + diff --git a/bin/report_modules/templates/gfastats/report_contents.html b/bin/report_modules/templates/gfastats/report_contents.html new file mode 100644 index 00000000..e1ca2e9a --- /dev/null +++ b/bin/report_modules/templates/gfastats/report_contents.html @@ -0,0 +1,17 @@ +{% set vars = {'is_first': True} %} {% for item in range(all_stats_dicts["GFASTATS"]|length) %} {% set +active_text = 'display: block' if vars.is_first else 'display: none' %} +
+
+
+
{{ all_stats_dicts['GFASTATS'][item]['hap'] }}
+
+
+
+
{{ all_stats_dicts['GFASTATS'][item]['report_table_html'] }}
+
+
+{% if vars.update({'is_first': False}) %} {% endif %} {% endfor %} diff --git a/conf/modules.config b/conf/modules.config index 5f857db6..efc06589 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -50,6 +50,16 @@ process { ] } + withName: GFASTATS { + ext.args = '--stats -t --nstar-report' + publishDir = [ + path: { "${params.outdir}/gfastats" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals("versions.yml") ? null : filename }, + pattern: '*.assembly_summary' + ] + } + withName: FCS_FCSADAPTOR { ext.args = params.ncbi_fcs_adaptor_empire ? "--${params.ncbi_fcs_adaptor_empire}" : '--prok' diff --git a/conf/test_full.config b/conf/test_full.config index a8e663ab..14f95cb9 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -16,6 +16,8 @@ params { input = 'https://raw.githubusercontent.com/plant-food-research-open/assemblyqc/dev/assets/assemblysheetv2.csv' + gfastats_skip = false + ncbi_fcs_adaptor_skip = false ncbi_fcs_adaptor_empire = 'euk' diff --git a/docs/output.md b/docs/output.md index ae9e7f75..f5bfacff 100644 --- a/docs/output.md +++ b/docs/output.md @@ -14,7 +14,8 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [FASTA and GFF3 validation](#fasta-and-gff3-validation) - [Assemblathon stats](#assemblathon-stats) -- [Genometools gt stat](#genometools-gt-stat) +- [Gfastats](#gfastats) +- [GenomeTools gt stat](#genometools-gt-stat) - [NCBI FCS adaptor](#ncbi-fcs-adaptor) - [NCBI FCS GX](#ncbi-fcs-gx) - [BUSCO](#busco) @@ -45,7 +46,19 @@ The pipeline prints a warning in the pipeline log if FASTA or GFF3 validation fa > [!WARNING] > Contig-related stats are based on the assumption that `assemblathon_stats_n_limit` is specified correctly. If you are not certain of the value of `assemblathon_stats_n_limit`, please ignore the contig-related stats. -### Genometools gt stat +### Gfastats + +
+Output files + +- `gfastats/` + - `*.assembly_summary`: Assembly stats in TSV format. + +
+ +Gfastats is a fast and exhaustive tool for computing summary statistics of genome assemblies. + +### GenomeTools gt stat
Output files diff --git a/docs/parameters.md b/docs/parameters.md index 7f8e4ff0..c9d7eefb 100644 --- a/docs/parameters.md +++ b/docs/parameters.md @@ -21,6 +21,7 @@ A Nextflow pipeline which evaluates assembly quality with multiple QC tools and | Parameter | Description | Type | Default | Required | Hidden | | ---------------------------- | ----------------------------------------------------------------------- | --------- | ------- | -------- | ------ | | `assemblathon_stats_n_limit` | The number of 'N's for the unknown gap size. NCBI recommendation is 100 | `integer` | 100 | | | +| `gfastats_skip` | Skip Gfastats | `boolean` | True | | | ## NCBI FCS options diff --git a/modules.json b/modules.json index 86cf046a..21d052a3 100644 --- a/modules.json +++ b/modules.json @@ -165,6 +165,11 @@ "git_sha": "06c8865e36741e05ad32ef70ab3fac127486af48", "installed_by": ["modules"] }, + "gfastats": { + "branch": "master", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", + "installed_by": ["modules"] + }, "gunzip": { "branch": "master", "git_sha": "06c8865e36741e05ad32ef70ab3fac127486af48", diff --git a/modules/local/createreport.nf b/modules/local/createreport.nf index fd53edc0..a7c05566 100644 --- a/modules/local/createreport.nf +++ b/modules/local/createreport.nf @@ -10,6 +10,7 @@ process CREATEREPORT { path ncbi_fcs_adaptor_reports , stageAs: 'ncbi_fcs_adaptor_reports/*' path fcs_gx_reports , stageAs: 'fcs_gx_reports/*' path assemblathon_stats , stageAs: 'assemblathon_stats/*' + path gfastats , stageAs: 'gfastats/*' path genometools_gt_stats , stageAs: 'genometools_gt_stat/*' path busco_outputs , stageAs: 'busco_outputs/*' path busco_gff_outputs , stageAs: 'busco_gff_outputs/*' diff --git a/modules/nf-core/gfastats/environment.yml b/modules/nf-core/gfastats/environment.yml new file mode 100644 index 00000000..b47bbdbb --- /dev/null +++ b/modules/nf-core/gfastats/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - 
bioconda::gfastats=1.3.6 diff --git a/modules/nf-core/gfastats/main.nf b/modules/nf-core/gfastats/main.nf new file mode 100644 index 00000000..8db239ad --- /dev/null +++ b/modules/nf-core/gfastats/main.nf @@ -0,0 +1,66 @@ +process GFASTATS { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gfastats:1.3.6--hdcf5f25_3': + 'biocontainers/gfastats:1.3.6--hdcf5f25_3' }" + + input: + tuple val(meta), path(assembly) // input.[fasta|fastq|gfa][.gz] + val out_fmt // output format (fasta/fastq/gfa) + val genome_size // estimated genome size for NG* statistics (optional). + val target // target specific sequence by header, optionally with coordinates (optional). + path agpfile // -a --agp-to-path converts input agp to path and replaces existing paths. + path include_bed // -i --include-bed generates output on a subset list of headers or coordinates in 0-based bed format. + path exclude_bed // -e --exclude-bed opposite of --include-bed. They can be combined (no coordinates). + path instructions // -k --swiss-army-knife set of instructions provided as an ordered list. + + output: + tuple val(meta), path("*.assembly_summary"), emit: assembly_summary + tuple val(meta), path("*.${out_fmt}.gz") , emit: assembly + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def agp = agpfile ? "--agp-to-path $agp" : "" + def ibed = include_bed ? "--include-bed $include_bed" : "" + def ebed = exclude_bed ? "--exclude-bed $exclude_bed" : "" + def sak = instructions ? 
"--swiss-army-knife $instructions" : "" + """ + gfastats \\ + $args \\ + --threads $task.cpus \\ + $agp \\ + $ibed \\ + $ebed \\ + $sak \\ + --out-format ${prefix}.${out_fmt}.gz \\ + $assembly \\ + $genome_size \\ + $target \\ + > ${prefix}.assembly_summary + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gfastats: \$( gfastats -v | sed '1!d;s/.*v//' ) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.${out_fmt}.gz + touch ${prefix}.assembly_summary + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gfastats: \$( gfastats -v | sed '1!d;s/.*v//' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/gfastats/meta.yml b/modules/nf-core/gfastats/meta.yml new file mode 100644 index 00000000..a6213433 --- /dev/null +++ b/modules/nf-core/gfastats/meta.yml @@ -0,0 +1,83 @@ +name: "gfastats" +description: | + A single fast and exhaustive tool for summary statistics and simultaneous *fa* + (fasta, fastq, gfa [.gz]) genome assembly file manipulation. +keywords: + - gfastats + - fasta + - genome assembly + - genome summary + - genome manipulation + - genome statistics +tools: + - "gfastats": + description: "The swiss army knife for genome assembly." + homepage: "https://github.com/vgl-hub/gfastats" + documentation: "https://github.com/vgl-hub/gfastats/tree/main/instructions" + tool_dev_url: "https://github.com/vgl-hub/gfastats" + doi: "10.1093/bioinformatics/btac460" + licence: ["MIT"] + identifier: biotools:gfastats +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - assembly: + type: file + description: Draft assembly file + pattern: "*.{fasta,fastq,gfa}(.gz)?" + - - out_fmt: + type: string + description: Output format (fasta, fastq, gfa) + - - genome_size: + type: integer + description: estimated genome size (bp) for NG* statistics (optional). 
+ - - target: + type: string + description: target specific sequence by header, optionally with coordinates + (optional). + - - agpfile: + type: file + description: converts input agp to path and replaces existing paths. + - - include_bed: + type: file + description: generates output on a subset list of headers or coordinates in + 0-based bed format. + - - exclude_bed: + type: file + description: opposite of --include-bed. They can be combined (no coordinates). + - - instructions: + type: file + description: set of instructions provided as an ordered list. +output: + - assembly_summary: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.assembly_summary": + type: file + description: Assembly summary statistics file + pattern: "*.assembly_summary" + - assembly: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.${out_fmt}.gz": + type: file + description: The assembly as modified by gfastats + pattern: "*.{fasta,fastq,gfa}.gz" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@mahesh-panchal" +maintainers: + - "@mahesh-panchal" diff --git a/nextflow.config b/nextflow.config index 02a7acb7..aafebe96 100644 --- a/nextflow.config +++ b/nextflow.config @@ -15,8 +15,9 @@ params { // Validation options check_sequence_duplicates = true - // Assemblathon stats options + // General stats options assemblathon_stats_n_limit = 100 + gfastats_skip = true // NCBI FCS options ncbi_fcs_adaptor_skip = true diff --git a/nextflow_schema.json b/nextflow_schema.json index fdb21ac2..3c3fb30d 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -62,6 +62,12 @@ "default": 100, "description": "The number of 'N's for the unknown gap size. 
NCBI recommendation is 100", "fa_icon": "fas fa-ruler-horizontal" + }, + "gfastats_skip": { + "type": "boolean", + "description": "Skip Gfastats", + "default": true, + "fa_icon": "fas fa-forward" } } }, diff --git a/pfr/params.json b/pfr/params.json index 8453604b..aabf5fba 100644 --- a/pfr/params.json +++ b/pfr/params.json @@ -2,6 +2,7 @@ "input": "/workspace/assemblyqc/testdata/v2/assemblysheet.csv", "check_sequence_duplicates": true, "assemblathon_stats_n_limit": 100, + "gfastats_skip": false, "ncbi_fcs_adaptor_skip": false, "ncbi_fcs_adaptor_empire": "euk", "ncbi_fcs_gx_skip": false, diff --git a/tests/stub/params.json b/tests/stub/params.json index 6e303ac4..573986a7 100644 --- a/tests/stub/params.json +++ b/tests/stub/params.json @@ -2,6 +2,7 @@ "config_profile_name": "Full stub test", "config_profile_description": "Full test of the pipeline in stub mode", "input": "assets/assemblysheetv2.csv", + "gfastats_skip": false, "ncbi_fcs_adaptor_skip": false, "ncbi_fcs_adaptor_empire": "euk", "ncbi_fcs_gx_skip": false, diff --git a/workflows/assemblyqc.nf b/workflows/assemblyqc.nf index 8f615b3e..4dd9067b 100644 --- a/workflows/assemblyqc.nf +++ b/workflows/assemblyqc.nf @@ -15,6 +15,7 @@ include { GFF3_GT_GFF3_GFF3VALIDATOR_STAT } from '../subworkflows/gallvp/gff3_ include { FCS_FCSADAPTOR } from '../modules/nf-core/fcs/fcsadaptor/main' include { NCBI_FCS_GX } from '../subworkflows/local/ncbi_fcs_gx' include { ASSEMBLATHON_STATS } from '../modules/local/assemblathon_stats' +include { GFASTATS } from '../modules/nf-core/gfastats/main' include { FASTA_GXF_BUSCO_PLOT } from '../subworkflows/gallvp/fasta_gxf_busco_plot/main' include { FASTA_LTRRETRIEVER_LAI } from '../subworkflows/gallvp/fasta_ltrretriever_lai/main' include { FASTA_KRAKEN2 } from '../subworkflows/local/fasta_kraken2' @@ -433,6 +434,27 @@ workflow ASSEMBLYQC { ch_assemblathon_stats = ASSEMBLATHON_STATS.out.stats ch_versions = ch_versions.mix(ASSEMBLATHON_STATS.out.versions.first()) + // MODULE: GFASTATS 
+ ch_gfastats_assembly = params.gfastats_skip + ? Channel.empty() + : ch_clean_assembly + | map { tag, fasta -> [ [ id: tag ], fasta ] } + + GFASTATS( + ch_gfastats_assembly, + 'gfa', // output format + '', // estimated genome size + '', // target specific sequence by header + [], // agp file + [], // include bed + [], // exclude bed + [] // instructions + ) + + ch_gfastats_stats = GFASTATS.out.assembly_summary + | map { tag, stats -> stats } + ch_versions = ch_versions.mix(GFASTATS.out.versions.first()) + // SUBWORKFLOW: FASTA_GXF_BUSCO_PLOT ch_busco_input_assembly = params.busco_skip ? Channel.empty() @@ -793,6 +815,7 @@ workflow ASSEMBLYQC { ch_fcs_adaptor_report .map { meta, file -> file }.collect().ifEmpty([]), ch_fcs_gx_report .mix(ch_fcs_gx_taxonomy_plot).map { meta, file -> file }.collect().ifEmpty([]), ch_assemblathon_stats .collect().ifEmpty([]), + ch_gfastats_stats .collect().ifEmpty([]), ch_gt_stats .collect().ifEmpty([]), ch_busco_outputs .collect().ifEmpty([]), ch_busco_gff_outputs .collect().ifEmpty([]),