From 1b7f285d6e9cb07ef6645241f6931f5c2f3931d8 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 3 Aug 2023 16:29:32 +0000 Subject: [PATCH 1/2] Now use "outdir", which is equivalent to a ToL analysis directory --- CHANGELOG.md | 4 ++- assets/samplesheet.csv | 14 ++++---- assets/schema_input.json | 4 +-- bin/check_samplesheet.py | 12 +++---- conf/modules.config | 6 ++-- docs/output.md | 50 ++++++++++++--------------- docs/usage.md | 8 ++--- modules/local/ncbi_download.nf | 6 ++-- nextflow_schema.json | 2 +- subworkflows/local/download_genome.nf | 2 +- subworkflows/local/params_check.nf | 6 ++-- 11 files changed, 54 insertions(+), 60 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b518284..e405031 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,10 +3,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## v1.2.0 - [date] +## v2.0.0 - [date] ### `Fixed` +- The sample-sheet column `species_dir` is replaced with the `outdir` column which + represents where the assembly and repeats are downloaded (in immediate sub-directories) - Relative paths in the sample-sheet are now evaluated from the `--outdir` parameter - Memory usage rules for `samtools dict` - Appropriate use of `tabix`'s TBI and CSI indexing, depending on the sequence lengths diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 49f1be2..0f2f8be 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,7 +1,7 @@ -species_dir,assembly_name,assembly_accession -25g/data/echinoderms/Asterias_rubens,eAstRub1.3,GCA_902459465.3 -25g/data/insects/Osmia_bicornis,iOsmBic2.1,GCA_907164935.1 -25g/data/insects/Osmia_bicornis,iOsmBic2.1_alternate_haplotype,GCA_907164925.1 -darwin/data/fungi/Laetiporus_sulphureus,gfLaeSulp1.1,GCA_927399515.1 -darwin/data/insects/Noctua_fimbriata,ilNocFimb1.1,GCA_905163415.1 
-darwin/data/mammals/Meles_meles,mMelMel3.2_paternal_haplotype,GCA_922984935.2 +outdir,assembly_name,assembly_accession +Asterias_rubens/eAstRub1.3,eAstRub1.3,GCA_902459465.3 +Osmia_bicornis/iOsmBic2.1,iOsmBic2.1,GCA_907164935.1 +Osmia_bicornis/iOsmBic2.1_alternate_haplotype,iOsmBic2.1_alternate_haplotype,GCA_907164925.1 +Laetiporus_sulphureus/gfLaeSulp1.1,gfLaeSulp1.1,GCA_927399515.1 +Noctua_fimbriata/ilNocFimb1.1,ilNocFimb1.1,GCA_905163415.1 +Meles_meles/mMelMel3.2_paternal_haplotype,mMelMel3.2_paternal_haplotype,GCA_922984935.2 diff --git a/assets/schema_input.json b/assets/schema_input.json index ed91197..dafbfaa 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -7,7 +7,7 @@ "items": { "type": "object", "properties": { - "species_dir": { + "outdir": { "type": "string", "pattern": "^\\S+$", "errorMessage": "Species directory must be provided and exist" @@ -23,6 +23,6 @@ "errorMessage": "Assembly accession number must be provided and be of the form GCA_*" } }, - "required": ["species_dir", "assembly_name", "assembly_accession"] + "required": ["outdir", "assembly_name", "assembly_accession"] } } diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index de6852e..aa8bfc4 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -28,7 +28,7 @@ class RowChecker: def __init__( self, - dir_col="species_dir", + dir_col="outdir", name_col="assembly_name", accession_col="assembly_accession", **kwargs, @@ -38,7 +38,7 @@ def __init__( Args: dir_col (str): The name of the column that contains the species directory - (default "species_dir"). + (default "outdir"). name_col (str): The name of the column that contains the assembly name (default "assembly_name"). 
accession_col (str): The name of the column that contains the accession @@ -142,12 +142,12 @@ def check_samplesheet(file_in, file_out): Example: This function checks that the samplesheet follows the following structure:: - species_dir,assembly_name,assembly_accession - darwin/data/fungi/Laetiporus_sulphureus,gfLaeSulp1.1,GCA_927399515.1 - darwin/data/mammals/Meles_meles,mMelMel3.2_paternal_haplotype,GCA_922984935.2 + outdir,assembly_name,assembly_accession + Laetiporus_sulphureus/gfLaeSulp1.1,gfLaeSulp1.1,GCA_927399515.1 + Meles_meles/mMelMel3.2_paternal_haplotype,mMelMel3.2_paternal_haplotype,GCA_922984935.2 """ required_columns = { - "species_dir", + "outdir", "assembly_name", "assembly_accession", } diff --git a/conf/modules.config b/conf/modules.config index 091bce5..c0bcbe4 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -15,7 +15,7 @@ process { withName: 'NCBI_DOWNLOAD' { maxForks = 3 publishDir = [ - path: { "${meta.species_dir}/assembly/release/${meta.assembly_name}/insdc" }, + path: { "${meta.outdir}/assembly" }, mode: 'copy', saveAs: { filename -> filename.endsWith('assembly_report.txt') || filename.endsWith('assembly_stats.txt') || filename.endsWith("ACCESSION") ? filename : null } ] @@ -23,7 +23,7 @@ process { withName: '.*:.*:PREPARE_UNMASKED_FASTA:.*' { publishDir = [ - path: { "${meta.species_dir}/assembly/release/${meta.assembly_name}/insdc" }, + path: { "${meta.outdir}/assembly" }, mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -31,7 +31,7 @@ process { withName: '.*:.*:(PREPARE_REPEAT_MASKED_FASTA:.*|PREPARE_REPEATS:TABIX_.*)' { publishDir = [ - path: { "${meta.species_dir}/analysis/${meta.assembly_name}/repeats/ncbi" }, + path: { "${meta.outdir}/repeats/ncbi" }, mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] diff --git a/docs/output.md b/docs/output.md index 009197f..312f315 100644 --- a/docs/output.md +++ b/docs/output.md @@ -4,7 +4,8 @@ This document describes the output produced by the pipeline. -The directories listed below will be created in the results directory (or `species_dir` when using a samplesheet) after the pipeline has finished. +The directories listed below will be created in a directory based on the `--outdir` command-line parameter and the `outdir` column of the samplesheet +after the pipeline has finished. All paths are relative to the top-level results directory. The directories comply with Tree of Life's canonical directory structure. @@ -23,20 +24,17 @@ Here are the files you can expect in the `assembly/` sub-directory. ```text assembly -└── release - └── gfLaeSulp1.1 - └── insdc - ├── ACCESSION - ├── GCA_927399515.1.assembly_report.txt - ├── GCA_927399515.1.assembly_stats.txt - ├── GCA_927399515.1.fa.dict - ├── GCA_927399515.1.fa.gz - ├── GCA_927399515.1.fa.gz.fai - ├── GCA_927399515.1.fa.gz.gzi - └── GCA_927399515.1.fa.gz.sizes +├── ACCESSION +├── GCA_927399515.1.assembly_report.txt +├── GCA_927399515.1.assembly_stats.txt +├── GCA_927399515.1.fa.dict +├── GCA_927399515.1.fa.gz +├── GCA_927399515.1.fa.gz.fai +├── GCA_927399515.1.fa.gz.gzi +└── GCA_927399515.1.fa.gz.sizes ``` -The directory structure includes the assembly name, e.g. `gfLaeSulp1.1`, and all files are named after the assembly accession, e.g. `GCA_927399515.1`. +All files are named after the assembly accession, e.g. `GCA_927399515.1`. - `GCA_*.assembly_report.txt` and `GCA_*.assembly_stats.txt`: report and statistics files, straight from the NCBI FTP - `GCA_*.fa.gz`: Unmasked assembly in Fasta format, compressed with `bgzip` (whose index is `GCA_*.fa.gz.gzi`) @@ -48,25 +46,23 @@ with the exception of `ACCESSION`, which contains a single line of text: the ass ### Primary analysis files -Here are the files you can expect in the `analysis/` sub-directory. 
+Here are the files you can expect in the `repeats/` sub-directory. ```text -analysis -└── gfLaeSulp1.1 - └── repeats - └── ncbi - ├── GCA_927399515.1.masked.ncbi.bed.gz - ├── GCA_927399515.1.masked.ncbi.bed.gz.csi - ├── GCA_927399515.1.masked.ncbi.bed.gz.tbi - ├── GCA_927399515.1.masked.ncbi.fa.dict - ├── GCA_927399515.1.masked.ncbi.fa.gz - ├── GCA_927399515.1.masked.ncbi.fa.gz.fai - ├── GCA_927399515.1.masked.ncbi.fa.gz.gzi - └── GCA_927399515.1.masked.ncbi.fa.gz.sizes +repeats +└── ncbi + ├── GCA_927399515.1.masked.ncbi.bed.gz + ├── GCA_927399515.1.masked.ncbi.bed.gz.csi + ├── GCA_927399515.1.masked.ncbi.bed.gz.tbi + ├── GCA_927399515.1.masked.ncbi.fa.dict + ├── GCA_927399515.1.masked.ncbi.fa.gz + ├── GCA_927399515.1.masked.ncbi.fa.gz.fai + ├── GCA_927399515.1.masked.ncbi.fa.gz.gzi + └── GCA_927399515.1.masked.ncbi.fa.gz.sizes ``` They all correspond to the repeat-masking analysis run by the NCBI themselves. Like for the `assembly/` sub-directory, -the directory structure includes the assembly name, e.g. `gfLaeSulp1.1`, and all files are named after the assembly accession, e.g. `GCA_927399515.1`. +all files are named after the assembly accession, e.g. `GCA_927399515.1`. - `GCA_*.masked.ncbi.fa.gz`: Masked assembly in Fasta format, compressed with `bgzip` (whose index is `GCA_*.fa.gz.gzi`) - `GCA_*.masked.ncbi.fa.gz.fai`: `samtools faidx` index, which allows accessing any region of the assembly in constant time diff --git a/docs/usage.md b/docs/usage.md index 0b120bc..d6c683f 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -30,14 +30,14 @@ The pipeline can download multiple assemblies at once, by providing them in a `. It has to be a comma-separated file with three columns, and a header row as shown in the examples below. 
```console -species_dir,assembly_name,assembly_accession +outdir,assembly_name,assembly_accession darwin/data/fungi/Laetiporus_sulphureus,gfLaeSulp1.1,GCA_927399515.1 darwin/data/mammals/Meles_meles,mMelMel3.2_paternal_haplotype,GCA_922984935.2 ``` | Column | Description | | -------------------- | -------------------------------------------------------------------------------- | -| `species_dir` | Base download directory for this species. Evaluated from `--outdir` if relative. | +| `outdir` | Base download directory for this species. Evaluated from `--outdir` if relative. | | `assembly_name` | Name of the assembly, as on the NCBI website, e.g. `gfLaeSulp1.1`. | | `assembly_accession` | Accession number of the assembly to download. Typically of the form `GCA_*.*`. | @@ -48,9 +48,7 @@ A samplesheet may contain: - only one row per assembly All samplesheet columns correspond exactly to their corresponding command-line parameter, -except `species_dir` which overrides or complements `--oudir`. -`species_dir` is used to fit the output of this pipeline into a directory structure compatible with the other pipelines -from Sanger Tree of Life. +except `outdir` which overrides or complements `--outdir`. An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. 
diff --git a/modules/local/ncbi_download.nf b/modules/local/ncbi_download.nf index 54196d2..ae63e52 100644 --- a/modules/local/ncbi_download.nf +++ b/modules/local/ncbi_download.nf @@ -12,7 +12,7 @@ process NCBI_DOWNLOAD { 'biocontainers/gnu-wget:1.18--h7132678_6' }" input: - tuple val(assembly_accession), val(assembly_name), val(species_dir) + tuple val(assembly_accession), val(assembly_name), val(outdir) output: tuple val(meta), path(filename_fasta) , emit: fasta @@ -36,7 +36,7 @@ process NCBI_DOWNLOAD { meta = [ id : assembly_accession, assembly_name : assembly_name, - species_dir : species_dir, + outdir : outdir, ] def prefix = task.ext.prefix ?: "${meta.id}" filename_assembly_report = "${prefix}.assembly_report.txt" @@ -45,8 +45,6 @@ process NCBI_DOWNLOAD { filename_accession = "ACCESSION" """ - #export https_proxy=http://wwwcache.sanger.ac.uk:3128 - #export http_proxy=http://wwwcache.sanger.ac.uk:3128 wget ${ftp_path}/${remote_filename_stem}_assembly_report.txt wget ${ftp_path}/${remote_filename_stem}_assembly_stats.txt wget ${ftp_path}/${remote_filename_stem}_genomic.fna.gz diff --git a/nextflow_schema.json b/nextflow_schema.json index 90a125c..bc19c9b 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -38,7 +38,7 @@ "pattern": "^\\S+\\.csv$", "schema": "assets/schema_input.json", "description": "Path to comma-separated file containing information about the assemblies to download. Used for bulk download of many assemblies.", - "help_text": "The file has to be a comma-separated file with three columns, and a header row. The columns names must be `species_dir`, `assembly_accession`, and `assembly_name`.", + "help_text": "The file has to be a comma-separated file with three columns, and a header row. 
The column names must be `outdir`, `assembly_accession`, and `assembly_name`.", "fa_icon": "fas fa-file-csv" }, "ftp_root": { diff --git a/subworkflows/local/download_genome.nf b/subworkflows/local/download_genome.nf index 0a57503..dd4da55 100644 --- a/subworkflows/local/download_genome.nf +++ b/subworkflows/local/download_genome.nf @@ -9,7 +9,7 @@ include { REMOVE_MASKING } from '../../modules/local/remove_masking' workflow DOWNLOAD_GENOME { take: - assembly_params // tuple(assembly_accession, assembly_name, species_dir) + assembly_params // tuple(assembly_accession, assembly_name, outdir) main: diff --git a/subworkflows/local/params_check.nf b/subworkflows/local/params_check.nf index 8992365..e9e4458 100644 --- a/subworkflows/local/params_check.nf +++ b/subworkflows/local/params_check.nf @@ -21,13 +21,13 @@ workflow PARAMS_CHECK { SAMPLESHEET_CHECK ( file(samplesheet, checkIfExists: true) ) .csv - // Provides species_dir, assembly_accession, and assembly_name + // Provides outdir, assembly_accession, and assembly_name .splitCsv ( header:true, sep:',' ) // Convert to tuple, as required by the download subworkflow .map { [ it["assembly_accession"], it["assembly_name"], - (it["species_dir"].startsWith("/") ? "" : outdir + "/") + it["species_dir"], + (it["outdir"].startsWith("/") ? 
"" : outdir + "/") + it["outdir"], ] } .set { ch_inputs } @@ -41,7 +41,7 @@ workflow PARAMS_CHECK { emit: - assembly_params = ch_inputs // channel: tuple(assembly_accession, assembly_name, species_dir) + assembly_params = ch_inputs // channel: tuple(assembly_accession, assembly_name, outdir) versions = ch_versions // channel: versions.yml } From 403f469d084d0631e0135f2cace320fdd4219491 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Sat, 23 Sep 2023 11:18:38 +0000 Subject: [PATCH 2/2] Made the relation between --outdir and the samplesheet outdir clearer --- docs/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index d6c683f..f785a05 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -15,7 +15,7 @@ The pipeline accepts command-one line arguments to specify a single genome to do - `--assembly_name`: The name of the assembly, - `--assembly_accession`: The accession number of the assembly, -- `--outdir`: Where to download the data. +- `--outdir`: Where the pipeline runtime information will be stored, and where data will be downloaded (except if absolute paths are given in the samplesheet). ```console nextflow run sanger-tol/insdcdownload --assembly_accession GCA_927399515.1 --assembly_name gfLaeSulp1.1 --outdir gfLaeSulp1.1_data