Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rework of the output directories #12

Merged
merged 2 commits into from
Oct 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## v1.2.0 - [date]
## v2.0.0 - [date]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For v2 are you planning to update to the new CHANGELOG format? I personally find that more informative.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Discussed on Slack. The answer is yes


### `Fixed`

- The sample-sheet column `species_dir` is replaced with the `outdir` column which
represents where the assembly and repeats are downloaded (in immediate sub-directories)
- Relative paths in the sample-sheet are now evaluated from the `--outdir` parameter
- Memory usage rules for `samtools dict`
- Appropriate use of `tabix`'s TBI and CSI indexing, depending on the sequence lengths
Expand Down
14 changes: 7 additions & 7 deletions assets/samplesheet.csv
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
species_dir,assembly_name,assembly_accession
25g/data/echinoderms/Asterias_rubens,eAstRub1.3,GCA_902459465.3
25g/data/insects/Osmia_bicornis,iOsmBic2.1,GCA_907164935.1
25g/data/insects/Osmia_bicornis,iOsmBic2.1_alternate_haplotype,GCA_907164925.1
darwin/data/fungi/Laetiporus_sulphureus,gfLaeSulp1.1,GCA_927399515.1
darwin/data/insects/Noctua_fimbriata,ilNocFimb1.1,GCA_905163415.1
darwin/data/mammals/Meles_meles,mMelMel3.2_paternal_haplotype,GCA_922984935.2
outdir,assembly_name,assembly_accession
Asterias_rubens/eAstRub1.3,eAstRub1.3,GCA_902459465.3
Osmia_bicornis/iOsmBic2.1,iOsmBic2.1,GCA_907164935.1
Osmia_bicornis/iOsmBic2.1_alternate_haplotype,iOsmBic2.1_alternate_haplotype,GCA_907164925.1
Laetiporus_sulphureus/gfLaeSulp1.1,gfLaeSulp1.1,GCA_927399515.1
Noctua_fimbriata/ilNocFimb1.1,ilNocFimb1.1,GCA_905163415.1
Meles_meles/mMelMel3.2_paternal_haplotype,mMelMel3.2_paternal_haplotype,GCA_922984935.2
4 changes: 2 additions & 2 deletions assets/schema_input.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"items": {
"type": "object",
"properties": {
"species_dir": {
"outdir": {
"type": "string",
"pattern": "^\\S+$",
"errorMessage": "Output directory must be provided and cannot contain spaces"
Expand All @@ -23,6 +23,6 @@
"errorMessage": "Assembly accession number must be provided and be of the form GCA_*"
}
},
"required": ["species_dir", "assembly_name", "assembly_accession"]
"required": ["outdir", "assembly_name", "assembly_accession"]
}
}
12 changes: 6 additions & 6 deletions bin/check_samplesheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class RowChecker:

def __init__(
self,
dir_col="species_dir",
dir_col="outdir",
name_col="assembly_name",
accession_col="assembly_accession",
**kwargs,
Expand All @@ -38,7 +38,7 @@ def __init__(

Args:
dir_col (str): The name of the column that contains the output directory
(default "species_dir").
(default "outdir").
name_col (str): The name of the column that contains the assembly name
(default "assembly_name").
accession_col (str): The name of the column that contains the accession
Expand Down Expand Up @@ -142,12 +142,12 @@ def check_samplesheet(file_in, file_out):
Example:
This function checks that the samplesheet follows the following structure::

species_dir,assembly_name,assembly_accession
darwin/data/fungi/Laetiporus_sulphureus,gfLaeSulp1.1,GCA_927399515.1
darwin/data/mammals/Meles_meles,mMelMel3.2_paternal_haplotype,GCA_922984935.2
outdir,assembly_name,assembly_accession
Laetiporus_sulphureus/gfLaeSulp1.1,gfLaeSulp1.1,GCA_927399515.1
Meles_meles/mMelMel3.2_paternal_haplotype,mMelMel3.2_paternal_haplotype,GCA_922984935.2
"""
required_columns = {
"species_dir",
"outdir",
"assembly_name",
"assembly_accession",
}
Expand Down
6 changes: 3 additions & 3 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,23 @@ process {
withName: 'NCBI_DOWNLOAD' {
maxForks = 3
publishDir = [
path: { "${meta.species_dir}/assembly/release/${meta.assembly_name}/insdc" },
path: { "${meta.outdir}/assembly" },
mode: 'copy',
saveAs: { filename -> filename.endsWith('assembly_report.txt') || filename.endsWith('assembly_stats.txt') || filename.endsWith("ACCESSION") ? filename : null }
]
}

withName: '.*:.*:PREPARE_UNMASKED_FASTA:.*' {
publishDir = [
path: { "${meta.species_dir}/assembly/release/${meta.assembly_name}/insdc" },
path: { "${meta.outdir}/assembly" },
mode: 'copy',
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: '.*:.*:(PREPARE_REPEAT_MASKED_FASTA:.*|PREPARE_REPEATS:TABIX_.*)' {
publishDir = [
path: { "${meta.species_dir}/analysis/${meta.assembly_name}/repeats/ncbi" },
path: { "${meta.outdir}/repeats/ncbi" },
mode: 'copy',
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
Expand Down
50 changes: 23 additions & 27 deletions docs/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@

This document describes the output produced by the pipeline.

The directories listed below will be created in the results directory (or `species_dir` when using a samplesheet) after the pipeline has finished.
The directories listed below will be created in a directory based on the `--outdir` command-line parameter and the `outdir` column of the samplesheet,
after the pipeline has finished.
All paths are relative to the top-level results directory.

The directories comply with Tree of Life's canonical directory structure.
Expand All @@ -23,20 +24,17 @@ Here are the files you can expect in the `assembly/` sub-directory.

```text
assembly
└── release
└── gfLaeSulp1.1
└── insdc
├── ACCESSION
├── GCA_927399515.1.assembly_report.txt
├── GCA_927399515.1.assembly_stats.txt
├── GCA_927399515.1.fa.dict
├── GCA_927399515.1.fa.gz
├── GCA_927399515.1.fa.gz.fai
├── GCA_927399515.1.fa.gz.gzi
└── GCA_927399515.1.fa.gz.sizes
├── ACCESSION
├── GCA_927399515.1.assembly_report.txt
├── GCA_927399515.1.assembly_stats.txt
├── GCA_927399515.1.fa.dict
├── GCA_927399515.1.fa.gz
├── GCA_927399515.1.fa.gz.fai
├── GCA_927399515.1.fa.gz.gzi
└── GCA_927399515.1.fa.gz.sizes
```

The directory structure includes the assembly name, e.g. `gfLaeSulp1.1`, and all files are named after the assembly accession, e.g. `GCA_927399515.1`.
All files are named after the assembly accession, e.g. `GCA_927399515.1`.

- `GCA_*.assembly_report.txt` and `GCA_*.assembly_stats.txt`: report and statistics files, straight from the NCBI FTP
- `GCA_*.fa.gz`: Unmasked assembly in Fasta format, compressed with `bgzip` (whose index is `GCA_*.fa.gz.gzi`)
Expand All @@ -48,25 +46,23 @@ with the exception of `ACCESSION`, which contains a single line of text: the ass

### Primary analysis files

Here are the files you can expect in the `analysis/` sub-directory.
Here are the files you can expect in the `repeats/` sub-directory.

```text
analysis
└── gfLaeSulp1.1
└── repeats
└── ncbi
├── GCA_927399515.1.masked.ncbi.bed.gz
├── GCA_927399515.1.masked.ncbi.bed.gz.csi
├── GCA_927399515.1.masked.ncbi.bed.gz.tbi
├── GCA_927399515.1.masked.ncbi.fa.dict
├── GCA_927399515.1.masked.ncbi.fa.gz
├── GCA_927399515.1.masked.ncbi.fa.gz.fai
├── GCA_927399515.1.masked.ncbi.fa.gz.gzi
└── GCA_927399515.1.masked.ncbi.fa.gz.sizes
repeats
└── ncbi
├── GCA_927399515.1.masked.ncbi.bed.gz
├── GCA_927399515.1.masked.ncbi.bed.gz.csi
├── GCA_927399515.1.masked.ncbi.bed.gz.tbi
├── GCA_927399515.1.masked.ncbi.fa.dict
├── GCA_927399515.1.masked.ncbi.fa.gz
├── GCA_927399515.1.masked.ncbi.fa.gz.fai
├── GCA_927399515.1.masked.ncbi.fa.gz.gzi
└── GCA_927399515.1.masked.ncbi.fa.gz.sizes
```

They all correspond to the repeat-masking analysis run by the NCBI themselves. Like for the `assembly/` sub-directory,
the directory structure includes the assembly name, e.g. `gfLaeSulp1.1`, and all files are named after the assembly accession, e.g. `GCA_927399515.1`.
all files are named after the assembly accession, e.g. `GCA_927399515.1`.

- `GCA_*.masked.ncbi.fa.gz`: Masked assembly in Fasta format, compressed with `bgzip` (whose index is `GCA_*.fa.gz.gzi`)
- `GCA_*.masked.ncbi.fa.gz.fai`: `samtools faidx` index, which allows accessing any region of the assembly in constant time
Expand Down
10 changes: 4 additions & 6 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ The pipeline accepts command-one line arguments to specify a single genome to do

- `--assembly_name`: The name of the assembly,
- `--assembly_accession`: The accession number of the assembly,
- `--outdir`: Where to download the data.
- `--outdir`: Where the pipeline runtime information will be stored, and where data will be downloaded (except if absolute paths are given in the samplesheet).

```console
nextflow run sanger-tol/insdcdownload --assembly_accession GCA_927399515.1 --assembly_name gfLaeSulp1.1 --outdir gfLaeSulp1.1_data
Expand All @@ -30,14 +30,14 @@ The pipeline can download multiple assemblies at once, by providing them in a `.
It has to be a comma-separated file with three columns, and a header row as shown in the examples below.

```console
species_dir,assembly_name,assembly_accession
outdir,assembly_name,assembly_accession
darwin/data/fungi/Laetiporus_sulphureus,gfLaeSulp1.1,GCA_927399515.1
darwin/data/mammals/Meles_meles,mMelMel3.2_paternal_haplotype,GCA_922984935.2
```

| Column | Description |
| -------------------- | -------------------------------------------------------------------------------- |
| `species_dir` | Base download directory for this species. Evaluated from `--outdir` if relative. |
| `outdir` | Base download directory for this species. Evaluated from `--outdir` if relative. |
| `assembly_name` | Name of the assembly, as on the NCBI website, e.g. `gfLaeSulp1.1`. |
| `assembly_accession` | Accession number of the assembly to download. Typically of the form `GCA_*.*`. |

Expand All @@ -48,9 +48,7 @@ A samplesheet may contain:
- only one row per assembly

All samplesheet columns correspond exactly to their corresponding command-line parameter,
except `species_dir` which overrides or complements `--oudir`.
`species_dir` is used to fit the output of this pipeline into a directory structure compatible with the other pipelines
from Sanger Tree of Life.
except `outdir` which overrides or complements `--outdir`.

An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.

Expand Down
6 changes: 2 additions & 4 deletions modules/local/ncbi_download.nf
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ process NCBI_DOWNLOAD {
'biocontainers/gnu-wget:1.18--h7132678_6' }"

input:
tuple val(assembly_accession), val(assembly_name), val(species_dir)
tuple val(assembly_accession), val(assembly_name), val(outdir)

output:
tuple val(meta), path(filename_fasta) , emit: fasta
Expand All @@ -36,7 +36,7 @@ process NCBI_DOWNLOAD {
meta = [
id : assembly_accession,
assembly_name : assembly_name,
species_dir : species_dir,
outdir : outdir,
]
def prefix = task.ext.prefix ?: "${meta.id}"
filename_assembly_report = "${prefix}.assembly_report.txt"
Expand All @@ -45,8 +45,6 @@ process NCBI_DOWNLOAD {
filename_accession = "ACCESSION"

"""
#export https_proxy=http://wwwcache.sanger.ac.uk:3128
#export http_proxy=http://wwwcache.sanger.ac.uk:3128
wget ${ftp_path}/${remote_filename_stem}_assembly_report.txt
wget ${ftp_path}/${remote_filename_stem}_assembly_stats.txt
wget ${ftp_path}/${remote_filename_stem}_genomic.fna.gz
Expand Down
2 changes: 1 addition & 1 deletion nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
"pattern": "^\\S+\\.csv$",
"schema": "assets/schema_input.json",
"description": "Path to comma-separated file containing information about the assemblies to download. Used for bulk download of many assemblies.",
"help_text": "The file has to be a comma-separated file with three columns, and a header row. The columns names must be `species_dir`, `assembly_accession`, and `assembly_name`.",
"help_text": "The file has to be a comma-separated file with three columns, and a header row. The columns names must be `outdir`, `assembly_accession`, and `assembly_name`.",
"fa_icon": "fas fa-file-csv"
},
"ftp_root": {
Expand Down
2 changes: 1 addition & 1 deletion subworkflows/local/download_genome.nf
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ include { REMOVE_MASKING } from '../../modules/local/remove_masking'
workflow DOWNLOAD_GENOME {

take:
assembly_params // tuple(assembly_accession, assembly_name, species_dir)
assembly_params // tuple(assembly_accession, assembly_name, outdir)


main:
Expand Down
6 changes: 3 additions & 3 deletions subworkflows/local/params_check.nf
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,13 @@ workflow PARAMS_CHECK {

SAMPLESHEET_CHECK ( file(samplesheet, checkIfExists: true) )
.csv
// Provides species_dir, assembly_accession, and assembly_name
// Provides outdir, assembly_accession, and assembly_name
.splitCsv ( header:true, sep:',' )
// Convert to tuple, as required by the download subworkflow
.map { [
it["assembly_accession"],
it["assembly_name"],
(it["species_dir"].startsWith("/") ? "" : outdir + "/") + it["species_dir"],
(it["outdir"].startsWith("/") ? "" : outdir + "/") + it["outdir"],
] }
.set { ch_inputs }

Expand All @@ -41,7 +41,7 @@ workflow PARAMS_CHECK {


emit:
assembly_params = ch_inputs // channel: tuple(assembly_accession, assembly_name, species_dir)
assembly_params = ch_inputs // channel: tuple(assembly_accession, assembly_name, outdir)
versions = ch_versions // channel: versions.yml
}