Skip to content

Commit

Permalink
Now use "outdir", which is equivalent to a ToL analysis directory
Browse files Browse the repository at this point in the history
  • Loading branch information
muffato committed Aug 4, 2023
1 parent e1b927d commit c75a738
Show file tree
Hide file tree
Showing 11 changed files with 54 additions and 60 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## v1.2.0 - [date]
## v2.0.0 - [date]

### `Fixed`

- The sample-sheet column `species_dir` is replaced with the `outdir` column which
represents where the assembly and repeats are downloaded (in immediate sub-directories)
- Relative paths in the sample-sheet are now evaluated from the `--outdir` parameter
- Memory usage rules for `samtools dict`
- Appropriate use of `tabix`'s TBI and CSI indexing, depending on the sequence lengths
Expand Down
14 changes: 7 additions & 7 deletions assets/samplesheet.csv
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
species_dir,assembly_name,assembly_accession
25g/data/echinoderms/Asterias_rubens,eAstRub1.3,GCA_902459465.3
25g/data/insects/Osmia_bicornis,iOsmBic2.1,GCA_907164935.1
25g/data/insects/Osmia_bicornis,iOsmBic2.1_alternate_haplotype,GCA_907164925.1
darwin/data/fungi/Laetiporus_sulphureus,gfLaeSulp1.1,GCA_927399515.1
darwin/data/insects/Noctua_fimbriata,ilNocFimb1.1,GCA_905163415.1
darwin/data/mammals/Meles_meles,mMelMel3.2_paternal_haplotype,GCA_922984935.2
outdir,assembly_name,assembly_accession
Asterias_rubens/eAstRub1.3,eAstRub1.3,GCA_902459465.3
Osmia_bicornis/iOsmBic2.1,iOsmBic2.1,GCA_907164935.1
Osmia_bicornis/iOsmBic2.1_alternate_haplotype,iOsmBic2.1_alternate_haplotype,GCA_907164925.1
Laetiporus_sulphureus/gfLaeSulp1.1,gfLaeSulp1.1,GCA_927399515.1
Noctua_fimbriata/ilNocFimb1.1,ilNocFimb1.1,GCA_905163415.1
Meles_meles/mMelMel3.2_paternal_haplotype,mMelMel3.2_paternal_haplotype,GCA_922984935.2
4 changes: 2 additions & 2 deletions assets/schema_input.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"items": {
"type": "object",
"properties": {
"species_dir": {
"outdir": {
"type": "string",
"pattern": "^\\S+$",
"errorMessage": "Species directory must be provided and exist"
Expand All @@ -23,6 +23,6 @@
"errorMessage": "Assembly accession number must be provided and be of the form GCA_*"
}
},
"required": ["species_dir", "assembly_name", "assembly_accession"]
"required": ["outdir", "assembly_name", "assembly_accession"]
}
}
12 changes: 6 additions & 6 deletions bin/check_samplesheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class RowChecker:

def __init__(
self,
dir_col="species_dir",
dir_col="outdir",
name_col="assembly_name",
accession_col="assembly_accession",
**kwargs,
Expand All @@ -38,7 +38,7 @@ def __init__(
Args:
dir_col (str): The name of the column that contains the species directory
(default "species_dir").
(default "outdir").
name_col (str): The name of the column that contains the assembly name
(default "assembly_name").
accession_col (str): The name of the column that contains the accession
Expand Down Expand Up @@ -142,12 +142,12 @@ def check_samplesheet(file_in, file_out):
Example:
This function checks that the samplesheet follows the following structure::
species_dir,assembly_name,assembly_accession
darwin/data/fungi/Laetiporus_sulphureus,gfLaeSulp1.1,GCA_927399515.1
darwin/data/mammals/Meles_meles,mMelMel3.2_paternal_haplotype,GCA_922984935.2
outdir,assembly_name,assembly_accession
Laetiporus_sulphureus/gfLaeSulp1.1,gfLaeSulp1.1,GCA_927399515.1
Meles_meles/mMelMel3.2_paternal_haplotype,mMelMel3.2_paternal_haplotype,GCA_922984935.2
"""
required_columns = {
"species_dir",
"outdir",
"assembly_name",
"assembly_accession",
}
Expand Down
6 changes: 3 additions & 3 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,23 @@ process {
withName: 'NCBI_DOWNLOAD' {
maxForks = 3
publishDir = [
path: { "${meta.species_dir}/assembly/release/${meta.assembly_name}/insdc" },
path: { "${meta.outdir}/assembly" },
mode: 'copy',
saveAs: { filename -> filename.endsWith('assembly_report.txt') || filename.endsWith('assembly_stats.txt') || filename.endsWith("ACCESSION") ? filename : null }
]
}

withName: '.*:.*:PREPARE_UNMASKED_FASTA:.*' {
publishDir = [
path: { "${meta.species_dir}/assembly/release/${meta.assembly_name}/insdc" },
path: { "${meta.outdir}/assembly" },
mode: 'copy',
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: '.*:.*:(PREPARE_REPEAT_MASKED_FASTA:.*|PREPARE_REPEATS:TABIX_.*)' {
publishDir = [
path: { "${meta.species_dir}/analysis/${meta.assembly_name}/repeats/ncbi" },
path: { "${meta.outdir}/repeats/ncbi" },
mode: 'copy',
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
Expand Down
50 changes: 23 additions & 27 deletions docs/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@

This document describes the output produced by the pipeline.

The directories listed below will be created in the results directory (or `species_dir` when using a samplesheet) after the pipeline has finished.
The directories listed below will be created in a directory based on the `--outdir` command-line parameter and the `outdir` column of the samplesheet, after the pipeline has finished.
All paths are relative to the top-level results directory.

The directories comply with Tree of Life's canonical directory structure.
Expand All @@ -23,20 +24,17 @@ Here are the files you can expect in the `assembly/` sub-directory.

```text
assembly
└── release
└── gfLaeSulp1.1
└── insdc
├── ACCESSION
├── GCA_927399515.1.assembly_report.txt
├── GCA_927399515.1.assembly_stats.txt
├── GCA_927399515.1.fa.dict
├── GCA_927399515.1.fa.gz
├── GCA_927399515.1.fa.gz.fai
├── GCA_927399515.1.fa.gz.gzi
└── GCA_927399515.1.fa.gz.sizes
├── ACCESSION
├── GCA_927399515.1.assembly_report.txt
├── GCA_927399515.1.assembly_stats.txt
├── GCA_927399515.1.fa.dict
├── GCA_927399515.1.fa.gz
├── GCA_927399515.1.fa.gz.fai
├── GCA_927399515.1.fa.gz.gzi
└── GCA_927399515.1.fa.gz.sizes
```

The directory structure includes the assembly name, e.g. `gfLaeSulp1.1`, and all files are named after the assembly accession, e.g. `GCA_927399515.1`.
All files are named after the assembly accession, e.g. `GCA_927399515.1`.

- `GCA_*.assembly_report.txt` and `GCA_*.assembly_stats.txt`: report and statistics files, straight from the NCBI FTP
- `GCA_*.fa.gz`: Unmasked assembly in Fasta format, compressed with `bgzip` (whose index is `GCA_*.fa.gz.gzi`)
Expand All @@ -48,25 +46,23 @@ with the exception of `ACCESSION`, which contains a single line of text: the ass

### Primary analysis files

Here are the files you can expect in the `analysis/` sub-directory.
Here are the files you can expect in the `repeats/` sub-directory.

```text
analysis
└── gfLaeSulp1.1
└── repeats
└── ncbi
├── GCA_927399515.1.masked.ncbi.bed.gz
├── GCA_927399515.1.masked.ncbi.bed.gz.csi
├── GCA_927399515.1.masked.ncbi.bed.gz.tbi
├── GCA_927399515.1.masked.ncbi.fa.dict
├── GCA_927399515.1.masked.ncbi.fa.gz
├── GCA_927399515.1.masked.ncbi.fa.gz.fai
├── GCA_927399515.1.masked.ncbi.fa.gz.gzi
└── GCA_927399515.1.masked.ncbi.fa.gz.sizes
repeats
└── ncbi
├── GCA_927399515.1.masked.ncbi.bed.gz
├── GCA_927399515.1.masked.ncbi.bed.gz.csi
├── GCA_927399515.1.masked.ncbi.bed.gz.tbi
├── GCA_927399515.1.masked.ncbi.fa.dict
├── GCA_927399515.1.masked.ncbi.fa.gz
├── GCA_927399515.1.masked.ncbi.fa.gz.fai
├── GCA_927399515.1.masked.ncbi.fa.gz.gzi
└── GCA_927399515.1.masked.ncbi.fa.gz.sizes
```

They all correspond to the repeat-masking analysis run by the NCBI themselves. Like for the `assembly/` sub-directory,
the directory structure includes the assembly name, e.g. `gfLaeSulp1.1`, and all files are named after the assembly accession, e.g. `GCA_927399515.1`.
all files are named after the assembly accession, e.g. `GCA_927399515.1`.

- `GCA_*.masked.ncbi.fa.gz`: Masked assembly in Fasta format, compressed with `bgzip` (whose index is `GCA_*.masked.ncbi.fa.gz.gzi`)
- `GCA_*.masked.ncbi.fa.gz.fai`: `samtools faidx` index, which allows accessing any region of the assembly in constant time
Expand Down
8 changes: 3 additions & 5 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,14 @@ The pipeline can download multiple assemblies at once, by providing them in a `.
It has to be a comma-separated file with three columns, and a header row as shown in the examples below.

```console
species_dir,assembly_name,assembly_accession
outdir,assembly_name,assembly_accession
Laetiporus_sulphureus/gfLaeSulp1.1,gfLaeSulp1.1,GCA_927399515.1
Meles_meles/mMelMel3.2_paternal_haplotype,mMelMel3.2_paternal_haplotype,GCA_922984935.2
```

| Column | Description |
| -------------------- | -------------------------------------------------------------------------------- |
| `species_dir` | Base download directory for this species. Evaluated from `--outdir` if relative. |
| `outdir`             | Base download directory for this assembly. Evaluated from `--outdir` if relative. |
| `assembly_name` | Name of the assembly, as on the NCBI website, e.g. `gfLaeSulp1.1`. |
| `assembly_accession` | Accession number of the assembly to download. Typically of the form `GCA_*.*`. |

Expand All @@ -48,9 +48,7 @@ A samplesheet may contain:
- only one row per assembly

All samplesheet columns correspond exactly to their corresponding command-line parameter,
except `species_dir` which overrides or complements `--oudir`.
`species_dir` is used to fit the output of this pipeline into a directory structure compatible with the other pipelines
from Sanger Tree of Life.
except `outdir` which overrides or complements `--outdir`.

An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.

Expand Down
6 changes: 2 additions & 4 deletions modules/local/ncbi_download.nf
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ process NCBI_DOWNLOAD {
'biocontainers/gnu-wget:1.18--h7132678_6' }"

input:
tuple val(assembly_accession), val(assembly_name), val(species_dir)
tuple val(assembly_accession), val(assembly_name), val(outdir)

output:
tuple val(meta), path(filename_fasta) , emit: fasta
Expand All @@ -36,7 +36,7 @@ process NCBI_DOWNLOAD {
meta = [
id : assembly_accession,
assembly_name : assembly_name,
species_dir : species_dir,
outdir : outdir,
]
def prefix = task.ext.prefix ?: "${meta.id}"
filename_assembly_report = "${prefix}.assembly_report.txt"
Expand All @@ -45,8 +45,6 @@ process NCBI_DOWNLOAD {
filename_accession = "ACCESSION"

"""
#export https_proxy=http://wwwcache.sanger.ac.uk:3128
#export http_proxy=http://wwwcache.sanger.ac.uk:3128
wget ${ftp_path}/${remote_filename_stem}_assembly_report.txt
wget ${ftp_path}/${remote_filename_stem}_assembly_stats.txt
wget ${ftp_path}/${remote_filename_stem}_genomic.fna.gz
Expand Down
2 changes: 1 addition & 1 deletion nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
"pattern": "^\\S+\\.csv$",
"schema": "assets/schema_input.json",
"description": "Path to comma-separated file containing information about the assemblies to download. Used for bulk download of many assemblies.",
"help_text": "The file has to be a comma-separated file with three columns, and a header row. The columns names must be `species_dir`, `assembly_accession`, and `assembly_name`.",
"help_text": "The file has to be a comma-separated file with three columns, and a header row. The columns names must be `outdir`, `assembly_accession`, and `assembly_name`.",
"fa_icon": "fas fa-file-csv"
},
"ftp_root": {
Expand Down
2 changes: 1 addition & 1 deletion subworkflows/local/download_genome.nf
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ include { REMOVE_MASKING } from '../../modules/local/remove_masking'
workflow DOWNLOAD_GENOME {

take:
assembly_params // tuple(assembly_accession, assembly_name, species_dir)
assembly_params // tuple(assembly_accession, assembly_name, outdir)


main:
Expand Down
6 changes: 3 additions & 3 deletions subworkflows/local/params_check.nf
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,13 @@ workflow PARAMS_CHECK {

SAMPLESHEET_CHECK ( file(samplesheet, checkIfExists: true) )
.csv
// Provides species_dir, assembly_accession, and assembly_name
// Provides outdir, assembly_accession, and assembly_name
.splitCsv ( header:true, sep:',' )
// Convert to tuple, as required by the download subworkflow
.map { [
it["assembly_accession"],
it["assembly_name"],
(it["species_dir"].startsWith("/") ? "" : outdir + "/") + it["species_dir"],
(it["outdir"].startsWith("/") ? "" : outdir + "/") + it["outdir"],
] }
.set { ch_inputs }

Expand All @@ -41,7 +41,7 @@ workflow PARAMS_CHECK {


emit:
assembly_params = ch_inputs // channel: tuple(assembly_accession, assembly_name, species_dir)
assembly_params = ch_inputs // channel: tuple(assembly_accession, assembly_name, outdir)
versions = ch_versions // channel: versions.yml
}

0 comments on commit c75a738

Please sign in to comment.