From 1b7f285d6e9cb07ef6645241f6931f5c2f3931d8 Mon Sep 17 00:00:00 2001
From: Matthieu Muffato <mm49@sanger.ac.uk>
Date: Thu, 3 Aug 2023 16:29:32 +0000
Subject: [PATCH 1/2] Now use "outdir", which is equivalent to a ToL analysis
 directory

---
 CHANGELOG.md                          |  4 ++-
 assets/samplesheet.csv                | 14 ++++----
 assets/schema_input.json              |  4 +--
 bin/check_samplesheet.py              | 12 +++----
 conf/modules.config                   |  6 ++--
 docs/output.md                        | 50 ++++++++++++---------------
 docs/usage.md                         |  8 ++---
 modules/local/ncbi_download.nf        |  6 ++--
 nextflow_schema.json                  |  2 +-
 subworkflows/local/download_genome.nf |  2 +-
 subworkflows/local/params_check.nf    |  6 ++--
 11 files changed, 54 insertions(+), 60 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b518284..e405031 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,10 +3,12 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## v1.2.0 - [date]
+## v2.0.0 - [date]
 
 ### `Fixed`
 
+- The sample-sheet column `species_dir` is replaced with the `outdir` column which
+  represents where the assembly and repeats are downloaded (in immediate sub-directories)
 - Relative paths in the sample-sheet are now evaluated from the `--outdir` parameter
 - Memory usage rules for `samtools dict`
 - Appropriate use of `tabix`'s TBI and CSI indexing, depending on the sequence lengths
diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv
index 49f1be2..0f2f8be 100644
--- a/assets/samplesheet.csv
+++ b/assets/samplesheet.csv
@@ -1,7 +1,7 @@
-species_dir,assembly_name,assembly_accession
-25g/data/echinoderms/Asterias_rubens,eAstRub1.3,GCA_902459465.3
-25g/data/insects/Osmia_bicornis,iOsmBic2.1,GCA_907164935.1
-25g/data/insects/Osmia_bicornis,iOsmBic2.1_alternate_haplotype,GCA_907164925.1
-darwin/data/fungi/Laetiporus_sulphureus,gfLaeSulp1.1,GCA_927399515.1
-darwin/data/insects/Noctua_fimbriata,ilNocFimb1.1,GCA_905163415.1
-darwin/data/mammals/Meles_meles,mMelMel3.2_paternal_haplotype,GCA_922984935.2
+outdir,assembly_name,assembly_accession
+Asterias_rubens/eAstRub1.3,eAstRub1.3,GCA_902459465.3
+Osmia_bicornis/iOsmBic2.1,iOsmBic2.1,GCA_907164935.1
+Osmia_bicornis/iOsmBic2.1_alternate_haplotype,iOsmBic2.1_alternate_haplotype,GCA_907164925.1
+Laetiporus_sulphureus/gfLaeSulp1.1,gfLaeSulp1.1,GCA_927399515.1
+Noctua_fimbriata/ilNocFimb1.1,ilNocFimb1.1,GCA_905163415.1
+Meles_meles/mMelMel3.2_paternal_haplotype,mMelMel3.2_paternal_haplotype,GCA_922984935.2
diff --git a/assets/schema_input.json b/assets/schema_input.json
index ed91197..dafbfaa 100644
--- a/assets/schema_input.json
+++ b/assets/schema_input.json
@@ -7,7 +7,7 @@
     "items": {
         "type": "object",
         "properties": {
-            "species_dir": {
+            "outdir": {
                 "type": "string",
                 "pattern": "^\\S+$",
                 "errorMessage": "Species directory must be provided and exist"
@@ -23,6 +23,6 @@
                 "errorMessage": "Assembly accession number must be provided and be of the form GCA_*"
             }
         },
-        "required": ["species_dir", "assembly_name", "assembly_accession"]
+        "required": ["outdir", "assembly_name", "assembly_accession"]
     }
 }
diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
index de6852e..aa8bfc4 100755
--- a/bin/check_samplesheet.py
+++ b/bin/check_samplesheet.py
@@ -28,7 +28,7 @@ class RowChecker:
 
     def __init__(
         self,
-        dir_col="species_dir",
+        dir_col="outdir",
         name_col="assembly_name",
         accession_col="assembly_accession",
         **kwargs,
@@ -38,7 +38,7 @@ def __init__(
 
         Args:
             dir_col (str): The name of the column that contains the species directory
-                (default "species_dir").
+                (default "outdir").
             name_col (str): The name of the column that contains the assembly name
                 (default "assembly_name").
             accession_col (str): The name of the column that contains the accession
@@ -142,12 +142,12 @@ def check_samplesheet(file_in, file_out):
     Example:
         This function checks that the samplesheet follows the following structure::
 
-            species_dir,assembly_name,assembly_accession
-            darwin/data/fungi/Laetiporus_sulphureus,gfLaeSulp1.1,GCA_927399515.1
-            darwin/data/mammals/Meles_meles,mMelMel3.2_paternal_haplotype,GCA_922984935.2
+            outdir,assembly_name,assembly_accession
+            Laetiporus_sulphureus/gfLaeSulp1.1,gfLaeSulp1.1,GCA_927399515.1
+            Meles_meles/mMelMel3.2_paternal_haplotype,mMelMel3.2_paternal_haplotype,GCA_922984935.2
     """
     required_columns = {
-        "species_dir",
+        "outdir",
         "assembly_name",
         "assembly_accession",
     }
diff --git a/conf/modules.config b/conf/modules.config
index 091bce5..c0bcbe4 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -15,7 +15,7 @@ process {
     withName: 'NCBI_DOWNLOAD' {
         maxForks = 3
         publishDir = [
-            path: { "${meta.species_dir}/assembly/release/${meta.assembly_name}/insdc" },
+            path: { "${meta.outdir}/assembly" },
             mode: 'copy',
             saveAs: { filename -> filename.endsWith('assembly_report.txt') || filename.endsWith('assembly_stats.txt') || filename.endsWith("ACCESSION") ? filename : null }
         ]
@@ -23,7 +23,7 @@ process {
 
     withName: '.*:.*:PREPARE_UNMASKED_FASTA:.*' {
         publishDir = [
-            path: { "${meta.species_dir}/assembly/release/${meta.assembly_name}/insdc" },
+            path: { "${meta.outdir}/assembly" },
             mode: 'copy',
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
         ]
@@ -31,7 +31,7 @@ process {
 
     withName: '.*:.*:(PREPARE_REPEAT_MASKED_FASTA:.*|PREPARE_REPEATS:TABIX_.*)' {
         publishDir = [
-            path: { "${meta.species_dir}/analysis/${meta.assembly_name}/repeats/ncbi" },
+            path: { "${meta.outdir}/repeats/ncbi" },
             mode: 'copy',
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
         ]
diff --git a/docs/output.md b/docs/output.md
index 009197f..312f315 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -4,7 +4,8 @@
 
 This document describes the output produced by the pipeline.
 
-The directories listed below will be created in the results directory (or `species_dir` when using a samplesheet) after the pipeline has finished.
+The directories listed below will be created in a directory based on the `--outdir` command-line parameter and the `outdir` column of the samplesheet
+(if one is provided) after the pipeline has finished.
 All paths are relative to the top-level results directory.
 
 The directories comply with Tree of Life's canonical directory structure.
@@ -23,20 +24,17 @@ Here are the files you can expect in the `assembly/` sub-directory.
 
 ```text
 assembly
-└── release
-    └── gfLaeSulp1.1
-        └── insdc
-            ├── ACCESSION
-            ├── GCA_927399515.1.assembly_report.txt
-            ├── GCA_927399515.1.assembly_stats.txt
-            ├── GCA_927399515.1.fa.dict
-            ├── GCA_927399515.1.fa.gz
-            ├── GCA_927399515.1.fa.gz.fai
-            ├── GCA_927399515.1.fa.gz.gzi
-            └── GCA_927399515.1.fa.gz.sizes
+├── ACCESSION
+├── GCA_927399515.1.assembly_report.txt
+├── GCA_927399515.1.assembly_stats.txt
+├── GCA_927399515.1.fa.dict
+├── GCA_927399515.1.fa.gz
+├── GCA_927399515.1.fa.gz.fai
+├── GCA_927399515.1.fa.gz.gzi
+└── GCA_927399515.1.fa.gz.sizes
 ```
 
-The directory structure includes the assembly name, e.g. `gfLaeSulp1.1`, and all files are named after the assembly accession, e.g. `GCA_927399515.1`.
+All files are named after the assembly accession, e.g. `GCA_927399515.1`.
 
 - `GCA_*.assembly_report.txt` and `GCA_*.assembly_stats.txt`: report and statistics files, straight from the NCBI FTP
 - `GCA_*.fa.gz`: Unmasked assembly in Fasta format, compressed with `bgzip` (whose index is `GCA_*.fa.gz.gzi`)
@@ -48,25 +46,23 @@ with the exception of `ACCESSION`, which contains a single line of text: the ass
 
 ### Primary analysis files
 
-Here are the files you can expect in the `analysis/` sub-directory.
+Here are the files you can expect in the `repeats/` sub-directory.
 
 ```text
-analysis
-└── gfLaeSulp1.1
-    └── repeats
-        └── ncbi
-            ├── GCA_927399515.1.masked.ncbi.bed.gz
-            ├── GCA_927399515.1.masked.ncbi.bed.gz.csi
-            ├── GCA_927399515.1.masked.ncbi.bed.gz.tbi
-            ├── GCA_927399515.1.masked.ncbi.fa.dict
-            ├── GCA_927399515.1.masked.ncbi.fa.gz
-            ├── GCA_927399515.1.masked.ncbi.fa.gz.fai
-            ├── GCA_927399515.1.masked.ncbi.fa.gz.gzi
-            └── GCA_927399515.1.masked.ncbi.fa.gz.sizes
+repeats
+└── ncbi
+    ├── GCA_927399515.1.masked.ncbi.bed.gz
+    ├── GCA_927399515.1.masked.ncbi.bed.gz.csi
+    ├── GCA_927399515.1.masked.ncbi.bed.gz.tbi
+    ├── GCA_927399515.1.masked.ncbi.fa.dict
+    ├── GCA_927399515.1.masked.ncbi.fa.gz
+    ├── GCA_927399515.1.masked.ncbi.fa.gz.fai
+    ├── GCA_927399515.1.masked.ncbi.fa.gz.gzi
+    └── GCA_927399515.1.masked.ncbi.fa.gz.sizes
 ```
 
 They all correspond to the repeat-masking analysis run by the NCBI themselves. Like for the `assembly/` sub-directory,
-the directory structure includes the assembly name, e.g. `gfLaeSulp1.1`, and all files are named after the assembly accession, e.g. `GCA_927399515.1`.
+all files are named after the assembly accession, e.g. `GCA_927399515.1`.
 
 - `GCA_*.masked.ncbi.fa.gz`: Masked assembly in Fasta format, compressed with `bgzip` (whose index is `GCA_*.fa.gz.gzi`)
 - `GCA_*.masked.ncbi.fa.gz.fai`: `samtools faidx` index, which allows accessing any region of the assembly in constant time
diff --git a/docs/usage.md b/docs/usage.md
index 0b120bc..d6c683f 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -30,14 +30,14 @@ The pipeline can download multiple assemblies at once, by providing them in a `.
 It has to be a comma-separated file with three columns, and a header row as shown in the examples below.
 
 ```console
-species_dir,assembly_name,assembly_accession
+outdir,assembly_name,assembly_accession
 darwin/data/fungi/Laetiporus_sulphureus,gfLaeSulp1.1,GCA_927399515.1
 darwin/data/mammals/Meles_meles,mMelMel3.2_paternal_haplotype,GCA_922984935.2
 ```
 
 | Column               | Description                                                                      |
 | -------------------- | -------------------------------------------------------------------------------- |
-| `species_dir`        | Base download directory for this species. Evaluated from `--outdir` if relative. |
+| `outdir`             | Base download directory of the assembly. Evaluated from `--outdir` if relative.  |
 | `assembly_name`      | Name of the assembly, as on the NCBI website, e.g. `gfLaeSulp1.1`.               |
 | `assembly_accession` | Accession number of the assembly to download. Typically of the form `GCA_*.*`.   |
 
@@ -48,9 +48,7 @@ A samplesheet may contain:
 - only one row per assembly
 
 All samplesheet columns correspond exactly to their corresponding command-line parameter,
-except `species_dir` which overrides or complements `--oudir`.
-`species_dir` is used to fit the output of this pipeline into a directory structure compatible with the other pipelines
-from Sanger Tree of Life.
+except `outdir` which overrides or complements `--outdir`.
 
 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
 
diff --git a/modules/local/ncbi_download.nf b/modules/local/ncbi_download.nf
index 54196d2..ae63e52 100644
--- a/modules/local/ncbi_download.nf
+++ b/modules/local/ncbi_download.nf
@@ -12,7 +12,7 @@ process NCBI_DOWNLOAD {
         'biocontainers/gnu-wget:1.18--h7132678_6' }"
 
     input:
-    tuple val(assembly_accession), val(assembly_name), val(species_dir)
+    tuple val(assembly_accession), val(assembly_name), val(outdir)
 
     output:
     tuple val(meta), path(filename_fasta)          , emit: fasta
@@ -36,7 +36,7 @@ process NCBI_DOWNLOAD {
     meta = [
         id : assembly_accession,
         assembly_name : assembly_name,
-        species_dir : species_dir,
+        outdir : outdir,
     ]
     def prefix = task.ext.prefix ?: "${meta.id}"
     filename_assembly_report = "${prefix}.assembly_report.txt"
@@ -45,8 +45,6 @@ process NCBI_DOWNLOAD {
     filename_accession = "ACCESSION"
 
     """
-    #export https_proxy=http://wwwcache.sanger.ac.uk:3128
-    #export http_proxy=http://wwwcache.sanger.ac.uk:3128
     wget ${ftp_path}/${remote_filename_stem}_assembly_report.txt
     wget ${ftp_path}/${remote_filename_stem}_assembly_stats.txt
     wget ${ftp_path}/${remote_filename_stem}_genomic.fna.gz
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 90a125c..bc19c9b 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -38,7 +38,7 @@
                     "pattern": "^\\S+\\.csv$",
                     "schema": "assets/schema_input.json",
                     "description": "Path to comma-separated file containing information about the assemblies to download. Used for bulk download of many assemblies.",
-                    "help_text": "The file has to be a comma-separated file with three columns, and a header row. The columns names must be `species_dir`, `assembly_accession`, and `assembly_name`.",
+                    "help_text": "The file has to be a comma-separated file with three columns, and a header row. The column names must be `outdir`, `assembly_accession`, and `assembly_name`.",
                     "fa_icon": "fas fa-file-csv"
                 },
                 "ftp_root": {
diff --git a/subworkflows/local/download_genome.nf b/subworkflows/local/download_genome.nf
index 0a57503..dd4da55 100644
--- a/subworkflows/local/download_genome.nf
+++ b/subworkflows/local/download_genome.nf
@@ -9,7 +9,7 @@ include { REMOVE_MASKING          } from '../../modules/local/remove_masking'
 workflow DOWNLOAD_GENOME {
 
     take:
-    assembly_params         // tuple(assembly_accession, assembly_name, species_dir)
+    assembly_params         // tuple(assembly_accession, assembly_name, outdir)
 
 
     main:
diff --git a/subworkflows/local/params_check.nf b/subworkflows/local/params_check.nf
index 8992365..e9e4458 100644
--- a/subworkflows/local/params_check.nf
+++ b/subworkflows/local/params_check.nf
@@ -21,13 +21,13 @@ workflow PARAMS_CHECK {
 
         SAMPLESHEET_CHECK ( file(samplesheet, checkIfExists: true) )
             .csv
-            // Provides species_dir, assembly_accession, and assembly_name
+            // Provides outdir, assembly_accession, and assembly_name
             .splitCsv ( header:true, sep:',' )
             // Convert to tuple, as required by the download subworkflow
             .map { [
                 it["assembly_accession"],
                 it["assembly_name"],
-                (it["species_dir"].startsWith("/") ? "" : outdir + "/") + it["species_dir"],
+                (it["outdir"].startsWith("/") ? "" : outdir + "/") + it["outdir"],
             ] }
             .set { ch_inputs }
 
@@ -41,7 +41,7 @@ workflow PARAMS_CHECK {
 
 
     emit:
-    assembly_params = ch_inputs        // channel: tuple(assembly_accession, assembly_name, species_dir)
+    assembly_params = ch_inputs        // channel: tuple(assembly_accession, assembly_name, outdir)
     versions        = ch_versions      // channel: versions.yml
 }
 

From 403f469d084d0631e0135f2cace320fdd4219491 Mon Sep 17 00:00:00 2001
From: Matthieu Muffato <mm49@sanger.ac.uk>
Date: Sat, 23 Sep 2023 11:18:38 +0000
Subject: [PATCH 2/2] Made the relation between --outdir and the samplesheet
 outdir clearer

---
 docs/usage.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/usage.md b/docs/usage.md
index d6c683f..f785a05 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -15,7 +15,7 @@ The pipeline accepts command-one line arguments to specify a single genome to do
 
 - `--assembly_name`: The name of the assembly,
 - `--assembly_accession`: The accession number of the assembly,
-- `--outdir`: Where to download the data.
+- `--outdir`: Where the pipeline runtime information will be stored, and where data will be downloaded (except if absolute paths are given in the samplesheet).
 
 ```console
 nextflow run sanger-tol/insdcdownload --assembly_accession GCA_927399515.1 --assembly_name gfLaeSulp1.1 --outdir gfLaeSulp1.1_data