Merge pull request #10 from sanger-tol/DLBPointon-patch-1
Dlb pointon patch 1
DLBPointon authored Sep 21, 2023
2 parents 99816ae + 6d7f397 commit 6006596
Showing 25 changed files with 180 additions and 102 deletions.
14 changes: 11 additions & 3 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -35,9 +35,17 @@ jobs:
with:
version: "${{ matrix.NXF_VER }}"

- name: Run pipeline with test data
# TODO nf-core: You can customise CI pipeline run tests as required
# For example: adding multiple test runs with different parameters
- name: Download test data
# Download a fungal test data set that is full enough to show some real output.
run: |
curl https://tolit.cog.sanger.ac.uk/test-data/resources/treeval/TreeValTinyData.tar.gz | tar xzf -
- name: Run MAPS_ONLY pipeline with test data
# Remember that you can parallelise this by using strategy.matrix
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results -entry MAPS_ONLY
- name: Run ALL_FILES pipeline with test data
# Remember that you can parallelise this by using strategy.matrix
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results
29 changes: 29 additions & 0 deletions .github/workflows/sanger_test.yml
@@ -0,0 +1,29 @@
name: sanger-tol LSF tests

on:
workflow_dispatch:
jobs:
run-tower:
name: Run LSF tests
runs-on: ubuntu-latest
steps:
- name: Launch workflow via tower
uses: seqeralabs/action-tower-launch@v2
with:
workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
access_token: ${{ secrets.TOWER_ACCESS_TOKEN }}
compute_env: ${{ secrets.TOWER_COMPUTE_ENV }}
revision: ${{ github.sha }}
workdir: ${{ secrets.TOWER_WORKDIR_PARENT }}/work/${{ github.repository }}/work-${{ github.sha }}
parameters: |
{
"outdir": "${{ secrets.TOWER_WORKDIR_PARENT }}/results/${{ github.repository }}/results-${{ github.sha }}",
}
profiles: test,sanger,singularity,cleanup

- uses: actions/upload-artifact@v3
with:
name: Tower debug log file
path: |
tower_action_*.log
tower_action_*.json
43 changes: 43 additions & 0 deletions .github/workflows/sanger_test_full.yml
@@ -0,0 +1,43 @@
name: sanger-tol LSF full size tests

on:
push:
branches:
- main
- dev
workflow_dispatch:
jobs:
run-tower:
name: Run LSF full size tests
runs-on: ubuntu-latest
steps:
- name: Sets env vars for push
run: |
echo "REVISION=${{ github.sha }}" >> $GITHUB_ENV
if: github.event_name == 'push'

- name: Sets env vars for workflow_dispatch
run: |
echo "REVISION=${{ github.sha }}" >> $GITHUB_ENV
if: github.event_name == 'workflow_dispatch'

- name: Launch workflow via tower
uses: seqeralabs/action-tower-launch@v2
with:
workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
access_token: ${{ secrets.TOWER_ACCESS_TOKEN }}
compute_env: ${{ secrets.TOWER_COMPUTE_ENV }}
revision: ${{ env.REVISION }}
workdir: ${{ secrets.TOWER_WORKDIR_PARENT }}/work/${{ github.repository }}/work-${{ env.REVISION }}
parameters: |
{
"outdir": "${{ secrets.TOWER_WORKDIR_PARENT }}/results/${{ github.repository }}/results-${{ env.REVISION }}",
}
profiles: test_full,sanger,singularity,cleanup

- uses: actions/upload-artifact@v3
with:
name: Tower debug log file
path: |
tower_action_*.log
tower_action_*.json
28 changes: 13 additions & 15 deletions README.md
@@ -12,16 +12,16 @@

## Introduction

**sanger-tol/curationpretext** is a bioinformatics pipeline typically used in conjunction with [TreeVal](https://github.com/sanger-tol/treeval) to generate pretext maps (and optionally telomeric, gap, coverage and repeat density plots which can be ingested into pretext) for the manual curation of high quality genomes.
**sanger-tol/curationpretext** is a bioinformatics pipeline typically used in conjunction with [TreeVal](https://github.com/sanger-tol/treeval) to generate pretext maps (and optionally telomeric, gap, coverage, and repeat density plots which can be ingested into pretext) for the manual curation of high quality genomes.

This is intended as a supplementary pipeline for the [treeval](https://github.com/sanger-tol/treeval) project. However, can be simply used to generate pretext maps.
This is intended as a supplementary pipeline for the [treeval](https://github.com/sanger-tol/treeval) project. This pipeline can simply be used to generate pretext maps; information on how to run it can be found in the [usage documentation](https://pipelines.tol.sanger.ac.uk/curationpretext/usage).

<!-- TODO nf-core: Include a figure that guides the user through the major workflow steps. Many nf-core
workflows use the "tube map" design for that. See https://nf-co.re/docs/contributing/design_guidelines#examples for examples. -->

1. Generate Maps - Generates pretext maps aswell as a static image.
1. Generate Maps - Generates pretext maps as well as a static image.

2. Accessory files - Generates the repeat density, gap, telomere and coverage tracks.
2. Accessory files - Generates the repeat density, gap, telomere, and coverage tracks.

## Usage

@@ -34,19 +34,19 @@ Currently, the pipeline uses the following flags:

- --input

- The absolute path to the assembled genome in, e.g, `/path/to/assembly.fa`
- The absolute path to the assembled genome in, e.g., `/path/to/assembly.fa`

- --pacbio

- The directory of the fasta files generated from pacbio reads, e.g, `/path/to/fasta/`
- The directory of the fasta files generated from pacbio reads, e.g., `/path/to/fasta/`

- --cram

- The directory of the cram _and_ cram.crai files, e.g, `/path/to/cram/`
- The directory of the cram _and_ cram.crai files, e.g., `/path/to/cram/`

- --teloseq

- A telomeric sequence, e.g, `TTAGGG`
- A telomeric sequence, e.g., `TTAGGG`

- -entry
- ALL_FILES generates all accessory files as well as pretext maps
@@ -60,18 +60,17 @@ Now, you can run the pipeline using:
// For ALL_FILES run
nextflow run sanger-tol/curationpretext \
-profile <docker/singularity/.../institute> \
--fasta path/to/assembly.fa \
--input path/to/assembly.fa \
--cram path/to/cram/ \
--pacbio path/to/pacbio/fasta/ \
--teloseq TTAGGG \
--sample { default is "pretext_rerun" } \
-entry ALL_FILES \
--outdir path/to/outdir/

// For MAPS_ONLY run
nextflow run sanger-tol/curationpretext \
-profile <docker/singularity/.../institute> \
--fasta path/to/assembly.fa \
--input path/to/assembly.fa \
--cram path/to/cram/ \
--sample { default is "pretext_rerun" } \
-entry MAPS_ONLY \
@@ -81,15 +80,14 @@ nextflow run sanger-tol/curationpretext \
> **Warning:**
> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those
> provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_;
> see [docs](https://nf-co.re/usage/configuration#custom-configuration-files).
For more details, please refer to the [usage documentation](https://nf-co.re/curationpretext/usage) and the [parameter documentation](https://nf-co.re/curationpretext/parameters).
For more details, please refer to the [usage documentation](https://pipelines.tol.sanger.ac.uk/curationpretext/usage) and the [parameter documentation](https://pipelines.tol.sanger.ac.uk/curationpretext/parameters).
## Pipeline output
To see the results of a test run with a full size dataset, refer to the [results](https://nf-co.re/curationpretext/results) tab on the nf-core website pipeline page.
To see the results of a test run with a full size dataset, refer to the [results](https://pipelines.tol.sanger.ac.uk/curationpretext/results) tab on the sanger-tol pipelines page.
For more details about the output files and reports, please refer to the
[output documentation](https://nf-co.re/curationpretext/output).
[output documentation](https://pipelines.tol.sanger.ac.uk/curationpretext/output).
## Credits
21 changes: 10 additions & 11 deletions conf/test.config
@@ -11,19 +11,18 @@
*/

params {
config_profile_name = 'Test profile'
config_profile_name = 'GitHub Test profile'
config_profile_description = 'Minimal test dataset to check pipeline function'

// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = '6.GB'
max_time = '6.h'
max_cpus = 2
max_memory = '6.GB'
max_time = '6.h'

// Input data
// TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
// TODO nf-core: Give any required params for the test so that command line flags are not needed
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv'

// Genome references
genome = 'R64-1-1'
input = "/home/runner/work/curationpretext/curationpretext/TreeValTinyData/assembly/draft/grTriPseu1.fa"
outdir = "./results"
pacbio = "/home/runner/work/curationpretext/curationpretext/TreeValTinyData/genomic_data/pacbio/"
cram = "/home/runner/work/curationpretext/curationpretext/TreeValTinyData/genomic_data/hic-arima/"
sample = "CurationPretextTest"
teloseq = "TTAGGG"
}
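The test profile above hard-codes GitHub-runner paths, so a run fails opaquely if the download step did not produce them. A minimal pre-flight sketch — the `check_paths` helper is illustrative and not part of the pipeline:

```shell
#!/usr/bin/env bash
# Illustrative pre-flight check: confirm hard-coded test-data paths exist
# before launching Nextflow. A sketch only, not part of the pipeline.
set -euo pipefail

check_paths() {
  # Return non-zero if any given path is missing, listing each one on stderr.
  local missing=0
  for p in "$@"; do
    if [ ! -e "$p" ]; then
      echo "missing: $p" >&2
      missing=1
    fi
  done
  return "$missing"
}

# Example usage mirroring conf/test.config (paths assumed, not verified here):
# check_paths "$PWD/TreeValTinyData/assembly/draft/grTriPseu1.fa" \
#             "$PWD/TreeValTinyData/genomic_data/pacbio/"
```

Running such a check as an early CI step would surface a failed download immediately, rather than partway through the pipeline.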
8 changes: 6 additions & 2 deletions conf/test_full.config
@@ -21,6 +21,10 @@ params {
// TODO nf-core: Give any required params for the test so that command line flags are not needed
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_full_illumina_amplicon.csv'

// Genome references
genome = 'R64-1-1'
input = "/lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_DF5033/assembly/draft/DF5033.hifiasm.noTelos.20211120/DF5033.noTelos.hifiasm.purged.noCont.noMito.fasta"
outdir = "./results"
pacbio = "/lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_DF5033/genomic_data/nxOscSpes1/pacbio/fasta/"
cram = "/lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_DF5033/genomic_data/nxOscSpes1/hic-arima2/full/"
sample = "CurationPretextTest"
teloseq = "TTAGGG"
}
80 changes: 32 additions & 48 deletions docs/usage.md
@@ -6,68 +6,51 @@
## Introduction

<!-- TODO nf-core: Add documentation about anything specific to running your pipeline. For general topics, please point to (and add to) the main nf-core website. -->
This is a sister pipeline to [TreeVal](https://github.com/sanger-tol/treeval/), which generates a plurality of data for the curation of reference-quality genomes. curationpretext is a subset of TreeVal that produces solely the Pretext maps and accessory files.

## Samplesheet input
Currently, the pipeline expects input data to be in a specific format.

You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below.
The `--input` should be a `.fasta` or `.fa` file (the same format, just a different suffix).

```bash
--input '[path to samplesheet file]'
```

### Multiple runs of the same sample

The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes:
The `--cram` should point to the folder containing `.cram` files along with a `.crai` per `.cram`.

```console
sample,fastq_1,fastq_2
CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz
CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz
```

### Full samplesheet
The `--pacbio` should point to the folder containing `.fasta.gz` files.

The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below.
If you do not have these file formats, we have also included instructions on converting from common formats to our preferred format.
If there is a popular public preference for a particular format, we can modify the pipeline to utilise those formats. Just submit an issue.

A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice.

```console
sample,fastq_1,fastq_2
CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz
CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz
TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz,
TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz,
TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz,
TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz,
```
The conversion documentation can be found in:

| Column | Description |
| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". |
| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". |
- [TreeVal Wiki - PacBio Data](https://github.com/sanger-tol/treeval/wiki/Data-Preparation#pacbio-data).
- [TreeVal Wiki - Cram Data](https://github.com/sanger-tol/treeval/wiki/Data-Preparation#hic-data).

An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
The pipeline currently does not ingest the accessory files into the pretext map for you; however, we have included the code to do so in the [TreeVal Wiki - Ingesting PreText Accessory Files](https://github.com/sanger-tol/treeval/wiki/Ingesting-PreText-Accessory-Files).
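As a rough illustration of what that wiki page covers, an accessory track is typically streamed into an existing map with `PretextGraph`; the flag names below follow the PretextGraph README and should be checked against the wiki before use:

```shell
# Hedged sketch: build the ingestion command for a coverage track.
# PretextGraph reads a bedgraph on stdin; -i names the existing map, -n the track.
set -euo pipefail

build_ingest_cmd() {
  # $1: bedgraph file, $2: pretext map, $3: track name (all hypothetical inputs)
  printf 'cat %s | PretextGraph -i %s -n "%s"\n' "$1" "$2" "$3"
}

# Print the command rather than run it (running needs a real map and a
# PretextGraph install):
build_ingest_cmd coverage.bedgraph sample.pretext coverage
# prints: cat coverage.bedgraph | PretextGraph -i sample.pretext -n "coverage"
```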

## Running the pipeline

The typical command for running the pipeline is as follows:

```bash
nextflow run sanger-tol/curationpretext --input samplesheet.csv --outdir <OUTDIR> --genome GRCh37 -profile docker
nextflow run sanger-tol/curationpretext \
--input { input.fasta } \
--cram { path/to/cram/ } \
--pacbio { path/to/pacbio/fasta/ } \
--sample { default is "pretext_rerun" } \
--teloseq {TTAGGG} \
--outdir { OUTDIR } \
-profile <docker/singularity/{institute}> \
-entry <ALL_FILES/MAPS_ONLY> \
```
This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles.
Arguments surrounded with `{}` are user-defined values; those in `<>` are choices between the shown values.
Note that the pipeline will create the following files in your working directory:
```bash
work # Directory containing the nextflow working files
<OUTDIR> # Finished results in specified location (defined with --outdir)
.nextflow_log # Log file from Nextflow
work # Directory containing the nextflow working files
<OUTDIR>/pipeline_info # Execution reports, in the specified location (defined with --outdir)
<OUTDIR>/hic_files # Finished results, in the specified location (defined with --outdir)
.nextflow_log # Log file from Nextflow
# Other nextflow hidden files, eg. history of pipeline runs and old logs.
```
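The `work` directory in that listing holds intermediates that Nextflow needs only for `-resume`; once the result folders exist it can be reclaimed. A hedged tidy-up sketch — the folder names follow the listing above, and the helper itself is illustrative:

```shell
# Illustrative clean-up helper; not part of the pipeline.
set -euo pipefail

# Remove the Nextflow work directory only when the expected outputs exist.
# $1: the --outdir used for the run (assumes the layout shown above).
tidy_workdir() {
  local outdir="$1"
  if [ -d "$outdir/pipeline_info" ] && [ -d "$outdir/hic_files" ]; then
    rm -rf work
  else
    echo "results incomplete; keeping work/ for -resume" >&2
    return 1
  fi
}
```

Keeping `work` until outputs are verified preserves the ability to `-resume` a partial run instead of recomputing everything.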
@@ -79,17 +62,18 @@ Pipeline settings can be provided in a `yaml` or `json` file via `-params-file <
> The above pipeline run specified with a params file in yaml format:
```bash
nextflow run sanger-tol/curationpretext -profile docker -params-file params.yaml
nextflow run sanger-tol/curationpretext -profile docker -params-file params.yaml -entry <ALL_FILES/MAPS_ONLY>
```
with `params.yaml` containing:
```yaml
input: './samplesheet.csv'
outdir: './results/'
genome: 'GRCh37'
input: 'data'
<...>
input: "./assembly.fa"
outdir: "./results/"
teloseq: "TTAGGG"
sample: "pretext_rerun"
pacbio: "./pacbio/fasta/"
cram: "./cram/"
```
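For repeatable launches, the params file can also be generated by a script rather than written by hand; a sketch assuming the keys shown above, with placeholder values to replace:

```shell
# Illustrative: emit a params.yaml with the keys used by this pipeline.
# All values here are placeholders, not real data paths.
set -euo pipefail

cat > params.yaml <<'EOF'
input: "./assembly.fa"
outdir: "./results/"
teloseq: "TTAGGG"
sample: "pretext_rerun"
pacbio: "./pacbio/fasta/"
cram: "./cram/"
EOF

# The file would then be passed to a run, e.g.:
#   nextflow run sanger-tol/curationpretext -profile docker \
#     -params-file params.yaml -entry MAPS_ONLY
```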
You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch).
2 changes: 1 addition & 1 deletion main.nf
@@ -57,7 +57,7 @@ workflow SANGERTOL_CURATIONPRETEXT_MAPS {
// WORKFLOW: Execute a single named workflow for the pipeline
// See: https://github.com/nf-core/rnaseq/issues/619
//
workflow ALL_FILES {
workflow {
SANGERTOL_CURATIONPRETEXT_ALL_FILES ()
}

2 changes: 1 addition & 1 deletion modules/local/bamtobed_sort.nf
@@ -4,7 +4,7 @@ process BAMTOBED_SORT {

container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/mulled-v2-9d3a458f6420e5712103ae2af82c94d26d63f059:60b54b43045e8cf39ba307fd683c69d4c57240ce-0' :
'quay.io/biocontainers/mulled-v2-9d3a458f6420e5712103ae2af82c94d26d63f059:60b54b43045e8cf39ba307fd683c69d4c57240ce-0' }"
'biocontainers/mulled-v2-9d3a458f6420e5712103ae2af82c94d26d63f059:60b54b43045e8cf39ba307fd683c69d4c57240ce-0' }"

input:
tuple val(meta), path(bam)
2 changes: 1 addition & 1 deletion modules/local/cram_filter_align_bwamem2_fixmate_sort.nf
@@ -4,7 +4,7 @@ process CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT {

container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/mulled-v2-50d89b457e04ed90fa0cbf8ebc3ae1b9ffbc836b:caf993da1689e8d42f5e4c113ffc9ef81d26df96-0' :
'quay.io/biocontainers/mulled-v2-50d89b457e04ed90fa0cbf8ebc3ae1b9ffbc836b:caf993da1689e8d42f5e4c113ffc9ef81d26df96-0' }"
'biocontainers/mulled-v2-50d89b457e04ed90fa0cbf8ebc3ae1b9ffbc836b:caf993da1689e8d42f5e4c113ffc9ef81d26df96-0' }"

input:
tuple val(meta), path(cramfile), path(cramindex), val(from), val(to), val(base), val(chunkid), val(rglines), val(bwaprefix)
2 changes: 1 addition & 1 deletion modules/local/extract_repeat.nf
@@ -5,7 +5,7 @@ process EXTRACT_REPEAT {
conda "conda-forge::perl=5.26.2"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/perl:5.26.2' :
'quay.io/biocontainers/perl:5.26.2' }"
'biocontainers/perl:5.26.2' }"

input:
tuple val( meta ), path( file )
2 changes: 1 addition & 1 deletion modules/local/extract_telo.nf
@@ -5,7 +5,7 @@ process EXTRACT_TELO {
conda "conda-forge::coreutils=9.1"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/ubuntu:20.04' :
'ubuntu:20.04' }"
'docker.io/ubuntu:20.04' }"

input:
tuple val( meta ), path( file )
2 changes: 1 addition & 1 deletion modules/local/find_telomere_windows.nf
@@ -6,7 +6,7 @@
container "${ workflow.containerEngine == 'singularity' &&
!task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/java-jdk:8.0.112--1' :
'quay.io/biocontainers/java-jdk:8.0.112--1' }"
'biocontainers/java-jdk:8.0.112--1' }"

input:
tuple val( meta ), path( file )