Merge pull request #83 from sanger-tol/alignment

Optional alignment subworkflow
sanger-tol · Oct 18, 2023 · bb01d13 · bb01d13
2 parents 5412568 + 378ab70
commit bb01d13
Show file tree

Hide file tree

Showing 58 changed files with 1,007 additions and 143 deletions.
diff --git a/.editorconfig b/.editorconfig
@@ -23,11 +23,3 @@ indent_size = unset
 [/assets/email*]
 indent_size = unset
 
-# To prevent errors for these test diamond databases
-[/assets/test*/*.dmnd]
-charset = unset
-end_of_line = unset
-insert_final_newline = unset
-trim_trailing_whitespace = unset
-indent_style = unset
-indent_size = unset
diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -38,7 +38,7 @@ body:
     id: system
     attributes:
       label: System information
-      description: "* Nextflow version _(eg. 23.04.1)_
+      description: "* Nextflow version _(eg. 22.10.1)_
 
         * Hardware _(eg. HPC, Desktop, Cloud)_
 

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -1,7 +1,12 @@
 name: nf-core CI
 # This workflow runs the pipeline with the minimal test dataset to check that it completes without any syntax errors
 on:
-  workflow_dispatch:
+  push:
+    branches:
+      - dev
+  pull_request:
+  release:
+    types: [published]
 
 env:
   NXF_ANSI_LOG: false
@@ -19,7 +24,7 @@ jobs:
     strategy:
       matrix:
         NXF_VER:
-          - "23.04.1"
+          - "22.10.1"
           - "latest-everything"
     steps:
       - name: Check out pipeline code
@@ -30,9 +35,19 @@ jobs:
         with:
           version: "${{ matrix.NXF_VER }}"
 
+      - name: Download the NCBI taxdump database
+        run: |
+          mkdir ncbi_taxdump
+          curl -L https://ftp.ncbi.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz | tar -C ncbi_taxdump -xzf -
+
+      - name: Download the BUSCO lineage database
+        run: |
+          mkdir busco_database
+          curl -L https://tolit.cog.sanger.ac.uk/test-data/resources/busco/blobtoolkit.GCA_922984935.2.2023-08-03.lineages.tar.gz | tar -C busco_database -xzf -
+
       - name: Run pipeline with test data
         # You can customise CI pipeline run tests as required
         # For example: adding multiple test runs with different parameters
         # Remember that you can parallelise this by using strategy.matrix
         run: |
-          nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results
+          nextflow run ${GITHUB_WORKSPACE} -profile test,docker --taxdump $PWD/ncbi_taxdump --busco $PWD/busco_database --outdir ./results
diff --git a/.github/workflows/fix-linting.yml b/.github/workflows/fix-linting.yml
@@ -8,21 +8,21 @@ jobs:
     # Only run if comment is on a PR with the main repo, and if it contains the magic keywords
     if: >
       contains(github.event.comment.html_url, '/pull/') &&
-      contains(github.event.comment.body, '@nf-core-bot fix linting') &&
+      contains(github.event.comment.body, '@sanger-tolsoft fix linting') &&
       github.repository == 'sanger-tol/blobtoolkit'
     runs-on: ubuntu-latest
     steps:
-      # Use the @nf-core-bot token to check out so we can push later
+      # Use the @sanger-tolsoft token to check out so we can push later
       - uses: actions/checkout@v3
         with:
-          token: ${{ secrets.nf_core_bot_auth_token }}
+          token: ${{ secrets.sangertolsoft_access_token }}
 
       # Action runs on the issue comment, so we don't get the PR by default
       # Use the gh cli to check out the PR
       - name: Checkout Pull Request
         run: gh pr checkout ${{ github.event.issue.number }}
         env:
-          GITHUB_TOKEN: ${{ secrets.nf_core_bot_auth_token }}
+          GITHUB_TOKEN: ${{ secrets.sangertolsoft_access_token }}
 
       - uses: actions/setup-node@v3
 
@@ -46,8 +46,8 @@ jobs:
       - name: Commit & push changes
         if: steps.prettier_status.outputs.result == 'fail'
         run: |
-          git config user.email "core@nf-co.re"
-          git config user.name "nf-core-bot"
+          git config user.email "[email protected]"
+          git config user.name "sanger-tolsoft"
           git config push.default upstream
           git add .
           git status

diff --git a/.github/workflows/sangertest.yml → .github/workflows/sanger_test.yml b/.github/workflows/sangertest.yml → .github/workflows/sanger_test.yml
@@ -1,4 +1,4 @@
-name: nf-core Sanger LSF tests
+name: sanger-tol LSF tests
 
 on:
   workflow_dispatch:
@@ -13,16 +13,21 @@ jobs:
         if: github.event_name == 'workflow_dispatch'
 
       - name: Launch workflow via tower
-        uses: nf-core/tower-action@v2
+        uses: seqeralabs/action-tower-launch@v2
         with:
           workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
           access_token: ${{ secrets.TOWER_ACCESS_TOKEN }}
           compute_env: ${{ secrets.TOWER_COMPUTE_ENV }}
-          pipeline: ${{ github.repository }}
           revision: ${{ env.REVISION }}
           workdir: ${{ secrets.TOWER_WORKDIR_PARENT }}/work/${{ github.repository }}/work-${{ env.REVISION }}
           parameters: |
             {
               "outdir": "${{ secrets.TOWER_WORKDIR_PARENT }}/results/${{ github.repository }}/results-${{ env.REVISION }}",
             }
           profiles: test,sanger,singularity,cleanup
+      - uses: actions/upload-artifact@v3
+        with:
+          name: Tower debug log file
+          path: |
+            tower_action_*.log
+            tower_action_*.json
diff --git a/.github/workflows/sangerfulltest.yml → .github/workflows/sanger_test_full.yml b/.github/workflows/sangerfulltest.yml → .github/workflows/sanger_test_full.yml
@@ -1,4 +1,4 @@
-name: nf-core Sanger LSF full size tests
+name: sanger-tol LSF full size tests
 
 on:
   workflow_dispatch:
@@ -18,16 +18,21 @@ jobs:
         if: github.event_name == 'workflow_dispatch'
 
       - name: Launch workflow via tower
-        uses: nf-core/tower-action@v2
+        uses: seqeralabs/action-tower-launch@v2
         with:
           workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
           access_token: ${{ secrets.TOWER_ACCESS_TOKEN }}
           compute_env: ${{ secrets.TOWER_COMPUTE_ENV }}
-          pipeline: ${{ github.repository }}
           revision: ${{ env.REVISION }}
           workdir: ${{ secrets.TOWER_WORKDIR_PARENT }}/work/${{ github.repository }}/work-${{ env.REVISION }}
           parameters: |
             {
               "outdir": "${{ secrets.TOWER_WORKDIR_PARENT }}/results/${{ github.repository }}/results-${{ env.REVISION }}",
             }
           profiles: test_full,sanger,singularity,cleanup
+      - uses: actions/upload-artifact@v3
+        with:
+          name: Tower debug log file
+          path: |
+            tower_action_*.log
+            tower_action_*.json
diff --git a/.nf-core.yml b/.nf-core.yml
@@ -8,6 +8,7 @@ lint:
   files_unchanged:
     - LICENSE
     - .github/ISSUE_TEMPLATE/bug_report.yml
+    - .github/workflows/linting.yml
     - assets/sendmail_template.txt
     - lib/NfcoreTemplate.groovy
     - .prettierignore

diff --git a/README.md b/README.md
@@ -5,7 +5,7 @@
 [![GitHub Actions Linting Status](https://github.com/sanger-tol/blobtoolkit/workflows/nf-core%20linting/badge.svg)](https://github.com/sanger-tol/blobtoolkit/actions?query=workflow%3A%22nf-core+linting%22)
 [![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.7949058-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.7949058)
 
-[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A523.04.1-23aa62.svg)](https://www.nextflow.io/)
+[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A522.10.1-23aa62.svg)](https://www.nextflow.io/)
 [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)
 [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)
 [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)
@@ -60,7 +60,7 @@ mMelMel1,illumina,GCA_922984935.2.illumina.mMelMel1.cram
 mMelMel3,ont,GCA_922984935.2.ont.mMelMel3.cram
 ```
 
-Each row represents an aligned file. Rows with the same sample identifier are considered technical replicates. The datatype refers to the sequencing technology used to generate the underlying raw data and follows a controlled vocabulary (ont, hic, pacbio, illumina). The aligned read files can be generated using the [sanger-tol/readmapping](https://github.com/sanger-tol/readmapping) pipeline.
+Each row represents an aligned file. Rows with the same sample identifier are considered technical replicates. The datatype refers to the sequencing technology used to generate the underlying raw data and follows a controlled vocabulary (ont, hic, pacbio, pacbio_clr, illumina). The aligned read files can be generated using the [sanger-tol/readmapping](https://github.com/sanger-tol/readmapping) pipeline.
 
 Now, you can run the pipeline using:
 

diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -16,7 +16,7 @@
             "datatype": {
                 "type": "string",
                 "pattern": "^\\S+$",
-                "enum": ["hic", "illumina", "ont", "pacbio"],
+                "enum": ["hic", "illumina", "ont", "pacbio", "pacbio_clr"],
                 "errorMessage": "Data type, and must be one of: 'hic' or 'illumina' or 'ont' or 'pacbio'"
             },
             "datafile": {

diff --git a/assets/test/mCerEla1.1.buscogenes.dmnd b/assets/test/mCerEla1.1.buscogenes.dmnd
diff --git a/assets/test/samplesheet_raw.csv b/assets/test/samplesheet_raw.csv
@@ -0,0 +1,4 @@
+sample,datatype,datafile
+mMelMel1,illumina,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/genomic_data/mMelMel1/illumina/31231_3#1_subset.cram
+mMelMel2,illumina,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/genomic_data/mMelMel2/illumina/31231_4#1_subset.cram
+mMelMel3,hic,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/genomic_data/mMelMel3/hic-arima2/35528_2#1_subset.cram
diff --git a/assets/test_full/full_samplesheet.csv b/assets/test_full/full_samplesheet.csv
@@ -1,3 +1,3 @@
 sample,datatype,datafile
-gfLaeSulp1,hic,/lustre/scratch123/tol/projects/.sandbox/data/fungi/Laetiporus_sulphureus/analysis/gfLaeSulp1.1/read_mapping/hic/GCA_927399515.1.unmasked.hic.gfLaeSulp1.cram
-gfLaeSulp1,pacbio,/lustre/scratch123/tol/projects/.sandbox/data/fungi/Laetiporus_sulphureus/analysis/gfLaeSulp1.1/read_mapping/pacbio/GCA_927399515.1.unmasked.pacbio.gfLaeSulp1.cram
+gfLaeSulp1,hic,/lustre/scratch123/tol/resources/nextflow/test-data/Laetiporus_sulphureus/analysis/gfLaeSulp1.1/read_mapping/hic/GCA_927399515.1.unmasked.hic.gfLaeSulp1.cram
+gfLaeSulp1,pacbio,/lustre/scratch123/tol/resources/nextflow/test-data/Laetiporus_sulphureus/analysis/gfLaeSulp1.1/read_mapping/pacbio/GCA_927399515.1.unmasked.pacbio.gfLaeSulp1.cram
diff --git a/assets/test_full/gfLaeSulp1.1.buscogenes.dmnd b/assets/test_full/gfLaeSulp1.1.buscogenes.dmnd
diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
@@ -24,12 +24,16 @@ class RowChecker:
 
     """
 
-    VALID_FORMATS = (".cram",)
+    VALID_FORMATS = (
+        ".cram",
+        ".bam",
+    )
 
     VALID_DATATYPES = (
         "hic",
         "illumina",
         "pacbio",
+        "pacbio_clr",
         "ont",
     )
 

diff --git a/conf/modules.config b/conf/modules.config
@@ -14,22 +14,55 @@ process {
 
     withName: "SAMPLESHEET_CHECK" {
         publishDir = [
-            path: { "${params.outdir}/blobtoolkit_info" },
+            path: { "${params.outdir}/pipeline_info/blobtoolkit" },
             mode: params.publish_dir_mode,
             saveAs: { filename -> filename.equals("versions.yml") ? null : filename }
         ]
     }
 
-    withName: "GOAT_TAXONSEARCH" {
-        ext.args = "-l -b"
+    withName: "WINDOWMASKER_MKCOUNTS" {
+        ext.args = "-infmt fasta -sformat obinary"
+    }
+
+    withName: "WINDOWMASKER_USTAT" {
+        ext.args = "-infmt fasta -dust T -outfmt fasta"
+    }
+
+    withName: "MINIMAP2_HIC" {
+        ext.args = "-ax sr"
+    }
+
+    withName: "MINIMAP2_ILMN" {
+        ext.args = "-ax sr"
+    }
+
+    withName: "MINIMAP2_CCS" {
+        ext.args = "-ax map-hifi --cs=short"
+    }
+
+    withName: "MINIMAP2_CLR" {
+        ext.args = "-ax map-pb"
+    }
+
+    withName: "MINIMAP2_ONT" {
+        ext.args = "-ax map-ont"
     }
 
     withName: "SAMTOOLS_VIEW" {
         ext.args = "--output-fmt bam --write-index"
     }
 
+    withName: "SAMTOOLS_INDEX" {
+        ext.args = "-c"
+    }
+
+    withName: "GOAT_TAXONSEARCH" {
+        ext.args = "--lineage --busco"
+    }
+
     withName: "BUSCO" {
         scratch = true
+        // Overridden in the test profile; see the end of this file
         ext.args = "--mode genome --force"
     }
 
@@ -68,7 +101,7 @@ process {
 
     withName: "CUSTOM_DUMPSOFTWAREVERSIONS" {
         publishDir = [
-            path: { "${params.outdir}/blobtoolkit_info" },
+            path: { "${params.outdir}/pipeline_info/blobtoolkit" },
             mode: params.publish_dir_mode,
             pattern: "*_versions.yml"
         ]
@@ -84,3 +117,22 @@ process {
     }
 
 }
+
+
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Additional configuration to speed processes up during testing.
+
+----------------------------------------------------------------------------------------
+*/
+
+profiles {
+    test {
+        process {
+            withName: BUSCO {
+                // Note: BUSCO *must* see the double-quotes around the parameters
+                ext.args = '--mode genome --force --metaeuk_parameters \'"-s=2"\' --metaeuk_rerun_parameters \'"-s=2"\''
+            }
+        }
+    }
+}
diff --git a/conf/test.config b/conf/test.config
@@ -12,7 +12,7 @@
 
 params {
     config_profile_name        = 'Test profile'
-    config_profile_description = 'Minimal test dataset to check pipeline function'
+    config_profile_description = 'Minimal aligned test dataset to check pipeline function'
 
     // Limit resources so that this can run on GitHub Actions
     max_cpus   = 2
@@ -22,15 +22,15 @@ params {
     // Input test data
     // Specify the paths to your test data
     // Give any required params for the test so that command line flags are not needed
-    input     = "${projectDir}/assets/test/samplesheet.csv"
+    input     = "${projectDir}/assets/test/samplesheet_s3.csv"
 
     // Fasta references
-    fasta     = "/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.fasta.gz"
+    fasta     = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.fasta.gz"
     accession = "GCA_922984935.2"
     taxon     = "Meles meles"
 
     // Databases
     taxdump   = "/lustre/scratch123/tol/teams/grit/geval_pipeline/btk_databases/taxdump"
-    busco     = "/lustre/scratch123/tol/resources/nextflow/busco_2021_06_reduced/"
-    uniprot   = "${projectDir}/assets/test/mCerEla1.1.buscogenes.dmnd"
+    busco     = "/lustre/scratch123/tol/resources/nextflow/busco/blobtoolkit.GCA_922984935.2.2023-08-03"
+    uniprot   = "https://tolit.cog.sanger.ac.uk/test-data/resources/diamond/mCerEla1.1.buscogenes.dmnd"
 }
diff --git a/conf/test_full.config b/conf/test_full.config
@@ -22,12 +22,12 @@ params {
     input     = "${projectDir}/assets/test_full/full_samplesheet.csv"
 
     // Fasta references
-    fasta     = "/lustre/scratch124/tol/projects/darwin/data/fungi/Laetiporus_sulphureus/assembly/release/gfLaeSulp1.1/insdc/GCA_927399515.1.fasta.gz"
+    fasta     = "https://tolit.cog.sanger.ac.uk/test-data/Laetiporus_sulphureus/assembly/release/gfLaeSulp1.1/insdc/GCA_927399515.1.fasta.gz"
     accession = "GCA_927399515.1"
     taxon     = "Laetiporus sulphureus"
 
     // Databases
     taxdump   = "/lustre/scratch123/tol/teams/grit/geval_pipeline/btk_databases/taxdump"
     busco     = "/lustre/scratch123/tol/resources/busco/v5/"
-    uniprot   = "${projectDir}/assets/test_full/gfLaeSulp1.1.buscogenes.dmnd"
+    uniprot   = "https://tolit.cog.sanger.ac.uk/test-data/resources/diamond/gfLaeSulp1.1.buscogenes.dmnd"
 }
diff --git a/conf/test_raw.config b/conf/test_raw.config
@@ -0,0 +1,37 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+
+    Use as follows:
+        nextflow run sanger-tol/blobtoolkit -profile test_raw,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+params {
+    config_profile_name        = 'Raw test profile'
+    config_profile_description = 'Minimal raw test dataset to check pipeline function'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 2
+    max_memory = '6.GB'
+    max_time   = '6.h'
+
+    // Input test data
+    // Specify the paths to your test data
+    // Give any required params for the test so that command line flags are not needed
+    input     = "${projectDir}/assets/test/samplesheet_raw.csv"
+    align     = true
+
+    // Fasta references
+    fasta     = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.fasta.gz"
+    accession = "GCA_922984935.2"
+    taxon     = "Meles meles"
+
+    // Databases
+    taxdump   = "/lustre/scratch123/tol/teams/grit/geval_pipeline/btk_databases/taxdump"
+    busco     = "/lustre/scratch123/tol/resources/nextflow/busco/blobtoolkit.GCA_922984935.2.2023-08-03"
+    uniprot   = "https://tolit.cog.sanger.ac.uk/test-data/resources/diamond/mCerEla1.1.buscogenes.dmnd"
+}