Merge pull request #293 from sanger-tol/dev

1.1.0 - Ancient Atlantis
sanger-tol · Apr 8, 2024 · dd32db5 · dd32db5
2 parents a064360 + c3ecafe
commit dd32db5
Show file tree

Hide file tree

Showing 260 changed files with 6,536 additions and 1,627 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -10,6 +10,8 @@ on:
 
 env:
   NXF_ANSI_LOG: false
+  NXF_SINGULARITY_CACHEDIR: ${{ github.workspace }}/.singularity
+  NXF_SINGULARITY_LIBRARYDIR: ${{ github.workspace }}/.singularity
 
 concurrency:
   group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}"
@@ -27,25 +29,47 @@ jobs:
           - "22.10.1"
           - "latest-everything"
     steps:
-      - name: Check out pipeline code
-        uses: actions/checkout@v3
+      - name: Get branch names
+        # Pulls the names of current branches in repo
+        # steps.branch-names.outputs.current_branch is used later and returns the name of the branch the PR is made FROM not to
+        id: branch-names
+        uses: tj-actions/branch-names@v8
 
       - name: Install Nextflow
         uses: nf-core/setup-nextflow@v1
         with:
           version: "${{ matrix.NXF_VER }}"
 
-      - name: Download test data
-        # Download A fungal test data set that is full enough to show some real output.
+      - name: Setup apptainer
+        uses: eWaterCycle/setup-apptainer@main
+
+      - name: Set up Singularity
         run: |
-          curl https://tolit.cog.sanger.ac.uk/test-data/resources/treeval/TreeValTinyData.tar.gz | tar xzf -
+          mkdir -p $NXF_SINGULARITY_CACHEDIR
+          mkdir -p $NXF_SINGULARITY_LIBRARYDIR
 
-      - name: Run RAPID pipeline with test data
-        # Remember that you can parallelise this by using strategy.matrix
+      - name: Install Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"
+
+      - name: Install nf-core
         run: |
-          nextflow run ${GITHUB_WORKSPACE} -entry RAPID -profile test_github,docker --outdir ./results-rapid
+          pip install nf-core
+
+      - name: NF-Core Download - download singularity containers
+        # Forcibly download repo on active branch and download SINGULARITY containers into the CACHE dir if not found
+        # Must occur after singularity install or will crash trying to dl containers
+        # Zip up this fresh download and run the checked out version
+        run: |
+          nf-core download sanger-tol/treeval --revision ${{ steps.branch-names.outputs.current_branch }} --compress none -d --force --outdir sanger-treeval --container-cache-utilisation amend --container-system singularity
+
+      - name: Download Tiny test data
+        # Download A fungal test data set that is full enough to show some real output.
+        run: |
+          curl https://tolit.cog.sanger.ac.uk/test-data/resources/treeval/TreeValTinyData.tar.gz | tar xzf -
 
-      - name: Run FULL pipeline with test data
+      - name: Singularity - Run FULL pipeline with test data
         # Remember that you can parallelise this by using strategy.matrix
         run: |
-          nextflow run ${GITHUB_WORKSPACE} -profile test_github,docker --outdir ./results-full
+          nextflow run ./sanger-treeval/${{ steps.branch-names.outputs.current_branch }}/main.nf -profile test_github,singularity --outdir ./Sing-Full
diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
@@ -32,7 +32,7 @@ jobs:
       - uses: actions/setup-node@v3
 
       - name: Install Prettier
-        run: npm install -g prettier
+        run: npm install -g prettier@3.0.3
 
       - name: Run Prettier --check
         run: prettier --check ${GITHUB_WORKSPACE}

diff --git a/.gitpod.yml b/.gitpod.yml
@@ -68,7 +68,7 @@ tasks:
     init: |
       cd /workspace/treeval-curation/
 
-      git clone -b pre-tag https://github.com/sanger-tol/treeval.git
+      git clone -b v1.0.0 https://github.com/sanger-tol/treeval.git
 
   - name: Install Curtation Pretext
     # https://github.com/sanger-tol/curationpretext

diff --git a/.nf-core.yml b/.nf-core.yml
@@ -5,6 +5,9 @@ lint:
     - conf/test_full.config
     - docs/images/nf-core-treeval_logo_light.png
     - docs/images/nf-core-treeval_logo_dark.png
+    - conf/igenomes.config
+    - .github/workflows/awstest.yml
+    - .github/workflows/awsfulltest.yml
   files_unchanged:
     - .github/workflows/linting.yml
     - .github/CONTRIBUTING.md

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,96 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.1.0] - Ancient Aurora - [2024-04-26]
+
+The second release for sanger-tol, created with the [nf-core](https://nf-co.re/) template.
+
+This builds on the initial release by adding subworkflows which generate kmer based coverage tracks and a kmer spectra graph. There are also a number of updates to logic used throughout the pipeline, as well as to the resources required by a significant number of modules.
+
+### Enhancements & Fixes
+
+- Updates to the resource allocation methods used by a number of modules in the base.config.
+- Added a flag to stop the usage of Juicer.
+- Subworkflow to generate a kmer based coverage track.
+- Subworkflow to generate/update a kmer spectra graph.
+- Subworkflow to use minimap2 for HiC mapping, if selected.
+- Subworkflow to use BWAmem2 for HiC mapping, if selected.
+- Subworkflow to ingest Pretext accessory files into the Pretext file, simplifying post-TreeVal data manipulation.
+- Updated the logic in use throughout the pipeline.
+- Updated the modules.config to include some of the logic, cleaning the code.
+- Updated the HiC subworkflow to include subsampling the HiC data for Juicer due to resource requirements with large amounts of data.
+- Updated the YAML_INPUT subworkflow, this now contains "flags" to change some software options.
+- Updated the data names in the input YAML to reduce confusion.
+- Updated software (Pretext{View,Snapshot,Graph}) to allow for use on large genomes with big data.
+  - Added associated patch files and cpu architecture files.
+- Updated the minimap2 align module to remove samtools view in favour of paftools for our use case.
+- Updated the test.yml in line with the above changes.
+- Updated the SELFCOMP subworkflow to allow for the parallelisation of the work on large genomes.
+- Updated the READ_COVERAGE subworkflow to produce the scaffold based AVG coverage and STND coverage.
+- Updated Modules from NF-Core - mostly relates to module structure rather than software.
+- Updated the SummaryStats output to include HiC container counts.
+- Added -T / -t flags where possible to minimise the use of the /tmp directory.
+- Replaced CONCAT_MUMMER with CATCAT for simplicity.
+- Removed JUICER from the RAPID entrypoint.
+- Removed the csi or tbi logic. CSI is now used by default; this simplifies the workflow and enlarges the capacity to handle much larger genomes. The logic block previously required was then moved.
+- Added NF-DOWNLOAD to the CI-CD due to an error that causes incomplete downloads when downloading a number of images at the same time.
+- Added the RAPID_TOL entry point which is more geared towards the requirements of Sanger.
+- Fix a bug in build_alignment_blocks.py to avoid indexing errors happening in large genomes.
+- Change output BEDGRAPH from EXTRACT_TELO module.
+
+### Parameters
+
+| Old Parameter | New Parameter |
+| ------------- | ------------- |
+| -             | --juicer      |
+
+### Software dependencies
+
+Note, since the pipeline is using Nextflow DSL2, each process will be run with its own Biocontainer. This means that on occasion it is entirely possible for the pipeline to be using different versions of the same tool. However, the overall software dependency changes compared to the last release have been listed below for reference.
+
+| Module                                 | Old Version  | New Versions     |
+| -------------------------------------- | ------------ | ---------------- |
+| bamtobed_sort ( bedtools + samtools )  | -            | 2.31.0 + 1.17    |
+| bedtools                               | 2.31.0       | 2.31.1           |
+| busco                                  | 5.4.3        | 5.5.0            |
+| bwa-mem2                               | -            | 2.2.1            |
+| cat                                    | -            | 2.3.4            |
+| chunk_fasta ( pyfasta )                | -            | 0.5.2-1          |
+| cooler                                 | -            | 0.9.2            |
+| cram_filter_align_bwamem2_fixmate_sort | -            |                  |
+| ^ ( samtools + bwamem2 ) ^             | -            | 1.17 + 2.2.1     |
+| coreutils                              | -            | 9.1              |
+| fastk                                  | -            | 1.0.1            |
+| gcc                                    | 7.1.0        | 10.4.0           |
+| find_telomere_windows ( java-jdk )     | -            | 8.0.112          |
+| generate_cram_csv ( samtools )         | -            | 1.17             |
+| gnu-sort                               | -            | 8.25             |
+| juicer_tools_pre ( java-jdk )          | -            | 8.0.112          |
+| perl                                   | -            | 5.26.2           |
+| merquryfk                              | -            | 1.0.1            |
+| minimap2 + samtools                    | -            | 2.24 + 1.14      |
+| miniprot                               | -            | 0.11--he4a0461_2 |
+| mummer                                 | -            | 3.23             |
+| paftools ( minimap2 + samtools )       | -            | 2.24 + 1.14      |
+| pretextmap + samtools                  | 0.1.9 + 1.17 | 0.0.2 + 1.17     |
+| python                                 | 3.9          | -                |
+| - pandas                               | 1.5.2        | -                |
+| samtools                               | 1.17         | 1.18             |
+| selfcomp_splitfasta ( perl-bioperl )   | -            | 1.7.8-1          |
+| seqtk                                  | -            | 1.4              |
+| tabix                                  | -            | 1.11             |
+| ucsc                                   | -            | 377              |
+| windowmasker (blast)                   | -            | 2.14.0           |
+
+### Fixed
+
+- Resource allocations being calculated incorrectly.
+- Pretext bugs related to large data.
+
+### Dependencies
+
+### Deprecated
+
 ## [1.0.0] - Ancient Atlantis - [2023-06-27]
 
 Initial release of sanger-tol/treeval, created with the [nf-core](https://nf-co.re/) template.

diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)
+[![Cite with Zenodo](https://zenodo.org/badge/509096312.svg)](https://zenodo.org/doi/10.5281/zenodo.10047653)
 [![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A522.10.1-23aa62.svg)](https://www.nextflow.io/)
 [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)
 [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)
@@ -7,10 +7,13 @@
 
 ## Introduction
 
-**sanger-tol/treeval** is a bioinformatics best-practice analysis pipeline for the generation of data supplemental to the curation of reference quality genomes. This pipeline has been written to generate flat files compatible with [JBrowse2](https://jbrowse.org/jb2/).
+**sanger-tol/treeval [1.1.0 - Ancient Aurora]** is a bioinformatics best-practice analysis pipeline for the generation of data supplemental to the curation of reference quality genomes. This pipeline has been written to generate flat files compatible with [JBrowse2](https://jbrowse.org/jb2/) as well as HiC maps for use in Juicebox, PretextView and HiGlass.
 
 The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community!
 
+You can also set up and attempt to run the pipeline here: https://gitpod.io/#https://github.com/BGAcademy23/treeval-curation
+This is a gitpod set up for BGA23 with a version of TreeVal, although for now gitpod will not run a nextflow pipeline due to issues with using singularity. We will be replacing this with an AWS instance soon.
+
 The treeval pipeline has a sister pipeline currently named [curationpretext](https://github.com/sanger-tol/curationpretext) which acts to regenerate the pretext maps and accessory files during genomic curation in order to confirm interventions. This pipeline is sufficiently different to the treeval implementation that it is written as its own pipeline.
 
 1. Parse input yaml ( YAML_INPUT )
@@ -26,6 +29,8 @@ The treeval pipeline has a sister pipeline currently named [curationpretext](htt
 11. Generate a telomere track based on input motif ( TELO_FINDER )
 12. Run Busco and convert results into bed format ( BUSCO_ANNOTATION )
 13. Ancestral Busco linkage if available for clade ( BUSCO_ANNOTATION:ANCESTRAL_GENE )
+14. Count KMERs with FastK and plot the spectra using MerquryFK ( KMER )
+15. Generate a coverage track using KMER data ( KMER_READ_COVERAGE )
 
 ## Usage
 
@@ -77,7 +82,7 @@ If you would like to contribute to this pipeline, please see the [contributing g
 
 <!--TODO: Citation-->
 
-If you use sanger-tol/treeval for your analysis, please cite it using the following doi: [10.5281/zenodo.XXXXXX](https://doi.org/10.5281/zenodo.XXXXXX).
+If you use sanger-tol/treeval for your analysis, please cite it using the following doi: [10.5281/zenodo.10047653](https://doi.org/10.5281/zenodo.10047653).
 
 ### Tools
 

diff --git a/assets/github_testing/TreeValTinyFullTest.yaml b/assets/github_testing/TreeValTinyFullTest.yaml
@@ -0,0 +1,38 @@
+assembly:
+  assem_level: scaffold
+  sample_id: grTriPseu1
+  latin_name: to_provide_taxonomic_rank
+  defined_class: fungi
+  assem_version: 1
+  project_id: DTOL
+reference_file: /home/runner/work/treeval/treeval/TreeValTinyData/assembly/draft/grTriPseu1.fa
+map_order: length
+assem_reads:
+  read_type: hifi
+  read_data: /home/runner/work/treeval/treeval/TreeValTinyData/genomic_data/pacbio
+  supplementary_data: path
+hic_data:
+  hic_cram: /home/runner/work/treeval/treeval/TreeValTinyData/genomic_data/hic-arima/
+  hic_aligner: bwamem2
+kmer_profile:
+  # kmer_length will act as input for kmer_read_cov fastk and as the name of folder in profile_dir
+  kmer_length: 31
+  dir: /home/runner/work/treeval/treeval/TreeValTinyData/
+alignment:
+  data_dir: /home/runner/work/treeval/treeval/TreeValTinyData/gene_alignment_data/
+  common_name: "" # For future implementation (adding bee, wasp, ant etc)
+  geneset_id: "LaetiporusSulphureus.gfLaeSulp1"
+  # Path should end up looking like "{data_dir}{defined_class}/{common_name}/csv_data/{geneset_id}-data.csv"
+self_comp:
+  motif_len: 0
+  mummer_chunk: 10
+intron:
+  size: "50k"
+telomere:
+  teloseq: TTAGGG
+synteny:
+  synteny_path: /home/runner/work/treeval/treeval/TreeValTinyData/synteny/
+  synteny_genomes: "LaetiporusSulphureus"
+busco:
+  lineages_path: /home/runner/work/treeval/treeval/TreeValTinyData/busco/subset/
+  lineage: fungi_odb10
diff --git a/assets/github_testing/TreeValTinyTest.yaml b/assets/github_testing/TreeValTinyTest.yaml
diff --git a/assets/local_testing/nxOscDF5033-BGA.yaml b/assets/local_testing/nxOscDF5033-BGA.yaml
@@ -1,9 +1,10 @@
 assembly:
+  assem_level: scaffold
   sample_id: Oscheius_DF5033
   latin_name: to_provide_taxonomic_rank # Not currently in use
-  classT: nematode
-  asmVersion: 1
-  gevalType: DTOL
+  defined_class: nematode
+  assem_version: 1
+  project_id: DTOL
 reference_file: /workspace/treeval-curation/Oscheius_DF5033/genomic_data/Oscheius_DF5033.fa
 assem_reads:
   pacbio: /workspace/treeval-curation/Oscheius_DF5033/pacbio/
@@ -20,7 +21,8 @@ intron:
 telomere:
   teloseq: TTAGGG
 synteny:
-  synteny_genome_path: /workspace/treeval-curation/synteny/ # Will not exist
+  synteny_path: /nfs/treeoflife-01/teams/tola/users/dp24/treeval/TreeValTinyData/synteny/
+  synteny_genomes: "LaetiporusSulphureus"
 busco:
   lineages_path: /workspace/treeval-curation/busco/v5
   lineage: nematoda_odb10
diff --git a/assets/local_testing/nxOscDF5033.yaml b/assets/local_testing/nxOscDF5033.yaml
@@ -1,19 +1,27 @@
 assembly:
-  level: scaffold
+  assem_level: scaffold
+  assem_version: 1
   sample_id: Oscheius_DF5033
   latin_name: to_provide_taxonomic_rank
-  classT: nematode
-  asmVersion: 1
-  gevalType: DTOL
+  defined_class: nematode
+  project_id: DTOL
 reference_file: /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_DF5033/assembly/draft/DF5033.hifiasm.noTelos.20211120/DF5033.noTelos.hifiasm.purged.noCont.noMito.fasta
+map_order: length
 assem_reads:
-  pacbio: /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_DF5033/genomic_data/nxOscSpes1/pacbio/fasta/
-  hic: /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_DF5033/genomic_data/nxOscSpes1/hic-arima2/full/
-  supplementary: path
+  read_type: hifi
+  read_data: /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_DF5033/genomic_data/nxOscSpes1/pacbio/fasta/
+  supplementary_data: path
+hic_data:
+  hic_cram: /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_DF5033/genomic_data/nxOscSpes1/hic-arima2/full/
+  hic_aligner: minimap2
+kmer_profile:
+  # kmer_length will act as input for kmer_read_cov fastk and as the name of folder in profile_dir
+  kmer_length: 31
+  dir: /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_DF5033/genomic_data/nxOscSpes1/pacbio/
 alignment:
   data_dir: /lustre/scratch123/tol/resources/treeval/gene_alignment_data/
   common_name: "" # For future implementation (adding bee, wasp, ant etc)
-  geneset: "OscheiusTipulae.ASM1342590v1,CaenorhabditisElegans.WBcel235,Gae_host.Gae"
+  geneset_id: "OscheiusTipulae.ASM1342590v1,CaenorhabditisElegans.WBcel235,Gae_host.Gae"
   #Path should end up looking like "{data_dir}{classT}/{common_name}/csv_data/{geneset}-data.csv"
 self_comp:
   motif_len: 0
@@ -23,7 +31,8 @@ intron:
 telomere:
   teloseq: TTAGGG
 synteny:
-  synteny_genome_path: /lustre/scratch123/tol/resources/treeval/synteny/
+  synteny_path: /nfs/treeoflife-01/teams/tola/users/dp24/treeval/TreeValTinyData/synteny/
+  synteny_genomes: ""
 busco:
   lineages_path: /lustre/scratch123/tol/resources/busco/v5
   lineage: nematoda_odb10