Merge pull request #175 from Plant-Food-Research-Open/release/test

Updated PFR's Kraken 2 and started using _JAVA_OPTIONS in RUNASSEMBLYVISUALIZER
Plant-Food-Research-Open · Nov 5, 2024 · 8a7d09c · 8a7d09c
2 parents 10a4bd7 + ddc9aae
commit 8a7d09c
Showing 20 changed files with 49 additions and 58 deletions.
diff --git a/.github/version_checks.sh b/.github/version_checks.sh
@@ -1,5 +1,7 @@
 #!/usr/bin/env bash
 
+set -euo pipefail
+
 config_version=$(sed -n "/^\s*version\s*=\s*'/s/version//p" nextflow.config | tr -d "=[:space:]'")
 cff_version=$(sed -n '/^version: /s/version: //p' CITATION.cff | tr -d '[:space:]')
 

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,7 +3,7 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## v2.2.0dev - [04-Nov-2024]
+## v2.2.0 - [05-Nov-2024]
 
 ### `Added`
 
@@ -19,12 +19,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 10. Updated the tube map along with the tool list [#166](https://github.com/Plant-Food-Research-Open/assemblyqc/issues/166)
 11. Added Orthofinder [#167](https://github.com/Plant-Food-Research-Open/assemblyqc/issues/167)
 12. Changed order of tool options in the `nextflow.config` file
+13. Updated PFR's Kraken 2 database to `k2_pluspfp_20240904` [#170](https://github.com/Plant-Food-Research-Open/assemblyqc/issues/170)
+14. Increased memory requirement for Kraken 2 to `256.GB`
 
 ### `Fixed`
 
 1. Fixed a bug where Gene score distribution graph did not appear correctly [#125](https://github.com/Plant-Food-Research-Open/assemblyqc/issues/125)
 2. Increased memory requirement for `DNADIFF` to avoid SLURM OOM kills with exit code 2 [#141](https://github.com/Plant-Food-Research-Open/assemblyqc/issues/141)
 3. Documented the explicit use of the `-revision` parameter [#160](https://github.com/Plant-Food-Research-Open/assemblyqc/issues/160)
+4. Now using `_JAVA_OPTIONS` in the `RUNASSEMBLYVISUALIZER` module to avoid errors related to Java user preferences
 
 ### `Dependencies`
 

diff --git a/CITATION.cff b/CITATION.cff
@@ -25,7 +25,7 @@ authors:
   - family-names: "Deng"
     given-names: "Cecilia"
 title: "AssemblyQC: A Nextflow pipeline for reproducible reporting of assembly quality"
-version: 2.2.0dev
+version: 2.2.0
 date-released: 2024-07-30
 url: "https://github.com/Plant-Food-Research-Open/assemblyqc"
 doi: 10.1093/bioinformatics/btae477
diff --git a/conf/base.config b/conf/base.config
@@ -61,7 +61,7 @@ process {
         memory = { 512.GB * task.attempt }
     }
     withName:KRAKEN2 {
-        memory = { 200.GB * task.attempt }
+        memory = { 256.GB * task.attempt }
     }
     withName:BWA_MEM {
         time   = { 2.day  * task.attempt }

diff --git a/conf/test_full.config b/conf/test_full.config
@@ -10,14 +10,6 @@
 ----------------------------------------------------------------------------------------
 */
 
-process {
-    resourceLimits = [
-        cpus: 10,
-        memory: '32.GB',
-        time: '6.h'
-    ]
-}
-
 params {
     config_profile_name         = 'Full test profile'
     config_profile_description  = 'Full test dataset to check pipeline function'
@@ -33,24 +25,24 @@ params {
     ncbi_fcs_gx_tax_id          = 35717
     // ncbi_fcs_gx_db_path      = 'https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/r2023-01-24'
 
+    tidk_skip                   = false
+    tidk_repeat_seq             = 'TTTGGG'
+
     busco_skip                  = false
     busco_mode                  = 'genome'
     busco_lineage_datasets      = 'fungi_odb10 hypocreales_odb10'
 
-    tidk_skip                   = false
-    tidk_repeat_seq             = 'TTTGGG'
-
     lai_skip                    = false
 
     kraken2_skip                = true // Skipping this step as the dataset is humongous (126 GB). Please download the dataset manually
-    // kraken2_db_path          = 'https://genome-idx.s3.amazonaws.com/kraken/k2_pluspfp_20240112.tar.gz'
+    // kraken2_db_path          = 'https://genome-idx.s3.amazonaws.com/kraken/k2_pluspfp_20240904.tar.gz'
 
     hic                         = 'SRR8238190'
 
+    merqury_skip                = false
+
     synteny_skip                = false
     synteny_mummer_skip         = false
     synteny_plotsr_skip         = false
     synteny_xref_assemblies     = 'https://raw.githubusercontent.com/plant-food-research-open/assemblyqc/dev/assets/xrefsheet.csv'
-
-    merqury_skip                = false
 }
diff --git a/docs/usage.md b/docs/usage.md
@@ -65,7 +65,7 @@ BUSCO lineage databases are downloaded and updated by the BUSCO tool itself. A p
 
 ### Kraken 2
 
-Path to Kraken 2 database is provided by the `kraken2_db_path` parameter. This can be a URL to a public `.tar.gz` file such as `https://genome-idx.s3.amazonaws.com/kraken/k2_pluspfp_20240112.tar.gz`. The pipeline can download and extract the database. This is not the recommended practice owing to the size of the database. Rather, the database should be downloaded, extracted and stored in a read-only location. The path to that location can be provided by the `kraken2_db_path` parameter such as `/workspace/ComparativeDataSources/kraken2db/k2_pluspfp_20230314`.
+The path to the Kraken 2 database is provided by the `kraken2_db_path` parameter. This can be a URL to a public `.tar.gz` file such as `https://genome-idx.s3.amazonaws.com/kraken/k2_pluspfp_20240904.tar.gz`. The pipeline can download and extract the database. This is not the recommended practice owing to the size of the database. Rather, the database should be downloaded, extracted and stored in a read-only location. The path to that location can be provided by the `kraken2_db_path` parameter such as `/workspace/ComparativeDataSources/kraken2db/k2_pluspfp_20240904`.
 
 ## Other parameters
 
@@ -129,7 +129,7 @@ The data for these examples comes from: [umd.edu](https://obj.umiacs.umd.edu/mar
 All the modules have been tested to work on a single machine with 10 CPUs + 32 GBs of memory, except NCBI FCS GX and Kraken2. Their minimum requirements are:
 
 - NCBI FCS GX: 1 CPU + 512 GBs memory
-- Kraken2: 1 CPU + 200 GBs memory
+- Kraken2: 1 CPU + 256 GBs memory
 
 ## Running the pipeline
 

diff --git a/local_assemblyqc b/local_assemblyqc
@@ -17,6 +17,7 @@ nextflow run \
     -profile docker,test_full \
     -resume \
     $stub \
+    -c ../nxf-config/resources.config \
     --ncbi_fcs_gx_skip false \
     --ncbi_fcs_gx_db_path ../dbs/gxdb/test \
     --busco_download_path ../dbs/busco \

diff --git a/modules/local/generatekaryotype.nf b/modules/local/generatekaryotype.nf
@@ -35,8 +35,7 @@ process GENERATEKARYOTYPE {
         exit 0
     fi
 
-    tmp_file=\$(mktemp)
-    printf '%s\\n' "\${ref_seqs[@]}" > "\$tmp_file"
+    printf '%s\\n' "\${ref_seqs[@]}" > ${target_on_ref}.${seq_tag}.tmp
 
     if [[ $seq_tag = "all" ]];then
         cat $target_seq_len > filtered.target.seq.len
@@ -45,7 +44,7 @@ process GENERATEKARYOTYPE {
     fi
     cat filtered.target.seq.len | awk '{print \$1,\$2,"grey"}' OFS="\\t" > colored.filtered.target.seq.len
 
-    grep -w -f "\$tmp_file" $ref_seq_len > filtered.ref.seq.len
+    grep -w -f ${target_on_ref}.${seq_tag}.tmp $ref_seq_len > filtered.ref.seq.len
     cat filtered.ref.seq.len | awk '{print \$1,\$2,"black"}' OFS="\\t" > colored.filtered.ref.seq.len
 
     cat colored.filtered.ref.seq.len | sort -k1V > merged.seq.lengths
@@ -67,8 +66,6 @@ process GENERATEKARYOTYPE {
     | sed '/^\$/d' \
     | awk '{print "chr -",\$1,\$1,"0",\$2-1,\$3}' OFS="\\t" \
     > karyotype_target.tsv
-
-    rm "\$tmp_file"
     """
 
     stub:

diff --git a/modules/local/runassemblyvisualizer.nf b/modules/local/runassemblyvisualizer.nf
@@ -21,14 +21,10 @@ process RUNASSEMBLYVISUALIZER {
     assembly_tag=\$(echo $sample_id_on_tag | sed 's/.*\\.on\\.//g')
     file_name="${agp_assembly_file}"
 
-    cp -r /usr/src/3d-dna/ \\
-        3d-dna
+    mkdir user_home
+    export _JAVA_OPTIONS="-Djava.util.prefs.userRoot=user_prefs -Duser.home=user_home -Xms${avail_mem}g -Xmx${avail_mem}g"
 
-    sed -i \\
-        's/-Xms49152m -Xmx49152m/-Xms${avail_mem}g -Xmx${avail_mem}g/1' \\
-        3d-dna/visualize/juicebox_tools.sh
-
-    3d-dna/visualize/run-assembly-visualizer.sh \\
+    /usr/src/3d-dna/visualize/run-assembly-visualizer.sh \\
         -p false \\
         $agp_assembly_file $sorted_links_txt_file
 
@@ -42,7 +38,6 @@ process RUNASSEMBLYVISUALIZER {
 
     stub:
     if ( !task.memory ) { error '[RUNASSEMBLYVISUALIZER] Available memory not known. Specify process memory requirements to fix this.' }
-    def avail_mem = (task.memory.giga*0.8).intValue()
     """
     assembly_tag=\$(echo $sample_id_on_tag | sed 's/.*\\.on\\.//g')
     touch "\${assembly_tag}.hic"

diff --git a/nextflow.config b/nextflow.config
@@ -9,8 +9,10 @@
 // Global default params, used in configs
 params {
 
-    // Input options
+    // Input/output options
     input                               = null
+    outdir                              = null
+    email                               = null
 
     // Validation options
     check_sequence_duplicates           = true
@@ -77,10 +79,6 @@ params {
     // OrthoFinder options
     orthofinder_skip                    = true
 
-    // Output options
-    outdir                              = null
-    email                               = null
-
     // Boilerplate options
     publish_dir_mode                    = 'copy'
     email_on_fail                       = null
@@ -276,7 +274,7 @@ manifest {
     description     = """A Nextflow pipeline which evaluates assembly quality with multiple QC tools and presents the results in a unified html report."""
     mainScript      = 'main.nf'
     nextflowVersion = '!>=24.04.2'
-    version         = '2.2.0dev'
+    version         = '2.2.0'
     doi             = 'https://doi.org/10.1093/bioinformatics/btae477'
 }
 

diff --git a/pfr/params.json b/pfr/params.json
@@ -1,5 +1,7 @@
 {
     "input": "/workspace/assemblyqc/testdata/v2/assemblysheet.csv",
+    "outdir": "./results",
+    "email": null,
     "check_sequence_duplicates": true,
     "assemblathon_stats_n_limit": 100,
     "gfastats_skip": false,
@@ -9,21 +11,24 @@
     "ncbi_fcs_gx_tax_id": 3750,
     "ncbi_fcs_gx_db_path": "/workspace/ComparativeDataSources/NCBI/FCS/GX/r2023-01-24",
     "contamination_stops_pipeline": false,
-    "busco_skip": false,
-    "busco_mode": "genome",
-    "busco_lineage_datasets": "embryophyta_odb10 eudicots_odb10",
-    "busco_download_path": "/workspace/ComparativeDataSources/BUSCO/assemblyqc",
     "tidk_skip": false,
     "tidk_repeat_seq": "TTTAGGG",
     "tidk_filter_by_size": true,
     "tidk_filter_size_bp": 1000000,
+    "busco_skip": false,
+    "busco_mode": "genome",
+    "busco_lineage_datasets": "embryophyta_odb10 eudicots_odb10",
+    "busco_download_path": "/workspace/ComparativeDataSources/BUSCO/assemblyqc",
     "lai_skip": false,
     "kraken2_skip": false,
-    "kraken2_db_path": "/workspace/ComparativeDataSources/kraken2db/k2_pluspfp_20230314",
+    "kraken2_db_path": "/workspace/ComparativeDataSources/kraken2db/k2_pluspfp_20240904",
     "hic": null,
     "hic_skip_fastp": false,
     "hic_skip_fastqc": false,
     "hic_fastp_ext_args": "--qualified_quality_phred 20 --length_required 50",
+    "hic_samtools_ext_args": "-F 3852",
+    "merqury_skip": false,
+    "merqury_kmer_length": 21,
     "synteny_skip": false,
     "synteny_mummer_skip": false,
     "synteny_plotsr_skip": false,
@@ -32,12 +37,10 @@
     "synteny_mummer_plot_type": "both",
     "synteny_mummer_m2m_align": false,
     "synteny_mummer_max_gap": 1000000,
-    "synteny_mummer_min_bundle_size": 1000,
+    "synteny_mummer_min_bundle_size": 1000000,
     "synteny_plot_1_vs_all": false,
     "synteny_color_by_contig": true,
     "synteny_plotsr_seq_label": "Chr",
     "synteny_plotsr_assembly_order": "gddh13_v1p1 m9_v1 m9_v1_h1 m9_v1_h2",
-    "merqury_skip": false,
-    "outdir": "./results",
-    "email": null
+    "orthofinder_skip": false
 }
diff --git a/pfr_assemblyqc b/pfr_assemblyqc
@@ -27,7 +27,7 @@ shift $((OPTIND -1))
 
 ml unload perl
 ml apptainer/1.1
-ml nextflow/23.04.4
+ml nextflow/24.04.3
 
 export TMPDIR="/workspace/$USER/tmp"
 export APPTAINER_BINDPATH="$APPTAINER_BINDPATH,$TMPDIR:$TMPDIR,$TMPDIR:/tmp"
@@ -40,7 +40,7 @@ if [ $full_test_flag -eq 1 ]; then
         --ncbi_fcs_gx_skip false \
         --ncbi_fcs_gx_db_path "/workspace/ComparativeDataSources/NCBI/FCS/GX/r2023-01-24" \
         --kraken2_skip false \
-        --kraken2_db_path "/workspace/ComparativeDataSources/kraken2db/k2_pluspfp_20230314" \
+        --kraken2_db_path "/workspace/ComparativeDataSources/kraken2db/k2_pluspfp_20240904" \
         -resume \
         --outdir results
 else

diff --git a/tests/README.md b/tests/README.md
@@ -20,7 +20,7 @@ The GitHub [CI action](../.github/workflows/ci.yml) included with the pipeline c
 
 ## Testing with a Large Dataset at Plant&Food
 
-Before each release, the functionality of the entire pipeline is tested with a large dataset on the on-prem SLURM-based HPC at The New Zealand Institute of Plant and Food Research.
+Before each release, the functionality of the entire pipeline is tested with a large dataset on the on-prem SLURM-based HPC at The New Zealand Institute for Plant and Food Research.
 
 ## Testing Merqury Datasets
 

diff --git a/tests/hicparam/main.nf.test.snap b/tests/hicparam/main.nf.test.snap
@@ -68,7 +68,7 @@
                         "pigz": "2.3.4"
                     },
                     "Workflow": {
-                        "plant-food-research-open/assemblyqc": "v2.2.0dev"
+                        "plant-food-research-open/assemblyqc": "v2.2.0"
                     }
                 },
                 "stable paths": [

diff --git a/tests/invalid/main.nf.test.snap b/tests/invalid/main.nf.test.snap
@@ -35,7 +35,7 @@
                         "pigz": "2.3.4"
                     },
                     "Workflow": {
-                        "plant-food-research-open/assemblyqc": "v2.2.0dev"
+                        "plant-food-research-open/assemblyqc": "v2.2.0"
                     }
                 },
                 "stable paths": [

diff --git a/tests/minimal/main.nf.test.snap b/tests/minimal/main.nf.test.snap
@@ -35,7 +35,7 @@
                         "pigz": "2.3.4"
                     },
                     "Workflow": {
-                        "plant-food-research-open/assemblyqc": "v2.2.0dev"
+                        "plant-food-research-open/assemblyqc": "v2.2.0"
                     }
                 },
                 "stable paths": [

diff --git a/tests/noltr/main.nf.test.snap b/tests/noltr/main.nf.test.snap
@@ -50,7 +50,7 @@
                         "seqkit": "v2.8.0"
                     },
                     "Workflow": {
-                        "plant-food-research-open/assemblyqc": "v2.2.0dev"
+                        "plant-food-research-open/assemblyqc": "v2.2.0"
                     }
                 },
                 "stable paths": [

diff --git a/tests/orthofinder/main.nf.test.snap b/tests/orthofinder/main.nf.test.snap
@@ -41,7 +41,7 @@
                         "pigz": "2.3.4"
                     },
                     "Workflow": {
-                        "plant-food-research-open/assemblyqc": "v2.2.0dev"
+                        "plant-food-research-open/assemblyqc": "v2.2.0"
                     }
                 },
                 "stable paths": [
@@ -80,4 +80,4 @@
         },
         "timestamp": "2024-11-01T14:11:21.865104"
     }
-}
+}
diff --git a/tests/stub/main.nf.test.snap b/tests/stub/main.nf.test.snap
@@ -230,7 +230,7 @@
                         "untar": 1.34
                     },
                     "Workflow": {
-                        "plant-food-research-open/assemblyqc": "v2.2.0dev"
+                        "plant-food-research-open/assemblyqc": "v2.2.0"
                     }
                 },
                 "stable paths": [

diff --git a/tests/tiny/main.nf.test.snap b/tests/tiny/main.nf.test.snap
@@ -29,7 +29,7 @@
                         "pigz": "2.3.4"
                     },
                     "Workflow": {
-                        "plant-food-research-open/assemblyqc": "v2.2.0dev"
+                        "plant-food-research-open/assemblyqc": "v2.2.0"
                     }
                 },
                 "stable paths": [