diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 798d97fb..89b33330 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -35,9 +35,17 @@ jobs:
         with:
           version: "${{ matrix.NXF_VER }}"

-      - name: Run pipeline with test data
-        # TODO nf-core: You can customise CI pipeline run tests as required
-        # For example: adding multiple test runs with different parameters
+      - name: Download test data
+        # Download a fungal test data set that is full enough to show some real output.
+        run: |
+          curl https://dp24.cog.sanger.ac.uk/TreeValTinyData.tar.gz | tar xzf -
+
+      - name: Run RAPID pipeline with test data
+        # Remember that you can parallelise this by using strategy.matrix
+        run: |
+          nextflow run ${GITHUB_WORKSPACE} -entry RAPID -profile github_test,docker --outdir ./results-rapid
+
+      - name: Run FULL pipeline with test data
         # Remember that you can parallelise this by using strategy.matrix
         run: |
-          nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results
+          nextflow run ${GITHUB_WORKSPACE} -entry FULL -profile github_test,docker --outdir ./results-full
diff --git a/.gitpod.yml b/.gitpod.yml
index 85d95ecc..7cc49dcd 100644
--- a/.gitpod.yml
+++ b/.gitpod.yml
@@ -1,14 +1,148 @@
-image: nfcore/gitpod:latest
+image: gitpod/workspace-full-vnc
+# Update 7th September to reflect code base changes
+ports:
+  - name: JBrowseWeb
+    description: The JBrowse Webserver port
+    port: 3000
+    onOpen: open-browser
+    visibility: public
+
+  - name: HiGlass
+    description: The HiGlass port
+    port: 8989
+    onOpen: open-browser
+    visibility: public
+
+tasks:
+  - name: Install Singularity 3.11.4
+    # https://docs.sylabs.io/guides/3.0/user-guide/installation.html
+    init: |
+      cd /workspace/treeval-curation/
+
+      sudo apt-get update && sudo apt-get install -y \
+        build-essential \
+        libssl-dev \
+        uuid-dev \
+        libgpgme11-dev \
+        squashfs-tools \
+        libseccomp-dev \
+        pkg-config
+
+      mkdir -p $GOPATH/src/github.com/sylabs && \
+        cd $GOPATH/src/github.com/sylabs && \
+        wget https://github.com/sylabs/singularity/releases/download/v3.11.4/singularity-ce-3.11.4.tar.gz && \
+        tar -xzf singularity-ce-3.11.4.tar.gz && \
+        cd ./singularity-ce-3.11.4
+
+      ./mconfig && \
+        make -C ./builddir && \
+        sudo make -C ./builddir install
+
+  - name: Install Nextflow
+    # https://www.nextflow.io/docs/latest/getstarted.html
+    init: |
+      cd /workspace/treeval-curation/
+
+      wget -qO- https://get.nextflow.io | bash
+
+      chmod +x nextflow
+
+      nextflow self-update
+
+  - name: Install JBrowse2
+    # https://jbrowse.org/jb2/download/#jbrowse-cli-tools
+    command: |
+      cd /workspace/treeval-curation/
+
+      npm install -g @jbrowse/cli
+
+      jbrowse create jbrowse2
+
+      cd jbrowse2/
+
+      npx serve . -l 3000
+
+  - name: Install TreeVal Pipeline
+    # https://github.com/sanger-tol/treeval
+    init: |
+      cd /workspace/treeval-curation/
+
+      git clone -b pre-tag https://github.com/sanger-tol/treeval.git
+
+  - name: Install Curation Pretext
+    # https://github.com/sanger-tol/curationpretext
+    init: |
+      cd /workspace/treeval-curation/
+
+      git clone -b dev https://github.com/sanger-tol/curationpretext.git
+
+  - name: Install HiGlass
+    # https://docs.higlass.io/tutorial.html
+    init: |
+      cd /workspace/treeval-curation/
+
+      pip install higlass-manage
+
+      higlass-manage start
+
+  - name: Alias Nextflow
+    init: |
+      cd /workspace/treeval-curation/
+
+      echo "alias nextflow_cmd='/workspace/treeval-curation/nextflow'" >> ~/.bashrc
+
+      source ~/.bashrc
+
+  - name: Download BUSCO for nematode
+    init: |
+      cd /workspace/treeval-curation/
+
+      curl https://dp24.cog.sanger.ac.uk/Busco.tar.gz | tar xzf -
+
+  - name: Download Nematode Test data and make synteny
+    init: |
+      cd /workspace/treeval-curation/
+
+      curl https://dp24.cog.sanger.ac.uk/Nematode.tar.gz | tar xzf -
+
+      mkdir -p /workspace/treeval-curation/synteny/nematode/
+
+      cp /workspace/treeval-curation/Oscheius_DF5033/genomic_data/Oscheius_DF5033.fa /workspace/treeval-curation/synteny/nematode/SuperNematode.fa
+
+  - name: Download Lepidoptera data
+    init: |
+      cd /workspace/treeval-curation/
+
+      curl https://dp24.cog.sanger.ac.uk/ilTorViri5.tar.gz | tar xzf -
+
+  - name: Download Genomic Alignment data
+    init: |
+      cd /workspace/treeval-curation/
+
+      curl https://dp24.cog.sanger.ac.uk/AlignmentData.tar.gz | tar xzf -
+
+  - name: Open Tutorial Page
+    init: |
+      gp preview https://bga23.org/treeval-curation/Tutorial/
+
+github:
+  prebuilds:
+    # enable for the master/default branch (defaults to true)
+    master: true
+    # add a "Review in Gitpod" button as a comment to pull requests (defaults to true)
+    addComment: true
+    # add a "Review in Gitpod" button to pull requests (defaults to false)
+    addBadge: true
+    # add a label once the prebuild is ready to pull requests (defaults to false)
+    addLabel: prebuilt-in-gitpod

 vscode:
   extensions: # based on nf-core.nf-core-extensionpack
     - codezombiech.gitignore # Language support for .gitignore files
-    # - cssho.vscode-svgviewer # SVG viewer
     - esbenp.prettier-vscode # Markdown/CommonMark linting and style checking for Visual Studio Code
-    - eamodio.gitlens # Quickly glimpse into whom, why, and when a line or code block was changed
     - EditorConfig.EditorConfig # override user/workspace settings with settings found in .editorconfig files
-    - Gruntfuggly.todo-tree # Display TODO and FIXME in a tree view in the activity bar
     - mechatroner.rainbow-csv # Highlight columns in csv files in different colors
-    # - nextflow.nextflow # Nextflow syntax highlighting
+    - nextflow.nextflow # Nextflow syntax highlighting
     - oderwat.indent-rainbow # Highlight indentation level
     - streetsidesoftware.code-spell-checker # Spelling checker for source code
diff --git a/.nf-core.yml b/.nf-core.yml
index 7442cda9..c96cc78d 100644
--- a/.nf-core.yml
+++ b/.nf-core.yml
@@ -2,6 +2,7 @@ repository_type: pipeline
 lint:
   files_exist:
     - assets/nf-core-treeval_logo_light.png
+    - conf/test_full.config
     - docs/images/nf-core-treeval_logo_light.png
     - docs/images/nf-core-treeval_logo_dark.png
   files_unchanged:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e7ee3034..79e4e815 100755
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -25,6 +25,8 @@ The essential pathways of the gEVAL pipeline have now been converted to Nextflow

 - Subworkflow to generate syntenic alignments to high quality genomes.
 - Subworkflow to generate tracks containing telomeric sites.
 - Custom Groovy for reporting to provide file metrics and resource usage.
+- Citations and all docs (including walkthroughs).
+- Added gitpod.yml for running in the cloud. This is the tutorial written for BGA23.

 ### Parameters

@@ -36,24 +38,58 @@ The essential pathways of the gEVAL pipeline have now been converted to Nextflow

 Note, since the pipeline is using Nextflow DSL2, each process will be run with its own Biocontainer. This means that on occasion it is entirely possible for the pipeline to be using different versions of the same tool. However, the overall software dependency changes compared to the last release have been listed below for reference.

-| Module                         | Old Version | New Versions     |
-| ------------------------------ | ----------- | ---------------- |
-| bedtools                       | -           | 2.31.0           |
-| busco                          | -           | 5.4.3            |
-| bwa-mem2                       | -           | 2.2.1            |
-| cat                            | -           | 2.3.4            |
-| cooler                         | -           | 0.9.2            |
-| gnu-sort                       | -           | 8.25             |
-| minimap2 + samtools            | -           | 2.24 + 1.14      |
-| miniprot                       | -           | 0.11--he4a0461_2 |
-| mummer                         | -           | 3.23             |
-| paftools (minimap2 + samtools) | -           | 2.24 + 1.14      |
-| pretextmap + samtools          | -           | 0.1.9 + 1.17     |
-| samtools                       | -           | 1.17             |
-| seqtk                          | -           | 1.4              |
-| tabix                          | -           | 1.11             |
-| ucsc                           | -           | 377              |
-| windowmasker (blast)           | -           | 2.14.0           |
+| Module                                 | Old Version | New Versions     |
+| -------------------------------------- | ----------- | ---------------- |
+| assign_ancestral ( pandas + Python )   | -           | 1.5.2 + 3.9      |
+| bamtobed_sort ( bedtools + samtools )  | -           | 2.31.0 + 1.17    |
+| bedtools                               | -           | 2.31.0           |
+| busco                                  | -           | 5.4.3            |
+| bwa-mem2                               | -           | 2.2.1            |
+| cat                                    | -           | 2.3.4            |
+| chunk_fasta ( pyfasta )                | -           | 0.5.2-1          |
+| cooler                                 | -           | 0.9.2            |
+| concat_block ( coreutils )             | -           | 9.1              |
+| concat_mummer ( coreutils )            | -           | 9.1              |
+| cram_filter_align_bwamem2_fixmate_sort | -           |                  |
+| ^ ( samtools + bwamem2 ) ^             | -           | 1.16.1 + 2.2.1   |
+| extract_ancestral ( python )           | -           | 3.9              |
+| extract_buscogene ( coreutils )        | -           | 9.1              |
+| extract_cov_id ( coreutils )           | -           | 9.1              |
+| extract_repeat ( perl )                | -           | 5.26.2           |
+| extract_telo ( coreutils )             | -           | 9.1              |
+| find_telomere_regions ( gcc )          | -           | 7.1.0            |
+| find_telomere_windows ( java-jdk )     | -           | 8.0.112          |
+| findhalfcoverage ( python )            | -           | 3.9              |
+| gap_length ( coreutils )               | -           | 9.1              |
+| generate_cram_csv ( samtools )         | -           | 1.17             |
+| get_largest_scaff ( coreutils )        | -           | 9.1              |
+| get_paired_contact_bed ( coreutils )   | -           | 9.1              |
+| get_synteny_genomes ( coreutils )      | -           | 9.1              |
+| getminmaxpunches ( coreutils )         | -           | 9.1              |
+| graphoverallcoverage ( perl )          | -           | 5.26.2           |
+| gnu-sort                               | -           | 8.25             |
+| juicer_tools_pre ( java-jdk )          | -           | 8.0.112          |
+| makecmap_cmap2bed ( python )           | -           | 3.9              |
+| makecmap_fa2cmapmulticolor ( perl )    | -           | 5.26.2           |
+| makecmap_renamecmapids ( perl )        | -           | 5.26.2           |
+| minimap2 + samtools                    | -           | 2.24 + 1.14      |
+| miniprot                               | -           | 0.11--he4a0461_2 |
+| mummer                                 | -           | 3.23             |
+| paf_to_bed ( coreutils )               | -           | 9.1              |
+| paftools ( minimap2 + samtools )       | -           | 2.24 + 1.14      |
+| pretextmap + samtools                  | -           | 0.1.9 + 1.17     |
+| reformat_intersect ( coreutils )       | -           | 9.1              |
+| reformat_ids ( coreutils )             | -           | 9.1              |
+| replace_dots ( coreutils )             | -           | 9.1              |
+| samtools                               | -           | 1.17             |
+| selfcomp_alignmentblocks ( python )    | -           | 3.9              |
+| selfcomp_mapids ( python )             | -           | 3.9              |
+| selfcomp_mummer2bed ( python )         | -           | 3.9              |
+| selfcomp_splitfasta ( perl-bioperl )   | -           | 1.7.8-1          |
+| seqtk                                  | -           | 1.4              |
+| tabix                                  | -           | 1.11             |
+| ucsc                                   | -           | 377              |
+| windowmasker (blast)                   | -           | 2.14.0           |

 ### Fixed
diff
--git a/CITATIONS.md b/CITATIONS.md index 460acae6..8cfb197f 100755 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -26,7 +26,7 @@ > Abdennur, N. and Mirny, L.A. 2019. ‘Cooler: Scalable storage for hi-C data and other genomically labeled arrays’, Bioinformatics, 36(1), pp. 311–316. doi:10.1093/bioinformatics/btz540. -- [Find Telomere]() +- [Find Telomere](https://github.com/VGP/vgp-assembly/tree/master/pipeline/telomere) > VGP. 2022. vgp-assembly telomere [online]. https://github.com/VGP/vgp-assembly/tree/master/pipeline/telomere. (Accessed on 28th February 2023). @@ -95,6 +95,7 @@ > Morgulis, A., et al. 2006. WindowMasker: window-based masker for sequenced genomes. Bioinformatics. 22(2). pp.134–141. doi: 10.1093/bioinformatics/bti774. - [lep_busco_painter](https://www.biorxiv.org/content/10.1101/2023.05.12.540473v1.full.pdf) + > Wright, C. et al. 2023. Chromosome evolution in Lepidoptera. bioRxiv. 540473. https://doi.org/10.1101/2023.05.12.540473 ## Software packaging/containerisation tools diff --git a/README.md b/README.md index 4e0cf096..8bc073a8 100755 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ nextflow run main.nf -profile singularity --input treeval.yaml -entry {FULL|RAPI An example treeval.yaml can be found [here](assets/local_testing/nxOscDF5033.yaml). -Further documentation about the pipeline can be found in the following files: [usage](https://nf-co.re/treeval/usage), [parameters](https://nf-co.re/treeval/parameters) and [output](https://nf-co.re/treeval/output). +Further documentation about the pipeline can be found in the following files: [usage](https://pipelines.tol.sanger.ac.uk/treeval/dev/usage), [parameters](https://pipelines.tol.sanger.ac.uk/treeval/dev/parameters) and [output](https://pipelines.tol.sanger.ac.uk/treeval/dev/output). > **Warning:** > Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those diff --git a/assets/full_s3_treeval_test.yaml b/assets/full_s3_treeval_test.yaml deleted file mode 100755 index 911260b0..00000000 --- a/assets/full_s3_treeval_test.yaml +++ /dev/null @@ -1,25 +0,0 @@ -assembly: - level: scaffold - sample_id: nxOscDoli1 - classT: nematode - asmVersion: PB.a1 - dbVersion: "1" - gevalType: DTOL -reference_file: https://tolit.cog.sanger.ac.uk/test-data/Gae_host/assembly/DTOL_nxOscDoli1_1_FULL.fa -assem_reads: - pacbio: path - hic: path - supplementary: path -alignment: - data_dir: https://tolit.cog.sanger.ac.uk/test-data/Gae_host/genomic_data/ - common_name: "" # For future implementation (adding bee, wasp, ant etc) - geneset: "s3_Gae_Host.Gae" - #Path should end up looking like "{data_dir}{classT}/{common_name}/csv_data/{geneset}-data.csv" -self_comp: - motif_len: 0 - mummer_chunk: 10 -synteny: - synteny_genome_path: "" # No Syntenic Data for Nematode -outdir: "NEEDS TESTING" -intron: - size: "50k" diff --git a/assets/github_testing/TreeValTinyTest-Local.yaml b/assets/github_testing/TreeValTinyTest-Local.yaml new file mode 100755 index 00000000..532d0fd6 --- /dev/null +++ b/assets/github_testing/TreeValTinyTest-Local.yaml @@ -0,0 +1,31 @@ +assembly: + level: scaffold + sample_id: grTriPseu1 + latin_name: to_provide_taxonomic_rank + classT: fungi + asmVersion: 1 + dbVersion: "1" + gevalType: DTOL +reference_file: /nfs/treeoflife-01/teams/tola/users/dp24/treeval/TreeValTinyData/assembly/draft/grTriPseu1.fa +assem_reads: + pacbio: /nfs/treeoflife-01/teams/tola/users/dp24/treeval/TreeValTinyData/genomic_data/pacbio/ + hic: /nfs/treeoflife-01/teams/tola/users/dp24/treeval/TreeValTinyData/genomic_data/hic-arima/ + supplementary: path +alignment: + data_dir: /nfs/treeoflife-01/teams/tola/users/dp24/treeval/TreeValTinyData/gene_alignment_data/ + common_name: "" # For future implementation (adding bee, wasp, ant etc) + geneset: "LaetiporusSulphureus.gfLaeSulp1" + #Path should end up looking like "{data_dir}{classT}/{common_name}/csv_data/{geneset}-data.csv" +self_comp: + motif_len: 0 + mummer_chunk: 10 +synteny: + synteny_genome_path: /nfs/treeoflife-01/teams/tola/users/dp24/treeval/TreeValTinyData/synteny/ +outdir: "NEEDS TESTING" +intron: + size: "50k" +telomere: + teloseq: TTAGGG +busco: + lineages_path: /nfs/treeoflife-01/teams/tola/users/dp24/treeval/TreeValTinyData/busco/subset/ + lineage: fungi_odb10 diff --git a/assets/github_testing/TreeValTinyTest.yaml b/assets/github_testing/TreeValTinyTest.yaml new file mode 100755 index 00000000..d9152241 --- /dev/null +++ b/assets/github_testing/TreeValTinyTest.yaml @@ -0,0 +1,31 @@ +assembly: + level: scaffold + sample_id: grTriPseu1 + latin_name: to_provide_taxonomic_rank + classT: fungi + asmVersion: 1 + dbVersion: "1" + gevalType: DTOL +reference_file: /home/runner/work/treeval/treeval/TreeValTinyData/assembly/draft/grTriPseu1.fa +assem_reads: + pacbio: /home/runner/work/treeval/treeval/TreeValTinyData/genomic_data/pacbio/ + hic: /home/runner/work/treeval/treeval/TreeValTinyData/genomic_data/hic-arima/ + supplementary: path +alignment: + data_dir: /home/runner/work/treeval/treeval/TreeValTinyData/gene_alignment_data/ + common_name: "" # For future implementation (adding bee, wasp, ant etc) + geneset: "LaetiporusSulphureus.gfLaeSulp1" + #Path should end up looking like "{data_dir}{classT}/{common_name}/csv_data/{geneset}-data.csv" +self_comp: + motif_len: 0 + mummer_chunk: 10 +synteny: + synteny_genome_path: /home/runner/work/treeval/treeval/TreeValTinyData/synteny/ +outdir: "NEEDS 
TESTING" +intron: + size: "50k" +telomere: + teloseq: TTAGGG +busco: + lineages_path: /home/runner/work/treeval/treeval/TreeValTinyData/busco/subset/ + lineage: fungi_odb10 diff --git a/assets/local_testing/nxOscDF5033-BGA.yaml b/assets/local_testing/nxOscDF5033-BGA.yaml new file mode 100755 index 00000000..ecb2ff1a --- /dev/null +++ b/assets/local_testing/nxOscDF5033-BGA.yaml @@ -0,0 +1,26 @@ +assembly: + sample_id: Oscheius_DF5033 + latin_name: to_provide_taxonomic_rank # Not currently in use + classT: nematode + asmVersion: 1 + gevalType: DTOL +reference_file: /workspace/treeval-curation/Oscheius_DF5033/genomic_data/Oscheius_DF5033.fa +assem_reads: + pacbio: /workspace/treeval-curation/Oscheius_DF5033/pacbio/ + hic: /workspace/treeval-curation/Oscheius_DF5033/hic-arima2/ + supplementary: path # Not currently in use +alignment: + data_dir: /workspace/treeval-curation/gene_alignment_data/ + geneset: "OscheiusTipulae.ASM1342590v1,CaenorhabditisElegans.WBcel235,Gae_host.Gae" +self_comp: + motif_len: 0 + mummer_chunk: 10 +intron: + size: "50k" +telomere: + teloseq: TTAGGG +synteny: + synteny_genome_path: /workspace/treeval-curation/synteny/ # Will not exist +busco: + lineages_path: /workspace/treeval-curation/busco/v5 + lineage: nematoda_odb10 diff --git a/assets/local_testing/nxOscDF5033.yaml b/assets/local_testing/nxOscDF5033.yaml index 69d61dc7..8c2547c1 100755 --- a/assets/local_testing/nxOscDF5033.yaml +++ b/assets/local_testing/nxOscDF5033.yaml @@ -1,5 +1,4 @@ assembly: - sizeClass: S # S if {genome => 4Gb} else L level: scaffold sample_id: Oscheius_DF5033 latin_name: to_provide_taxonomic_rank diff --git a/assets/local_testing/nxOscSUBSET.yaml b/assets/local_testing/nxOscSUBSET.yaml index 64082b6e..fcc726b1 100755 --- a/assets/local_testing/nxOscSUBSET.yaml +++ b/assets/local_testing/nxOscSUBSET.yaml @@ -1,5 +1,4 @@ assembly: - sizeClass: S # S if {genome => 4Gb} else L level: scaffold sample_id: OscheiusSUBSET latin_name: to_provide_taxonomic_rank diff --git a/assets/s3_treeval_test.yaml b/assets/s3_treeval_test.yaml deleted file mode 100755 index d3ddb32a..00000000 --- a/assets/s3_treeval_test.yaml +++ /dev/null @@ -1,25 +0,0 @@ -assembly: - level: scaffold - sample_id: nxOscDoli1 - classT: nematode - asmVersion: PB.a1 - dbVersion: "1" - gevalType: DTOL -reference_file: https://tolit.cog.sanger.ac.uk/test-data/Gae_host/assembly/DTOL_nxOscDoli1_1_FULL.fa -assem_reads: - pacbio: path - hic: path - supplementary: path -alignment: - data_dir: /lustre/scratch123/tol/teams/grit/dp24/treeval2/treeval/assets/ - common_name: "" # For future implementation (adding bee, wasp, ant etc) - geneset: "s3_Gae_Host.Gae" - #Path should end up looking like "{data_dir}{classT}/{common_name}/csv_data/{geneset}-data.csv" -self_comp: - motif_len: 0 - mummer_chunk: 10 -synteny: - synteny_genome_path: "" # No Syntenic Data for Nematode -outdir: "NEEDS TESTING" -intron: - size: "50k" diff --git a/bin/treeval-dataprep/GA_csv_gen.py b/bin/treeval-dataprep/GA_csv_gen.py new file mode 100644 index 00000000..adb3eaa5 --- /dev/null +++ b/bin/treeval-dataprep/GA_csv_gen.py @@ -0,0 +1,77 @@ +""" +---- Gene Alignment CSV Generator ---- + By Damon-Lee Pointon (dp24) + +This script generates the csv files + required by TreeVal (the gene alignment + sub-workflows). 
+
+Script generates a csv per organism.assembly
+    in their respective classT folder
+
+USAGE:
+    python3 GA_csv_gen.py /path/to/gene_alignment_data/
+
+TODO: ADD argparse + version data
+"""
+
+import argparse
+import sys
+import os
+from os import path
+
+
+def list_dir(dir_loc: str):
+    return [os.path.join(dir_loc, i) for i in os.listdir(dir_loc) if path.isdir(os.path.join(dir_loc, i))]
+
+
+def get_file_list(root: str):
+    return [os.path.join(path, name) for path, subdirs, files in os.walk(root) for name in files]
+
+
+def list_2_dict(file_list: list):
+    file_dict = {}
+    path_list = []
+    for i in file_list:
+        path_list = i.split("/")
+        if path_list[-1].lower() in ["readme.txt", "readme"]:
+            pass
+        else:
+            file_dict[path_list[-1]] = [path_list[-3], path_list[-2], i]
+    return file_dict, path_list[-3]
+
+
+def save_data(dict_of_data: dict, save_loc: str, org_accession: str):
+    save_path = f"{save_loc}/csv_data/{org_accession}-data.csv"
+    if os.path.exists(save_path):
+        os.remove(save_path)
+    else:
+        pass
+    print(f"Generating CSV for:\t{org_accession}\nSave Path:\t\t{save_path}")
+    with open(save_path, "w+") as new_csv:
+        new_csv.write("org,type,data_file")
+        for x, y in dict_of_data.items():
+            new_csv.write(f"\n{y[0]},{y[1]},{y[2]}")
+
+
+def main():
+    gene_alignment_dir = sys.argv[1]
+    clade_list = list_dir(gene_alignment_dir)
+    master_list = []
+
+    for i in clade_list:
+        org_list = list_dir(i)
+        for ii in org_list:
+            accession_list = list_dir(ii)
+            for iii in accession_list:
+                print(f'============> {iii.split("/")[-1]} -- {i.split("/")[-1]}')
+                data_list = list_dir(iii)
+                master_list = []
+                master_list += [get_file_list(j) for j in data_list]
+                file_dict, org = list_2_dict([item for sublist in master_list for item in sublist])
+                save_data(file_dict, i, org)
+    print("GA_csv_gen: Complete")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bin/treeval-dataprep/GA_data_prep.py b/bin/treeval-dataprep/GA_data_prep.py
new file mode 100644
index 00000000..0c72db09
--- /dev/null
+++ b/bin/treeval-dataprep/GA_data_prep.py
@@ -0,0 +1,231 @@
+#!/usr/local/bin/python
+
+PRINT_ERROR = """Does not exist\n
+        Get module installed before import attempt\n
+        If running server side then contact your admin"""
+
+try:
+    import sys
+
+    if sys.version_info < (3, 6):
+        raise Exception(
+            """Must be using Python 3.6 for the full
+            functionality of this script"""
+        )
+    print("You're using at least Version 3.6, you are good to go...")
+except ImportError:
+    print(f"sys not imported \n {PRINT_ERROR}")
+    sys.exit(0)
+
+try:
+    import os
+
+    print("os imported")
+except ImportError:
+    print(f"os not imported \n {PRINT_ERROR}")
+    sys.exit(0)
+
+try:
+    import argparse
+
+    print("argparse imported")
+except ImportError:
+    print(f"argparse not imported \n {PRINT_ERROR}")
+    sys.exit(0)
+
+try:
+    import re
+
+    print("regex imported")
+except ImportError:
+    print(f"re not imported \n {PRINT_ERROR}")
+    sys.exit(0)
+
+DOCSTRING = """
+----------------------------------------------------------------
+                Gene Alignment Data Prep
+                By Damon-Lee Pointon (dp24)
+----------------------------------------------------------------
+This script takes an input file and chunks it into 1000 sequence
+files for cds and rna files or a user defined number (default 100)
+for pep and cdna sequences.
+
+----------------------------------------------------------------
+Usage:
+
+GA_data_prep.py {name}-{accession}.{datatype}.fasta {ncbi|ens} {chunk}
+
+File name example is: ThalassiosiraPseudonana-ASM14940v2.rna.fasta
+----------------------------------------------------------------
+"""
+
+
+def get_command_args(args=None):
+    parser = argparse.ArgumentParser(
+        prog="GA_data_prep.py (Python 3)", description=DOCSTRING, formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+
+    parser.add_argument("FASTA", action="store", help="Input unzipped fasta file", type=str)
+
+    parser.add_argument("DB", action="store", help="database of origin", choices=["ncbi", "ens"], type=str)
+
+    parser.add_argument("CHUNK", action="store", nargs="?", help="Chunk size for pep and cdna", default=100, type=int)
+
+    parser.add_argument("-v", "--version", action="version", version="v7.0.0")
+
+    options = parser.parse_args(args)
+    return options
+
+
+def entryfunction(file, dtype, org, entryper, filesavedto, options):
+    """
+    The entryfunction function splits a FASTA file into a defined
+    number of entries per file: 1000 entries for cds and rna files,
+    and a user-defined number (default 100) for pep and cdna files.
+    :param seq_file:
+    :param org:
+    :param directory:
+    :param entryper:
+    :param option:
+    """
+    print("Entryfunction called")
+    count = 0
+    filecounter = 0
+    entry = []
+    print(file)
+    if os.path.exists(file):
+        print(f"File found at {file}")
+        with open(file, "r") as filetoparse:
+            print("Renaming headers")
+            for name, seq in read_fasta(filetoparse):
+                new_name = massage(name, options)
+                # print(new_name)  # Here as a manual check of headers
+                nameseq = new_name, seq
+                entry.append(nameseq)
+                count += 1
+
+                if count == entryper:
+                    filecounter += 1
+                    with open(f"{filesavedto}/{org}{filecounter}{dtype}.MOD.fa", "w") as done:
+                        for head, body in entry:
+                            done.write(f"{head}\n{body}\n")
+                        count = 0
+                        entry = []
+                    print(f"File saved: -- {filesavedto}/{org}{filecounter}{dtype}.MOD.fa")
+
+            filecounter += 1
+
+            with open(f"{filesavedto}/{org}{filecounter}{dtype}.MOD.fa", "w") as done:
+                for head, body in entry:
+                    done.write(f"{head}\n{body}\n")
+                entry = []
+
+            print(f"File saved: -- {filesavedto}/{org}{filecounter}{dtype}.MOD.fa")
+
+
+def massage(name, options):
+    """
+    A function to 'massage' the sequence headers into a more human
+    readable style
+    :param option:
+    :param name:
+    :return name:
+    """
+
+    if name.startswith(">"):
+        if options.DB == "ncbi":
+            gene_symbol = re.search(r"gene=([A-Z]\w+)", name)
+            ens_code = re.search(r"GeneID:([1-9])\w+", name)
+        else:
+            gene_symbol = re.search(r"symbol:(\S+)", name)
+            ens_code = re.search(r"ENS(\w+)T(\w+.\d+)", name)
+
+        if gene_symbol:
+            gene_symbol = gene_symbol.group(1)
+        elif gene_symbol is None:
+            gene_symbol = re.search(r"gene:(\S+)", name)
+
+            if gene_symbol:
+                gene_symbol = gene_symbol.group(1)
+
+            elif gene_symbol is None:
+                gene_symbol = re.search(r"PREDICTED: (.+) \[", name)
+                if gene_symbol:
+                    gene_symbol = gene_symbol.group(1)
+                    gene_symbol = gene_symbol.split()
+                    gene_symbol = "_".join(gene_symbol)
+                else:
+                    gene_symbol = "MissingInfo"
+
+        if ens_code:
+            ens_code = ens_code.group(0)
+
+        elif ens_code is None:
+            ens_code = re.search(r">(\S+)", name)
+            if ens_code:
+                ens_code = ens_code.group(1)
+            elif ens_code is None:
+                ens_code = "NoEnsCode"
+
+        # print('Gene Symbol found as: %s', gene_symbol)
+        # print('Ens Code found as: %s', ens_code)
+        if gene_symbol == "MissingInfo":
+            # print('MissingInfo replaced with %s', ens_code)
+            gene_symbol = ens_code
f">{gene_symbol}({ens_code})" + + else: + print("Somethings gone wrongs, headers are wrong") + sys.exit(0) + + return name + + +def read_fasta(filetoparse): + """ + A function which opens and splits a fasta into name and seq. + :param filetoparse: + """ + print("Read_fasta called") + counter = 0 + name, seq = None, [] + + for line in filetoparse: + line = line.rstrip() + + if line.startswith(">"): + if name: + yield name, "".join(seq) + name, seq = line, [] + else: + seq.append(line) + + if name: + yield name, "".join(seq) + counter += 1 + + +def main(): + options = get_command_args() + file = options.FASTA + dtype = file.split(".")[1] + org = file.split(".")[0] + + print(f"WORKING ON:\t\t{dtype}--{org}") + + directory = f"./{org.split('-')[0]}/{org.split('-')[0]}.{org.split('-')[1]}/{dtype}" + + try: + os.makedirs(directory, mode=0o777) + except: + print("probably already exists") + + entryper = [1000 if dtype in ["cds", "rna"] else options.CHUNK] + + print(f"Records per file:\t{int(entryper[0])}") + entryfunction(file, dtype, org.split("-")[0], int(entryper[0]), directory, options) + + +if __name__ == "__main__": + main() diff --git a/conf/base.config b/conf/base.config index 4e7932ac..55b5c5ec 100755 --- a/conf/base.config +++ b/conf/base.config @@ -74,7 +74,7 @@ process { withName:SAMTOOLS_MERGE { cpus = { check_max( 16 * 1, 'cpus' ) } - memory = { check_max( 50.GB * task.attempt, 'memory') } + memory = { check_max( 150.GB * task.attempt, 'memory') } } // RESOURCES: MEMORY INTENSIVE STEPS, SOFTWARE TO BE UPDATED TO COMBAT THIS @@ -93,16 +93,24 @@ process { // RESOURCES: CHANGES TO FREQUENT FAILURES BELOW THIS MEM POINT withName: '.*:.*:GENE_ALIGNMENT:.*:(MINIPROT_ALIGN|MINIMAP2_ALIGN)' { - memory = { check_max( 100.GB * Math.ceil( task.attempt * 1.5 ) , 'memory' ) } - time = { check_max( 12.h * task.attempt, 'time' ) } + memory = { check_max( 50.GB * Math.ceil( task.attempt * 1.5 ) , 'memory' ) } + time = { check_max( 10.h * task.attempt, 'time' ) } } + // Standard parameters, covers most insecta withName: '.*:.*:LONGREAD_COVERAGE:(MINIMAP2_ALIGN|MINIMAP2_ALIGN_SPLIT)' { cpus = { check_max( 16 * 1, 'cpus' ) } - memory = { check_max( 300.GB * task.attempt, 'memory' ) } - time = { check_max( 36.h * task.attempt, 'time' ) } + memory = { check_max( 100.GB * task.attempt, 'memory' ) } + time = { check_max( 18.h * task.attempt, 'time' ) } } + // For Large complex genomes > 4Gb + // withName: '.*:.*:LONGREAD_COVERAGE:(MINIMAP2_ALIGN|MINIMAP2_ALIGN_SPLIT)' { + //cpus = { check_max( 20 * 1, 'cpus' ) } + //memory = { check_max( 400.GB * task.attempt, 'memory' ) } + // time = { check_max( 300.h * task.attempt, 'time' ) } + //} + withName: '.*:.*:LONGREAD_COVERAGE:SAMTOOLS_SORT' { cpus = { check_max( 8 * 1, 'cpus' ) } } @@ -135,7 +143,7 @@ process { withName: SNAPSHOT_HRES { cpus = { check_max( 1 * task.attempt, 'cpus' ) } - memory = { check_max( 50.GB * task.attempt, 'memory' ) } + memory = { check_max( 50.GB * task.attempt, 'memory' ) } } withName: JUICER_TOOLS_PRE { @@ -144,12 +152,12 @@ process { } withName: BWAMEM2_INDEX { - memory = { check_max( 40.GB * task.attempt, 'memory' ) } + memory = { check_max( 50.GB * task.attempt, 'memory' ) } } // add a cpus 16 if bam.size() >= 50GB withName: '(SAMTOOLS_MARKDUP|BAMTOBED_SORT)' { - cpus = { check_max( 12 * 1, 'cpus' ) } + cpus = { check_max( 12 * 1, 'cpus' ) } memory = { check_max( 100.GB * task.attempt, 'memory' ) } } @@ -159,8 +167,15 @@ process { } withName: BUSCO { - cpus = { check_max( 6 * task.attempt, 'cpus' ) } - memory = { check_max( 
-        time   = { check_max( 12.h * task.attempt, 'time' ) }
+        cpus   = { check_max( 16 * task.attempt, 'cpus' ) }
+        memory = { check_max( 50.GB * task.attempt, 'memory' ) }
+        time   = { check_max( 20.h * task.attempt, 'time' ) }
    }
+
+    // Large Genomes > 4Gb
+    // withName: BUSCO {
+    //     cpus   = { check_max( 30 * task.attempt, 'cpus' ) }
+    //     memory = { check_max( 120.GB * task.attempt, 'memory' ) }
+    //     time   = { check_max( 300.h * task.attempt, 'time' ) }
+    // }
 }
diff --git a/conf/test_full.config b/conf/farm_test.config
similarity index 64%
rename from conf/test_full.config
rename to conf/farm_test.config
index 5d6a9617..68b15e3a 100755
--- a/conf/test_full.config
+++ b/conf/farm_test.config
@@ -5,7 +5,7 @@
    Defines input files and everything required to run a full size pipeline test.

    Use as follows:
-        nextflow run sanger-tol/treeval -profile test_full,singularity,sanger
+        nextflow run sanger-tol/treeval -profile farm_test,singularity,sanger

    On LSF / tol farm:
-        bsub -Is -tty -e error -o out -n 2 -q oversubscribed -M4000 -R'select[mem>4000] rusage[mem=4000] span[hosts=1]' 'nextflow run main.nf -profile test_full,singularity,sanger'
+        bsub -Is -tty -e error -o out -n 2 -q oversubscribed -M4000 -R'select[mem>4000] rusage[mem=4000] span[hosts=1]' 'nextflow run main.nf -profile farm_test,singularity,sanger'
@@ -16,8 +16,9 @@
 cleanup = true

 params {
-    config_profile_name        = 'Full local test profile'
-    config_profile_description = 'Full test dataset to check pipeline function, using a current full local dataset'
+    config_profile_name        = "FULL local test profile"
+    config_profile_description = "FULL test dataset to check pipeline function, using a current full local dataset"

-    input = 'assets/local_testing/nxOscDF5033.yaml'
+    input  = "${projectDir}/assets/local_testing/nxOscDF5033.yaml"
+    outdir = "nxOscDF5033"
 }
diff --git a/conf/full_s3_test.config b/conf/github_test.config
similarity index 74%
rename from conf/full_s3_test.config
rename to conf/github_test.config
index 89d7cecf..a5acbb0d 100755
--- a/conf/full_s3_test.config
+++ b/conf/github_test.config
@@ -11,8 +11,8 @@
 */

 params {
-    config_profile_name        = 's3_test'
-    config_profile_description = 'Minimal Test Data for GitHub Actions test'
+    config_profile_name        = 'GitHub FULL test'
+    config_profile_description = 'FULL Test Data for GitHub Actions test'

    // Limit resources so that this can run on GitHub Actions
    max_cpus   = 2
@@ -20,5 +20,6 @@ params {
    max_time   = '6.h'

    // Input data
-    input  = "${projectDir}/assets/full_s3_treeval_test.yaml"
+    input  = "${projectDir}/assets/github_testing/TreeValTinyTest.yaml"
+    outdir = "TinyTest"
 }
diff --git a/conf/s3_test.config b/conf/local_github_test.config
similarity index 74%
rename from conf/s3_test.config
rename to conf/local_github_test.config
index 65bc1f09..8544b962 100755
--- a/conf/s3_test.config
+++ b/conf/local_github_test.config
@@ -11,8 +11,8 @@
 */

 params {
-    config_profile_name        = 's3_test'
-    config_profile_description = 'Minimal Test Data for GitHub Actions test'
+    config_profile_name        = 'GitHub FULL test'
+    config_profile_description = 'FULL Test Data for GitHub Actions test'

    // Limit resources so that this can run on GitHub Actions
    max_cpus   = 2
@@ -20,5 +20,6 @@ params {
    max_time   = '6.h'

    // Input data
-    input  = "${projectDir}/assets/s3_treeval_test.yaml"
+    input  = "${projectDir}/assets/github_testing/TreeValTinyTest-Local.yaml"
+    outdir = "TinyTest"
 }
diff --git a/conf/modules.config b/conf/modules.config
index 40c44751..985f2164 100755
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -265,4 +265,18 @@ process {
    withName: '.*:.*:HIC_MAPPING:SAMTOOLS_MERGE' {
        ext.prefix = { "${meta.id}_merged" }
    }
+
+    withName: CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT {
+        ext.args        = ''
+        ext.args1       = '-F0xB00 -nt'
+        ext.args2       = { "-5SPCp -H'${rglines}'" }
+        ext.args3       = '-mpu'
+        ext.args4       = { '--write-index -l1' }
+    }
+
+    withName: '.*:.*:GENERATE_GENOME:GNU_SORT' {
+        ext.prefix = { "${meta.id}_sorted" }
+        ext.args   = { '-k2,2 -nr' }
+    }
+
 }
diff --git a/docs/genealignmentsynteny.md b/docs/genealignmentsynteny.md
new file mode 100644
index 00000000..79f11bd6
--- /dev/null
+++ b/docs/genealignmentsynteny.md
@@ -0,0 +1,213 @@
+## Work through
+
+Seeing as this can be quite complicated to set up, here's a walk-through.
+
+### Step 1 - Set up the directories
+
+Let's set up the system as if we want to run it on a bird genome.
+
+```bash
+
+mkdir -p gene_alignment_prep/scripts/
+
+cp treeval/bin/treeval-dataprep/* gene_alignment_prep/scripts/
+
+mkdir -p gene_alignment_prep/raw_fasta/
+
+mkdir -p gene_alignment_data/bird/csv_data/
+
+mkdir -p synteny/bird/
+```
+
+The naming of the bird folder here is important, keep this in mind.
+
+So now we have this structure:
+
+```
+~/treeval-resources
+    │
+    ├─ synteny/
+    │   └─ bird/
+    │
+    ├─ gene_alignment_data/
+    │   └─ bird/
+    │       └─ csv_data/
+    │
+    └─ gene_alignment_prep/
+        ├─ scripts/
+        └─ raw_fasta/
+```
+
+### Step 2 - Download some data
+
+First, let's download our syntenic alignment data. I think the Zebrafinch ( _Taeniopygia guttata_ ) would be good against the Chicken ( _Gallus gallus_ ).
+
+```bash
+cd synteny/bird/
+
+curl https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/003/957/565/GCA_003957565.4_bTaeGut1.4.pri/GCA_003957565.4_bTaeGut1.4.pri_genomic.fna.gz -o bTaeGut1_4.fasta.gz
+
+gunzip bTaeGut1_4.fasta.gz
+```
+
+This leaves us with a file called `bTaeGut1_4.fasta`, the genomic assembly of `bTaeGut1_4` (the [Tree of Life ID](https://id.tol.sanger.ac.uk/) for this species), also known as _Taeniopygia guttata_, the Australian Zebrafinch.
+
+Now let's move into the `raw_fasta` folder and download some data; this may take some time.
+
+```bash
+cd ../../gene_alignment_prep/raw_fasta/
+
+curl https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/016/699/485/GCF_016699485.2_bGalGal1.mat.broiler.GRCg7b/GCF_016699485.2_bGalGal1.mat.broiler.GRCg7b_cds_from_genomic.fna.gz -o GallusGallus-GRCg7b.cds.fasta.gz
+
+curl https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/016/699/485/GCF_016699485.2_bGalGal1.mat.broiler.GRCg7b/GCF_016699485.2_bGalGal1.mat.broiler.GRCg7b_genomic.fna.gz -o GallusGallus-GRCg7b.cdna.fasta.gz
+
+curl https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/016/699/485/GCF_016699485.2_bGalGal1.mat.broiler.GRCg7b/GCF_016699485.2_bGalGal1.mat.broiler.GRCg7b_protein.faa.gz -o GallusGallus-GRCg7b.pep.fasta.gz
+
+curl https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/016/699/485/GCF_016699485.2_bGalGal1.mat.broiler.GRCg7b/GCF_016699485.2_bGalGal1.mat.broiler.GRCg7b_rna.fna.gz -o GallusGallus-GRCg7b.rna.fasta.gz
+
+```
+
+Now that's all downloaded, we need to prep it. At this point it is all still gzipped (the `.gz` on the end denotes that the file is compressed), and in this format we can't use it. So let's use some bash magic.
+
+This is a for loop written in bash: it will look through the current folder for files ending with `.fasta.gz`, gunzip each one (uncompressing it so it is usable), and then run our python script, `GA_data_prep.py`, on the result.
+
+```bash
+for i in *.fasta.gz; do
+gunzip $i;
+python3 GA_data_prep.py ${i/.gz} ncbi 10;
+done
+```
+
+This python script command, in English, means: take the file that I downloaded from NCBI, now uncompressed, and cut it into pieces.
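+
+If the loop syntax is unfamiliar: `${i/.gz}` is bash parameter expansion that strips the `.gz` suffix from the file name. As a sketch, one iteration of the loop body, written out by hand for the Chicken cds file we downloaded above, is equivalent to:
+
+```bash
+# Uncompress the downloaded file, then chunk the resulting fasta.
+# The trailing 10 is the pep/cdna chunk size; cds and rna files
+# are always split into 1000 records per file regardless.
+gunzip GallusGallus-GRCg7b.cds.fasta.gz
+python3 GA_data_prep.py GallusGallus-GRCg7b.cds.fasta ncbi 10
+```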
+
+A Fasta file looks something like this, with headers (lines starting with `>`) and sequence (the usual ATGC's):
+
+```
+>SCAFFOLD_1_DATA_ON_METHODS
+ATGCGCATGCATGCATCGACTTCGAGCATCGTAG
+>SCAFFOLD_2_DATA_ON_METHODS
+ACCAGTGCTAGCTAGCTACGTGTGGGTTTGCCCCGTTT
+```
+
+The headers here will be trimmed to only the essential data that you need in order to find the sequence in your database of choice.
+
+A fasta file may be made up of anywhere between tens to many thousands of these header-sequence pairs. So in the case of our `cdna` and `pep` files, they need to be cut up to let TreeVal have a chance of reading them all in a small time frame.
+
+`cds` and `rna` files will be cut up into 1,000 header-sequence pairs per file; the number given on the command line is ignored.
+`pep` and `cdna` files will be cut up by the number you give, or by 100 by default.
+
+This is because `pep` and `cdna` sequences are so much larger.
+
+The smaller the chunk number you give, the smaller the files you produce; but you will also make many more files, so there is a trade-off.
+
+Running the script will produce a large amount of output in your terminal, looking like:
+
+```bash
+python3 ../scripts/GA_data_prep.py GallusGallus-GRCg7b.cds.fasta ncbi 100
+
+You're using at least Version 3.6, you are good to go...
+os imported
+argparse imported
+regex imported
+WORKING ON:		cds--GallusGallus-GRCg7b
+Records per file:	1000
+Entryfunction called
+GallusGallus-GRCg7b.cds.fasta
+File found at GallusGallus-GRCg7b.cds.fasta
+Renaming headers
+Read_fasta called
+File saved: -- ./GallusGallus/GallusGallus.GRCg7b/cds/GallusGallus1000cds.MOD.fa
+File saved: -- ./GallusGallus/GallusGallus.GRCg7b/cds/GallusGallus2001cds.MOD.fa
+File saved: -- ./GallusGallus/GallusGallus.GRCg7b/cds/GallusGallus3002cds.MOD.fa
+File saved: -- ./GallusGallus/GallusGallus.GRCg7b/cds/GallusGallus4003cds.MOD.fa
+File saved: -- ./GallusGallus/GallusGallus.GRCg7b/cds/GallusGallus5004cds.MOD.fa
+File saved: -- ./GallusGallus/GallusGallus.GRCg7b/cds/GallusGallus6005cds.MOD.fa
+File saved: -- ./GallusGallus/GallusGallus.GRCg7b/cds/GallusGallus7006cds.MOD.fa
+File saved: -- ./GallusGallus/GallusGallus.GRCg7b/cds/GallusGallus8007cds.MOD.fa
+File saved: -- ./GallusGallus/GallusGallus.GRCg7b/cds/GallusGallus9008cds.MOD.fa
+```
+
+This is pretty much telling us: yes, you have given me a file, and for every 1000 (I'm ignoring the number you gave me because this isn't a `pep` or `cdna` file) header-sequence pairs I have come across, I have made a new file, found here. You'll notice that it has also generated a new set of folders; this is based off of how we have named the file.
+
+If you now type `ls` you should see the files we downloaded (such as `GallusGallus-GRCg7b.cds.fasta`) and the folder `GallusGallus`. This folder we can now move to its permanent home.
+
+```bash
+mv GallusGallus/ ../../gene_alignment_data/bird/
+```
+
+### Step 3 -- Generate the CSV
+
+This file will act as an index of all files we have produced in the gene_alignment_data folder, and thankfully this is a very simple step.
+
+```bash
+cd ../
+python3 scripts/GA_csv_gen.py /path/to/gene_alignment_data/
+```
+
+Running this will look like:
+
+```bash
+============> CorvusMon1.bCorMon1 -- bird
+Generating CSV for:	CorvusMon1.bCorMon1
+Save Path:		/gene_alignment_data/bird/csv_data/CorvusMon1.bCorMon1-data.csv
+============> CorvusMoneduloides.bCorMon1 -- bird
+Generating CSV for:	CorvusMoneduloides.bCorMon1
+Save Path:		/gene_alignment_data/bird/csv_data/CorvusMoneduloides.bCorMon1-data.csv
+============> Gallus_gallus.UW_022020 -- bird
+Generating CSV for:	Gallus_gallus.UW_022020
+Save Path:		/gene_alignment_data/bird/csv_data/Gallus_gallus.UW_022020-data.csv
+============> Gallus_gallus.GRCg6a -- bird
+Generating CSV for:	Gallus_gallus.GRCg6a
+Save Path:		/gene_alignment_data/bird/csv_data/Gallus_gallus.GRCg6a-data.csv
+============> GallusGallus.GRCg7b -- bird
+Generating CSV for:	GallusGallus.GRCg7b
+Save Path:		/gene_alignment_data/bird/csv_data/GallusGallus.GRCg7b-data.csv
+```
+
+The script moves through the directory, identifies each unique folder and summarises the data found in those directories into a CSV, which looks like:
+
+```bash
+head -n 5 /gene_alignment_data/bird/csv_data/Gallus_gallus.GRCg6a-data.csv
+
+org,type,data_file
+Gallus_gallus.GRCg6a,cds,/gene_alignment_data/bird/Gallus_gallus/Gallus_gallus.GRCg6a/cds/Gallus_gallus9002cds.MOD.fa
+Gallus_gallus.GRCg6a,cds,/gene_alignment_data/bird/Gallus_gallus/Gallus_gallus.GRCg6a/cds/Gallus_gallus28453cds.MOD.fa
+Gallus_gallus.GRCg6a,cds,/gene_alignment_data/bird/Gallus_gallus/Gallus_gallus.GRCg6a/cds/Gallus_gallus18005cds.MOD.fa
+Gallus_gallus.GRCg6a,cds,/gene_alignment_data/bird/Gallus_gallus/Gallus_gallus.GRCg6a/cds/Gallus_gallus6001cds.MOD.fa
+```
+
+This is all useful for the pipeline, which generates job ids based on the org column, groups files by the org and type columns and then pulls data from the data_file column.
+
+### Step 4 -- Understand where we are at
+
+So we have now generated the directory structure for gene_alignment_data. Now let's use what we know to fill out the yaml.
+
+The yaml is a file that we need in order to tell the pipeline where everything is; an example can be found [here](../assets/local_testing/nxOscDF5033.yaml).
+
+Here we can see a number of fields that need to be filled out, the easiest being `synteny_genome_path` and `data_dir`. These refer to the directories we made earlier, so we can replace them as such:
+
+```yaml
+alignment:
+  data_dir: /FULL/PATH/TO/treeval-resources/gene_alignment_data/
+
+synteny_genome_path: /FULL/PATH/TO/treeval-resources/synteny
+```
+
+I said earlier that the fact we called a folder `bird` was important; this is because it now becomes our `classT`:
+
+```yaml
+classT: bird
+```
+
+During the running of the pipeline, this is appended onto the end of `data_dir` and `synteny_genome_path` in order to find the correct files to use. So now all of the files inside `/FULL/PATH/TO/treeval-resources/synteny/bird/` will be used for syntenic alignments. Likewise with our gene_alignment_data: TreeVal will turn this into `/FULL/PATH/TO/treeval-resources/gene_alignment_data/bird/` and then append `csv_data`.
+
+In Step 3, we generated some files which will be living in our `/FULL/PATH/TO/treeval-resources/gene_alignment_data/bird/csv_data/` folder and look like `GallusGallus.GRCg7b-data.csv`. These (minus the `-data.csv`) will be what we enter into the `geneset` field in the yaml.
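+
+Putting that together: with `classT: bird` and `geneset: "GallusGallus.GRCg7b"`, the pipeline will go looking for the index we generated in Step 3 at (a worked example of the path logic described above):
+
+```
+/FULL/PATH/TO/treeval-resources/gene_alignment_data/bird/csv_data/GallusGallus.GRCg7b-data.csv
+```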
+The common_name is a field we don't currently use.
+
+```yaml
+alignment:
+  data_dir: /FULL/PATH/TO/treeval-resources/gene_alignment_data/
+  common_name: "" # For future implementation (adding bee, wasp, ant etc)
+  geneset: "GallusGallus.GRCg7b"
+```
+
+However, what is cool about this field is that you can add as many genesets as you want. So say you have the gene_alignment_data for the Finch saved as `TaeniopygiaGuttata.bTaeGut1_4`. The geneset field becomes: `geneset: "GallusGallus.GRCg7b,TaeniopygiaGuttata.bTaeGut1_4"`
+
+Hopefully this explains things a bit better and you understand how this sticks together!
diff --git a/docs/images/Sanger-classT.png b/docs/images/Sanger-classT.png
new file mode 100644
index 00000000..0a531928
Binary files /dev/null and b/docs/images/Sanger-classT.png differ
diff --git a/docs/pacbio.md b/docs/pacbio.md
new file mode 100644
index 00000000..97b9e2a5
--- /dev/null
+++ b/docs/pacbio.md
@@ -0,0 +1,56 @@
+## PacBio Data
+
+Before running the pipeline, data has to be in the `fasta.gz` format. Because of the software we use this data with, it must also be long-read, single-stranded data. This means you could use ONT too (except duplex reads).
+
+The below commands should help you convert from mapped bam to fasta.gz, or from fastq to fasta.
+
+If your data isn't already in these formats, then let us know and we'll see how we can help.
+
+### BAM -> FASTQ
+
+This command iterates through your bam files and converts them to fastq via samtools.
+
+```bash
+cd { TO FOLDER OF BAM FILES }
+mkdir fastq
+for i in *bam
+do
+  echo $i
+  j=${i%.bam}
+  echo $j
+  samtools bam2fq ${i} > fastq/${j}.fq
+done
+```
+
+### FASTQ -> FASTA
+
+This command creates a `fasta` folder (to store our fasta files), moves into the `fastq` folder and then converts `fastq` to `fasta` using seqtk seq.
+
+```bash
+mkdir fasta
+cd fastq
+
+for i in *fq; do
+  echo $i
+  j=${i%.fq}
+  echo $j
+  seqtk seq -a $i > ../fasta/${j}.fasta
+done
+```
+
+### FASTA -> FASTA.GZ
+
+This simply gzips the fasta files.
+
+```bash
+for i in *.fasta; do
+  echo $i
+  gzip $i
+done
+```
+
+### Or if you're a command line ninja
+
+```bash
+samtools bam2fq {prefix}.bam | seqtk seq -a - | gzip - > {prefix}.fasta.gz
+```
diff --git a/docs/usage.md b/docs/usage.md
index 7769c18d..f64551e9 100755
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -6,32 +6,97 @@

 ## Introduction

-The TreeVal pipeline has three requirements before being able to run
+The TreeVal pipeline has a few requirements before being able to run:

-Firstly, it requires a .yaml file (described below in Full Samplesheet). This will contain all of the information required for running the TreeVal pipeline.
+- The gene_alignment_data and synteny data must follow a particular directory structure

-The cram files must each be accompanied by an index file (.crai) generated by samtools index.
+- HiC CRAM files must already be pre-indexed in the same location as the CRAM file, e.g., `samtools index {cram file}` (see the sketch after this list). If this would be more helpful to the community as an automated process then please submit an issue.

-The gene alignment data is also expected to follow a particular folder structure, such as (using data from the yaml below):
+- Finally, the yaml file, which is described below in Full Samplesheet. This needs to contain all of the information related to the assembly for the pipeline to run.
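+
+For example, to index every HiC CRAM file in a directory in one go (a minimal sketch; adjust the path to wherever your CRAM files live):
+
+```bash
+cd /path/to/hic/crams
+
+# Writes a .crai index next to each .cram file, as the pipeline expects
+for i in *.cram; do
+  samtools index $i
+done
+```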
+
+### Prior to running TreeVal
+
+:warning: Please ensure you read the following sections on Directory Structure (gene_alignment_data, synteny, scripts), HiC data prep and PacBio data prep. Without these you may not be able to successfully run the TreeVal pipeline. If anything is unclear then please open an issue report.
+
+#### Directory Structure
+
+A working example can be found [here](genealignmentsynteny.md); it covers setting up the synteny and gene_alignment_data directories as well as downloading some example data.
+
+These two sub-workflows, for now, need the use of the variables `classT`, `synteny_genome_path`, `data_dir` and `geneset`. These variables are found inside the yaml (this is the file that will tell TreeVal what and where everything is). Currently, we don't use `common_name`, e.g., `bee`, `wasp`, `moth`, etc. However, we hope to make use of it in the future as our gene_alignment_data "database" grows.
+
+First, you should set up a directory in our recommended structure:
+
+```bash
+treeval-resources
+    │
+    ├─ gene_alignment_data/
+    │   └─ { classT }
+    │       ├─ csv_data
+    │       │   └─ { Name.Accession }-data.csv    # Generated by our scripts
+    │       └─ { Name }                           # Here and below is generated by our scripts
+    │           └─ { Name.Accession }
+    │               ├─ cdna
+    │               │   └─ { Chunked fasta files }
+    │               ├─ rna
+    │               │   └─ { Chunked fasta files }
+    │               ├─ cds
+    │               │   └─ { Chunked fasta files }
+    │               └─ pep
+    │                   └─ { Chunked fasta files }
+    │
+    ├─ gene_alignment_prep/
+    │   ├─ scripts/                # We supply these in this repo
+    │   ├─ raw_fasta/              # Storing your fasta downloaded from NCBI or Ensembl
+    │   └─ treeval-datasets.tsv    # Organism, common_name, clade, family, group, link_to_data, notes
+    │
+    ├─ synteny/
+    │   └─ { classT }
+    │
+    ├─ treeval_yaml/     # Storage folder for your yaml files, it's useful to keep them
+    │
+    └─ treeval_stats/    # Storage for your treeval output stats files, for upload to our repo
+
+```
+
+`classT` can be your own system of classification, as long as it is consistent. At Sanger we use the below; we advise you do too. Again, this value, which is entered into the yaml (the file we will use to tell TreeVal where everything is), is used to find gene_alignment_data as well as syntenic genomes.
+
+![ClassT](../docs/images/Sanger-classT.png)
+
+##### Synteny
+
+For synteny, under the `classT` folder you should store the full genomic fasta file of any high quality genome you want to be compared against.
+
+For bird we recommend the Golden Eagle ( _Aquila chrysaetos_ ) and the Zebrafinch (_Taeniopygia guttata_), which can be downloaded from NCBI. Rename these files to something more human readable and drop them into the `synteny/bird/` folder. Any TreeVal run you now perform where the `classT` is bird will run a syntenic alignment against all genomes in that folder. It would be best to keep this to around three. Again, this is something we could expand on with the `common_name` field if people want it in the future; submit a feature request.
+
+#### Gene_alignment_data
+
+Find information on this here: [Gene Alignment and Synteny Data Prep](genealignmentsynteny.md)
+
+#### HiC Data Preparation
+
+Illumina HiC read files should be presented in an unmapped CRAM format, and each must be accompanied by an index file (.crai) generated by samtools index.
If your unmapped HiC reads are in FASTQ format, you should first convert them to CRAM format by using samtools import methods. Examples are below: + +##### Conversion of FASTQ to CRAM ```bash -{organism},{cdna|pep|cds|rna},/lustre/scratch123/tol/resources/treeval/gene_alignment_data/{classT}/{organism}/{cds|cdna|rna|pep}/{organism}_{cdna|pep|cds|rna}.fasta +samtools import -@8 -r ID:{prefix} -r CN:{hic-kit} -r PU:{prefix} -r SM:{sample_name} {prefix}_R1.fastq.gz {prefix}_R2.fastq.gz -o {prefix}.cram ``` -It is advised that the fasta files be no larger than 50Mb, this allows the pipeline to run without wasting significant resources on large alignments. +##### Indexing of CRAM + +```bash +samtools index {prefix}.cram +``` + +#### PacBio Data Preparation + +Find information on this here: [PacBio Data Prep](pacbio.md) ## Full samplesheet The samplesheet for this pipeline is as shown below. This yaml is parsed by the pipeline and converted into the relevant channels. +A real production version of this YAML can be found here: [nxOscDF5033.yaml](../assets/local_testing/nxOscDF5033.yaml) - `assembly` - `sample_id`: ToLID of the sample. diff --git a/lib/TreeValProject.groovy b/lib/TreeValProject.groovy index 775835ef..3c13c231 100755 --- a/lib/TreeValProject.groovy +++ b/lib/TreeValProject.groovy @@ -1,3 +1,6 @@ +import java.time.OffsetDateTime; +import java.time.Duration; + class TreeValProject { // // Generates a small summary containing context for the input files @@ -5,52 +8,53 @@ class TreeValProject { // Will be used for graph generation. // - public static void summary(workflow, params) { + public static void summary(workflow, params, metrics, log) { + + def date_completed = OffsetDateTime.now() def input_data = [:] input_data['version'] = NfcoreTemplate.version( workflow ) input_data['runName'] = workflow.runName input_data['session_id'] = workflow.sessionId - input_data['duration'] = workflow.duration + input_data['duration'] = Duration.between( workflow.start, date_completed ).toSeconds() input_data['DateStarted'] = workflow.start - input_data['DateCompleted'] = workflow.complete + input_data['DateCompleted'] = date_completed + input_data['entry'] = params.entry input_data['input_yaml'] = params.input - input_data['sample_name'] = params.sample_id.value - input_data['rf_data'] = params.rf_data.value - input_data['pb_data'] = params.pb_data.value - input_data['cm_data'] = params.cm_data.value - - if (workflow.success) { - - def output_directory = new File("${params.tracedir}/") - if (!output_directory.exists()) { - output_directory.mkdirs() - } - - def output_hf = new File(output_directory, "input_data_${params.trace_timestamp}.txt") - output_hf.write """\ - ---RUN_DATA--- - Pipeline_version: ${input_data.version} - Pipeline_runname: ${input_data.runName} - Pipeline_session: ${input_data.session_id} - Pipeline_duration: ${input_data.duration} - Pipeline_datastrt: ${input_data.DateStarted} - Pipeline_datecomp: ${input_data.DateCompleted} - ---INPUT_DATA--- - InputSampleID: ${input_data.sample_name} - InputYamlFile: ${input_data.input_yaml} - InputAssemblyData: ${input_data.rf_data} - Input_PacBio_Files: ${input_data.pb_data} - Input_Cram_Files: ${input_data.cm_data} - ---RESOURCES--- - """.stripIndent() - - def full_file = new File( output_directory, "TreeVal_run_${params.sample_id.value}_${params.trace_timestamp}.txt" ) - def file_locs = ["${params.tracedir}/input_data_${params.trace_timestamp}.txt", - "${params.tracedir}/pipeline_execution_${params.trace_timestamp}.txt"] - file_locs.each{ 
full_file.append( new File( it ).getText() ) } - + input_data['sample_name'] = metrics.sample_id + input_data['rf_data'] = metrics.rf_data + input_data['pb_data'] = metrics.pb_data + input_data['cm_data'] = metrics.cm_data + + def output_directory = new File("${params.tracedir}/") + if (!output_directory.exists()) { + output_directory.mkdirs() } + + def output_hf = new File( output_directory, "input_data_${input_data.sample_name}_${input_data.entry}_${params.trace_timestamp}.txt" ) + output_hf.write """\ + ---RUN_DATA--- + Pipeline_version: ${input_data.version} + Pipeline_runname: ${input_data.runName} + Pipeline_session: ${input_data.session_id} + Pipeline_duration: ${input_data.duration} + Pipeline_datastrt: ${input_data.DateStarted} + Pipeline_datecomp: ${input_data.DateCompleted} + Pipeline_entrypnt: ${input_data.entry} + ---INPUT_DATA--- + InputSampleID: ${input_data.sample_name} + InputYamlFile: ${input_data.input_yaml} + InputAssemblyData: ${input_data.rf_data} + Input_PacBio_Files: ${input_data.pb_data} + Input_Cram_Files: ${input_data.cm_data} + ---RESOURCES--- + """.stripIndent() + + def full_file = new File( output_directory, "TreeVal_run_${input_data.sample_name}_${input_data.entry}_${params.trace_timestamp}.txt" ) + def file_locs = ["${params.tracedir}/input_data_${input_data.sample_name}_${input_data.entry}_${params.trace_timestamp}.txt", + "${params.tracedir}/pipeline_execution_${params.trace_timestamp}.txt"] + file_locs.each{ full_file.append( new File( it ).getText() ) } + } } diff --git a/main.nf b/main.nf index 5e412368..0d25a633 100755 --- a/main.nf +++ b/main.nf @@ -25,11 +25,16 @@ WorkflowMain.initialise( workflow, params, log ) include { TREEVAL } from './workflows/treeval' include { TREEVAL_RAPID } from './workflows/treeval_rapid' -// WORKFLOW: Run main sanger-tol/treeval analysis pipeline +// +// WORKFLOW: RUN MAIN PIPELINE GENERATING ALL OUTPUT +// workflow SANGERTOL_TREEVAL { TREEVAL () } +// +// WORKFLOW: RUN TRUNCATED PIPELINE TO PRODUCE CONTACT MAPS AND PRETEXT ACCESSORIES +// workflow SANGERTOL_TREEVAL_RAPID { TREEVAL_RAPID () } @@ -41,7 +46,7 @@ workflow SANGERTOL_TREEVAL_RAPID { */ // -// WORKFLOW: Execute named workflow for the pipeline +// WORKFLOWS: Execute named workflow for the pipeline // workflow FULL { SANGERTOL_TREEVAL () diff --git a/modules.json b/modules.json index f2689dc0..b3f7eb35 100755 --- a/modules.json +++ b/modules.json @@ -72,8 +72,9 @@ }, "custom/getchromsizes": { "branch": "master", - "git_sha": "d75b37fef175f241230ee25c485bd574c768e282", - "installed_by": ["modules"] + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"], + "patch": "modules/nf-core/custom/getchromsizes/custom-getchromsizes.diff" }, "gnu/sort": { "branch": "master", diff --git a/modules/local/assign_ancestral.nf b/modules/local/assign_ancestral.nf index 43758c92..5b94e625 100755 --- a/modules/local/assign_ancestral.nf +++ b/modules/local/assign_ancestral.nf @@ -5,7 +5,7 @@ process ASSIGN_ANCESTRAL { conda "conda-forge::python=3.9" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : - 'quay.io/biocontainers/pandas:1.5.2' }" + 'biocontainers/pandas:1.5.2' }" input: tuple val(meta), path(comp_location) diff --git a/modules/local/bamtobed_sort.nf b/modules/local/bamtobed_sort.nf index 3af4fbc4..87c20951 100755 --- a/modules/local/bamtobed_sort.nf +++ b/modules/local/bamtobed_sort.nf @@ -4,7 +4,7 @@ process BAMTOBED_SORT { container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/mulled-v2-9d3a458f6420e5712103ae2af82c94d26d63f059:60b54b43045e8cf39ba307fd683c69d4c57240ce-0' : - 'quay.io/biocontainers/mulled-v2-9d3a458f6420e5712103ae2af82c94d26d63f059:60b54b43045e8cf39ba307fd683c69d4c57240ce-0' }" + 'biocontainers/mulled-v2-9d3a458f6420e5712103ae2af82c94d26d63f059:60b54b43045e8cf39ba307fd683c69d4c57240ce-0' }" input: tuple val(meta), path(bam) @@ -22,7 +22,7 @@ process BAMTOBED_SORT { cat <<-END_VERSIONS > versions.yml "${task.process}": - samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' )) + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") END_VERSIONS """ @@ -34,7 +34,7 @@ process BAMTOBED_SORT { cat <<-END_VERSIONS > versions.yml "${task.process}": - samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' )) + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") END_VERSIONS """ diff --git a/modules/local/chunkfasta.nf b/modules/local/chunkfasta.nf index 266f19be..3c9113c4 100755 --- a/modules/local/chunkfasta.nf +++ b/modules/local/chunkfasta.nf @@ -5,7 +5,7 @@ process CHUNKFASTA { conda "conda-forge::pyfasta=0.5.2-1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/pyfasta:0.5.2--py_1' : - 'quay.io/biocontainers/pyfasta:0.5.2--py_1' }" + 'biocontainers/pyfasta:0.5.2--py_1' }" input: tuple val(meta), path(fasta) diff --git a/modules/local/concatblocks.nf b/modules/local/concatblocks.nf index fc58d0a6..5c01459d 100755 --- a/modules/local/concatblocks.nf +++ b/modules/local/concatblocks.nf @@ -5,7 +5,7 @@ process CONCATBLOCKS { conda "conda-forge::coreutils=9.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'ubuntu:20.04' }" + 'docker.io/ubuntu:20.04' }" input: tuple val(meta), path(mergeblocks) diff --git a/modules/local/concatmummer.nf b/modules/local/concatmummer.nf index 3a5bf9a7..da38c968 100755 --- a/modules/local/concatmummer.nf +++ b/modules/local/concatmummer.nf @@ -5,7 +5,7 @@ process CONCATMUMMER { conda "conda-forge::coreutils=9.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'ubuntu:20.04' }" + 'docker.io/ubuntu:20.04' }" input: tuple val(meta), path(coords) diff --git a/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf b/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf index f65298aa..3c639b5b 100755 --- a/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf +++ b/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf @@ -4,7 +4,7 @@ process CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT { container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/mulled-v2-50d89b457e04ed90fa0cbf8ebc3ae1b9ffbc836b:caf993da1689e8d42f5e4c113ffc9ef81d26df96-0' : - 'quay.io/biocontainers/mulled-v2-50d89b457e04ed90fa0cbf8ebc3ae1b9ffbc836b:caf993da1689e8d42f5e4c113ffc9ef81d26df96-0' }" + 'biocontainers/mulled-v2-50d89b457e04ed90fa0cbf8ebc3ae1b9ffbc836b:caf993da1689e8d42f5e4c113ffc9ef81d26df96-0' }" input: tuple val(meta), path(cramfile), path(cramindex), val(from), val(to), val(base), val(chunkid), val(rglines), val(bwaprefix) @@ -18,13 +18,18 @@ script: def args = task.ext.args ?: '' + def args1 = task.ext.args1 ?: '' + def args2 = task.ext.args2 ?: '' + def args3 = task.ext.args3 ?: '' + def args4 = task.ext.args4 ?: '' def prefix = task.ext.prefix ?: "${meta.id}" + // Please be aware: one of the tools here requires mem = 28 * reference size!!! """ cram_filter -n ${from}-${to} ${cramfile} - | \\ - samtools fastq -F0xB00 -nt - | \\ + samtools fastq ${args1} | \\ bwa-mem2 mem -p ${bwaprefix} -t${task.cpus} -5SPCp -H'${rglines}' - | \\ - samtools fixmate -mpu - - | \\ - samtools sort --write-index -l1 -@${task.cpus} -T ${base}_${chunkid}_sort_tmp -o ${prefix}_${base}_${chunkid}_mem.bam - + samtools fixmate ${args3} - - | \\ + samtools sort ${args4} -@${task.cpus} -T ${base}_${chunkid}_sort_tmp -o ${prefix}_${base}_${chunkid}_mem.bam - cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -44,7 +49,7 @@ process CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT { cat <<-END_VERSIONS > versions.yml "${task.process}": samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' ) - bwa-mem2: \$(bwa-mem2 --version | sed 's/bwa-mem2 //g') + bwamem2: \$(echo \$(bwa-mem2 version 2>&1) | sed 's/.* //') END_VERSIONS """ } diff --git a/modules/local/extract_ancestral.nf b/modules/local/extract_ancestral.nf index 8c5c509c..265f9f2e 100755 --- a/modules/local/extract_ancestral.nf +++ b/modules/local/extract_ancestral.nf @@ -5,7 +5,7 @@ process EXTRACT_ANCESTRAL { conda "conda-forge::python=3.9" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/python:3.9' : - 'quay.io/biocontainers/python:3.9' }" + 'biocontainers/python:3.9' }" input: tuple val(meta), path(fulltable) diff --git a/modules/local/extract_buscogene.nf b/modules/local/extract_buscogene.nf index 42d6fb18..44149d74 100755 --- a/modules/local/extract_buscogene.nf +++ b/modules/local/extract_buscogene.nf @@ -5,7 +5,7 @@ process EXTRACT_BUSCOGENE { conda "conda-forge::coreutils=9.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'ubuntu:20.04' }" + 'docker.io/ubuntu:20.04' }" input: diff --git a/modules/local/extract_cov_iden.nf b/modules/local/extract_cov_iden.nf index 871d4ec4..d50fd39c 100755 --- a/modules/local/extract_cov_iden.nf +++ b/modules/local/extract_cov_iden.nf @@ -5,7 +5,7 @@ process EXTRACT_COV_IDEN { conda "conda-forge::coreutils=9.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'ubuntu:20.04' }" + 'docker.io/ubuntu:20.04' }" input: tuple val( meta ), path( file ) diff --git a/modules/local/extract_repeat.nf b/modules/local/extract_repeat.nf index 801c9b45..85fe9c93 100755 --- a/modules/local/extract_repeat.nf +++ b/modules/local/extract_repeat.nf @@ -5,7 +5,7 @@ process EXTRACT_REPEAT { conda "conda-forge::perl=5.26.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/perl:5.26.2' : - 'quay.io/biocontainers/perl:5.26.2' }" + 'biocontainers/perl:5.26.2' }" input: tuple val( meta ), path( file ) diff --git a/modules/local/extract_telo.nf b/modules/local/extract_telo.nf index f78e0a36..cad234fc 100755 --- a/modules/local/extract_telo.nf +++ b/modules/local/extract_telo.nf @@ -5,7 +5,7 @@ process EXTRACT_TELO { conda "conda-forge::coreutils=9.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'ubuntu:20.04' }" + 'docker.io/ubuntu:20.04' }" input: tuple val( meta ), path( file ) diff --git a/modules/local/find_telomere_windows.nf b/modules/local/find_telomere_windows.nf index ac9584e2..2fcd0022 100755 --- a/modules/local/find_telomere_windows.nf +++ b/modules/local/find_telomere_windows.nf @@ -6,7 +6,7 @@ process FIND_TELOMERE_WINDOWS { container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/java-jdk:8.0.112--1' : - 'quay.io/biocontainers/java-jdk:8.0.112--1' }" + 'biocontainers/java-jdk:8.0.112--1' }" input: tuple val( meta ), path( file ) diff --git a/modules/local/findhalfcoverage.nf b/modules/local/findhalfcoverage.nf index 462f8f30..068ed6c4 100755 --- a/modules/local/findhalfcoverage.nf +++ b/modules/local/findhalfcoverage.nf @@ -5,7 +5,7 @@ process FINDHALFCOVERAGE { conda "conda-forge::python=3.9" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/python:3.9' : - 'quay.io/biocontainers/python:3.9' }" + 'biocontainers/python:3.9' }" input: tuple val(meta), path(bedfile) diff --git a/modules/local/gap_length.nf b/modules/local/gap_length.nf index 9240de0e..8dc384a9 100755 --- a/modules/local/gap_length.nf +++ b/modules/local/gap_length.nf @@ -5,7 +5,7 @@ process GAP_LENGTH { conda "conda-forge::coreutils=9.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'ubuntu:20.04' }" + 'docker.io/ubuntu:20.04' }" input: tuple val( meta ), path( file ) diff --git a/modules/local/get_largest_scaff.nf b/modules/local/get_largest_scaff.nf index 3e3fc2d7..2296958c 100755 --- a/modules/local/get_largest_scaff.nf +++ b/modules/local/get_largest_scaff.nf @@ -6,7 +6,7 @@ process GET_LARGEST_SCAFF { conda "conda-forge::coreutils=9.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'ubuntu:20.04' }" + 'docker.io/ubuntu:20.04' }" input: tuple val( meta ), path( file ) diff --git a/modules/local/get_paired_contact_bed.nf b/modules/local/get_paired_contact_bed.nf index 308ea84a..e6d3a135 100755 --- a/modules/local/get_paired_contact_bed.nf +++ b/modules/local/get_paired_contact_bed.nf @@ -5,7 +5,7 @@ process GET_PAIRED_CONTACT_BED { conda "conda-forge::coreutils=9.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'ubuntu:20.04' }" + 'docker.io/ubuntu:20.04' }" input: tuple val( meta ), path( file ) diff --git a/modules/local/get_synteny_genomes.nf b/modules/local/get_synteny_genomes.nf index 933ebb57..506fbf5a 100755 --- a/modules/local/get_synteny_genomes.nf +++ b/modules/local/get_synteny_genomes.nf @@ -5,7 +5,7 @@ process GET_SYNTENY_GENOMES { conda "conda-forge::coreutils=9.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'ubuntu:20.04' }" + 'docker.io/ubuntu:20.04' }" input: val ( synteny_path ) diff --git a/modules/local/getminmaxpunches.nf b/modules/local/getminmaxpunches.nf index 6453ab5b..6e828bb5 100755 --- a/modules/local/getminmaxpunches.nf +++ b/modules/local/getminmaxpunches.nf @@ -5,7 +5,7 @@ process GETMINMAXPUNCHES{ conda "conda-forge::coreutils=9.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'ubuntu:20.04' }" + 'docker.io/ubuntu:20.04' }" input: tuple val(meta), path(bedfile) diff --git a/modules/local/graphoverallcoverage.nf b/modules/local/graphoverallcoverage.nf index d14caee6..10c0a112 100755 --- a/modules/local/graphoverallcoverage.nf +++ b/modules/local/graphoverallcoverage.nf @@ -5,7 +5,7 @@ process GRAPHOVERALLCOVERAGE { conda "conda-forge::perl=5.26.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/perl:5.26.2' : - 'quay.io/biocontainers/perl:5.26.2' }" + 'biocontainers/perl:5.26.2' }" input: tuple val(meta), path(bed) diff --git a/modules/local/juicer_tools_pre.nf b/modules/local/juicer_tools_pre.nf index 68f4ba5c..12b46ce8 100755 --- a/modules/local/juicer_tools_pre.nf +++ b/modules/local/juicer_tools_pre.nf @@ -8,7 +8,7 @@ process JUICER_TOOLS_PRE { container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/java-jdk:8.0.112--1' : - 'quay.io/biocontainers/java-jdk:8.0.112--1' }" + 'biocontainers/java-jdk:8.0.112--1' }" input: tuple val(meta), path(pairs) diff --git a/modules/local/makecmap_cmap2bed.nf b/modules/local/makecmap_cmap2bed.nf index 533abd79..8d7914db 100755 --- a/modules/local/makecmap_cmap2bed.nf +++ b/modules/local/makecmap_cmap2bed.nf @@ -5,7 +5,7 @@ process MAKECMAP_CMAP2BED { conda "conda-forge::python=3.9" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/python:3.9' : - 'quay.io/biocontainers/python:3.9' }" + 'biocontainers/python:3.9' }" input: tuple val(meta), path(cmap) diff --git a/modules/local/makecmap_fa2cmapmulticolor.nf b/modules/local/makecmap_fa2cmapmulticolor.nf index 6aa8e406..ef641ef4 100755 --- a/modules/local/makecmap_fa2cmapmulticolor.nf +++ b/modules/local/makecmap_fa2cmapmulticolor.nf @@ -5,7 +5,7 @@ process MAKECMAP_FA2CMAPMULTICOLOR { conda "conda-forge::perl=5.26.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/perl:5.26.2' : - 'quay.io/biocontainers/perl:5.26.2' }" + 'biocontainers/perl:5.26.2' }" input: tuple val(meta), path(fasta) diff --git a/modules/local/makecmap_renamecmapids.nf b/modules/local/makecmap_renamecmapids.nf index ef162457..ce398b2e 100755 --- a/modules/local/makecmap_renamecmapids.nf +++ b/modules/local/makecmap_renamecmapids.nf @@ -5,7 +5,7 @@ process MAKECMAP_RENAMECMAPIDS { conda "conda-forge::perl=5.26.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/perl:5.26.2' : - 'quay.io/biocontainers/perl:5.26.2' }" + 'biocontainers/perl:5.26.2' }" input: tuple val(meta), path(cmap) diff --git a/modules/local/paf_to_bed.nf b/modules/local/paf_to_bed.nf index a980c2fd..c50f0373 100755 --- a/modules/local/paf_to_bed.nf +++ b/modules/local/paf_to_bed.nf @@ -5,7 +5,7 @@ process PAF2BED { conda "conda-forge::coreutils=9.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'ubuntu:20.04' }" + 'docker.io/ubuntu:20.04' }" input: tuple val( meta ), path( file ) diff --git a/modules/local/reformat_intersect.nf b/modules/local/reformat_intersect.nf index 0842faef..1d1930ce 100755 --- a/modules/local/reformat_intersect.nf +++ b/modules/local/reformat_intersect.nf @@ -5,7 +5,7 @@ process REFORMAT_INTERSECT { conda "conda-forge::coreutils=9.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'ubuntu:20.04' }" + 'docker.io/ubuntu:20.04' }" input: tuple val( meta ), path( file ) diff --git a/modules/local/rename_ids.nf b/modules/local/rename_ids.nf index 545d0c4a..f69f518d 100755 --- a/modules/local/rename_ids.nf +++ b/modules/local/rename_ids.nf @@ -5,7 +5,7 @@ process RENAME_IDS { conda "conda-forge::coreutils=9.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'ubuntu:20.04' }" + 'docker.io/ubuntu:20.04' }" input: tuple val( meta ), path( file ) diff --git a/modules/local/replace_dots.nf b/modules/local/replace_dots.nf index 0f266ca2..4d12f5cd 100755 --- a/modules/local/replace_dots.nf +++ b/modules/local/replace_dots.nf @@ -5,7 +5,7 @@ process REPLACE_DOTS { conda "conda-forge::coreutils=9.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'ubuntu:20.04' }" + 'docker.io/ubuntu:20.04' }" input: tuple val( meta ), path( file ) diff --git a/modules/local/selfcomp_alignmentblocks.nf b/modules/local/selfcomp_alignmentblocks.nf index 5d23431a..419a7b42 100755 --- a/modules/local/selfcomp_alignmentblocks.nf +++ b/modules/local/selfcomp_alignmentblocks.nf @@ -5,7 +5,7 @@ process SELFCOMP_ALIGNMENTBLOCKS { conda "conda-forge::python=3.9" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/mulled-v2-548f120dc8914d802c46e110ec27751bc1c5a414:8770fa59aa0ae8b50cbf444255b91c201c883685-0' : - 'quay.io/biocontainers/mulled-v2-548f120dc8914d802c46e110ec27751bc1c5a414:8770fa59aa0ae8b50cbf444255b91c201c883685-0' }" + 'biocontainers/mulled-v2-548f120dc8914d802c46e110ec27751bc1c5a414:8770fa59aa0ae8b50cbf444255b91c201c883685-0' }" input: tuple val(meta), path(bedfile) diff --git a/modules/local/selfcomp_mapids.nf b/modules/local/selfcomp_mapids.nf index c19d7393..be40c76e 100755 --- a/modules/local/selfcomp_mapids.nf +++ b/modules/local/selfcomp_mapids.nf @@ -5,7 +5,7 @@ process SELFCOMP_MAPIDS { conda "conda-forge::python=3.9" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/python:3.9' : - 'quay.io/biocontainers/python:3.9' }" + 'biocontainers/python:3.9' }" input: tuple val(meta), path(bed) diff --git a/modules/local/selfcomp_mummer2bed.nf b/modules/local/selfcomp_mummer2bed.nf index c38f6d88..02a215cb 100755 --- a/modules/local/selfcomp_mummer2bed.nf +++ b/modules/local/selfcomp_mummer2bed.nf @@ -5,7 +5,7 @@ process SELFCOMP_MUMMER2BED { conda "conda-forge::python=3.9" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/python:3.9' : - 'quay.io/biocontainers/python:3.9' }" + 'biocontainers/python:3.9' }" input: tuple val(meta), path(mummerfile) diff --git a/modules/local/selfcomp_splitfasta.nf b/modules/local/selfcomp_splitfasta.nf index be4584b2..510febe8 100755 --- a/modules/local/selfcomp_splitfasta.nf +++ b/modules/local/selfcomp_splitfasta.nf @@ -5,7 +5,7 @@ process SELFCOMP_SPLITFASTA { conda "conda-forge::perl-bioperl=1.7.8-1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/perl-bioperl:1.7.8--hdfd78af_1' : - 'quay.io/biocontainers/perl-bioperl:1.7.8--hdfd78af_1' }" + 'biocontainers/perl-bioperl:1.7.8--hdfd78af_1' }" input: tuple val(meta), path(fasta) diff --git a/modules/nf-core/custom/getchromsizes/custom-getchromsizes.diff b/modules/nf-core/custom/getchromsizes/custom-getchromsizes.diff new file mode 100644 index 00000000..6d72652e --- /dev/null +++ b/modules/nf-core/custom/getchromsizes/custom-getchromsizes.diff @@ -0,0 +1,63 @@ +Changes in module 'nf-core/custom/getchromsizes' +--- modules/nf-core/custom/getchromsizes/main.nf ++++ modules/nf-core/custom/getchromsizes/main.nf +@@ -1,5 +1,9 @@ ++// Forked from the nf-core module to: ++// 1. allow selecting a different extension for the `sizes` channel ++// 2. force all output files to be named according to the prefix ++// 3. rename the input fasta file too and output it so that it can be "published" + process CUSTOM_GETCHROMSIZES { +- tag "$fasta" ++ tag "$meta.id" + label 'process_single' + + conda "bioconda::samtools=1.16.1" +@@ -8,22 +12,26 @@ + 'biocontainers/samtools:1.16.1--h6899075_1' }" + + input: +- tuple val(meta), path(fasta) ++ tuple val(meta), path(fasta, stageAs: 'input/*') ++ val suffix + + output: +- tuple val(meta), path ("*.sizes"), emit: sizes +- tuple val(meta), path ("*.fai") , emit: fai +- tuple val(meta), path ("*.gzi") , emit: gzi, optional: true +- path "versions.yml" , emit: versions ++ tuple val(meta), path ("*.${suffix}") , emit: sizes ++ tuple val(meta), path ("*.fa") , emit: fasta ++ tuple val(meta), path ("*.fai") , emit: fai ++ tuple val(meta), path ("*.gzi") , emit: gzi, optional: true ++ path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: +- def args = task.ext.args ?: '' ++ def args = task.ext.args ?: '' ++ def prefix = task.ext.prefix ?: "${meta.id}" + """ +- samtools faidx $fasta +- cut -f 1,2 ${fasta}.fai > ${fasta}.sizes ++ ln -s ${fasta} ${prefix}.fa ++ samtools faidx ${prefix}.fa -o ${prefix}.fa.fai ++ cut -f 1,2 ${prefix}.fa.fai > ${prefix}.${suffix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": +@@ -33,8 +41,9 @@ + + stub: + """ +- touch ${fasta}.fai +- touch ${fasta}.sizes ++ ln -s ${fasta} ${prefix}.fa ++ touch ${prefix}.fa.fai ++ touch ${prefix}.${suffix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + +************************************************************ diff --git a/modules/nf-core/custom/getchromsizes/main.nf b/modules/nf-core/custom/getchromsizes/main.nf old mode 100755 new mode 100644 index 1fd1e768..b6387e00 --- a/modules/nf-core/custom/getchromsizes/main.nf +++ b/modules/nf-core/custom/getchromsizes/main.nf @@ -1,3 +1,7 @@ +// Forked from the nf-core module to: +// 1. allow selecting a different extension for the `sizes` channel +// 2. force all output files to be named according to the prefix +// 3. 
rename the input fasta file too and output it so that it can be "published" process CUSTOM_GETCHROMSIZES { tag "$meta.id" label 'process_single' @@ -8,7 +12,7 @@ process CUSTOM_GETCHROMSIZES { 'biocontainers/samtools:1.16.1--h6899075_1' }" input: - tuple val(meta), path(fasta) + tuple val(meta), path(fasta, stageAs: 'input/*') val suffix output: @@ -25,13 +29,10 @@ process CUSTOM_GETCHROMSIZES { def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" """ - samtools faidx $fasta -o ${prefix}.fa.fai + ln -s ${fasta} ${prefix}.fa + samtools faidx ${prefix}.fa -o ${prefix}.fa.fai cut -f 1,2 ${prefix}.fa.fai > ${prefix}.${suffix} - if [[ "${fasta}" != "${prefix}-ref.fa" ]]; then - mv ${fasta} ${prefix}-ref.fa - fi - cat <<-END_VERSIONS > versions.yml "${task.process}": getchromsizes: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') @@ -40,7 +41,8 @@ process CUSTOM_GETCHROMSIZES { stub: """ - touch ${prefix}.fai + ln -s ${fasta} ${prefix}.fa + touch ${prefix}.fa.fai touch ${prefix}.${suffix} cat <<-END_VERSIONS > versions.yml diff --git a/modules/nf-core/custom/getchromsizes/meta.yml b/modules/nf-core/custom/getchromsizes/meta.yml old mode 100755 new mode 100644 diff --git a/nextflow.config b/nextflow.config index 008981d4..17035f39 100755 --- a/nextflow.config +++ b/nextflow.config @@ -12,7 +12,7 @@ params { // Boilerplate options input = null outdir = "./results" - tracedir = "${params.outdir}/treeval_info" + tracedir = "${params.outdir}/pipeline_info" publish_dir_mode = 'copy' email = null email_on_fail = null @@ -151,10 +151,9 @@ profiles { executor.memory = 60.GB } - full_s3_test { includeConfig 'conf/full_s3_test.config' } - s3_test { includeConfig 'conf/s3_test.config' } - test { includeConfig 'conf/test.config' } - test_full { includeConfig 'conf/test_full.config' } + farm_test { includeConfig 'conf/farm_test.config' } + github_test { includeConfig 'conf/github_test.config' } + local_github_test { includeConfig 'conf/local_github_test.config' } } diff --git a/subworkflows/local/generate_genome.nf b/subworkflows/local/generate_genome.nf index 97580e7d..7cb0065a 100755 --- a/subworkflows/local/generate_genome.nf +++ b/subworkflows/local/generate_genome.nf @@ -5,12 +5,13 @@ // include { SAMTOOLS_FAIDX } from '../../modules/nf-core/samtools/faidx/main' include { CUSTOM_GETCHROMSIZES } from '../../modules/nf-core/custom/getchromsizes/main' +include { GNU_SORT } from '../../modules/nf-core/gnu/sort' include { GET_LARGEST_SCAFF } from '../../modules/local/get_largest_scaff' workflow GENERATE_GENOME { take: assembly_id // Channel val(assembly_id) - reference_file // Channel [ val(meta), path(file) ] + reference_file // Channel path(file) main: ch_versions = Channel.empty() @@ -36,6 +37,12 @@ workflow GENERATE_GENOME { ) ch_versions = ch_versions.mix( CUSTOM_GETCHROMSIZES.out.versions ) + // + // MODULE: SORT CHROM SIZES BY SIZE, NOT NAME + // + GNU_SORT ( + CUSTOM_GETCHROMSIZES.out.sizes + ) // // MODULE: Cut out the largest scaffold size and use as comparator against 512MB @@ -48,7 +55,7 @@ workflow GENERATE_GENOME { emit: max_scaff_size = GET_LARGEST_SCAFF.out.scaff_size.toInteger() - dot_genome = CUSTOM_GETCHROMSIZES.out.sizes + dot_genome = GNU_SORT.out.sorted ref_index = CUSTOM_GETCHROMSIZES.out.fai reference_tuple = to_chromsize versions = ch_versions.ifEmpty(null) diff --git a/subworkflows/local/hic_mapping.nf b/subworkflows/local/hic_mapping.nf index cf249ca4..0f73ee5b 100755 --- a/subworkflows/local/hic_mapping.nf +++
b/subworkflows/local/hic_mapping.nf @@ -31,6 +31,7 @@ workflow HIC_MAPPING { dot_genome // Channel [ val(meta), [ datafile ]] hic_reads_path // Channel [ val(meta), path(directory) ] assembly_id // Channel val( id ) + workflow_setting // val( { RAPID | FULL } ) main: ch_versions = Channel.empty() @@ -150,13 +151,24 @@ workflow HIC_MAPPING { ch_versions = ch_versions.mix( PRETEXTMAP_STANDRD.out.versions ) // - // MODULE: GENERATE PRETEXT MAP FROM MAPPED BAM FOR HIGH RES + // LOGIC: HIRES IS TOO INTENSIVE TO RUN IN GITHUB CI, SO THIS STOPS IT RUNNING THERE // - PRETEXTMAP_HIGHRES ( - pretext_input.input_bam, - pretext_input.reference - ) - ch_versions = ch_versions.mix( PRETEXTMAP_HIGHRES.out.versions ) + if ( params.config_profile_name ) { + config_profile_name = params.config_profile_name + } else { + config_profile_name = 'Local' + } + + if ( !config_profile_name.contains('GitHub') ) { + // + // MODULE: GENERATE PRETEXT MAP FROM MAPPED BAM FOR HIGH RES + // + PRETEXTMAP_HIGHRES ( + pretext_input.input_bam, + pretext_input.reference + ) + ch_versions = ch_versions.mix( PRETEXTMAP_HIGHRES.out.versions ) + } // // MODULE: GENERATE PNG FROM STANDARD PRETEXT @@ -197,26 +209,32 @@ workflow HIC_MAPPING { ch_versions = ch_versions.mix( GET_PAIRED_CONTACT_BED.out.versions ) // - // LOGIC: PREPARE JUICER TOOLS INPUT + // LOGIC: SECTION ONLY NEEDED FOR TREEVAL VISUALISATION, NOT RAPID ANALYSIS // - GET_PAIRED_CONTACT_BED.out.bed - .combine( dot_genome ) - .multiMap { meta, paired_contacts, meta_my_genome, my_genome -> - paired : tuple([ id: meta.id, single_end: true], paired_contacts ) - genome : my_genome - id : meta.id - } - .set { ch_juicer_input } + if (workflow_setting == 'FULL' && !config_profile_name.contains('GitHub')) { + // + // LOGIC: PREPARE JUICER TOOLS INPUT + // + GET_PAIRED_CONTACT_BED.out.bed + .combine( dot_genome ) + .multiMap { meta, paired_contacts, meta_my_genome, my_genome -> + paired : tuple([ id: meta.id, single_end: true], paired_contacts ) + genome : my_genome + id : meta.id + } + .set { ch_juicer_input } - // - // MODULE: GENERATE HIC MAP - // - JUICER_TOOLS_PRE( - ch_juicer_input.paired, - ch_juicer_input.genome, - ch_juicer_input.id - ) - ch_versions = ch_versions.mix( JUICER_TOOLS_PRE.out.versions ) + // + // MODULE: GENERATE HIC MAP, ONLY IF PIPELINE IS RUNNING ON ENTRY FULL + // + + JUICER_TOOLS_PRE( + ch_juicer_input.paired, + ch_juicer_input.genome, + ch_juicer_input.id + ) + ch_versions = ch_versions.mix( JUICER_TOOLS_PRE.out.versions ) + } // // LOGIC: BIN CONTACT PAIRS @@ -280,10 +298,9 @@ workflow HIC_MAPPING { emit: standrd_pretext = PRETEXTMAP_STANDRD.out.pretext standrd_snpshot = SNAPSHOT_SRES.out.image - highres_pretext = PRETEXTMAP_HIGHRES.out.pretext + //highres_pretext = PRETEXTMAP_HIGHRES.out.pretext //highres_snpshot = SNAPSHOT_HRES.out.image mcool = COOLER_ZOOMIFY.out.mcool - hic = JUICER_TOOLS_PRE.out.hic ch_reporting = ch_reporting_cram.collect() versions = ch_versions.ifEmpty(null) } diff --git a/workflows/treeval.nf b/workflows/treeval.nf index ce3b2137..fc3901d3 100755 --- a/workflows/treeval.nf +++ b/workflows/treeval.nf @@ -21,7 +21,7 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true */ // -// SUBWORKFLOW TREEVAL_FULL: Consisting of a mix of local and nf-core/modules +// IMPORT: SUBWORKFLOWS CALLED BY THE MAIN // include { YAML_INPUT } from '../subworkflows/local/yaml_input' include { GENERATE_GENOME } from '../subworkflows/local/generate_genome' @@ -32,7 +32,6 @@ include { SYNTENY } from
'../subworkflows/local/synteny' include { LONGREAD_COVERAGE } from '../subworkflows/local/longread_coverage' include { REPEAT_DENSITY } from '../subworkflows/local/repeat_density' include { GAP_FINDER } from '../subworkflows/local/gap_finder' -include { LONGREAD_COVERAGE } from '../subworkflows/local/longread_coverage' include { TELO_FINDER } from '../subworkflows/local/telo_finder' include { BUSCO_ANNOTATION } from '../subworkflows/local/busco_annotation' include { HIC_MAPPING } from '../subworkflows/local/hic_mapping' @@ -44,7 +43,7 @@ include { HIC_MAPPING } from '../subworkflows/local/hic_mapping' */ // -// MODULE: Installed directly from nf-core/modules +// IMPORT: Installed directly from nf-core/modules // include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' @@ -61,6 +60,7 @@ workflow TREEVAL { // ch_versions = Channel.empty() + params.entry = 'FULL' input_ch = Channel.fromPath(params.input, checkIfExists: true) Channel @@ -101,7 +101,6 @@ workflow TREEVAL { YAML_INPUT.out.assembly_id, YAML_INPUT.out.reference ) - ch_versions = ch_versions.mix( GENERATE_GENOME.out.versions ) // @@ -110,7 +109,6 @@ workflow TREEVAL { // ch_enzyme = Channel.of( "bspq1","bsss1","DLE1" ) - INSILICO_DIGEST ( YAML_INPUT.out.assembly_id, GENERATE_GENOME.out.dot_genome, @@ -208,7 +206,8 @@ workflow TREEVAL { GENERATE_GENOME.out.ref_index, GENERATE_GENOME.out.dot_genome, YAML_INPUT.out.hic_reads, - YAML_INPUT.out.assembly_id + YAML_INPUT.out.assembly_id, + params.entry ) ch_versions = ch_versions.mix(HIC_MAPPING.out.versions) @@ -248,21 +247,29 @@ workflow TREEVAL { GENERATE_GENOME.out.reference_tuple .combine( YAML_INPUT.out.assembly_classT ) .combine( YAML_INPUT.out.assembly_ttype ) - .map { meta, reference, lineage, ticket -> - tuple( + .combine( YAML_INPUT.out.assembly_id ) + .combine( LONGREAD_COVERAGE.out.ch_reporting ) + .combine( HIC_MAPPING.out.ch_reporting ) + .combine( CUSTOM_DUMPSOFTWAREVERSIONS.out.versions ) + .map { meta, reference, lineage, ticket, sample_id, longread_meta, longread_files, hic_meta, hic_files, custom_file -> [ + rf_data: tuple( [ id: meta.id, sz: file(reference).size(), ln: lineage, tk: ticket ], reference - ) + ), + sample_id: sample_id, + pb_data: tuple(longread_meta, longread_files), + cm_data: tuple(hic_meta, hic_files), + custom: custom_file, + ] } - .set { rf_data } + .set { collected_metrics_ch } - params.sample_id = YAML_INPUT.out.assembly_id.collect() - params.rf_data = rf_data.collect() // reference data tuple( [ id, size, lineage, ticket ], file) - params.pb_data = LONGREAD_COVERAGE.out.ch_reporting.collect() // merged pacbio.bam data tuple( [ id, size ], file ) | Should really be a collected list of the raw fasta - params.cm_data = HIC_MAPPING.out.ch_reporting.collect() // merged cram.bam data tuple( [ id, size ], file ) | Should really be a collected list of the raw cram + collected_metrics_ch.map { metrics -> + TreeValProject.summary(workflow, params, metrics, log) + } emit: software_ch = CUSTOM_DUMPSOFTWAREVERSIONS.out.yml @@ -284,9 +291,6 @@ workflow.onComplete { if (params.hook_url) { NfcoreTemplate.IM_notification(workflow, params, summary_params, projectDir, log) } - - TreeValProject.summary(workflow, params) - } /* diff --git a/workflows/treeval_rapid.nf b/workflows/treeval_rapid.nf index b7606031..ed3497c4 100755 --- a/workflows/treeval_rapid.nf +++ b/workflows/treeval_rapid.nf @@ -20,7 +20,7 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true */ // -// SUBWORKFLOW TREEVAL_FULL: 
Consisting of a mix of local and nf-core/modules +// IMPORT: SUBWORKFLOWS CALLED BY THE MAIN // include { YAML_INPUT } from '../subworkflows/local/yaml_input' include { GENERATE_GENOME } from '../subworkflows/local/generate_genome' @@ -37,7 +37,7 @@ include { HIC_MAPPING } from '../subworkflows/local/hic_mapping' */ // -// MODULE: Installed directly from nf-core/modules +// IMPORT: Installed directly from nf-core/modules // include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' @@ -52,6 +52,7 @@ workflow TREEVAL_RAPID { main: ch_versions = Channel.empty() + params.entry = 'RAPID' input_ch = Channel.fromPath(params.input, checkIfExists: true) // // SUBWORKFLOW: reads the yaml and pushing out into a channel per yaml field @@ -103,7 +104,8 @@ workflow TREEVAL_RAPID { GENERATE_GENOME.out.ref_index, GENERATE_GENOME.out.dot_genome, YAML_INPUT.out.hic_reads, - YAML_INPUT.out.assembly_id + YAML_INPUT.out.assembly_id, + params.entry ) ch_versions = ch_versions.mix(HIC_MAPPING.out.versions) @@ -130,21 +132,29 @@ workflow TREEVAL_RAPID { GENERATE_GENOME.out.reference_tuple .combine( YAML_INPUT.out.assembly_classT ) .combine( YAML_INPUT.out.assembly_ttype ) - .map { meta, reference, lineage, ticket -> - tuple( + .combine( YAML_INPUT.out.assembly_id ) + .combine( LONGREAD_COVERAGE.out.ch_reporting ) + .combine( HIC_MAPPING.out.ch_reporting ) + .combine( CUSTOM_DUMPSOFTWAREVERSIONS.out.versions ) + .map { meta, reference, lineage, ticket, sample_id, longread_meta, longread_files, hic_meta, hic_files, custom_file -> [ + rf_data: tuple( [ id: meta.id, sz: file(reference).size(), ln: lineage, tk: ticket ], reference - ) + ), + sample_id: sample_id, + pb_data: tuple(longread_meta, longread_files), + cm_data: tuple(hic_meta, hic_files), + custom: custom_file, + ] } - .set { rf_data } + .set { collected_metrics_ch } - params.sample_id = YAML_INPUT.out.assembly_id.collect() - params.rf_data = rf_data.collect() // reference data tuple( [ id, size, lineage, ticket ], file) - params.pb_data = LONGREAD_COVERAGE.out.ch_reporting.collect() // merged pacbio.bam data tuple( [ id, size ], file ) | Should really be a collected list of the raw fasta - params.cm_data = HIC_MAPPING.out.ch_reporting.collect() // merged cram.bam data tuple( [ id, size ], file ) | Should really be a collected list of the raw cram + collected_metrics_ch.map { metrics -> + TreeValProject.summary(workflow, params, metrics, log) + } emit: software_ch = CUSTOM_DUMPSOFTWAREVERSIONS.out.yml @@ -161,10 +171,11 @@ workflow.onComplete { if (params.email || params.email_on_fail) { NfcoreTemplate.email(workflow, params, summary_params, projectDir, log) } - NfcoreTemplate.summary(workflow, params, log) - - TreeValProject.summary(workflow, params) + NfcoreTemplate.summary(workflow, params, log) + if (params.hook_url) { + NfcoreTemplate.IM_notification(workflow, params, summary_params, projectDir, log) + } } /*
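A note on the container changes that run through the module diffs above: following current nf-core convention, the registry is dropped from each biocontainers image string ('quay.io/biocontainers/pandas:1.5.2' becomes 'biocontainers/pandas:1.5.2'), while images that must come from Docker Hub are written fully qualified ('docker.io/ubuntu:20.04'). The short names only resolve if a default registry is set centrally; that setting is not shown in this diff, so the fragment below is a sketch of the assumed nextflow.config counterpart:

    // Assumed nextflow.config counterpart (per the nf-core template); not shown in this diff.
    // Unqualified image names such as 'biocontainers/pandas:1.5.2' are pulled from quay.io,
    // while fully qualified names such as 'docker.io/ubuntu:20.04' bypass the default registry.
    docker.registry      = 'quay.io'
    podman.registry      = 'quay.io'
    singularity.registry = 'quay.io'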
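The CUSTOM_GETCHROMSIZES patch relies on a staging trick that is easy to miss: path(fasta, stageAs: 'input/*') stages the incoming FASTA in an input/ subdirectory, which frees the top level of the task directory for a prefix-named symlink; the symlink is then indexed and emitted, so every output, including the renamed FASTA itself, carries the sample prefix and can be published. A minimal sketch of the same pattern, using a hypothetical process name:

    // Hypothetical process illustrating the stageAs-plus-symlink rename pattern.
    process RENAME_AND_INDEX {
        input:
        tuple val(meta), path(fasta, stageAs: 'input/*')    // staged as input/<original name>

        output:
        tuple val(meta), path('*.fa') , emit: fasta         // prefix-named symlink, publishable
        tuple val(meta), path('*.fai'), emit: fai

        script:
        def prefix = task.ext.prefix ?: "${meta.id}"
        """
        ln -s ${fasta} ${prefix}.fa     # no clash: the original sits under input/
        samtools faidx ${prefix}.fa     # writes the index alongside the symlink
        """
    }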
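The gating added to HIC_MAPPING composes two run-time signals: workflow_setting, passed down from the FULL or RAPID entry workflow, and params.config_profile_name, which the GitHub test profiles are assumed to set to a value containing 'GitHub'. PRETEXTMAP_HIGHRES is skipped on GitHub CI, and JUICER_TOOLS_PRE additionally requires a FULL run. Reduced to its skeleton, the logic is plain Groovy flow control around process calls:

    // Skeleton of the gating above; names are those used in the subworkflow.
    def profile = params.config_profile_name ?: 'Local'   // Elvis operator collapses the if/else in the diff

    if ( !profile.contains('GitHub') ) {                  // too heavy for GitHub CI runners
        PRETEXTMAP_HIGHRES ( pretext_input.input_bam, pretext_input.reference )
    }

    if ( workflow_setting == 'FULL' && !profile.contains('GitHub') ) {
        JUICER_TOOLS_PRE ( ch_juicer_input.paired, ch_juicer_input.genome, ch_juicer_input.id )
    }

Because the highres_pretext and hic emits are commented out or removed in the same change, downstream consumers must not depend on those channels when the guarded blocks are skipped.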
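Finally, the reporting rework in treeval.nf and treeval_rapid.nf replaces run-time writes to params (params.rf_data, params.pb_data and friends, which Nextflow treats as read-only configuration) with a single channel element: .combine joins the reference, sample ID and reporting channels, .map reshapes them into a named map, and TreeValProject.summary(workflow, params, metrics, log) is invoked once per element with the keys consumed in the lib diff at the top of this changeset. A trimmed sketch of that shape, with hypothetical channel names standing in for the real subworkflow outputs:

    // Sketch of the collected-metrics pattern; the ch_* names are placeholders.
    ch_reference_tuple
        .combine( ch_longread_reporting )
        .combine( ch_hic_reporting )
        .map { meta, reference, lr_meta, lr_files, hic_meta, hic_files ->
            [
                rf_data   : tuple( meta, reference ),      // reference file plus its metadata
                sample_id : meta.id,
                pb_data   : tuple( lr_meta, lr_files ),    // merged PacBio reporting tuple
                cm_data   : tuple( hic_meta, hic_files ),  // merged Hi-C CRAM reporting tuple
            ]
        }
        .map { metrics -> TreeValProject.summary( workflow, params, metrics, log ) }

Driving the side effect through .map mirrors the diff; .subscribe would express the intent (consume and log, emit nothing) slightly more directly.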