From 581fe442d60cecdab4341470087ef20affd49d79 Mon Sep 17 00:00:00 2001
From: Andre
Date: Fri, 18 Oct 2024 21:28:37 -0400
Subject: [PATCH] Test_1: Pipeline

---
 dockers/README.md                     |  31 +-
 pharmcat/PharmCAT.wdl                 | 473 ++++++++++++++++++++++++++
 pharmcat/README.md                    | 119 +++++++
 pharmcat/github.txt                   |   2 +
 pharmcat/test_file.json               |   6 +
 pipeline/PharmCAT_Pipeline.wdl        | 350 +++++++++++++++++--
 pipeline/test.json                    |   4 -
 pipeline/test_file.json               |   2 +-
 vcf_preprocessor/vcf_preprocessor.wdl |   8 +-
 9 files changed, 954 insertions(+), 41 deletions(-)
 create mode 100644 pharmcat/PharmCAT.wdl
 create mode 100644 pharmcat/README.md
 create mode 100644 pharmcat/github.txt
 create mode 100644 pharmcat/test_file.json
 delete mode 100644 pipeline/test.json

diff --git a/dockers/README.md b/dockers/README.md
index b6bea14..d58734c 100644
--- a/dockers/README.md
+++ b/dockers/README.md
@@ -1,10 +1,31 @@
+# Cloud Tools Docker Setup
 
-Run docker on-premisse:
+This repository contains the configuration for a Docker image designed to run in on-premise environments. The image includes various cloud-related tools and utilities.
 
-Command to create the imagem:
-$ docker build -t cloud-tools .
+## Running the Docker Image On-Premise
 
-Command to run the containner:
+### Step 1: Build the Docker Image
+
+To create the Docker image locally, use the following command:
+
+```bash
+docker build -t cloud-tools .
+```
+
+### Step 2: Run the Docker Container
+
+After building the image, you can run the container interactively using:
+
+```bash
 docker run -it cloud-tools
+```
+
+### Additional Information
+
+- This image is also **published on Docker Hub** for easier access and distribution.
+- All updates to the image are **automatically synced** to Docker Hub via **GitHub Actions**. The workflow for this automation is located in:
+
+  ```bash
+  .github/workflows/docker-build.yml
+  ```
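+
+- For reference, a minimal sketch of what such a workflow could look like (illustrative only; the actual file may differ, the build context and the credential secret names below are assumptions):
+
+  ```yaml
+  name: docker-build
+  on:
+    push:
+      branches: [main]
+      paths: ["dockers/**"]
+  jobs:
+    build-and-push:
+      runs-on: ubuntu-latest
+      steps:
+        - uses: actions/checkout@v4
+        - uses: docker/login-action@v3
+          with:
+            username: ${{ secrets.DOCKERHUB_USERNAME }}  # assumed secret name
+            password: ${{ secrets.DOCKERHUB_TOKEN }}     # assumed secret name
+        - uses: docker/build-push-action@v6
+          with:
+            context: ./dockers   # assumes the Dockerfile lives in dockers/
+            push: true
+            tags: ricoandre/cloud-tools:latest  # image name as used in the WDL runtime blocks
+  ```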
 
-.
\ No newline at end of file
diff --git a/pharmcat/PharmCAT.wdl b/pharmcat/PharmCAT.wdl
new file mode 100644
index 0000000..2ff1f94
--- /dev/null
+++ b/pharmcat/PharmCAT.wdl
@@ -0,0 +1,473 @@
+version 1.0
+
+workflow pharmcat {
+  input {
+    File? files_list            # Optional input file containing a list of URLs
+    Array[File]? files_local    # Optional array of local files
+    String? files_directory     # Read all VCFs from a directory
+    String? results_directory   # Write the results to a cloud directory
+    String pharmcat_version = "2.13.0"
+    Int max_concurrent_processes = 1
+    String max_memory = "4G"
+
+    Boolean run_vcf_preprocessor = true      # Flag to control the VCF Preprocessor
+    Boolean run_named_allele_matcher = true  # Flag to control the Named Allele Matcher
+    Boolean run_phenotyper = true            # Flag to control the Phenotyper
+    Boolean run_reporter = true              # Flag to control report generation
+  }
+
+  call cloud_reader_task {
+    input:
+      files_list = files_list,
+      files_local = files_local,
+      files_directory = files_directory,
+      max_concurrent_processes = max_concurrent_processes,
+      max_memory = max_memory
+  }
+
+  # Subtasks controlled by flags
+  if (run_vcf_preprocessor) {
+    call vcf_preprocessor_task {
+      input:
+        result_cloud_reader = cloud_reader_task.result_cloud_reader,
+        docker_version = pharmcat_version,
+        max_concurrent_processes = max_concurrent_processes,
+        max_memory = max_memory
+    }
+  }
+
+  if (run_named_allele_matcher) {
+    call named_allele_matcher_task {
+      input:
+        result_vcf_preprocessor = vcf_preprocessor_task.result_vcf_preprocessor,
+        docker_version = pharmcat_version,
+        max_concurrent_processes = max_concurrent_processes,
+        max_memory = max_memory
+    }
+  }
+
+  if (run_phenotyper) {
+    call phenotyper_task {
+      input:
+        result_named_allele_matcher = named_allele_matcher_task.result_named_allele_matcher,
+        docker_version = pharmcat_version,
+        max_concurrent_processes = max_concurrent_processes,
+        max_memory = max_memory
+    }
+  }
+
+  if (run_reporter) {
+    call reporter_task {
+      input:
+        result_phenotyper = phenotyper_task.result_phenotyper,
+        docker_version = pharmcat_version,
+        max_concurrent_processes = max_concurrent_processes,
+        max_memory = max_memory
+    }
+  }
+
+  if (defined(results_directory) && select_first([results_directory, ""]) != "") {
+    call cloud_writer_task {
+      input:
+        result_vcf_preprocessor = vcf_preprocessor_task.result_vcf_preprocessor,
+        results_directory = results_directory
+    }
+  }
+
+  output {
+    # File log = cloud_writer_task.log
+  }
+}
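+
+# A hypothetical Cromwell inputs file for this workflow (illustrative; the
+# bucket paths are assumptions) could toggle the stages like this:
+#   {
+#     "pharmcat.files_directory": "gs://my-bucket/vcfs",
+#     "pharmcat.results_directory": "gs://my-bucket/results",
+#     "pharmcat.run_named_allele_matcher": false,
+#     "pharmcat.run_reporter": false
+#   }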
+
+task cloud_reader_task {
+  input {
+    File? files_list
+    Array[File]? files_local
+    String? files_directory
+    Int max_concurrent_processes
+    String max_memory
+  }
+
+  command <<<
+    set -e -x -o pipefail
+
+    # Create folders
+    mkdir -p files/VCFs_inputs
+
+    # Create log file
+    log_file="files/log.txt"
+    touch $log_file
+    echo "-----------------------" >> $log_file
+    echo "Start Cloud Reader Task" >> $log_file
+    echo "-----------------------" >> $log_file
+
+    # Create the txt file
+    VCFs_list="files/VCFs_list.txt"
+    touch $VCFs_list
+
+    # Check gsutil
+    gsutil --version >> $log_file
+
+    # Process the directory input [ files_directory ]
+    # -----------------------------------------------
+    if [[ ~{true='true' false='false' defined(files_directory)} == "true" ]]; then
+      echo "Start to read from files directory: ~{files_directory}" >> $log_file
+      # Check whether files_directory is a Google Storage path
+      if [[ "~{files_directory}" == gs://* ]]; then
+        echo "Copying all VCF files from directory: ~{files_directory}" >> $log_file
+        # List all the VCF files in the directory
+        gsutil ls "~{files_directory}/*.vcf.*" >> $log_file
+        # Copy all the VCF files from the directory to the local folder
+        gsutil cp "~{files_directory}/*.vcf.*" files/VCFs_inputs/ >> $log_file
+        # Add all copied files to the VCFs list file
+        ls files/VCFs_inputs/*.vcf.* >> files/VCFs_list.txt
+        echo "All VCF files from ~{files_directory} have been copied to files/VCFs_inputs/" >> $log_file
+
+      # files_directory does not work with local paths: we cannot mount a local
+      # directory into the task at runtime.
+      # TODO - Add support for other cloud providers
+      # Handle unsupported directory formats
+      else
+        echo "ERROR: The directory path is not a valid gs:// URL. Skipping file copy." >> $log_file
+      fi
+    else
+      echo "The files_directory input wasn't defined" >> $log_file
+    fi
+
+    # Process the list file [ files_list ]
+    # ------------------------------------
+    if [[ -n "~{files_list}" ]]; then
+      echo "Start to read from files list" >> $log_file
+
+      for url in $(cat ~{files_list}); do
+        if [[ $url == http* ]]; then
+          echo "-- Get $url by wget" >> $log_file
+          wget -P files/VCFs_inputs $url --verbose
+
+        elif [[ $url == gs://* ]]; then
+          echo "-- Get $url by gsutil" >> $log_file
+          gsutil cp $url files/VCFs_inputs/
+
+        # TODO - Add support for other cloud providers
+        else
+          echo "-- URL format not supported: $url" >> $log_file
+        fi
+      done
+    fi
+
+    # Process the local files in the array [ files_local ]
+    # ----------------------------------------------------
+    if [[ ~{true='true' false='false' defined(files_local)} == "true" ]]; then
+      echo "Processing files from the array" >> $log_file
+
+      for file in ~{sep=' ' files_local}; do
+        file_name=$(basename "$file")  # Extract just the filename
+        echo "Processing $file_name" >> $log_file
+        if [[ -f "files/VCFs_inputs/$file_name" ]]; then
+          echo "-- File $file_name already exists, skipping" >> $log_file
+        else
+          cp "$file" "files/VCFs_inputs/"
+          echo "-- File $file_name copied to files/VCFs_inputs/" >> $log_file
+        fi
+      done
+    fi
+
+    # Create VCFs_list.txt
+    ls files/VCFs_inputs/* | sort > $VCFs_list
+
+    # Prepare the folder structure for the next task
+    if [[ $(ls files/VCFs_inputs | wc -l) -gt 0 ]]; then
+      file_count=$(ls files/VCFs_inputs/* | wc -l)
+      echo "Number of files in VCFs_inputs: $file_count" >> $log_file
+      echo "End of Cloud Reader Task" >> $log_file
+      tar -czvf files.tar.gz files
+    else
+      echo "No files to compress, task failed." >> $log_file
+      exit 1
+    fi
+  >>>
+
+  output {
+    # Return the compressed archive instead of an array of individual files
+    File result_cloud_reader = "files.tar.gz"
+  }
+
+  runtime {
+    # docker: "google/cloud-sdk:slim"
+    docker: "ricoandre/cloud-tools:latest"
+    memory: max_memory
+    cpu: max_concurrent_processes
+  }
+}
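+
+# For reference, the archive handed to the next task has this layout
+# (illustrative, assuming the two sample VCFs from github.txt were fetched):
+#   files/
+#     log.txt
+#     VCFs_list.txt
+#     VCFs_inputs/
+#       sample_2.vcf
+#       sample_3.vcf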
+
+task vcf_preprocessor_task {
+  input {
+    # Data from cloud_reader_task
+    File result_cloud_reader
+
+    # Environment settings
+    String docker_version
+    Int max_concurrent_processes
+    String max_memory
+
+    # Adapted for this task
+    Boolean single_vcf_mode = true  # Default is to run VCF files individually
+
+    # Inputs from pharmcat_vcf_preprocessor.py
+    File? sample_file                    # Optional file containing a list of sample IDs
+    String? sample_ids                   # Optional comma-separated list of sample IDs
+    Boolean single_sample = false        # Whether to generate one VCF per sample
+    Boolean missing_to_ref = false       # Whether to add missing PGx positions as reference (use with caution)
+    Boolean concurrent_mode = false      # Enable concurrent mode
+    Boolean no_gvcf_check = false        # Bypass the check for gVCF format
+    File? reference_pgx_vcf              # Custom PGx VCF for reference positions
+    File? reference_genome               # Custom reference genome (GRCh38)
+    Boolean retain_specific_regions = false  # Retain specific regions
+    File? reference_regions_to_retain        # BED file specifying PGx regions to retain
+
+    # Inputs from pharmcat_vcf_preprocessor.py that do not work in a cloud environment
+    # File vcf_file                            # Input VCF file (single VCF or a list file with multiple VCFs)
+    # String? output_dir = "."                 # Output directory for the processed files
+    # String? base_filename                    # Prefix for the output files
+    # Boolean keep_intermediate_files = false  # Whether to keep intermediate files
+    # Boolean verbose = false                  # Enable verbose output
+    # File? bcftools_path                      # Optional custom path to bcftools
+    # File? bgzip_path                         # Optional custom path to bgzip
+  }
+
+  command <<<
+    set -e -x -o pipefail
+
+    # Extract the compressed archive from cloud_reader_task
+    tar -xzvf ~{result_cloud_reader}
+
+    # Start log file
+    log_file="files/log.txt"
+    echo " " >> $log_file
+    echo "---------------------------" >> $log_file
+    echo "Start VCF Preprocessor Task" >> $log_file
+    echo "---------------------------" >> $log_file
+
+    # Common arguments (note: WDL inputs must be interpolated with ~{}
+    # placeholders; plain $variables would be empty in this heredoc)
+    arg=" -o files/Results"
+    if [ ! -z "~{sample_file}" ]; then
+      arg+=" -S ~{sample_file}"
+    fi
+    if [ ! -z "~{sample_ids}" ]; then
+      arg+=" -s ~{sample_ids}"
+    fi
+    if [ "~{single_sample}" == "true" ]; then
+      arg+=" -ss"
+    fi
+    if [ "~{missing_to_ref}" == "true" ]; then
+      arg+=" -0"
+    fi
+    if [ "~{concurrent_mode}" == "true" ]; then
+      arg+=" -c"
+    fi
+    if [ ! -z "~{max_concurrent_processes}" ]; then
+      arg+=" -cp ~{max_concurrent_processes}"
+    fi
+    if [ "~{no_gvcf_check}" == "true" ]; then
+      arg+=" -G"
+    fi
+    echo "Set common arguments: $arg" >> $log_file
+
+    # Mandatory argument: -vcf
+    # ------------------------
+    # Path to a single VCF file, or to a file containing a list of VCF file paths
+    # (one per line) sorted by chromosome position. All VCF files must have the
+    # same set of samples. Use a list when data for a sample has been split among
+    # multiple files (e.g. VCF files from large cohorts, such as UK Biobank).
+    # Input VCF files must comply with Variant Call Format (VCF) version >= 4.2.
+
+    # single_vcf_mode controls which mode is used.
+
+    # Process the same set of samples across all split VCF files.
+    if [ "~{single_vcf_mode}" == "false" ]; then
+      echo "Running VCF Preprocessor in list-file mode" >> $log_file
+      cmd="python3 /pharmcat/pharmcat_vcf_preprocessor.py -vcf files/VCFs_list.txt"
+      cmd="$cmd $arg"  # Concatenate the arguments
+      echo "Running: $cmd" >> $log_file
+      eval $cmd
+    # Process each file in VCFs_list.txt individually
+    else
+      echo "Running VCF Preprocessor in single-VCF mode" >> $log_file
+      while read -r vcf_file; do
+        echo "Processing file: $vcf_file" >> $log_file
+        cmd="python3 /pharmcat/pharmcat_vcf_preprocessor.py -vcf $vcf_file"
+        cmd="$cmd $arg"  # Concatenate the arguments
+        echo "Running: $cmd" >> $log_file
+        eval $cmd
+      done < files/VCFs_list.txt
+    fi
+
+    echo "pharmcat_vcf_preprocessor.py finished" >> $log_file
+
+    # Check results files
+    if [ -n "$(ls files/Results/ 2>/dev/null)" ]; then
+      echo "Results files:" >> $log_file
+      ls files/Results/* >> $log_file
+    else
+      echo "No results found in files/Results/" >> $log_file
+    fi
+
+    # Package the entire 'files' directory into a tar.gz file
+    # tar -czvf result_vcf_preprocessor.tar.gz -C files .  # -C would archive the contents of 'files' without the folder itself
+    tar -czvf result_vcf_preprocessor.tar.gz files
+  >>>
+
+  output {
+    # Return the packaged tar.gz file containing all the processed files
+    File result_vcf_preprocessor = "result_vcf_preprocessor.tar.gz"
+  }
+
+  runtime {
+    docker: "pgkb/pharmcat:${docker_version}"  # User-specified or default Docker version
+    memory: max_memory
+    cpu: max_concurrent_processes
+  }
+}
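+
+# For reference, in single-VCF mode with missing_to_ref and concurrent_mode
+# enabled, the composed call for each listed VCF would look like this
+# (illustrative; the file name is assumed):
+#   python3 /pharmcat/pharmcat_vcf_preprocessor.py -vcf files/VCFs_inputs/sample_2.vcf \
+#     -o files/Results -0 -c -cp 1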
+
+task named_allele_matcher_task {
+  input {
+    # Data from vcf_preprocessor_task
+    File? result_vcf_preprocessor
+
+    # Environment settings
+    String docker_version
+    Int max_concurrent_processes
+    String max_memory
+  }
+
+  command <<<
+    # Placeholder implementation
+    echo " --------- "
+    touch results.txt
+  >>>
+
+  output {
+    # Placeholder output until the real matcher is wired in
+    File result_named_allele_matcher = "results.txt"
+  }
+
+  runtime {
+    docker: "pgkb/pharmcat:${docker_version}"  # User-specified or default Docker version
+    memory: max_memory
+    cpu: max_concurrent_processes
+  }
+}
+
+task phenotyper_task {
+  input {
+    # Data from named_allele_matcher_task
+    File? result_named_allele_matcher
+
+    # Environment settings
+    String docker_version
+    Int max_concurrent_processes
+    String max_memory
+  }
+
+  command <<<
+    # Placeholder implementation
+    echo " --------- "
+    touch results.txt
+  >>>
+
+  output {
+    # Placeholder output until the real phenotyper is wired in
+    File result_phenotyper = "results.txt"
+  }
+
+  runtime {
+    docker: "pgkb/pharmcat:${docker_version}"  # User-specified or default Docker version
+    memory: max_memory
+    cpu: max_concurrent_processes
+  }
+}
+
+task reporter_task {
+  input {
+    # Data from phenotyper_task
+    File? result_phenotyper
+
+    # Environment settings
+    String docker_version
+    Int max_concurrent_processes
+    String max_memory
+  }
+
+  command <<<
+    # Placeholder implementation
+    echo " --------- "
+    touch results.txt
+  >>>
+
+  output {
+    # Placeholder output until the real reporter is wired in
+    File result_reporter = "results.txt"
+  }
+
+  runtime {
+    docker: "pgkb/pharmcat:${docker_version}"  # User-specified or default Docker version
+    memory: max_memory
+    cpu: max_concurrent_processes
+  }
+}
+
+task cloud_writer_task {
+  input {
+    File? result_vcf_preprocessor
+    String? results_directory
+  }
+
+  command <<<
+    set -e -x -o pipefail
+
+    # Extract the compressed archive from vcf_preprocessor_task
+    tar -xzvf ~{result_vcf_preprocessor}
+
+    # Start log file
+    log_file="files/log.txt"
+    echo " " >> $log_file
+    echo "-----------------------" >> $log_file
+    echo "Start Cloud Writer Task" >> $log_file
+    echo "-----------------------" >> $log_file
+
+    # Ensure gsutil is available in this environment
+    if ! command -v gsutil &> /dev/null; then
+      echo "ERROR: gsutil not found. Please ensure gsutil is available." >> $log_file
+      exit 1
+    fi
+
+    # Save results in the directory defined by the user
+    echo "Copying results to ~{results_directory}" >> $log_file
+
+    # TODO - Add support for other cloud providers
+    if [[ "~{results_directory}" == gs://* ]]; then
+      # Copy individual result files (the archive extracts into files/Results)
+      gsutil cp files/Results/* "~{results_directory}/" >> $log_file
+      # Copy the preprocessor tar.gz as well
+      gsutil cp ~{result_vcf_preprocessor} ~{results_directory}/ >> $log_file
+    else
+      echo "ERROR: Unsupported storage destination. Only gs:// is supported in this task." >> $log_file
+      exit 1
+    fi
+
+    echo "Cloud Writer Task completed successfully." >> $log_file
+  >>>
+
+  output {
+    File log = "files/log.txt"
+  }
+
+  runtime {
+    docker: "ricoandre/cloud-tools:latest"
+    memory: "4G"
+    cpu: 1
+  }
+}
\ No newline at end of file
diff --git a/pharmcat/README.md b/pharmcat/README.md
new file mode 100644
index 0000000..1e20c6e
--- /dev/null
+++ b/pharmcat/README.md
@@ -0,0 +1,119 @@
+# WDL to run PharmCAT_Pipeline
+
+This WDL script executes the PharmCAT pipeline on a specified VCF file or a set of VCF files, processing genetic data to provide pharmacogenomic insights. The workflow automates the execution of the PharmCAT pipeline, streamlining the analysis of genetic variants to predict drug response and tailor medical treatment to individual patients' genetic profiles. By leveraging the Workflow Description Language (WDL), this script ensures reproducibility, scalability, and ease of use across various computational environments.
+
+## Input Parameters
+
+### Input Arguments
+- `File vcf_file`: Path to a VCF file, or to a file of paths to VCF files (one file per line), sorted by chromosome position.
+- `String sample_ids` (default: `""`): A comma-separated list of sample IDs.
+- `File? sample_file` (default: `null`): A file containing a list of samples, one sample per line.
+
+### Preprocessor Arguments
+- `Boolean missing_to_ref` (default: `false`): Assume genotypes at missing PGx sites are 0/0. DANGEROUS!
+- `Boolean no_gvcf_check` (default: `false`): Bypass the gVCF check for the input VCF. DANGEROUS!
+- `Boolean retain_specific_regions` (default: `false`): Retain the genomic regions specified by `-refRegion`.
+- `File? reference_regions` (default: `null`): A sorted BED file of specific PGx regions to retain. Must be used with the `-R` argument.
+
+### Named Allele Matcher Arguments
+- `Boolean run_matcher` (default: `false`): Run the named allele matcher independently.
+- `Boolean matcher_all_results` (default: `false`): Return all possible diplotypes, not just the top hits.
+- `Boolean matcher_save_html` (default: `false`): Save named allele matcher results as HTML.
+- `String research_mode` (default: `""`): Comma-separated list of research features to enable: [cyp2d6, combinations].
+
+### Phenotyper Arguments
+- `Boolean run_phenotyper` (default: `false`): Run the phenotyper independently.
+
+### Reporter Arguments
+- `Boolean run_reporter` (default: `false`): Run the reporter independently.
+- `String reporter_sources` (default: `""`): Comma-separated list of sources to limit the report to: [CPIC, DPWG].
+- `Boolean reporter_extended` (default: `false`): Output an extended report.
+- `Boolean reporter_save_json` (default: `false`): Save reporter results as JSON.
+
+### Output Arguments
+- `String base_filename` (default: `""`): Prefix for output files. Defaults to the same base name as the input.
+- `Boolean delete_intermediate_files` (default: `false`): Delete intermediate PharmCAT files (saved by default).
+
+### Concurrency/Memory Arguments
+- `Int max_concurrent_processes` (default: `1`): The maximum number of processes to use when concurrent mode is enabled.
+- `String max_memory` (default: `"4G"`): The maximum memory PharmCAT should use (e.g. "64G"). This is passed to Java via the -Xmx flag.
+
+## Outputs
+- `Array[File] results_all`: The results of the PharmCAT pipeline. These files are saved in the execution directory of the job.
+
+## Running the PharmCAT Pipeline
+For convenience, the `pharmcat_pipeline` script simplifies running the entire PharmCAT pipeline (the VCF Preprocessor and the core PharmCAT tool). The necessary dependencies are already included in the provided image.
+
+### Prerequisites
+The required dependencies (python3, java, bcftools, bgzip) are included in the provided image, so no additional installation is needed.
+
+### Execution Platforms
+This WDL script can be executed using [Cromwell](https://github.com/broadinstitute/cromwell) or on platforms such as [Terra](https://support.terra.bio/hc/en-us) and [AnVIL](https://anvil.terra.bio/#) that can be launched from [Dockstore](https://dockstore.org/workflows/github.com/AndreRico/PharmCAT_Dockstore/PharmCAT-Pipeline:main?tab=info).
+
+### Local Execution
+To run the WDL with the PharmCAT pipeline locally, ensure that Docker and Cromwell are installed in your execution environment. Then execute the following command:
+
+```sh
+$ java -jar {path}/cromwell-{version}.jar run {path}/pharmCAT_Pipeline.wdl -i {path}/inputs.json
+```
+
+Here is an example of how to provide the inputs in a JSON file:
+
+```json
+{
+    "pharmcat_pipeline.vcf_file": "gs://your-bucket/path/to/your.vcf",
+    "pharmcat_pipeline.sample_ids": "",
+    "pharmcat_pipeline.sample_file": null,
+    "pharmcat_pipeline.missing_to_ref": false,
+    "pharmcat_pipeline.no_gvcf_check": false,
+    "pharmcat_pipeline.retain_specific_regions": false,
+    "pharmcat_pipeline.reference_regions": null,
+    "pharmcat_pipeline.run_matcher": false,
+    "pharmcat_pipeline.matcher_all_results": false,
+    "pharmcat_pipeline.matcher_save_html": false,
+    "pharmcat_pipeline.research_mode": "",
+    "pharmcat_pipeline.run_phenotyper": false,
+    "pharmcat_pipeline.run_reporter": false,
+    "pharmcat_pipeline.reporter_sources": "",
+    "pharmcat_pipeline.reporter_extended": false,
+    "pharmcat_pipeline.reporter_save_json": false,
+    "pharmcat_pipeline.base_filename": "",
+    "pharmcat_pipeline.delete_intermediate_files": false,
+    "pharmcat_pipeline.max_concurrent_processes": 1,
+    "pharmcat_pipeline.max_memory": "4G"
+}
+```
+
+## Software Version Notes
+
+### WDL Version
+This script is written in WDL (Workflow Description Language) 1.0 to ensure compatibility with Dockstore. For more information about WDL, visit the WDL documentation.
+
+### Cromwell Version
+Successfully tested on v53 and v87.
+
+### PharmCAT and Image Version
+v2.13.0
+
+### Important Notes
+- Runtime parameters are optimized for Google Cloud Platform implementation.
+- The provided JSON is a generic, ready-to-use example template for the workflow. It is the user's responsibility to correctly set the reference and resource variables for their own particular test case using the PharmCAT documentation.
+
+## Documentation Links
+- [PharmCAT Documentation](https://pharmcat.org/)
+- [PharmCAT-Pipeline Documentation](https://pharmcat.org/using/Running-PharmCAT-Pipeline/)
+- [PharmCAT Project](https://github.com/PharmGKB/PharmCAT)
+
+## Contact
+For technical questions or bug reports, [file an issue](https://github.com/PharmGKB/PharmCAT/issues).
+
+For general questions about the PharmCAT project, contact [pharmcat@pharmgkb.org](mailto:pharmcat@pharmgkb.org).
+
+## License
+(C) 2024 Your Organization | BSD-3
+
+This script is released under the [WDL open source code license](https://github.com/openwdl/wdl/blob/master/LICENSE) (BSD-3). Note, however, that the programs it calls may be subject to different licenses. Users are responsible for checking that they are authorized to run all programs before running this script.
\ No newline at end of file
diff --git a/pharmcat/github.txt b/pharmcat/github.txt
new file mode 100644
index 0000000..f68d9c2
--- /dev/null
+++ b/pharmcat/github.txt
@@ -0,0 +1,2 @@
+https://raw.githubusercontent.com/AndreRico/PharmCAT_Dockstore/refs/heads/main/data/sample_2.vcf
+https://raw.githubusercontent.com/AndreRico/PharmCAT_Dockstore/refs/heads/main/data/sample_3.vcf
diff --git a/pharmcat/test_file.json b/pharmcat/test_file.json
new file mode 100644
index 0000000..b9d4ab8
--- /dev/null
+++ b/pharmcat/test_file.json
@@ -0,0 +1,6 @@
+{
+    "pharmcat.files_list": "/Users/andrerico/Works/Projects/PharmCAT/PharmCAT_Dockstore/pipeline/github.txt",
+    "pharmcat.max_concurrent_processes": 1,
+    "pharmcat.max_memory": "4G",
+    "pharmcat.run_vcf_preprocessor": true
+}
\ No newline at end of file
diff --git a/pipeline/PharmCAT_Pipeline.wdl b/pipeline/PharmCAT_Pipeline.wdl
index 246c114..11e5415 100644
--- a/pipeline/PharmCAT_Pipeline.wdl
+++ b/pipeline/PharmCAT_Pipeline.wdl
@@ -2,29 +2,46 @@ version 1.0
 
 workflow pharmcat_pipeline {
   input {
-    File? files_list            # Optional input file containing list of URLs
-    Array[File]? files_local    # Optional array of files
-    String? files_directory     # Read all VCF from a diretory
-    String? results_directory   # Write the Results in Cloud Diretory
+    # File? input_file           # Single VCF or TSV file
+    String? input_directory      # Read all VCFs from a directory
+    # String? results_directory  # Write the results to a cloud directory
+
+    String pharmcat_version = "2.13.0"
     Int max_concurrent_processes = 1
     String max_memory = "4G"
+  }
 
-  call a_cloud_reader_task {
+  call cloud_reader_task {
     input:
-      files_list = files_list,
+      input_directory = input_directory,
       max_concurrent_processes = max_concurrent_processes,
       max_memory = max_memory
   }
 
+  call pipeline_task {
+    input:
+      result_cloud_reader = cloud_reader_task.result_cloud_reader,
+      docker_version = pharmcat_version,
+      max_concurrent_processes = max_concurrent_processes,
+      max_memory = max_memory
+  }
+
+  # call cloud_writer_task {
+  #   input:
+  #     results = pipeline_task.results,
+  # }
+
   output {
-    File compressed_files = a_cloud_reader_task.compressed_files
+    # File results = pipeline_task.results
+    File result_cloud_reader = cloud_reader_task.result_cloud_reader
   }
 }
 
-task a_cloud_reader_task {
+task cloud_reader_task {
   input {
-    File? files_list    # Optional input file containing list of URLs
+    String? input_directory
     Int max_concurrent_processes
     String max_memory
   }
@@ -32,40 +49,319 @@ task cloud_reader_task {
   command <<<
     set -e -x -o pipefail
 
-    # Install necessary tools
-    apt-get update && apt-get install -y \
-      wget \
-      curl \
-      python3 \
-      python3-pip \
-      unzip
-
     # Create folders
-    mkdir -p files
-    mkdir -p files/VCFs_inputs
+    mkdir -p files/input_directory
 
     # Create log file
     log_file="files/log.txt"
     touch $log_file
+    echo "-----------------------" >> $log_file
     echo "Start Cloud Reader Task" >> $log_file
-
-    # Create the txt file
-    VCFs_list="files/VCFs_list.txt"
-    touch $VCFs_list
+    echo "-----------------------" >> $log_file
 
     # Check gsutil
     gsutil --version >> $log_file
 
+    # Process the directory input [ input_directory ]
+    # -----------------------------------------------
+    if [[ ~{true='true' false='false' defined(input_directory)} == "true" ]]; then
+      echo "Start to read from files directory: ~{input_directory}" >> $log_file
+      # Check whether input_directory is a Google Storage path
+      if [[ "~{input_directory}" == gs://* ]]; then
+        echo "Copying all files from directory: ~{input_directory}" >> $log_file
+        # List all the files in the directory
+        gsutil ls "~{input_directory}/*" >> $log_file
+        # Copy all the files from the directory to the local folder
+        gsutil cp "~{input_directory}/*" files/input_directory/ >> $log_file
+        echo "All files from ~{input_directory} have been copied to files/input_directory/" >> $log_file
+      else
+        echo "ERROR: The directory path is not a valid gs:// URL. Skipping file copy." >> $log_file
+      fi
+    else
+      echo "The input_directory input wasn't defined" >> $log_file
+    fi
+
+    # Prepare the folder structure for the next task
+    if [[ $(ls files/input_directory | wc -l) -gt 0 ]]; then
+      file_count=$(ls files/input_directory/* | wc -l)
+      echo "Number of files copied: $file_count" >> $log_file
+      echo "End of Cloud Reader Task" >> $log_file
+      tar -czvf files.tar.gz files
+    else
+      echo "No files to compress" >> $log_file
+      tar -czvf files.tar.gz files
+    fi
+  >>>
+
+  output {
+    File result_cloud_reader = "files.tar.gz"
+  }
+
+  runtime {
+    docker: "ricoandre/cloud-tools:latest"
+    memory: max_memory
+    cpu: max_concurrent_processes
+  }
+}
+
+task pipeline_task {
+  input {
+    # Environment settings
+    String docker_version
+    Int max_concurrent_processes
+    String max_memory
+    Boolean delete_intermediate_files = false
+
+    # Directory from cloud_reader_task
+    File result_cloud_reader
+
+    # Single-file inputs
+    File? vcf_file
+    String? base_filename
+
+    # Sample information
+    File? sample_file   # Optional file containing a list of sample IDs
+    String? sample_ids  # Optional comma-separated list of sample IDs
+
+    # Preprocessor arguments
+    Boolean missing_to_ref = false
+    Boolean no_gvcf_check = false
+    Boolean retain_specific_regions = false  # Flag to retain specific genomic regions
+    File? reference_regions_to_retain        # BED file specifying PGx regions to retain
+
+    # Named Allele Matcher arguments
+    Boolean run_matcher = false  # Flag to run only the Named Allele Matcher
+    Boolean matcher_all_results = false
+    Boolean matcher_save_html = false
+    String research_mode = ""
+
+    # Phenotyper arguments
+    Boolean run_phenotyper = false  # Flag to run only the Phenotyper
+
+    # Reporter arguments
+    Boolean run_reporter = false  # Flag to run only the Reporter
+    String reporter_sources = ""
+    Boolean reporter_extended = false
+    Boolean reporter_save_json = false
+  }
+
+  command <<<
+    set -e -x -o pipefail
+
+    # Extract the compressed archive from cloud_reader_task
+    tar -xzvf ~{result_cloud_reader}
+
+    # Start log file
+    log_file="files/log.txt"
+    echo " " >> $log_file
+    echo "----------------------------" >> $log_file
+    echo "Start PharmCAT Pipeline Task" >> $log_file
+    echo "----------------------------" >> $log_file
+
+    # Common arguments (note: WDL inputs must be interpolated with ~{}
+    # placeholders; plain $variables would be empty in this heredoc)
+    arg=" -o files/Results"
+
+    # Sample inputs
+    if [ ! -z "~{sample_file}" ]; then
+      arg+=" -S ~{sample_file}"
+    fi
+
+    if [ ! -z "~{sample_ids}" ]; then
+      arg+=" -s ~{sample_ids}"
+    fi
+
+    # Preprocessor arguments
+    if [ "~{missing_to_ref}" == "true" ]; then
+      arg+=" -0"  # --missing-to-ref
+    fi
+
+    if [ "~{no_gvcf_check}" == "true" ]; then
+      arg+=" -G"  # --no-gvcf-check
+    fi
+
+    if [ "~{retain_specific_regions}" == "true" ]; then
+      arg+=" -R"  # Retain specific regions
+    fi
+
+    if [ ! -z "~{reference_regions_to_retain}" ]; then
+      arg+=" -refRegion ~{reference_regions_to_retain}"  # BED file for regions to retain
+    fi
+
+    # Named Allele Matcher arguments
+    if [ "~{run_matcher}" == "true" ]; then
+      arg+=" -matcher"  # Run the named allele matcher
+    fi
+
+    if [ "~{matcher_all_results}" == "true" ]; then
+      arg+=" -ma"  # Return all possible diplotypes
+    fi
+
+    if [ "~{matcher_save_html}" == "true" ]; then
+      arg+=" -matcherHtml"  # Save matcher results as HTML
+    fi
+
+    if [ ! -z "~{research_mode}" ]; then
+      arg+=" -research ~{research_mode}"  # Enable research-mode features
+    fi
+
+    # Phenotyper arguments
+    if [ "~{run_phenotyper}" == "true" ]; then
+      arg+=" -phenotyper"  # Run the phenotyper independently
+    fi
+
+    # Reporter arguments
+    if [ "~{run_reporter}" == "true" ]; then
+      arg+=" -reporter"  # Run the reporter independently
+    fi
+
+    if [ ! -z "~{reporter_sources}" ]; then
+      arg+=" -rs ~{reporter_sources}"  # Limit the report to specific sources
+    fi
+
+    if [ "~{reporter_extended}" == "true" ]; then
+      arg+=" -re"  # Write an extended report
+    fi
+
+    if [ "~{reporter_save_json}" == "true" ]; then
+      arg+=" -reporterJson"  # Save reporter results as JSON
+    fi
+
+    # Output and concurrency arguments
+    if [ ! -z "~{base_filename}" ]; then
+      arg+=" -bf ~{base_filename}"  # Set the base filename for output
+    fi
+
+    if [ "~{delete_intermediate_files}" == "true" ]; then
+      arg+=" -del"  # Delete intermediate PharmCAT files
+    fi
+
+    if [ ! -z "~{max_concurrent_processes}" ]; then
+      arg+=" -cp ~{max_concurrent_processes}"  # Set max concurrent processes
+    fi
+
+    if [ ! -z "~{max_memory}" ]; then
+      arg+=" -cm ~{max_memory}"  # Set max memory
+    fi
+
+    echo "Set common arguments: $arg" >> $log_file
+
+    # Mandatory argument: the VCF path
+    # --------------------------------
+    # Path to a single VCF file, or to a file containing a list of VCF file paths
+    # (one per line) sorted by chromosome position. All VCF files must have the
+    # same set of samples. Use a list when data for a sample has been split among
+    # multiple files (e.g. VCF files from large cohorts, such as UK Biobank).
+    # Input VCF files must comply with Variant Call Format (VCF) version >= 4.2.
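+
+    # For reference, with run_reporter and reporter_extended enabled, the
+    # composed command would look like this (illustrative; file name assumed):
+    #   pharmcat_pipeline files/input_directory/sample_2.vcf \
+    #     -o files/Results -reporter -re -cp 1 -cm 4G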
+
+    # Example of the final composed command:
+    #   cmd="python3 /path/to/pharmcat_pipeline.py $arg"
+    #   echo "Running: $cmd" >> files/log.txt
+    #   eval $cmd
+
+    VCFs_list="files/VCFs_list.txt"
+    touch $VCFs_list
+
+    echo "Run PharmCAT Pipeline" >> $log_file
+
+    # Option 1: the user provided a VCF or TSV file in the vcf_file input
+    if [[ -n "~{vcf_file}" && -f ~{vcf_file} ]]; then
+      # Copy it into input_directory, which holds all VCF files referenced by a TSV or by outside calls
+      cp ~{vcf_file} files/input_directory
+      echo "Processing a single VCF or TSV" >> $log_file
+      # Prepare the command syntax
+      cmd="pharmcat_pipeline files/input_directory/$(basename ~{vcf_file}) $arg"
+      echo "Running command: $cmd" >> $log_file
+      eval $cmd
+
+    # Option 2: no VCF or TSV input; process the directory contents instead
+    elif [[ -z "~{vcf_file}" ]]; then
+      echo "Processing all individual VCF files in the directory" >> $log_file
+      ls files/input_directory/*.vcf.* >> $VCFs_list  # List every VCF in the directory
+
+      # Run each VCF file in the directory individually
+      for vcf_file in $(cat $VCFs_list); do
+        echo "Processing individual VCF file: $vcf_file" >> $log_file
+        cmd="pharmcat_pipeline $vcf_file $arg"
+        echo "Running command: $cmd" >> $log_file
+        eval $cmd
+      done
+
+    else
+      echo "No VCF or list of VCFs provided. Exiting." >> $log_file
+      exit 1
+    fi
+
+    echo "pharmcat_pipeline finished" >> $log_file
+
+    # Package the entire 'files' directory into a tar.gz file
+    tar -czvf results.tar.gz files
   >>>
 
   output {
-    # Return the compressed file instead of an array of individual files
-    File compressed_files = "files/log.txt"
+    File results = "results.tar.gz"
   }
 
   runtime {
-    docker: "google/cloud-sdk:slim"
+    docker: "pgkb/pharmcat:${docker_version}"  # User-specified or default Docker version
     memory: max_memory
     cpu: max_concurrent_processes
   }
-}
\ No newline at end of file
+}
+
+
+# task cloud_writer_task {
+#   input {
+#     File? result_vcf_preprocessor
+#     String? results_directory
+#   }
+
+#   command <<<
+#     set -e -x -o pipefail
+
+#     # Extract the compressed archive from cloud_reader_task
+#     tar -xzvf ~{result_vcf_preprocessor}
+
+#     # Start log file
+#     log_file="files/log.txt"
+#     echo " " >> $log_file
+#     echo "-----------------------" >> $log_file
+#     echo "Start Cloud Writer Task" >> $log_file
+#     echo "-----------------------" >> $log_file
+
+#     # Ensure gsutil is available in this environment
+#     if ! command -v gsutil &> /dev/null; then
+#       echo "ERROR: gsutil not found. Please ensure gsutil is available." >> $log_file
+#       exit 1
+#     fi
+
+#     # Save results in the directory defined by the user
+#     echo "Copying results to ~{results_directory}" >> $log_file
+
+#     # TODO - Add support for other cloud providers
+#     if [[ "~{results_directory}" == gs://* ]]; then
+#       # Copy individual result files
+#       gsutil cp Results/* "~{results_directory}/" >> $log_file
+#       # Copy the preprocessor tar.gz as well
+#       gsutil cp ~{result_vcf_preprocessor} ~{results_directory}/ >> $log_file
+#     else
+#       echo "ERROR: Unsupported storage destination. Only gs:// is supported in this task." >> $log_file
+#       exit 1
+#     fi
+
+#     echo "Cloud Writer Task completed successfully." >> $log_file
+#   >>>
+
+#   output {
+#     File log = "files/log.txt"
+#   }
+
+#   runtime {
+#     docker: "ricoandre/cloud-tools:latest"
+#     memory: "4G"
+#     cpu: 1
+#   }
+# }
\ No newline at end of file
diff --git a/pipeline/test.json b/pipeline/test.json
deleted file mode 100644
index cf5864e..0000000
--- a/pipeline/test.json
+++ /dev/null
@@ -1,4 +0,0 @@
-{
-    "pharmcat_workflow.vcf_file": "{path}/{file}.vcf",
-    "pharmcat_workflow.output_directory": "{results_path}"
-}
diff --git a/pipeline/test_file.json b/pipeline/test_file.json
index a4687f8..d6096f2 100644
--- a/pipeline/test_file.json
+++ b/pipeline/test_file.json
@@ -1,5 +1,5 @@
 {
-    "pharmcat_pipeline.files_list": "/Users/andrerico/Works/Projects/PharmCAT/PharmCAT_Dockstore/pipeline/github.txt",
+    "pharmcat_pipeline.pipeline_task.vcf_file": "/Users/andrerico/Works/Projects/PharmCAT/PharmCAT_Dockstore/data/sample_2.vcf",
     "pharmcat_pipeline.max_concurrent_processes": 1,
     "pharmcat_pipeline.max_memory": "4G"
 }
\ No newline at end of file
diff --git a/vcf_preprocessor/vcf_preprocessor.wdl b/vcf_preprocessor/vcf_preprocessor.wdl
index 9eb2319..27cff5a 100644
--- a/vcf_preprocessor/vcf_preprocessor.wdl
+++ b/vcf_preprocessor/vcf_preprocessor.wdl
@@ -276,10 +276,10 @@ task b_vcf_preprocessor {
     tar -czvf pre_processor.tar.gz -C files .  # Use -C to change directory and include all contents of 'files' folder
   >>>
 
-output {
-  # Return the packaged tar.gz file containing all the processed files
-  File pre_processor = "pre_processor.tar.gz"
-}
+  output {
+    # Return the packaged tar.gz file containing all the processed files
+    File pre_processor = "pre_processor.tar.gz"
+  }
 
   runtime {
     docker: "pgkb/pharmcat:${docker_version}"  # Use the user-specified or default Docker version