From 7de0569fee5d0e9e22cf77c0066cbf88418f1ec3 Mon Sep 17 00:00:00 2001
From: Andre
Date: Mon, 21 Oct 2024 21:35:30 -0400
Subject: [PATCH] Test_06: Writer

---
 data/input_vcf_list.txt             |  22 +-
 pipeline/PharmCAT_Pipeline copy.wdl | 474 ----------------------------
 pipeline/PharmCAT_Pipeline.wdl      |  58 ++--
 3 files changed, 31 insertions(+), 523 deletions(-)
 delete mode 100644 pipeline/PharmCAT_Pipeline copy.wdl

diff --git a/data/input_vcf_list.txt b/data/input_vcf_list.txt
index a32c2c5..9eddcd8 100644
--- a/data/input_vcf_list.txt
+++ b/data/input_vcf_list.txt
@@ -1,11 +1,11 @@
-data/PharmCAT_tutorial_get-rm_wgs_30x_grch38_chr1.vcf.bgz
-data/PharmCAT_tutorial_get-rm_wgs_30x_grch38_chr2.vcf.bgz
-data/PharmCAT_tutorial_get-rm_wgs_30x_grch38_chr4.vcf.bgz
-data/PharmCAT_tutorial_get-rm_wgs_30x_grch38_chr6.vcf.bgz
-data/PharmCAT_tutorial_get-rm_wgs_30x_grch38_chr7.vcf.bgz
-data/PharmCAT_tutorial_get-rm_wgs_30x_grch38_chr10.vcf.bgz
-data/PharmCAT_tutorial_get-rm_wgs_30x_grch38_chr12.vcf.bgz
-data/PharmCAT_tutorial_get-rm_wgs_30x_grch38_chr13.vcf.bgz
-data/PharmCAT_tutorial_get-rm_wgs_30x_grch38_chr16.vcf.bgz
-data/PharmCAT_tutorial_get-rm_wgs_30x_grch38_chr19.vcf.bgz
-data/PharmCAT_tutorial_get-rm_wgs_30x_grch38_chr22.vcf.bgz
\ No newline at end of file
+PharmCAT_tutorial_get-rm_wgs_30x_grch38_chr1.vcf.bgz
+PharmCAT_tutorial_get-rm_wgs_30x_grch38_chr2.vcf.bgz
+PharmCAT_tutorial_get-rm_wgs_30x_grch38_chr4.vcf.bgz
+PharmCAT_tutorial_get-rm_wgs_30x_grch38_chr6.vcf.bgz
+PharmCAT_tutorial_get-rm_wgs_30x_grch38_chr7.vcf.bgz
+PharmCAT_tutorial_get-rm_wgs_30x_grch38_chr10.vcf.bgz
+PharmCAT_tutorial_get-rm_wgs_30x_grch38_chr12.vcf.bgz
+PharmCAT_tutorial_get-rm_wgs_30x_grch38_chr13.vcf.bgz
+PharmCAT_tutorial_get-rm_wgs_30x_grch38_chr16.vcf.bgz
+PharmCAT_tutorial_get-rm_wgs_30x_grch38_chr19.vcf.bgz
+PharmCAT_tutorial_get-rm_wgs_30x_grch38_chr22.vcf.bgz
\ No newline at end of file
diff --git a/pipeline/PharmCAT_Pipeline copy.wdl b/pipeline/PharmCAT_Pipeline copy.wdl
deleted file mode 100644
index 226720e..0000000
--- a/pipeline/PharmCAT_Pipeline copy.wdl
+++ /dev/null
@@ -1,474 +0,0 @@
-version 1.0
-
-workflow pharmcat_pipeline {
-    meta {
-        author: "Andre Rico"
-        email: "ricoandre@hotmail.com"
-        description: "This workflow runs a VCF file through the PharmCAT pipeline."
-    }
-
-    parameter_meta {
-        # Arg to input/output data
-        vcf_file: "A VCF file or a list of files name (can be gzipped or bgzipped)."
-        input_directory: "A directory containing VCF files to process."
-        results_directory: "The directory to save the results. Only applicable if you want to save the results in a cloud directory."
-
-        # Args to Sample
-        sample_ids: "A comma-separated list of sample IDs. Only applicable if you have multiple samples and only want to work on specific ones."
-        sample_file: "A file containing a list of sample IDs, one sample ID per line. Only applicable if you have multiple samples and only want to work on specific ones."
-
-        # Args to Preprocessor
-        missing_to_ref: "Assume genotypes at missing PGx sites are 0/0. DANGEROUS!"
-        no_gvcf_check: "Bypass check if VCF file is in gVCF format."
-        # not including retain_specific_regions and reference_regions
-
-        # Args to Named Allele Matcher
-        run_matcher: "Run named allele matcher independently."
-        matcher_all_results: "Return all possible diplotypes, not just top hits."
-        matcher_save_html: "Save named allele matcher results as HTML.'"
-        research_mode: "Comma-separated list of research features to enable: [cyp2d6, combinations]"
-
-        # Args to Phonopyter
-        run_phenotyper: "Run phenotyper independently."
-
-        # Args to Reporter
-        run_reporter: "Run reporter independently."
-        reporter_sources: "Comma-separated list of sources to limit recommendations to: [CPIC, DPWG, FDA]"
-        reporter_extended: "Write an extended report (includes all possible genes and drugs, even if no data is available)"
-        reporter_save_json: "Save reporter results as JSON."
-
-        # Args to settings
-        max_concurrent_processes: "The maximum number of processes to use when concurrent mode is enabled."
-        max_memory: "The maximum memory PharmCAT should use (e.g. '64G')."
-        pharmcat_version: "PharmCAT version to use in the pipeline."
-        delete_intermediate_files: "Delete intermediate PharmCAT files. Defaults to saving all files."
-        # base_filename: "Prefix for output files. Defaults to the same base name as the input."
-    }
-
-    input {
-        # File? input_file             # Simple VCF or TSV file
-        String? input_directory        # Read all VCF from a diretory
-        # String? results_directory    # Write the Results in Cloud Diretory
-
-        String pharmcat_version = "2.13.0"
-        Int max_concurrent_processes = 1
-        String max_memory = "4G"
-
-    }
-
-    call cloud_reader_task {
-        input:
-            input_directory = input_directory,
-            max_concurrent_processes = max_concurrent_processes,
-            max_memory = max_memory
-    }
-
-    call pipeline_task {
-        input:
-            cloud_reader_results = cloud_reader_task.cloud_reader_results,
-            docker_version = pharmcat_version,
-            max_concurrent_processes = max_concurrent_processes,
-            max_memory = max_memory,
-    }
-
-    call cloud_writer_task {
-        input:
-            pipeline_results = pipeline_task.pipeline_results,
-    }
-
-
-    output {
-        # File results = pipeline_task.results
-        File cloud_reader_results = cloud_reader_task.cloud_reader_results
-        File pipeline_results = pipeline_task.pipeline_results
-        File log = cloud_writer_task.log
-    }
-}
-
-# ---------------------------------------------------------------------
-# TASK 1: Cloud Reader Task
-# ---------------------------------------------------------------------
-task cloud_reader_task {
-    input {
-        String? input_directory
-        Int max_concurrent_processes
-        String max_memory
-    }
-
-    command <<<
-        set -e -x -o pipefail
-
-        # Create folders
-        mkdir -p wf/data
-        mkdir -p wf/results
-
-        # Create log file
-        log_file="wf/log.txt"
-        touch $log_file
-        echo "-----------------------" >> $log_file
-        echo "Start Cloud Reader Task" >> $log_file
-        echo "-----------------------" >> $log_file
-
-        # Check gsutil
-        gsutil --version >> $log_file
-
-        # TODO - Check if the diretory has / at the end, delete if has.
-
-        # Process the Directory Input [ files_directory ]
-        if [[ ~{true='true' false='false' defined(input_directory)} == "true" ]]; then
-            echo "Start to Read from Files Directory: ~{input_directory}" >> $log_file
-            # Check if input_directory is a Google Storage
-            if [[ "~{input_directory}" == gs://* ]]; then
-                echo "Copying all files from directory: ~{input_directory}" >> $log_file
-                # List all the files in the directory
-                gsutil ls "~{input_directory}/*" >> $log_file
-                # Copy all the files from the directory to the local folder
-                gsutil cp "~{input_directory}/*" wf/data/ >> $log_file
-                echo "All files from ~{input_directory} have been copied to wf/data/" >> $log_file
-            else
-                echo "ERROR: The directory path is not a valid gs:// URL. Skipping file copy." >> $log_file
-            fi
-        else
-            echo "The files_directory input type wasn't defined" >> $log_file
-        fi
-
-        # Prepare the folder structure to process in next task
-        if [[ $(ls wf/data | wc -l) -gt 0 ]]; then
-            file_count=$(ls wf/data/* | wc -l)
-            echo "Number of files copied: $file_count" >> $log_file
-            echo "End of Cloud Reader Task" >> $log_file
-            tar -czvf cloud_reader_results.tar.gz wf
-        else
-            echo "No files to compress" >> $log_file
-            tar -czvf cloud_reader_results.tar.gz wf
-        fi
-    >>>
-
-    output {
-        File cloud_reader_results = "cloud_reader_results.tar.gz"
-    }
-
-    runtime {
-        docker: "ricoandre/cloud-tools:latest"
-        memory: max_memory
-        cpu: max_concurrent_processes
-    }
-}
-
-# ---------------------------------------------------------------------
-# TASK 2: Pipeline Task
-# ---------------------------------------------------------------------
-task pipeline_task {
-    input {
-        # Environment Settings
-        String docker_version
-        Int max_concurrent_processes
-        String max_memory
-        Boolean delete_intermediate_files = false
-
-        # Diretory from cloud_reader_task
-        File cloud_reader_results
-
-        # Read single files
-        File? vcf_file
-        String? base_filename
-
-        # Sample informations
-        File? sample_file     # Optional file containing a list of sample IDs
-        String? sample_ids    # Optional comma-separated list of sample IDs
-
-        # Args to Preprocessor
-        Boolean missing_to_ref = false
-        Boolean no_gvcf_check = false
-        Boolean retain_specific_regions = false    # Flag to retain specific genomic regions
-        File? reference_regions_to_retain          # BED file specifying PGx regions to retain
-
-        # Args to Named Allele Matcher
-        Boolean run_matcher = false                # Flag to run only Named Allele Matcher
-        Boolean matcher_all_results = false
-        Boolean matcher_save_html = false
-        String research_mode = ""
-
-        # Args to Phonopyter
-        Boolean run_phenotype = false              # Flag to run only Phenotype
-
-        # Args to Reporter
-        Boolean run_reporter = false               # Flag to run only Reporter
-        String reporter_sources = ""
-        Boolean reporter_extended = false
-        Boolean reporter_save_json = false
-    }
-
-    command <<<
-        set -e -x -o pipefail
-
-        # Extract the compressed file from a_cloud_reader_task
-        tar -xzvf ~{cloud_reader_results}
-
-        # Start log file
-        log_file="wf/log.txt"
-        echo " " >> $log_file
-        echo "---------------------------" >> $log_file
-        echo "Start VCF Preprocessor Task" >> $log_file
-        echo "---------------------------" >> $log_file
-
-        # Create list file to keep VCFs to process
-        list="wf/list.txt"
-        touch $list
-
-        # Common arguments
-        arg=" -o wf/results"
-
-        # Sample inputs
-        if [ ! -z "$sample_file" ]; then
-            arg+=" -S $sample_file"
-        fi
-
-        if [ ! -z "$sample_ids" ]; then
-            arg+=" -s $sample_ids"
-        fi
-
-        # Preprocessor arguments
-        if [ "$missing_to_ref" == "true" ]; then
-            arg+=" -0"    # --missing-to-ref
-        fi
-
-        if [ "$no_gvcf_check" == "true" ]; then
-            arg+=" -G"    # --no-gvcf-check
-        fi
-
-        if [ "$retain_specific_regions" == "true" ]; then
-            arg+=" -R"    # Retain specific regions
-        fi
-
-        if [ ! -z "$reference_regions_to_retain" ]; then
-            arg+=" -refRegion $reference_regions_to_retain"    # Specify the BED file for regions to retain
-        fi
-
-        # Named Allele Matcher arguments
-        if [ "$run_matcher" == "true" ]; then
-            arg+=" -matcher"    # Run named allele matcher
-        fi
-
-        if [ "$matcher_all_results" == "true" ]; then
-            arg+=" -ma"    # Return all possible diplotypes
-        fi
-
-        if [ "$matcher_save_html" == "true" ]; then
-            arg+=" -matcherHtml"    # Save matcher results as HTML
-        fi
-
-        if [ ! -z "$research_mode" ]; then
-            arg+=" -research $research_mode"    # Enable research mode features
-        fi
-
-        # Phenotyper arguments
-        if [ "$run_phenotype" == "true" ]; then
-            arg+=" -phenotyper"    # Run phenotyper independently
-        fi
-
-        # Reporter arguments
-        if [ "$run_reporter" == "true" ]; then
-            arg+=" -reporter"    # Run reporter independently
-        fi
-
-        if [ ! -z "$reporter_sources" ]; then
-            arg+=" -rs $reporter_sources"    # Specify sources for the reporter
-        fi
-
-        if [ "$reporter_extended" == "true" ]; then
-            arg+=" -re"    # Write an extended report
-        fi
-
-        if [ "$reporter_save_json" == "true" ]; then
-            arg+=" -reporterJson"    # Save reporter results as JSON
-        fi
-
-        # Output and concurrency arguments
-        if [ ! -z "$base_filename" ]; then
-            arg+=" -bf $base_filename"    # Set base filename for output
-        fi
-
-        if [ "$delete_intermediate_files" == "true" ]; then
-            arg+=" -del"    # Delete intermediate PharmCAT files
-        fi
-
-        if [ ! -z "$max_concurrent_processes" ]; then
-            arg+=" -cp $max_concurrent_processes"    # Set max concurrent processes
-        fi
-
-        if [ ! -z "$max_memory" ]; then
-            arg+=" -cm $max_memory"    # Set max memory
-        fi
-
-        echo "Set Common Arguments: $arg" >> $log_file
-
-        # Mandatory argument: -vcf.
-        # -------------------------
-        # Path to a single VCF file or a file containing the list of VCF file paths (one per line),
-        # sorted by chromosome position. All VCF files must have the same set of samples. Use this
-        # when data for a sample has been split among multiple files (e.g. VCF files from large
-        # cohorts, such as UK Biobank). Input VCF files must at least comply with
-        # Variant Call Format (VCF) Version >= 4.2.
-
-        # Resolver a variável `vcf_file` fora do bloco condicional
-        # Nao podemos usar a variavel `vcf_file` dentro do bloco condicional
-        vcf_file="~{vcf_file}"
-
-        # Obter a extensão do arquivo para verificar se é um arquivo de lista ou um VCF simples
-        file_extension="${vcf_file##*.}"
-
-        # option 1: User add on VCF or TSV file in the vcf_file input
-        if [[ -n "$vcf_file" && -f "$vcf_file" ]]; then
-            # cp ~{vcf_file} wf/data
-            # echo "Processing list of VCF files as a single block from: ~{vcf_file}" >> $log_file
-            # cmd="pharmcat_pipeline wf/data/$(basename ~{vcf_file}) $arg"
-            # echo "Running command: $cmd" >> $log_file
-            # eval $cmd
-
-            echo "Processing list of VCF files as a single block from: ~{vcf_file}" >> $log_file
-
-            if [[ "$file_extension" == "txt" || "$file_extension" == "tsv" ]]; then
-                echo "Treatment pathway from : $vcf_file" >> $log_file
-
-                # Copiar o arquivo de lista para a pasta interna 'wf/data'
-                cp "$vcf_file" wf/data
-                list_file="wf/data/$(basename "$vcf_file")"
-
-                # Criar um novo arquivo de lista com o caminho completo 'wf/data/'
-                adjusted_list="wf/data/adjusted_list.txt"
-                touch $adjusted_list
-
-                # Verificar cada linha no arquivo original e adicionar 'wf/data/' caso necessário
-                while read -r line; do
-                    if [[ "$line" == wf/data/* ]]; then
-                        # Se a linha já contém 'wf/data/', adicionar diretamente
-                        echo "$line" >> $adjusted_list
-                    else
-                        # Caso contrário, adicionar o prefixo 'wf/data/'
-                        echo "wf/data/$line" >> $adjusted_list
-                    fi
-                done < "$list_file"
-
-                echo "Adjusted VCF list created at: $adjusted_list" >> $log_file
-
-                # Rodar o PharmCAT com a lista ajustada
-                cmd="pharmcat_pipeline $adjusted_list $arg"
-                echo "Running command: $cmd" >> $log_file
-                eval $cmd
-
-            else
-                # Caso seja um arquivo VCF simples, processá-lo diretamente
-                echo "Processing single VCF file: $vcf_file" >> $log_file
-                cp "$vcf_file" wf/data
-                cmd="pharmcat_pipeline wf/data/$(basename "$vcf_file") $arg"
-                echo "Running command: $cmd" >> $log_file
-                eval $cmd
-            fi
-
-
-        # Option 2: None VCF or TSV input. Check directory content to process
-        elif [[ -z "$vcf_file" ]]; then
-            if [[ $(ls wf/data/*.vcf.* 2>/dev/null | wc -l) -gt 0 ]]; then
-                echo "Processing all individual VCF files in the directory: wf/data/" >> $log_file
-
-                VCFs_list="wf/VCFs_list.txt"
-                ls wf/data/*.vcf.* > $VCFs_list
-
-                while read -r vcf_file; do
-                    echo "Processing individual VCF file: $vcf_file" >> $log_file
-                    cmd="pharmcat_pipeline $vcf_file $arg"
-                    echo "Running command: $cmd" >> $log_file
-                    eval $cmd
-                done < $VCFs_list
-            else
-                echo "No VCF files found in wf/data/. Exiting." >> $log_file
-                exit 1
-            fi
-
-        else
-            echo "No VCF or list of VCFs provided or found in directory. Exiting." >> $log_file
-            exit 1
-        fi
-
-
-        # Run the command
-        echo "Pharmcat_pipeline finished" >> $log_file
-
-        # Package the entire 'wf' directory and create a tar.gz file
-        tar -czvf pipeline_results.tar.gz wf
-    >>>
-
-    output {
-        File pipeline_results = "pipeline_results.tar.gz"
-    }
-
-    runtime {
-        docker: "pgkb/pharmcat:${docker_version}"
-        memory: max_memory
-        cpu: max_concurrent_processes
-    }
-}
-
-
-task cloud_writer_task {
-    input {
-        File? pipeline_results
-        String? results_directory
-    }
-
-    command <<<
-        set -e -x -o pipefail
-
-        # Extrair o arquivo compactado
-        tar -xzvf ~{pipeline_results}
-
-        # Iniciar arquivo de log
-        log_file="wf/log.txt"
-        echo " " >> $log_file
-        echo "-----------------------" >> $log_file
-        echo "Start Cloud Writer Task" >> $log_file
-        echo "-----------------------" >> $log_file
-
-        # Definir a variável results_directory como string
-        results_directory="~{results_directory}"
-
-        # Verificar se results_directory foi definido e não está vazio
-        if [[ -n "$results_directory" ]]; then
-            # Verificar se gsutil está disponível neste ambiente
-            if ! command -v gsutil &> /dev/null; then
-                echo "ERROR: gsutil not found. Please ensure gsutil is available." >> $log_file
-                exit 1
-            fi
-
-            # Salvar resultados no diretório definido pelo usuário
-            echo "Copying results to $results_directory" >> $log_file
-
-            # TODO - Adicionar suporte para outros diretórios em nuvem
-            if [[ "$results_directory" == gs://* ]]; then
-                # Copiar arquivos de resultados individuais
-                gsutil cp wf/results/* "$results_directory/" >> $log_file
-                # Copiar também o arquivo tar.gz com os resultados do pipeline
-                # gsutil cp ~{pipeline_results} "$results_directory/" >> $log_file
-            else
-                echo "ERROR: Unsupported storage destination. Only gs:// is supported in this task." >> $log_file
-                exit 1
-            fi
-
-            echo "Cloud Writer Task completed successfully." >> $log_file
-        else
-            echo "No results directory defined. Skipping cloud write." >> $log_file
-        fi
-
-    >>>
-
-    output {
-        File log = "wf/log.txt"
-    }
-
-    runtime {
-        docker: "ricoandre/cloud-tools:latest"
-        memory: "4G"
-        cpu: 1
-    }
-}
\ No newline at end of file
diff --git a/pipeline/PharmCAT_Pipeline.wdl b/pipeline/PharmCAT_Pipeline.wdl
index 2b00b87..b04a5de 100644
--- a/pipeline/PharmCAT_Pipeline.wdl
+++ b/pipeline/PharmCAT_Pipeline.wdl
@@ -9,40 +9,34 @@ workflow pharmcat_pipeline {
 
     parameter_meta {
         # Arg to input/output data
-        a__vcf_file: "A VCF file or a list of files name (can be gzipped or bgzipped)."
-        b__input_directory: "A directory containing VCF files to process."
-        c__results_directory: "The directory to save the results. Only applicable if you want to save the results in a cloud directory."
-
+        vcf_file: "A VCF file or a list of file names (can be gzipped or bgzipped)."
+        input_directory: "A directory containing VCF files to process."
+        results_directory: "The directory to save the results. Only applicable if you want to save the results in a cloud directory."
+        base_filename: "Prefix for output files. Defaults to the same base name as the input."
         # Args to Sample
         sample_ids: "A comma-separated list of sample IDs. Only applicable if you have multiple samples and only want to work on specific ones."
         sample_file: "A file containing a list of sample IDs, one sample ID per line. Only applicable if you have multiple samples and only want to work on specific ones."
-
         # Args to Preprocessor
         missing_to_ref: "Assume genotypes at missing PGx sites are 0/0. DANGEROUS!"
         no_gvcf_check: "Bypass check if VCF file is in gVCF format."
         # not including retain_specific_regions and reference_regions
-
         # Args to Named Allele Matcher
         run_matcher: "Run named allele matcher independently."
         matcher_all_results: "Return all possible diplotypes, not just top hits."
         matcher_save_html: "Save named allele matcher results as HTML."
         research_mode: "Comma-separated list of research features to enable: [cyp2d6, combinations]"
-
         # Args to Phenotyper
         run_phenotyper: "Run phenotyper independently."
-
         # Args to Reporter
         run_reporter: "Run reporter independently."
         reporter_sources: "Comma-separated list of sources to limit recommendations to: [CPIC, DPWG, FDA]"
         reporter_extended: "Write an extended report (includes all possible genes and drugs, even if no data is available)"
         reporter_save_json: "Save reporter results as JSON."
-
         # Args to settings
         max_concurrent_processes: "The maximum number of processes to use when concurrent mode is enabled."
         max_memory: "The maximum memory PharmCAT should use (e.g. '64G')."
         pharmcat_version: "PharmCAT version to use in the pipeline."
         delete_intermediate_files: "Delete intermediate PharmCAT files. Defaults to saving all files."
-        # base_filename: "Prefix for output files. Defaults to the same base name as the input."
     }
 
     input {
@@ -82,7 +76,6 @@ workflow pharmcat_pipeline {
         input:
            cloud_reader_results = cloud_reader_task.cloud_reader_results,
            vcf_file = a__input_file,
-            base_filename = d__base_filename,
            sample_file = e__sample_file,
            sample_ids = f__sample_ids,
 
@@ -111,7 +104,6 @@ workflow pharmcat_pipeline {
            results_directory = c__results_directory,
     }
 
-
     output {
        # File results = pipeline_task.results
        File cloud_reader_results = cloud_reader_task.cloud_reader_results
@@ -345,11 +337,10 @@ task pipeline_task {
        # cohorts, such as UK Biobank). Input VCF files must at least comply with
        # Variant Call Format (VCF) Version >= 4.2.
 
-        # Resolver a variável `vcf_file` fora do bloco condicional
-        # Nao podemos usar a variavel `vcf_file` dentro do bloco condicional
+        # We cannot use the input `vcf_file` directly in the conditional block
        vcf_file="~{vcf_file}"
 
-        # Obter a extensão do arquivo para verificar se é um arquivo de lista ou um VCF simples
+        # Get the file extension to check if it is a list file or a single VCF
        file_extension="${vcf_file##*.}"
 
        # Option 1: user provided a VCF or TSV list file via the vcf_file input
@@ -359,40 +350,39 @@ task pipeline_task {
            # cmd="pharmcat_pipeline wf/data/$(basename ~{vcf_file}) $arg"
            # echo "Running command: $cmd" >> $log_file
            # eval $cmd
-
            echo "Processing list of VCF files as a single block from: ~{vcf_file}" >> $log_file
 
            if [[ "$file_extension" == "txt" || "$file_extension" == "tsv" ]]; then
                echo "Processing VCF list file: $vcf_file" >> $log_file
 
-                # Copiar o arquivo de lista para a pasta interna 'wf/data'
+                # Copy the list file to the internal folder 'wf/data'
                cp "$vcf_file" wf/data
                list_file="wf/data/$(basename "$vcf_file")"
 
-                # Criar um novo arquivo de lista com o caminho completo 'wf/data/'
+                # Create a new list file with the full path 'wf/data/'
                adjusted_list="wf/data/adjusted_list.txt"
                touch $adjusted_list
 
-                # Verificar cada linha no arquivo original e adicionar 'wf/data/' caso necessário
+                # Check each line in the original file and add 'wf/data/' if necessary
                while read -r line; do
                    if [[ "$line" == wf/data/* ]]; then
-                        # Se a linha já contém 'wf/data/', adicionar diretamente
+                        # If the line already contains 'wf/data/', add it directly
                        echo "$line" >> $adjusted_list
                    else
-                        # Caso contrário, adicionar o prefixo 'wf/data/'
+                        # If the line does not contain 'wf/data/', add the prefix 'wf/data/'
                        echo "wf/data/$line" >> $adjusted_list
                    fi
                done < "$list_file"
 
                echo "Adjusted VCF list created at: $adjusted_list" >> $log_file
 
-                # Rodar o PharmCAT com a lista ajustada
+                # Run PharmCAT with the adjusted list
                cmd="pharmcat_pipeline $adjusted_list $arg"
                echo "Running command: $cmd" >> $log_file
                eval $cmd
 
            else
-                # Caso seja um arquivo VCF simples, processá-lo diretamente
+                # If it is a single VCF file, process it directly
                echo "Processing single VCF file: $vcf_file" >> $log_file
                cp "$vcf_file" wf/data
                cmd="pharmcat_pipeline wf/data/$(basename "$vcf_file") $arg"
@@ -400,7 +390,6 @@ task pipeline_task {
                eval $cmd
            fi
 
-
        # Option 2: None VCF or TSV input. Check directory content to process
        elif [[ -z "$vcf_file" ]]; then
            if [[ $(ls wf/data/*.vcf.* 2>/dev/null | wc -l) -gt 0 ]]; then
@@ -425,10 +414,7 @@ task pipeline_task {
                exit 1
            fi
 
-
-        # Run the command
        echo "Pharmcat_pipeline finished" >> $log_file
-
        # Package the entire 'wf' directory and create a tar.gz file
        tar -czvf pipeline_results.tar.gz wf
    >>>
@@ -456,7 +442,7 @@ task cloud_writer_task {
    command <<<
        set -e -x -o pipefail
 
-        # Extrair o arquivo compactado
+        # Extract the compressed results from the pipeline task
        tar -xzvf ~{pipeline_results}
 
        # Start log file
@@ -466,36 +452,32 @@ task cloud_writer_task {
        echo "Start Cloud Writer Task" >> $log_file
        echo "-----------------------" >> $log_file
 
-        # Definir a variável results_directory como string
+        # Define the results_directory variable as a string
        results_directory="~{results_directory}"
 
-        # Verificar se results_directory foi definido e não está vazio
+        # Check if results_directory is defined and not empty
        if [[ -n "$results_directory" ]]; then
-            # Verificar se gsutil está disponível neste ambiente
+            # Check if gsutil is available
            if ! command -v gsutil &> /dev/null; then
                echo "ERROR: gsutil not found. Please ensure gsutil is available." >> $log_file
                exit 1
            fi
 
-            # Salvar resultados no diretório definido pelo usuário
+            # Save results to the user-defined directory
            echo "Copying results to $results_directory" >> $log_file
 
-            # TODO - Adicionar suporte para outros diretórios em nuvem
+            # TODO - Add support for other cloud directories
            if [[ "$results_directory" == gs://* ]]; then
-                # Copiar arquivos de resultados individuais
+                # Copy individual result files from wf/results/ to the cloud directory
                gsutil cp wf/results/* "$results_directory/" >> $log_file
-                # Copiar também o arquivo tar.gz com os resultados do pipeline
-                # gsutil cp ~{pipeline_results} "$results_directory/" >> $log_file
            else
                echo "ERROR: Unsupported storage destination. Only gs:// is supported in this task." >> $log_file
                exit 1
            fi
-
            echo "Cloud Writer Task completed successfully." >> $log_file
        else
            echo "No results directory defined. Skipping cloud write." >> $log_file
        fi
-
    >>>
 
    output {
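
Usage note (not part of the patch): the entries in data/input_vcf_list.txt become bare filenames here because cloud_reader_task copies every input into wf/data/ and pipeline_task prefixes each line of the list with wf/data/ before invoking pharmcat_pipeline, so repo-relative paths are no longer needed. A minimal sketch of driving the workflow with Cromwell follows; the input names (pharmcat_pipeline.a__input_file, pharmcat_pipeline.c__results_directory) are taken from the call blocks in this patch, while the Cromwell jar name and the gs:// bucket are placeholders, not values from this repo.

  # Sketch: run the workflow locally with Cromwell. Assumes gcloud
  # credentials are configured so the tasks' gsutil calls can authenticate.
  cat > inputs.json <<'EOF'
  {
    "pharmcat_pipeline.a__input_file": "data/input_vcf_list.txt",
    "pharmcat_pipeline.c__results_directory": "gs://YOUR_BUCKET/pharmcat_results"
  }
  EOF
  java -jar cromwell.jar run pipeline/PharmCAT_Pipeline.wdl --inputs inputs.json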