broadinstitute · bshifaw · Apr 27, 2023 · May 3, 2023 · May 4, 2023 · May 4, 2023
diff --git a/.github/workflows/carrot_push.yml b/.github/workflows/carrot_push.yml
@@ -0,0 +1,36 @@
+# Runs workflow tests from the branch when commits have been pushed to a PR.
+# The workflows to be tested are specified by the "test_names"
+# parameter as a space-separated string.
+
+name: carrot-test-on-push
+on: [push, workflow_dispatch]
+jobs:
+    # Publishes a CARROT test request for the pushed commit to a Pub/Sub topic.
+    publish-test:
+        runs-on: ubuntu-latest
+        steps:
+
+        # https://github.com/google-github-actions/setup-gcloud#service-account-key-json
+        - id: auth
+          uses: google-github-actions/auth@v0
+          with:
+            credentials_json: ${{ secrets.CARROT_SA_KEY }}
+
+        - name: Set up Cloud SDK
+          uses: google-github-actions/setup-gcloud@v0
+
+        # https://cloud.google.com/pubsub/docs/publisher#publish_messages
+        # Context values (branch name, actor, ...) are handed to the script as
+        # environment variables instead of being expanded inline, so a crafted
+        # branch name cannot break out of the quoting and run shell code.
+        # jq builds the message so every value is JSON-escaped.
+        - name: Use gcloud CLI
+          env:
+            CARROT_TOPIC_NAME: ${{ secrets.CARROT_TOPIC_NAME }}
+            AUTHOR: ${{ github.triggering_actor }}
+            OWNER: ${{ github.repository_owner }}
+            REPO_URL: ${{ github.repositoryUrl }}
+            BRANCH_NAME: ${{ github.ref_name }}
+            COMMIT: ${{ github.sha }}
+            REPO: ${{ github.repository }}
+          run: |
+            message="$(jq -cn \
+              --arg author "$AUTHOR" \
+              --arg owner "$OWNER" \
+              --arg repo_url "$REPO_URL" \
+              --arg branch_name "$BRANCH_NAME" \
+              --arg commit "$COMMIT" \
+              --arg repo "$REPO" \
+              '{
+                "source": "github",
+                "author": $author,
+                "owner": $owner,
+                "wdl_tests_dir": "wdl_test",
+                "repo_url": $repo_url,
+                "branch_name": $branch_name,
+                "commit": $commit,
+                "repo": $repo,
+                "test_names": "PBCCSWholeGenome",
+                "issue_number": "",
+                "software_name": ""
+              }')"
+            gcloud pubsub topics publish "$CARROT_TOPIC_NAME" --message="$message"
diff --git a/.github/workflows/carrot_weekly.yml b/.github/workflows/carrot_weekly.yml
@@ -0,0 +1,36 @@
+# Runs all workflow tests every Sunday
+
+name: carrot-test-weekly
+on:
+  schedule:
+    - cron: '0 7 * * 0'  # Run every Sunday at 07:00 UTC (GitHub cron schedules use UTC)
+jobs:
+    # Publishes a CARROT test request to a Pub/Sub topic; the empty
+    # "test_names" value is what distinguishes it from the push workflow.
+    publish-test:
+        runs-on: ubuntu-latest
+        steps:
+
+        # https://github.com/google-github-actions/setup-gcloud#service-account-key-json
+        - id: auth
+          uses: google-github-actions/auth@v0
+          with:
+            credentials_json: ${{ secrets.CARROT_SA_KEY }}
+
+        - name: Set up Cloud SDK
+          uses: google-github-actions/setup-gcloud@v0
+
+        # https://cloud.google.com/pubsub/docs/publisher#publish_messages
+        # Context values are handed to the script as environment variables
+        # instead of being expanded inline, so no value can break out of the
+        # shell quoting. jq builds the message so every value is JSON-escaped.
+        - name: Use gcloud CLI
+          env:
+            CARROT_TOPIC_NAME: ${{ secrets.CARROT_TOPIC_NAME }}
+            AUTHOR: ${{ github.triggering_actor }}
+            OWNER: ${{ github.repository_owner }}
+            REPO_URL: ${{ github.repositoryUrl }}
+            BRANCH_NAME: ${{ github.ref_name }}
+            COMMIT: ${{ github.sha }}
+            REPO: ${{ github.repository }}
+          run: |
+            message="$(jq -cn \
+              --arg author "$AUTHOR" \
+              --arg owner "$OWNER" \
+              --arg repo_url "$REPO_URL" \
+              --arg branch_name "$BRANCH_NAME" \
+              --arg commit "$COMMIT" \
+              --arg repo "$REPO" \
+              '{
+                "source": "github",
+                "author": $author,
+                "owner": $owner,
+                "wdl_tests_dir": "wdl_test",
+                "repo_url": $repo_url,
+                "branch_name": $branch_name,
+                "commit": $commit,
+                "repo": $repo,
+                "test_names": "",
+                "issue_number": "",
+                "software_name": ""
+              }')"
+            gcloud pubsub topics publish "$CARROT_TOPIC_NAME" --message="$message"
diff --git a/wdl_test/PBCCSWholeGenome/basic_output_valdation/NA24385_downsampled/eval_input.json b/wdl_test/PBCCSWholeGenome/basic_output_valdation/NA24385_downsampled/eval_input.json
@@ -0,0 +1 @@
+{}
diff --git a/wdl_test/PBCCSWholeGenome/basic_output_valdation/NA24385_downsampled/test_input.json b/wdl_test/PBCCSWholeGenome/basic_output_valdation/NA24385_downsampled/test_input.json
@@ -0,0 +1,21 @@
+{
+  "PBCCSWholeGenome.aligned_bais": [
+    "gs://broad-dsp-lrma-pipeline-test-data/NA24385/downsampled_bam_01/m64020e_220303_2002560.01.downsample.bai",
+    "gs://broad-dsp-lrma-pipeline-test-data/NA24385/downsampled_bam_01/m64218e_220328_1613170.01.downsample.bai",
+    "gs://broad-dsp-lrma-pipeline-test-data/NA24385/downsampled_bam_01/m64218e_220330_0132120.01.downsample.bai",
+    "gs://broad-dsp-lrma-pipeline-test-data/NA24385/downsampled_bam_01/m64297e_220215_1930240.01.downsample.bai",
+    "gs://broad-dsp-lrma-pipeline-test-data/NA24385/downsampled_bam_01/m64297e_220218_1550340.01.downsample.bai",
+    "gs://broad-dsp-lrma-pipeline-test-data/NA24385/downsampled_bam_01/m64297e_220220_0052040.01.downsample.bai",
+    "gs://broad-dsp-lrma-pipeline-test-data/NA24385/downsampled_bam_01/m64297e_220303_1959350.01.downsample.bai"
+  ],
+
+  "PBCCSWholeGenome.aligned_bams": [
+    "gs://broad-dsp-lrma-pipeline-test-data/NA24385/downsampled_bam_01/m64020e_220303_2002560.01.downsample.bam",
+    "gs://broad-dsp-lrma-pipeline-test-data/NA24385/downsampled_bam_01/m64218e_220328_1613170.01.downsample.bam",
+    "gs://broad-dsp-lrma-pipeline-test-data/NA24385/downsampled_bam_01/m64218e_220330_0132120.01.downsample.bam",
+    "gs://broad-dsp-lrma-pipeline-test-data/NA24385/downsampled_bam_01/m64297e_220215_1930240.01.downsample.bam",
+    "gs://broad-dsp-lrma-pipeline-test-data/NA24385/downsampled_bam_01/m64297e_220218_1550340.01.downsample.bam",
+    "gs://broad-dsp-lrma-pipeline-test-data/NA24385/downsampled_bam_01/m64297e_220220_0052040.01.downsample.bam",
+    "gs://broad-dsp-lrma-pipeline-test-data/NA24385/downsampled_bam_01/m64297e_220303_1959350.01.downsample.bam"
+  ]
+}
diff --git a/wdl_test/PBCCSWholeGenome/basic_output_valdation/eval.wdl b/wdl_test/PBCCSWholeGenome/basic_output_valdation/eval.wdl
@@ -0,0 +1,234 @@
+version 1.0
+
+# Evaluation workflow for the PBCCSWholeGenome test run.
+# It performs two sanity checks on the outputs handed to it:
+#   1. every Float metric must be non-zero (a CheckerWorkflowError call fails otherwise);
+#   2. every output file path is passed to CheckFileUpdatedDateGCP for an MD5 check.
+workflow eval_workflow {
+    input {
+
+        #Floats
+
+        Float aligned_read_length_N50
+        Float aligned_num_reads
+        Float aligned_frac_bases
+        Float aligned_num_bases
+        Float aligned_read_length_stdev
+        Float average_identity
+        Float aligned_est_fold_cov
+        Float aligned_read_length_mean
+        Float median_identity
+        Float aligned_read_length_median
+
+        #Files (taken as String so the paths are passed through as-is)
+
+        String pbsv_tbi
+        String sniffles_vcf
+        String clair_gtbi
+        String dvp_tbi
+        String dvp_g_tbi
+        String dvp_vcf
+        String clair_vcf
+        String pbsv_vcf
+        String aligned_pbi
+        String aligned_bai
+        String dvp_phased_vcf
+        #String bed_cov_summary # this ends up being 'null' so not including in array of files for now
+        String dvp_phased_tbi
+        String clair_tbi
+        String clair_gvcf
+        String aligned_bam
+        String sniffles_tbi
+        String dvp_g_vcf
+
+    }
+
+    # All Float metrics, gathered so they can be checked in one scatter
+    Array[Float] workflow_out_floats = [
+        aligned_read_length_N50,
+        aligned_num_reads,
+        aligned_frac_bases,
+        aligned_num_bases,
+        aligned_read_length_stdev,
+        average_identity,
+        aligned_est_fold_cov,
+        aligned_read_length_mean,
+        median_identity,
+        aligned_read_length_median
+    ]
+    # All output file paths, checked together by a single task call
+    Array[String] workflow_out_files = [
+        pbsv_tbi,
+        sniffles_vcf,
+        clair_gtbi,
+        dvp_tbi,
+        dvp_g_tbi,
+        dvp_vcf,
+        clair_vcf,
+        pbsv_vcf,
+        aligned_pbi,
+        aligned_bai,
+        dvp_phased_vcf,
+        dvp_phased_tbi,
+        clair_tbi,
+        clair_gvcf,
+        aligned_bam,
+        sniffles_tbi,
+        dvp_g_vcf
+    ]
+
+    # Docker image tags handed to the checker tasks
+    String ubuntu_image_tag = "latest"
+    String gcloud_slim_image_tag = "slim"
+
+################
+    ## Comparing test and expected Floats (not enabled yet)
+    ## Json inputs like "eval_workflow.workflow_out_floats": [{"Left":1.0,"Right":1.0},{"Left":2.2,"Right":3.2}]
+    ## With Array[Pair[Float,Float]] workflow_out_floats
+
+#    Array[Boolean] scattered_float_match = []
+#    scatter (pair in workflow_out_floats){
+#        if (pair.left != pair.right) {
+#            Boolean scattered_float_match = false
+#            call CheckerWorkflowError{
+#                input:
+#                    message = "Expected Float "+pair.right+" but got "+pair.left,
+#                    image_to_use = ubuntu_image
+#            }
+#        }
+#    }
+################
+
+    ## Confirm that none of the Float outputs is zero.
+    ## The former `scattered_float_match` declarations were unused and were
+    ## declared twice under the same name (once at workflow scope, once inside
+    ## the scatter), so they have been dropped.
+    scatter (in_float in workflow_out_floats){
+        if (in_float == 0.0) {
+            # This task always exits non-zero, which fails the evaluation
+            call CheckerWorkflowError{
+                input:
+                    message = "Error: Expected a non-zero float but got "+in_float+" .",
+                    image_tag = ubuntu_image_tag
+            }
+        }
+    }
+
+
+    call CheckFileUpdatedDateGCP {
+        input:
+            file_paths = workflow_out_files,
+            image_tag = gcloud_slim_image_tag
+    }
+}
+
+# Verifies that every given GCS object can be hashed and is not an empty file.
+# Inputs:
+#   file_paths - gs:// paths to check
+#   days_back  - currently unused by the command; kept so existing callers stay valid
+#   image_tag  - tag of the cloud-sdk image to run in
+# Output:
+#   file_date_result - TSV (FilePath, FileMD5, Valid) captured from stdout
+# The task fails (exit 1) if any file cannot be hashed or has the MD5 of an empty file.
+task CheckFileUpdatedDateGCP {
+
+    meta {
+        description: "Checks that each GCS file can be hashed and that its MD5 differs from the MD5 of an empty file"
+        note: "Specific to GCP"
+    }
+
+
+    input {
+        Array[String] file_paths
+        Int days_back = 1
+        String image_tag
+    }
+
+    String image_to_use = "gcr.io/google.com/cloudsdktool/cloud-sdk:" + image_tag
+
+    command <<<
+        # '-o pipefail' (not a bare 'pipefail' argument) is what enables pipeline failure detection
+        set -euo pipefail
+
+        GS_BUCKET_PATHS=("~{sep='" "' file_paths}")
+        EMPTY_MD5="d41d8cd98f00b204e9800998ecf8427e"
+        return_code=0
+        echo -e "FilePath\tFileMD5\tValid"
+
+        for GS_FILE in "${GS_BUCKET_PATHS[@]}";
+        do
+          # An empty file_paths array expands to one empty element; nothing to check for it
+          if [[ -z "$GS_FILE" ]] ; then
+            continue
+          fi
+
+          # FILE_MD5: get the hex MD5 row from gsutil | keep the 'md5' line | take the value after the colon.
+          # A failing pipeline or an empty result means the file could not be verified,
+          # which must not be reported as valid.
+          if ! FILE_MD5=$(gsutil hash -hm "$GS_FILE" | grep "md5" | awk -F '[:][\t]+' '{print $2}') || [[ -z "$FILE_MD5" ]] ; then
+            echo "ERROR: Could not determine MD5 for $GS_FILE" >&2
+
+            echo -e "$GS_FILE\t\tFalse"
+            return_code=1
+          elif [[ "$FILE_MD5" != "$EMPTY_MD5" ]] ; then
+            echo -e "$GS_FILE\t$FILE_MD5\tTrue"
+          else
+            echo "ERROR: MD5 for $GS_FILE equals md5sum of an empty file: $EMPTY_MD5" >&2
+
+            echo -e "$GS_FILE\t$FILE_MD5\tFalse"
+            return_code=1
+          fi
+        done
+
+        # Report every file before failing, so the TSV lists all offenders
+        if [ "$return_code" -eq 1 ]; then
+            exit 1
+        fi
+
+    >>>
+    runtime {
+        docker: image_to_use
+    }
+    output {
+        File file_date_result = stdout()
+    }
+}
+
+# Prints the given message and always exits with status 1, so that calling
+# this task fails the workflow.
+# Inputs:
+#   message   - text to print before failing
+#   image_tag - tag of the ubuntu2004 image to run in
+task CheckerWorkflowError {
+
+    input {
+        String message
+        String image_tag
+    }
+
+    String image_to_use = "marketplace.gcr.io/google/ubuntu2004:" + image_tag
+
+    command <<<
+        # '-o pipefail' (not a bare 'pipefail' argument) is what enables pipeline failure detection
+        set -euo pipefail
+
+        # Quoted so the message is printed verbatim (no word splitting or globbing)
+        echo "~{message}"
+        exit 1
+
+    >>>
+    runtime {
+        docker: image_to_use
+    }
+    output {
+        # NOTE(review): stdout() is a File, not a Boolean; the command always
+        # exits 1 so this output is presumably never evaluated - confirm and retype.
+        Boolean errmessage = stdout()
+    }
+}
+
+# Takes an array of Float pairs, echoes it, and reads stdout back as a Boolean.
+# NOTE(review): nothing in this file calls this task; it looks like an
+# unfinished stub for the pairwise Float comparison - confirm before use.
+task ValidFloatOutput {
+
+    input {
+        # presumably (observed, expected) pairs - TODO confirm ordering with the caller
+        Array[Pair[Float,Float]] workflow_out_floats
+        # Full docker image reference (unlike the sibling tasks, which take only a tag)
+        String image_to_use
+    }
+    # NOTE(review): the command only echoes the array and performs no comparison;
+    # interpolating an Array[Pair[...]] directly may not be accepted by the engine - verify.
+    command <<<
+
+        echo ~{workflow_out_floats}
+
+    >>>
+    runtime {
+        docker: image_to_use
+    }
+    output {
+        # read_boolean needs stdout to contain "true" or "false";
+        # NOTE(review): the echo above does not obviously produce that - confirm.
+        Boolean comparison_result = read_boolean(stdout())
+    }
+}
+
+# Compares the MD5 checksum of a file against an expected value.
+# Inputs:
+#   data_file      - file to hash
+#   expectedMd5sum - expected hex MD5 digest
+# Output:
+#   comparison_result - stdout file containing "true" if the digests match, else "false"
+task ValidMd5SumOutput {
+    input {
+        File data_file
+        String expectedMd5sum
+    }
+    command <<<
+        set -euo pipefail
+
+        # Hash the actual input file; md5sum prints "<digest>  <path>", keep the digest.
+        # The awk program is single-quoted so the shell does not expand $1.
+        filemd5=$(md5sum "~{data_file}" | awk '{print $1}')
+
+        # The expected digest is a WDL input, so it is interpolated with ~{...}
+        # rather than read from a shell variable that is never set.
+        if [ "$filemd5" = "~{expectedMd5sum}" ]
+        then
+            echo "true"
+        else
+            echo "false"
+        fi
+    >>>
+    runtime {
+        docker: "quay.io/agduncan94/my-md5sum"
+    }
+    output {
+        File comparison_result = stdout()
+    }
+}
diff --git a/wdl_test/PBCCSWholeGenome/basic_output_valdation/eval_input_defaults.json b/wdl_test/PBCCSWholeGenome/basic_output_valdation/eval_input_defaults.json
@@ -0,0 +1,29 @@
+{
+  "eval_workflow.aligned_read_length_N50":  "test_output:PBCCSWholeGenome.aligned_read_length_N50",
+  "eval_workflow.aligned_num_reads": "test_output:PBCCSWholeGenome.aligned_num_reads",
+  "eval_workflow.aligned_frac_bases":  "test_output:PBCCSWholeGenome.aligned_frac_bases",
+  "eval_workflow.aligned_num_bases": "test_output:PBCCSWholeGenome.aligned_num_bases",
+  "eval_workflow.aligned_read_length_stdev": "test_output:PBCCSWholeGenome.aligned_read_length_stdev",
+  "eval_workflow.average_identity": "test_output:PBCCSWholeGenome.average_identity",
+  "eval_workflow.aligned_est_fold_cov":  "test_output:PBCCSWholeGenome.aligned_est_fold_cov",
+  "eval_workflow.aligned_read_length_mean": "test_output:PBCCSWholeGenome.aligned_read_length_mean",
+  "eval_workflow.median_identity":  "test_output:PBCCSWholeGenome.median_identity",
+  "eval_workflow.aligned_read_length_median": "test_output:PBCCSWholeGenome.aligned_read_length_median",
+  "eval_workflow.pbsv_tbi": "test_output:PBCCSWholeGenome.pbsv_tbi",
+  "eval_workflow.sniffles_vcf": "test_output:PBCCSWholeGenome.sniffles_vcf",
+  "eval_workflow.clair_gtbi": "test_output:PBCCSWholeGenome.clair_gtbi",
+  "eval_workflow.dvp_tbi": "test_output:PBCCSWholeGenome.dvp_tbi",
+  "eval_workflow.dvp_g_tbi": "test_output:PBCCSWholeGenome.dvp_g_tbi",
+  "eval_workflow.dvp_vcf": "test_output:PBCCSWholeGenome.dvp_vcf",
+  "eval_workflow.clair_vcf": "test_output:PBCCSWholeGenome.clair_vcf",
+  "eval_workflow.pbsv_vcf": "test_output:PBCCSWholeGenome.pbsv_vcf",
+  "eval_workflow.aligned_pbi": "test_output:PBCCSWholeGenome.aligned_pbi" ,
+  "eval_workflow.aligned_bai": "test_output:PBCCSWholeGenome.aligned_bai" ,
+  "eval_workflow.dvp_phased_vcf": "test_output:PBCCSWholeGenome.dvp_phased_vcf",
+  "eval_workflow.dvp_phased_tbi": "test_output:PBCCSWholeGenome.dvp_phased_tbi",
+  "eval_workflow.clair_tbi":"test_output:PBCCSWholeGenome.clair_tbi",
+  "eval_workflow.clair_gvcf": "test_output:PBCCSWholeGenome.clair_gvcf",
+  "eval_workflow.aligned_bam": "test_output:PBCCSWholeGenome.aligned_bam",
+  "eval_workflow.sniffles_tbi": "test_output:PBCCSWholeGenome.sniffles_tbi",
+  "eval_workflow.dvp_g_vcf":"test_output:PBCCSWholeGenome.dvp_g_vcf"
+}
diff --git a/wdl_test/PBCCSWholeGenome/basic_output_valdation/test_input_defaults.json b/wdl_test/PBCCSWholeGenome/basic_output_valdation/test_input_defaults.json
@@ -0,0 +1,15 @@
+{
+  "PBCCSWholeGenome.aligned_bais": ["gs://broad-gp-pacbio-outgoing/results/PBFlowcell/m64297e_220215_193024/reads/ccs/aligned/m64297e_220215_193024.bam.bai", "gs://broad-gp-pacbio-outgoing/results/PBFlowcell/m64297e_220218_155034/reads/ccs/aligned/m64297e_220218_155034.bam.bai","gs://broad-gp-pacbio-outgoing/results/PBFlowcell/m64020e_220303_200256/reads/ccs/aligned/m64020e_220303_200256.bam.bai","gs://broad-gp-pacbio-outgoing/results/PBFlowcell/m64297e_220303_195935/reads/ccs/aligned/m64297e_220303_195935.bam.bai", "gs://broad-gp-pacbio-outgoing/results/PBFlowcell/m64297e_220220_005204/reads/ccs/aligned/m64297e_220220_005204.bam.bai", "gs://broad-gp-pacbio-outgoing/results/PBFlowcell/m64218e_220328_161317/reads/ccs/aligned/m64218e_220328_161317.bam.bai", "gs://broad-gp-pacbio-outgoing/results/PBFlowcell/m64218e_220330_013212/reads/ccs/aligned/m64218e_220330_013212.bam.bai"],
+  "PBCCSWholeGenome.aligned_bams": ["gs://broad-gp-pacbio-outgoing/results/PBFlowcell/m64297e_220215_193024/reads/ccs/aligned/m64297e_220215_193024.bam", "gs://broad-gp-pacbio-outgoing/results/PBFlowcell/m64297e_220218_155034/reads/ccs/aligned/m64297e_220218_155034.bam", "gs://broad-gp-pacbio-outgoing/results/PBFlowcell/m64020e_220303_200256/reads/ccs/aligned/m64020e_220303_200256.bam", "gs://broad-gp-pacbio-outgoing/results/PBFlowcell/m64297e_220303_195935/reads/ccs/aligned/m64297e_220303_195935.bam", "gs://broad-gp-pacbio-outgoing/results/PBFlowcell/m64297e_220220_005204/reads/ccs/aligned/m64297e_220220_005204.bam", "gs://broad-gp-pacbio-outgoing/results/PBFlowcell/m64218e_220328_161317/reads/ccs/aligned/m64218e_220328_161317.bam", "gs://broad-gp-pacbio-outgoing/results/PBFlowcell/m64218e_220330_013212/reads/ccs/aligned/m64218e_220330_013212.bam"],
+  "PBCCSWholeGenome.call_small_variants": true,
+  "PBCCSWholeGenome.call_small_vars_on_mitochondria": false,
+  "PBCCSWholeGenome.dvp_memory": 128,
+  "PBCCSWholeGenome.dvp_threads": 32,
+  "PBCCSWholeGenome.fast_less_sensitive_sv": true,
+  "PBCCSWholeGenome.gcs_out_root_dir": "gs://broad-dsp-lrma-pipeline-tests-cromwell/workflow_out_root_dir",
+  "PBCCSWholeGenome.participant_name": "NA24385",
+  "PBCCSWholeGenome.ref_map_file": "gs://broad-dsde-methods-long-reads/resources/references/grch38_noalt/grch38_noalt.txt",
+  "PBCCSWholeGenome.ref_scatter_interval_list_ids": "gs://broad-dsde-methods-long-reads/resources/references/grch38_noalt/size_balanced_interva_lists_ids.txt",
+  "PBCCSWholeGenome.ref_scatter_interval_list_locator": "gs://broad-dsde-methods-long-reads/resources/references/grch38_noalt/size_balanced_interva_lists.txt",
+  "PBCCSWholeGenome.run_dv_pepper_analysis": true
+}
diff --git a/wdl_test/README.md b/wdl_test/README.md
@@ -0,0 +1,13 @@
+# Pipeline Testing
+
+## Testing Schedule 
+- Run all available WDL workflows once a week
+- Run a handful of important workflows when a push is made to a feature branch
+
+
+Note: Call caching is enabled so that if a wdl hasn't changed then its tests will not run. 
+
+### Questions
+
+- If a utils task is changed, it theoretically affects all pipelines that use that task. Do we want to test all pipelines? Or do we want to have multiple tests for that task covering all scenarios?
+- This *should* be handled by the feature branch test. The test will attempt to run a workflow; if that workflow imports a utils task that has been changed, then Cromwell should not call-cache that imported workflow/task and should instead run the workflow from scratch.