Skip to content

Commit

Permalink
Merge pull request #3 from phac-nml/output-stub
Browse files Browse the repository at this point in the history
Result Retrieval from FastMatch Pipeline
  • Loading branch information
emarinier authored Dec 12, 2024
2 parents cbea074 + 4c18be3 commit 70962d5
Show file tree
Hide file tree
Showing 16 changed files with 275 additions and 48 deletions.
75 changes: 75 additions & 0 deletions bin/process_output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#!/usr/bin/env python

from pathlib import Path
from mimetypes import guess_type
from functools import partial
import gzip
import sys
import argparse
import pandas as pd


def get_open(f):
    """Return a file-opener callable appropriate for path *f*.

    Gzip-compressed files (detected via mimetypes.guess_type) are opened
    in text mode ("rt") so callers can treat plain and gzipped inputs
    uniformly as text streams; plain files get the builtin open().
    """
    if "gzip" == guess_type(str(f))[1]:
        # mode="rt" makes gzip.open decode to str, matching builtin open();
        # a bare partial(gzip.open) would return bytes instead.
        return partial(gzip.open, mode="rt")
    else:
        return open

def main(argv=None):
    """Filter a profile_dists pairwise distance table by a distance threshold.

    Reads the TSV named by --input, keeps only rows whose 'Distance' column
    is <= --threshold, and writes the filtered table both as
    "<output>.tsv" and "<output>.xlsx".

    Args:
        argv: optional list of command-line arguments (defaults to sys.argv).

    Returns:
        0 on success (suitable for sys.exit).
    """

    parser = argparse.ArgumentParser(
        description="Parses a profile_dists distances to create query-reference-format output for the FastMatch pipeline.",
        epilog="Example: python process_output.py --input distances.tsv --output results.tsv --threshold 10",
    )

    parser.add_argument(
        "--input",
        action="store",
        dest="input",
        type=str,
        help="profile_dists-generated distance matrix",
        default=None,
        required=True,
    )

    parser.add_argument(
        "--threshold",
        action="store",
        dest="threshold",
        type=float,
        help="distance threshold to be included in output",
        default=None,
        required=True,
    )

    parser.add_argument(
        "--output",
        action="store",
        dest="output",
        type=str,
        help="output prefix (without extension)",
        default=None,
        required=True,
    )

    args = parser.parse_args(argv)

    # 'input_path' avoids shadowing the builtin input().
    input_path = Path(args.input)
    tsv_path = Path(args.output + ".tsv")
    excel_path = Path(args.output + ".xlsx")
    threshold = args.threshold

    data = pd.read_csv(input_path, sep="\t")
    # Keep only query/reference pairs within the requested distance.
    data = data[data["Distance"] <= threshold]
    data.to_csv(tsv_path, sep="\t", index=False)
    # index=False keeps the Excel output consistent with the TSV output
    # (no spurious unnamed index column in the first column).
    data.to_excel(excel_path, index=False)

    print("Output written to:")
    print(tsv_path)
    print(excel_path)

    return 0


if __name__ == "__main__":
    sys.exit(main())
61 changes: 61 additions & 0 deletions modules/local/append_metadata/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// Joins per-reference metadata columns onto a pairwise distance table and
// writes the merged result as "distances_and_metadata.tsv" in the task
// work directory. Runs as a native (exec:) process — no external script.
process APPEND_METADATA {
    tag "Appends metadata to distances"
    label 'process_single'

    input:
    val distances_path // distance data as a TSV path
    // this needs to be "val", because "path"
    // won't stage the file correctly for exec
    val metadata_rows // metadata rows (no headers) to be appended, list of lists
    val metadata_headers // headers to name the metadata columns

    output:
    path("distances_and_metadata.tsv"), emit: distances

    exec:
    def distances_rows // has a header row
    def metadata_rows_map = [:] // sample name -> full metadata row
    def sample_names_map = [:] // maps sample names to Irida IDs
    def merged = [] // output rows, beginning with the header row

    // Load the distances TSV as a list of string arrays (row 0 is the header).
    distances_path.withReader { reader ->
        distances_rows = reader.readLines()*.split('\t')
    }

    // Create a map of the metadata rows:
    // Start on i = 0 because there are no headers included.
    for(int i = 0; i < metadata_rows.size(); i++)
    {
        // "sample" -> ["sample", meta1, meta2, meta3, ...]
        sample_name = metadata_rows[i][0]
        metadata_rows_map[sample_name] = metadata_rows[i]

        // "sample" -> "Irida ID"
        // NOTE(review): assumes column 1 of each metadata row is the Irida ID — confirm against caller.
        sample_names_map[sample_name] = metadata_rows[i][1]
    }

    // Create the header row:
    merged.add(["Query ID", "Query Sample Name", "Reference ID", "Reference Sample Name", "Distance"]
        + metadata_headers[1..-1])

    // Merge the remaining rows in original order:
    // Start on i = 1 because we don't want the headers.
    for(int i = 1; i < distances_rows.size(); i++)
    {
        query_sample_name = distances_rows[i][0]
        query_irida_id = sample_names_map[query_sample_name]
        reference_sample_name = distances_rows[i][1]
        reference_irida_id = sample_names_map[reference_sample_name]
        distance = distances_rows[i][2]

        // Appended metadata columns (index 2 onward) come from the
        // *reference* sample's metadata row.
        merged_row = [query_irida_id, query_sample_name, reference_irida_id, reference_sample_name, distance] \
            + metadata_rows_map[reference_sample_name][2..-1]

        merged.add(merged_row)
    }

    // Write into task.workDir so Nextflow picks the file up as the
    // declared path output.
    task.workDir.resolve("distances_and_metadata.tsv").withWriter { writer ->
        merged.each { writer.writeLine it.join("\t") }
    }

}
34 changes: 34 additions & 0 deletions modules/local/process_output/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// Filters the merged distance table with bin/process_output.py, emitting
// the results as both TSV and Excel alongside a versions.yml.
process PROCESS_OUTPUT {
    label 'process_single'

    // NOTE(review): the staramr container is presumably used because it
    // ships a pandas compatible with process_output.py — confirm.
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/staramr:0.10.0--pyhdfd78af_0':
        'biocontainers/staramr:0.10.0--pyhdfd78af_0' }"

    input:
    path distances // TSV of pairwise distances (with appended metadata)
    val threshold // maximum distance to keep in the output

    output:
    path "results.tsv", emit: tsv
    path "results.xlsx", emit: excel
    path "versions.yml", emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args = task.ext.args ?: ''
    """
    process_output.py \\
        $args \\
        --input $distances \\
        --output results \\
        --threshold $threshold
    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        process_output : 0.1.0
    END_VERSIONS
    """
}
1 change: 0 additions & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@ params {
validate_params = true

// Profile dists args
pd_outfmt = "matrix"
pd_distm = "hamming"
pd_missing_threshold = 1.0
pd_sample_quality_threshold = 1.0
Expand Down
7 changes: 0 additions & 7 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -110,13 +110,6 @@
"description": "Parameters for profile_dists distance calculations",
"default": "",
"properties": {
"pd_outfmt": {
"type": "string",
"description": "The output format for distances",
"enum": ["matrix"],
"default": "matrix",
"hidden": true
},
"pd_distm": {
"type": "string",
"description": "The distance method/unit",
Expand Down
14 changes: 10 additions & 4 deletions tests/data/distances/expected_dists-hamming.tsv
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
dists sample1 sample2 sample3
sample1 0 1 2
sample2 1 0 3
sample3 2 3 0
query_id ref_id dist
sample1 sample1 0
sample1 sample2 1
sample1 sample3 2
sample2 sample2 0
sample2 sample1 1
sample2 sample3 3
sample3 sample3 0
sample3 sample1 2
sample3 sample2 3
14 changes: 10 additions & 4 deletions tests/data/distances/expected_dists-hash-keep-one-loci.tsv
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
dists sample1 sample2 sample3
sample1 0 1 0
sample2 1 0 1
sample3 0 1 0
query_id ref_id dist
sample1 sample1 0
sample1 sample3 0
sample1 sample2 1
sample2 sample2 0
sample2 sample1 1
sample2 sample3 1
sample3 sample1 0
sample3 sample3 0
sample3 sample2 1
14 changes: 10 additions & 4 deletions tests/data/distances/expected_dists-hash-missing-count-missing.tsv
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
dists sample1 sample2 sample3
sample1 0 2 2
sample2 2 0 2
sample3 2 2 0
query_id ref_id dist
sample1 sample1 0
sample1 sample2 2
sample1 sample3 2
sample2 sample2 0
sample2 sample1 2
sample2 sample3 2
sample3 sample3 0
sample3 sample1 2
sample3 sample2 2
14 changes: 10 additions & 4 deletions tests/data/distances/expected_dists-hash-missing.tsv
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
dists sample1 sample2 sample3
sample1 0 1 1
sample2 1 0 2
sample3 1 2 0
query_id ref_id dist
sample1 sample1 0
sample1 sample2 1
sample1 sample3 1
sample2 sample2 0
sample2 sample1 1
sample2 sample3 2
sample3 sample3 0
sample3 sample1 1
sample3 sample2 2
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
dists sample1 sample2
sample1 0 2
sample2 2 0
query_id ref_id dist
sample1 sample1 0
sample1 sample2 2
sample2 sample2 0
sample2 sample1 2
14 changes: 10 additions & 4 deletions tests/data/distances/expected_dists-hash-more-missing.tsv
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
dists sample1 sample2 sample3
sample1 0 2 3
sample2 2 0 2
sample3 3 2 0
query_id ref_id dist
sample1 sample1 0
sample1 sample2 2
sample1 sample3 3
sample2 sample2 0
sample2 sample1 2
sample2 sample3 2
sample3 sample3 0
sample3 sample2 2
sample3 sample1 3
14 changes: 10 additions & 4 deletions tests/data/distances/expected_dists-hash-remove-missing-loci.tsv
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
dists sample1 sample2 sample3
sample1 0 1 1
sample2 1 0 2
sample3 1 2 0
query_id ref_id dist
sample1 sample1 0
sample1 sample2 1
sample1 sample3 1
sample2 sample2 0
sample2 sample1 1
sample2 sample3 2
sample3 sample3 0
sample3 sample1 1
sample3 sample2 2
14 changes: 10 additions & 4 deletions tests/data/distances/expected_dists-mismatched-ids.tsv
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
dists sampleA sampleB sampleC
sampleA 0.0 0.0 33.333333333333336
sampleB 0.0 0.0 33.333333333333336
sampleC 33.333333333333336 33.333333333333336 0.0
query_id ref_id dist
sampleA sampleA 0.0
sampleA sampleB 0.0
sampleA sampleC 33.333333333333336
sampleB sampleA 0.0
sampleB sampleB 0.0
sampleB sampleC 33.333333333333336
sampleC sampleC 0.0
sampleC sampleA 33.333333333333336
sampleC sampleB 33.333333333333336
14 changes: 10 additions & 4 deletions tests/data/distances/expected_dists-partial-mismatched-ids.tsv
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
dists sampleA sampleB sample3
sampleA 0.0 0.0 33.333333333333336
sampleB 0.0 0.0 33.333333333333336
sample3 33.333333333333336 33.333333333333336 0.0
query_id ref_id dist
sampleA sampleA 0.0
sampleA sampleB 0.0
sampleA sample3 33.333333333333336
sampleB sampleA 0.0
sampleB sampleB 0.0
sampleB sample3 33.333333333333336
sample3 sample3 0.0
sample3 sampleA 33.333333333333336
sample3 sampleB 33.333333333333336
14 changes: 10 additions & 4 deletions tests/data/distances/expected_dists.tsv
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
dists sample1 sample2 sample3
sample1 0.0 0.0 33.333333333333336
sample2 0.0 0.0 33.333333333333336
sample3 33.333333333333336 33.333333333333336 0.0
query_id ref_id dist
sample1 sample1 0.0
sample1 sample2 0.0
sample1 sample3 33.333333333333336
sample2 sample1 0.0
sample2 sample2 0.0
sample2 sample3 33.333333333333336
sample3 sample3 0.0
sample3 sample1 33.333333333333336
sample3 sample2 33.333333333333336
11 changes: 10 additions & 1 deletion workflows/fastmatchirida.nf
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ Workflowfastmatchirida.initialise(params, log)
include { LOCIDEX_MERGE } from '../modules/local/locidex/merge/main'
include { PROFILE_DISTS } from '../modules/local/profile_dists/main'
include { INPUT_ASSURE } from "../modules/local/input_assure/main"
include { PROCESS_OUTPUT } from "../modules/local/process_output/main"
include { APPEND_METADATA } from "../modules/local/append_metadata/main"

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down Expand Up @@ -147,11 +149,18 @@ workflow FASTMATCH {
}

// Options related to profile dists
mapping_format = Channel.value(params.pd_outfmt)
mapping_format = Channel.value("pairwise")

distances = PROFILE_DISTS(merged.combined_profiles, mapping_format, mapping_file, columns_file)
ch_versions = ch_versions.mix(distances.versions)

// Append metadata to references:
distances_metadata = APPEND_METADATA(distances.results, metadata_rows, metadata_headers)

// Process the output:
processed_output = PROCESS_OUTPUT(distances_metadata.distances, 0)
ch_versions = ch_versions.mix(processed_output.versions)

CUSTOM_DUMPSOFTWAREVERSIONS (
ch_versions.unique().collectFile(name: 'collated_versions.yml')
)
Expand Down

0 comments on commit 70962d5

Please sign in to comment.