From 4448ac1186937beab9ffba8edc5c8a7098432d8d Mon Sep 17 00:00:00 2001
From: Eric
Date: Wed, 11 Dec 2024 11:20:43 -0600
Subject: [PATCH 01/12] Very basic output framework.

---
 bin/process_output.py                | 61 ++++++++++++++++++++++++++++
 modules/local/process_output/main.nf | 31 ++++++++++++++
 workflows/fastmatchirida.nf          |  4 ++
 3 files changed, 96 insertions(+)
 create mode 100755 bin/process_output.py
 create mode 100644 modules/local/process_output/main.nf

diff --git a/bin/process_output.py b/bin/process_output.py
new file mode 100755
index 0000000..e0243de
--- /dev/null
+++ b/bin/process_output.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+
+from pathlib import Path
+from mimetypes import guess_type
+from functools import partial
+import gzip
+import sys
+import argparse
+
+
+def get_open(f):
+    if "gzip" == guess_type(str(f))[1]:
+        return partial(gzip.open)
+    else:
+        return open
+
+def main(argv=None):
+    parser = argparse.ArgumentParser(
+        description="Parses a profile_dists distances to create query-reference-format output for the FastMatch pipeline.",
+        epilog="Example: python process_output.py --input matrix.csv",
+    )
+    parser.add_argument(
+        "--input",
+        action="store",
+        dest="input",
+        type=str,
+        help="profile_dists-generated distance matrix",
+        default=None,
+        required=True,
+    )
+    parser.add_argument(
+        "--output",
+        action="store",
+        dest="output",
+        type=str,
+        help="output in query-reference format",
+        default=None,
+        required=True,
+    )
+
+    args = parser.parse_args(argv)
+
+    input = Path(args.input)
+    output = Path(args.output)
+
+    headers = ["query", "reference", "distance"]
+    results = [["A", "B", "1"], ["C", "D", "2"], ["E", "F", "3"]]
+
+    with open(output, "w") as output_file:
+        output_file.write((",").join(headers) + "\n")
+
+        for line in results:
+            output_file.write((",").join(line) + "\n")
+
+    print(f"Output written to [{output}]")
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/modules/local/process_output/main.nf b/modules/local/process_output/main.nf
new file mode 100644
index 0000000..a47992a
--- /dev/null
+++ b/modules/local/process_output/main.nf
@@ -0,0 +1,31 @@
+process PROCESS_OUTPUT {
+    label 'process_single'
+
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/python:3.10' :
+        'biocontainers/python:3.10' }"
+
+    input:
+    path(distances)
+
+    output:
+    path("results.csv"), emit: results
+    path "versions.yml", emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    """
+    process_output.py \\
+        $args \\
+        --input $distances \\
+        --output results.csv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        process_outout : 0.1.0
+    END_VERSIONS
+    """
+}
\ No newline at end of file
diff --git a/workflows/fastmatchirida.nf b/workflows/fastmatchirida.nf
index 82c3a14..1fa5009 100644
--- a/workflows/fastmatchirida.nf
+++ b/workflows/fastmatchirida.nf
@@ -29,6 +29,7 @@ Workflowfastmatchirida.initialise(params, log)
 include { LOCIDEX_MERGE } from '../modules/local/locidex/merge/main'
 include { PROFILE_DISTS } from '../modules/local/profile_dists/main'
 include { INPUT_ASSURE } from "../modules/local/input_assure/main"
+include { PROCESS_OUTPUT } from "../modules/local/process_output/main"
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -152,6 +153,9 @@ workflow FASTMATCH {
 
     distances = PROFILE_DISTS(merged.combined_profiles, mapping_format, mapping_file, columns_file)
     ch_versions = ch_versions.mix(distances.versions)
+    processed_output = PROCESS_OUTPUT(distances.results)
+    ch_versions = ch_versions.mix(processed_output.versions)
+
     CUSTOM_DUMPSOFTWAREVERSIONS (
         ch_versions.unique().collectFile(name: 'collated_versions.yml')
     )

From d579c352c1a8c842cfee43340fa49f8d02e86516 Mon Sep 17 00:00:00 2001
From: Eric
Date: Wed, 11 Dec 2024 11:53:08 -0600
Subject: [PATCH 02/12] pairwise output format, appending metadata

---
 modules/local/append_metadata/main.nf | 47 +++++++++++++++++++++++++++
 nextflow.config                       |  1 -
 workflows/fastmatchirida.nf           |  9 +++--
 3 files changed, 54 insertions(+), 3 deletions(-)
 create mode 100644 modules/local/append_metadata/main.nf

diff --git a/modules/local/append_metadata/main.nf b/modules/local/append_metadata/main.nf
new file mode 100644
index 0000000..62a2790
--- /dev/null
+++ b/modules/local/append_metadata/main.nf
@@ -0,0 +1,47 @@
+process APPEND_METADATA {
+    tag "append_metadata"
+    label 'process_single'
+
+    input:
+    val distances_path // cluster data as a TSV path
+                       // this needs to be "val", because "path"
+                       // won't stage the file correctly for exec
+    val metadata_rows // metadata rows (no headers) to be appened, list of lists
+    val metadata_headers // headers to name the metadata columns
+
+    output:
+    path("distances_and_metadata.tsv"), emit: distances
+
+    exec:
+    def distances_rows // has a header row
+    def metadata_rows_map = [:]
+    def merged = []
+
+    distances_path.withReader { reader ->
+        distances_rows = reader.readLines()*.split('\t')
+    }
+
+    // Create a map of the metadata rows:
+    // Start on i = 0 because there are no headers included.
+    for(int i = 0; i < metadata_rows.size(); i++)
+    {
+        // "sample" -> ["sample", meta1, meta2, meta3, ...]
+        metadata_rows_map[metadata_rows[i][0]] = metadata_rows[i]
+    }
+
+    // Merge the headers
+    merged.add(distances_rows[0] + metadata_headers)
+
+    // Merge the remaining rows in original order:
+    // Start on i = 1 because we don't want the headers.
+    for(int i = 1; i < distances_rows.size(); i++)
+    {
+        def sample_key = distances_rows[i][1] // We want ref ID (second column)
+        merged.add(distances_rows[i] + metadata_rows_map[sample_key][1..-1])
+    }
+
+    task.workDir.resolve("distances_and_metadata.tsv").withWriter { writer ->
+        merged.each { writer.writeLine it.join("\t") }
+    }
+
+}
diff --git a/nextflow.config b/nextflow.config
index 45cf222..ae2c268 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -44,7 +44,6 @@ params {
     validate_params = true
 
     // Profile dists args
-    pd_outfmt = "matrix"
     pd_distm = "hamming"
     pd_missing_threshold = 1.0
     pd_sample_quality_threshold = 1.0
diff --git a/workflows/fastmatchirida.nf b/workflows/fastmatchirida.nf
index 1fa5009..1c76864 100644
--- a/workflows/fastmatchirida.nf
+++ b/workflows/fastmatchirida.nf
@@ -30,6 +30,7 @@ include { LOCIDEX_MERGE } from '../modules/local/locidex/merge/main'
 include { PROFILE_DISTS } from '../modules/local/profile_dists/main'
 include { INPUT_ASSURE } from "../modules/local/input_assure/main"
 include { PROCESS_OUTPUT } from "../modules/local/process_output/main"
+include { APPEND_METADATA } from "../modules/local/append_metadata/main"
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -148,12 +149,16 @@ workflow FASTMATCH {
     }
 
     // Options related to profile dists
-    mapping_format = Channel.value(params.pd_outfmt)
+    mapping_format = Channel.value("pairwise")
 
     distances = PROFILE_DISTS(merged.combined_profiles, mapping_format, mapping_file, columns_file)
     ch_versions = ch_versions.mix(distances.versions)
-    processed_output = PROCESS_OUTPUT(distances.results)
+    // Append metadata to references:
+    distances_metadata = APPEND_METADATA(distances.results, metadata_rows, metadata_headers)
+
+    // Process the output:
+    processed_output = PROCESS_OUTPUT(distances_metadata.distances)
     ch_versions = ch_versions.mix(processed_output.versions)
 
     CUSTOM_DUMPSOFTWAREVERSIONS (
         ch_versions.unique().collectFile(name: 'collated_versions.yml')
     )

From 6ee63581008bdd3d4514aa043ba3197c60a34198 Mon Sep 17 00:00:00 2001
From: Eric
Date: Wed, 11 Dec 2024 12:07:48 -0600
Subject: [PATCH 03/12] Cleaning up, process_output.py outputs the input for now.

---
 bin/process_output.py                 | 11 ++++-------
 modules/local/append_metadata/main.nf |  4 ++--
 modules/local/process_output/main.nf  |  4 ++--
 3 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/bin/process_output.py b/bin/process_output.py
index e0243de..bba7f07 100755
--- a/bin/process_output.py
+++ b/bin/process_output.py
@@ -43,14 +43,11 @@ def main(argv=None):
     input = Path(args.input)
     output = Path(args.output)
 
-    headers = ["query", "reference", "distance"]
-    results = [["A", "B", "1"], ["C", "D", "2"], ["E", "F", "3"]]
+    with open(output, "w") as output_file, \
+        open(input, "r") as input_file:
 
-    with open(output, "w") as output_file:
-        output_file.write((",").join(headers) + "\n")
-
-        for line in results:
-            output_file.write((",").join(line) + "\n")
+        for line in input_file:
+            output_file.write(line)
 
     print(f"Output written to [{output}]")
 
     return 0
diff --git a/modules/local/append_metadata/main.nf b/modules/local/append_metadata/main.nf
index 62a2790..583f284 100644
--- a/modules/local/append_metadata/main.nf
+++ b/modules/local/append_metadata/main.nf
@@ -1,9 +1,9 @@
 process APPEND_METADATA {
-    tag "append_metadata"
+    tag "Appends metadata to distances"
     label 'process_single'
 
     input:
-    val distances_path // cluster data as a TSV path
+    val distances_path // distance data as a TSV path
                        // this needs to be "val", because "path"
                        // won't stage the file correctly for exec
     val metadata_rows // metadata rows (no headers) to be appened, list of lists
     val metadata_headers // headers to name the metadata columns
 
     output:
     path("distances_and_metadata.tsv"), emit: distances
diff --git a/modules/local/process_output/main.nf b/modules/local/process_output/main.nf
index a47992a..50a6512 100644
--- a/modules/local/process_output/main.nf
+++ b/modules/local/process_output/main.nf
@@ -9,7 +9,7 @@ process PROCESS_OUTPUT {
     path(distances)
 
     output:
-    path("results.csv"), emit: results
+    path("results.tsv"), emit: results
     path "versions.yml", emit: versions
 
     when:
     task.ext.when == null || task.ext.when
@@ -21,7 +21,7 @@ process PROCESS_OUTPUT {
     process_output.py \\
         $args \\
        --input $distances \\
-        --output results.csv
+        --output results.tsv
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":

From 2f08dd1a56ef8adf83c870bc29c6f463dd0bff0e Mon Sep 17 00:00:00 2001
From: Eric
Date: Wed, 11 Dec 2024 14:44:34 -0600
Subject: [PATCH 04/12] Both sample names and sample/Irida IDs in output.

---
 modules/local/append_metadata/main.nf | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/modules/local/append_metadata/main.nf b/modules/local/append_metadata/main.nf
index 583f284..18711cd 100644
--- a/modules/local/append_metadata/main.nf
+++ b/modules/local/append_metadata/main.nf
@@ -15,6 +15,7 @@ process APPEND_METADATA {
     exec:
     def distances_rows // has a header row
     def metadata_rows_map = [:]
+    def sample_names_map = [:] // maps sample names to Irida IDs
     def merged = []
 
     distances_path.withReader { reader ->
         distances_rows = reader.readLines()*.split('\t')
     }
@@ -26,18 +27,31 @@ process APPEND_METADATA {
     // Create a map of the metadata rows:
     // Start on i = 0 because there are no headers included.
     for(int i = 0; i < metadata_rows.size(); i++)
     {
         // "sample" -> ["sample", meta1, meta2, meta3, ...]
-        metadata_rows_map[metadata_rows[i][0]] = metadata_rows[i]
+        sample_name = metadata_rows[i][0]
+        metadata_rows_map[sample_name] = metadata_rows[i]
+
+        // "sample" -> "Irida ID"
+        sample_names_map[sample_name] = metadata_rows[i][1]
     }
 
-    // Merge the headers
-    merged.add(distances_rows[0] + metadata_headers)
+    // Create the header row:
+    merged.add(["Query ID", "Query Sample Name", "Reference ID", "Reference Sample Name", "Disance"]
+        + metadata_headers[1..-1])
 
     // Merge the remaining rows in original order:
     // Start on i = 1 because we don't want the headers.
     for(int i = 1; i < distances_rows.size(); i++)
     {
-        def sample_key = distances_rows[i][1] // We want ref ID (second column)
-        merged.add(distances_rows[i] + metadata_rows_map[sample_key][1..-1])
+        query_sample_name = distances_rows[i][0]
+        query_irida_id = sample_names_map[query_sample_name]
+        reference_sample_name = distances_rows[i][1]
+        reference_irida_id = sample_names_map[reference_sample_name]
+        distance = distances_rows[i][2]
+
+        merged_row = [query_irida_id, query_sample_name, reference_irida_id, reference_sample_name, distance] \
+            + metadata_rows_map[reference_sample_name][2..-1]
+
+        merged.add(merged_row)
     }

From 2121cfd02c479dbbd55ff92210c45264450281b5 Mon Sep 17 00:00:00 2001
From: Eric
Date: Wed, 11 Dec 2024 16:07:02 -0600
Subject: [PATCH 05/12] Distance threshold.

---
 bin/process_output.py                 | 23 ++++++++++++++++++-----
 modules/local/append_metadata/main.nf |  2 +-
 modules/local/process_output/main.nf  |  7 ++++---
 3 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/bin/process_output.py b/bin/process_output.py
index bba7f07..c8288bb 100755
--- a/bin/process_output.py
+++ b/bin/process_output.py
@@ -6,6 +6,7 @@
 import gzip
 import sys
 import argparse
+import pandas as pd
 
 
 def get_open(f):
@@ -15,10 +16,12 @@ def get_open(f):
         return open
 
 def main(argv=None):
+
     parser = argparse.ArgumentParser(
         description="Parses a profile_dists distances to create query-reference-format output for the FastMatch pipeline.",
         epilog="Example: python process_output.py --input matrix.csv",
     )
+
     parser.add_argument(
         "--input",
         action="store",
@@ -28,6 +31,17 @@ def main(argv=None):
         default=None,
         required=True,
     )
+
+    parser.add_argument(
+        "--threshold",
+        action="store",
+        dest="threshold",
+        type=int,
+        help="distance threshold to be included in output",
+        default=None,
+        required=True,
+    )
+
     parser.add_argument(
         "--output",
         action="store",
@@ -42,12 +56,11 @@ def main(argv=None):
     args = parser.parse_args(argv)
 
     input = Path(args.input)
     output = Path(args.output)
+    threshold = args.threshold
 
-    with open(output, "w") as output_file, \
-        open(input, "r") as input_file:
-
-        for line in input_file:
-            output_file.write(line)
+    data = pd.read_csv(input, sep="\t")
+    data = data[data['Distance'] <= threshold]
+    data.to_csv(output, sep="\t", index=False)
 
     print(f"Output written to [{output}]")
diff --git a/modules/local/append_metadata/main.nf b/modules/local/append_metadata/main.nf
index 18711cd..a871e6f 100644
--- a/modules/local/append_metadata/main.nf
+++ b/modules/local/append_metadata/main.nf
@@ -35,7 +35,7 @@ process APPEND_METADATA {
     }
 
     // Create the header row:
-    merged.add(["Query ID", "Query Sample Name", "Reference ID", "Reference Sample Name", "Disance"]
+    merged.add(["Query ID", "Query Sample Name", "Reference ID", "Reference Sample Name", "Distance"]
         + metadata_headers[1..-1])
 
     // Merge the remaining rows in original order:
diff --git a/modules/local/process_output/main.nf b/modules/local/process_output/main.nf
index 50a6512..dd22dbc 100644
--- a/modules/local/process_output/main.nf
+++ b/modules/local/process_output/main.nf
@@ -2,8 +2,8 @@ process PROCESS_OUTPUT {
     label 'process_single'
 
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/python:3.10' :
-        'biocontainers/python:3.10' }"
+        'https://depot.galaxyproject.org/singularity/pandas:2.2.1' :
+        'biocontainers/pandas' }"
 
     input:
     path(distances)
@@ -21,7 +21,8 @@ process PROCESS_OUTPUT {
     process_output.py \\
         $args \\
         --input $distances \\
-        --output results.tsv
+        --output results.tsv \\
+        --threshold 0
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":

From 4a2138c0cd50d8a3feb47fbb11d781f91a9df973 Mon Sep 17 00:00:00 2001
From: Eric
Date: Wed, 11 Dec 2024 16:15:27 -0600
Subject: [PATCH 06/12] Removing profile dist output format from schema.

---
 nextflow_schema.json | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/nextflow_schema.json b/nextflow_schema.json
index 10bf5d5..dcd7c07 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -110,13 +110,6 @@
             "description": "Parameters for profile_dists distance calculations",
             "default": "",
             "properties": {
-                "pd_outfmt": {
-                    "type": "string",
-                    "description": "The output format for distances",
-                    "enum": ["matrix"],
-                    "default": "matrix",
-                    "hidden": true
-                },
                 "pd_distm": {
                     "type": "string",
                     "description": "The distance method/unit",

From 34b83736ce98d5cb041ea42f58d3461c1d69ce21 Mon Sep 17 00:00:00 2001
From: Eric
Date: Wed, 11 Dec 2024 16:17:54 -0600
Subject: [PATCH 07/12] threshold as a parameter

---
 modules/local/process_output/main.nf | 7 ++++---
 workflows/fastmatchirida.nf          | 2 +-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/modules/local/process_output/main.nf b/modules/local/process_output/main.nf
index dd22dbc..b8c2dbf 100644
--- a/modules/local/process_output/main.nf
+++ b/modules/local/process_output/main.nf
@@ -6,10 +6,11 @@ process PROCESS_OUTPUT {
         'biocontainers/pandas' }"
 
     input:
-    path(distances)
+    path distances
+    val threshold
 
     output:
-    path("results.tsv"), emit: results
+    path "results.tsv", emit: results
     path "versions.yml", emit: versions
 
     when:
@@ -22,7 +23,7 @@ process PROCESS_OUTPUT {
         $args \\
         --input $distances \\
         --output results.tsv \\
-        --threshold 0
+        --threshold $threshold
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
diff --git a/workflows/fastmatchirida.nf b/workflows/fastmatchirida.nf
index 1c76864..55c2301 100644
--- a/workflows/fastmatchirida.nf
+++ b/workflows/fastmatchirida.nf
@@ -158,7 +158,7 @@ workflow FASTMATCH {
     distances_metadata = APPEND_METADATA(distances.results, metadata_rows, metadata_headers)
 
     // Process the output:
-    processed_output = PROCESS_OUTPUT(distances_metadata.distances)
+    processed_output = PROCESS_OUTPUT(distances_metadata.distances, 0)
     ch_versions = ch_versions.mix(processed_output.versions)
 
     CUSTOM_DUMPSOFTWAREVERSIONS (

From eb173c22a90a08d886b11301de584604c0ca73d9 Mon Sep 17 00:00:00 2001
From: Eric
Date: Wed, 11 Dec 2024 16:22:25 -0600
Subject: [PATCH 08/12] Newline at end of file.

---
 modules/local/process_output/main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/local/process_output/main.nf b/modules/local/process_output/main.nf
index b8c2dbf..5527ffe 100644
--- a/modules/local/process_output/main.nf
+++ b/modules/local/process_output/main.nf
@@ -30,4 +30,4 @@ process PROCESS_OUTPUT {
         process_outout : 0.1.0
     END_VERSIONS
     """
-}
\ No newline at end of file
+}

From 8ade704e6e87b274445e70f3376751edd4de0dda Mon Sep 17 00:00:00 2001
From: Eric
Date: Wed, 11 Dec 2024 16:25:52 -0600
Subject: [PATCH 09/12] Updated script example.

---
 bin/process_output.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/process_output.py b/bin/process_output.py
index c8288bb..fe7b452 100755
--- a/bin/process_output.py
+++ b/bin/process_output.py
@@ -19,7 +19,7 @@ def main(argv=None):
 
     parser = argparse.ArgumentParser(
         description="Parses a profile_dists distances to create query-reference-format output for the FastMatch pipeline.",
-        epilog="Example: python process_output.py --input matrix.csv",
+        epilog="Example: python process_output.py --input distances.tsv --output results.tsv --threshold 10",
     )
 
     parser.add_argument(

From 75013de2ecbe6b0b4acffb285ee8c3522b872cbc Mon Sep 17 00:00:00 2001
From: Eric
Date: Wed, 11 Dec 2024 16:38:43 -0600
Subject: [PATCH 10/12] Updating container

---
 modules/local/process_output/main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/local/process_output/main.nf b/modules/local/process_output/main.nf
index 5527ffe..f85f73f 100644
--- a/modules/local/process_output/main.nf
+++ b/modules/local/process_output/main.nf
@@ -3,7 +3,7 @@ process PROCESS_OUTPUT {
 
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/pandas:2.2.1' :
-        'biocontainers/pandas' }"
+        'biocontainers/pandas:2.2.1' }"
 
     input:
     path distances

From 0699fcadbde2fe25caf5187ad00464c88515ae22 Mon Sep 17 00:00:00 2001
From: Eric
Date: Thu, 12 Dec 2024 10:01:31 -0600
Subject: [PATCH 11/12] Updating test data after switching to pairwise output.

---
 tests/data/distances/expected_dists-hamming.tsv     | 14 ++++++++++----
 .../expected_dists-hash-keep-one-loci.tsv           | 14 ++++++++++----
 .../expected_dists-hash-missing-count-missing.tsv   | 14 ++++++++++----
 .../data/distances/expected_dists-hash-missing.tsv  | 14 ++++++++++----
 ...ected_dists-hash-more-missing-remove-sample.tsv  |  8 +++++---
 .../distances/expected_dists-hash-more-missing.tsv  | 14 ++++++++++----
 .../expected_dists-hash-remove-missing-loci.tsv     | 14 ++++++++++----
 .../distances/expected_dists-mismatched-ids.tsv     | 14 ++++++++++----
 .../expected_dists-partial-mismatched-ids.tsv       | 14 ++++++++++----
 tests/data/distances/expected_dists.tsv             | 14 ++++++++++----
 10 files changed, 95 insertions(+), 39 deletions(-)

diff --git a/tests/data/distances/expected_dists-hamming.tsv b/tests/data/distances/expected_dists-hamming.tsv
index 2b09f4c..3e8daca 100644
--- a/tests/data/distances/expected_dists-hamming.tsv
+++ b/tests/data/distances/expected_dists-hamming.tsv
@@ -1,4 +1,10 @@
-dists	sample1	sample2	sample3
-sample1	0	1	2
-sample2	1	0	3
-sample3	2	3	0
+query_id	ref_id	dist
+sample1	sample1	0
+sample1	sample2	1
+sample1	sample3	2
+sample2	sample2	0
+sample2	sample1	1
+sample2	sample3	3
+sample3	sample3	0
+sample3	sample1	2
+sample3	sample2	3
diff --git a/tests/data/distances/expected_dists-hash-keep-one-loci.tsv b/tests/data/distances/expected_dists-hash-keep-one-loci.tsv
index 254033a..535e1de 100644
--- a/tests/data/distances/expected_dists-hash-keep-one-loci.tsv
+++ b/tests/data/distances/expected_dists-hash-keep-one-loci.tsv
@@ -1,4 +1,10 @@
-dists	sample1	sample2	sample3
-sample1	0	1	0
-sample2	1	0	1
-sample3	0	1	0
+query_id	ref_id	dist
+sample1	sample1	0
+sample1	sample3	0
+sample1	sample2	1
+sample2	sample2	0
+sample2	sample1	1
+sample2	sample3	1
+sample3	sample1	0
+sample3	sample3	0
+sample3	sample2	1
diff --git a/tests/data/distances/expected_dists-hash-missing-count-missing.tsv b/tests/data/distances/expected_dists-hash-missing-count-missing.tsv
index 5ead144..b6d8e5f 100644
--- a/tests/data/distances/expected_dists-hash-missing-count-missing.tsv
+++ b/tests/data/distances/expected_dists-hash-missing-count-missing.tsv
@@ -1,4 +1,10 @@
-dists	sample1	sample2	sample3
-sample1	0	2	2
-sample2	2	0	2
-sample3	2	2	0
+query_id	ref_id	dist
+sample1	sample1	0
+sample1	sample2	2
+sample1	sample3	2
+sample2	sample2	0
+sample2	sample1	2
+sample2	sample3	2
+sample3	sample3	0
+sample3	sample1	2
+sample3	sample2	2
diff --git a/tests/data/distances/expected_dists-hash-missing.tsv b/tests/data/distances/expected_dists-hash-missing.tsv
index 3403cd7..2a86a67 100644
--- a/tests/data/distances/expected_dists-hash-missing.tsv
+++ b/tests/data/distances/expected_dists-hash-missing.tsv
@@ -1,4 +1,10 @@
-dists	sample1	sample2	sample3
-sample1	0	1	1
-sample2	1	0	2
-sample3	1	2	0
+query_id	ref_id	dist
+sample1	sample1	0
+sample1	sample2	1
+sample1	sample3	1
+sample2	sample2	0
+sample2	sample1	1
+sample2	sample3	2
+sample3	sample3	0
+sample3	sample1	1
+sample3	sample2	2
diff --git a/tests/data/distances/expected_dists-hash-more-missing-remove-sample.tsv b/tests/data/distances/expected_dists-hash-more-missing-remove-sample.tsv
index 0cc834e..9226d7d 100644
--- a/tests/data/distances/expected_dists-hash-more-missing-remove-sample.tsv
+++ b/tests/data/distances/expected_dists-hash-more-missing-remove-sample.tsv
@@ -1,3 +1,5 @@
-dists	sample1	sample2
-sample1	0	2
-sample2	2	0
+query_id	ref_id	dist
+sample1	sample1	0
+sample1	sample2	2
+sample2	sample2	0
+sample2	sample1	2
diff --git a/tests/data/distances/expected_dists-hash-more-missing.tsv b/tests/data/distances/expected_dists-hash-more-missing.tsv
index dae119c..cba7ad2 100644
--- a/tests/data/distances/expected_dists-hash-more-missing.tsv
+++ b/tests/data/distances/expected_dists-hash-more-missing.tsv
@@ -1,4 +1,10 @@
-dists	sample1	sample2	sample3
-sample1	0	2	3
-sample2	2	0	2
-sample3	3	2	0
+query_id	ref_id	dist
+sample1	sample1	0
+sample1	sample2	2
+sample1	sample3	3
+sample2	sample2	0
+sample2	sample1	2
+sample2	sample3	2
+sample3	sample3	0
+sample3	sample2	2
+sample3	sample1	3
diff --git a/tests/data/distances/expected_dists-hash-remove-missing-loci.tsv b/tests/data/distances/expected_dists-hash-remove-missing-loci.tsv
index 3403cd7..2a86a67 100644
--- a/tests/data/distances/expected_dists-hash-remove-missing-loci.tsv
+++ b/tests/data/distances/expected_dists-hash-remove-missing-loci.tsv
@@ -1,4 +1,10 @@
-dists	sample1	sample2	sample3
-sample1	0	1	1
-sample2	1	0	2
-sample3	1	2	0
+query_id	ref_id	dist
+sample1	sample1	0
+sample1	sample2	1
+sample1	sample3	1
+sample2	sample2	0
+sample2	sample1	1
+sample2	sample3	2
+sample3	sample3	0
+sample3	sample1	1
+sample3	sample2	2
diff --git a/tests/data/distances/expected_dists-mismatched-ids.tsv b/tests/data/distances/expected_dists-mismatched-ids.tsv
index 1a64f2b..a1311dc 100644
--- a/tests/data/distances/expected_dists-mismatched-ids.tsv
+++ b/tests/data/distances/expected_dists-mismatched-ids.tsv
@@ -1,4 +1,10 @@
-dists	sampleA	sampleB	sampleC
-sampleA	0.0	0.0	33.333333333333336
-sampleB	0.0	0.0	33.333333333333336
-sampleC	33.333333333333336	33.333333333333336	0.0
+query_id	ref_id	dist
+sampleA	sampleA	0.0
+sampleA	sampleB	0.0
+sampleA	sampleC	33.333333333333336
+sampleB	sampleA	0.0
+sampleB	sampleB	0.0
+sampleB	sampleC	33.333333333333336
+sampleC	sampleC	0.0
+sampleC	sampleA	33.333333333333336
+sampleC	sampleB	33.333333333333336
diff --git a/tests/data/distances/expected_dists-partial-mismatched-ids.tsv b/tests/data/distances/expected_dists-partial-mismatched-ids.tsv
index e7b7940..a68155f 100644
--- a/tests/data/distances/expected_dists-partial-mismatched-ids.tsv
+++ b/tests/data/distances/expected_dists-partial-mismatched-ids.tsv
@@ -1,4 +1,10 @@
-dists	sampleA	sampleB	sample3
-sampleA	0.0	0.0	33.333333333333336
-sampleB	0.0	0.0	33.333333333333336
-sample3	33.333333333333336	33.333333333333336	0.0
+query_id	ref_id	dist
+sampleA	sampleA	0.0
+sampleA	sampleB	0.0
+sampleA	sample3	33.333333333333336
+sampleB	sampleA	0.0
+sampleB	sampleB	0.0
+sampleB	sample3	33.333333333333336
+sample3	sample3	0.0
+sample3	sampleA	33.333333333333336
+sample3	sampleB	33.333333333333336
diff --git a/tests/data/distances/expected_dists.tsv b/tests/data/distances/expected_dists.tsv
index 00e9cec..4fc6d6f 100644
--- a/tests/data/distances/expected_dists.tsv
+++ b/tests/data/distances/expected_dists.tsv
@@ -1,4 +1,10 @@
-dists	sample1	sample2	sample3
-sample1	0.0	0.0	33.333333333333336
-sample2	0.0	0.0	33.333333333333336
-sample3	33.333333333333336	33.333333333333336	0.0
+query_id	ref_id	dist
+sample1	sample1	0.0
+sample1	sample2	0.0
+sample1	sample3	33.333333333333336
+sample2	sample1	0.0
+sample2	sample2	0.0
+sample2	sample3	33.333333333333336
+sample3	sample3	0.0
+sample3	sample1	33.333333333333336
+sample3	sample2	33.333333333333336

From 4c18be30fbc6ee6c2e50fb7465cbd86df9062c3d Mon Sep 17 00:00:00 2001
From: Eric
Date: Thu, 12 Dec 2024 11:57:48 -0600
Subject: [PATCH 12/12] xlsx, float threshold

---
 bin/process_output.py                | 14 +++++++++-----
 modules/local/process_output/main.nf |  9 +++++----
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/bin/process_output.py b/bin/process_output.py
index fe7b452..345894c 100755
--- a/bin/process_output.py
+++ b/bin/process_output.py
@@ -36,7 +36,7 @@ def main(argv=None):
         "--threshold",
         action="store",
         dest="threshold",
-        type=int,
+        type=float,
         help="distance threshold to be included in output",
         default=None,
         required=True,
@@ -47,7 +47,7 @@ def main(argv=None):
         action="store",
         dest="output",
         type=str,
-        help="output in query-reference format",
+        help="output prefix (without extension)",
         default=None,
         required=True,
     )
@@ -55,14 +55,18 @@ def main(argv=None):
     args = parser.parse_args(argv)
 
     input = Path(args.input)
-    output = Path(args.output)
+    tsv_path = Path(args.output + ".tsv")
+    excel_path = Path(args.output + ".xlsx")
     threshold = args.threshold
 
     data = pd.read_csv(input, sep="\t")
     data = data[data['Distance'] <= threshold]
-    data.to_csv(output, sep="\t", index=False)
+    data.to_csv(tsv_path, sep="\t", index=False)
+    data.to_excel(excel_path)
 
-    print(f"Output written to [{output}]")
+    print("Output written to:")
+    print(tsv_path)
+    print(excel_path)
 
     return 0
diff --git a/modules/local/process_output/main.nf b/modules/local/process_output/main.nf
index f85f73f..3b8ec20 100644
--- a/modules/local/process_output/main.nf
+++ b/modules/local/process_output/main.nf
@@ -2,16 +2,17 @@ process PROCESS_OUTPUT {
     label 'process_single'
 
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/pandas:2.2.1' :
-        'biocontainers/pandas:2.2.1' }"
+        'https://depot.galaxyproject.org/singularity/staramr:0.10.0--pyhdfd78af_0':
+        'biocontainers/staramr:0.10.0--pyhdfd78af_0' }"
 
     input:
     path distances
     val threshold
 
     output:
-    path "results.tsv", emit: results
+    path "results.tsv", emit: tsv
+    path "results.xlsx", emit: excel
     path "versions.yml", emit: versions
 
     when:
     task.ext.when == null || task.ext.when
 
     script:
     def args = task.ext.args ?: ''
     """
     process_output.py \\
         $args \\
         --input $distances \\
-        --output results.tsv \\
+        --output results \\
         --threshold $threshold
 
     cat <<-END_VERSIONS > versions.yml