From 4448ac1186937beab9ffba8edc5c8a7098432d8d Mon Sep 17 00:00:00 2001
From: Eric
Date: Wed, 11 Dec 2024 11:20:43 -0600
Subject: [PATCH 01/12] Very basic output framework.

---
 bin/process_output.py                | 61 ++++++++++++++++++++++++++++
 modules/local/process_output/main.nf | 31 ++++++++++++++
 workflows/fastmatchirida.nf          |  4 ++
 3 files changed, 96 insertions(+)
 create mode 100755 bin/process_output.py
 create mode 100644 modules/local/process_output/main.nf

diff --git a/bin/process_output.py b/bin/process_output.py
new file mode 100755
index 0000000..e0243de
--- /dev/null
+++ b/bin/process_output.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+
+from pathlib import Path
+from mimetypes import guess_type
+from functools import partial
+import gzip
+import sys
+import argparse
+
+
+def get_open(f):
+    if "gzip" == guess_type(str(f))[1]:
+        return partial(gzip.open)
+    else:
+        return open
+
+def main(argv=None):
+    parser = argparse.ArgumentParser(
+        description="Parses a profile_dists distances to create query-reference-format output for the FastMatch pipeline.",
+        epilog="Example: python process_output.py --input matrix.csv",
+    )
+    parser.add_argument(
+        "--input",
+        action="store",
+        dest="input",
+        type=str,
+        help="profile_dists-generated distance matrix",
+        default=None,
+        required=True,
+    )
+    parser.add_argument(
+        "--output",
+        action="store",
+        dest="output",
+        type=str,
+        help="output in query-reference format",
+        default=None,
+        required=True,
+    )
+
+    args = parser.parse_args(argv)
+
+    input = Path(args.input)
+    output = Path(args.output)
+
+    headers = ["query", "reference", "distance"]
+    results = [["A", "B", "1"], ["C", "D", "2"], ["E", "F", "3"]]
+
+    with open(output, "w") as output_file:
+        output_file.write((",").join(headers) + "\n")
+
+        for line in results:
+            output_file.write((",").join(line) + "\n")
+
+    print(f"Output written to [{output}]")
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/modules/local/process_output/main.nf b/modules/local/process_output/main.nf
new file mode 100644
index 0000000..a47992a
--- /dev/null
+++ b/modules/local/process_output/main.nf
@@ -0,0 +1,31 @@
+process PROCESS_OUTPUT {
+    label 'process_single'
+
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/python:3.10' :
+        'biocontainers/python:3.10' }"
+
+    input:
+    path(distances)
+
+    output:
+    path("results.csv"), emit: results
+    path "versions.yml", emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    """
+    process_output.py \\
+        $args \\
+        --input $distances \\
+        --output results.csv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        process_outout : 0.1.0
+    END_VERSIONS
+    """
+}
\ No newline at end of file
diff --git a/workflows/fastmatchirida.nf b/workflows/fastmatchirida.nf
index 82c3a14..1fa5009 100644
--- a/workflows/fastmatchirida.nf
+++ b/workflows/fastmatchirida.nf
@@ -29,6 +29,7 @@ Workflowfastmatchirida.initialise(params, log)
 include { LOCIDEX_MERGE } from '../modules/local/locidex/merge/main'
 include { PROFILE_DISTS } from '../modules/local/profile_dists/main'
 include { INPUT_ASSURE } from "../modules/local/input_assure/main"
+include { PROCESS_OUTPUT } from "../modules/local/process_output/main"
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -152,6 +153,9 @@ workflow FASTMATCH {
 
     distances = PROFILE_DISTS(merged.combined_profiles, mapping_format, mapping_file, columns_file)
     ch_versions = ch_versions.mix(distances.versions)
+    processed_output = PROCESS_OUTPUT(distances.results)
+    ch_versions = ch_versions.mix(processed_output.versions)
+
     CUSTOM_DUMPSOFTWAREVERSIONS (
         ch_versions.unique().collectFile(name: 'collated_versions.yml')
     )

From d579c352c1a8c842cfee43340fa49f8d02e86516 Mon Sep 17 00:00:00 2001
From: Eric
Date: Wed, 11 Dec 2024 11:53:08 -0600
Subject: [PATCH 02/12] pairwise output format, appending metadata

---
 modules/local/append_metadata/main.nf | 47 +++++++++++++++++++++++++++
 nextflow.config                       |  1 -
 workflows/fastmatchirida.nf           |  9 +++--
 3 files changed, 54 insertions(+), 3 deletions(-)
 create mode 100644 modules/local/append_metadata/main.nf

diff --git a/modules/local/append_metadata/main.nf b/modules/local/append_metadata/main.nf
new file mode 100644
index 0000000..62a2790
--- /dev/null
+++ b/modules/local/append_metadata/main.nf
@@ -0,0 +1,47 @@
+process APPEND_METADATA {
+    tag "append_metadata"
+    label 'process_single'
+
+    input:
+    val distances_path // cluster data as a TSV path
+                       // this needs to be "val", because "path"
+                       // won't stage the file correctly for exec
+    val metadata_rows // metadata rows (no headers) to be appened, list of lists
+    val metadata_headers // headers to name the metadata columns
+
+    output:
+    path("distances_and_metadata.tsv"), emit: distances
+
+    exec:
+    def distances_rows // has a header row
+    def metadata_rows_map = [:]
+    def merged = []
+
+    distances_path.withReader { reader ->
+        distances_rows = reader.readLines()*.split('\t')
+    }
+
+    // Create a map of the metadata rows:
+    // Start on i = 0 because there are no headers included.
+    for(int i = 0; i < metadata_rows.size(); i++)
+    {
+        // "sample" -> ["sample", meta1, meta2, meta3, ...]
+        metadata_rows_map[metadata_rows[i][0]] = metadata_rows[i]
+    }
+
+    // Merge the headers
+    merged.add(distances_rows[0] + metadata_headers)
+
+    // Merge the remaining rows in original order:
+    // Start on i = 1 because we don't want the headers.
+    for(int i = 1; i < distances_rows.size(); i++)
+    {
+        def sample_key = distances_rows[i][1] // We want ref ID (second column)
+        merged.add(distances_rows[i] + metadata_rows_map[sample_key][1..-1])
+    }
+
+    task.workDir.resolve("distances_and_metadata.tsv").withWriter { writer ->
+        merged.each { writer.writeLine it.join("\t") }
+    }
+
+}
diff --git a/nextflow.config b/nextflow.config
index 45cf222..ae2c268 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -44,7 +44,6 @@ params {
     validate_params = true
 
     // Profile dists args
-    pd_outfmt = "matrix"
     pd_distm = "hamming"
     pd_missing_threshold = 1.0
     pd_sample_quality_threshold = 1.0
diff --git a/workflows/fastmatchirida.nf b/workflows/fastmatchirida.nf
index 1fa5009..1c76864 100644
--- a/workflows/fastmatchirida.nf
+++ b/workflows/fastmatchirida.nf
@@ -30,6 +30,7 @@ include { LOCIDEX_MERGE } from '../modules/local/locidex/merge/main'
 include { PROFILE_DISTS } from '../modules/local/profile_dists/main'
 include { INPUT_ASSURE } from "../modules/local/input_assure/main"
 include { PROCESS_OUTPUT } from "../modules/local/process_output/main"
+include { APPEND_METADATA } from "../modules/local/append_metadata/main"
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -148,12 +149,16 @@ workflow FASTMATCH {
     }
 
     // Options related to profile dists
-    mapping_format = Channel.value(params.pd_outfmt)
+    mapping_format = Channel.value("pairwise")
 
     distances = PROFILE_DISTS(merged.combined_profiles, mapping_format, mapping_file, columns_file)
     ch_versions = ch_versions.mix(distances.versions)
-    processed_output = PROCESS_OUTPUT(distances.results)
+    // Append metadata to references:
+    distances_metadata = APPEND_METADATA(distances.results, metadata_rows, metadata_headers)
+
+    // Process the output:
+    processed_output = PROCESS_OUTPUT(distances_metadata.distances)
     ch_versions = ch_versions.mix(processed_output.versions)
 
     CUSTOM_DUMPSOFTWAREVERSIONS (
         ch_versions.unique().collectFile(name: 'collated_versions.yml')
     )

From 6ee63581008bdd3d4514aa043ba3197c60a34198 Mon Sep 17 00:00:00 2001
From: Eric
Date: Wed, 11 Dec 2024 12:07:48 -0600
Subject: [PATCH 03/12] Cleaning up, process_output.py outputs the input for now.

---
 bin/process_output.py                 | 11 ++++-------
 modules/local/append_metadata/main.nf |  4 ++--
 modules/local/process_output/main.nf  |  4 ++--
 3 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/bin/process_output.py b/bin/process_output.py
index e0243de..bba7f07 100755
--- a/bin/process_output.py
+++ b/bin/process_output.py
@@ -43,14 +43,11 @@ def main(argv=None):
     input = Path(args.input)
     output = Path(args.output)
 
-    headers = ["query", "reference", "distance"]
-    results = [["A", "B", "1"], ["C", "D", "2"], ["E", "F", "3"]]
+    with open(output, "w") as output_file, \
+        open(input, "r") as input_file:
 
-    with open(output, "w") as output_file:
-        output_file.write((",").join(headers) + "\n")
-
-        for line in results:
-            output_file.write((",").join(line) + "\n")
+        for line in input_file:
+            output_file.write(line)
 
     print(f"Output written to [{output}]")
 
     return 0
diff --git a/modules/local/append_metadata/main.nf b/modules/local/append_metadata/main.nf
index 62a2790..583f284 100644
--- a/modules/local/append_metadata/main.nf
+++ b/modules/local/append_metadata/main.nf
@@ -1,9 +1,9 @@
 process APPEND_METADATA {
-    tag "append_metadata"
+    tag "Appends metadata to distances"
     label 'process_single'
 
     input:
-    val distances_path // cluster data as a TSV path
+    val distances_path // distance data as a TSV path
                        // this needs to be "val", because "path"
                        // won't stage the file correctly for exec
     val metadata_rows // metadata rows (no headers) to be appened, list of lists
     val metadata_headers // headers to name the metadata columns
 
     output:
     path("distances_and_metadata.tsv"), emit: distances
diff --git a/modules/local/process_output/main.nf b/modules/local/process_output/main.nf
index a47992a..50a6512 100644
--- a/modules/local/process_output/main.nf
+++ b/modules/local/process_output/main.nf
@@ -9,7 +9,7 @@ process PROCESS_OUTPUT {
     path(distances)
 
     output:
-    path("results.csv"), emit: results
+    path("results.tsv"), emit: results
     path "versions.yml", emit: versions
 
     when:
     task.ext.when == null || task.ext.when
@@ -21,7 +21,7 @@ process PROCESS_OUTPUT {
     process_output.py \\
         $args \\
        --input $distances \\
-        --output results.csv
+        --output results.tsv
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":

From 2f08dd1a56ef8adf83c870bc29c6f463dd0bff0e Mon Sep 17 00:00:00 2001
From: Eric
Date: Wed, 11 Dec 2024 14:44:34 -0600
Subject: [PATCH 04/12] Both sample names and sample/Irida IDs in output.

---
 modules/local/append_metadata/main.nf | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/modules/local/append_metadata/main.nf b/modules/local/append_metadata/main.nf
index 583f284..18711cd 100644
--- a/modules/local/append_metadata/main.nf
+++ b/modules/local/append_metadata/main.nf
@@ -15,6 +15,7 @@ process APPEND_METADATA {
     exec:
     def distances_rows // has a header row
     def metadata_rows_map = [:]
+    def sample_names_map = [:] // maps sample names to Irida IDs
     def merged = []
 
     distances_path.withReader { reader ->
         distances_rows = reader.readLines()*.split('\t')
     }
@@ -26,18 +27,31 @@ process APPEND_METADATA {
     // Create a map of the metadata rows:
     // Start on i = 0 because there are no headers included.
     for(int i = 0; i < metadata_rows.size(); i++)
     {
         // "sample" -> ["sample", meta1, meta2, meta3, ...]
-        metadata_rows_map[metadata_rows[i][0]] = metadata_rows[i]
+        sample_name = metadata_rows[i][0]
+        metadata_rows_map[sample_name] = metadata_rows[i]
+
+        // "sample" -> "Irida ID"
+        sample_names_map[sample_name] = metadata_rows[i][1]
     }
 
-    // Merge the headers
-    merged.add(distances_rows[0] + metadata_headers)
+    // Create the header row:
+    merged.add(["Query ID", "Query Sample Name", "Reference ID", "Reference Sample Name", "Disance"]
+        + metadata_headers[1..-1])
 
     // Merge the remaining rows in original order:
     // Start on i = 1 because we don't want the headers.
     for(int i = 1; i < distances_rows.size(); i++)
     {
-        def sample_key = distances_rows[i][1] // We want ref ID (second column)
-        merged.add(distances_rows[i] + metadata_rows_map[sample_key][1..-1])
+        query_sample_name = distances_rows[i][0]
+        query_irida_id = sample_names_map[query_sample_name]
+        reference_sample_name = distances_rows[i][1]
+        reference_irida_id = sample_names_map[reference_sample_name]
+        distance = distances_rows[i][2]
+
+        merged_row = [query_irida_id, query_sample_name, reference_irida_id, reference_sample_name, distance] \
+            + metadata_rows_map[reference_sample_name][2..-1]
+
+        merged.add(merged_row)
     }

From 2121cfd02c479dbbd55ff92210c45264450281b5 Mon Sep 17 00:00:00 2001
From: Eric
Date: Wed, 11 Dec 2024 16:07:02 -0600
Subject: [PATCH 05/12] Distance threshold.

---
 bin/process_output.py                 | 23 ++++++++++++++++++-----
 modules/local/append_metadata/main.nf |  2 +-
 modules/local/process_output/main.nf  |  7 ++++---
 3 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/bin/process_output.py b/bin/process_output.py
index bba7f07..c8288bb 100755
--- a/bin/process_output.py
+++ b/bin/process_output.py
@@ -6,6 +6,7 @@
 import gzip
 import sys
 import argparse
+import pandas as pd
 
 
 def get_open(f):
@@ -15,10 +16,12 @@ def get_open(f):
         return open
 
 def main(argv=None):
+
     parser = argparse.ArgumentParser(
         description="Parses a profile_dists distances to create query-reference-format output for the FastMatch pipeline.",
         epilog="Example: python process_output.py --input matrix.csv",
     )
+
     parser.add_argument(
         "--input",
         action="store",
@@ -28,6 +31,17 @@ def main(argv=None):
         default=None,
         required=True,
     )
+
+    parser.add_argument(
+        "--threshold",
+        action="store",
+        dest="threshold",
+        type=int,
+        help="distance threshold to be included in output",
+        default=None,
+        required=True,
+    )
+
     parser.add_argument(
         "--output",
         action="store",
@@ -42,12 +56,11 @@ def main(argv=None):
     args = parser.parse_args(argv)
 
     input = Path(args.input)
     output = Path(args.output)
+    threshold = args.threshold
 
-    with open(output, "w") as output_file, \
-        open(input, "r") as input_file:
-
-        for line in input_file:
-            output_file.write(line)
+    data = pd.read_csv(input, sep="\t")
+    data = data[data['Distance'] <= threshold]
+    data.to_csv(output, sep="\t", index=False)
 
     print(f"Output written to [{output}]")
diff --git a/modules/local/append_metadata/main.nf b/modules/local/append_metadata/main.nf
index 18711cd..a871e6f 100644
--- a/modules/local/append_metadata/main.nf
+++ b/modules/local/append_metadata/main.nf
@@ -35,7 +35,7 @@ process APPEND_METADATA {
     }
 
     // Create the header row:
-    merged.add(["Query ID", "Query Sample Name", "Reference ID", "Reference Sample Name", "Disance"]
+    merged.add(["Query ID", "Query Sample Name", "Reference ID", "Reference Sample Name", "Distance"]
         + metadata_headers[1..-1])
 
     // Merge the remaining rows in original order:
diff --git a/modules/local/process_output/main.nf b/modules/local/process_output/main.nf
index 50a6512..dd22dbc 100644
--- a/modules/local/process_output/main.nf
+++ b/modules/local/process_output/main.nf
@@ -2,8 +2,8 @@ process PROCESS_OUTPUT {
     label 'process_single'
 
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/python:3.10' :
-        'biocontainers/python:3.10' }"
+        'https://depot.galaxyproject.org/singularity/pandas:2.2.1' :
+        'biocontainers/pandas' }"
 
     input:
     path(distances)
@@ -21,7 +21,8 @@ process PROCESS_OUTPUT {
     process_output.py \\
         $args \\
         --input $distances \\
-        --output results.tsv
+        --output results.tsv \\
+        --threshold 0
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":

From 4a2138c0cd50d8a3feb47fbb11d781f91a9df973 Mon Sep 17 00:00:00 2001
From: Eric
Date: Wed, 11 Dec 2024 16:15:27 -0600
Subject: [PATCH 06/12] Removing profile dist output format from schema.

---
 nextflow_schema.json | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/nextflow_schema.json b/nextflow_schema.json
index 10bf5d5..dcd7c07 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -110,13 +110,6 @@
             "description": "Parameters for profile_dists distance calculations",
             "default": "",
             "properties": {
-                "pd_outfmt": {
-                    "type": "string",
-                    "description": "The output format for distances",
-                    "enum": ["matrix"],
-                    "default": "matrix",
-                    "hidden": true
-                },
                 "pd_distm": {
                     "type": "string",
                     "description": "The distance method/unit",

From 34b83736ce98d5cb041ea42f58d3461c1d69ce21 Mon Sep 17 00:00:00 2001
From: Eric
Date: Wed, 11 Dec 2024 16:17:54 -0600
Subject: [PATCH 07/12] threshold as a parameter

---
 modules/local/process_output/main.nf | 7 ++++---
 workflows/fastmatchirida.nf          | 2 +-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/modules/local/process_output/main.nf b/modules/local/process_output/main.nf
index dd22dbc..b8c2dbf 100644
--- a/modules/local/process_output/main.nf
+++ b/modules/local/process_output/main.nf
@@ -6,10 +6,11 @@ process PROCESS_OUTPUT {
         'biocontainers/pandas' }"
 
     input:
-    path(distances)
+    path distances
+    val threshold
 
     output:
-    path("results.tsv"), emit: results
+    path "results.tsv", emit: results
     path "versions.yml", emit: versions
 
     when:
@@ -22,7 +23,7 @@ process PROCESS_OUTPUT {
         $args \\
         --input $distances \\
         --output results.tsv \\
-        --threshold 0
+        --threshold $threshold
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
diff --git a/workflows/fastmatchirida.nf b/workflows/fastmatchirida.nf
index 1c76864..55c2301 100644
--- a/workflows/fastmatchirida.nf
+++ b/workflows/fastmatchirida.nf
@@ -158,7 +158,7 @@ workflow FASTMATCH {
     distances_metadata = APPEND_METADATA(distances.results, metadata_rows, metadata_headers)
 
     // Process the output:
-    processed_output = PROCESS_OUTPUT(distances_metadata.distances)
+    processed_output = PROCESS_OUTPUT(distances_metadata.distances, 0)
     ch_versions = ch_versions.mix(processed_output.versions)
 
     CUSTOM_DUMPSOFTWAREVERSIONS (

From eb173c22a90a08d886b11301de584604c0ca73d9 Mon Sep 17 00:00:00 2001
From: Eric
Date: Wed, 11 Dec 2024 16:22:25 -0600
Subject: [PATCH 08/12] Newline at end of file.

---
 modules/local/process_output/main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/local/process_output/main.nf b/modules/local/process_output/main.nf
index b8c2dbf..5527ffe 100644
--- a/modules/local/process_output/main.nf
+++ b/modules/local/process_output/main.nf
@@ -30,4 +30,4 @@ process PROCESS_OUTPUT {
         process_outout : 0.1.0
     END_VERSIONS
     """
-}
\ No newline at end of file
+}

From 8ade704e6e87b274445e70f3376751edd4de0dda Mon Sep 17 00:00:00 2001
From: Eric
Date: Wed, 11 Dec 2024 16:25:52 -0600
Subject: [PATCH 09/12] Updated script example.

---
 bin/process_output.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/process_output.py b/bin/process_output.py
index c8288bb..fe7b452 100755
--- a/bin/process_output.py
+++ b/bin/process_output.py
@@ -19,7 +19,7 @@ def main(argv=None):
 
     parser = argparse.ArgumentParser(
         description="Parses a profile_dists distances to create query-reference-format output for the FastMatch pipeline.",
-        epilog="Example: python process_output.py --input matrix.csv",
+        epilog="Example: python process_output.py --input distances.tsv --output results.tsv --threshold 10",
     )
 
     parser.add_argument(

From 75013de2ecbe6b0b4acffb285ee8c3522b872cbc Mon Sep 17 00:00:00 2001
From: Eric
Date: Wed, 11 Dec 2024 16:38:43 -0600
Subject: [PATCH 10/12] Updating container

---
 modules/local/process_output/main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/local/process_output/main.nf b/modules/local/process_output/main.nf
index 5527ffe..f85f73f 100644
--- a/modules/local/process_output/main.nf
+++ b/modules/local/process_output/main.nf
@@ -3,7 +3,7 @@ process PROCESS_OUTPUT {
 
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/pandas:2.2.1' :
-        'biocontainers/pandas' }"
+        'biocontainers/pandas:2.2.1' }"
 
     input:
     path distances

From 0699fcadbde2fe25caf5187ad00464c88515ae22 Mon Sep 17 00:00:00 2001
From: Eric
Date: Thu, 12 Dec 2024 10:01:31 -0600
Subject: [PATCH 11/12] Updating test data after switching to pairwise output.

---
 tests/data/distances/expected_dists-hamming.tsv     | 14 ++++++++++----
 .../expected_dists-hash-keep-one-loci.tsv           | 14 ++++++++++----
 .../expected_dists-hash-missing-count-missing.tsv   | 14 ++++++++++----
 .../data/distances/expected_dists-hash-missing.tsv  | 14 ++++++++++----
 ...ected_dists-hash-more-missing-remove-sample.tsv  |  8 +++++---
 .../distances/expected_dists-hash-more-missing.tsv  | 14 ++++++++++----
 .../expected_dists-hash-remove-missing-loci.tsv     | 14 ++++++++++----
 .../distances/expected_dists-mismatched-ids.tsv     | 14 ++++++++++----
 .../expected_dists-partial-mismatched-ids.tsv       | 14 ++++++++++----
 tests/data/distances/expected_dists.tsv             | 14 ++++++++++----
 10 files changed, 95 insertions(+), 39 deletions(-)

diff --git a/tests/data/distances/expected_dists-hamming.tsv b/tests/data/distances/expected_dists-hamming.tsv
index 2b09f4c..3e8daca 100644
--- a/tests/data/distances/expected_dists-hamming.tsv
+++ b/tests/data/distances/expected_dists-hamming.tsv
@@ -1,4 +1,10 @@
-dists	sample1	sample2	sample3
-sample1	0	1	2
-sample2	1	0	3
-sample3	2	3	0
+query_id	ref_id	dist
+sample1	sample1	0
+sample1	sample2	1
+sample1	sample3	2
+sample2	sample2	0
+sample2	sample1	1
+sample2	sample3	3
+sample3	sample3	0
+sample3	sample1	2
+sample3	sample2	3
diff --git a/tests/data/distances/expected_dists-hash-keep-one-loci.tsv b/tests/data/distances/expected_dists-hash-keep-one-loci.tsv
index 254033a..535e1de 100644
--- a/tests/data/distances/expected_dists-hash-keep-one-loci.tsv
+++ b/tests/data/distances/expected_dists-hash-keep-one-loci.tsv
@@ -1,4 +1,10 @@
-dists	sample1	sample2	sample3
-sample1	0	1	0
-sample2	1	0	1
-sample3	0	1	0
+query_id	ref_id	dist
+sample1	sample1	0
+sample1	sample3	0
+sample1	sample2	1
+sample2	sample2	0
+sample2	sample1	1
+sample2	sample3	1
+sample3	sample1	0
+sample3	sample3	0
+sample3	sample2	1
diff --git a/tests/data/distances/expected_dists-hash-missing-count-missing.tsv b/tests/data/distances/expected_dists-hash-missing-count-missing.tsv
index 5ead144..b6d8e5f 100644
--- a/tests/data/distances/expected_dists-hash-missing-count-missing.tsv
+++ b/tests/data/distances/expected_dists-hash-missing-count-missing.tsv
@@ -1,4 +1,10 @@
-dists	sample1	sample2	sample3
-sample1	0	2	2
-sample2	2	0	2
-sample3	2	2	0
+query_id	ref_id	dist
+sample1	sample1	0
+sample1	sample2	2
+sample1	sample3	2
+sample2	sample2	0
+sample2	sample1	2
+sample2	sample3	2
+sample3	sample3	0
+sample3	sample1	2
+sample3	sample2	2
diff --git a/tests/data/distances/expected_dists-hash-missing.tsv b/tests/data/distances/expected_dists-hash-missing.tsv
index 3403cd7..2a86a67 100644
--- a/tests/data/distances/expected_dists-hash-missing.tsv
+++ b/tests/data/distances/expected_dists-hash-missing.tsv
@@ -1,4 +1,10 @@
-dists	sample1	sample2	sample3
-sample1	0	1	1
-sample2	1	0	2
-sample3	1	2	0
+query_id	ref_id	dist
+sample1	sample1	0
+sample1	sample2	1
+sample1	sample3	1
+sample2	sample2	0
+sample2	sample1	1
+sample2	sample3	2
+sample3	sample3	0
+sample3	sample1	1
+sample3	sample2	2
diff --git a/tests/data/distances/expected_dists-hash-more-missing-remove-sample.tsv b/tests/data/distances/expected_dists-hash-more-missing-remove-sample.tsv
index 0cc834e..9226d7d 100644
--- a/tests/data/distances/expected_dists-hash-more-missing-remove-sample.tsv
+++ b/tests/data/distances/expected_dists-hash-more-missing-remove-sample.tsv
@@ -1,3 +1,5 @@
-dists	sample1	sample2
-sample1	0	2
-sample2	2	0
+query_id	ref_id	dist
+sample1	sample1	0
+sample1	sample2	2
+sample2	sample2	0
+sample2	sample1	2
diff --git a/tests/data/distances/expected_dists-hash-more-missing.tsv b/tests/data/distances/expected_dists-hash-more-missing.tsv
index dae119c..cba7ad2 100644
--- a/tests/data/distances/expected_dists-hash-more-missing.tsv
+++ b/tests/data/distances/expected_dists-hash-more-missing.tsv
@@ -1,4 +1,10 @@
-dists	sample1	sample2	sample3
-sample1	0	2	3
-sample2	2	0	2
-sample3	3	2	0
+query_id	ref_id	dist
+sample1	sample1	0
+sample1	sample2	2
+sample1	sample3	3
+sample2	sample2	0
+sample2	sample1	2
+sample2	sample3	2
+sample3	sample3	0
+sample3	sample2	2
+sample3	sample1	3
diff --git a/tests/data/distances/expected_dists-hash-remove-missing-loci.tsv b/tests/data/distances/expected_dists-hash-remove-missing-loci.tsv
index 3403cd7..2a86a67 100644
--- a/tests/data/distances/expected_dists-hash-remove-missing-loci.tsv
+++ b/tests/data/distances/expected_dists-hash-remove-missing-loci.tsv
@@ -1,4 +1,10 @@
-dists	sample1	sample2	sample3
-sample1	0	1	1
-sample2	1	0	2
-sample3	1	2	0
+query_id	ref_id	dist
+sample1	sample1	0
+sample1	sample2	1
+sample1	sample3	1
+sample2	sample2	0
+sample2	sample1	1
+sample2	sample3	2
+sample3	sample3	0
+sample3	sample1	1
+sample3	sample2	2
diff --git a/tests/data/distances/expected_dists-mismatched-ids.tsv b/tests/data/distances/expected_dists-mismatched-ids.tsv
index 1a64f2b..a1311dc 100644
--- a/tests/data/distances/expected_dists-mismatched-ids.tsv
+++ b/tests/data/distances/expected_dists-mismatched-ids.tsv
@@ -1,4 +1,10 @@
-dists	sampleA	sampleB	sampleC
-sampleA	0.0	0.0	33.333333333333336
-sampleB	0.0	0.0	33.333333333333336
-sampleC	33.333333333333336	33.333333333333336	0.0
+query_id	ref_id	dist
+sampleA	sampleA	0.0
+sampleA	sampleB	0.0
+sampleA	sampleC	33.333333333333336
+sampleB	sampleA	0.0
+sampleB	sampleB	0.0
+sampleB	sampleC	33.333333333333336
+sampleC	sampleC	0.0
+sampleC	sampleA	33.333333333333336
+sampleC	sampleB	33.333333333333336
diff --git a/tests/data/distances/expected_dists-partial-mismatched-ids.tsv b/tests/data/distances/expected_dists-partial-mismatched-ids.tsv
index e7b7940..a68155f 100644
--- a/tests/data/distances/expected_dists-partial-mismatched-ids.tsv
+++ b/tests/data/distances/expected_dists-partial-mismatched-ids.tsv
@@ -1,4 +1,10 @@
-dists	sampleA	sampleB	sample3
-sampleA	0.0	0.0	33.333333333333336
-sampleB	0.0	0.0	33.333333333333336
-sample3	33.333333333333336	33.333333333333336	0.0
+query_id	ref_id	dist
+sampleA	sampleA	0.0
+sampleA	sampleB	0.0
+sampleA	sample3	33.333333333333336
+sampleB	sampleA	0.0
+sampleB	sampleB	0.0
+sampleB	sample3	33.333333333333336
+sample3	sample3	0.0
+sample3	sampleA	33.333333333333336
+sample3	sampleB	33.333333333333336
diff --git a/tests/data/distances/expected_dists.tsv b/tests/data/distances/expected_dists.tsv
index 00e9cec..4fc6d6f 100644
--- a/tests/data/distances/expected_dists.tsv
+++ b/tests/data/distances/expected_dists.tsv
@@ -1,4 +1,10 @@
-dists	sample1	sample2	sample3
-sample1	0.0	0.0	33.333333333333336
-sample2	0.0	0.0	33.333333333333336
-sample3	33.333333333333336	33.333333333333336	0.0
+query_id	ref_id	dist
+sample1	sample1	0.0
+sample1	sample2	0.0
+sample1	sample3	33.333333333333336
+sample2	sample1	0.0
+sample2	sample2	0.0
+sample2	sample3	33.333333333333336
+sample3	sample3	0.0
+sample3	sample1	33.333333333333336
+sample3	sample2	33.333333333333336

From 4c18be30fbc6ee6c2e50fb7465cbd86df9062c3d Mon Sep 17 00:00:00 2001
From: Eric
Date: Thu, 12 Dec 2024 11:57:48 -0600
Subject: [PATCH 12/12] xlsx, float threshold

---
 bin/process_output.py                | 14 +++++++++-----
 modules/local/process_output/main.nf |  9 +++++----
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/bin/process_output.py b/bin/process_output.py
index fe7b452..345894c 100755
--- a/bin/process_output.py
+++ b/bin/process_output.py
@@ -36,7 +36,7 @@ def main(argv=None):
         "--threshold",
         action="store",
         dest="threshold",
-        type=int,
+        type=float,
         help="distance threshold to be included in output",
         default=None,
         required=True,
@@ -47,7 +47,7 @@ def main(argv=None):
         action="store",
         dest="output",
         type=str,
-        help="output in query-reference format",
+        help="output prefix (without extension)",
         default=None,
         required=True,
     )
@@ -55,14 +55,18 @@ def main(argv=None):
     args = parser.parse_args(argv)
 
     input = Path(args.input)
-    output = Path(args.output)
+    tsv_path = Path(args.output + ".tsv")
+    excel_path = Path(args.output + ".xlsx")
     threshold = args.threshold
 
     data = pd.read_csv(input, sep="\t")
     data = data[data['Distance'] <= threshold]
-    data.to_csv(output, sep="\t", index=False)
+    data.to_csv(tsv_path, sep="\t", index=False)
+    data.to_excel(excel_path)
 
-    print(f"Output written to [{output}]")
+    print("Output written to:")
+    print(tsv_path)
+    print(excel_path)
 
     return 0
diff --git a/modules/local/process_output/main.nf b/modules/local/process_output/main.nf
index f85f73f..3b8ec20 100644
--- a/modules/local/process_output/main.nf
+++ b/modules/local/process_output/main.nf
@@ -2,16 +2,17 @@ process PROCESS_OUTPUT {
     label 'process_single'
 
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/pandas:2.2.1' :
-        'biocontainers/pandas:2.2.1' }"
+        'https://depot.galaxyproject.org/singularity/staramr:0.10.0--pyhdfd78af_0':
+        'biocontainers/staramr:0.10.0--pyhdfd78af_0' }"
 
     input:
     path distances
     val threshold
 
     output:
-    path "results.tsv", emit: results
+    path "results.tsv", emit: tsv
+    path "results.xlsx", emit: excel
     path "versions.yml", emit: versions
 
     when:
     task.ext.when == null || task.ext.when
 
     script:
     def args = task.ext.args ?: ''
     """
     process_output.py \\
         $args \\
         --input $distances \\
-        --output results.tsv \\
+        --output results \\
         --threshold $threshold
 
     cat <<-END_VERSIONS > versions.yml