diff --git a/bin/process_output.py b/bin/process_output.py
new file mode 100755
index 0000000..345894c
--- /dev/null
+++ b/bin/process_output.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python
+
+from pathlib import Path
+from mimetypes import guess_type
+from functools import partial
+import gzip
+import sys
+import argparse
+import pandas as pd
+
+
+def get_open(f):
+    if "gzip" == guess_type(str(f))[1]:
+        return partial(gzip.open)
+    else:
+        return open
+
+def main(argv=None):
+
+    parser = argparse.ArgumentParser(
+        description="Parses profile_dists distances to create query-reference-format output for the FastMatch pipeline.",
+        epilog="Example: python process_output.py --input distances.tsv --output results --threshold 10",
+    )
+
+    parser.add_argument(
+        "--input",
+        action="store",
+        dest="input",
+        type=str,
+        help="profile_dists-generated distance matrix",
+        default=None,
+        required=True,
+    )
+
+    parser.add_argument(
+        "--threshold",
+        action="store",
+        dest="threshold",
+        type=float,
+        help="distance threshold to be included in output",
+        default=None,
+        required=True,
+    )
+
+    parser.add_argument(
+        "--output",
+        action="store",
+        dest="output",
+        type=str,
+        help="output prefix (without extension)",
+        default=None,
+        required=True,
+    )
+
+    args = parser.parse_args(argv)
+
+    input = Path(args.input)
+    tsv_path = Path(args.output + ".tsv")
+    excel_path = Path(args.output + ".xlsx")
+    threshold = args.threshold
+
+    data = pd.read_csv(input, sep="\t")
+    data = data[data['Distance'] <= threshold]
+    data.to_csv(tsv_path, sep="\t", index=False)
+    data.to_excel(excel_path)
+
+    print("Output written to:")
+    print(tsv_path)
+    print(excel_path)
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/modules/local/append_metadata/main.nf b/modules/local/append_metadata/main.nf
new file mode 100644
index 0000000..a871e6f
--- /dev/null
+++ b/modules/local/append_metadata/main.nf
@@ -0,0 +1,61 @@
+process APPEND_METADATA {
+    tag "Appends metadata to distances"
+    label 'process_single'
+
+    input:
+    val distances_path   // distance data as a TSV path
+                         // this needs to be "val", because "path"
+                         // won't stage the file correctly for exec
+    val metadata_rows    // metadata rows (no headers) to be appended, list of lists
+    val metadata_headers // headers to name the metadata columns
+
+    output:
+    path("distances_and_metadata.tsv"), emit: distances
+
+    exec:
+    def distances_rows // has a header row
+    def metadata_rows_map = [:]
+    def sample_names_map = [:] // maps sample names to Irida IDs
+    def merged = []
+
+    distances_path.withReader { reader ->
+        distances_rows = reader.readLines()*.split('\t')
+    }
+
+    // Create a map of the metadata rows:
+    // Start on i = 0 because there are no headers included.
+    for(int i = 0; i < metadata_rows.size(); i++)
+    {
+        // "sample" -> ["sample", meta1, meta2, meta3, ...]
+        sample_name = metadata_rows[i][0]
+        metadata_rows_map[sample_name] = metadata_rows[i]
+
+        // "sample" -> "Irida ID"
+        sample_names_map[sample_name] = metadata_rows[i][1]
+    }
+
+    // Create the header row:
+    merged.add(["Query ID", "Query Sample Name", "Reference ID", "Reference Sample Name", "Distance"] +
+        metadata_headers[1..-1])
+
+    // Merge the remaining rows in original order:
+    // Start on i = 1 because we don't want the headers.
+    for(int i = 1; i < distances_rows.size(); i++)
+    {
+        query_sample_name = distances_rows[i][0]
+        query_irida_id = sample_names_map[query_sample_name]
+        reference_sample_name = distances_rows[i][1]
+        reference_irida_id = sample_names_map[reference_sample_name]
+        distance = distances_rows[i][2]
+
+        merged_row = [query_irida_id, query_sample_name, reference_irida_id, reference_sample_name, distance] \
+            + metadata_rows_map[reference_sample_name][2..-1]
+
+        merged.add(merged_row)
+    }
+
+    task.workDir.resolve("distances_and_metadata.tsv").withWriter { writer ->
+        merged.each { writer.writeLine it.join("\t") }
+    }
+
+}
diff --git a/modules/local/process_output/main.nf b/modules/local/process_output/main.nf
new file mode 100644
index 0000000..3b8ec20
--- /dev/null
+++ b/modules/local/process_output/main.nf
@@ -0,0 +1,34 @@
+process PROCESS_OUTPUT {
+    label 'process_single'
+
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/staramr:0.10.0--pyhdfd78af_0':
+        'biocontainers/staramr:0.10.0--pyhdfd78af_0' }"
+
+    input:
+    path distances
+    val threshold
+
+    output:
+    path "results.tsv", emit: tsv
+    path "results.xlsx", emit: excel
+    path "versions.yml", emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    """
+    process_output.py \\
+        $args \\
+        --input $distances \\
+        --output results \\
+        --threshold $threshold
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        process_output: 0.1.0
+    END_VERSIONS
+    """
+}
diff --git a/nextflow.config b/nextflow.config
index 45cf222..ae2c268 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -44,7 +44,6 @@ params {
     validate_params = true
 
     // Profile dists args
-    pd_outfmt = "matrix"
     pd_distm = "hamming"
     pd_missing_threshold = 1.0
     pd_sample_quality_threshold = 1.0
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 10bf5d5..dcd7c07 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -110,13 +110,6 @@
             "description": "Parameters for profile_dists distance calculations",
             "default": "",
             "properties": {
-                "pd_outfmt": {
-                    "type": "string",
-                    "description": "The output format for distances",
-                    "enum": ["matrix"],
-                    "default": "matrix",
-                    "hidden": true
-                },
                 "pd_distm": {
                     "type": "string",
                     "description": "The distance method/unit",
diff --git a/tests/data/distances/expected_dists-hamming.tsv b/tests/data/distances/expected_dists-hamming.tsv
index 2b09f4c..3e8daca 100644
--- a/tests/data/distances/expected_dists-hamming.tsv
+++ b/tests/data/distances/expected_dists-hamming.tsv
@@ -1,4 +1,10 @@
-dists	sample1	sample2	sample3
-sample1	0	1	2
-sample2	1	0	3
-sample3	2	3	0
+query_id	ref_id	dist
+sample1	sample1	0
+sample1	sample2	1
+sample1	sample3	2
+sample2	sample2	0
+sample2	sample1	1
+sample2	sample3	3
+sample3	sample3	0
+sample3	sample1	2
+sample3	sample2	3
diff --git a/tests/data/distances/expected_dists-hash-keep-one-loci.tsv b/tests/data/distances/expected_dists-hash-keep-one-loci.tsv
index 254033a..535e1de 100644
--- a/tests/data/distances/expected_dists-hash-keep-one-loci.tsv
+++ b/tests/data/distances/expected_dists-hash-keep-one-loci.tsv
@@ -1,4 +1,10 @@
-dists	sample1	sample2	sample3
-sample1	0	1	0
-sample2	1	0	1
-sample3	0	1	0
+query_id	ref_id	dist
+sample1	sample1	0
+sample1	sample3	0
+sample1	sample2	1
+sample2	sample2	0
+sample2	sample1	1
+sample2	sample3	1
+sample3	sample1	0
+sample3	sample3	0
+sample3	sample2	1
diff --git a/tests/data/distances/expected_dists-hash-missing-count-missing.tsv b/tests/data/distances/expected_dists-hash-missing-count-missing.tsv
index 5ead144..b6d8e5f 100644
--- a/tests/data/distances/expected_dists-hash-missing-count-missing.tsv
+++ b/tests/data/distances/expected_dists-hash-missing-count-missing.tsv
@@ -1,4 +1,10 @@
-dists	sample1	sample2	sample3
-sample1	0	2	2
-sample2	2	0	2
-sample3	2	2	0
+query_id	ref_id	dist
+sample1	sample1	0
+sample1	sample2	2
+sample1	sample3	2
+sample2	sample2	0
+sample2	sample1	2
+sample2	sample3	2
+sample3	sample3	0
+sample3	sample1	2
+sample3	sample2	2
diff --git a/tests/data/distances/expected_dists-hash-missing.tsv b/tests/data/distances/expected_dists-hash-missing.tsv
index 3403cd7..2a86a67 100644
--- a/tests/data/distances/expected_dists-hash-missing.tsv
+++ b/tests/data/distances/expected_dists-hash-missing.tsv
@@ -1,4 +1,10 @@
-dists	sample1	sample2	sample3
-sample1	0	1	1
-sample2	1	0	2
-sample3	1	2	0
+query_id	ref_id	dist
+sample1	sample1	0
+sample1	sample2	1
+sample1	sample3	1
+sample2	sample2	0
+sample2	sample1	1
+sample2	sample3	2
+sample3	sample3	0
+sample3	sample1	1
+sample3	sample2	2
diff --git a/tests/data/distances/expected_dists-hash-more-missing-remove-sample.tsv b/tests/data/distances/expected_dists-hash-more-missing-remove-sample.tsv
index 0cc834e..9226d7d 100644
--- a/tests/data/distances/expected_dists-hash-more-missing-remove-sample.tsv
+++ b/tests/data/distances/expected_dists-hash-more-missing-remove-sample.tsv
@@ -1,3 +1,5 @@
-dists	sample1	sample2
-sample1	0	2
-sample2	2	0
+query_id	ref_id	dist
+sample1	sample1	0
+sample1	sample2	2
+sample2	sample2	0
+sample2	sample1	2
diff --git a/tests/data/distances/expected_dists-hash-more-missing.tsv b/tests/data/distances/expected_dists-hash-more-missing.tsv
index dae119c..cba7ad2 100644
--- a/tests/data/distances/expected_dists-hash-more-missing.tsv
+++ b/tests/data/distances/expected_dists-hash-more-missing.tsv
@@ -1,4 +1,10 @@
-dists	sample1	sample2	sample3
-sample1	0	2	3
-sample2	2	0	2
-sample3	3	2	0
+query_id	ref_id	dist
+sample1	sample1	0
+sample1	sample2	2
+sample1	sample3	3
+sample2	sample2	0
+sample2	sample1	2
+sample2	sample3	2
+sample3	sample3	0
+sample3	sample2	2
+sample3	sample1	3
diff --git a/tests/data/distances/expected_dists-hash-remove-missing-loci.tsv b/tests/data/distances/expected_dists-hash-remove-missing-loci.tsv
index 3403cd7..2a86a67 100644
--- a/tests/data/distances/expected_dists-hash-remove-missing-loci.tsv
+++ b/tests/data/distances/expected_dists-hash-remove-missing-loci.tsv
@@ -1,4 +1,10 @@
-dists	sample1	sample2	sample3
-sample1	0	1	1
-sample2	1	0	2
-sample3	1	2	0
+query_id	ref_id	dist
+sample1	sample1	0
+sample1	sample2	1
+sample1	sample3	1
+sample2	sample2	0
+sample2	sample1	1
+sample2	sample3	2
+sample3	sample3	0
+sample3	sample1	1
+sample3	sample2	2
diff --git a/tests/data/distances/expected_dists-mismatched-ids.tsv b/tests/data/distances/expected_dists-mismatched-ids.tsv
index 1a64f2b..a1311dc 100644
--- a/tests/data/distances/expected_dists-mismatched-ids.tsv
+++ b/tests/data/distances/expected_dists-mismatched-ids.tsv
@@ -1,4 +1,10 @@
-dists	sampleA	sampleB	sampleC
-sampleA	0.0	0.0	33.333333333333336
-sampleB	0.0	0.0	33.333333333333336
-sampleC	33.333333333333336	33.333333333333336	0.0
+query_id	ref_id	dist
+sampleA	sampleA	0.0
+sampleA	sampleB	0.0
+sampleA	sampleC	33.333333333333336
+sampleB	sampleA	0.0
+sampleB	sampleB	0.0
+sampleB	sampleC	33.333333333333336
+sampleC	sampleC	0.0
+sampleC	sampleA	33.333333333333336
+sampleC	sampleB	33.333333333333336
diff --git a/tests/data/distances/expected_dists-partial-mismatched-ids.tsv b/tests/data/distances/expected_dists-partial-mismatched-ids.tsv
index e7b7940..a68155f 100644
--- a/tests/data/distances/expected_dists-partial-mismatched-ids.tsv
+++ b/tests/data/distances/expected_dists-partial-mismatched-ids.tsv
@@ -1,4 +1,10 @@
-dists	sampleA	sampleB	sample3
-sampleA	0.0	0.0	33.333333333333336
-sampleB	0.0	0.0	33.333333333333336
-sample3	33.333333333333336	33.333333333333336	0.0
+query_id	ref_id	dist
+sampleA	sampleA	0.0
+sampleA	sampleB	0.0
+sampleA	sample3	33.333333333333336
+sampleB	sampleA	0.0
+sampleB	sampleB	0.0
+sampleB	sample3	33.333333333333336
+sample3	sample3	0.0
+sample3	sampleA	33.333333333333336
+sample3	sampleB	33.333333333333336
diff --git a/tests/data/distances/expected_dists.tsv b/tests/data/distances/expected_dists.tsv
index 00e9cec..4fc6d6f 100644
--- a/tests/data/distances/expected_dists.tsv
+++ b/tests/data/distances/expected_dists.tsv
@@ -1,4 +1,10 @@
-dists	sample1	sample2	sample3
-sample1	0.0	0.0	33.333333333333336
-sample2	0.0	0.0	33.333333333333336
-sample3	33.333333333333336	33.333333333333336	0.0
+query_id	ref_id	dist
+sample1	sample1	0.0
+sample1	sample2	0.0
+sample1	sample3	33.333333333333336
+sample2	sample1	0.0
+sample2	sample2	0.0
+sample2	sample3	33.333333333333336
+sample3	sample3	0.0
+sample3	sample1	33.333333333333336
+sample3	sample2	33.333333333333336
diff --git a/workflows/fastmatchirida.nf b/workflows/fastmatchirida.nf
index 82c3a14..55c2301 100644
--- a/workflows/fastmatchirida.nf
+++ b/workflows/fastmatchirida.nf
@@ -29,6 +29,8 @@ Workflowfastmatchirida.initialise(params, log)
 include { LOCIDEX_MERGE } from '../modules/local/locidex/merge/main'
 include { PROFILE_DISTS } from '../modules/local/profile_dists/main'
 include { INPUT_ASSURE } from "../modules/local/input_assure/main"
+include { PROCESS_OUTPUT } from "../modules/local/process_output/main"
+include { APPEND_METADATA } from "../modules/local/append_metadata/main"
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -147,11 +149,18 @@ workflow FASTMATCH {
     }
 
     // Options related to profile dists
-    mapping_format = Channel.value(params.pd_outfmt)
+    mapping_format = Channel.value("pairwise")
 
     distances = PROFILE_DISTS(merged.combined_profiles, mapping_format, mapping_file, columns_file)
     ch_versions = ch_versions.mix(distances.versions)
 
+    // Append metadata to references:
+    distances_metadata = APPEND_METADATA(distances.results, metadata_rows, metadata_headers)
+
+    // Process the output:
+    processed_output = PROCESS_OUTPUT(distances_metadata.distances, 0)
+    ch_versions = ch_versions.mix(processed_output.versions)
+
     CUSTOM_DUMPSOFTWAREVERSIONS (
         ch_versions.unique().collectFile(name: 'collated_versions.yml')
     )
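
Review note (not part of the diff): the sketch below is one way to exercise the new bin/process_output.py outside Nextflow. It builds a tiny table in the shape APPEND_METADATA emits (Query ID, Query Sample Name, Reference ID, Reference Sample Name, Distance, plus appended metadata columns) and calls the script's main() with an explicit argv. The "Outbreak" column and the IR-* IDs are invented for illustration, and the harness assumes it is run from the repository root with pandas and an Excel writer such as openpyxl (needed by DataFrame.to_excel) installed.

#!/usr/bin/env python
# Illustrative harness only -- not part of this PR.
import sys
import pandas as pd

sys.path.insert(0, "bin")   # assumes the repo root is the working directory
import process_output       # the script added in this PR

# Minimal distances-plus-metadata table in the shape APPEND_METADATA writes.
# "Outbreak" is a hypothetical metadata column; IR-* IDs are made up.
rows = [
    {"Query ID": "IR-1", "Query Sample Name": "sample1",
     "Reference ID": "IR-2", "Reference Sample Name": "sample2",
     "Distance": 1, "Outbreak": "A"},
    {"Query ID": "IR-1", "Query Sample Name": "sample1",
     "Reference ID": "IR-3", "Reference Sample Name": "sample3",
     "Distance": 12, "Outbreak": "B"},
]
pd.DataFrame(rows).to_csv("distances_and_metadata.tsv", sep="\t", index=False)

# Filter at threshold 10: only the sample1/sample2 pair (Distance 1) should
# survive into results.tsv and results.xlsx.
process_output.main([
    "--input", "distances_and_metadata.tsv",
    "--output", "results",
    "--threshold", "10",
])

print(pd.read_csv("results.tsv", sep="\t"))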