Skip to content

Commit

Permalink
Merge pull request #3 from phac-nml/output-stub
Browse files Browse the repository at this point in the history
Result Retrieval from FastMatch Pipeline
  • Loading branch information
emarinier authored Dec 12, 2024
2 parents cbea074 + 4c18be3 commit 70962d5
Show file tree
Hide file tree
Showing 16 changed files with 275 additions and 48 deletions.
75 changes: 75 additions & 0 deletions bin/process_output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#!/usr/bin/env python

from pathlib import Path
from mimetypes import guess_type
from functools import partial
import gzip
import sys
import argparse
import pandas as pd


def get_open(f):
    """Return a file-opener callable appropriate for path *f*.

    Gzip-compressed files (detected via mimetypes.guess_type) are opened
    in text mode ("rt") so callers can treat plain and gzipped inputs
    uniformly as text streams; plain files get the builtin open().
    """
    if "gzip" == guess_type(str(f))[1]:
        # mode="rt" makes gzip.open decode to str, matching builtin open();
        # a bare partial(gzip.open) would return bytes instead.
        return partial(gzip.open, mode="rt")
    else:
        return open

def main(argv=None):
    """Filter a profile_dists pairwise distance table by a distance threshold.

    Reads the TSV named by --input, keeps only rows whose 'Distance' column
    is <= --threshold, and writes the filtered table both as
    "<output>.tsv" and "<output>.xlsx".

    Args:
        argv: optional list of command-line arguments (defaults to sys.argv).

    Returns:
        0 on success (suitable for sys.exit).
    """

    parser = argparse.ArgumentParser(
        description="Parses a profile_dists distances to create query-reference-format output for the FastMatch pipeline.",
        epilog="Example: python process_output.py --input distances.tsv --output results.tsv --threshold 10",
    )

    parser.add_argument(
        "--input",
        action="store",
        dest="input",
        type=str,
        help="profile_dists-generated distance matrix",
        default=None,
        required=True,
    )

    parser.add_argument(
        "--threshold",
        action="store",
        dest="threshold",
        type=float,
        help="distance threshold to be included in output",
        default=None,
        required=True,
    )

    parser.add_argument(
        "--output",
        action="store",
        dest="output",
        type=str,
        help="output prefix (without extension)",
        default=None,
        required=True,
    )

    args = parser.parse_args(argv)

    # 'input_path' avoids shadowing the builtin input().
    input_path = Path(args.input)
    tsv_path = Path(args.output + ".tsv")
    excel_path = Path(args.output + ".xlsx")
    threshold = args.threshold

    data = pd.read_csv(input_path, sep="\t")
    # Keep only query/reference pairs within the requested distance.
    data = data[data["Distance"] <= threshold]
    data.to_csv(tsv_path, sep="\t", index=False)
    # index=False keeps the Excel output consistent with the TSV output
    # (no spurious unnamed index column in the first column).
    data.to_excel(excel_path, index=False)

    print("Output written to:")
    print(tsv_path)
    print(excel_path)

    return 0


if __name__ == "__main__":
    sys.exit(main())
61 changes: 61 additions & 0 deletions modules/local/append_metadata/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// Joins per-reference metadata columns onto a pairwise distance table and
// writes the merged result as "distances_and_metadata.tsv" in the task
// work directory. Runs as a native (exec:) process — no external script.
process APPEND_METADATA {
    tag "Appends metadata to distances"
    label 'process_single'

    input:
    val distances_path // distance data as a TSV path
    // this needs to be "val", because "path"
    // won't stage the file correctly for exec
    val metadata_rows // metadata rows (no headers) to be appended, list of lists
    val metadata_headers // headers to name the metadata columns

    output:
    path("distances_and_metadata.tsv"), emit: distances

    exec:
    def distances_rows // has a header row
    def metadata_rows_map = [:] // sample name -> full metadata row
    def sample_names_map = [:] // maps sample names to Irida IDs
    def merged = [] // output rows, beginning with the header row

    // Load the distances TSV as a list of string arrays (row 0 is the header).
    distances_path.withReader { reader ->
        distances_rows = reader.readLines()*.split('\t')
    }

    // Create a map of the metadata rows:
    // Start on i = 0 because there are no headers included.
    for(int i = 0; i < metadata_rows.size(); i++)
    {
        // "sample" -> ["sample", meta1, meta2, meta3, ...]
        sample_name = metadata_rows[i][0]
        metadata_rows_map[sample_name] = metadata_rows[i]

        // "sample" -> "Irida ID"
        // NOTE(review): assumes column 1 of each metadata row is the Irida ID — confirm against caller.
        sample_names_map[sample_name] = metadata_rows[i][1]
    }

    // Create the header row:
    merged.add(["Query ID", "Query Sample Name", "Reference ID", "Reference Sample Name", "Distance"]
        + metadata_headers[1..-1])

    // Merge the remaining rows in original order:
    // Start on i = 1 because we don't want the headers.
    for(int i = 1; i < distances_rows.size(); i++)
    {
        query_sample_name = distances_rows[i][0]
        query_irida_id = sample_names_map[query_sample_name]
        reference_sample_name = distances_rows[i][1]
        reference_irida_id = sample_names_map[reference_sample_name]
        distance = distances_rows[i][2]

        // Appended metadata columns (index 2 onward) come from the
        // *reference* sample's metadata row.
        merged_row = [query_irida_id, query_sample_name, reference_irida_id, reference_sample_name, distance] \
            + metadata_rows_map[reference_sample_name][2..-1]

        merged.add(merged_row)
    }

    // Write into task.workDir so Nextflow picks the file up as the
    // declared path output.
    task.workDir.resolve("distances_and_metadata.tsv").withWriter { writer ->
        merged.each { writer.writeLine it.join("\t") }
    }

}
34 changes: 34 additions & 0 deletions modules/local/process_output/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// Filters the merged distance table with bin/process_output.py, emitting
// the results as both TSV and Excel alongside a versions.yml.
process PROCESS_OUTPUT {
    label 'process_single'

    // NOTE(review): the staramr container is presumably used because it
    // ships a pandas compatible with process_output.py — confirm.
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/staramr:0.10.0--pyhdfd78af_0':
        'biocontainers/staramr:0.10.0--pyhdfd78af_0' }"

    input:
    path distances // TSV of pairwise distances (with appended metadata)
    val threshold // maximum distance to keep in the output

    output:
    path "results.tsv", emit: tsv
    path "results.xlsx", emit: excel
    path "versions.yml", emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args = task.ext.args ?: ''
    """
    process_output.py \\
        $args \\
        --input $distances \\
        --output results \\
        --threshold $threshold
    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        process_output : 0.1.0
    END_VERSIONS
    """
}
1 change: 0 additions & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@ params {
validate_params = true

// Profile dists args
pd_outfmt = "matrix"
pd_distm = "hamming"
pd_missing_threshold = 1.0
pd_sample_quality_threshold = 1.0
Expand Down
7 changes: 0 additions & 7 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -110,13 +110,6 @@
"description": "Parameters for profile_dists distance calculations",
"default": "",
"properties": {
"pd_outfmt": {
"type": "string",
"description": "The output format for distances",
"enum": ["matrix"],
"default": "matrix",
"hidden": true
},
"pd_distm": {
"type": "string",
"description": "The distance method/unit",
Expand Down
14 changes: 10 additions & 4 deletions tests/data/distances/expected_dists-hamming.tsv
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
dists sample1 sample2 sample3
sample1 0 1 2
sample2 1 0 3
sample3 2 3 0
query_id ref_id dist
sample1 sample1 0
sample1 sample2 1
sample1 sample3 2
sample2 sample2 0
sample2 sample1 1
sample2 sample3 3
sample3 sample3 0
sample3 sample1 2
sample3 sample2 3
14 changes: 10 additions & 4 deletions tests/data/distances/expected_dists-hash-keep-one-loci.tsv
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
dists sample1 sample2 sample3
sample1 0 1 0
sample2 1 0 1
sample3 0 1 0
query_id ref_id dist
sample1 sample1 0
sample1 sample3 0
sample1 sample2 1
sample2 sample2 0
sample2 sample1 1
sample2 sample3 1
sample3 sample1 0
sample3 sample3 0
sample3 sample2 1
14 changes: 10 additions & 4 deletions tests/data/distances/expected_dists-hash-missing-count-missing.tsv
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
dists sample1 sample2 sample3
sample1 0 2 2
sample2 2 0 2
sample3 2 2 0
query_id ref_id dist
sample1 sample1 0
sample1 sample2 2
sample1 sample3 2
sample2 sample2 0
sample2 sample1 2
sample2 sample3 2
sample3 sample3 0
sample3 sample1 2
sample3 sample2 2
14 changes: 10 additions & 4 deletions tests/data/distances/expected_dists-hash-missing.tsv
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
dists sample1 sample2 sample3
sample1 0 1 1
sample2 1 0 2
sample3 1 2 0
query_id ref_id dist
sample1 sample1 0
sample1 sample2 1
sample1 sample3 1
sample2 sample2 0
sample2 sample1 1
sample2 sample3 2
sample3 sample3 0
sample3 sample1 1
sample3 sample2 2
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
dists sample1 sample2
sample1 0 2
sample2 2 0
query_id ref_id dist
sample1 sample1 0
sample1 sample2 2
sample2 sample2 0
sample2 sample1 2
14 changes: 10 additions & 4 deletions tests/data/distances/expected_dists-hash-more-missing.tsv
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
dists sample1 sample2 sample3
sample1 0 2 3
sample2 2 0 2
sample3 3 2 0
query_id ref_id dist
sample1 sample1 0
sample1 sample2 2
sample1 sample3 3
sample2 sample2 0
sample2 sample1 2
sample2 sample3 2
sample3 sample3 0
sample3 sample2 2
sample3 sample1 3
14 changes: 10 additions & 4 deletions tests/data/distances/expected_dists-hash-remove-missing-loci.tsv
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
dists sample1 sample2 sample3
sample1 0 1 1
sample2 1 0 2
sample3 1 2 0
query_id ref_id dist
sample1 sample1 0
sample1 sample2 1
sample1 sample3 1
sample2 sample2 0
sample2 sample1 1
sample2 sample3 2
sample3 sample3 0
sample3 sample1 1
sample3 sample2 2
14 changes: 10 additions & 4 deletions tests/data/distances/expected_dists-mismatched-ids.tsv
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
dists sampleA sampleB sampleC
sampleA 0.0 0.0 33.333333333333336
sampleB 0.0 0.0 33.333333333333336
sampleC 33.333333333333336 33.333333333333336 0.0
query_id ref_id dist
sampleA sampleA 0.0
sampleA sampleB 0.0
sampleA sampleC 33.333333333333336
sampleB sampleA 0.0
sampleB sampleB 0.0
sampleB sampleC 33.333333333333336
sampleC sampleC 0.0
sampleC sampleA 33.333333333333336
sampleC sampleB 33.333333333333336
14 changes: 10 additions & 4 deletions tests/data/distances/expected_dists-partial-mismatched-ids.tsv
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
dists sampleA sampleB sample3
sampleA 0.0 0.0 33.333333333333336
sampleB 0.0 0.0 33.333333333333336
sample3 33.333333333333336 33.333333333333336 0.0
query_id ref_id dist
sampleA sampleA 0.0
sampleA sampleB 0.0
sampleA sample3 33.333333333333336
sampleB sampleA 0.0
sampleB sampleB 0.0
sampleB sample3 33.333333333333336
sample3 sample3 0.0
sample3 sampleA 33.333333333333336
sample3 sampleB 33.333333333333336
14 changes: 10 additions & 4 deletions tests/data/distances/expected_dists.tsv
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
dists sample1 sample2 sample3
sample1 0.0 0.0 33.333333333333336
sample2 0.0 0.0 33.333333333333336
sample3 33.333333333333336 33.333333333333336 0.0
query_id ref_id dist
sample1 sample1 0.0
sample1 sample2 0.0
sample1 sample3 33.333333333333336
sample2 sample1 0.0
sample2 sample2 0.0
sample2 sample3 33.333333333333336
sample3 sample3 0.0
sample3 sample1 33.333333333333336
sample3 sample2 33.333333333333336
11 changes: 10 additions & 1 deletion workflows/fastmatchirida.nf
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ Workflowfastmatchirida.initialise(params, log)
include { LOCIDEX_MERGE } from '../modules/local/locidex/merge/main'
include { PROFILE_DISTS } from '../modules/local/profile_dists/main'
include { INPUT_ASSURE } from "../modules/local/input_assure/main"
include { PROCESS_OUTPUT } from "../modules/local/process_output/main"
include { APPEND_METADATA } from "../modules/local/append_metadata/main"

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down Expand Up @@ -147,11 +149,18 @@ workflow FASTMATCH {
}

// Options related to profile dists
mapping_format = Channel.value(params.pd_outfmt)
mapping_format = Channel.value("pairwise")

distances = PROFILE_DISTS(merged.combined_profiles, mapping_format, mapping_file, columns_file)
ch_versions = ch_versions.mix(distances.versions)

// Append metadata to references:
distances_metadata = APPEND_METADATA(distances.results, metadata_rows, metadata_headers)

// Process the output:
processed_output = PROCESS_OUTPUT(distances_metadata.distances, 0)
ch_versions = ch_versions.mix(processed_output.versions)

CUSTOM_DUMPSOFTWAREVERSIONS (
ch_versions.unique().collectFile(name: 'collated_versions.yml')
)
Expand Down

0 comments on commit 70962d5

Please sign in to comment.