-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #3 from phac-nml/output-stub
Result Retrieval from FastMatch Pipeline
- Loading branch information
Showing
16 changed files
with
275 additions
and
48 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
#!/usr/bin/env python | ||
|
||
from pathlib import Path | ||
from mimetypes import guess_type | ||
from functools import partial | ||
import gzip | ||
import sys | ||
import argparse | ||
import pandas as pd | ||
|
||
|
||
def _gzip_open_text(filename, mode="rt", *args, **kwargs):
    """Open a gzip file, defaulting to text mode like the builtin ``open``.

    ``gzip.open`` defaults to binary (``"rb"``); defaulting to ``"rt"`` here
    keeps the opener returned by ``get_open`` consistent across compressed
    and uncompressed inputs. An explicit ``mode`` is passed through as-is.
    """
    return gzip.open(filename, mode, *args, **kwargs)


def get_open(f):
    """Return a callable suitable for opening ``f``.

    The choice is made from the file name only (via ``mimetypes.guess_type``);
    the file itself is not inspected.

    Args:
        f: Path-like or string naming the file.

    Returns:
        A gzip-aware opener when the name is guessed to be gzip-encoded
        (e.g. a ``.gz`` suffix), otherwise the builtin ``open``. Both default
        to text mode when called without an explicit mode.
    """
    if guess_type(str(f))[1] == "gzip":
        return _gzip_open_text
    return open
|
||
def main(argv=None):
    """Filter a distance table by a threshold and write TSV and Excel results.

    Reads the tab-separated ``--input`` table, keeps the rows whose
    ``Distance`` value is less than or equal to ``--threshold``, and writes
    them to ``<output>.tsv`` and ``<output>.xlsx``.

    Args:
        argv: Argument list to parse. ``None`` lets argparse read the
            process command line.

    Returns:
        0 on success.

    Raises:
        SystemExit: Raised by argparse on missing or invalid arguments.
        KeyError: If the input table has no ``Distance`` column.
    """
    parser = argparse.ArgumentParser(
        description="Parses a profile_dists distances to create query-reference-format output for the FastMatch pipeline.",
        # --output is a prefix: the extensions are appended below.
        epilog="Example: python process_output.py --input distances.tsv --output results --threshold 10",
    )
    parser.add_argument(
        "--input",
        dest="input",
        type=str,
        required=True,
        help="profile_dists-generated distance matrix",
    )
    parser.add_argument(
        "--threshold",
        dest="threshold",
        type=float,
        required=True,
        help="distance threshold to be included in output",
    )
    parser.add_argument(
        "--output",
        dest="output",
        type=str,
        required=True,
        help="output prefix (without extension)",
    )
    args = parser.parse_args(argv)

    input_path = Path(args.input)
    # Plain concatenation (not Path.with_suffix) so a prefix that itself
    # contains dots is kept intact.
    tsv_path = Path(args.output + ".tsv")
    excel_path = Path(args.output + ".xlsx")

    data = pd.read_csv(input_path, sep="\t")
    data = data[data["Distance"] <= args.threshold]

    # Neither output carries the DataFrame index: after filtering it is just
    # the original row numbers, which are not part of the results.
    data.to_csv(tsv_path, sep="\t", index=False)
    data.to_excel(excel_path, index=False)

    print("Output written to:")
    print(tsv_path)
    print(excel_path)

    return 0
|
||
|
||
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
// Joins per-sample metadata onto a query/reference distance table and writes
// the merged table to "distances_and_metadata.tsv" in the task work directory.
process APPEND_METADATA {
    tag "Appends metadata to distances"
    label 'process_single'

    input:
    val distances_path    // distance data as a TSV path
                          // this needs to be "val", because "path"
                          // won't stage the file correctly for exec
    val metadata_rows     // metadata rows (no headers) to be appended, list of lists
    val metadata_headers  // headers to name the metadata columns

    output:
    path("distances_and_metadata.tsv"), emit: distances

    exec:
    def distances_rows           // has a header row
    def metadata_rows_map = [:]  // maps sample names to their full metadata row
    def sample_names_map = [:]   // maps sample names to Irida IDs
    def merged = []

    distances_path.withReader { reader ->
        distances_rows = reader.readLines()*.split('\t')
    }

    // Index the metadata rows by sample name.
    // Every row is used because no header row is included.
    metadata_rows.each { row ->
        // "sample" -> ["sample", Irida ID, meta1, meta2, ...]
        def sample_name = row[0]
        metadata_rows_map[sample_name] = row

        // "sample" -> "Irida ID"
        sample_names_map[sample_name] = row[1]
    }

    // Create the header row:
    merged.add(["Query ID", "Query Sample Name", "Reference ID", "Reference Sample Name", "Distance"]
        + metadata_headers[1..-1])

    // Merge the remaining rows in original order,
    // skipping the first row because it holds the headers.
    distances_rows.drop(1).each { row ->
        def query_sample_name = row[0]
        def query_irida_id = sample_names_map[query_sample_name]
        def reference_sample_name = row[1]
        def reference_irida_id = sample_names_map[reference_sample_name]
        def distance = row[2]

        // Fail with a clear message instead of a NullPointerException when
        // the reference sample has no metadata row.
        def reference_metadata = metadata_rows_map[reference_sample_name]
        if (reference_metadata == null) {
            error("APPEND_METADATA: no metadata row found for reference sample '${reference_sample_name}'")
        }

        // Drop the sample name and Irida ID; only the metadata values are appended.
        def merged_row = [query_irida_id, query_sample_name, reference_irida_id, reference_sample_name, distance] +
            reference_metadata[2..-1]

        merged.add(merged_row)
    }

    task.workDir.resolve("distances_and_metadata.tsv").withWriter { writer ->
        merged.each { writer.writeLine it.join("\t") }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
// Runs process_output.py on a distance table to produce the threshold-filtered
// results as TSV and Excel files, plus a versions.yml record.
process PROCESS_OUTPUT {
    label 'process_single'

    // NOTE(review): presumably this image is used because it ships pandas with
    // an Excel writer, which process_output.py needs -- confirm.
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/staramr:0.10.0--pyhdfd78af_0':
        'biocontainers/staramr:0.10.0--pyhdfd78af_0' }"

    input:
    path distances  // distance table passed to --input
    val threshold   // value passed to --threshold

    output:
    path "results.tsv", emit: tsv
    path "results.xlsx", emit: excel
    path "versions.yml", emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args = task.ext.args ?: ''
    // "--output results" is a prefix; the script appends .tsv and .xlsx,
    // which yields the file names declared in the output block.
    """
    process_output.py \\
        $args \\
        --input $distances \\
        --output results \\
        --threshold $threshold

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        process_output : 0.1.0
    END_VERSIONS
    """
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,10 @@ | ||
dists sample1 sample2 sample3 | ||
sample1 0 1 2 | ||
sample2 1 0 3 | ||
sample3 2 3 0 | ||
query_id ref_id dist | ||
sample1 sample1 0 | ||
sample1 sample2 1 | ||
sample1 sample3 2 | ||
sample2 sample2 0 | ||
sample2 sample1 1 | ||
sample2 sample3 3 | ||
sample3 sample3 0 | ||
sample3 sample1 2 | ||
sample3 sample2 3 |
14 changes: 10 additions & 4 deletions
14
tests/data/distances/expected_dists-hash-keep-one-loci.tsv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,10 @@ | ||
dists sample1 sample2 sample3 | ||
sample1 0 1 0 | ||
sample2 1 0 1 | ||
sample3 0 1 0 | ||
query_id ref_id dist | ||
sample1 sample1 0 | ||
sample1 sample3 0 | ||
sample1 sample2 1 | ||
sample2 sample2 0 | ||
sample2 sample1 1 | ||
sample2 sample3 1 | ||
sample3 sample1 0 | ||
sample3 sample3 0 | ||
sample3 sample2 1 |
14 changes: 10 additions & 4 deletions
14
tests/data/distances/expected_dists-hash-missing-count-missing.tsv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,10 @@ | ||
dists sample1 sample2 sample3 | ||
sample1 0 2 2 | ||
sample2 2 0 2 | ||
sample3 2 2 0 | ||
query_id ref_id dist | ||
sample1 sample1 0 | ||
sample1 sample2 2 | ||
sample1 sample3 2 | ||
sample2 sample2 0 | ||
sample2 sample1 2 | ||
sample2 sample3 2 | ||
sample3 sample3 0 | ||
sample3 sample1 2 | ||
sample3 sample2 2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,10 @@ | ||
dists sample1 sample2 sample3 | ||
sample1 0 1 1 | ||
sample2 1 0 2 | ||
sample3 1 2 0 | ||
query_id ref_id dist | ||
sample1 sample1 0 | ||
sample1 sample2 1 | ||
sample1 sample3 1 | ||
sample2 sample2 0 | ||
sample2 sample1 1 | ||
sample2 sample3 2 | ||
sample3 sample3 0 | ||
sample3 sample1 1 | ||
sample3 sample2 2 |
8 changes: 5 additions & 3 deletions
8
tests/data/distances/expected_dists-hash-more-missing-remove-sample.tsv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,5 @@ | ||
dists sample1 sample2 | ||
sample1 0 2 | ||
sample2 2 0 | ||
query_id ref_id dist | ||
sample1 sample1 0 | ||
sample1 sample2 2 | ||
sample2 sample2 0 | ||
sample2 sample1 2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,10 @@ | ||
dists sample1 sample2 sample3 | ||
sample1 0 2 3 | ||
sample2 2 0 2 | ||
sample3 3 2 0 | ||
query_id ref_id dist | ||
sample1 sample1 0 | ||
sample1 sample2 2 | ||
sample1 sample3 3 | ||
sample2 sample2 0 | ||
sample2 sample1 2 | ||
sample2 sample3 2 | ||
sample3 sample3 0 | ||
sample3 sample2 2 | ||
sample3 sample1 3 |
14 changes: 10 additions & 4 deletions
14
tests/data/distances/expected_dists-hash-remove-missing-loci.tsv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,10 @@ | ||
dists sample1 sample2 sample3 | ||
sample1 0 1 1 | ||
sample2 1 0 2 | ||
sample3 1 2 0 | ||
query_id ref_id dist | ||
sample1 sample1 0 | ||
sample1 sample2 1 | ||
sample1 sample3 1 | ||
sample2 sample2 0 | ||
sample2 sample1 1 | ||
sample2 sample3 2 | ||
sample3 sample3 0 | ||
sample3 sample1 1 | ||
sample3 sample2 2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,10 @@ | ||
dists sampleA sampleB sampleC | ||
sampleA 0.0 0.0 33.333333333333336 | ||
sampleB 0.0 0.0 33.333333333333336 | ||
sampleC 33.333333333333336 33.333333333333336 0.0 | ||
query_id ref_id dist | ||
sampleA sampleA 0.0 | ||
sampleA sampleB 0.0 | ||
sampleA sampleC 33.333333333333336 | ||
sampleB sampleA 0.0 | ||
sampleB sampleB 0.0 | ||
sampleB sampleC 33.333333333333336 | ||
sampleC sampleC 0.0 | ||
sampleC sampleA 33.333333333333336 | ||
sampleC sampleB 33.333333333333336 |
14 changes: 10 additions & 4 deletions
14
tests/data/distances/expected_dists-partial-mismatched-ids.tsv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,10 @@ | ||
dists sampleA sampleB sample3 | ||
sampleA 0.0 0.0 33.333333333333336 | ||
sampleB 0.0 0.0 33.333333333333336 | ||
sample3 33.333333333333336 33.333333333333336 0.0 | ||
query_id ref_id dist | ||
sampleA sampleA 0.0 | ||
sampleA sampleB 0.0 | ||
sampleA sample3 33.333333333333336 | ||
sampleB sampleA 0.0 | ||
sampleB sampleB 0.0 | ||
sampleB sample3 33.333333333333336 | ||
sample3 sample3 0.0 | ||
sample3 sampleA 33.333333333333336 | ||
sample3 sampleB 33.333333333333336 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,10 @@ | ||
dists sample1 sample2 sample3 | ||
sample1 0.0 0.0 33.333333333333336 | ||
sample2 0.0 0.0 33.333333333333336 | ||
sample3 33.333333333333336 33.333333333333336 0.0 | ||
query_id ref_id dist | ||
sample1 sample1 0.0 | ||
sample1 sample2 0.0 | ||
sample1 sample3 33.333333333333336 | ||
sample2 sample1 0.0 | ||
sample2 sample2 0.0 | ||
sample2 sample3 33.333333333333336 | ||
sample3 sample3 0.0 | ||
sample3 sample1 33.333333333333336 | ||
sample3 sample2 33.333333333333336 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters