Skip to content

Commit

Permalink
Merge pull request #157 from polio-nanopore/phylo-dev
Browse files Browse the repository at this point in the history
one report pr
  • Loading branch information
aineniamh authored Oct 16, 2023
2 parents 8a62483 + 9274cde commit 60f22d9
Show file tree
Hide file tree
Showing 10 changed files with 2,010 additions and 38 deletions.
47 changes: 26 additions & 21 deletions piranha/analysis/phylo_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@ def get_seqs_and_clusters(sample_seqs,supplementary_sequences,reference_sequence
seq_metadata = collections.defaultdict(dict)
seq_clusters = collections.defaultdict(list)

header = ["name","sample","barcode","query","reference_group"]
header = VALUE_PHYLO_HEADER

for record in SeqIO.parse(sample_seqs,"fasta"):
for record in SeqIO.parse(sample_seqs,KEY_FASTA):
for ref_group in config[KEY_REFERENCES_FOR_CNS]:
num_cns = collections.Counter()
if ref_group in record.id:
Expand All @@ -30,27 +30,27 @@ def get_seqs_and_clusters(sample_seqs,supplementary_sequences,reference_sequence
new_record.id = name
seq_clusters[ref_group].append(new_record)

seq_metadata[name]["name"] = name
seq_metadata[name]["sample"] = sample
seq_metadata[name]["barcode"] = barcode
seq_metadata[name]["query"] = "True"
seq_metadata[name]["reference_group"] = ref_group
seq_metadata[name][KEY_NAME] = name
seq_metadata[name][KEY_SAMPLE] = sample
seq_metadata[name][KEY_BARCODE] = barcode
seq_metadata[name][KEY_SOURCE] = KEY_SAMPLE
seq_metadata[name][KEY_REFERENCE_GROUP] = ref_group


print(green("Reference groups for phylo pipeline:"))
for i in seq_clusters:
print(f"- {i}")

if supplementary_sequences:
for record in SeqIO.parse(supplementary_sequences,"fasta"):
for record in SeqIO.parse(supplementary_sequences,KEY_FASTA):
for ref_group in seq_clusters:
if ref_group in record.description:
seq_clusters[ref_group].append(record)

seq_metadata[record.id]["name"] = record.id
seq_metadata[record.id]["sample"] = record.id
seq_metadata[record.id]["query"] = "False"
seq_metadata[record.id]["reference_group"] = ref_group
seq_metadata[record.id][KEY_NAME] = record.id
seq_metadata[record.id][KEY_SAMPLE] = record.id
seq_metadata[record.id][KEY_SOURCE] = "Background"
seq_metadata[record.id][KEY_REFERENCE_GROUP] = ref_group

if supplementary_metadata:
with open(supplementary_metadata, "r") as f:
Expand All @@ -66,17 +66,22 @@ def get_seqs_and_clusters(sample_seqs,supplementary_sequences,reference_sequence
if col in row:
seq_metadata[sample][col] = row[col]

for record in SeqIO.parse(reference_sequences, "fasta"):
for record in SeqIO.parse(reference_sequences, KEY_FASTA):
for ref_group in seq_clusters:
if ref_group in record.description:
seq_clusters[ref_group].append(record)

seq_metadata[record.id]["name"] = record.id
seq_metadata[record.id]["sample"] = record.id
seq_metadata[record.id]["query"] = "False"
seq_metadata[record.id]["reference_group"] = ref_group
seq_metadata[record.id][KEY_NAME] = record.id
seq_metadata[record.id][KEY_SAMPLE] = record.id

seq_metadata[record.id][KEY_REFERENCE_GROUP] = ref_group

if "Sabin" in record.description:
seq_metadata[record.id][KEY_SOURCE] = "Sabin"
else:
seq_metadata[record.id][KEY_SOURCE] = "Reference"

for record in SeqIO.parse(outgroup_sequences, "fasta"):
for record in SeqIO.parse(outgroup_sequences, KEY_FASTA):
for ref_group in seq_clusters:
if ref_group in record.description:
new_record = record
Expand Down Expand Up @@ -111,7 +116,7 @@ def get_seqs_and_clusters(sample_seqs,supplementary_sequences,reference_sequence
writer = csv.DictWriter(fw,fieldnames=header, lineterminator='\n')
writer.writeheader()
for record in seq_metadata:
if seq_metadata[record]["reference_group"] == i:
if seq_metadata[record][KEY_REFERENCE_GROUP] == i:
row = seq_metadata[record]
for col in header:
if col not in row:
Expand All @@ -122,11 +127,11 @@ def get_seqs_and_clusters(sample_seqs,supplementary_sequences,reference_sequence
writer0.writerow(row)

with open(os.path.join(phylo_outdir, f"{i}.fasta"),"w") as fw:
SeqIO.write(seq_clusters[i], fw, "fasta")
SeqIO.write(seq_clusters[i], fw, KEY_FASTA)

tree_annotations = config[KEY_TREE_ANNOTATIONS]
for i in header:
if i not in ["sample","barcode","query","name"]:
if i not in [KEY_SAMPLE,KEY_BARCODE,KEY_SOURCE,KEY_NAME]:
tree_annotations+= f"{i} "


Expand Down
Loading

0 comments on commit 60f22d9

Please sign in to comment.