-
Notifications
You must be signed in to change notification settings - Fork 0
/
AMRF_summary.py
114 lines (95 loc) · 4.68 KB
/
AMRF_summary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/env python
# Script that makes an Excel file to summarize the raw data results from AMRFinderPlus on the E. coli WGS
# Importing the required libraries
import os
from os import listdir
from os.path import isfile, join
import re
import csv
import openpyxl
import xlsxwriter
# Folder with AMRF+ results & file with gene-AB reference list
input_dir = "/home/guest/BIT11_Traineeship/Ecoli_AMR/AMR_output/"
reflist = "/home/guest/BIT11_Traineeship/Ecoli_AMR/combi_reflist_RESF_BN.xlsx"
# List of antibiotics to look for in the ResFinder results
AB_list = ["amikacin", "amoxicillin", "amoxicillin+clavulanic acid", "aztreonam",
"cefepime", "ceftazidime", "ciprofloxacin", "colistin", "meropenem",
"piperacillin", "piperacillin+tazobactam", "tigecycline", "tobramycin",
"trimethoprim", "sulfamethoxazole"
]
# STEP 1 : Make an Excel file to store the summary data further in the script
####################################################################################################################################
output_file = "AMRFPlus_summary.xlsx"
#output_file = "AMRFPlus_summary_blaTEM-1.xlsx"
output_dir = "/home/guest/BIT11_Traineeship/Ecoli_AMR/"
wb = xlsxwriter.Workbook(os.path.join(output_dir, output_file))
ws = wb.add_worksheet("AMRFPlus_summary")
# Write the header line
header = ["File_ID"] + AB_list
ws.write_row(0, 0, header)
# Initialize the Excel line counter
excel_line = 2
# STEP 2 : Retrieve relevant genes from AMRF+ data by comparing to the reference list and save the associated antibiotic resistances
####################################################################################################################################
# Look in the input directory for the .tsv files
files = [f for f in listdir(input_dir) if isfile(join(input_dir, f))]
# Loop through each file
for file in files:
# Initialize lists to store genes per sample & antibiotics linked to genes in the sample
genes = []
ABs = []
# Extract the sample name via regular expression (match the first 6 characters = MTT...)
match = re.match(r'^.{6}', file)
if match:
sample = match.group()
# Open .tsv-files with raw data
file_path = os.path.join(input_dir, file)
with open(file_path) as file_to_read:
next(file_to_read) # Skip the header line when reading the file
tsv_file = csv.reader(file_to_read, delimiter="\t")
# Go through each line of data
for line in tsv_file:
# Split the gene name on delimiter "_" and take the first part
gene_name = line[5].split("_")[0]
#print(gene_name)
# Save the genename to the list of genes
genes.append(gene_name)
# Load the reference list workbook & select the combi list (both BN & RESF results)
ref_wb = openpyxl.load_workbook(reflist)
ref_ws = ref_wb["combi_reflist"]
# Loop through all genes from the sample and look for a match in the reference list
for gene in genes:
for row in ref_ws.iter_rows(min_row=3, max_col=2, max_row=ref_ws.max_row):
gene_reflist = row[0].value
if gene == gene_reflist:
# Save the antibiotics from the reference list to the list of ABs
AB_reflist = row[1].value
antibiotics = [antibiotic.strip() for antibiotic in AB_reflist.split(',')] # Split the string into individual antibiotics and strip whitespace
for antibiotic in antibiotics:
if antibiotic not in ABs:
ABs.append(antibiotic)
"""
if gene == "blaTEM-1":
if "amoxicillin" not in ABs:
ABs.append("amoxicillin")
if "piperacillin" not in ABs:
ABs.append("piperacillin")
"""
# STEP 3 : Fill in the Excel file with the phenotypic data for each AB in the study for each sample/strain
#####################################################################################################
# Make a list to store the phenotypic data and sample name
phenotypic_data = [sample]
# Go through the ABs from the study and check if the AMR genes are linked to them
for AB in AB_list:
if AB in ABs:
phenotypic_data.append("R")
else:
phenotypic_data.append("S")
# Write the phenotypic data from the strain to the Excel file
ws.write_row(excel_line, 0, phenotypic_data)
# Add to the Excel line counter
excel_line += 1
# Close the Excel file
wb.close()
# Print a message to indicate the script has finished
print(f"Summary Excel file {output_file} has been created successfully at location {output_dir}!")