-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCARD_summary_MDB_reflist.py
218 lines (196 loc) · 8.57 KB
/
CARD_summary_MDB_reflist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
#!/usr/bin/env python
# Script that makes an Excel file to summarize the raw data results from CARD on the E. coli WGS
# Importing the required libraries
import os
import re
import json
import pprint
import xlsxwriter
import openpyxl
# Folder with CARD results & file with gene-AB reference list
input_dir = "/home/anked/BIT11_Traineeship/Ecoli_AMR/RGI_CARD/Ecoli_WGS/RGI_output/"
reflist = "/home/anked/BIT11_Traineeship/Ecoli_AMR/reflists.xlsx"
#input_dir = "/home/guest/BIT11_Traineeship/Ecoli_AMR/Ruwe_data/CARD_subset15/"
#reflist = "/home/guest/BIT11_Traineeship/Ecoli_AMR/Final_documents/reflists.xlsx"
# List of antibiotics to look for in the CARD results
AB_list = ["amikacin", "amoxicillin", "amoxicillin+clavulanic acid", "aztreonam",
"cefepime", "ceftazidime", "ciprofloxacin", "colistin", "meropenem",
"piperacillin", "piperacillin+tazobactam", "tigecycline", "tobramycin",
"trimethoprim", "sulfamethoxazole"
]
# Open the reference list with vocabulary adjustments
adjust_vocab = openpyxl.load_workbook("/home/guest/BIT11_Traineeship/Ecoli_AMR/Final_documents/reflists_MDB.xlsx")
adjust_ws = adjust_vocab["AMRF_CARD_BN_RESF_combi_reflist"]
# STEP 1 : Make an Excel file to store the summary data further in the script
####################################################################################################################################
#output_file = "CARD_summary.xlsx"
output_file = "CARD_summary_reflist_MDB_test.xlsx"
output_dir = "/home/anked/BIT11_Traineeship/Ecoli_AMR/"
#output_dir = "/home/guest/BIT11_Traineeship/Ecoli_AMR/Final_documents/Summary_Excel/"
wb = xlsxwriter.Workbook(os.path.join(output_dir, output_file))
ws = wb.add_worksheet("CARD_summary")
# Write the header line
header = ["File_ID"] + AB_list
ws.write_row(0, 0, header)
# Initialize the Excel line counter
excel_line = 2
# STEP 2 : Retrieve relevant genes from CARD data by comparing to the reference list and save the associated antibiotic resistances
####################################################################################################################################
# Look in the input directory for the .txt files
files = [f for f in os.listdir(input_dir) if f.endswith(".txt")]
# Sort the files based on the numeric part of the sample ID (e.g., MTT001, MTT002)
files.sort(key=lambda f: int(re.search(r'\d+', f).group()))
# Go through each file
for file in files:
# Initialize lists to store genes per sample & antibiotics linked to genes in the sample
genes = []
ABs = []
# Extract the sample name via regular expression (match the first 6 characters = MTT...)
match = re.match(r'^.{6}', file)
if match:
sample = match.group()
# Open the .txt-files
file_path = os.path.join(input_dir, file)
with open(file_path) as f:
# Read each line of the file
for line in f:
# Extract the gene name and add to the list of genes from the sample
select_crit = line.split('\t')[5]
if select_crit == "Perfect" or select_crit == "Strict":
gene_name = line.split('\t')[8].lower()
if gene_name not in genes:
genes.append(gene_name)
# Load the reference list workbook & select the combi list (both BN & RESF results)
ref_wb = openpyxl.load_workbook(reflist)
ref_ws = ref_wb["RESF_BN_combi_reflist"]
# Loop through all genes from the sample and look for a match in the reference list
for gene in genes:
if gene =="AAC(3)-IVa":
gene = "aac(3)-IV"
if gene =="AAC(3)-IVa":
gene = "aac(3)-IV"
if gene == "AAC(3)-VIa":
gene == "aac(3)-VIa"
if gene == "aadA":
gene == "aadA1"
if gene == "ant(3'')-Ia":
gene == "aadA1"
if gene == "blaACT":
gene == "blaACT-1"
if gene == "blaCTX-M":
gene == "blaCTX-M-1"
if gene == "blaTEM":
gene == "blaTEM-1A"
if gene == "blaTEMp":
gene == "blaTEM-1A"
if gene == "catI":
gene == "catA1"
if gene == "CTX-M-1":
gene == "blaCTX-M-1"
if gene == "CTX-M-15":
gene == "blaCTX-M-15"
if gene == "DfrA36":
gene == "dfrA36"
if gene == "Enterobacter cloacae acrA":
gene == "acrA"
if gene == "ErmB":
gene == "erm(B)"
if gene == "Escherichia coli acrA":
gene == "acrA"
if gene == "Escherichia coli ampC beta-lactamase":
gene == "blaACT-1"
if gene == "Escherichia coli cyaA with mutation conferring resistance to fosfomycin":
gene == "cyaA"
if gene == "Escherichia coli GlpT with mutation conferring resistance to fosfomycin":
gene == "glpT"
if gene == "Escherichia coli gyrA conferring resistance to fluoroquinolones":
gene == "gyrA"
if gene == "Escherichia coli gyrA with mutation conferring resistance to triclosan":
gene == "gyrA"
if gene == "Escherichia coli mdfA":
gene == "mdf(A)"
if gene == "Escherichia coli nfsA mutations conferring resistance to nitrofurantoin":
gene == "nfsA"
if gene == "Escherichia coli parC conferring resistance to fluoroquinolones":
gene == "parC"
if gene == "Escherichia coli PtsI with mutation conferring resistance to fosfomycin":
gene == "ptsI"
if gene == "Escherichia coli soxR with mutation conferring antibiotic resistance":
gene == "soxR"
if gene == "Escherichia coli UhpT with mutation conferring resistance to fosfomycin":
gene == "uhpT"
if gene == "mdf(A)":
gene == "mdf(A)"
if gene == "mphA":
gene == "mph(A)"
if gene == "mphB":
gene == "mph(B)"
if gene == "OXA-1":
gene == "blaOXA-1"
if gene == "OXA-2":
gene == "blaOXA-2"
if gene == "OXA-9":
gene == "blaOXA-9"
if gene == "qnrE":
gene == "qnrE1"
if gene == "qnrS":
gene == "qnrS1"
if gene == "QnrS1":
gene == "qnrS1"
if gene == "Salmonella enterica gyrA with mutation conferring resistance to triclosan":
gene == "gyrA"
if gene == "SAT-2":
gene == "sat2"
if gene == "SHV-1":
gene == "blaSHV-1"
if gene == "SHV-4":
gene == "blaSHV-4"
if gene == "TEM-1":
gene == "blaTEM-1"
if gene == "TEM-214":
gene == "blaTEM-214"
if gene == "TEM-24":
gene == "blaTEM-24"
if gene == "TEM-30":
gene == "blaTEM-30"
if gene == "TEM-32":
gene == "blaTEM-32"
if gene == "TEM-33":
gene == "blaTEM-33"
if gene == "TEM-34":
gene == "blaTEM-34"
if gene == "TEM-40":
gene == "blaTEM-40"
if gene == "TEM-52":
gene == "blaTEM-52"
if gene == "TEM-54":
gene == "blaTEM-54"
for row in ref_ws.iter_rows(min_row=3, max_col=2, max_row=ref_ws.max_row):
gene_reflist = row[0].value
if gene.lower() == gene_reflist.lower():
#print(gene)
# Save the antibiotics from the reference list to the list of ABs
AB_reflist = row[1].value
if AB_reflist:
antibiotics = [antibiotic.strip() for antibiotic in AB_reflist.split(';')] # Split the string into individual antibiotics and strip whitespace
for antibiotic in antibiotics:
if antibiotic not in ABs:
ABs.append(antibiotic)
# STEP 3 : Fill in the Excel file with the phenotypic data for each AB in the study for each sample/strain
#####################################################################################################
# Make a list to store the phenotypic data and sample name
phenotypic_data = [sample]
# Go through the ABs from the study and check if the AMR genes are linked to them
for AB in AB_list:
if AB in ABs:
phenotypic_data.append("R")
else:
phenotypic_data.append("S")
# Write the phenotypic data from the strain to the Excel file
ws.write_row(excel_line, 0, phenotypic_data)
# Add to the Excel line counter
excel_line += 1
# Close the Excel file
wb.close()
# Print a message to indicate the script has finished
print(f"Summary Excel file {output_file} has been created successfully at location {output_dir}!")