-
Notifications
You must be signed in to change notification settings - Fork 0
/
RESF_reflist.py
124 lines (103 loc) · 5.54 KB
/
RESF_reflist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/env python
# Script to go through the ResFinder raw data (files named "pheno_table.txt" and "PointFinder_results.txt") of the E. coli WGS and create a reference list of all the genes found and their corresponding resistance phenotypes/antibiotics
# Importing the required libraries
import os
import xlsxwriter
# Define the input-dir & output-file
input_dir = "/home/guest/BIT11_Traineeship/Ecoli_AMR/ResFinder_output"
output_file = "/home/guest/BIT11_Traineeship/Ecoli_AMR/Reference_lists/RESF_reflist.xlsx"


# STEP 1 : Retrieve data from the input_dir and save it in a dictionary
############################################################################################################################################################
def add_gene_antibiotic(data_dict, gene, antibiotic):
    """Link *antibiotic* to *gene* in *data_dict*.

    Each gene maps to a list of antibiotic strings in first-seen order; a
    value that is already linked to the gene is not added a second time.
    """
    linked = data_dict.setdefault(gene, [])
    if antibiotic not in linked:
        linked.append(antibiotic)


def parse_pheno_table(lines, data_dict):
    """Collect gene -> antibiotic links from the lines of a "pheno_table.txt".

    Args:
        lines: Iterable of text lines (a list or an open file handle).
        data_dict: Dictionary updated in place via add_gene_antibiotic().

    Only rows between the line starting with "# Antimicrobial" and the line
    starting with "# WARNING" are read. A row is used when its third
    tab-separated column equals "Resistant" and a fifth column (the genes)
    is present; the first column is taken as the antibiotic.
    """
    in_table = False
    for raw_line in lines:
        line = raw_line.strip()
        # Skip empty lines
        if not line:
            continue
        # The table starts after the header line "# Antimicrobial ..."
        if line.startswith("# Antimicrobial"):
            in_table = True
            continue
        # Everything from "# WARNING" onwards is not part of the table
        if line.startswith("# WARNING"):
            break
        if not in_table:
            continue
        parts = line.split('\t')
        # Check the column count BEFORE indexing, so that a short row is
        # skipped instead of raising an IndexError
        if len(parts) < 5 or parts[2].strip() != "Resistant":
            continue
        antibiotic = parts[0].strip()
        # Genes are listed as "name (accession), name (accession), ..."
        for gene_entry in parts[4].strip().split(", "):
            # Retain only the short gene name (text before the first space)
            gene = gene_entry.split(" ")[0]
            # An empty genes column would otherwise create a "" gene entry
            if gene:
                add_gene_antibiotic(data_dict, gene, antibiotic)


def parse_pointfinder_results(lines, data_dict):
    """Collect gene -> antibiotic links from a "PointFinder_results.txt".

    Args:
        lines: Iterable of text lines (a list or an open file handle).
        data_dict: Dictionary updated in place via add_gene_antibiotic().

    The first line is skipped as a header. For every other row the gene is
    the text before the first space of the first tab-separated column, and
    the complete fourth column is stored as one antibiotic entry.
    """
    rows = iter(lines)
    # Skip the header line; the default avoids an error on an empty file
    next(rows, None)
    for raw_line in rows:
        parts = raw_line.strip().split('\t')
        # Blank or truncated rows do not have the fourth column
        if len(parts) < 4:
            continue
        gene = parts[0].strip().split(" ")[0]
        if gene:
            add_gene_antibiotic(data_dict, gene, parts[3].strip())


# Map each result file name to the function that parses it
result_parsers = {
    "pheno_table.txt": parse_pheno_table,
    "PointFinder_results.txt": parse_pointfinder_results,
}

# Loop through the input directory and parse every recognised result file
data_dict = {}
for subdir, dirs, files in os.walk(input_dir):
    for file_name in files:
        parser = result_parsers.get(file_name)
        if parser is None:
            continue
        # The handle gets its own name so the loop variable is not overwritten
        with open(os.path.join(subdir, file_name), 'r') as handle:
            parser(handle, data_dict)
# STEP 2 : Create an Excel file to store the reference list
############################################################################################################################################################
wb = xlsxwriter.Workbook(output_file)
ws = wb.add_worksheet("RESF_reflist")

# The header line goes on the first worksheet row (row index 0)
ws.write_row(0, 0, ["Gene", "Antibiotic"])

# One worksheet row per gene: gene name in the first column, its antibiotics
# joined into a single comma-separated string in the second column.
# NOTE(review): the data starts at row index 2, so the row directly below the
# header (row index 1) stays empty - confirm that this gap is intended.
for row, (gene, AB_list) in enumerate(data_dict.items(), start=2):
    ws.write_row(row, 0, [gene, ", ".join(AB_list)])

# Close the workbook (XlsxWriter saves the file at this point)
wb.close()

# Print a message when the Excel file is successfully finished
print(f"Reference list has been created successfully and saved as {output_file}.")