-
Notifications
You must be signed in to change notification settings - Fork 1
/
assign_taxonomy_with_tango.py
117 lines (101 loc) · 4.78 KB
/
assign_taxonomy_with_tango.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import os
from string import strip
import argparse
import subprocess
import shlex
def options():
parser = argparse.ArgumentParser(description="Format ITSoneDB fasta sequences and taxonomy for QIIME. Assumes VSEARCH in in your path.",
prefix_chars="-")
parser.add_argument("-f", "--itsonedb_total_fasta", type=str,
help="file containing all the ITSoneDB fasta sequences",
action="store", required=True)
parser.add_argument("-t", "--taxa_folder", type=str,
help="folder containing the NCBI taxonomy data",
action="store", required=True)
parser.add_argument("-p", "--identity_clustering", type=int,
help="Identity percent for clustering [default = 0.99",
action="store", required=False, default=0.99)
return parser.parse_args()
def clustering_execution(fasta, id_perc):
cluster = "vsearch_cluster_%i" % int(id_perc * 100)
centroids = "%i_ref_ITSoneDB.fa" % int(id_perc * 100)
cmd = shlex.split("vsearch --cluster_fast %s -uc %s --centroids %s --id %f" % (fasta, cluster, centroids, id_perc))
p = subprocess.Popen(cmd)
p.wait()
cluster_dict = {}
with open(cluster) as a:
for line in a:
s = map(strip, line.split("\t"))
if s[0] == "S":
cluster_dict.setdefault(acc_name(s[-2]), [])
cluster_dict[acc_name(s[-2])].append(acc_name(s[-2]))
elif s[0] == "H":
cluster_dict[acc_name(s[-1])].append(acc_name(s[-2]))
print len(cluster_dict)
return cluster_dict, centroids
def acc_name(l):
return l.split("|")[0]
def generate_taxonomic_info(taxa_folder, itsonedb_fasta):
acc2node = {}
with open(itsonedb_fasta) as fasta:
for line in fasta:
if line.startswith(">"):
s = map(strip, line.split("|"))
acc2node[s[0].strip(">")] = s[1]
node2parent = {}
node2order = {}
with open(os.path.join(taxa_folder, "nodes.dmp")) as nodesfile:
for line in nodesfile:
s = map(strip, line.split("|"))
node2parent[s[0]] = s[1]
node2order[s[0]] = s[2]
node2name = {}
with open(os.path.join(taxa_folder, "names.dmp")) as nodesfile:
for line in nodesfile:
s = map(strip, line.split("|"))
if s[3] == "scientific name":
node2name[s[0]] = s[1]
return acc2node, node2order, node2name, node2parent
# NCBI taxonomy pre-processing
# perl preprocess.pl --taxonomy-type NCBI --taxonomy ../../../data_transfer_rel_138/ncbi_taxonomy_data/nodes.dmp --output ITSoneDB_r138_ref
def tango_excution(cluster_dict, acc2node):
wd = os.getcwd()
with open("data_for_tango.csv", "w") as tmp:
for acc, lista in cluster_dict.items():
lista = [acc2node[i] for i in lista]
tmp.write("%s %s\n" % (acc, " ".join(lista)))
os.chdir("New_TANGO_perl_version")
cmd = shlex.split("perl tango.pl --taxonomy ITSoneDB_r138_ref --matches %s --output %s" % (
os.path.join(wd, "data_for_tango.csv"), os.path.join(wd, "tango_recalssification")))
p = subprocess.Popen(cmd)
p.wait()
acc2ass = {}
with open(os.path.join(wd, "tango_recalssification")) as a:
for line in a:
s = map(strip, line.split("\t"))
acc2ass[s[0]] = s[-1].split(";")[0]
os.chdir(wd)
return acc2ass
def define_common_taxonomy(acc2ass, node2order, node2name, id_perc, node2parent):
with open("tax_%i_ref_ITSoneDB_tango_based.fa" % int(id_perc * 100), "w") as tmp:
accepted_rank = ["kingdom", "phylum", "class", "order", "family", "genus", "species"]
for acc, node in acc2ass.items():
path = []
parent = node2parent[node]
while node != parent:
path.append(node)
node = parent
parent = node2parent[node]
path.reverse()
basic_list = ["k__", "p__", "c__", "o__", "f__", "g__", "s__"]
for node in path:
if node2order[node] in accepted_rank:
index = accepted_rank.index(node2order[node])
basic_list[index] = node2order[node][0] + "__" + node2name[node]
tmp.write("%s\t%s\n" % (acc, ";".join([i for i in basic_list])))
if __name__ == "__main__":
info_options = options()
fasta_its1, ncbi_taxa_folder, perc_id = info_options.itsonedb_total_fasta, info_options.taxa_folder, info_options.identity_clustering
otu_dict, centroidi = clustering_execution(fasta_its1, perc_id)
acc2node_dict, order_dict, name_dict, parent_dict = generate_taxonomic_info(ncbi_taxa_folder, fasta_its1)
define_common_taxonomy(acc2node_dict, order_dict, name_dict, perc_id, parent_dict)