-
Notifications
You must be signed in to change notification settings - Fork 6
/
gene2proteinMapping.py
executable file
·54 lines (42 loc) · 1.91 KB
/
gene2proteinMapping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
__author__ = "Hedra"
__email__ = "[email protected]"
# The following script imports the ID mapping table entrez2uniprot.csv.
# That file is generated by uniprotIDmap.R.
# Run uniprotIDmap.R to produce entrez2uniprot.csv, or download it here:
# https://gitlab.com/opencog-bio/ocImport/raw/master/data/entrez2uniprot.csv.xz
# Requires: raw_data/entrez2uniprot.csv
import pandas as pd
import os
import math
import metadata
from datetime import date
from atomwrappers import *
# Convert the Entrez-gene -> UniProt-protein mapping table into Atomese:
# for every usable row write an "expresses" link (gene -> protein) and, when
# an Entrez ID is present, a "has_entrez_id" link; finally record metadata
# about the generated dataset.
script = "https://github.com/MOZI-AI/knowledge-import/gene2proteinMapping.py"

# os.path.isfile() also covers a missing raw_data/ directory, which
# os.listdir() would have reported as an unhandled FileNotFoundError.
if not os.path.isfile("raw_data/entrez2uniprot.csv"):
    print("Generate the entres to protein ID mapping table first \n" )
else:
    data = pd.read_csv("raw_data/entrez2uniprot.csv", dtype={'uniprot': str, 'entrez': float, 'symbol': str})
    print("Started importing")
    # Names of the gene / protein nodes actually written to the output file;
    # their sizes are what gets reported to the metadata below.
    prot = set()
    genes = set()
    os.makedirs('dataset', exist_ok=True)
    with open("dataset/entrez_to_protein_{}.scm".format(str(date.today())), 'w') as f:
        # Iterate the three columns in lockstep instead of building a row
        # Series with data.iloc[i] several times per row.
        for symbol, uniprot, entrez in zip(data['symbol'], data['uniprot'], data['entrez']):
            # Missing cells come back from pandas as non-string NA values.
            # Rows lacking a symbol or a UniProt ID are skipped (this is what
            # the former bare `except: continue` was silently relying on),
            # and they are no longer counted in the metadata.
            if not (isinstance(symbol, str) and isinstance(uniprot, str)):
                continue
            gene_name = symbol.upper()
            protein_id = uniprot.strip()
            genes.add(gene_name)
            prot.add(protein_id)
            expresion = CEvaluationLink(CPredicateNode("expresses"), CListLink(CGeneNode(gene_name), ProteinNode(protein_id)))
            f.write(expresion.recursive_print() + "\n")
            # The entrez column is read as float so that absent IDs are NaN.
            if not math.isnan(entrez):
                entrez_id = str(int(entrez))
                has_entrez = CEvaluationLink(CPredicateNode("has_entrez_id"), CListLink(CGeneNode(gene_name), Entrez(entrez_id)))
                f.write(has_entrez.recursive_print() + "\n")
    metadata.update_meta("gene2proteinMapping:latest",
        "entrez2uniprot.csv", script, genes=len(genes), prot=len(prot))
    print("Done")