# DRKG_drug_spider.py
#coding=utf-8
import sqlite3
import requests
from bs4 import BeautifulSoup as bs
import math
import pandas as pd
import time
def download(basic_url, url_id, num_retries=150):
    # Fetch basic_url + url_id and return the parsed page. Retry on
    # request failures; after num_retries failures in a row, sleep 15 s
    # and start a fresh retry cycle so the crawl never gives up.
    # (The original recursive retry appended the retry count to the URL
    # when it restarted; this loop keeps the URL fixed.)
    url = basic_url + str(url_id)
    while True:
        for attempt in range(num_retries):
            try:
                # 30 s timeout (an arbitrary choice) so a stalled
                # connection counts as a failure instead of hanging.
                html = requests.get(url, timeout=30).content
                return bs(html, "html.parser")
            except requests.RequestException:
                print(num_retries - attempt)
        time.sleep(15)
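# Example call (DB00001 is an arbitrary DrugBank ID used for illustration):
#   soup = download('https://www.drugbank.ca/drugs/', 'DB00001')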
def get_id(smd_page, bd_page):
    # Drugs are divided into two kinds: small molecule drugs (smd) and
    # biotech drugs (bd); each kind is listed on its own set of pages.
    url_id = []
    for i in range(smd_page):
        print(i + 1)
        #basic_url='https://www.drugbank.ca/drugs?approved=1&c=name&d=up&page='
        basic_url = 'https://www.drugbank.ca/drugs?approved=1&c=name&ca=0&d=up&eu=0&experimental=1&illicit=1&investigational=1&nutraceutical=1&us=0&withdrawn=1&page='
        soup = download(basic_url, i + 1, num_retries=150)
        for j in soup.select('.name-value strong a'):
            url_id.append(j.attrs['href'].split('/')[-1])
    for i in range(bd_page):
        print(i + 1)
        #basic_url='https://www.drugbank.ca/biotech_drugs?approved=1&Protein+Based+Therapies=0&page='
        basic_url = 'https://www.drugbank.ca/biotech_drugs?utf8=%E2%9C%93&approved=0&nutraceutical=0&illicit=0&investigational=0&withdrawn=0&experimental=0&us=0&ca=0&eu=0&Protein+Based+Therapies=0&Nucleic+Acid+Based+Therapies=0&Gene+Therapies=0&Vaccines=0&Allergenics=0&Cell+transplant+therapies=0&commit=Apply+Filter&page='
        soup = download(basic_url, i + 1, num_retries=150)
        for j in soup.select('.name-value strong a'):
            url_id.append(j.attrs['href'].split('/')[-1])
    return url_id
def identification(soup):
    # Parse the page's Identification <dl> block into a {field: value}
    # dict, pairing each <dt> label with its <dd> value. zip() keeps the
    # pairing safe if the two lists ever differ in length.
    dl = soup.find('dl')
    labels = [i.text for i in dl.findAll('dt')]
    values = [i.text for i in dl.findAll('dd')]
    d = {}
    for label, value in zip(labels, values):
        d[label] = value
    return d
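# Illustrative shape of the dict identification() returns (field names
# follow the page's Identification section; values here are hypothetical):
#   {'Name': 'Eperisone', 'Type': 'Small Molecule', 'SMILES': 'CCc1ccc(...)cc1'}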
#Eperisone
def interactions(url_id, name):
    # Collect drug-drug interactions for one drug from DrugBank's paged
    # JSON endpoint (100 records per request). Each pair is written into
    # the `event` table via the global cursor `cur`, and the partner IDs
    # accumulate in `interaction` as a '|'-joined string.
    interaction = ''
    #url='https://www.drugbank.ca/drugs/'+url_id+'/drug_interactions.json?group=approved&'
    url = 'https://www.drugbank.ca/drugs/' + url_id + '/drug_interactions.json?&'
    try:
        length = requests.get(url).json()['recordsTotal']
        for j in range(math.floor(length / 100) + 1):
            new_url = url + 'start=' + str(100 * j) + '&length=100'
            for i in requests.get(new_url).json()['data']:
                interaction_id = bs(i[0], 'lxml').find('a').attrs['href'].split('/')[-1] + '|'
                name2 = i[0].split('<')[1]
                name2 = name2.split('>')[-1]
                interaction += interaction_id
                event.append(i[1])
                # Create the table named `event` in advance so this insert works.
                cur.execute("insert into event(id1,name1,id2,name2,interaction) values (?,?,?,?,?)",
                            (url_id, name, interaction_id[:-1], name2, i[1]))
        interaction = interaction[:-1]
    except Exception:
        # Drugs with no interaction records (or a failed request) fall through.
        pass
    return interaction, event
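# Assumed shape of the paged JSON the endpoint returns (illustrative and
# reverse-engineered, not an official schema):
#   {"recordsTotal": 523,
#    "data": [["<a href=\"/drugs/DB00001\">Lepirudin</a>", "interaction text"], ...]}
# i[0] holds the HTML link to the partner drug, i[1] the interaction description.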
def head_attr(soup):
    # Gather the Targets / Enzymes / Carriers / Transporters sections:
    # each section header (h3) maps to a '|'-joined list of entity IDs.
    d = {}
    try:
        for i in soup.select('.bond-list-container'):
            attr = ''
            for j in i.select('.bond-list strong a'):
                attr += j.attrs['href'].split('/')[-1] + '|'
            d[i.h3.text] = attr[:-1]
    except Exception:
        pass
    return d
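# Hypothetical head_attr() result for one drug page, mapping each section
# header to a '|'-joined list of DrugBank entity IDs (IDs are illustrative):
#   {'Targets': 'BE0000048|BE0000077', 'Enzymes': 'BE0000262'}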
conn=sqlite3.connect("Drug.db")
cur=conn.cursor()
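# The script inserts into the `event` and `drug` tables but never creates
# them. A minimal schema sketch inferred from the INSERT statements below;
# the TEXT column types are assumptions, adjust as needed:
cur.execute("""create table if not exists event(
    id1 TEXT, name1 TEXT, id2 TEXT, name2 TEXT, interaction TEXT)""")
cur.execute("""create table if not exists drug(
    id TEXT, name TEXT, interaction TEXT, smile TEXT,
    target TEXT, enzyme TEXT, carrier TEXT, transporter TEXT)""")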
'''
For all the drugs, set the parameters of get_id to (460, 93): there are 460 pages
of Small Molecule Drugs and 93 pages of Biotech Drugs.
For the approved drugs only, set them to (106, 57).
'''
#url_id=get_id(106,56)
'''
url_id is a series of the drug IDs you need to fetch, so you can also list the
drug IDs you need in an Excel file. Here is an example: we read url_id from drug_list.xlsx.
'''
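# drug_list.xlsx is expected to hold one column of DrugBank IDs with no
# header row. A minimal sketch for building such a file (the IDs are
# illustrative; writing .xlsx requires openpyxl):
#   pd.DataFrame(['DB00001', 'DB00006']).to_excel(
#       'drug_list.xlsx', header=False, index=False)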
basic_url = 'https://www.drugbank.ca/drugs/'
event = []
drug = pd.read_excel("drug_list.xlsx", header=None)
url_id = drug.iloc[:, 0]
for i in url_id:
    soup = download(basic_url, i, num_retries=150)
    # Skip pages without a parsable Identification block.
    try:
        d_iden = identification(soup)
    except Exception:
        continue
    name = d_iden.get('Name', '')
    smile = d_iden.get('SMILES', '')
    if smile == 'Not Available':
        smile = ''
    interaction, event = interactions(i, name)
    # Missing sections simply become empty strings.
    d_attr = head_attr(soup)
    target = d_attr.get('Targets', '')
    enzyme = d_attr.get('Enzymes', '')
    carrier = d_attr.get('Carriers', '')
    transporter = d_attr.get('Transporters', '')
    # Create a table named `drug` first, so that this INSERT works
    # (see the schema sketch above).
    cur.execute("insert into drug(id,name,interaction,smile,target,enzyme,carrier,transporter) values (?,?,?,?,?,?,?,?)",
                (i, name, interaction, smile, target, enzyme, carrier, transporter))
conn.commit()
conn.close()
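# Optional sanity check after the crawl (a sketch; reopens the database):
#   conn = sqlite3.connect("Drug.db")
#   print(pd.read_sql("select count(*) as n from drug", conn))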