-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathScienceDirect.py
186 lines (153 loc) · 5.91 KB
/
ScienceDirect.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#!/usr/local/bin/python3
# -*- coding: utf-8 -*-
#
# Information about a ScienceDirect reference / Citation
import re
import alert
import base64
import html.parser
# Sender address used to recognise ScienceDirect alert emails (presumably
# matched by code outside this file -- verify against callers).
# NOTE(review): the literal below looks like a redacted placeholder rather
# than a real address -- confirm the intended value.
SENDER = "[email protected]"
# Base URL for an article page reached through the JHU library proxy; the
# article's PII key is appended to it.
SD_JHU_PII_URL = "http://www.sciencedirect.com.proxy1.library.jhu.edu/science/article/pii/"
# Base URL for an article page on sciencedirect.com itself; the article's
# PII key is appended to it.
SD_PII_URL = "http://www.sciencedirect.com/science/article/pii/"
class Paper(alert.PaperAlert, html.parser.HTMLParser):
    """
    Describe a particular paper being reported by ScienceDirect.

    All fields are plain strings that start out empty.  Within this module
    the Email parser fills in title, authors, source, url and hopkinsUrl;
    doiUrl, doi and search are left empty.
    """

    def __init__(self):
        """
        Create an empty paper; every text field starts as "".
        """
        # NOTE(review): super(alert.PaperAlert, self) starts the MRO lookup
        # *after* PaperAlert, so PaperAlert's own __init__ is skipped here.
        # Kept as-is because PaperAlert's constructor is not visible from
        # this file -- confirm whether super().__init__() was intended.
        super(alert.PaperAlert, self).__init__()
        html.parser.HTMLParser.__init__(self)
        self.title = ""
        self.authors = ""
        self.source = ""
        self.doiUrl = ""
        self.doi = ""
        self.url = ""
        self.hopkinsUrl = ""
        self.search = ""

    def getFirstAuthorLastName(self):
        """
        Return the last word of the first comma-separated author, or None.

        Example author list:
            Dita Musalkova, Jakub Minks, Gabriela Storkanova, Lenka Dvorakova
        for which "Musalkova" is returned.

        This will mess up on multi-word surnames such as "van Drysdale".

        Returns None when there are no authors or when the first entry
        contains no name at all.
        """
        if not self.authors:
            return None
        firstAuthor = self.authors.split(",")[0]
        # split() with no separator splits on any run of whitespace,
        # including non-breaking spaces, and ignores leading/trailing
        # whitespace.  Splitting on a literal " " returned "" for
        # "Name Surname " and the whole name for "Name\xa0Surname".
        nameParts = firstAuthor.split()
        if not nameParts:
            return None
        return nameParts[-1]

    def getFirstAuthorLastNameLower(self):
        """
        Return the first author's last name in lower case, or None if there
        is no first author.
        """
        firstAuthor = self.getFirstAuthorLastName()
        if firstAuthor:
            firstAuthor = firstAuthor.lower()
        return firstAuthor
class Email(alert.Alert, html.parser.HTMLParser):
    """
    All the information in a Science Direct Email alert.

    Parse the HTML email body from ScienceDirect.  The body may be reporting
    more than one paper; each one becomes a Paper in self.papers.

    Parsing is a small state machine driven by the HTMLParser callbacks:
    the in*/after* boolean attributes record which part of a paper entry
    (title link, title text, source, authors) or of the search description
    the parser is currently inside.
    """

    # Text that introduces the search description.  An optional leading
    # "More... " is accepted in front of "Access the/all N new result(s)".
    searchStartRe = re.compile(r'(More\.\.\. )*Access (the|all \d+) new result[s]*')

    def __init__(self, email):
        """
        Decode and parse one alert email.

        email must provide getBodyText(), returning the base64-encoded HTML
        body.  The body is decoded as UTF-8 and fed to the parser right away,
        so papers and search are populated when the constructor returns.
        """
        # NOTE(review): alert.Alert.__init__ is not called here; its
        # signature is not visible from this file -- confirm that is intended.
        html.parser.HTMLParser.__init__(self)

        self.papers = []
        self.search = ""
        self.currentPaper = None

        # Parser state flags.
        self.inSearch = False
        self.inTitleLink = False
        self.inTitleText = False
        self.inTitleTextSpanDepth = 0
        self.afterTitleBeforeSource = False
        self.inSource = False
        self.inAuthors = False

        # SD email body content is base64 encoded. Decode it.
        emailBodyText = base64.standard_b64decode(email.getBodyText())
        self.feed(emailBodyText.decode('utf-8'))  # process the HTML body text.

    def handle_data(self, data):
        """
        Route a run of text to the search string or to the current paper,
        depending on the parser state.
        """
        data = data.strip()
        if Email.searchStartRe.match(data):
            self.inSearch = True
        elif self.inSearch:
            if data == '':
                # An empty text run terminates the search description.
                self.inSearch = False
            else:
                self.search += data.replace('quot;', '"')
        elif self.inTitleText:
            self.currentPaper.title += data + " "
        elif self.inSource:
            # Only the first text run after the <i> tag is the source.
            self.currentPaper.source = data
            self.inSource = False
        elif self.inAuthors:
            # With HTMLParser's default convert_charrefs=True, &nbsp; arrives
            # here as U+00A0 instead of going through handle_entityref, so
            # turn it into a plain space at this point.
            self.currentPaper.authors += data.replace("\xa0", " ")

    def handle_starttag(self, tag, attrs):
        """
        Update parser state on an opening tag.

        attrs is the list of (name, value) pairs supplied by HTMLParser.
        Attributes are looked up by name, so tags without attributes or with
        attributes in a different order are handled without raising.
        """
        # Build the lookup from the reversed list so that, for a duplicated
        # attribute, the first occurrence wins.
        attrMap = dict(reversed(attrs))
        cssClass = attrMap.get("class")

        if tag == "td" and cssClass == "txtcontent":
            # Paper has started; next tag is an anchor, and it has paper URL.
            # We now have a long URL that points to a public HTML version of
            # the paper.  We don't have a doi.  But we will have a title
            # shortly.
            #
            # Should we use the URL to get the DOI?  Or will sciencedirect
            # just always be title match?
            #
            # ScienceDirect has an API we could use to extract the DOI, or we
            # could pull it from the HTML page.
            #
            # For now, go with title only match.
            self.inTitleLink = True
            self.currentPaper = Paper()
            self.papers.append(self.currentPaper)
        elif tag == "a" and self.inTitleLink:
            piiArgPrefix = "_piikey="
            fullUrl = attrMap.get("href") or ""
            for urlArg in fullUrl.split("&"):
                if urlArg.startswith(piiArgPrefix):
                    piiKey = urlArg[len(piiArgPrefix):]
                    self.currentPaper.url = SD_PII_URL + piiKey
                    self.currentPaper.hopkinsUrl = SD_JHU_PII_URL + piiKey
                    break
            self.inTitleLink = False
        elif tag == "span" and cssClass == "artTitle":
            self.inTitleText = True
            self.inTitleTextSpanDepth = 1
        elif self.inTitleText and tag == "span":
            # Nested span inside the title; track depth so the matching
            # end tag does not close the title early.
            self.inTitleTextSpanDepth += 1
        elif tag == "i" and self.afterTitleBeforeSource:
            self.inSource = True
            self.afterTitleBeforeSource = False
        elif tag == "span" and cssClass == "authorTxt":
            self.inAuthors = True

    def handle_endtag(self, tag):
        """
        Close the title or author section when its span ends.
        """
        if self.inTitleText and tag == "span":
            self.inTitleTextSpanDepth -= 1
            if self.inTitleTextSpanDepth == 0:
                # Outermost title span closed: the title is complete.
                self.inTitleText = False
                self.afterTitleBeforeSource = True
                self.currentPaper.title = self.currentPaper.title.strip()
        elif self.inAuthors and tag == "span":
            self.inAuthors = False

    def handle_startendtag(self, tag, attrs):
        """
        Process tags like IMG and BR that don't have end tags.

        Deliberately does nothing, so self-closing tags never change the
        parser state.
        """

    def handle_entityref(self, name):
        """
        Having troubles with embedded &nbsp;'s in Author list.

        HTMLParser only calls this when it runs with convert_charrefs=False;
        under the default setting the same substitution is done in
        handle_data.
        """
        if name == "nbsp" and self.inAuthors:
            self.currentPaper.authors += " "

    def getPapers(self):
        """
        Return list of referencing papers in this alert.
        """
        return self.papers

    def getSearch(self):
        """
        Return the text identifying which search this alert is for.
        """
        return self.search