Merge pull request #41 from ferru97/DEV
New version v1.2
ferru97 authored Jun 2, 2021
2 parents 6d34417 + c54914c commit 8a3c5df
Showing 13 changed files with 222 additions and 172 deletions.
88 changes: 44 additions & 44 deletions PyPaperBot/Crossref.py
@@ -9,7 +9,7 @@


def getBibtex(DOI):
try:
try:
url_bibtex = "http://api.crossref.org/works/" + DOI + "/transform/application/x-bibtex"
x = requests.get(url_bibtex)
return str(x.text)
@@ -20,63 +20,63 @@ def getBibtex(DOI):
def getPapersInfoFromDOIs(DOI, restrict):
paper_found = Paper()
paper_found.DOI = DOI

try:
paper = get_entity(DOI, EntityType.PUBLICATION, OutputType.JSON)
if paper!=None and len(paper)>0:
if "title" in paper:
paper_found.title = paper["title"][0]
if "short-container-title" in paper and len(paper["short-container-title"])>0:
paper_found.jurnal = paper["short-container-title"][0]
if restrict==None or restrict!=1:

if restrict==None or restrict!=1:
paper_found.setBibtex(getBibtex(paper_found.DOI))
except:
print("Paper not found "+DOI)

return paper_found


#Get paper information from Crossref and return a list of Paper
def getPapersInfo(papers, scholar_search_link, restrict):
def getPapersInfo(papers, scholar_search_link, restrict, scholar_results):
papers_return = []
num = 1
for paper in papers:
title = paper['title']
queries = {'query.bibliographic': title.lower(),'sort':'relevance',"select":"DOI,title,deposited,author,short-container-title"}

print("Searching paper {} of {} on Crossref...".format(num,len(papers)))
num += 1

found_timestamp = 0
paper_found = Paper(title,paper['link'],scholar_search_link, paper['cites'], paper['link_pdf'], paper['year'], paper['authors'])
while True:
try:
for el in iterate_publications_as_json(max_results=30, queries=queries):

el_date = 0
if "deposited" in el and "timestamp" in el["deposited"]:
el_date = int(el["deposited"]["timestamp"])

if (paper_found.DOI==None or el_date>found_timestamp) and "title" in el and similarStrings(title.lower() ,el["title"][0].lower())>0.75:
found_timestamp = el_date

if "DOI" in el:
paper_found.DOI = el["DOI"].strip().lower()
if "short-container-title" in el and len(el["short-container-title"])>0:
paper_found.jurnal = el["short-container-title"][0]

if restrict==None or restrict!=1:
paper_found.setBibtex(getBibtex(paper_found.DOI))

break
except ConnectionError as e:
print("Wait 10 seconds and try again...")
time.sleep(10)

papers_return.append(paper_found)
time.sleep(random.randint(1,10))
while num <= scholar_results:
title = paper['title']
queries = {'query.bibliographic': title.lower(),'sort':'relevance',"select":"DOI,title,deposited,author,short-container-title"}

print("Searching paper {} of {} on Crossref...".format(num,scholar_results))
num += 1

found_timestamp = 0
paper_found = Paper(title,paper['link'],scholar_search_link, paper['cites'], paper['link_pdf'], paper['year'], paper['authors'])
while True:
try:
for el in iterate_publications_as_json(max_results=30, queries=queries):

el_date = 0
if "deposited" in el and "timestamp" in el["deposited"]:
el_date = int(el["deposited"]["timestamp"])

if (paper_found.DOI==None or el_date>found_timestamp) and "title" in el and similarStrings(title.lower() ,el["title"][0].lower())>0.75:
found_timestamp = el_date

if "DOI" in el:
paper_found.DOI = el["DOI"].strip().lower()
if "short-container-title" in el and len(el["short-container-title"])>0:
paper_found.jurnal = el["short-container-title"][0]

if restrict==None or restrict!=1:
paper_found.setBibtex(getBibtex(paper_found.DOI))

break
except ConnectionError as e:
print("Wait 10 seconds and try again...")
time.sleep(10)

papers_return.append(paper_found)

time.sleep(random.randint(1,10))

return papers_return
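
The hunk above adds a scholar_results argument to getPapersInfo, bounding how many of the parsed Scholar results are looked up on Crossref. Below is a minimal usage sketch, assuming PyPaperBot v1.2 is importable as a package; the paper dict and its values are placeholders for illustration, not data from this commit.

from PyPaperBot.Crossref import getPapersInfo

# One result shaped like the dicts produced by schoolarParser (see HTMLparsers.py below)
papers = [{
    "title": "Attention Is All You Need",   # placeholder title
    "link": None,
    "cites": None,
    "link_pdf": None,
    "year": "2017",
    "authors": None,
}]

# restrict=None (i.e. not 1) also fetches the BibTeX entry for each matched DOI;
# scholar_results=1 caps the Crossref lookups at a single result.
found = getPapersInfo(papers, scholar_search_link=None, restrict=None, scholar_results=1)
print(found[0].DOI, found[0].jurnal)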
43 changes: 21 additions & 22 deletions PyPaperBot/Downloader.py
@@ -22,7 +22,7 @@ def setSciHubUrl():
if found:
print("\nUsing {} as Sci-Hub instance".format(NetInfo.SciHub_URL))
else:
print("\nNo working Sci-Hub instance found!\nIf in your country Sci-Hub is not available consider using a VPN")
print("\nNo working Sci-Hub instance found!\nIf in your country Sci-Hub is not available consider using a VPN or a proxy")
NetInfo.SciHub_URL = "https://sci-hub.st"


@@ -32,66 +32,65 @@ def getSaveDir(folder, fname):
while path.exists(dir_):
n += 1
dir_ = path.join(folder, "("+str(n)+")"+fname)

return dir_

def saveFile(file_name,content, paper,dwn_source):
def saveFile(file_name,content, paper,dwn_source):
f = open(file_name, 'wb')
f.write(content)
f.close()

paper.downloaded = True
paper.downloadedFrom = dwn_source


def downloadPapers(papers, dwnl_dir, num_limit, SciHub_URL=None):

def downloadPapers(papers, dwnl_dir, num_limit, scholar_results, SciHub_URL=None):
def URLjoin(*args):
return "/".join(map(lambda x: str(x).rstrip('/'), args))

NetInfo.SciHub_URL = SciHub_URL
if NetInfo.SciHub_URL==None:
setSciHubUrl()

num_downloaded = 0
paper_number = 1
paper_files = []
for p in papers:
if p.canBeDownloaded() and (num_limit==None or num_downloaded<num_limit):
print("Download {} of {} -> {}".format(paper_number, len(papers), p.title))
for p in papers:
if p.canBeDownloaded() and (num_limit==None or num_downloaded<num_limit):
print("Download {} of {} -> {}".format(paper_number, scholar_results, p.title))
paper_number += 1

pdf_dir = getSaveDir(dwnl_dir, p.getFileName())

faild = 0
while p.downloaded==False and faild!=4:
while p.downloaded==False and faild!=4:
try:
dwn_source = 1 #1 scihub 2 scholar
dwn_source = 1 #1 scihub 2 scholar
if faild==0 and p.DOI!=None:
url = URLjoin(NetInfo.SciHub_URL, p.DOI)
if faild==1 and p.scholar_link!=None:
url = URLjoin(NetInfo.SciHub_URL, p.scholar_link)
url = URLjoin(NetInfo.SciHub_URL, p.scholar_link)
if faild==2 and p.scholar_link!=None and p.scholar_link[-3:]=="pdf":
url = p.scholar_link
dwn_source = 2
if faild==3 and p.pdf_link!=None:
url = p.pdf_link
dwn_source = 2
dwn_source = 2

if url!="":
r = requests.get(url, headers=NetInfo.HEADERS)
content_type = r.headers.get('content-type')

if dwn_source==1 and 'application/pdf' not in content_type:
time.sleep(random.randint(1,5))

pdf_link = getSchiHubPDF(r.text)
if(pdf_link != None):
r = requests.get(pdf_link, headers=NetInfo.HEADERS)
content_type = r.headers.get('content-type')

if 'application/pdf' in content_type:
paper_files.append(saveFile(pdf_dir,r.content,p,dwn_source))
except Exception:
pass

faild += 1
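
downloadPapers now also receives scholar_results, which is only used for the "Download x of y" progress message; num_limit still caps how many PDFs are actually fetched. A rough sketch of a direct call follows, assuming Paper lives in PyPaperBot.Paper (an assumption about the package layout) and using a placeholder DOI and output folder.

import os
from PyPaperBot.Paper import Paper          # assumed module path
from PyPaperBot.Downloader import downloadPapers

# Build one Paper by hand instead of going through Scholar/Crossref (placeholder metadata)
p = Paper("Deep learning", None, None, None, None, "2015", None)
p.DOI = "10.1038/nature14539"               # placeholder DOI

out_dir = os.path.join(os.getcwd(), "downloads")
os.makedirs(out_dir, exist_ok=True)

# num_limit=None -> no cap on downloads; scholar_results=1 -> progress shown as "Download 1 of 1"
downloadPapers([p], out_dir, num_limit=None, scholar_results=1)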
24 changes: 12 additions & 12 deletions PyPaperBot/HTMLparsers.py
@@ -10,7 +10,7 @@ def schoolarParser(html):
result = []
soup = BeautifulSoup(html, "html.parser")
for element in soup.findAll("div", class_="gs_r gs_or gs_scl"):
if isBook(element) == False:
if isBook(element) == False:
title = None
link = None
link_pdf = None
@@ -19,7 +19,7 @@ def schoolarParser(html):
authors = None
for h3 in element.findAll("h3", class_="gs_rt"):
found = False
for a in h3.findAll("a"):
for a in h3.findAll("a"):
if found == False:
title = a.text
link = a.get("href")
@@ -48,16 +48,16 @@ def schoolarParser(html):
year = None
else:
year = str(year)
if title!=None:
if title!=None:
result.append({
'title' : title,
'link' : link,
'cites' : cites,
'link_pdf' : link_pdf,
'year' : year,
'authors' : authors})
return result
return result



def isBook(tag):
@@ -72,19 +72,19 @@ def isBook(tag):
def getSchiHubPDF(html):
result = None
soup = BeautifulSoup(html, "html.parser")

iframe = soup.find(id='pdf')
plugin = soup.find(id='plugin')

if iframe!=None:
result = iframe.get("src")

if plugin!=None and result==None:
result = plugin.get("src")

if result!=None and result[0]!="h":
result = "https:"+result

return result

def SciHubUrls(html):
@@ -96,6 +96,6 @@ def SciHubUrls(html):
link = a.get("href")
if link.startswith("https://sci-hub.") or link.startswith("http://sci-hub."):
result.append(link)

return result
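
getSchiHubPDF looks for an element with id "pdf" or "plugin" in a Sci-Hub result page and returns its src attribute, prefixing "https:" when the scheme is missing. A small sketch of driving it directly, with a placeholder mirror URL and DOI (the package itself sends NetInfo.HEADERS instead of the bare User-Agent used here):

import requests
from PyPaperBot.HTMLparsers import getSchiHubPDF

page = requests.get("https://sci-hub.st/10.1038/nature14539",   # placeholder mirror + DOI
                    headers={"User-Agent": "Mozilla/5.0"})
pdf_url = getSchiHubPDF(page.text)
if pdf_url is not None:
    print("Direct PDF link:", pdf_url)
else:
    print("No embedded PDF found in the page")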
