Merge pull request #41 from ferru97/DEV
New version v1.2
ferru97 authored Jun 2, 2021
2 parents 6d34417 + c54914c commit 8a3c5df
Showing 13 changed files with 222 additions and 172 deletions.
88 changes: 44 additions & 44 deletions PyPaperBot/Crossref.py
@@ -9,7 +9,7 @@


def getBibtex(DOI):
try:
try:
url_bibtex = "http://api.crossref.org/works/" + DOI + "/transform/application/x-bibtex"
x = requests.get(url_bibtex)
return str(x.text)
@@ -20,63 +20,63 @@ def getBibtex(DOI):
def getPapersInfoFromDOIs(DOI, restrict):
paper_found = Paper()
paper_found.DOI = DOI

try:
paper = get_entity(DOI, EntityType.PUBLICATION, OutputType.JSON)
if paper!=None and len(paper)>0:
if "title" in paper:
paper_found.title = paper["title"][0]
if "short-container-title" in paper and len(paper["short-container-title"])>0:
paper_found.jurnal = paper["short-container-title"][0]
if restrict==None or restrict!=1:

if restrict==None or restrict!=1:
paper_found.setBibtex(getBibtex(paper_found.DOI))
except:
print("Paper not found "+DOI)

return paper_found


#Get paper information from Crossref and return a list of Paper
def getPapersInfo(papers, scholar_search_link, restrict):
def getPapersInfo(papers, scholar_search_link, restrict, scholar_results):
papers_return = []
num = 1
for paper in papers:
title = paper['title']
queries = {'query.bibliographic': title.lower(),'sort':'relevance',"select":"DOI,title,deposited,author,short-container-title"}

print("Searching paper {} of {} on Crossref...".format(num,len(papers)))
num += 1

found_timestamp = 0
paper_found = Paper(title,paper['link'],scholar_search_link, paper['cites'], paper['link_pdf'], paper['year'], paper['authors'])
while True:
try:
for el in iterate_publications_as_json(max_results=30, queries=queries):

el_date = 0
if "deposited" in el and "timestamp" in el["deposited"]:
el_date = int(el["deposited"]["timestamp"])

if (paper_found.DOI==None or el_date>found_timestamp) and "title" in el and similarStrings(title.lower() ,el["title"][0].lower())>0.75:
found_timestamp = el_date

if "DOI" in el:
paper_found.DOI = el["DOI"].strip().lower()
if "short-container-title" in el and len(el["short-container-title"])>0:
paper_found.jurnal = el["short-container-title"][0]

if restrict==None or restrict!=1:
paper_found.setBibtex(getBibtex(paper_found.DOI))

break
except ConnectionError as e:
print("Wait 10 seconds and try again...")
time.sleep(10)

papers_return.append(paper_found)
time.sleep(random.randint(1,10))
while num <= scholar_results:
title = paper['title']
queries = {'query.bibliographic': title.lower(),'sort':'relevance',"select":"DOI,title,deposited,author,short-container-title"}

print("Searching paper {} of {} on Crossref...".format(num,scholar_results))
num += 1

found_timestamp = 0
paper_found = Paper(title,paper['link'],scholar_search_link, paper['cites'], paper['link_pdf'], paper['year'], paper['authors'])
while True:
try:
for el in iterate_publications_as_json(max_results=30, queries=queries):

el_date = 0
if "deposited" in el and "timestamp" in el["deposited"]:
el_date = int(el["deposited"]["timestamp"])

if (paper_found.DOI==None or el_date>found_timestamp) and "title" in el and similarStrings(title.lower() ,el["title"][0].lower())>0.75:
found_timestamp = el_date

if "DOI" in el:
paper_found.DOI = el["DOI"].strip().lower()
if "short-container-title" in el and len(el["short-container-title"])>0:
paper_found.jurnal = el["short-container-title"][0]

if restrict==None or restrict!=1:
paper_found.setBibtex(getBibtex(paper_found.DOI))

break
except ConnectionError as e:
print("Wait 10 seconds and try again...")
time.sleep(10)

papers_return.append(paper_found)

time.sleep(random.randint(1,10))

return papers_return
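
The hunk above adds a scholar_results argument to getPapersInfo, bounding how many of the parsed Scholar results are looked up on Crossref. Below is a minimal usage sketch, assuming PyPaperBot v1.2 is importable as a package; the paper dict and its values are placeholders for illustration, not data from this commit.

from PyPaperBot.Crossref import getPapersInfo

# One result shaped like the dicts produced by schoolarParser (see HTMLparsers.py below)
papers = [{
    "title": "Attention Is All You Need",   # placeholder title
    "link": None,
    "cites": None,
    "link_pdf": None,
    "year": "2017",
    "authors": None,
}]

# restrict=None (i.e. not 1) also fetches the BibTeX entry for each matched DOI;
# scholar_results=1 caps the Crossref lookups at a single result.
found = getPapersInfo(papers, scholar_search_link=None, restrict=None, scholar_results=1)
print(found[0].DOI, found[0].jurnal)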
43 changes: 21 additions & 22 deletions PyPaperBot/Downloader.py
@@ -22,7 +22,7 @@ def setSciHubUrl():
if found:
print("\nUsing {} as Sci-Hub instance".format(NetInfo.SciHub_URL))
else:
print("\nNo working Sci-Hub instance found!\nIf in your country Sci-Hub is not available consider using a VPN")
print("\nNo working Sci-Hub instance found!\nIf in your country Sci-Hub is not available consider using a VPN or a proxy")
NetInfo.SciHub_URL = "https://sci-hub.st"


@@ -32,66 +32,65 @@ def getSaveDir(folder, fname):
while path.exists(dir_):
n += 1
dir_ = path.join(folder, "("+str(n)+")"+fname)

return dir_

def saveFile(file_name,content, paper,dwn_source):
def saveFile(file_name,content, paper,dwn_source):
f = open(file_name, 'wb')
f.write(content)
f.close()

paper.downloaded = True
paper.downloadedFrom = dwn_source


def downloadPapers(papers, dwnl_dir, num_limit, SciHub_URL=None):

def downloadPapers(papers, dwnl_dir, num_limit, scholar_results, SciHub_URL=None):
def URLjoin(*args):
return "/".join(map(lambda x: str(x).rstrip('/'), args))

NetInfo.SciHub_URL = SciHub_URL
if NetInfo.SciHub_URL==None:
setSciHubUrl()

num_downloaded = 0
paper_number = 1
paper_files = []
for p in papers:
if p.canBeDownloaded() and (num_limit==None or num_downloaded<num_limit):
print("Download {} of {} -> {}".format(paper_number, len(papers), p.title))
for p in papers:
if p.canBeDownloaded() and (num_limit==None or num_downloaded<num_limit):
print("Download {} of {} -> {}".format(paper_number, scholar_results, p.title))
paper_number += 1

pdf_dir = getSaveDir(dwnl_dir, p.getFileName())

faild = 0
while p.downloaded==False and faild!=4:
while p.downloaded==False and faild!=4:
try:
dwn_source = 1 #1 scihub 2 scholar
dwn_source = 1 #1 scihub 2 scholar
if faild==0 and p.DOI!=None:
url = URLjoin(NetInfo.SciHub_URL, p.DOI)
if faild==1 and p.scholar_link!=None:
url = URLjoin(NetInfo.SciHub_URL, p.scholar_link)
url = URLjoin(NetInfo.SciHub_URL, p.scholar_link)
if faild==2 and p.scholar_link!=None and p.scholar_link[-3:]=="pdf":
url = p.scholar_link
dwn_source = 2
if faild==3 and p.pdf_link!=None:
url = p.pdf_link
dwn_source = 2
dwn_source = 2

if url!="":
r = requests.get(url, headers=NetInfo.HEADERS)
content_type = r.headers.get('content-type')

if dwn_source==1 and 'application/pdf' not in content_type:
time.sleep(random.randint(1,5))

pdf_link = getSchiHubPDF(r.text)
if(pdf_link != None):
r = requests.get(pdf_link, headers=NetInfo.HEADERS)
content_type = r.headers.get('content-type')

if 'application/pdf' in content_type:
paper_files.append(saveFile(pdf_dir,r.content,p,dwn_source))
except Exception:
pass

faild += 1
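
downloadPapers now also receives scholar_results, which is only used for the "Download x of y" progress message; num_limit still caps how many PDFs are actually fetched. A rough sketch of a direct call follows, assuming Paper lives in PyPaperBot.Paper (an assumption about the package layout) and using a placeholder DOI and output folder.

import os
from PyPaperBot.Paper import Paper          # assumed module path
from PyPaperBot.Downloader import downloadPapers

# Build one Paper by hand instead of going through Scholar/Crossref (placeholder metadata)
p = Paper("Deep learning", None, None, None, None, "2015", None)
p.DOI = "10.1038/nature14539"               # placeholder DOI

out_dir = os.path.join(os.getcwd(), "downloads")
os.makedirs(out_dir, exist_ok=True)

# num_limit=None -> no cap on downloads; scholar_results=1 -> progress shown as "Download 1 of 1"
downloadPapers([p], out_dir, num_limit=None, scholar_results=1)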
24 changes: 12 additions & 12 deletions PyPaperBot/HTMLparsers.py
@@ -10,7 +10,7 @@ def schoolarParser(html):
result = []
soup = BeautifulSoup(html, "html.parser")
for element in soup.findAll("div", class_="gs_r gs_or gs_scl"):
if isBook(element) == False:
if isBook(element) == False:
title = None
link = None
link_pdf = None
@@ -19,7 +19,7 @@ def schoolarParser(html):
authors = None
for h3 in element.findAll("h3", class_="gs_rt"):
found = False
for a in h3.findAll("a"):
for a in h3.findAll("a"):
if found == False:
title = a.text
link = a.get("href")
@@ -48,16 +48,16 @@ def schoolarParser(html):
year = None
else:
year = str(year)
if title!=None:
if title!=None:
result.append({
'title' : title,
'link' : link,
'cites' : cites,
'link_pdf' : link_pdf,
'year' : year,
'authors' : authors})
return result
return result



def isBook(tag):
@@ -72,19 +72,19 @@ def isBook(tag):
def getSchiHubPDF(html):
result = None
soup = BeautifulSoup(html, "html.parser")

iframe = soup.find(id='pdf')
plugin = soup.find(id='plugin')

if iframe!=None:
result = iframe.get("src")

if plugin!=None and result==None:
result = plugin.get("src")

if result!=None and result[0]!="h":
result = "https:"+result

return result

def SciHubUrls(html):
@@ -96,6 +96,6 @@ def SciHubUrls(html):
link = a.get("href")
if link.startswith("https://sci-hub.") or link.startswith("http://sci-hub."):
result.append(link)

return result
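
getSchiHubPDF looks for an element with id "pdf" or "plugin" in a Sci-Hub result page and returns its src attribute, prefixing "https:" when the scheme is missing. A small sketch of driving it directly, with a placeholder mirror URL and DOI (the package itself sends NetInfo.HEADERS instead of the bare User-Agent used here):

import requests
from PyPaperBot.HTMLparsers import getSchiHubPDF

page = requests.get("https://sci-hub.st/10.1038/nature14539",   # placeholder mirror + DOI
                    headers={"User-Agent": "Mozilla/5.0"})
pdf_url = getSchiHubPDF(page.text)
if pdf_url is not None:
    print("Direct PDF link:", pdf_url)
else:
    print("No embedded PDF found in the page")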
