
Setting to account for papers added during crawl
rabdill committed Aug 27, 2018
1 parent 38dee66 commit 27071ad
Showing 1 changed file with 45 additions and 26 deletions: spider/spider.py
@@ -12,14 +12,32 @@
 import db
 import config
 
-TESTING = False # this is just for testing, so we don't crawl the whole site during development TODO delete
+TESTING = False
+# this is just for testing, so we don't crawl
+# the whole site during development TODO delete
+
+testing_pagecount = 50
+# how many pages to grab from a single collection
+# before bailing, if TESTING is True
+
+polite = True
+# whether to add pauses at several places in the crawl
+
+stop_on_recognized = True
+# whether to stop crawling once we've encountered a set
+# number of papers that we've already recorded. setting this
+# to 0 would make sense, except if papers are added to a
+# collection WHILE you're indexing it, the crawler dies early.
+# (if this is set to False, the crawler will go through every
+# single page of results for a collection, which is probably
+# wasteful.)
+
+recognized_limit = 20
+# if stop_on_recognized is True, how many papers we have
+# to recognize *in a row* before we assume that we've indexed
+# all the papers at that point in the chronology.
 
-polite = True # whether to add pauses at several places in the crawl
-stop_on_recognized = True # whether to stop crawling a collection once we
-# encounter a paper that's already been indexed, or
-# if every crawling session should look on every page
-# for unindexed papers.
 
 class Author(object):
   def __init__(self, given, surname):
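
The new settings implement a streak-based stopping rule: the crawler abandons a collection only after recognizing recognized_limit already-indexed papers in a row, so a few papers added to a collection mid-crawl can no longer end the session early. A minimal sketch of the rule in isolation; stop_after_streak and already_recorded are hypothetical names standing in for the loop and the Article.record call added below:

    def stop_after_streak(papers, already_recorded, recognized_limit=20):
      # Sketch only: count consecutive already-indexed papers;
      # any new paper resets the streak.
      consecutive_recognized = 0
      for paper in papers:
        if already_recorded(paper):
          consecutive_recognized += 1
          if consecutive_recognized > recognized_limit:
            return  # confidently past the frontier of new papers
        else:
          consecutive_recognized = 0
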
@@ -110,9 +128,7 @@ def record(self, connection, spider):
       if responses[0][0] == self.url:
         print("Found article already: {}".format(self.title))
         connection.db.commit()
-        if stop_on_recognized:
-          return False
-        return True
+        return False
       else:
         cursor.execute("UPDATE articles SET url=%s, title=%s, collection=%s WHERE doi=%s RETURNING id;", (self.url, self.title, self.collection, self.doi))
         print("Updated revision for article DOI {}: {}".format(self.doi, self.title))
@@ -245,8 +261,13 @@ def find_record_new_articles(self, collection):
     print("\n---\n\nFetching page 0 in {}".format(collection))
     r = self.session.get("https://www.biorxiv.org/collection/{}".format(collection))
     results = pull_out_articles(r.html, collection)
-    keep_going = self.record_articles(results)
-    if not keep_going: return # if we already knew about the first entry, we're done
+    consecutive_recognized = 0
+    for x in results:
+      if not x.record(self.connection, self): # TODO: don't pass the whole damn spider here
+        consecutive_recognized += 1
+        if consecutive_recognized > recognized_limit: return
+      else:
+        consecutive_recognized = 0
 
     pagecount = testing_pagecount if TESTING else determine_page_count(r.html) # Also just for testing TODO delete
     for p in range(1, pagecount): # iterate through pages
@@ -255,8 +276,12 @@ def find_record_new_articles(self, collection):
       print("\n---\n\nFetching page {} in {}".format(p, collection)) # pages are zero-indexed
       r = self.session.get("https://www.biorxiv.org/collection/{}?page={}".format(collection, p))
       results = pull_out_articles(r.html, collection)
-      keep_going = self.record_articles(results)
-      if not keep_going: break # If we encounter a recognized article, we're done
+      for x in results:
+        if not x.record(self.connection, self):
+          consecutive_recognized += 1
+          if consecutive_recognized > recognized_limit: return
+        else:
+          consecutive_recognized = 0
 
   def fetch_abstracts(self):
     with self.connection.db.cursor() as cursor:
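
The same counting block now appears twice in find_record_new_articles (once for page 0, once inside the page loop), and consecutive_recognized deliberately carries across page boundaries, so a streak of recognized papers can span the end of one page and the start of the next. A sketch of how the duplication could be factored out later; _tally_recognized is a hypothetical helper, not in this commit, and it keeps the spider-passing pattern the TODO complains about:

    def _tally_recognized(self, results, consecutive_recognized):
      # Illustrative helper: returns (updated streak, whether to stop).
      # Relies on the module-level recognized_limit setting above.
      for x in results:
        if not x.record(self.connection, self):
          consecutive_recognized += 1
          if consecutive_recognized > recognized_limit:
            return consecutive_recognized, True
        else:
          consecutive_recognized = 0
      return consecutive_recognized, False
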
@@ -444,7 +469,7 @@ def _rank_articles_alltime(self):
       interval = 1000 # assuming we can pass the cursor into it without breaking everything
       while True:
         end = start + interval if start + interval < len(params) else len(params)
-        print("Recording ranks {} through {}...".format(start, end-1))
+        print("Recording ranks {} through {}.".format(start, end-1))
         cursor.executemany(sql, params[start:end])
         start += interval
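
Each ranking method below writes its results in batches of 1,000 rows with executemany. The exit check for the while True loop presumably sits in lines collapsed out of this diff, since the portion shown would never terminate on its own. Under that assumption, the batching pattern looks roughly like this standalone sketch (record_in_batches is illustrative, not the project's code):

    def record_in_batches(cursor, sql, params, interval=1000):
      # Write params in fixed-size slices so no single executemany
      # call grows unboundedly large.
      start = 0
      while start < len(params):
        end = min(start + interval, len(params))
        print("Recording ranks {} through {}.".format(start, end - 1))
        cursor.executemany(sql, params[start:end])
        start = end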

@@ -495,7 +520,7 @@ def _rank_articles_hotness(self):
       interval = 1000
       while True:
         end = start + interval if start + interval < len(params) else len(params)
-        print("Recording ranks {} through {}...".format(start, end-1))
+        print("Recording ranks {} through {}.".format(start, end-1))
         cursor.executemany(sql, params[start:end])
         start += interval

@@ -526,7 +551,7 @@ def _rank_articles_categories(self, category):
       interval = 1000
       while True:
         end = start + interval if start + interval < len(params) else len(params)
-        print("Recording ranks {} through {}...".format(start, end-1))
+        print("Recording ranks {} through {}.".format(start, end-1))
         cursor.executemany(sql, params[start:end])
         start += interval

@@ -544,7 +569,7 @@ def _rank_articles_bouncerate(self):
       interval = 1000
       while True:
         end = start + interval if start + interval < len(params) else len(params)
-        print("Recording ranks {} through {}...".format(start, end-1))
+        print("Recording ranks {} through {}.".format(start, end-1))
         cursor.executemany(sql, params[start:end])
         start += interval

@@ -566,7 +591,7 @@ def _rank_articles_ytd(self):
       interval = 1000
       while True:
         end = start + interval if start + interval < len(params) else len(params)
-        print("Recording ranks {} through {}...".format(start, end-1))
+        print("Recording ranks {} through {}.".format(start, end-1))
         cursor.executemany(sql, params[start:end])
         start += interval

@@ -598,7 +623,7 @@ def _rank_articles_month(self):
       interval = 1000
       while True:
         end = start + interval if start + interval < len(params) else len(params)
-        print("Recording ranks {} through {}...".format(start, end-1))
+        print("Recording ranks {} through {}.".format(start, end-1))
         cursor.executemany(sql, params[start:end])
         start += interval

@@ -648,7 +673,7 @@ def _rank_authors_alltime(self):
       interval = 1000
       while True:
         end = start + interval if start + interval < len(params) else len(params)
-        print("Recording ranks {} through {}...".format(start, end-1))
+        print("Recording ranks {} through {}.".format(start, end-1))
         cursor.executemany(sql, params[start:end])
         start += interval

@@ -665,12 +690,6 @@ def update_article(self, article_id, abstract):
     self.connection.db.commit()
     print("Recorded abstract for ID {}".format(article_id, abstract))
 
-  def record_articles(self, articles):
-    # return value is whether we encountered any articles we had already
-    for x in articles:
-      if not x.record(self.connection, self): return False # TODO: don't pass the whole damn spider here
-    return True
-
   def calculate_vectors(self):
     print("Calculating vectors...")
     with self.connection.db.cursor() as cursor:
