# -*- coding: utf-8 -*-
#
# MIT License
#
# Copyright (c) 2020 Mike Simms
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import argparse
import logging
import os
import requests
import signal
import sys
import time
import traceback
import urllib
from bs4 import BeautifulSoup
from url_normalize import url_normalize
import CrawlerDatabase
import Keys
ERROR_LOG = 'error.log'
g_crawler = None # Allows us to get the main object from the signal handler
# Import things so that they have the same name regardless of whether we are using python2 or python3.
if sys.version_info[0] < 3:
import imp
import urlparse
from urlparse import urljoin
else:
from importlib.machinery import SourceFileLoader
import urllib.parse as urlparse
from urllib.parse import urljoin
def signal_handler(sig, frame):
"""Called when the interrupt signal is received."""
global g_crawler
print("Exiting...")
if g_crawler is not None:
g_crawler.running = False
print("Done")
def create_website_object(module_name):
"""Loads the module that implements website-specific logic and instantiates an object of the class that does the work."""
if module_name and os.path.isfile(module_name):
if sys.version_info[0] < 3:
module = imp.load_source("", module_name)
else:
module = SourceFileLoader('modname', module_name).load_module()
return module.create()
return None
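# A minimal sketch of the interface a website module is expected to expose, inferred
# from the calls made in this file (a module-level create(), plus is_interesting_url(),
# make_cookies(), and parse() on the returned object). The class name and return values
# below are hypothetical examples, not part of any real module.
#
#   class ExampleWebsite(object):
#       def is_interesting_url(self, url):
#           """Returns True if this module knows how to parse the given URL."""
#           return "example.com" in url
#       def make_cookies(self, url):
#           """Returns a dict of cookies to send with the request, or None."""
#           return None
#       def parse(self, url, soup):
#           """Extracts whatever content this module cares about from the BeautifulSoup document."""
#           return soup.title.string if soup.title else None
#
#   def create():
#       """Factory function called by create_website_object()."""
#       return ExampleWebsite()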
def get_url_root(url):
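"""Returns the hostname of the URL (e.g. 'www.example.com' for 'https://www.example.com/page'), or falls back to the path when no hostname can be parsed."""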
p = urlparse.urlparse(url)
if p.hostname and len(p.hostname) > 0:
return p.hostname
return p.path
class Crawler(object):
"""Class that implements the web crawler."""
def __init__(self, seed_url, rate_secs, website_objs, db, max_depth, min_revisit_secs, crawl_other_websites, verbose):
"""Constructor."""
self.seed_url = seed_url
self.rate_secs = rate_secs
self.website_objs = website_objs
self.db = db
self.max_depth = max_depth
self.min_revisit_secs = min_revisit_secs
self.crawl_other_websites = crawl_other_websites
self.verbose = verbose
self.running = True
self.last_crawl_time = 0 # The timestamp of the last time we visited a URL.
self.error_urls = [] # These URLs are giving us problems, skip them.
super(Crawler, self).__init__()
def verbose_print(self, msg):
"""Prints the message, but only when verbose output is enabled."""
if self.verbose:
print(msg)
def log_error(self, log_str):
"""Writes an error message to the log file."""
logger = logging.getLogger()
logger.error(log_str)
self.verbose_print(log_str)
def create_or_update_database(self, url, raw_content, extracted_content):
"""Creates or updates the database record for the given URL."""
if self.db is None:
return
# Let the user know what's going on.
self.verbose_print("Storing " + url + " in the database...")
# Update database.
page_from_db = self.db.retrieve_page(url)
now = time.time()
if page_from_db:
success = self.db.update_page(url, now, raw_content, extracted_content)
else:
success = self.db.create_page(url, now, raw_content, extracted_content)
if not success:
self.log_error("ERROR: Failed to store " + url + " in the database...")
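# The calls above, together with db.connect() in main(), imply the following interface
# for CrawlerDatabase.MongoDatabase. This is only a sketch inferred from this file;
# the actual implementation lives in CrawlerDatabase.py.
#
#   connect(addr)                                           # open the connection to MongoDB
#   retrieve_page(url)                                      # dict-like record, or None if never stored
#   create_page(url, visit_time, raw_content, extracted)    # returns truthy on success
#   update_page(url, visit_time, raw_content, extracted)    # returns truthy on success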
def parse_content(self, url, raw_content):
"""Parses data that was read from either a file or URL."""
# Let the user know what's going on.
self.verbose_print("Parsing " + url + "...")
# Parse the page.
soup = BeautifulSoup(raw_content, 'html5lib')
# Let the website object extract whatever information it wants from the page.
extracted_content = None
for website_obj in self.website_objs:
extracted_content = website_obj.parse(url, soup)
# Harvest any new URLs.
urls_to_crawl = []
for a in soup.find_all('a', href=True):
urls_to_crawl.append(a['href'])
urls_to_crawl = list(dict.fromkeys(urls_to_crawl)) # Remove duplicates
return extracted_content, urls_to_crawl
def visit_new_urls(self, parent_url, urls_to_crawl, current_depth):
"""Visits URLs that we haven't visited yet."""
# Crawl all new URLs.
for new_url in urls_to_crawl:
# If the crawling has been cancelled.
if self.running is False:
return
# Do we need to throttle ourselves?
if self.rate_secs is not None and time.time() - self.last_crawl_time < self.rate_secs:
self.verbose_print("Sleeping for " + str(self.rate_secs) + " second(s).")
time.sleep(self.rate_secs)
# Crawl the URL.
self.crawl_url(parent_url, new_url, current_depth + 1)
def crawl_file(self, file_name):
"""Starts crawling from a file."""
# Open the file.
with open(file_name, 'r') as f:
# Read the entire contents of the file.
content = f.read()
# Crawl the content.
extracted_content, urls_to_crawl = self.parse_content("", content)
# Visit the fresh URLs.
self.visit_new_urls("", urls_to_crawl, 0) # No parent URL, since the content came from a local file.
def crawl_url(self, parent_url, child_url, current_depth):
"""Crawls, starting at the given URL, up to the maximum depth."""
# If we've exceeded the maximum depth.
if self.max_depth is not None and current_depth >= self.max_depth:
self.verbose_print("Maximum crawl depth exceeded.")
return False
# Canonicalize the URL.
url = urljoin(parent_url, child_url)
url = url_normalize(url)
# Drop any fragment identifier.
parts = url.split('#')
url = parts[0]
# Is this URL from the seed website? Do we care?
if not self.crawl_other_websites:
root_url = get_url_root(url)
if root_url != self.seed_url:
self.verbose_print("Skipping " + url + " because the settings do not allow us to crawl links outside of the seed location.")
return False
# If this URL has given us problems then skip it.
if url in self.error_urls:
self.verbose_print("Skipping " + url + " because it has given us problems.")
return False
# Only proceed if we have a module that can parse this URL (though proceed if we don't have any modules loaded).
# Also, if we have a module that can parse it, see if it has any cookies it wants to add to the request.
cookies = None
if len(self.website_objs) > 0:
interesting = False
for website_obj in self.website_objs:
interesting = website_obj.is_interesting_url(url)
if interesting:
cookies = website_obj.make_cookies(url)
break
if not interesting:
self.verbose_print("Skipping " + url + " because there are no modules to parse it.")
return False
# If we've been here before and it was within our revisit window then just skip.
# Don't bother doing this check for the first URL, since it'll be the one the user told us to crawl.
if current_depth > 0 and self.db and self.min_revisit_secs and self.min_revisit_secs > 0:
# Get the database record corresponding to this URL.
page_from_db = self.db.retrieve_page(url)
if page_from_db and Keys.LAST_VISIT_TIME_KEY in page_from_db:
# How long since we were last here?
now = time.time()
last_visited_diff = now - page_from_db[Keys.LAST_VISIT_TIME_KEY]
if last_visited_diff < self.min_revisit_secs:
last_visited_units = "second"
if last_visited_diff >= 86400:
last_visited_diff = last_visited_diff / 86400
last_visited_units = "day"
elif last_visited_diff >= 3600:
last_visited_diff = last_visited_diff / 3600
last_visited_units = "hour"
elif last_visited_diff >= 60:
last_visited_diff = last_visited_diff / 60
last_visited_units = "minute"
self.verbose_print("Skipping " + url + " because we visited it " + str(last_visited_diff) + " " + last_visited_units + "(s) ago.")
return False
try:
# Download the page from the URL.
self.verbose_print("Requesting data from " + url + "...")
response = requests.get(url, cookies=cookies, headers={'User-Agent': 'Mozilla/5.0'})
# If downloaded....
if response.status_code == 200:
# Process the content. Anything the parsing module wants stored will be returned in the blob.
extracted_content, urls_to_crawl = self.parse_content(url, response.content)
# Note that we visited this webpage.
self.create_or_update_database(url, response.content, extracted_content)
# Make a note of the time.
self.last_crawl_time = time.time()
# Visit the fresh URLs.
self.visit_new_urls(url, urls_to_crawl, current_depth)
return True
# Nothing downloaded.
else:
# Make sure we don't go here again.
self.error_urls.append(url)
# Print an error.
self.log_error("ERROR: Received HTTP Code " + str(response.status_code) + ".")
except Exception:
# Make sure we don't go here again.
self.error_urls.append(url)
# Log an error.
self.log_error(traceback.format_exc())
self.log_error(str(sys.exc_info()[0]))
self.log_error("ERROR: Exception requesting data.")
return False
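# A minimal sketch of driving the Crawler class directly rather than through main(). The
# URL and database address are hypothetical; the empty module list means every URL on the
# seed website is treated as interesting.
#
#   db = CrawlerDatabase.MongoDatabase()
#   db.connect("localhost:27017")
#   crawler = Crawler("example.com", 1, [], db, 3, 86400, False, True)
#   crawler.crawl_url("", "https://example.com/", 0)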
def main():
"""Entry point for the app."""
global g_crawler
# Parse command line options.
parser = argparse.ArgumentParser()
parser.add_argument("--file", default="", help="File to crawl.", required=False)
parser.add_argument("--url", default="", help="URL to crawl.", required=False)
parser.add_argument("--rate", type=int, default=1, help="Minimum number of seconds to wait between page requests.", required=False)
parser.add_argument("--max-depth", type=int, default=None, help="Maximum crawl depth.", required=False)
parser.add_argument("--min-revisit-secs", type=int, default=86400, help="Minimum number of seconds before allowing a URL to be revisited.", required=False)
parser.add_argument("--website-modules", default="", help="Comma-separated list of Python modules that implement website-specific logic.", required=False)
parser.add_argument("--mongodb-addr", default="localhost:27017", help="Address of the mongo database.", required=False)
parser.add_argument("--crawl-other-websites", action="store_true", default=False, help="If set, allows the crawler to follow links outside of the seed URL's website.", required=False)
parser.add_argument("--verbose", action="store_true", default=False, help="Enables verbose output.", required=False)
try:
args = parser.parse_args()
except IOError as e:
parser.error(e)
sys.exit(1)
# Print the settings.
print("Using the following settings:")
var_args = vars(args)
for arg_name in var_args:
print("* " + arg_name + ": " + str(var_args[arg_name]))
print("")
# Sanity check.
if len(args.file) == 0 and len(args.url) == 0:
print("Neither a file nor a URL to crawl was specified.")
parser.print_help(sys.stderr)
sys.exit(1)
# Instantiate the object that connects to the database.
db = None
if args.mongodb_addr is not None:
db = CrawlerDatabase.MongoDatabase()
db.connect(args.mongodb_addr)
# Instantiate the objects that implement website-specific logic.
website_objs = []
if len(args.website_modules) > 0:
website_module_names = args.website_modules.split(',')
for website_module_name in website_module_names:
website_obj = create_website_object(website_module_name)
if website_obj is not None: # Skip modules that failed to load.
website_objs.append(website_obj)
seed_url = ""
if len(args.url) > 0:
seed_url = get_url_root(args.url)
# Instantiate the object that does the crawling.
g_crawler = Crawler(seed_url, args.rate, website_objs, db, args.max_depth, args.min_revisit_secs, args.crawl_other_websites, args.verbose)
# Register the signal handler.
signal.signal(signal.SIGINT, signal_handler)
# Configure the error logger.
logging.basicConfig(filename=ERROR_LOG, filemode='w', level=logging.DEBUG, format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
# Crawl a file.
if len(args.file) > 0:
g_crawler.crawl_file(args.file)
# Crawl a URL.
if len(args.url) > 0:
g_crawler.crawl_url("", args.url, 0)
if __name__ == "__main__":
main()
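# Example invocation. ExampleSite.py is a hypothetical website module (see the interface
# sketch near create_website_object()); the other values are just illustrative.
#
#   python Crawler.py --url https://www.example.com --rate 2 --max-depth 3 \
#       --website-modules ExampleSite.py --mongodb-addr localhost:27017 --verbose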