-
Notifications
You must be signed in to change notification settings - Fork 59
/
Copy pathsite_downloader.py
31 lines (25 loc) · 969 Bytes
/
site_downloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import requests
from bs4 import BeautifulSoup
from requests import RequestException, HTTPError, ConnectionError, URLRequired, TooManyRedirects
def download(url, tries=3, timeout=10):
    """
    Download a page with requests and return it parsed by BeautifulSoup.

    The request is sent with a desktop Firefox User-Agent. If the request
    fails (connection problem, timeout, too many redirects, or an HTTP
    4xx/5xx status), the error is printed and the download is retried.

    Args:
        url: Address of the page to fetch.
        tries: Number of retries allowed after the first attempt fails, so
            at most ``tries + 1`` requests are sent.
        timeout: Seconds to wait for the server before the attempt is
            treated as failed. Passed straight to ``requests.get``.

    Returns:
        A ``BeautifulSoup`` object built with the ``lxml`` parser, or
        ``None`` if every attempt failed.
    """
    # Start from the default requests headers and override the user agent.
    headers = requests.utils.default_headers()
    headers.update({
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
    })

    retries_left = tries
    while True:
        try:
            # Without a timeout a stalled server would block forever and
            # the retry logic below would never get a chance to run.
            site = requests.get(url, headers=headers, timeout=timeout)
            # requests does not raise on 4xx/5xx by itself; without this
            # call an error page would be parsed and returned as a success.
            site.raise_for_status()
        except RequestException as e:
            # RequestException is the base class of HTTPError,
            # ConnectionError, URLRequired, TooManyRedirects and Timeout.
            print('Download error: {}'.format(e))
            if retries_left <= 0:
                return None
            retries_left -= 1
        else:
            # Parsing is kept outside the try block: it is not a network
            # failure and must not be retried or silenced.
            return BeautifulSoup(site.text, 'lxml')