# Broken link detector
#
# Traverses website and generates list of broken links and broken images
#
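# Typical invocation (assuming Python 3 with the requests and beautifulsoup4
# packages installed; the base url argument is optional and falls back to the
# hard-coded example domain below):
#
#     python daddy.py https://www.example.com/
#
# Progress is logged to crawl.log; results are written to bad_links.log and
# bad_images.log.
#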
import logging, re, sys, time
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import requests


def main():
    url_stack = []
    visited_urls = set()
    bad_urls = set()
    bad_images = set()

    # This base_url should be the full url to the base domain
    if len(sys.argv) < 2:
        base_url = "http://www.example.com/"
        print('Using hard-coded domain: %s' % base_url)
    else:
        base_url = sys.argv[1]
        validate_url(base_url)
    url_stack.append(base_url)

    # Set up logger
    FORMAT = "%(asctime)-15s %(message)s"
    logging.basicConfig(format=FORMAT, filename='crawl.log', filemode='w', level=logging.DEBUG)

    # Crawl through the site, grab links/images
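    # The stack makes this a depth-first walk; only URLs containing base_url
    # are actually requested, so off-site links are skipped.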
    while True:
        url = url_stack.pop()
        # Only visit link if it's on our site and we haven't visited it before
        if base_url in url and url not in visited_urls:
            resp = requests.get(url)
            if resp.status_code == requests.codes.ok:
                # Check for bad images on page (srcs are resolved against the page URL)
                for image in parse_for_images(resp):
                    image_url = urljoin(url, image)
                    if not check_img(image_url):
                        bad_images.add(image_url)
                        logging.info('BAD IMAGE: %s', image_url)
                # Grab links on page
                found_urls = parse_for_links(resp)
                for found_url in found_urls:
                    # Resolve relative hrefs, then only push onto the stack if
                    # we haven't visited the link before
                    found_url = urljoin(url, found_url)
                    if found_url not in visited_urls:
                        url_stack.append(found_url)
            else:
                # If we get here, the status code wasn't ok => link is broken
                bad_urls.add(url)
                logging.info('BROKEN LINK: %s returns %s', url, resp.status_code)
            visited_urls.add(url)
            # This is to be nice to the server -- adjust as needed
            time.sleep(.5)
        # This is the closest thing Python has to a do-while
        if len(url_stack) == 0:
            break
    # Report all the broken links found:
    f = open('bad_links.log', 'w')
    f.write(str(len(bad_urls)) + '\n')  # First line is the count
    for url in bad_urls:
        f.write(url + '\n')
    f.close()

    # Report all the bad images found:
    f = open('bad_images.log', 'w')
    f.write(str(len(bad_images)) + '\n')  # First line is the count
    for image in bad_images:
        f.write(image + '\n')
    f.close()

    sys.exit()


def parse_for_links(resp):
    """Return a list of hyperlinks found in the response. An empty list is returned if no links are found."""
    links = []
    parser = BeautifulSoup(resp.text, 'html.parser')
    # Find anchor tags and parse for href
    for link in parser.find_all('a'):
        href = link.get('href')
        if href is None:
            print("Weird formatting", link)
        else:
            links.append(href)
    return links


def parse_for_images(resp):
    """Return a list of image sources found in the response. An empty list is returned if no images are found."""
    images = []
    parser = BeautifulSoup(resp.text, 'html.parser')
    # Find img tags and parse for src
    for image in parser.find_all('img'):
        src = image.get('src')
        if src is None:
            print("Weird formatting", image)
        else:
            images.append(src)
    return images


def check_img(url):
    """Return True if the URL serves an image, judged by its Content-Type header."""
    resp = requests.get(url)
    return 'image' in resp.headers.get('content-type', '')


def validate_url(url):
    # This is a so-so regular expression used to validate the user-supplied url.
    # We just need the url to begin with 'http://' or 'https://';
    # this could be improved later for a deeper check.
    if re.match('https?://.*', url) is not None:
        return True
    else:
        sys.exit("Invalid base url: %s" % url)


if __name__ == "__main__":
    main()