expander.py
# -*- coding: utf-8 -*-
"""
Batch URL expander
Note: to handle messy data quickly, only known short URLs are expanded
(c) Kevin Driscoll, 2014
"""
from socket import error as SocketError
import fileinput
import multiprocessing
import re
import urllib
import urllib2
# Constants
USER_AGENT = u'shortURL lengthener/0.1 +http://kevindriscoll.info/'
# HTTP Error codes
HTTP_REDIRECT_CODES = [
    301,   # Moved Permanently
    302,   # Found, Moved temporarily
    303,   # See other, Moved
    307,   # Temporary redirect
    '301', # Moved Permanently
    '302', # Found, Moved temporarily
    '303', # See other, Moved
    '307'  # Temporary redirect
]
# HTTP Timeout (in seconds)
# For more info on socket timeout:
# http://www.voidspace.org.uk/python/articles/urllib2.shtml#sockets-and-layers
HTTP_TIMEOUT = 5 # 60
HTTP_MAX_REDIRECTS = 13
class LazyHTTPRedirectHandler(urllib2.HTTPRedirectHandler):
    def redirect_request(self, req, fp, code, msg, header, newurl):
        """On redirect, raise the HTTPError and die
        """
        return None
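# Returning None here means urllib2 does not follow the redirect: its default
# error handler raises HTTPError for the 3xx response instead, and lengthen()
# catches that error to read the Location header and record each hop itself.
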
# URL utilities
SHORT_DOMAINS = []
fn = "short_domains.txt"
with open(fn, "r") as f:
    for line in f:
        SHORT_DOMAINS.append(line.strip())
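
# short_domains.txt is expected to list one bare domain per line. A
# hypothetical example of its contents (these entries are illustrative,
# not the project's actual list):
#
#   bit.ly
#   t.co
#   goo.gl
#   ow.ly
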
def extract_domain(u):
    """Attempt to extract a domain name from u
    Returns string or None
    """
    domain_re = r'^(www\.)?([^/]*)/?.*'
    if u.startswith('http'):
        brief = u.split('://', 1)[-1]
    else:
        brief = u
    m = re.search(domain_re, brief)
    if m:
        return m.group(2)
    else:
        return None
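
# Illustrative examples (not part of the original source) of what
# extract_domain returns:
#
#   extract_domain('http://www.example.com/path?q=1')  # -> 'example.com'
#   extract_domain('bit.ly/abc123')                     # -> 'bit.ly'
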
def is_short_url(u):
    """Compare domain name against list of known short domains
    Returns True or False
    """
    domain = extract_domain(u)
    return (domain in SHORT_DOMAINS)

# Lengthening functions
def lengthen(u):
    """Return a list of all URLs ("hops")
    between u and its ultimate location
    """
    # For description of error handling, see:
    # http://www.voidspace.org.uk/python/articles/urllib2.shtml#httperror
    # Create URL opener that doesn't auto follow redirs
    opener = urllib2.build_opener(LazyHTTPRedirectHandler)
    # Create list of URLs
    hops = [u]
    # Set nexturl to the first URL
    nexturl = u
    # Follow all redirects, adding URLs to hops
    while nexturl and (len(hops) < HTTP_MAX_REDIRECTS):
        request = urllib2.Request(nexturl)
        request.add_header('User-agent', USER_AGENT)
        try:
            r = opener.open(request, timeout=HTTP_TIMEOUT)
        except urllib2.HTTPError as err:
            if err.code in HTTP_REDIRECT_CODES:
                if u'location' in err.headers.keys():
                    loc = err.headers[u'location']
                    # Check for relative URL
                    if not loc[:4] == 'http':
                        nexturl = urllib.basejoin(err.geturl(), loc)
                    else:
                        nexturl = loc
                else:
                    nexturl = None
            else:
                nexturl = None
        except urllib2.URLError as err:
            # Server not found, etc.
            nexturl = None
        except ValueError:
            # Most likely an invalid URL
            nexturl = None
        except urllib2.httplib.BadStatusLine as err:
            # The server sent an unfamiliar status code
            # Not caught by urllib2, see:
            # http://bugs.python.org/issue8823
            print err
            nexturl = None
        except urllib2.httplib.InvalidURL as err:
            # Usually happens when there is a colon
            # but no port number
            print err
            nexturl = None
        except SocketError as err:
            print err
            nexturl = None
        else:
            # Ultimate destination reached
            nexturl = None
        # Append the result to the hops list
        # None represents the end of the chain
        if nexturl:
            hops.append(nexturl)
    return hops
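
# A hypothetical example of the chain lengthen() builds (the URLs below are
# made up for illustration); a single redirect would produce:
#
#   lengthen('http://bit.ly/xyz')
#   # -> ['http://bit.ly/xyz', 'http://example.com/landing-page']
#
# The short URL is always hops[0] and the final destination is hops[-1].
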
def multilengthen(q):
    """Lengthen a list of short URLs in parallel
    Yields lists with chains of URL "hops"
    Note: output order will not match input order
    """
    pool = multiprocessing.Pool()
    for urlchain in pool.imap_unordered(lengthen, q, 1000):
        yield urlchain
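
# A usage sketch with made-up URLs; multilengthen only assumes an iterable of
# URL strings:
#
#   for chain in multilengthen(['http://bit.ly/a', 'http://t.co/b']):
#       print chain[0], '->', chain[-1]
#
# imap_unordered yields each chain as soon as a worker finishes it, which is
# why output order differs from input order; the chunksize of 1000 reduces
# inter-process overhead when the input list is large.
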
if __name__ == "__main__":
    # Read list of short URLs
    shorturls = []
    for line in fileinput.input():
        url = line.strip()
        if is_short_url(url):
            shorturls.append(url)
    # Expand short URLs in parallel
    # Print short-long pairs as they arrive
    for urlchain in multilengthen(shorturls):
        output = urlchain[0]
        output += '\t'
        output += urlchain[-1]
        print output
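
# Command-line usage, as a sketch (the filename is only an example; fileinput
# also reads from stdin when no files are given):
#
#   python expander.py urls.txt > expanded.tsv
#   cat urls.txt | python expander.py > expanded.tsv
#
# Each output line is the original short URL and its final destination,
# separated by a tab.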