-
Notifications
You must be signed in to change notification settings - Fork 0
/
wgety.py
194 lines (144 loc) · 5.78 KB
/
wgety.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import codecs
import os
import re
import time
try: # python 3
from urllib.parse import urlparse
from http.client import HTTPConnection
except ImportError:
from httplib import HTTPConnection
from urlparse import urlparse
try:
import argparse
except ImportError:
from optparse import OptionParser
# Regular expressions used to find ``src``/``href`` attributes, grouped by the
# kind of link they carry.  Every pattern exposes the same three groups:
#   1 - attribute name (``src`` or ``href``)
#   2 - opening quote character
#   3 - link target (with the leading ``../`` or ``./`` already stripped for
#       the PARENT patterns and the first two CURRENT patterns)
# Each kind has a variant for an attribute preceded by a space and a variant
# for an attribute at the very start of the line.
regex_map = {
    # Links that start with "/".
    'ROOT': [
        r' (src|href)=(["\'])(/[^"\']*)',
        r'^(src|href)=(["\'])(/[^"\']*)',
    ],
    # Links that start with "../".
    'PARENT': [
        r' (src|href)=(["\'])\.\./([^"\']*)',
        r'^(src|href)=(["\'])\.\./([^"\']*)',
    ],
    # Links that start with "./", followed by the catch-all patterns for any
    # remaining link.  The trailing commas matter: without them adjacent
    # string literals are concatenated into a single, useless pattern.
    'CURRENT': [
        r' (src|href)=(["\'])\./([^"\']*)',
        r'^(src|href)=(["\'])\./([^"\']*)',
        r' (src|href)=(["\'])([^"\']*)',
        r'^(src|href)=(["\'])([^"\']*)',
    ],
}

# Link prefixes that the catch-all CURRENT patterns must leave untouched.
except_startswith_links = ['#', 'http', 'mailto', 'javascript']
class FileProgress(object):
    """Write downloaded chunks to a file and report progress on stdout.

    ``total`` is the expected size in bytes, normally the raw value of the
    ``Content-Length`` response header.  When it is ``None`` or cannot be
    parsed as a number, progress is shown as a byte count instead of a
    percentage.
    """

    def __init__(self, total):
        try:
            self.total = float(total) if total is not None else 0
        except (TypeError, ValueError):
            # Malformed header: fall back to the byte counter.
            self.total = 0

    def open(self, filename, mode):
        """Open ``filename`` with ``mode`` and return the file object."""
        return open(filename, mode)

    def write(self, fo, contents):
        """Write ``contents`` to ``fo`` and refresh the progress display.

        The current position of ``fo`` (``fo.tell()``) is used as the number
        of bytes downloaded so far.
        """
        fo.write(contents)
        done = float(fo.tell())
        if self.total > 0:  # download percent
            sys.stdout.write('\r%d%%' % int(done / self.total * 100))
        else:  # no usable content-length, show bytes downloaded
            sys.stdout.write('\r%d bytes' % int(done))
        sys.stdout.flush()
class Wgety(object):
    """Small wget-like downloader that can rewrite HTML links as absolute.

    ``execute`` downloads a URL into a temporary dot-file and then either
    renames it to the final name or, for HTML, copies it line by line while
    turning relative ``src``/``href`` links into absolute ones.
    """

    BUFFER_SIZE = 512  # bytes requested from the response per read
    USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36'

    def _wgety(self, url, filename):
        """Download ``url`` and store the response body in ``filename``.

        ``https`` URLs use an ``HTTPSConnection``; everything else uses
        ``HTTPConnection``.  A failure while the body is being read is
        reported on stderr and whatever was received so far is kept.  The
        file and the connection are always closed.
        """
        parsed = urlparse(url)
        if parsed.scheme == 'https':
            try:  # python 3
                from http.client import HTTPSConnection as connection_class
            except ImportError:
                from httplib import HTTPSConnection as connection_class
        else:
            connection_class = HTTPConnection
        http = connection_class(parsed.netloc)
        try:
            http.request('GET', url, headers={'User-Agent': self.USER_AGENT})
            response = http.getresponse()
            fp = FileProgress(response.getheader('Content-Length'))
            f = fp.open(filename, 'wb')
            try:
                data = response.read(self.BUFFER_SIZE)
                while data:
                    fp.write(f, data)
                    data = response.read(self.BUFFER_SIZE)
            except Exception as err:
                # Best effort: keep the partial download, but say why it
                # stopped.  KeyboardInterrupt is deliberately not caught.
                sys.stderr.write('\nDownload interrupted: %s\n' % err)
            finally:
                f.close()
            sys.stdout.write('\r')
        finally:
            http.close()

    def _get_absolute_link(self, url, line):
        """Return ``line`` with its relative links made absolute against ``url``.

        The URL is split into ``[scheme://host, segment, ..., last segment]``
        and handed to ``_find_replace`` together with the module-level
        ``regex_map``.
        """
        url_parsed = urlparse(url)
        host = url_parsed.scheme + '://' + url_parsed.netloc
        t_url = [host] + url_parsed.path[1:].split('/')
        return self._find_replace(line, t_url, regex_map)

    def _find_replace(self, line, url_tokens, regex_map):
        """Apply every pattern in ``regex_map`` to ``line``.

        ``url_tokens[0]`` is the ``scheme://host`` part of the page URL and
        the remaining items are its path segments.  ``regex_map`` maps the
        keys ``'ROOT'``, ``'PARENT'`` and ``'CURRENT'`` to lists of patterns
        whose groups are (attribute, quote, target); other keys are ignored.
        Matches are rewritten as `` attr=<quote><base>/<target>`` where the
        base is the host (ROOT), the page's parent directory (PARENT) or the
        page's own directory (CURRENT).  CURRENT matches whose target starts
        with one of ``except_startswith_links`` are left as they are.
        """
        def _base(levels):
            # Drop ``levels`` trailing tokens but never the host itself, so
            # a "../" on a top-level page still yields an absolute URL.
            keep = max(1, len(url_tokens) - levels)
            return '/'.join(url_tokens[:keep])

        def _rebuild(obj, base):
            path = obj.group(3)
            if not path.startswith('/'):
                path = '/' + path
            return ' ' + obj.group(1) + '=' + obj.group(2) + base + path

        def _root(obj):
            return _rebuild(obj, url_tokens[0])

        def _parent(obj):
            # Remove the page name and one directory.
            return _rebuild(obj, _base(2))

        def _current(obj):
            if obj.group(3).startswith(tuple(except_startswith_links)):
                return obj.group(0)
            # Remove only the page name.
            return _rebuild(obj, _base(1))

        handlers = {'ROOT': _root, 'PARENT': _parent, 'CURRENT': _current}
        for kind in regex_map:
            handler = handlers.get(kind)
            if handler is None:
                continue
            for exp in regex_map[kind]:
                line = re.sub(exp, handler, line)
        return line

    def _compile(self, url, src_filename, dst_filename, absolute_link=True):
        """Turn the downloaded ``src_filename`` into ``dst_filename``.

        An existing ``dst_filename`` is removed first.  With
        ``absolute_link`` false the source is simply renamed.  Otherwise the
        source is read as UTF-8, each line is stripped and passed through
        ``_get_absolute_link``, non-empty results are written followed by a
        newline, and the source file is deleted afterwards.
        """
        if os.path.exists(dst_filename):
            os.remove(dst_filename)
        if not absolute_link:
            os.rename(src_filename, dst_filename)
            return
        # ``with`` guarantees both handles are closed even if a line fails.
        with codecs.open(src_filename, 'rb', 'utf-8') as src:
            with codecs.open(dst_filename, 'wb', 'utf-8') as dst:
                for raw in src:
                    converted = self._get_absolute_link(url, raw.strip())
                    if converted:
                        dst.write(converted + '\n')
        os.remove(src_filename)

    def execute(self, url, filename=None, absolute_link=None):
        """Download ``url`` to ``filename`` and optionally rewrite its links.

        ``filename`` defaults to the text after the last ``/`` of ``url``,
        or ``index.html`` when that is empty.  ``absolute_link`` of ``None``
        means "decide from the extension": links are rewritten only for
        ``.html``/``.htm`` files.
        """
        if filename is None:
            # A URL ending in "/" has no last segment to use as a name.
            filename = url.split('/')[-1] or 'index.html'
        # Hidden temp file next to the target, also when filename has a
        # directory part.
        temp_filename = os.path.join(os.path.dirname(filename),
                                     '.' + os.path.basename(filename))
        # Test for the full scheme so hosts such as "httpbin.org" still get
        # one prepended.
        if not url.startswith(('http://', 'https://')):
            url = 'http://' + url
        print('Getting... ' + url)
        self._wgety(url, temp_filename)
        if absolute_link is None:  # get absolute_link option
            extension = os.path.splitext(filename)[1].lower()
            absolute_link = extension in ('.html', '.htm')
        print('Compiling... ' + filename)
        self._compile(url, temp_filename, filename, absolute_link=absolute_link)
        print('Done.')
if __name__ == '__main__':
    # Command-line entry point: parse the arguments and run one download.
    parser = argparse.ArgumentParser(description='wgety for Python')
    parser.add_argument('url', metavar='url', nargs=1, help='Download target url')
    parser.add_argument('filename', metavar='filename', nargs='?', default=None, help='This option will save to the local filename.')
    parser.add_argument('-a', '--absolute', action='store_true', help='Change to absolute link.')
    # parse_args() exits with a usage message when the required url is
    # missing, so no separate "no arguments" branch is needed.
    args = parser.parse_args()
    # NOTE(review): args.absolute is False (never None) when -a is omitted,
    # so execute()'s extension-based default for absolute_link=None is not
    # used from the command line - confirm this is intended.
    wgety = Wgety()
    # Take the URL from the parsed positional; sys.argv[1] is the flag
    # itself when options come first (e.g. "wgety.py -a URL").  nargs=1
    # makes args.url a one-element list.
    wgety.execute(url=args.url[0], filename=args.filename, absolute_link=args.absolute)