#################################################
'''
ocho_scraper.py checks a URL and downloads the files listed there. If run
multiple times, it grabs new files added to the directory without
redownloading everything. To download a newer version of an individual file,
delete its folder and run the script again.
The folders are created in the directory where you run this script.
If you want to delete all content and redownload to get the newest files, use
the -a flag: python3 ocho_scraper.py -a
This script also creates an index.html which helps you find what you are
looking for in all of the course materials.
A log of operations can be found in this directory. It is called .ocho_log
DEPENDENCIES:
1. ocho_indexer.py
2. ocho_progress_bar.py
USE:
1. Run script:
    python3 ocho_scraper.py
2. PROFIT $$$$$:
COHORT 8 FOREVER!
'''
#################################################
import urllib.error
from urllib import request
import os
import sys
import shutil
import datetime
from ocho_indexer import index
import ocho_progress_bar
import secret
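# secret.py is not part of this repo; judging by the bottom of this file, it
# is expected to define url1 and url2 (the course URLs, kept out of version
# control).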
def find_and_keep(list_to_edit, key_to_find):
    '''
    This function looks through the html page and returns just the links
    '''
    temp_list = []
    for i in list_to_edit:
        if key_to_find in i and ">../<" not in i:
            temp_list.append(i)
    return temp_list
def split_and_append(list_to_edit, split_at, keep_index):
    '''
    Lots of work has to be done to the html to grab just the URL.
    This is used as a callback to help cut stuff out.
    keep_index is the index you want to keep after splitting the value.
    '''
    temp_list = []
    for i in list_to_edit:
        temp_list.append(i.split(split_at)[keep_index])
    return temp_list
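# Example (illustrative):
#
#     split_and_append(['a="b"', 'c="d"'], '="', 1)
#     # -> ['b"', 'd"']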
def sanitize_input(response):
    '''
    This function groups all of the necessary steps to clean the links
    '''
    response_list = response.split()
    response_list = find_and_keep(response_list, "</a>")
    response_list = split_and_append(response_list, '="', 1)
    response_list = split_and_append(response_list, '/"', 0)
    response_list = split_and_append(response_list, '">', 0)
    return response_list
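# A minimal sketch of what sanitize_input does, assuming an Apache-style
# directory listing (the markup of the real course pages may differ):
#
#     sanitize_input('<a href="lecture_01/">lecture_01/</a>')
#     # -> ['lecture_01']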
def dl_and_search(url, local_path, filename):
    '''
    Download and search the html file for more links containing ".html"
    if other links are found, start another dl_and_search with new URL
    '''
    # grab the index page of URL
    local_file = os.path.join(local_path, filename)
    try:
        request.urlretrieve(url + filename, filename=local_file)
    except urllib.error.HTTPError:
        append_log(url + filename + " not found, skipping")
        return
    append_log('created ' + filename + ' from ' + url + filename)
    # iterate through lines of the downloaded file
    with open(local_file, 'r') as file:
        for line in file:
            line = line.strip('\n').strip(' ')
            # find line with .html link and get new_filename
            if '.html">' in line:
                new_filename = line.split('href="')[1].split('">')[0]
                # run a new dl_and_search on the linked file
                dl_and_search(url, local_path, new_filename)
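# Example call (illustrative, with a hypothetical URL): starting from a
# solution's index page, this walks every linked .html file it can reach:
#
#     dl_and_search('https://example.com/lec01/solution/',
#                   'exercises/lec01/solution', 'index.html')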
def check_and_create_files(URL, TOPDIR, response_list):
    '''
    This function creates the files and directories and skips over files in
    the list if you already have a folder for them. If you want to update that
    file, you can just delete that directory and rerun the script.
    '''
    progress_tick = 0
    progress_total = len(response_list)
    if URL.split('/')[-2] == 'lectures':
        bar_name = '[LECTURES] SCRAPE PROGRESS'
    else:
        bar_name = '[EXERCISES] SCRAPE PROGRESS'
    for lecture in response_list:
        ocho_progress_bar.progress_bar(
            bar_name, bar, progress_total, progress_tick)
        # variables used for file paths
        html_dir = TOPDIR + "/" + lecture
        html_path = TOPDIR + "/" + lecture + "/" + lecture + ".html"
        static_url = URL + lecture + "/_static/"
        static_dir = TOPDIR + "/" + lecture + "/_static/"
        images_url = URL + lecture + "/_images/"
        images_dir = TOPDIR + "/" + lecture + "/_images/"
        zip_url = URL + lecture
        zip_dir = TOPDIR + "/" + lecture
        # If the top-level folder doesn't exist, create it
        if not os.path.exists(TOPDIR):
            os.makedirs(TOPDIR)
        # HTML pages are created here
        if ".zip" not in lecture:
            if not os.path.exists(html_dir):
                # Make HTML directory
                append_log("\n" + html_dir + " doesn't exist.")
                lecture_dir = URL + lecture
                os.makedirs(html_dir)
                append_log("created " + html_dir)
                # Make HTML file
                request.urlretrieve(lecture_dir, filename=html_path)
                append_log("created " + html_path +
                           " from " + lecture_dir)
                # If working on exercises, create solution folder and grab all files
                if TOPDIR == 'exercises':
                    EX_TOPDIR = TOPDIR + "/" + lecture + "/solution"
                    DL_URL = lecture_dir + "/solution/"
                    os.makedirs(EX_TOPDIR)
                    append_log("created " + EX_TOPDIR)
                    dl_and_search(DL_URL, EX_TOPDIR, 'index.html')
                # Make static directory
                os.makedirs(static_dir)
                append_log("created " + static_dir)
                # Check to see if there are static items in the static directory.
                has_static = True
                try:
                    static_response = request.urlopen(static_url)
                except urllib.error.HTTPError:
                    append_log('There are no static files')
                    has_static = False
                # If there are static files, create them
                if has_static:
                    static_response = str(static_response.read())
                    static_response = sanitize_input(static_response)
                    for static_file in static_response:
                        request.urlretrieve(
                            static_url + static_file,
                            filename=static_dir + static_file)
                        append_log("created " + static_dir + static_file +
                                   " from " + static_url + static_file)
                # Make images directory
                os.makedirs(images_dir)
                append_log("created " + images_dir)
                # Check to see if there are images in the image directory.
                has_images = True
                try:
                    images_response = request.urlopen(images_url)
                except urllib.error.HTTPError:
                    append_log('There are no images')
                    has_images = False
                # If there are images, create them
                if has_images:
                    images_response = str(images_response.read())
                    images_response = sanitize_input(images_response)
                    for image in images_response:
                        request.urlretrieve(
                            images_url + image, filename=images_dir + image)
                        append_log("created " + images_dir + image +
                                   " from " + images_url + image)
            else:
                append_log("Skipping " + URL + lecture)
        else:
            # ZIP files are created here
            if not os.path.exists(zip_dir):
                append_log("\n" + zip_dir + " doesn't exist.")
                request.urlretrieve(zip_url, filename=zip_dir)
                append_log("created " + zip_dir + " from " + zip_url)
            else:
                append_log("Skipping " + URL + lecture)
        progress_tick += 1
        ocho_progress_bar.progress_bar(
            bar_name, bar, progress_total, progress_tick)
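# The layout produced above looks roughly like this (illustrative, for a
# hypothetical entry named 'lecture_01'):
#
#   lectures/
#       lecture_01/
#           lecture_01.html
#           _static/...
#           _images/...
#   exercises/
#       lecture_01/
#           lecture_01.html
#           solution/index.html ...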
def get_toc_data(contents):
    '''
    Extract the data from the html response and return a list
    '''
    contents = contents.split()
    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
              'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    contents_list = []
    for i, line in enumerate(contents):
        if 'href=' in line and '>../<' not in line and '.zip' not in line:
            # date: the token after the link looks like DD-Mon-YYYY
            day = contents[i + 1].split('-')[0]
            month = str(months.index(contents[i + 1].split('-')[1]) + 1).zfill(2)
            year = contents[i + 1].split('-')[2]
            # this is the date string that is used by the indexer later on.
            # in order to get it to sort correctly, the pattern must be
            # year, month, day. In the indexer, it is printed as
            # month, day, year.
            date = year + month + day
            # header
            header = line.split('"')[1].rstrip('/')
            # path
            path = line.split('"')[1] + header + '.html'
            contents_list.append((date, header, path))
    contents_list.sort()
    return contents_list
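# Example (illustrative): a listing entry 'lecture_01/' dated '01-Feb-2021'
# becomes the tuple ('20210201', 'lecture_01', 'lecture_01/lecture_01.html').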
def append_log(message='', start=False):
    '''
    Create a log of each operation and store in .ocho_log
    '''
    with open(".ocho_log", 'a+') as file:
        # if start is True, add the current date and time to the top of the entry
        if start:
            file.write('\n' + datetime.datetime.now().strftime('%B %d, %Y - %H:%M\n'))
            file.write('##################################################\n')
        file.write(message + '\n')
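# A .ocho_log entry then looks something like this (illustrative):
#
#   February 01, 2021 - 12:00
#   ##################################################
#   created lectures/lecture_01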
def scrape(URLS):
    '''
    This function starts the script with a list of urls. It iterates through the
    list and creates both folders and all of the elements inside of them. After
    that, it runs the indexer, which creates an index.html that helps you find
    what you are looking for in all of the course materials.
    '''
    # Check for -a flag:
    # -a removes all folders recursively that were previously created, and
    # creates all new files with a new scrape.
    # initial URLs check. If a URL errors, stop the entire script.
    for URL in URLS:
        try:
            # the response is discarded; this is only a reachability check
            request.urlopen(URL)
        except urllib.error.HTTPError:
            print(f'{URL} is creating an HTTP Error.\nStopping operation.\n')
            return
    # start log
    append_log(start=True)
    if len(sys.argv) > 2:
        # too many arguments given
        print('ERROR: Incorrect number of arguments.')
        print(sys.argv[0])
        print('Scrape files and skip over existing folders.')
        print(f'{sys.argv[0]} -a')
        print('Remove folders and rescrape all files.')
        return
    elif len(sys.argv) == 2:
        if sys.argv[1] != "-a":
            # one argument, but it isn't "-a"
            print('ERROR: Incorrect input.')
            print(sys.argv[0])
            print('Scrape files and skip over existing folders.')
            print(f'{sys.argv[0]} -a')
            print('Remove folders and rescrape all files.')
            return
        else:
            # -a given, so remove all previous folders recursively and scrape.
            # check the URLs one more time to make sure they work before deleting
            for URL in URLS:
                try:
                    request.urlopen(URL)
                except urllib.error.HTTPError:
                    print(f'{URL} is creating an HTTP Error.\n'
                          'Stopping operation.\nNO FILES DELETED! YA VELCOME.')
                    append_log(f'{URL} is creating an HTTP Error.\n'
                               'Stopping operation.\nNO FILES DELETED! YA VELCOME.')
                    return
            for folder in URLS:
                folder = folder.split('/')[-2]
                if os.path.exists(folder):
                    shutil.rmtree(folder)
                    print(f'Removed {folder} folder')
                    append_log(f'Removed {folder} folder')
    contents_list = []
    contents_list_ex = []
    # loop through URLs
    for URL in URLS:
        # top-level folder name taken from the URL path, e.g. 'lectures'
        TOPDIR = URL.split('/')[4]
        response_list = str(request.urlopen(URL).read())
        # store contents of request for table of contents
        contents = response_list
        if not contents_list:
            contents_list = get_toc_data(contents)
        else:
            contents_list_ex = get_toc_data(contents)
        response_list = sanitize_input(response_list)
        check_and_create_files(URL, TOPDIR, response_list)
    # create index.html
    index(contents_list, contents_list_ex)
    print('\ncreated index.html\n\nAll up to date!')
    append_log('\ncreated index.html\n\nAll up to date!')
#################################################
# Start scraper
if __name__ == '__main__':
    bar = ocho_progress_bar.create_bar()
    URLS = [secret.url1, secret.url2]
    scrape(URLS)