-
Notifications
You must be signed in to change notification settings - Fork 0
/
ImagesFromUrl.py
85 lines (63 loc) · 2.48 KB
/
ImagesFromUrl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import mimetypes
import re
import urllib2
import urllib
import urlparse
import os
def get_all_urls_from_http_string(data):
    """Return every http/https URL found in *data*, in order of appearance.

    data: text to scan, for example the HTML source of a page.

    Returns a list with one string per match; duplicates are kept. The
    number of matches is printed as a progress message.
    """
    # Raw string: the backslashes before the parentheses belong to the regex.
    # In a plain literal they are invalid string escapes, which newer Python
    # versions warn about.
    # NOTE: inside a character class "$-_" is a range ("$" through "_"), so
    # characters such as "<", ">", "'" and ";" also match and a URL can come
    # back with trailing markup attached.
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    http_strings = re.findall(url_pattern, data)
    print("found " + str(len(http_strings)) + " URL(s)")
    return http_strings
def is_image_url(url):
    """Return True when the MIME type guessed for *url* is an image type.

    url: URL string to classify.

    The decision relies only on what ``mimetypes.guess_type`` derives from
    the URL text; nothing is downloaded. A message is printed when no MIME
    type can be guessed.
    """
    # Presumably filters out matches that were captured together with a
    # trailing "</script>" tag -- confirm against the URL extraction regex.
    if url.endswith("script>"):
        return False

    mime_type = mimetypes.guess_type(url)[0]
    if mime_type is None:
        # guess_type reports an unknown type as None; test for that directly
        # instead of catching the AttributeError from None.startswith.
        print("Could not guess MIME type of " + url)
        return False
    return mime_type.startswith("image")
def get_all_image_urls(data):
    """Return the URLs in *data* that is_image_url() accepts.

    data: text to scan for URLs.

    The URLs come from get_all_urls_from_http_string(); the ones that pass
    is_image_url() are returned in their original order, duplicates
    included. Each accepted URL and the final count are printed.
    """
    accepted = []
    for candidate in get_all_urls_from_http_string(data):
        if not is_image_url(candidate):
            continue
        print("found " + candidate)
        accepted.append(candidate)
    print("Found " + str(len(accepted)) + " image URL(s)")
    return accepted
def read_url_as_string(url):
    """Fetch *url* and return the response body as a string.

    url: address to open; anything ``urlopen`` accepts.

    Returns the body as ``str``. When the body arrives as bytes (Python 3)
    it is decoded as UTF-8 with undecodable bytes replaced. Progress
    messages are printed before and after the read. Errors raised by
    ``urlopen`` or ``read`` propagate to the caller.
    """
    # urllib2 only exists on Python 2; Python 3 provides the same two names
    # in urllib.request. Import whichever is available so this function
    # works on either interpreter.
    try:
        from urllib2 import Request, urlopen
    except ImportError:
        from urllib.request import Request, urlopen

    print("Reading " + url + " as string")
    # Explicit User-Agent, presumably because some servers reject the
    # library's default one -- confirm if the header value matters.
    request = Request(url, headers={'User-Agent': "Magic Browser"})
    response = urlopen(request)
    try:
        url_string = response.read()
    finally:
        # Always release the connection, even when read() fails.
        response.close()

    if not isinstance(url_string, str):
        # Bytes on Python 3. Replacing undecodable bytes keeps the call from
        # failing on pages that are not valid UTF-8.
        url_string = url_string.decode("utf-8", "replace")
    print("Got string of length " + str(len(url_string)))
    return url_string
def save_images_from_url(url, save_path):
    """Download every image URL found in the page at *url* into *save_path*.

    url: address of the page to scan.
    save_path: destination directory; created (including missing parent
        directories) when at least one image URL is found.

    Each file is named after the last segment of the image URL's path;
    determine_savefile_name() supplies an alternative when that name is
    already taken. Returns None. Download errors propagate to the caller.
    """
    # urllib.urlretrieve and the urlparse module are Python 2 names; Python 3
    # provides both under urllib.request / urllib.parse.
    try:
        from urllib import urlretrieve
        from urlparse import urlsplit
    except ImportError:
        from urllib.parse import urlsplit
        from urllib.request import urlretrieve

    image_urls = get_all_image_urls(read_url_as_string(url))
    if not image_urls:
        return

    if not os.path.isdir(save_path):
        # makedirs also creates missing parents, which os.mkdir would not.
        os.makedirs(save_path)

    for image_url in image_urls:
        image_filename = urlsplit(image_url).path.split('/')[-1]
        if not image_filename:
            # A path ending in "/" has no file name; skip it rather than
            # asking urlretrieve to write to the directory itself.
            print("Skipping " + image_url + " (no file name in URL path)")
            continue
        target_name = ''.join(determine_savefile_name(save_path, image_filename))
        # os.path.join uses the platform's separator instead of a fixed "\\".
        urlretrieve(image_url, os.path.join(save_path, target_name))
def determine_savefile_name(save_path, image_filename):
    """Return a file name for *image_filename* that is unused in *save_path*.

    save_path: directory the file is going to be written to.
    image_filename: desired file name.

    When no file of that name exists in *save_path*, *image_filename* is
    returned unchanged. Otherwise "<stem>_<n><ext>" is returned for the
    smallest n >= 0 that does not match any existing directory entry.
    """
    # The original code passed the name through ''.join(); keep that so the
    # same inputs are accepted (a no-op for a plain string).
    name = ''.join(image_filename)

    # os.path.join picks the platform's separator. A hard-coded "\\" only
    # forms a real path on Windows, so elsewhere the existence check below
    # could never succeed.
    full_path = os.path.join(save_path, name)
    if not os.path.isfile(full_path):
        return image_filename

    root, extension = os.path.splitext(full_path)
    directory, stem = os.path.split(root)
    # List the same directory that was just checked; an empty directory part
    # means the current directory.
    taken = set(os.listdir(directory or os.curdir))

    index = 0
    candidate = "{}_{}{}".format(stem, index, extension)
    while candidate in taken:
        index += 1
        candidate = "{}_{}{}".format(stem, index, extension)
    return candidate
# Script entry. NOTE: these statements are at module level, so they run (and
# hit the network) whenever this file is executed or imported.
# Page whose source is scanned for image URLs.
url_to_save_images_from = "http://www.ebscoind.com"
# Destination directory for the downloads (a Windows-style path).
save_path = r"C:\temp\ebscoind"
# Download every image URL found on the page into save_path.
save_images_from_url(url_to_save_images_from, save_path)