-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgoogle_images.py
139 lines (83 loc) · 3.95 KB
/
google_images.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import requests
import base64
def get_images(search_query: str, starting_image_id: int, ending_image_id: int) -> "tuple[list, list]":
    '''
    Gets images from Google Images for a specified search query.

    :param search_query: The Google search query. It is passed as a URL parameter, so
                         spaces and reserved characters ("&", "#", "+", ...) are encoded
                         by requests rather than being spliced into the URL verbatim.
    :param starting_image_id: The id of the first image to fetch, inclusive. Ids follow the
                              order of the results: 0 is the first image, 1 the second, ...
    :param ending_image_id: The id of the last image to fetch, exclusive.
    :returns: A tuple of two parallel lists (images, file_types). "images" holds the decoded
              byte data of each image and "file_types" the corresponding file type. For every
              id that is not present in the response, both lists hold None at that position.
    :raises ConnectionError: If Google answers with an HTTP error status (4xx / 5xx)
    '''
    response = requests.get(
        "https://www.google.com/search",
        params={"q": search_query, "tbm": "isch"},
        headers=__get_headers(),
    )

    # A Response never compares equal to False; its "ok" flag is what reports
    # whether the status code signals an error.
    if not response.ok:
        raise ConnectionError("Could not recieve data from Google")

    # Decode the body once instead of on every loop iteration.
    page = response.text

    images, file_types = [], []
    for image_id in range(starting_image_id, ending_image_id):
        b64_str, file_type = __get_base64_image_string(page, image_id)

        # Unknown id: keep both lists aligned with placeholders.
        if b64_str is None:
            images.append(None)
            file_types.append(None)
            continue

        images.append(base64.b64decode(b64_str))
        file_types.append(file_type)

    return images, file_types
# --------------- HELPER FUNCTIONS --------------- #
def __get_headers() -> dict:
    '''
    Builds the HTTP headers that accompany the request to Google.

    :returns: A dict with a single "User-Agent" entry identifying the client as a
              desktop Chrome browser
    '''
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"

    headers = {}
    headers["User-Agent"] = user_agent
    return headers
def __get_base64_image_string(text: str, image_id: int) -> (str, str):
    '''
    Gets the base64 encoded string representing an image within "text".

    The image is looked up through the "_setImgSrc('<id>" call that carries its data. After
    that call the image type is read between "image\\/" and ";" and the payload between
    "base64," and the closing single quote, with backslashes removed.

    :param text: The body of text containing the image's string representation
    :param image_id: The id of the image within "text"
    :returns: A tuple, (base64_string, type), which contains the base64 encoded string
              (padded with "=" to a multiple of four characters) and the image data type.
              If no image with the specified id exists in "text", or its data is
              incomplete, the function returns (None, None)
    '''
    call_prefix = "_setImgSrc('" + str(image_id)
    type_marker = "image\\/"
    data_marker = "base64,"

    # Find the call for exactly this id. A plain prefix search for id 1 would also
    # match ids 10, 11, ..., so skip every hit whose id continues with another digit.
    call_index = text.find(call_prefix)
    while call_index != -1:
        id_end = call_index + len(call_prefix)
        if not text[id_end:id_end + 1].isdigit():
            break
        call_index = text.find(call_prefix, id_end)
    if call_index == -1:
        return None, None

    # Search from the call onwards without copying the remainder of the text.
    type_index = text.find(type_marker, call_index)
    data_index = text.find(data_marker, call_index)
    if type_index == -1 or data_index == -1:
        return None, None

    type_start = type_index + len(type_marker)
    data_start = data_index + len(data_marker)
    type_end = text.find(";", type_start)
    data_end = text.find("'", data_start)
    if type_end == -1 or data_end == -1:
        return None, None

    image_type = text[type_start:type_end]
    # Backslashes are not part of the payload, so they are dropped.
    base64_string = text[data_start:data_end].replace("\\", "")

    # Pad up to the next multiple of four; "len % 4" would add three "=" where one is needed.
    base64_string += "=" * (-len(base64_string) % 4)

    return base64_string, image_type
def __get_substring(string: str, starting_character_index: int, terminating_characters: list, ignore_characters: list = None) -> str:
    '''
    Gets the substring within "string" which begins at "starting_character_index" and ends
    just before the first terminating character.

    :param string: The string in which the desired substring resides
    :param starting_character_index: The index of the first character in the substring
    :param terminating_characters: The characters which end the substring. The first one
                                   encountered stops the scan and is not included. If none
                                   is found, the substring runs to the end of "string".
    :param ignore_characters: Characters to leave out of the substring. A character that is
                              also a terminating character still terminates. Defaults to
                              ignoring nothing.
    :returns: The substring
    '''
    # None instead of a mutable list default, which would be shared between calls.
    ignored = () if ignore_characters is None else ignore_characters

    kept = []
    # Bounding the scan by the string length avoids an IndexError when no
    # terminating character follows the starting index.
    for index in range(starting_character_index, len(string)):
        curr_char = string[index]
        if curr_char in terminating_characters:
            break
        if curr_char in ignored:
            continue
        kept.append(curr_char)

    return "".join(kept)