scrape_google_images.py
import requests
import os
import io
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
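
# Overview: open Google Images with Selenium, click each thumbnail to reveal the
# full-resolution URL, and download the results with requests + Pillow. The CSS
# selectors and consent-page XPath below match Google's markup at the time of
# writing; Google changes these regularly, so they may need updating.
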
def get_driver():
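    """Create a Chrome WebDriver, letting webdriver_manager download a matching chromedriver if needed."""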
    # Selenium 4: obtain the web driver via webdriver_manager.
    # The excludeSwitches option below suppresses this (harmless) log noise:
    #   ERROR:device_event_log_impl.cc(215)] [17:16:46.922]
    #   USB: usb_device_handle_win.cc:1046 Failed to read descriptor from node connection:
    #   A device attached to the system is not functioning. (0x1F)
    options = Options()
    options.add_argument("start-maximized")
    # options must be set before the driver is created, otherwise they have no effect
    options.add_experimental_option('excludeSwitches', ['enable-logging'])
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    driver.get("https://www.google.com")
    return driver
def get_image_urls(searchterm:str, driver, max_links_to_fetch:int, sleep_between_interactions:float):
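    """Search Google Images for `searchterm` and collect up to `max_links_to_fetch`
    full-resolution image URLs, sleeping `sleep_between_interactions` seconds
    between interactions so results have time to load."""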
    # build the image-search URL and load it
    google_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img".format(q=searchterm)
    driver.get(google_url)
    # dismiss "Accept all" on the "Before you continue to Google" consent page;
    # the dialog does not always appear (e.g. with stored cookies), so failures are ignored
    try:
        driver.find_element(By.XPATH,
            '//*[@id="yDmH0d"]/c-wiz/div/div/div/div[2]/div[1]/div[3]/div[1]/div[1]/form[2]/div/div/button').click()
    except Exception:
        pass
# on the google image page
image_urls = []
image_count = 0
    results_start = 0
while image_count < max_links_to_fetch:
# scroll to page end so that all images can be seen
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(sleep_between_interactions)
# get all elements that are a thumbnail image
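        # NOTE: '.bRMDJf' (and '.n3VNCb.KAlRDb' below) are Google-internal class
        # names; they change over time and may need updating if nothing is found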
thumbnail_results = driver.find_elements(By.CSS_SELECTOR, '.bRMDJf')
number_results = len(thumbnail_results)
print(f"Found: {number_results} search results. Extracting links from {number_results}:{number_results}")
# click on the thumbnail images to be able to extract the urls
for img in thumbnail_results[results_start:number_results]:
try:
img.click()
time.sleep(sleep_between_interactions)
except Exception:
continue
# Extract image urls from elements
actual_images = driver.find_elements(By.CSS_SELECTOR, '.n3VNCb.KAlRDb')
if not actual_images:
continue
            actual_img = actual_images[0]
            src = actual_img.get_attribute('src')
            if src and 'http' in src:
                image_urls.append(src)
            image_count = len(image_urls)
            if image_count >= max_links_to_fetch:
                print(f"Found: {image_count} image links, done!")
                time.sleep(sleep_between_interactions)
                return image_urls
            else:
                print(f"Found: {image_count} image links, looking for more ...")
                time.sleep(sleep_between_interactions)
        # if there are not enough pictures on this page,
        # go to the next batch by clicking the "load more" button
        try:
            driver.execute_script("document.querySelector('.mye4qd').click();")
            results_start = len(thumbnail_results)
        except Exception as e:
            # no "load more" button left: stop instead of re-scanning the same thumbnails forever
            print(e)
            break
    return image_urls
def save_image(folder_path:str, name:str, url:str, counter):
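    """Download `url` and save it as <folder_path>/<name>_<counter>.jpg."""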
    # download the image, bailing out on network errors or HTTP error responses
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except Exception as e:
        print(f"could not download {url} - {e}")
        return
    # save the image; convert to RGB first, since JPEG cannot store an alpha channel
    try:
        im = Image.open(io.BytesIO(response.content)).convert('RGB')
        file_path = os.path.join(folder_path, name + "_" + str(counter) + '.jpg')
        im.save(file_path)
        print(f"success! saved image {url} as {file_path}")
    except Exception as e:
        print(f"could not save {url} - {e}")
def scrape_images_from_google(search_term: str, target_path = './images', number_images = 10):
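    """Scrape `number_images` Google Images results for `search_term` into
    <target_path>/<search_term_with_underscores>/."""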
    # make a folder named after the search term (spaces replaced by underscores)
    search_term_underscores = search_term.replace(" ", "_")
    target_folder = os.path.join(target_path, search_term_underscores)
    os.makedirs(target_folder, exist_ok=True)
# open driver and scrape the URLS
with get_driver() as driver:
res = get_image_urls(search_term, driver, number_images, 1.0)
    # save images in the target folder with an increasing index
    for count, url in enumerate(res):
        save_image(target_folder, search_term_underscores, url, count)


if __name__ == "__main__":
    scrape_images_from_google("polecat")  # arguments for target path and number of images are optional
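    # example call with explicit (hypothetical) arguments:
    # scrape_images_from_google("red fox", target_path='./data', number_images=25)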