-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexhibits.py
36 lines (28 loc) · 1.05 KB
/
exhibits.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# file: exhibits.py
# save images from exhibits pages via csv file with URLs
import csv
import os
import requests
from bs4 import BeautifulSoup
def main(
    csv_file: "path to csv file",  # type: ignore
):
    """Download every image referenced by the exhibit pages listed in a CSV.

    The CSV must contain a ``url`` column. For each row the page is fetched,
    the ``src`` of every ``<img>`` tag is resolved against the page URL, and
    the image bytes are written to ``_outputs/exhibits/<id>/<name>``, where
    ``<id>`` is the text after the last ``=`` in the page URL and ``<name>``
    is the text after the last ``/`` in the image URL.

    Args:
        csv_file: Path to the CSV file listing the exhibit page URLs. The
            string annotation is kept as-is; presumably plac shows it as the
            command-line help text for this argument.

    Raises:
        FileNotFoundError: If ``csv_file`` does not exist.
        KeyError: If the CSV has no ``url`` column.
        requests.RequestException: If a page or image request fails at the
            network level (including the timeout set below).
    """
    # Same User-Agent for pages and images: a server that rejects the default
    # client string for the page is likely to reject it for the images too.
    headers = {"User-Agent": "Mozilla/5.0"}
    # Seconds. Without a timeout a stalled server blocks the run forever.
    timeout = 30

    # newline="" is what the csv module expects so that quoted fields with
    # embedded line breaks are parsed correctly.
    with open(csv_file, newline="") as csv_fp:
        for row in csv.DictReader(csv_fp):
            page_url = row["url"]
            print("🌐", page_url)

            out_dir = "_outputs/exhibits/" + page_url.split("=")[-1]
            # Creates missing parents and tolerates re-runs, where the plain
            # os.mkdir used before would raise.
            os.makedirs(out_dir, exist_ok=True)

            page = requests.get(page_url, headers=headers, timeout=timeout)
            soup = BeautifulSoup(page.text, "html.parser")

            # Resolve relative sources against the page URL. <img> tags with
            # no src are skipped (urljoin would hand back the page URL itself)
            # and so are non-HTTP sources such as inline "data:" URIs, which
            # requests cannot fetch.
            resolved = (
                requests.compat.urljoin(page_url, img.get("src"))
                for img in soup.find_all("img")
                if img.get("src")
            )
            # dict.fromkeys drops duplicates while keeping document order, so
            # an image used several times on a page is downloaded once.
            image_urls = dict.fromkeys(
                url for url in resolved
                if url.startswith(("http://", "https://"))
            )

            for image_url in image_urls:
                filename = image_url.split("/")[-1]
                if not filename:
                    # URL ends with "/": there is no name to save under, and
                    # opening the directory path itself for writing would fail.
                    continue

                response = requests.get(
                    image_url, headers=headers, timeout=timeout
                )
                if not response.ok:
                    # Do not store an error page under an image file name.
                    print("⚠️", response.status_code, image_url)
                    continue

                with open(out_dir + "/" + filename, "wb") as image_fp:
                    image_fp.write(response.content)
if __name__ == "__main__":
    # Script entry point: hand main() to plac, which runs it with the
    # command-line arguments. plac is imported here rather than at the top of
    # the file, so it is only needed when the file is executed as a script.
    import plac

    plac.call(main)