-
-
Notifications
You must be signed in to change notification settings - Fork 1
/
scraper.py
69 lines (53 loc) · 1.7 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import requests
from bs4 import BeautifulSoup
import re
import terminal as terminal
import os
from dotenv import load_dotenv
def scrape(url):
    """Fetch *url* and return the cleaned text of every <p> element.

    Args:
        url: the page to download.

    Returns:
        A list of non-empty paragraph strings with bracketed markers removed
        (whatever clean_text() produces for a list of strings).
    """
    page = requests.get(url)
    # html5lib is a lenient parser — tolerates the malformed HTML common on the web.
    parsed = BeautifulSoup(page.content, "html5lib")
    paragraphs = [tag.text for tag in parsed.find_all("p")]
    return clean_text(paragraphs)
def clean_text(text):
    """Strip bracketed markers (e.g. Wikipedia citation tags like "[1]") from text.

    Args:
        text: a single string, or a list of strings.

    Returns:
        For a list: a new list of the non-blank entries with "[...]" spans removed.
        For a string: the string with "(...)" and "[...]" spans removed.
        For any other type (or a non-str list element): an error-message string.
    """
    if isinstance(text, list):
        result = []
        for t in text:
            if not isinstance(t, str):
                return f"Error: clean_text() cannot clean text of type {type(t)}"
            # Skip whitespace-only paragraphs entirely.
            if t.strip() != "":
                result.append(re.sub(r"\[.*?\]", "", t))
        return result
    elif isinstance(text, str):
        # Bug fix: the original referenced the undefined name `t` here,
        # raising NameError for every string input. Use `text`.
        return re.sub(r"[\(\[].*?[\)\]]", "", text)
    else:
        # Bug fix: original message was missing the space before the type.
        return f"Error: clean_text() cannot clean data of type {type(text)}"
def search(term):
    """Scrape the English Wikipedia article for *term* and return its text.

    NOTE(review): *term* is interpolated into the URL as-is — spaces and
    special characters are not URL-encoded; confirm callers pass
    article-style titles.
    """
    return scrape(f"https://en.wikipedia.org/wiki/{term}")
def get_images(term, filename):
    """Download the top Google Images result for *term* into ./images.

    Args:
        term: the search query.
        filename: base name to give the downloaded image file.

    Credentials come from the GOOGLE_IMAGES_KEY / GOOGLE_IMAGES_ID
    environment variables, loaded from a .env file.
    """
    load_dotenv()
    api_key = os.getenv("GOOGLE_IMAGES_KEY")
    engine_id = os.getenv("GOOGLE_IMAGES_ID")

    # Imported lazily, as in the original — the rest of the module works
    # even if google_images_search is not installed.
    from google_images_search import GoogleImagesSearch

    client = GoogleImagesSearch(api_key, engine_id)

    params = {
        'q': term,
        'num': 1,
        'safe': 'high'
    }

    # Performs the search and downloads the result in a single call.
    client.search(search_params=params,
                  path_to_dir='./images', custom_image_name=filename)
# get_images("cat")