This repository has been archived by the owner on Oct 28, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
scraperbox.py
63 lines (47 loc) · 1.5 KB
/
scraperbox.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import urllib.parse
import urllib.request
import ssl
import json
import os
import pandas as pd
# ScraperBox API token read from the environment; None when the variable is unset.
API_TOKEN = os.environ.get("SCRAPER_BOX_TOKEN")
# HACK: disables TLS certificate verification for EVERY https request in this
# process — insecure (allows MITM). NOTE(review): prefer installing a proper CA
# bundle (e.g. certifi) instead of an unverified context; confirm why this was needed.
ssl._create_default_https_context = ssl._create_unverified_context
def fetch_gsearch_result(search_str):
    """Fetch Google organic search results for *search_str* via ScraperBox.

    Calls the ScraperBox Google-search API (proxying from Japan, up to 100
    results) and returns a pandas DataFrame with one row per organic result.

    Parameters
    ----------
    search_str : str
        The raw search query; URL-encoding is handled internally.

    Returns
    -------
    pandas.DataFrame
        Columns: keyword, site_name, URL, snippet, ranking (1-based rank).

    Raises
    ------
    urllib.error.URLError
        On network failure or timeout.
    KeyError
        If the API response lacks "organic_results".
    """
    # urlencode escapes every parameter (query AND token) — the original
    # interpolated the token and other params unescaped with %-formatting.
    params = urllib.parse.urlencode({
        "token": API_TOKEN,
        "q": search_str,
        "proxy_location": "jp",
        "results": 100,
    })
    query = "https://api.scraperbox.com/google?" + params
    print("query:", query)
    # Call the API; a timeout prevents hanging forever on a dead connection,
    # and the context manager guarantees the response is closed.
    request = urllib.request.Request(query)
    print("Request start")
    with urllib.request.urlopen(request, timeout=30) as resp:
        raw_json = resp.read().decode("utf-8")
    print("Request end")
    response = json.loads(raw_json)
    print(response)
    results = response["organic_results"]
    # Guard the debug print: the original indexed [0] unconditionally and
    # crashed with IndexError whenever the query returned zero results.
    if results:
        print(results[0]["title"])
    num_items = len(results)
    df = pd.DataFrame(data=dict(
        keyword=[search_str] * num_items,
        site_name=[res["title"] for res in results],
        URL=[res["link"] for res in results],
        # .get(): some organic results carry no snippet; avoid KeyError.
        snippet=[res.get("snippet", "") for res in results],
        ranking=list(range(1, num_items + 1)),
    ))
    return df
if __name__ == '__main__':
    # Read one search keyword from stdin, scrape its Google results,
    # and write them out as "<keyword>.csv" in the working directory.
    keyword = input()
    result_df = fetch_gsearch_result(keyword)
    result_df.to_csv(keyword + ".csv")