search_assisted_embassies.py
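"""
Search-assisted discovery of embassy social media accounts.

Reads embassy records from 'embassies.csv' (which must include a 'Name' column),
issues one Google search per embassy through SerpAPI restricted to a list of
social media sites, and writes every organic result whose domain is on that
list, together with the embassy's original columns, to 'results_embassies.csv'.
Requires a valid SerpAPI key in `base_params` below.
"""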
import requests
import csv
import time
import urllib.parse
import pandas as pd
# API key and base parameters
base_params = {
"api_key": "APIKEY", # Replace with your actual SerpAPI key
"engine": "google",
"location": "United States",
"google_domain": "google.com",
"gl": "us",
"hl": "en",
"start": "0",
"num": "20"
}
# Read the embassies from 'embassies.csv' using pandas
embassies_df = pd.read_csv('embassies.csv')
# List of sites to search
sites = ["instagram.com", "facebook.com", "youtube.com", "t.me", "tiktok.com", "x.com"]
def get_query(sites, embassy_name):
    site_query = ' OR '.join([f'site:{site}' for site in sites])
    return f'{embassy_name} ({site_query})'
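
# Example (with a hypothetical embassy name):
#   get_query(sites, "Embassy of Sweden")
#   -> 'Embassy of Sweden (site:instagram.com OR site:facebook.com OR ... OR site:x.com)'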
# Prepare field names for the CSV file
embassy_columns = embassies_df.columns.tolist()
result_columns = ['site', 'title', 'link', 'snippet']
fieldnames = embassy_columns + result_columns
# Open the output CSV file
with open('results_embassies.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # Loop over each embassy (row) in the DataFrame
    for index, row in embassies_df.iterrows():
        embassy_name = row['Name']
        query = get_query(sites, embassy_name)
        print(f"Processing query: {query}")
        for start in range(0, 10, 10):
            params = base_params.copy()
            params['q'] = query
            params['start'] = str(start)
            params['num'] = '10'

            try:
                response = requests.get('https://serpapi.com/search', params=params)
                data = response.json()
            except Exception as e:
                print(f"Error fetching results for query: {query} start: {start}")
                print(e)
                continue

            # Check for errors in the response
            if 'error' in data:
                print(f"Error in response for query: {query} start: {start}")
                print(data['error'])
                break  # Exit the loop if there's an error

            # Process the results
            if 'organic_results' in data and data['organic_results']:
                for result in data['organic_results']:
                    # Get relevant fields
                    title = result.get('title', '')
                    link = result.get('link', '')
                    snippet = result.get('snippet', '')

                    # Extract the site from the link
                    parsed_url = urllib.parse.urlparse(link)
                    site = parsed_url.netloc.replace('www.', '')
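                    # NOTE: only a leading 'www.' is stripped, so results served
                    # from other subdomains (e.g. m.facebook.com) will not match
                    # `sites` and are skipped by the check below.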
                    if site not in sites:
                        continue  # Skip sites not in our list

                    # Prepare the row data by combining embassy info and result info
                    row_data = row.to_dict()
                    row_data.update({
                        'site': site,
                        'title': title,
                        'link': link,
                        'snippet': snippet
                    })

                    # Write to CSV
                    writer.writerow(row_data)
            else:
                print(f"No more results for query: {query} starting at {start}")
                break  # Exit the loop if no more results

            # Respect rate limits
            time.sleep(1)