main.py
'''
Copyright 2013 Kendrick Ledet
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.
Google-EmailScraper
Purpose: Scraper that searches Google based on a query and scrapes all emails found on each page.
Output files are saved as csv.
Date: 5/26/13
'''
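# Note: this script targets Python 2 (urllib2, csv in binary mode) and assumes
# the third-party xgoogle package is importable for parsing Google result pages.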
from xgoogle.search import GoogleSearch
import urllib2, re, csv
import argparse


class ScrapeProcess(object):
    def __init__(self, filename):
        self.emails = []  # addresses seen so far, kept to avoid duplicate rows
        self.filename = filename
        self.csvfile = open(filename, 'wb+')
        self.csvwriter = csv.writer(self.csvfile)

    def go(self, query, pages):
        search = GoogleSearch(query)
        search.results_per_page = 10
        for i in range(pages):
            search.page = i
            results = search.get_results()
            for page in results:
                self.scrape(page)

    def scrape(self, page):
        try:
            request = urllib2.Request(page.url.encode('utf8'))
            html = urllib2.urlopen(request).read()
        except Exception:
            return  # skip pages that fail to download
        # Require at least a two-letter TLD so bare "user@host." fragments are not matched.
        emails = re.findall(r'([A-Za-z0-9\.\+_-]+@[A-Za-z0-9\._-]+\.[a-zA-Z]{2,})', html)
        for email in emails:
            if email not in self.emails:  # not a duplicate
                self.csvwriter.writerow([page.title.encode('utf8'), page.url.encode('utf8'), email])
                self.emails.append(email)


parser = argparse.ArgumentParser(description='Scrape Google results for emails')
parser.add_argument('-query', type=str, default='test', help='a query to use for the Google search')
parser.add_argument('-pages', type=int, default=10, help='number of Google results pages to scrape')
parser.add_argument('-o', type=str, default='emails.csv', help='output filename')
args = parser.parse_args()

# make sure the output filename ends with a .csv extension
if not args.o.endswith('.csv'):
    args.o += '.csv'

s = ScrapeProcess(args.o)
s.go(args.query, args.pages)
s.csvfile.close()  # flush buffered rows to disk
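
# Example invocation (a sketch; assumes xgoogle is on PYTHONPATH and that
# Google still serves these result pages to automated clients):
#
#   python main.py -query "contact site:example.com" -pages 5 -o results
#
# This writes deduplicated rows of (page title, page URL, email) to results.csv.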