# main.py
# Standard library
import ssl
import traceback
from datetime import datetime, timedelta

# Third-party
import tweepy
from mysql import connector

# Project modules
import config
from coin import Coin
from commandline import CommandLine
from database_utilities import *  # provides coin_db and main_db (star import kept as in the original)
from databasemanager import DatabaseManager
from driver import Driver
from logger import Logger
from scraper import Scraper
from twitterclient import TwitterClient
"""global variable to log info and error to scraper_logs"""
logger = Logger()
# Avoid untrusted ssl certificates issues
ssl._create_default_https_context = ssl._create_unverified_context


def configure_search(cli, start_date, end_date):
    """Build the Twitter search URL for the given coin, date range, and language."""
    url = config.scraper['twitter_search_url']
    url += '%23{}%20'.format(cli.coin)  # %23 is a URL-encoded '#', %20 a space
    url += 'since%3A{}%20until%3A{}&'.format(start_date, end_date)  # %3A is ':'
    url += 'l={}&'.format(cli.language)
    url += 'src=typd'
    return url
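
# For illustration, the kind of URL this produces. The base URL lives in
# config.scraper['twitter_search_url'] and isn't shown in this file, so the
# prefix below is an assumption:
#   https://twitter.com/search?q=%23bitcoin%20since%3A2021-01-01%20until%3A2021-01-02&l=en&src=typd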


def get_date_range(cli):
    """Return the dates from start_date to end_date, inclusive.

    main() pairs consecutive dates as (since, until) search windows, and
    Twitter's 'until' is exclusive, so end_date must appear in the list
    for the final day of the requested range to be scraped.
    """
    date = cli.start_date
    dates = []
    while date != cli.end_date:
        dates.append(date)
        my_date = datetime.strptime(date, "%Y-%m-%d")
        date = (my_date + timedelta(days=1)).strftime("%Y-%m-%d")
    dates.append(cli.end_date)  # include end_date so the last (since, until) pair is built
    return dates
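
# For example, with hypothetical dates start_date='2021-01-01' and
# end_date='2021-01-03', this returns ['2021-01-01', '2021-01-02',
# '2021-01-03'], which main() turns into the search windows
# (01-01, 01-02) and (01-02, 01-03).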


def main_coin(cli):
    """Create the Coin instance and load its current and historical prices."""
    coin = Coin(config.coin_tickers[cli.coin])
    coin.get_current_price()
    coin.set_hist_price(cli.start_date, cli.end_date)
    return coin


def main():
    cli = CommandLine()
    date_range = get_date_range(cli)
    coin = main_coin(cli)
    coin_db(config.database_name, coin)  # persist the coin (from database_utilities)
    for i in range(len(date_range) - 1):
        url = configure_search(cli, date_range[i], date_range[i + 1])
        logger.info('Scraping from {} to {}'.format(date_range[i], date_range[i + 1]))
        driver = Driver()
        scraper = Scraper(driver, url)
        twitter_client = TwitterClient()
        try:
            # First scrape the site for tweets and users.
            tweets, users = scraper.scrape()
            # The original authors of retweets aren't handled by the scraper
            # and have incomplete info, so collect their usernames...
            extra_usernames = scraper.get_extra_usernames(users, tweets)
            # ...and complete them through the Twitter API.
            users += twitter_client.get_users_missing_data(extra_usernames)
            # Save the results to the database.
            main_db(config.database_name, tweets, users, date_range[i])
        except connector.errors.ProgrammingError:
            logger.error("DB doesn't exist, please run create_db.sql")
        except connector.errors.DatabaseError:
            logger.error("Can't connect to the database server")
        except tweepy.error.RateLimitError:
            logger.error('Twitter API rate limit exceeded.')
        except Exception:
            logger.error('Something went wrong!\n' + traceback.format_exc())
        finally:
            driver.quit()


if __name__ == '__main__':
    main()
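
# Example invocation. The flag names are defined by CommandLine, which isn't
# shown in this file, so these are assumptions:
#   python main.py --coin bitcoin --start_date 2021-01-01 --end_date 2021-01-07 --language en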