forked from karpathy/arxiv-sanity-preserver
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtwitter_daemon.py
189 lines (160 loc) · 7.28 KB
/
twitter_daemon.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
"""
Periodically checks Twitter for tweets about arxiv papers we recognize
and logs the tweets into mongodb database "arxiv", under "tweets" collection.
"""
import os
import re
import pytz
import time
import math
import pickle
import datetime
from dateutil import parser
import twitter # pip install python-twitter
import pymongo
from utils import Config
# settings
# -----------------------------------------------------------------------------
# Pause between polling rounds, in seconds; also used as the back-off delay
# when a Twitter search call fails. Default rate limit is 180 per 15 minutes.
sleep_time = 60*10
# Upper bound on how many tweets are kept per paper in the tweets_top*
# records; only the highest-weight ones survive the crop.
max_tweet_records = 15
# convenience functions
# -----------------------------------------------------------------------------
def get_keys():
    """Read the Twitter API credentials from ``twitter.txt``.

    The file is read from the current working directory and split into
    lines; the caller uses the first four lines as consumer key, consumer
    secret, access token key and access token secret, in that order.

    Returns:
        list of str, one entry per line of the file (line endings removed).

    Raises:
        OSError: if ``twitter.txt`` cannot be opened.
    """
    # use a context manager so the file handle is closed deterministically
    with open('twitter.txt', 'r') as f:
        return f.read().splitlines()
def extract_arxiv_pids(r):
    """Collect the arxiv paper ids linked from a tweet.

    Args:
        r: a tweet/status object exposing ``urls``, an iterable of objects
           with an ``expanded_url`` attribute.

    Returns:
        list of str: for every url containing ``arxiv.org/abs/``, everything
        that follows that prefix, in url order. Empty if nothing matches.
    """
    pids = []
    for u in r.urls:
        url = u.expanded_url
        if not url:
            continue  # no expanded url available; re.search would raise on None
        # dots are escaped so that only a literal "arxiv.org" host matches
        m = re.search(r'arxiv\.org/abs/(.+)', url)
        if m:
            # NOTE(review): the capture keeps whatever trails the id
            # (version suffix, query string); callers filter against the db.
            pids.append(m.group(1))
    return pids
def get_latest_or_loop(q):
    """Search Twitter for recent tweets matching ``q``, retrying until it works.

    Calls ``api.GetSearch`` with a raw query asking for up to 100 recent
    results. Any exception is printed, followed by a ``sleep_time`` second
    pause and another attempt, so this function blocks until a search
    returns something other than ``None``.

    Args:
        q: query term, interpolated as-is into the raw query string.

    Returns:
        whatever ``api.GetSearch`` returned on the first successful call.
    """
    while True:
        try:
            found = api.GetSearch(raw_query="q=%s&result_type=recent&count=100" % (q, ))
        except Exception as err:
            print('there was some problem (waiting some time and trying again):')
            print(err)
            time.sleep(sleep_time)
            continue
        if found is not None:
            return found
# Timezone-aware (UTC) Unix epoch; subtracted from a tweet's creation
# datetime in the main loop to obtain seconds since epoch.
epochd = datetime.datetime(1970,1,1,tzinfo=pytz.utc) # time of epoch
def tprepro(tweet_text):
    """Turn a piece of text into its set of normalized words.

    The text is lower-cased and split on whitespace; tokens that start with
    ``#`` (hashtags) are dropped, punctuation is removed from the remaining
    tokens, and tokens that end up empty are discarded.

    Args:
        tweet_text: the raw text (a tweet or a paper title).

    Returns:
        set of str: the distinct words of the text.
    """
    # Hashtags must be filtered BEFORE punctuation is stripped: '#' is itself
    # punctuation, so stripping first would make the filter a no-op.
    tokens = (w for w in tweet_text.lower().split() if not w.startswith('#'))
    stripped = (re.sub(r'[^\w\s]', '', w) for w in tokens)  # remove punctuation
    return {w for w in stripped if w}
# -----------------------------------------------------------------------------
# authenticate to twitter API
# The credentials file supplies, in order: consumer key, consumer secret,
# access token key, access token secret.
keys = get_keys()
api = twitter.Api(
    consumer_key=keys[0],
    consumer_secret=keys[1],
    access_token_key=keys[2],
    access_token_secret=keys[3],
)
# connect to mongodb instance
client = pymongo.MongoClient()
mdb = client.arxiv
tweets = mdb.tweets # the "tweets" collection in "arxiv" database
# precomputed per-paper rankings over the last 1, 7 and 30 days
tweets_top1 = mdb.tweets_top1
tweets_top7 = mdb.tweets_top7
tweets_top30 = mdb.tweets_top30
# Collection.count() is deprecated since PyMongo 3.7 and removed in 4.0;
# count_documents({}) is the supported way to count every document.
print('mongodb tweets collection size:', tweets.count_documents({}))
print('mongodb tweets_top1 collection size:', tweets_top1.count_documents({}))
print('mongodb tweets_top7 collection size:', tweets_top7.count_documents({}))
print('mongodb tweets_top30 collection size:', tweets_top30.count_documents({}))
# load banned accounts
# `banned` maps screen name -> 1 and is only used for membership tests;
# the file at Config.banned_path holds one screen name per line.
banned = {}
if os.path.isfile(Config.banned_path):
    with open(Config.banned_path, 'r') as f:
        for line in f:
            # strip so that CRLF line endings or stray spaces do not yield
            # names that can never equal a real screen name
            name = line.strip()
            if name:
                banned[name] = 1 # mark banned
    print('banning users:', list(banned.keys()))
# main loop
# Each round: (re)load the paper db if needed, pull recent tweets mentioning
# arxiv.org, store the new ones that reference known papers, then rebuild the
# 1/7/30-day per-paper rankings and sleep.
last_db_load = None
while True:
    dnow_utc = datetime.datetime.now(datetime.timezone.utc)

    # fetch all database arxiv pids that we know about (and handle an update of the db file)
    if last_db_load is None or os.stat(Config.db_path).st_mtime > last_db_load:
        # timestamp is taken before loading so that a write happening during
        # the load triggers another reload on the next round
        last_db_load = time.time()
        print('(re-) loading the paper database', Config.db_path)
        # NOTE: unpickling executes arbitrary code; db_path must be a trusted file.
        with open(Config.db_path, 'rb') as f:
            db = pickle.load(f)

    # fetch the latest mentioning arxiv.org
    results = get_latest_or_loop('arxiv.org')
    to_insert = []
    for r in results:
        arxiv_pids = [p for p in extract_arxiv_pids(r) if p in db] # filter to those that are in our paper db
        if not arxiv_pids:
            continue # nothing we know about here, lets move on
        if tweets.find_one({'id': r.id}):
            continue # we already have this item
        if r.user.screen_name in banned:
            continue # banned user, very likely a bot

        # create the tweet. intentionally making it flat here without user nesting
        created = parser.parse(r.created_at) # datetime instance
        to_insert.append({
            'id': r.id,
            'pids': arxiv_pids, # arxiv paper ids mentioned in this tweet
            'inserted_at_date': dnow_utc,
            'created_at_date': created,
            'created_at_time': (created - epochd).total_seconds(), # seconds since epoch
            'lang': r.lang,
            'text': r.text,
            'user_screen_name': r.user.screen_name,
            'user_image_url': r.user.profile_image_url,
            'user_followers_count': r.user.followers_count,
            'user_following_count': r.user.friends_count,
        })

    if to_insert:
        tweets.insert_many(to_insert) # insert_many rejects an empty list, hence the guard
    # Collection.count() was removed in PyMongo 4.0; count_documents({}) replaces it
    print('processed %d/%d new tweets. Currently maintaining total %d' % (len(to_insert), len(results), tweets.count_documents({})))

    # run over 1,7,30 days
    pid_to_words_cache = {} # pid -> word set of the paper title, shared across the three windows
    for days in [1, 7, 30]:
        tweets_top = {1: tweets_top1, 7: tweets_top7, 30: tweets_top30}[days]

        # precompute: compile together all votes over the last `days` days
        dminus = dnow_utc - datetime.timedelta(days=days)
        relevant = tweets.find({'created_at_date': {'$gt': dminus}})
        raw_votes, votes, records_dict = {}, {}, {}
        for tweet in relevant:
            # some tweets are really boring, like an RT
            tweet_words = tprepro(tweet['text'])
            isok = not (tweet['text'].startswith('RT') or tweet['lang'] != 'en' or len(tweet['text']) < 40)

            # give people with more followers more vote, as it's seen by more people and contributes to more hype
            # (log10 of followers, capped at 4, halved -> a vote in [0, 2])
            float_vote = min(math.log10(tweet['user_followers_count'] + 1), 4.0) / 2.0

            for pid in tweet['pids']:
                # good tweets make a comment, not just a boring RT, or exactly the post title. Detect these.
                title_words = pid_to_words_cache.get(pid)
                if title_words is None:
                    if pid not in db:
                        # stored tweet references a paper that is no longer in the
                        # (reloaded) db; skip it instead of dying on a KeyError
                        continue
                    title_words = tprepro(db[pid]['title'])
                    pid_to_words_cache[pid] = title_words

                if pid not in records_dict:
                    records_dict[pid] = {'pid': pid, 'tweets': [], 'vote': 0.0, 'raw_vote': 0} # create a new entry for this pid

                comment_words = tweet_words - title_words # how much does the tweet have other than just the actual title of the article?
                isok2 = int(isok and len(comment_words) >= 3)

                # add up the votes for papers
                tweet_sort_bonus = 10000 if isok2 else 0 # lets bring meaningful comments up front.
                records_dict[pid]['tweets'].append({
                    'screen_name': tweet['user_screen_name'],
                    'image_url': tweet['user_image_url'],
                    'text': tweet['text'],
                    'weight': float_vote + tweet_sort_bonus,
                    'ok': isok2,
                    'id': str(tweet['id']),
                })
                votes[pid] = votes.get(pid, 0.0) + float_vote
                raw_votes[pid] = raw_votes.get(pid, 0) + 1

        # record the total amount of vote/raw_vote for each pid
        for pid in votes:
            records_dict[pid]['vote'] = votes[pid] # record the total amount of vote across relevant tweets
            records_dict[pid]['raw_vote'] = raw_votes[pid]

        # crop the tweets to only some number of highest weight ones (for efficiency)
        for record in records_dict.values():
            record['num_tweets'] = len(record['tweets']) # back this up before we crop
            record['tweets'].sort(reverse=True, key=lambda x: x['weight'])
            record['tweets'] = record['tweets'][:max_tweet_records]

        # some debugging information
        top_votes = sorted(((v, k) for k, v in votes.items()), reverse=True, key=lambda x: x[0]) # sort descending by votes
        print('top votes:', top_votes[:10])

        # write the results to mongodb
        if records_dict:
            tweets_top.delete_many({}) # clear the whole tweets_top collection
            tweets_top.insert_many(list(records_dict.values())) # insert all precomputed records (minimal tweets) with their votes

    # and sleep for a while
    print('sleeping', sleep_time)
    time.sleep(sleep_time)