-
Notifications
You must be signed in to change notification settings - Fork 3
/
TwitterWordPhrequency.py
executable file
·88 lines (73 loc) · 2.22 KB
/
TwitterWordPhrequency.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/usr/bin/env python3
import twitfunctions as twit
import datetime as dt
import pandas as pd
import json
import sys
import re
import os
import string
# Exactly one CLI argument is required: the Twitter handle to analyze.
try:
    _script, target = sys.argv
except ValueError:
    raise ValueError('Please enter a twitter handle to analyze') from None
# Grab target information from the local cache file OR Twitter if new.
try:
    with open(f'{target}.json', 'r') as target_file:
        target_dict = json.load(target_file)
except EnvironmentError:
    # Cache miss (file absent/unreadable): fetch the profile from Twitter
    # and write it to the cache so the next run skips the API call.
    target_dict = twit.getUserInfobyName(target)
    with open(f'{target}.json', 'w') as target_file:
        json.dump(target_dict, target_file)
# Grab the target's timeline from the local cache file OR Twitter if new.
count = 200  # number of tweets to request on a cache miss
try:
    with open(f'{target}_timeline.json', 'r') as timeline_file:
        target_timeline_dict = json.load(timeline_file)
except EnvironmentError:
    # Cache miss (file absent/unreadable): fetch the timeline from Twitter
    # and write it to the cache so the next run skips the API call.
    target_timeline_dict = twit.getUserTimeline(target, count)
    with open(f'{target}_timeline.json', 'w') as timeline_file:
        json.dump(target_timeline_dict, timeline_file)
# Data holders for sanitization and scraping of the cached timeline.
links = []      # raw tokens containing an http:/https: URL
word_dict = {}  # punctuation-stripped token -> occurrence count
rt_list = []    # full lowercased text of tweets starting with 'rt'
hashtags = []   # raw '#'-prefixed tokens
word_count = 0  # total token count across all tweets

# Translation table that strips all ASCII punctuation in one C-level pass.
table = str.maketrans('', '', string.punctuation)

# NOTE(review): assumes each timeline entry is a dict with a 'text' key —
# matches what the cached Twitter timeline JSON provides here.
for tweet in target_timeline_dict:
    text = tweet['text'].lower()
    tokens = text.split()
    word_count += len(tokens)

    # Simple retweet heuristic: lowercased text beginning with 'rt'.
    if text.startswith('rt'):
        rt_list.append(text)

    for word in tokens:
        if 'http:' in word or 'https:' in word:
            links.append(word)
        if word.startswith('#'):
            hashtags.append(word)
        # Strip punctuation AFTER the link/hashtag checks so those lists
        # keep the raw tokens; the stripped form is what gets counted.
        word = word.translate(table)
        word_dict[word] = word_dict.get(word, 0) + 1

# Report words by descending frequency, then the collected links.
for word, freq in sorted(word_dict.items(), key=lambda kv: kv[1], reverse=True):
    print(word, freq)
print(links)