-
Notifications
You must be signed in to change notification settings - Fork 24
/
format_tweets.py
83 lines (57 loc) · 1.68 KB
/
format_tweets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
'''
format_tweets.py
Includes methods for processing raw tweet data
Created by Miles Luders
'''
import functools
from string import ascii_lowercase

from nltk.corpus import words as english_words, stopwords

import collect_tweets
def remove_excess_whitespace(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is dropped as well, because
    ``str.split()`` called without a separator discards empty fields.
    """
    tokens = text.split()
    separator = ' '
    return separator.join(tokens)
def convert_to_lowercase(text):
    """Return a copy of ``text`` with every cased character lowercased."""
    lowered = text.lower()
    return lowered
def remove_non_alpha_chars(text):
    """Drop every character that is not a lowercase ASCII letter or a space.

    Uppercase letters, digits, punctuation and whitespace other than the
    plain space character (tabs, newlines, ...) are all removed, so callers
    that want to keep letters regardless of case should lowercase first.

    Args:
        text: The string to filter.

    Returns:
        A new string containing only the characters ``a``-``z`` and ``' '``,
        in their original order.
    """
    # A set gives O(1) membership tests, and filtering in a single pass
    # avoids the O(n) cost of deleting from the middle of a list for every
    # rejected character.
    allowed = frozenset(ascii_lowercase + ' ')
    return ''.join(ch for ch in text if ch in allowed)
def format_syntax(text):
    """Normalise the surface form of ``text``.

    The result is lowercase, contains only the letters ``a``-``z`` and
    single spaces, and has no leading or trailing whitespace.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned string.
    """
    # Turn tabs/newlines into plain spaces *before* stripping non-letters:
    # remove_non_alpha_chars() keeps only ' ' as a separator, so a word
    # boundary expressed as "\n" would otherwise be deleted and the two
    # neighbouring words fused together.
    spaced = remove_excess_whitespace(text)
    lowered = convert_to_lowercase(spaced)
    letters_only = remove_non_alpha_chars(lowered)
    # Removing symbols can leave runs of spaces behind ("a $ b" -> "a  b"),
    # so collapse whitespace once more.
    return remove_excess_whitespace(letters_only)
def remove_non_english_words(text, english):
    """Keep only the words of ``text`` that appear in ``english``.

    Words are delimited by single space characters (``text.split(' ')``),
    e.g. ``"hello world"`` -> ``["hello", "world"]``.  Empty tokens produced
    by consecutive spaces are dropped unless ``''`` is itself in ``english``.

    Args:
        text: Space-separated words.
        english: Container of accepted words; a ``set`` gives O(1) lookups.

    Returns:
        The accepted words, in original order, joined by single spaces.
    """
    # Filtering in one pass replaces the previous delete-while-indexing
    # loop, which was quadratic because each ``del`` shifts the list tail.
    return ' '.join(word for word in text.split(' ') if word in english)
def remove_stopwords(text, stop):
    """Remove from ``text`` every word that appears in ``stop``.

    Words are delimited by single space characters (``text.split(' ')``).
    Tokens that are not in ``stop`` -- including empty tokens produced by
    consecutive spaces -- are kept, so the original spacing between the
    surviving words is preserved.

    Args:
        text: Space-separated words.
        stop: Container of words to discard; a ``set`` gives O(1) lookups.

    Returns:
        The surviving tokens, in original order, joined by single spaces.
    """
    # Filtering in one pass replaces the previous delete-while-indexing
    # loop, which was quadratic because each ``del`` shifts the list tail.
    return ' '.join(word for word in text.split(' ') if word not in stop)
@functools.lru_cache(maxsize=None)
def _english_vocabulary():
    """Return the lowercased NLTK English word list as a frozenset (cached)."""
    return frozenset(w.lower() for w in english_words.words())


@functools.lru_cache(maxsize=None)
def _english_stopwords():
    """Return the lowercased NLTK English stopword list as a frozenset (cached)."""
    # Called without a fileid, stopwords.words() concatenates the lists of
    # every language in the corpus, which would also strip legitimate English
    # words that happen to be stopwords elsewhere (e.g. "die", "come").
    return frozenset(w.lower() for w in stopwords.words('english'))


def format_semantic(text):
    """Reduce ``text`` to its meaningful English words.

    First discards every space-separated word that is not in the NLTK
    English word list, then discards English stopwords.

    Args:
        text: Space-separated, already-normalised words (the word lists are
            compared in lowercase, so ``text`` should be lowercase too).

    Returns:
        The remaining words joined by single spaces.
    """
    # Both word sets are built once and reused; loading the corpora and
    # lowercasing them on every call is by far the most expensive step.
    english_only = remove_non_english_words(text, _english_vocabulary())
    return remove_stopwords(english_only, _english_stopwords())
if __name__ == '__main__':
    # Disabled experiments, kept for reference:
    #
    # print("hello")
    # T = collect_tweets.get_latest_tweets('bitcoin', 2)
    # for t in T:
    #     print(remove_excess_whitespace(t[1])+"\n")
    #
    # print(format_syntax("# ILoveNY bcuz $ $ money"))

    # Quick manual check of the semantic filtering pass.
    sample = "hello to the world"
    print(format_semantic(sample))