# build_training_set.py
# Forked from ravikiranj/twitter-sentiment-analyzer.
import classifier_helper
from classifier_helper import *
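# Disabled alternative input path, kept for reference only: it is wrapped in a
# bare string literal, so none of it is executed. It read a capped number of
# lines from the CSV below and passed the sixth ',"'-separated field of each
# line through process_tweet_modified().
'''
inpfile = open("training.1600000.processed.noemoticon.csv", "r")
line = inpfile.readline()
maxCount = 100
count = 1
tweets = []
while line:
    count += 1
    if count > maxCount:
        break
    splitArr = line.split(',"')
    unprocessed_tweet = splitArr[5]
    #tweet = process_tweet(unprocessed_tweet)
    tweet = process_tweet_modified(unprocessed_tweet)
    tweets.append(tweet)
    line = inpfile.readline()
#end while loop
'''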
# Load the labelled tweets from baseline_output.txt into tweetItems as
# (processed_tweet, opinion) pairs. Each line is split on '|'; the first field
# is taken as the tweet text and the second as its opinion label.
VALID_OPINIONS = ('neutral', 'negative', 'positive')
tweetItems = []
# `with` closes the file even if parsing raises part-way through.
with open("baseline_output.txt", "r") as inpfile:
    # Count from 1 so reported numbers match the actual line in the file.
    for count, line in enumerate(inpfile, 1):
        splitArr = line.split('|')
        if len(splitArr) < 2:
            # No '|' separator (for example a blank line): there is no label
            # to read, so report the line and move on instead of crashing.
            print('Error with line = %s, Line = %s' % (line.strip(), count))
            continue
        processed_tweet = splitArr[0].strip()
        opinion = splitArr[1].strip()
        if opinion not in VALID_OPINIONS:
            # Format inside the call so this works as a Python 2 print
            # statement and as a Python 3 print() call alike.
            print('Error with tweet = %s, Line = %s' % (processed_tweet, count))
        # Items with an unrecognised label are reported above but still kept,
        # exactly as before.
        tweetItems.append((processed_tweet, opinion))
# Turn every (text, label) pair into (tokens, label): the text is split on
# whitespace, tokens shorter than three characters are dropped, and the
# remaining tokens are lower-cased.
tweets = [
    ([token.lower() for token in text.split() if len(token) >= 3], label)
    for (text, label) in tweetItems
]
# Build the feature vocabulary from the filtered tweets and register it.
# These helpers are not defined in this file; presumably they come from
# `from classifier_helper import *` -- confirm.
word_features = get_word_features(get_words_in_tweets(tweets))
set_word_features(word_features)

# NOTE(review): `nltk` is not imported in this file; it is presumably
# re-exported by the classifier_helper star import -- confirm.
# apply_features pairs extract_features(tokens) with each tweet's label.
training_set = nltk.classify.apply_features(extract_features, tweets)
classifier = nltk.NaiveBayesClassifier.train(training_set)

# Smoke test on a single hand-written tweet.
tweet = 'im so sad'
# Single-argument print(...) prints the same under Python 2 and also runs
# under Python 3.
print(classifier.classify(extract_features(tweet.split())))

# NOTE(review): accuracy is computed on the same data the classifier was
# trained on, so it will overstate performance on unseen tweets.
print(nltk.classify.accuracy(classifier, training_set))
classifier.show_most_informative_features(20)