# sentiment_analysis.R (forked from Cesar456/sentimentCN)
# Original listing: executable file, 108 lines (88 loc), 4.55 KB
# sentiment analysis using R
# https://github.com/abromberg/sentiment_analysis
library(plyr)
library(stringr)
library(e1071)
# Working directory: the project folder used to be hard-coded, which aborts the
# script on any other machine. The same folder is still the default, but it can
# be overridden with the SENTIMENT_ANALYSIS_DIR environment variable. If the
# folder cannot be entered, the current working directory is kept (with a
# message) so the relative paths used by this script resolve from wherever it
# is launched.
project_dir <- Sys.getenv(
  "SENTIMENT_ANALYSIS_DIR",
  unset = "D:/Dropbox/sentimentCN/sentiment_analysis-master/"
)
invisible(tryCatch(
  setwd(project_dir),
  error = function(e) {
    message(
      "Could not change to '", project_dir, "' (", conditionMessage(e),
      "); keeping working directory '", getwd(), "'."
    )
  }
))
##############
"read dict"
##############
# Load the word polarity list (tab-separated, no header) and name its two
# columns; words are lower-cased so they compare equal to lower-cased text.
afinn_list <- read.delim(
  file = "./AFINN/AFINN-111.txt",
  header = FALSE,
  stringsAsFactors = FALSE
)
names(afinn_list) <- c("word", "score")
afinn_list$word <- tolower(afinn_list$word)
# Bucket the words from very negative to very positive and add some
# movie-specific words. `%in%` is used instead of chained `==` comparisons
# because it never returns NA: a row with a missing score is simply skipped
# instead of putting an NA entry into a term vector.
vNegTerms <- afinn_list$word[afinn_list$score %in% c(-5, -4)]
negTerms <- c(
  afinn_list$word[afinn_list$score %in% c(-3, -2, -1)],
  "second-rate", "moronic", "third-rate", "flawed", "juvenile", "boring",
  "distasteful", "ordinary", "disgusting", "senseless", "static", "brutal",
  "confused", "disappointing", "bloody", "silly", "tired", "predictable",
  "stupid", "uninteresting", "trite", "uneven", "outdated", "dreadful",
  "bland"
)
posTerms <- c(
  afinn_list$word[afinn_list$score %in% c(3, 2, 1)],
  "first-rate", "insightful", "clever", "charming", "comical", "charismatic",
  "enjoyable", "absorbing", "sensitive", "intriguing", "powerful", "pleasant",
  "surprising", "thought-provoking", "imaginative", "unpretentious"
)
vPosTerms <- c(
  afinn_list$word[afinn_list$score %in% c(5, 4)],
  "uproarious", "riveting", "fascinating", "dazzling", "legendary"
)
#############################
"function of sentimentScore"
#############################
# Count the sentiment-bearing words of every sentence.
#
# Arguments:
#   sentences  character vector (or list) of sentences to score.
#   vNegTerms  very negative terms.
#   negTerms   negative terms.
#   posTerms   positive terms.
#   vPosTerms  very positive terms.
#
# Returns a character matrix with one row per sentence and five columns:
# the original sentence followed by the number of its words found in
# vNegTerms, negTerms, posTerms and vPosTerms (counts stored as strings,
# because the sentence and the counts share one matrix).
#
# Sentences are stripped of punctuation, control characters and digits,
# lower-cased and split on whitespace. The term lists get the same cleaning,
# so a hyphenated term such as "second-rate" is compared as "secondrate" on
# both sides and can actually match. The result is always an n x 5 matrix,
# including for a single sentence (1 x 5) and for empty input (0 x 5).
sentimentScore <- function(sentences, vNegTerms, negTerms, posTerms, vPosTerms) {
  # Shared cleaning step for sentences and term lists.
  normalize <- function(text) {
    text <- gsub("[[:punct:]]", "", text)
    text <- gsub("[[:cntrl:]]", "", text)
    text <- gsub("\\d+", "", text)
    tolower(text)
  }

  # Clean every term list once, outside the per-sentence work. Entries that
  # become empty after cleaning are dropped so they cannot match anything.
  lexicons <- lapply(
    list(vNegTerms, negTerms, posTerms, vPosTerms),
    function(terms) {
      cleaned <- unique(normalize(terms))
      cleaned[!is.na(cleaned) & nzchar(cleaned)]
    }
  )

  # Score one sentence: c(original sentence, vNeg, neg, pos, vPos).
  score_one <- function(sentence) {
    words <- unlist(strsplit(normalize(sentence), "\\s+", perl = TRUE))
    words <- words[!is.na(words) & nzchar(words)]
    counts <- vapply(
      lexicons,
      function(terms) sum(words %in% terms),
      integer(1)
    )
    c(as.character(sentence), as.character(counts))
  }

  # vapply yields one column of five values per sentence; refill row-wise so
  # each sentence becomes a row. Going through matrix() also gives a proper
  # 0 x 5 matrix when there are no sentences.
  per_sentence <- vapply(sentences, score_one, character(5), USE.NAMES = FALSE)
  matrix(per_sentence, ncol = 5, byrow = TRUE)
}
################
"load data"
################
# Load the positive and negative review sentences (one per line, no header).
# quote = "" turns off quote processing: with read.delim's default
# (quote = "\"") a literal double quote inside a line opens a quoted field,
# which can merge several lines into a single value.
posText <- read.delim(
  file = "polarityData/rt-polaritydata/rt-polarity-pos.txt",
  header = FALSE,
  stringsAsFactors = FALSE,
  quote = ""
)
negText <- read.delim(
  file = "polarityData/rt-polaritydata/rt-polarity-neg.txt",
  header = FALSE,
  stringsAsFactors = FALSE,
  quote = ""
)
posText <- posText$V1
negText <- negText$V1
# Split any value that still contains a line break into separate sentences.
# NOTE(review): presumably this was a workaround for lines merged by quote
# handling; with quote = "" it should be a no-op and is kept as a safeguard.
posText <- unlist(lapply(posText, function(x) { str_split(x, "\n") }))
negText <- unlist(lapply(negText, function(x) { str_split(x, "\n") }))
# Build tables of positive and negative sentences with their word counts,
# then append the known label as the last column.
posResult <- as.data.frame(
  sentimentScore(posText, vNegTerms, negTerms, posTerms, vPosTerms)
)
negResult <- as.data.frame(
  sentimentScore(negText, vNegTerms, negTerms, posTerms, vPosTerms)
)
posResult <- cbind(posResult, "positive")
negResult <- cbind(negResult, "negative")
colnames(posResult) <- c("sentence", "vNeg", "neg", "pos", "vPos", "sentiment")
colnames(negResult) <- c("sentence", "vNeg", "neg", "pos", "vPos", "sentiment")
#########################################################
"run the naive bayes algorithm using all four categories"
#########################################################
# Stack the two labelled tables and fit naive Bayes on the four count
# columns (2:5) against the known label (column 6).
results <- rbind(posResult, negResult)
classifier <- naiveBayes(results[, 2:5], results[, 6])

# Predict on the very same rows the model was fitted on and keep the
# prediction next to the data as a seventh column.
results$predicted <- predicted <- predict(classifier, results)

# Show the counts, the label and the prediction for the first 50 rows.
results[1:50, 2:7]

# Confusion table for the classification run on the same data:
# rows are the predicted labels, columns the actual ones.
confTable <- table(
  results$predicted,
  results$sentiment,
  dnn = list("predicted", "actual")
)
confTable

# Binomial test (with confidence interval) of the number of correct
# predictions, i.e. the table diagonal, against chance level p = 0.5.
binom.test(confTable[1, 1] + confTable[2, 2], nrow(results), p = 0.5)