# twitter_text_analysis.R
# topicmodels provides LDA(), CTM() and the get_terms()/get_topics() helpers;
# tm provides the corpus and term-matrix machinery
library(topicmodels)
library(tm)
# search the tweet text for high-frequency keywords
Dat <- read.csv(file.choose(), header = TRUE, sep = ",")
head(Dat$tweet_text)
# normalize the encoding to UTF-8 so downstream tm functions handle odd characters
Dat$tweet_text <- sapply(Dat$tweet_text, function(row) iconv(row, to = "UTF-8"))
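# Optional extra cleaning (not in the original script): tweets typically carry
# URLs and @mentions that would otherwise survive the tm preprocessing below
Dat$tweet_text <- gsub("http\\S+|www\\.\\S+", "", Dat$tweet_text)
Dat$tweet_text <- gsub("@\\w+", "", Dat$tweet_text)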
a <- Corpus(VectorSource(Dat$tweet_text))
head(a)
# preprocessing: lowercase (via content_transformer so the corpus structure
# survives), then strip punctuation and numbers
a <- tm_map(a, content_transformer(tolower))
a <- tm_map(a, removePunctuation)
a <- tm_map(a, removeNumbers)
# this stopword list needs editing: repeat removeWords a few times, adding
# high-frequency, context-specific words with no semantic value (sketch below)
a <- tm_map(a, removeWords, stopwords("english"))
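# A sketch of the repeated pass described above; these words are hypothetical
# placeholders, replace them after inspecting the term-document matrix
custom_stops <- c("rt", "via", "amp")
a <- tm_map(a, removeWords, custom_stops)
a <- tm_map(a, stripWhitespace)  # collapse gaps left by removed words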
# rows represent terms and columns represent the documents they're contained
# in, weighted by unnormalized tf-idf
mat <- TermDocumentMatrix(a, control = list(
  weighting = function(x) weightTfIdf(x, normalize = FALSE)))
# Interpreting term document matrix
inspect(mat[1:50,1:50])
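# Optional quick look (not in the original): terms whose summed tf-idf weight
# clears an illustrative threshold; tune the value to your data
findFreqTerms(mat, lowfreq = 20)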
# automatic discovery of topics in the text through "topic modeling" with
# latent Dirichlet allocation (LDA), a popular topic-modeling algorithm
# topicmodels::LDA() needs raw term counts with documents as rows, so build
# a DocumentTermMatrix with the default term-frequency weighting for modeling
dtm <- DocumentTermMatrix(a)
dim(dtm)
dtm <- removeSparseTerms(dtm, 0.99)
# drop any documents left empty after pruning sparse terms
dtm <- dtm[rowSums(as.matrix(dtm)) > 0, ]
dim(dtm)
# apply the LDA algorithm under four estimation setups for comparison
k <- 10       # number of topics
SEED <- 8822  # random seed for reproducibility
CSC_TM <- list(
  VEM       = LDA(dtm, k = k, control = list(seed = SEED)),
  VEM_fixed = LDA(dtm, k = k, control = list(estimate.alpha = FALSE, seed = SEED)),
  Gibbs     = LDA(dtm, k = k, method = "Gibbs",
                  control = list(seed = SEED, burnin = 1000, thin = 100, iter = 1000)),
  CTM       = CTM(dtm, k = k,
                  control = list(seed = SEED, var = list(tol = 10^-4), em = list(tol = 10^-3))))
# compare the estimated alpha values of the two VEM fits
sapply(CSC_TM[1:2], slot, "alpha")
# mean entropy of each model's posterior topic distributions; higher values
# mean topic assignments are spread more evenly across topics
sapply(CSC_TM, function(x) mean(apply(posterior(x)$topics, 1, function(z) -sum(z * log(z)))))
# most likely topic per document and the top 8 terms per topic, VEM fit
Topic <- topics(CSC_TM[["VEM"]], 1)
Terms <- terms(CSC_TM[["VEM"]], 8)
Terms
Topic
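# For comparison (not in the original), the same summaries can be read off the
# Gibbs fit, which often gives cleaner topics on short texts
terms(CSC_TM[["Gibbs"]], 8)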
# fit a final LDA model with the chosen number of topics
lda <- LDA(dtm, 10)
lda
# get keywords for each topic, just for a quick look
get_terms(lda, 10)
# get the 5 most likely topic numbers for each document
lda_topics <- get_topics(lda, 5)
lda_topics
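# A sketch (not in the original) of attaching each tweet's single most likely
# topic back onto the raw data; assumes corpus order matches Dat and that the
# DocumentTermMatrix row names are still the original document indices
kept <- as.integer(rownames(dtm))  # hypothetical alignment step
Dat_topics <- data.frame(tweet_text = Dat$tweet_text[kept], topic = topics(lda, 1))
head(Dat_topics)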
# beta holds the logged parameters of the word distribution for each topic
beta <- lda@beta
beta
# create object containing posterior topic distribution for each document
gamma <- lda@gamma
gamma
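# Illustrative cross-check (not in the original): the most probable topic per
# document read straight off gamma should agree with topics(lda, 1)
apply(gamma, 1, which.max)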
# terms (words) that line up with the columns of beta
terms <- lda@terms
terms
# put the terms (words) in as the column names for the topic weights
colnames(beta) <- terms
# order each topic's beta values, highest weight first
id <- t(apply(beta, 1, order, decreasing = TRUE))
# table of words per topic, ranked by beta value: useful for determining the
# most important words per topic
beta_ranked <- lapply(1:nrow(id), function(i) beta[i, id[i, ]])
beta_ranked
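# Quick summary helper (not in the original): with the decreasing ranking
# above, the first ten names of each vector are a topic's top terms
top_terms <- sapply(beta_ranked, function(x) names(x)[1:10])
top_terms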