-
Notifications
You must be signed in to change notification settings - Fork 21
/
Copy pathInteractiveLdaCluster.R
63 lines (48 loc) · 2.26 KB
/
InteractiveLdaCluster.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#install.packages("text2vec", dependencies=TRUE)
#install.packages("tm", dependencies=TRUE)
#install.packages("magrittr", dependencies=TRUE)
#install.packages("LDAvis")
library(text2vec)
library(tm)
library(magrittr)
library(LDAvis)
# Load the cleaned article data and build one text string per article
# (title followed by cleaned abstract).
# The whole script may take a while to run - wait patiently.
my_file <- "my_Scopus_TSE_articles_clean_data.RData"
#draw_my_IAMap = function(my_file) {
print(paste("Interactive LDA Cluster, my_file: ", my_file))

# my_data_dir is not defined in this script; it must already exist in the
# calling environment (presumably set by a project setup script - verify).
if (!exists("my_data_dir")) {
  stop("my_data_dir is not defined; set it to the directory containing ",
       my_file, call. = FALSE)
}

# file.path() joins directory and file name with "/", exactly as the manual
# paste(..., "/", ...) construction would, but in a single readable step.
my_temp_file <- file.path(my_data_dir, my_file)
if (!file.exists(my_temp_file)) {
  stop("Data file not found: ", my_temp_file, call. = FALSE)
}
load(my_temp_file)

# The .RData file is expected to provide my_articles with the columns used
# below. Fail early with a clear message instead of letting paste() silently
# build wrong text from a missing (NULL) column.
if (!exists("my_articles") ||
    !all(c("Title", "Abstract_clean") %in% names(my_articles))) {
  stop("Loaded data must contain 'my_articles' with columns 'Title' and ",
       "'Abstract_clean'", call. = FALSE)
}
my_text <- paste(my_articles$Title, my_articles$Abstract_clean)
# text2vec is used here because it ships a very fast LDA implementation.
# Note that text2vec, tm and topicmodels overlap in functionality.

# Lower-case the text, split it into word tokens and wrap the tokens in an
# iterator that the text2vec vocabulary/DTM builders consume.
my_tokens <- word_tokenizer(tolower(my_text))
it <- itoken(my_tokens)

# English snowball stopwords (from the 'stopwords' package, which must be
# installed) plus project-specific placeholders to be replaced as needed.
my_stopwords <- stopwords::stopwords(language = "en", source = "snowball")
my_stopwords <- c(my_stopwords, "myStopword1", "myStopword2")

# Build the vocabulary without stopwords, then prune it:
#   - keep only terms that occur at least 10 times,
#   - cap the vocabulary at 40000 terms,
#   - drop terms that appear in more than 50% of the documents.
# Old call, kept for reference:
#vocab <- prune_vocabulary(vocab, max_number_of_terms=40000, doc_proportion_max=0.5)
my_vocab <- create_vocabulary(it, stopwords = my_stopwords) %>%
  prune_vocabulary(
    term_count_min = 10,
    vocab_term_max = 40000,
    doc_proportion_max = 0.5
  )

# Create the document-term matrix as a sparse dgCMatrix.
# Old call, kept for reference:
#dtm = create_dtm(it, vocab_vectorizer(vocab),'lda_c')
vectorizer <- vocab_vectorizer(my_vocab)
dtm <- create_dtm(it, vectorizer, type = "dgCMatrix")
# Fit an LDA model with 20 topics to provide an overview of the corpus.
# Old constructor call, kept for reference:
#lda_model = LatentDirichletAllocation$new(n_topics=20, vocabulary=my_vocab)
lda_model <- LatentDirichletAllocation$new(n_topics = 20)

# Run at most 200 iterations, checking convergence every 5 iterations.
# The argument is spelled n_check_convergence in the text2vec API that matches
# the constructor call above.
# NOTE(review): the former spelling check_convergence_every_n appears to be
# ignored by current text2vec releases - confirm against the installed version.
doc_topic_distr <- lda_model$fit_transform(
  dtm,
  n_iter = 200,
  n_check_convergence = 5
)

# Run the LDAvis visualisation (make sure the LDAvis package is installed).
# Be patient, this might take a while before anything is plotted; see the
# viewer tab on the right.
# Plot only when the fitted topic-word distribution is not all zeros.
if (!all(lda_model$topic_word_distribution == 0)) {
  lda_model$plot()
} else {
  print("No Data!")
}
print("Finished Interactive LDA Cluster")
#}