forked from cfia-data-science/PHAEDE
-
Notifications
You must be signed in to change notification settings - Fork 0
/
fasttext.R
54 lines (46 loc) · 2.25 KB
/
fasttext.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# fastText workflow ----
# Trains cbow and skip-gram word vectors plus a supervised fastText classifier
# on the training partition, extracts word and document vectors, and evaluates
# the classifier on the testing partition with a confusion matrix.
# NOTE(review): `data` and `training_index` are not defined in this file, and
# no packages are attached here; presumably an earlier script creates them and
# loads the packages providing build_vectors()/build_supervised()/
# confusionMatrix() (looks like fastrtext and caret) -- confirm.

# data partition ----
# rows listed in training_index form the training set; all others the test set
data_training <- data[training_index, ]
data_testing <- data[-training_index, ]

# cbow word vector model ----
set.seed(1)
model_file_cbow <- build_vectors(
  documents = tolower(data_training$Title_Description),
  model_path = "model_cbow",
  modeltype = "cbow",
  dim = 20,
  wordNgrams = 2
)
model_cbow <- load_model(model_file_cbow)

# skip-gram word vector model ----
set.seed(1)
model_file_skipgram <- build_vectors(
  documents = tolower(data_training$Title_Description),
  model_path = "model_skipgram",
  modeltype = "skipgram",
  dim = 20,
  wordNgrams = 2
)
model_skipgram <- load_model(model_file_skipgram)

# supervised fasttext model ----
# Sys.time() is sampled immediately before and after training so that
# time_fasttext covers the build_supervised() call only (not model loading).
set.seed(1)
start_time <- Sys.time()
model_file_fasttext <- build_supervised(
  documents = tolower(data_training$Title_Description),
  targets = data_training$Risk,
  model_path = "model_fasttext",
  dim = 20,
  lr = 1,
  epoch = 20,
  wordNgrams = 2,
  # presumably the vectors file written by the skip-gram build above
  # (model_path = "model_skipgram") -- confirm the file name/extension
  pretrainedVectors = "model_skipgram.vec"
)
end_time <- Sys.time()
model_fasttext <- load_model(model_file_fasttext)

# fasttext running time (difftime between the two timestamps)
time_fasttext <- end_time - start_time

# extract word vectors ----
# NOTE(review): `word_dictionay` is misspelled; the name is kept unchanged in
# case other scripts reference it.
word_dictionay <- get_dictionary(model_skipgram)
word_vectors <- get_word_vectors(model_skipgram, word_dictionay)

# extract document vectors and generate document features ----
# The text is lower-cased here so that it receives the same preprocessing as
# the text used for training and prediction; every model in this script is
# built from tolower()-ed documents.
title_description_vectors <- get_sentence_representation(
  model_fasttext,
  tolower(data$Title_Description)
)
title_description_features <- as.data.frame(title_description_vectors)
title_description_features$Risk <- data$Risk

# predictions ----
# predictions are returned as a list; the names of each element are the
# predicted labels and the values are their probabilities
predictions_fasttext <- predict(
  model_fasttext,
  sentences = tolower(data_testing$Title_Description)
)

# confusion matrix ----
# vapply() enforces exactly one predicted label per test document and always
# yields a character vector (sapply() could silently return a list/matrix).
predicted_labels <- vapply(predictions_fasttext, names, character(1))
cm_fasttext <- confusionMatrix(factor(predicted_labels), data_testing$Risk)
cm_fasttext