.Rhistory

#
# Which values are greater than 3.5?
# NOTE(review): my_vector is defined before the visible part of this history;
# presumably a numeric vector - confirm against the earlier commands.
# The comparison is vectorized and yields one TRUE/FALSE per element.
larger_than_3.5 <- my_vector > 3.5
larger_than_3.5
# Inspect the logical mask more closely
class(larger_than_3.5)
str(larger_than_3.5)
summary(larger_than_3.5)
# Grab only the values larger than 3.5 by indexing with the stored mask
my_vector2 <- my_vector[larger_than_3.5]
my_vector2
# Same selection, with the comparison written inline instead of stored first
my_vector3 <- my_vector[my_vector > 3.5]
my_vector3
# Grow the vector: c() concatenates the old vector with the new values
my_bigger_vector <- c(my_vector, 11:15, 16, 17, 18, 19, 20)
my_bigger_vector
# How big is it now?
length(my_bigger_vector)
# dim() reads the "dim" attribute; a vector built with c() has none
dim(my_bigger_vector)
#===========================================================================
# String Vectors
#
# Create a vector of strings
force_users <- c("Yoda", "Darth Vader", "Obi Wan", "Mace Windu",
                 "Darth Maul", "Luke Skywalker", "Darth Sidious")
# Inspect vector more closely
class(force_users)
str(force_users)
summary(force_users)
# Arithmetic is not defined for character vectors, so the two calls below
# raise "non-numeric argument to binary operator". They are wrapped in try()
# so the error is still shown but the rest of the script keeps running when
# this file is sourced rather than typed line by line.
try(force_users + 1)
# `+` cannot append to a vector either; because the right-hand side fails,
# the assignment never happens and force_users keeps its 7 names.
try(force_users <- force_users + "Kylo Ren")
# The working way to add elements is c()
more_force_users <- c(force_users, "Qui-Gon Jinn", "Darth Tyranus")
more_force_users
# How big is the vector?
length(more_force_users)
# How long is each string in the vector? nchar() is vectorized.
name_lengths <- nchar(more_force_users)
name_lengths
#===========================================================================
# Missing Values
#
# A character vector whose first entry is unknown (NA)
birthplaces <- c(
  NA,
  "Tatooine", "Stewjon", "Haruun Kal", "Dathomir",
  "Polis Massa", "Naboo", "Coruscant", "Serenno"
)
birthplaces
# Type, structure and summary of the vector
class(birthplaces)
str(birthplaces)
summary(birthplaces)
# Element-wise checks: which entries are missing, and how long is each string?
is.na(birthplaces)
nchar(birthplaces)
# For comparison, the length of an empty (but not missing) string
nchar("")
# Keep only the entries that are not missing
birthplaces[!is.na(birthplaces)]
#===========================================================================
# Factor Vectors
#
# Build a categorical (factor) vector from three labels
affiliation_labels <- c("Jedi", "Sith", "Rogue")
affiliation <- factor(affiliation_labels)
rm(affiliation_labels)
affiliation
# Type, structure, per-level counts and the level set
class(affiliation)
str(affiliation)
summary(affiliation)
levels(affiliation)
# A factor can be viewed as its integer codes or as its labels
as.numeric(affiliation)
as.character(affiliation)
#===========================================================================
# Data Frames
#
# Assemble a data frame from the vectors built earlier in the session.
# seq_along() is used for the id column instead of 1:length(x) because the
# latter counts backwards (1, 0) when the vector is empty.
# The unnamed second argument becomes a column called more_force_users.
star_wars <- data.frame(id = seq_along(more_force_users),
                        more_force_users,
                        birthplaces = as.factor(birthplaces),
                        affiliation = c("Jedi", "Sith",
                                        "Jedi", "Jedi",
                                        "Sith", "Jedi",
                                        "Sith", "Jedi",
                                        "Sith"),
                        stringsAsFactors = FALSE)
# Inspect
View(star_wars)
head(star_wars)
summary(star_wars)
str(star_wars)
# Set up factors: affiliation was kept as character above, convert it now
star_wars$affiliation <- as.factor(star_wars$affiliation)
# Reinspect
str(star_wars)
# Additional slicing syntax: by position, then by a logical condition
star_wars$more_force_users[3]
star_wars$more_force_users[star_wars$affiliation == "Sith"]
# Load up some built-in data sets
data(iris)
data(mtcars)
# Get help on built-in data
?mtcars
# Understand the shape of a data frame: rows, columns, then both at once
nrow(mtcars)
ncol(mtcars)
dim(mtcars)
# Understand the metadata of a data frame: column and row labels, which can
# themselves be indexed like any other vector
names(mtcars)
names(mtcars)[3]
colnames(mtcars)
colnames(mtcars)[3:5]
rownames(mtcars)
rownames(mtcars)[c(3, 4, 5)]
# Cool RStudio feature - spreadsheet view of a data frame
View(mtcars)
# See a few rows at the top and bottom of a data frame
head(mtcars)
tail(mtcars)
# All-up view of a data frame
summary(mtcars)
# Understand the data type of a data frame
class(mtcars)
str(mtcars)
# NOTE(review): hard-coded, user-specific working directory; the relative
# "spam.csv" path below depends on it. Consider running from the project
# directory instead of calling setwd() in the script.
setwd("~/Dropbox/DataScienceDojo/IntroToTextAnalyticsWithR")
# Load up the .CSV data and explore in RStudio.
spam.raw <- read.csv("spam.csv", stringsAsFactors = FALSE, fileEncoding = "UTF-16")
View(spam.raw)
# Clean up the data frame (keep the first two columns and give them
# meaningful names) and view our handiwork.
spam.raw <- spam.raw[, 1:2]
names(spam.raw) <- c("Label", "Text")
View(spam.raw)
# Check data to see if there are missing values (counts incomplete rows).
length(which(!complete.cases(spam.raw)))
# Convert our class label into a factor.
spam.raw$Label <- as.factor(spam.raw$Label)
# The first step, as always, is to explore the data.
# First, let's take a look at the distribution of the class labels (i.e., ham vs. spam).
prop.table(table(spam.raw$Label))
# Next up, let's get a feel for the distribution of text lengths of the SMS
# messages by adding a new feature for the length of each message.
spam.raw$TextLength <- nchar(spam.raw$Text)
summary(spam.raw$TextLength)
# Visualize distribution with ggplot2, adding segmentation for ham/spam.
library(ggplot2)
ggplot(spam.raw, aes(x = TextLength, fill = Label)) +
theme_bw() +
geom_histogram(binwidth = 5) +
labs(y = "Text Count", x = "Length of Text",
title = "Distribution of Text Lengths with Class Labels")
# Load up the .CSV data and explore in RStudio.
# The path is relative, so this relies on the current working directory.
spam.raw <- read.csv("spam.csv", stringsAsFactors = FALSE, fileEncoding = "UTF-16")
#View(spam.raw)
# Clean up the data frame (keep the first two columns and give them
# meaningful names) and view our handiwork.
spam.raw <- spam.raw[, 1:2]
names(spam.raw) <- c("Label", "Text")
#View(spam.raw)
# Check data to see if there are missing values (counts incomplete rows).
length(which(!complete.cases(spam.raw)))
# Convert our class label into a factor.
spam.raw$Label <- as.factor(spam.raw$Label)
# The first step, as always, is to explore the data.
# First, let's take a look at the distribution of the class labels (i.e., ham vs. spam).
prop.table(table(spam.raw$Label))
# Next up, let's get a feel for the distribution of text lengths of the SMS
# messages by adding a new feature for the length of each message.
spam.raw$TextLength <- nchar(spam.raw$Text)
summary(spam.raw$TextLength)
# Visualize distribution with ggplot2, adding segmentation for ham/spam.
library(ggplot2)
ggplot(spam.raw, aes(x = TextLength, fill = Label)) +
theme_bw() +
geom_histogram(binwidth = 5) +
labs(y = "Text Count", x = "Length of Text",
title = "Distribution of Text Lengths with Class Labels")
# At a minimum we need to split our data into a training set and a
# test set. In a true project we would want to use a three-way split
# of training, validation, and test.
#
# As we know that our data has non-trivial class imbalance, we'll
# use the mighty caret package to create a random train/test split
# that ensures the correct ham/spam class label proportions (i.e.,
# we'll use caret for a random stratified split).
library(caret)
help(package = "caret")
# Use caret to create a 70%/30% stratified split. Set the random
# seed for reproducibility.
set.seed(32984)
indexes <- createDataPartition(spam.raw$Label, times = 1,
p = 0.7, list = FALSE)
# Rows selected by indexes form the training set; the rest form the test set.
train <- spam.raw[indexes,]
test <- spam.raw[-indexes,]
# Verify proportions.
prop.table(table(train$Label))
prop.table(table(test$Label))
# Text analytics requires a lot of data exploration, data pre-processing
# and data wrangling. Let's explore some examples.
# HTML-escaped ampersand character.
train$Text[21]
# HTML-escaped '<' and '>' characters. Also note that Mallika Sherawat
# is an actual person, but we will ignore the implications of this for
# this introductory tutorial.
train$Text[38]
# A URL.
train$Text[357]
# There are many packages in the R ecosystem for performing text
# analytics. One of the newer packages is quanteda. The quanteda
# package has many useful functions for quickly and easily working
# with text data.
library(quanteda)
help(package = "quanteda")
# Tokenize SMS text messages into words, dropping numbers, punctuation,
# symbols and hyphens along the way.
train.tokens <- tokens(train$Text, what = "word",
remove_numbers = TRUE, remove_punct = TRUE,
remove_symbols = TRUE, remove_hyphens = TRUE)
# Take a look at a specific SMS message and see how it transforms.
# Message 357 is re-printed after every step below for comparison.
train.tokens[[357]]
# Lower case the tokens.
train.tokens <- tokens_tolower(train.tokens)
train.tokens[[357]]
# Use quanteda's built-in stopword list for English.
# NOTE - You should always inspect stopword lists for applicability to
#        your problem/domain.
train.tokens <- tokens_select(train.tokens, stopwords(),
selection = "remove")
train.tokens[[357]]
# Perform stemming on the tokens.
train.tokens <- tokens_wordstem(train.tokens, language = "english")
train.tokens[[357]]
#
# # Create our first bag-of-words model.
# train.tokens.dfm <- dfm(train.tokens, tolower = FALSE)
#
#
# # Transform to a matrix and inspect.
# train.tokens.matrix <- as.matrix(train.tokens.dfm)
# View(train.tokens.matrix[1:20, 1:100])
# dim(train.tokens.matrix)
#
#
# # Investigate the effects of stemming.
# colnames(train.tokens.matrix)[1:50]
#
# Relative term frequency (TF): every count in a document's row divided by
# that row's total. A row that sums to zero yields NaN values.
term.frequency <- function(row) {
  row.total <- sum(row)
  row / row.total
}
# Inverse document frequency (IDF) for one term column:
# log10(number of documents / number of documents containing the term).
# Missing entries are not counted as containing the term.
inverse.doc.freq <- function(col) {
  n.docs <- length(col)
  n.docs.with.term <- sum(col > 0, na.rm = TRUE)
  log10(n.docs / n.docs.with.term)
}
# TF-IDF: scale term frequencies element-wise by their IDF weights.
tf.idf <- function(x, idf) {
  weighted <- x * idf
  weighted
}
#
# # First step, normalize all documents via TF.
# train.tokens.df <- apply(train.tokens.matrix, 1, term.frequency)
# dim(train.tokens.df)
# View(train.tokens.df[1:20, 1:100])
#
#
# # Second step, calculate the IDF vector that we will use - both
# # for training data and for test data!
# train.tokens.idf <- apply(train.tokens.matrix, 2, inverse.doc.freq)
# str(train.tokens.idf)
#
#
# # Lastly, calculate TF-IDF for our training corpus.
# train.tokens.tfidf <-  apply(train.tokens.df, 2, tf.idf, idf = train.tokens.idf)
# dim(train.tokens.tfidf)
# View(train.tokens.tfidf[1:25, 1:25])
#
#
# # Transpose the matrix
# train.tokens.tfidf <- t(train.tokens.tfidf)
# dim(train.tokens.tfidf)
# View(train.tokens.tfidf[1:25, 1:25])
#
#
# # Check for incomplete cases.
# incomplete.cases <- which(!complete.cases(train.tokens.tfidf))
# train$Text[incomplete.cases]
#
#
# # Fix incomplete cases
# train.tokens.tfidf[incomplete.cases,] <- rep(0.0, ncol(train.tokens.tfidf))
# dim(train.tokens.tfidf)
# sum(which(!complete.cases(train.tokens.tfidf)))
#
#
# # Make a clean data frame using the same process as before.
# train.tokens.tfidf.df <- cbind(Label = train$Label, data.frame(train.tokens.tfidf))
# names(train.tokens.tfidf.df) <- make.names(names(train.tokens.tfidf.df))
# N-grams allow us to augment our document-term frequency matrices with
# word ordering. This often leads to increased performance (e.g., accuracy)
# for machine learning models trained with more than just unigrams (i.e.,
# single terms). Let's add bigrams to our training data and then TF-IDF
# transform the expanded feature matrix to see if accuracy improves.
# Add bigrams to our feature matrix (n = 1:2 keeps the unigrams too).
train.tokens <- tokens_ngrams(train.tokens, n = 1:2)
train.tokens[[357]]
# Transform to dfm and then a matrix.
train.tokens.dfm <- dfm(train.tokens, tolower = FALSE)
train.tokens.matrix <- as.matrix(train.tokens.dfm)
train.tokens.dfm
# Normalize all documents via TF.
# apply() over rows returns each row's result as a column, so the result
# is transposed relative to train.tokens.matrix.
train.tokens.df <- apply(train.tokens.matrix, 1, term.frequency)
# Calculate the IDF vector that we will use for training and test data!
train.tokens.idf <- apply(train.tokens.matrix, 2, inverse.doc.freq)
# Calculate TF-IDF for our training corpus
# (each column of train.tokens.df is multiplied element-wise by the IDF vector).
train.tokens.tfidf <-  apply(train.tokens.df, 2, tf.idf,
idf = train.tokens.idf)
# We'll leverage the irlba package for our singular value
# decomposition (SVD). The irlba package allows us to specify
# the number of the most important singular vectors we wish to
# calculate and retain for features.
library(irlba)
# Time the code execution
start.time <- Sys.time()
# Perform SVD. Specifically, reduce dimensionality down to 300 columns
# for our latent semantic analysis (LSA).
# NOTE(review): train.tokens.tfidf is passed untransposed here, unlike the
# later runs which transpose it first and then pass t(); confirm the
# orientation is the intended one.
train.irlba <- irlba(train.tokens.tfidf, nv = 300, maxit = 600)
# Elapsed wall-clock time for the SVD
total.time <- Sys.time() - start.time
total.time
# Inspect the right singular vectors in the RStudio viewer
View(train.irlba$v)
# Second replay of the pipeline: reload the raw SMS data from the working
# directory and keep only the label and text columns.
spam.raw <- read.csv("spam.csv", stringsAsFactors = FALSE, fileEncoding = "UTF-16")
View(spam.raw)
spam.raw <- spam.raw[, 1:2]
names(spam.raw) <- c("Label", "Text")
View(spam.raw)
# Count rows with missing values, then make the label a factor.
length(which(!complete.cases(spam.raw)))
spam.raw$Label <- as.factor(spam.raw$Label)
prop.table(table(spam.raw$Label))
# Message length in characters as an extra feature.
spam.raw$TextLength <- nchar(spam.raw$Text)
summary(spam.raw$TextLength)
# 70%/30% stratified train/test split with a fixed seed for reproducibility.
library(caret)
set.seed(32984)
indexes <- createDataPartition(spam.raw$Label, times = 1,
p = 0.7, list = FALSE)
train <- spam.raw[indexes,]
test <- spam.raw[-indexes,]
prop.table(table(train$Label))
prop.table(table(test$Label))
# Spot-check a few raw training messages.
train$Text[21]
train$Text[38]
train$Text[357]
# Tokenize, lower-case, drop stopwords and stem; message 357 is printed
# after the first three steps to follow the transformation.
library(quanteda)
train.tokens <- tokens(train$Text, what = "word",
remove_numbers = TRUE, remove_punct = TRUE,
remove_symbols = TRUE, remove_hyphens = TRUE)
train.tokens[[357]]
train.tokens <- tokens_tolower(train.tokens)
train.tokens[[357]]
train.tokens <- tokens_select(train.tokens, stopwords(),
selection = "remove")
train.tokens[[357]]
train.tokens <- tokens_wordstem(train.tokens, language = "english")
# Relative term frequency: counts in a row divided by the row total.
term.frequency <- function(row) {
  total <- sum(row)
  row / total
}
# Inverse document frequency for one term column:
# log10(documents in corpus / documents containing the term).
inverse.doc.freq <- function(col) {
  docs.with.term <- sum(col > 0, na.rm = TRUE)
  log10(length(col) / docs.with.term)
}
# TF-IDF: term frequencies scaled element-wise by the IDF weights.
tf.idf <- function(x, idf) {
  scaled <- x * idf
  scaled
}
# Add bigrams to the unigram tokens and inspect one message.
train.tokens <- tokens_ngrams(train.tokens, n = 1:2)
train.tokens[[357]]
# Build the document-feature matrix and a dense matrix copy of it.
train.tokens.dfm <- dfm(train.tokens, tolower = FALSE)
train.tokens.matrix <- as.matrix(train.tokens.dfm)
train.tokens.dfm
# Term frequency must be computed first: the TF-IDF step reads
# train.tokens.df, so running it earlier would use a stale or missing object.
# Each quantity is computed exactly once.
train.tokens.df <- apply(train.tokens.matrix, 1, term.frequency)
train.tokens.idf <- apply(train.tokens.matrix, 2, inverse.doc.freq)
train.tokens.tfidf <- apply(train.tokens.df, 2, tf.idf,
                            idf = train.tokens.idf)
# apply() over rows returned the per-document results as columns, so
# transpose once to get back to the orientation of train.tokens.matrix.
train.tokens.tfidf <- t(train.tokens.tfidf)
dim(train.tokens.tfidf)
library(irlba)
# Time the code execution
start.time <- Sys.time()
# Perform SVD. Specifically, reduce dimensionality down to 300 columns
# for our latent semantic analysis (LSA).
# The TF-IDF matrix was transposed after it was built, so t() flips it back
# before the decomposition.
train.irlba <- irlba(t(train.tokens.tfidf), nv = 300, maxit = 600)
# Elapsed wall-clock time for the SVD
total.time <- Sys.time() - start.time
total.time
# Third replay of the pipeline: reload the raw SMS data from the working
# directory and keep only the label and text columns.
spam.raw <- read.csv("spam.csv", stringsAsFactors = FALSE,
                     fileEncoding = "UTF-16")
spam.raw <- spam.raw[, 1:2]
names(spam.raw) <- c("Label", "Text")
spam.raw$Label <- as.factor(spam.raw$Label)
prop.table(table(spam.raw$Label))
# Message length in characters as an extra feature.
spam.raw$TextLength <- nchar(spam.raw$Text)
summary(spam.raw$TextLength)
# 70%/30% stratified train/test split with a fixed seed for reproducibility.
library(caret)
set.seed(32984)
indexes <- createDataPartition(spam.raw$Label, times = 1,
                               p = 0.7, list = FALSE)
train <- spam.raw[indexes,]
test <- spam.raw[-indexes,]
# Tokenize, lower-case, drop stopwords and stem the training messages.
library(quanteda)
train.tokens <- tokens(train$Text, what = "word",
                       remove_numbers = TRUE, remove_punct = TRUE,
                       remove_symbols = TRUE, remove_hyphens = TRUE)
# Lower-casing was missing from this replay. Because the dfm below is built
# with tolower = FALSE, skipping it leaves case-sensitive training features
# that do not line up with the test-set pipeline, which lower-cases its tokens.
train.tokens <- tokens_tolower(train.tokens)
train.tokens <- tokens_select(train.tokens, stopwords(),
                              selection = "remove")
train.tokens <- tokens_wordstem(train.tokens, language = "english")
# Unigram document-feature matrix and a dense matrix copy of it.
train.tokens.dfm <- dfm(train.tokens, tolower = FALSE)
train.tokens.matrix <- as.matrix(train.tokens.dfm)
# Build stratified folds for 10-fold cross validation repeated 3 times,
# seeded for reproducibility.
set.seed(48743)
cv.folds <- createMultiFolds(train$Label, k = 10, times = 3)
# Pass the pre-built folds via index so every model trained with this
# control object is evaluated on the same resamples.
cv.cntrl <- trainControl(method = "repeatedcv", number = 10,
repeats = 3, index = cv.folds)
# TF: share of each term within one document's row of counts.
term.frequency <- function(row) {
  denominator <- sum(row)
  row / denominator
}
# IDF: log10 of corpus size over the number of documents whose count for
# this term is positive.
inverse.doc.freq <- function(col) {
  has.term <- which(col > 0)
  corpus.size <- length(col)
  log10(corpus.size / length(has.term))
}
# TF-IDF: element-wise product of term frequencies and IDF weights.
tf.idf <- function(x, idf) {
  product <- x * idf
  product
}
# Add bigrams (keeping unigrams) and rebuild the dfm and dense matrix.
train.tokens <- tokens_ngrams(train.tokens, n = 1:2)
train.tokens.dfm <- dfm(train.tokens, tolower = FALSE)
train.tokens.matrix <- as.matrix(train.tokens.dfm)
# TF per document (apply over rows returns the results as columns), IDF per
# term, then TF-IDF by scaling each document's column with the IDF vector.
train.tokens.df <- apply(train.tokens.matrix, 1, term.frequency)
train.tokens.idf <- apply(train.tokens.matrix, 2, inverse.doc.freq)
train.tokens.tfidf <-  apply(train.tokens.df, 2, tf.idf,
idf = train.tokens.idf)
# Transpose back to the orientation of train.tokens.matrix.
train.tokens.tfidf <- t(train.tokens.tfidf)
# Rows containing missing values (e.g. NaN from term.frequency() dividing by
# a zero row total) are overwritten with zeros.
incomplete.cases <- which(!complete.cases(train.tokens.tfidf))
train.tokens.tfidf[incomplete.cases,] <- rep(0.0, ncol(train.tokens.tfidf))
# Data frame with the label first; make.names() turns the term names into
# syntactically valid column names.
train.tokens.tfidf.df <- cbind(Label = train$Label, data.frame(train.tokens.tfidf))
names(train.tokens.tfidf.df) <- make.names(names(train.tokens.tfidf.df))
# Trigger garbage collection before the SVD step.
gc()
library(irlba)
# Time the code execution
start.time <- Sys.time()
# Perform SVD. Specifically, reduce dimensionality down to 300 columns
# for our latent semantic analysis (LSA).
train.irlba <- irlba(t(train.tokens.tfidf), nv = 300, maxit = 600)
# Elapsed wall-clock time for the SVD
total.time <- Sys.time() - start.time
total.time
# Pieces needed to project a TF-IDF document into the SVD space:
# the reciprocal singular values and the transposed left singular vectors.
sigma.inverse <- 1 / train.irlba$d
u.transpose <- t(train.irlba$u)
# Project the first training document by hand. %*% binds tighter than *,
# so the matrix product is formed first and then scaled element-wise.
document <- train.tokens.tfidf[1,]
document.hat <- sigma.inverse * u.transpose %*% document
# Compare the manual projection with the first row of v from the SVD.
document.hat[1:10]
train.irlba$v[1, 1:10]
# Training data frame for modelling: label plus the columns of v.
train.svd <- data.frame(Label = train$Label, train.irlba$v)
# Load doSNOW before creating the cluster. The history called makeCluster()
# once before library(doSNOW) and again after it, which either fails or
# leaves a first set of 10 workers running that is never stopped.
library(doSNOW)
# Create and register a 10-worker socket cluster for parallel training.
cl <- makeCluster(10, type = "SOCK")
registerDoSNOW(cl)
# Time the code execution
start.time <- Sys.time()
# This will be the last run using single decision trees. With a much smaller
# feature matrix we can now use more powerful methods like the mighty Random
# Forest from now on!
rpart.cv.4 <- train(Label ~ ., data = train.svd, method = "rpart",
                    trControl = cv.cntrl, tuneLength = 7)
# Processing is done, stop cluster.
stopCluster(cl)
# Elapsed wall-clock time for training
total.time <- Sys.time() - start.time
total.time
# Print the cross-validation results
rpart.cv.4
# Add the raw message length as an extra feature next to the SVD columns.
train.svd$TextLength <- train$TextLength
# Restore a previously saved model from disk; the file is expected to
# provide the object rf.cv.2 used below.
load("rf.cv.2.RData")
rf.cv.2
# caret's confusionMatrix() takes the predictions first and the reference
# (true) labels second. The history had them swapped, which exchanges
# sensitivity/specificity with the predictive values and disagrees with the
# test-set call made later in this session.
confusionMatrix(rf.cv.2$finalModel$predicted, train.svd$Label)
library(randomForest)
# NOTE(review): rf.cv.1 is neither trained nor loaded in the visible history;
# this call only works if it already exists in the workspace.
varImpPlot(rf.cv.1$finalModel)
varImpPlot(rf.cv.2$finalModel)
# Run the test messages through the same steps as the training data:
# tokenize, lower-case, drop stopwords, stem, add bigrams.
test.tokens <- tokens(test$Text, what = "word",
remove_numbers = TRUE, remove_punct = TRUE,
remove_symbols = TRUE, remove_hyphens = TRUE)
test.tokens <- tokens_tolower(test.tokens)
test.tokens <- tokens_select(test.tokens, stopwords(),
selection = "remove")
test.tokens <- tokens_wordstem(test.tokens, language = "english")
test.tokens <- tokens_ngrams(test.tokens, n = 1:2)
test.tokens.dfm <- dfm(test.tokens, tolower = FALSE)
# Compare the training and test dfms before aligning them.
train.tokens.dfm
test.tokens.dfm
# Select test features using the training dfm as the reference.
# NOTE(review): the `features` argument belongs to an older quanteda API;
# confirm against the installed version.
test.tokens.dfm <- dfm_select(test.tokens.dfm, features = train.tokens.dfm)
test.tokens.matrix <- as.matrix(test.tokens.dfm)
test.tokens.dfm
# TF per test document (apply over rows returns the results as columns).
test.tokens.df <- apply(test.tokens.matrix, 1, term.frequency)
str(test.tokens.df)
# TF-IDF using the IDF vector computed on the TRAINING data.
test.tokens.tfidf <-  apply(test.tokens.df, 2, tf.idf, idf = train.tokens.idf)
dim(test.tokens.tfidf)
View(test.tokens.tfidf[1:25, 1:25])
# Transpose back to the orientation of test.tokens.matrix.
test.tokens.tfidf <- t(test.tokens.tfidf)
# Replace missing values with zeros; the summaries before and after show
# the effect on the first document.
summary(test.tokens.tfidf[1,])
test.tokens.tfidf[is.na(test.tokens.tfidf)] <- 0.0
summary(test.tokens.tfidf[1,])
# Project the test documents into the training SVD space. %*% binds tighter
# than *, so the matrix product is formed first and then scaled element-wise
# by sigma.inverse; the outer t() transposes the result.
test.svd.raw <- t(sigma.inverse * u.transpose %*% t(test.tokens.tfidf))
# Test data frame with the label, the projected columns and the text length.
test.svd <- data.frame(Label = test$Label, test.svd.raw,
TextLength = test$TextLength)
# Score the projected test set with the loaded random forest. The history
# first called predict() on rf.cv.3, an object never created in this session,
# and immediately overwrote the result, so that call has been dropped.
preds <- predict(rf.cv.2, test.svd)
# Predictions first, reference labels second.
confusionMatrix(preds, test.svd$Label)