slides_ensemble.Rmd

---
title: "Illustrating Ensemble Models"
output: 
  html_document:
      toc: yes
      toc_float: yes
      code_folding: hide
---

This case comes from the UCI Machine Learning Repository, and is called the [Bank Marketing Data Set](https://archive.ics.uci.edu/ml/datasets/bank+marketing).

The data relate to direct marketing campaigns of a Portuguese banking institution. The marketing campaigns were based on phone calls. Often, more than one contact with the same client was required in order to assess whether the product (a bank term deposit) would be bought ('yes') or not ('no').

```{r setup, include=FALSE}
# Global chunk defaults: show the source of every chunk, and keep knitting when
# a chunk raises an error so the error message is rendered in the document.
knitr::opts_chunk$set(
  echo = TRUE,
  error = TRUE
)
```

```{r, warning = FALSE, message = FALSE}
library(tidyverse)
library(caret)
library(caretEnsemble)
library(rpart)
library(naivebayes)
```

# Read in the data

```{r readdata, cache=TRUE}
# Load the bank-marketing table, rename the outcome column `y` to `bought`,
# and turn every character column into a factor so the modelling functions
# below treat them as categorical.
# NOTE(review): the copy of this file distributed by UCI is semicolon-delimited;
# confirm this local copy is comma-separated, otherwise read_csv() will not
# split the columns and read_delim(delim = ";") is needed instead.
df <- read_csv("data/bank-additional/bank-additional-full.csv") %>%
  dplyr::rename(bought = y) %>%
  # across() + where() is the current replacement for the superseded mutate_if()
  mutate(across(where(is.character), as.factor))

# Quick look at the first rows and the per-column summaries.
head(df)
summary(df)
```

# Split the data and Set Some Constants

```{r split, cache=TRUE}
# Stratified 80/20 split on the outcome; the seed makes the partition repeatable.
set.seed(123)
train.index <- createDataPartition(df$bought, p = 0.8, list = FALSE)

# fastAdaboost will complain later unless both sets are plain data frames
# with a factor outcome.
train <- as.data.frame(df[train.index, ])
test <- as.data.frame(df[-train.index, ])
train$bought <- as.factor(train$bought)
test$bought <- as.factor(test$bought)

# Constants shared by the modelling chunks below.
formula <- bought ~ .      # predict `bought` from every other column
positive <- "yes"          # class reported as "positive" in confusion matrices
actual <- test$bought      # ground truth for the held-out rows
```

# Committee (manual)

Build a decision tree (DT).

```{r rpart, warning = FALSE, cache=TRUE}
set.seed(123)

# 10-fold cross-validation repeated 5 times, keeping class probabilities.
ctrl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 5,
  classProbs = TRUE,
  allowParallel = TRUE
)

# Tune an rpart decision tree on Kappa, then label the held-out rows.
rpart_fit <- train(
  formula,
  data = train,
  method = "rpart",
  trControl = ctrl,
  metric = "Kappa"
)
rpart_pred <- predict(rpart_fit, newdata = test)
```

Build a naive Bayes (NB) classifier.

```{r nb, warning = FALSE, cache=TRUE}
set.seed(123)

# 10-fold cross-validation repeated 5 times, keeping class probabilities.
ctrl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 5,
  classProbs = TRUE,
  allowParallel = TRUE
)

# Tune a naive Bayes classifier on Kappa, then label the held-out rows.
nb_fit <- train(
  formula,
  data = train,
  method = "naive_bayes",
  trControl = ctrl,
  metric = "Kappa"
)
nb_pred <- predict(nb_fit, newdata = test)
```

Build a partial least squares (PLS) model.

```{r pls, warning = FALSE, cache=TRUE}
set.seed(123)

# 10-fold cross-validation repeated 5 times, keeping class probabilities.
ctrl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 5,
  classProbs = TRUE,
  allowParallel = TRUE
)

# Tune a PLS model, then label the held-out rows.
# NOTE(review): unlike the rpart and naive_bayes fits, no `metric` is passed
# here, so caret uses its default selection metric -- confirm that is intended.
pls_fit <- train(
  formula,
  data = train,
  method = "pls",
  trControl = ctrl
)
pls_pred <- predict(pls_fit, newdata = test)
```

Combine them together. 
```{r, cache=TRUE}
head(rpart_pred)

# One row per test observation, one *named* character column per committee
# member. (cbind() on unnamed expressions would produce columns V1..V3, which
# makes `committee_pred$rpart_pred` NULL and breaks the table() check below.)
committee_pred <- data.frame(
  rpart_pred = as.character(rpart_pred),
  nb_pred = as.character(nb_pred),
  pls_pred = as.character(pls_pred),
  stringsAsFactors = FALSE
)
head(committee_pred, n = 50)
str(committee_pred)

# Sanity check: the stored column must agree with the original predictions.
table(rpart_pred, committee_pred$rpart_pred)

# Count the votes per row. Comparing the three member columns as a whole and
# using rowSums() avoids apply() coercing the data frame row by row;
# as.integer() drops the row names so the new columns are plain integer vectors.
member_cols <- c("rpart_pred", "nb_pred", "pls_pred")
committee_pred$yes_count <- as.integer(rowSums(committee_pred[, member_cols] == "yes"))
str(committee_pred$yes_count)
committee_pred$no_count <- as.integer(rowSums(committee_pred[, member_cols] == "no"))

# Majority vote. The levels are pinned to those of the reference so the factor
# stays comparable even if the committee never predicts one of the classes.
committee_pred$vote <- factor(
  ifelse(committee_pred$yes_count >= committee_pred$no_count, "yes", "no"),
  levels = levels(actual)
)
head(committee_pred, n = 50)

# Compare each member, and then the committee, against the held-out truth.
caret::confusionMatrix(data = rpart_pred, reference = actual, positive = positive, dnn = c("Predicted", "Actual"))
caret::confusionMatrix(data = nb_pred, reference = actual, positive = positive, dnn = c("Predicted", "Actual"))
caret::confusionMatrix(data = pls_pred, reference = actual, positive = positive, dnn = c("Predicted", "Actual"))
caret::confusionMatrix(data = committee_pred$vote, reference = actual, positive = positive, dnn = c("Predicted", "Actual"))
```


# Committee (caret)

```{r committee, warning = FALSE, cache=TRUE}
library(parallel)
library(doParallel)

# Start a worker pool sized to this machine, leaving one core for the OS
# (a hard-coded worker count oversubscribes smaller machines).
available_cores <- detectCores()
n_workers <- if (is.na(available_cores)) 1L else max(1L, available_cores - 1L)
cluster <- makeCluster(n_workers)
registerDoParallel(cluster)

# Seed immediately before the resampling indices are drawn, so that package
# loading and cluster start-up cannot shift the random-number stream.
set.seed(123)

# Every base model is trained on the same 10 bootstrap resamples (shared via
# `index`) and keeps its final out-of-bag predictions for the ensemble step.
ctrl <- trainControl(
  method = "boot",
  number = 10,
  savePredictions = "final",
  classProbs = TRUE,
  index = createResample(train$bought, 10),
  summaryFunction = twoClassSummary,
  allowParallel = TRUE
)

model_list <- caretList(
  bought ~ .,
  data = train,
  trControl = ctrl,
  methodList = c("pls", "rpart")
)

# Combine the base models, selecting on ROC.
ensemble_fit <- caretEnsemble(
  model_list,
  metric = "ROC",
  trControl = trainControl(
    number = 2,
    summaryFunction = twoClassSummary,
    classProbs = TRUE
  )
)

# Training is finished: shut the workers down and fall back to the sequential
# backend so later chunks do not try to use a stopped cluster.
stopCluster(cluster)
registerDoSEQ()

summary(ensemble_fit)

ensemble_pred <- predict(ensemble_fit, test)
caret::confusionMatrix(data = ensemble_pred, reference = actual, positive = positive, dnn = c("Predicted", "Actual"))
```

# Stacking


```{r stacking, cache=TRUE}
set.seed(123)

# Resampling scheme for the meta-learner: 10 bootstrap resamples, scored with
# the two-class summary, keeping the final hold-out predictions.
stack_ctrl <- trainControl(
  method = "boot",
  number = 10,
  savePredictions = "final",
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)

# Stack the base models from `model_list` with a glm as the meta-learner,
# selecting on ROC.
stack_fit <- caretStack(
  model_list,
  method = "glm",
  metric = "ROC",
  trControl = stack_ctrl
)
summary(stack_fit)

# Score the stacked model on the held-out rows.
stack_pred <- predict(stack_fit, test)
caret::confusionMatrix(
  data = stack_pred,
  reference = actual,
  positive = positive,
  dnn = c("Predicted", "Actual")
)


```

# Random Forests

```{r rf, warning = FALSE, cache=TRUE}
library(randomForest)
set.seed(123)

# Random forest with 100 trees and mtry = 3; importance measures are requested
# so they can be plotted at the end of the chunk.
rf2_fit <- randomForest(
  formula,
  data = train,
  mtry = 3,
  ntree = 100,
  importance = TRUE
)

summary(rf2_fit)

# Predicted class labels for the held-out rows, compared with the truth.
rf2_pred <- predict(rf2_fit, newdata = test, type = "class")
caret::confusionMatrix(
  data = rf2_pred,
  reference = actual,
  positive = positive,
  dnn = c("Predicted", "Actual")
)

varImpPlot(rf2_fit)
```

# Boosting with Adaboost

```{r adaboost, warning = FALSE, cache=TRUE}
library(fastAdaboost)
set.seed(123)

# AdaBoost with 20 boosting iterations.
boost <- adaboost(formula, data = train, nIter = 20)

# predict() returns a list-like object; its `class` element holds the labels
# that are passed to the confusion matrix below.
boost_pred <- predict(boost, newdata = test)
str(boost_pred)

caret::confusionMatrix(
  data = boost_pred$class,
  reference = actual,
  positive = positive,
  dnn = c("Predicted", "Actual")
)
```


# Boosting with XGBoost

```{r xgboost, warning = FALSE, cache=TRUE}
library(xgboost)
library(Matrix)
set.seed(123)

# Fit the dummy-variable encoder ONCE, on the training data, and reuse it for
# the test set. Re-fitting it on the test set only yields matching columns as
# long as both sets happen to carry identical factor levels.
dv <- dummyVars(formula, data = train)

# xgboost needs a numeric design matrix and 0/1 labels (1 = "yes").
train_label <- as.numeric(train$bought == "yes")
train_data <- Matrix(predict(dv, newdata = train), sparse = TRUE)

test_label <- as.numeric(test$bought == "yes")
test_data <- Matrix(predict(dv, newdata = test), sparse = TRUE)

# 20 boosting rounds with a logistic objective. The argument is spelled out as
# `nrounds` instead of relying on partial matching of `nround`.
xgboost_fit <- xgboost(
  data = train_data,
  label = train_label,
  nrounds = 20,
  verbose = 2,
  objective = "binary:logistic"
)

str(xgboost_fit)
summary(xgboost_fit)

# Predictions are probabilities of the positive class.
xgboost_pred <- predict(xgboost_fit, newdata = test_data)
head(xgboost_pred)
summary(xgboost_pred)

# Threshold at 0.5. The levels are pinned so the factor keeps both classes even
# if every prediction falls on the same side of the threshold.
xgboost_pred2 <- factor(
  ifelse(xgboost_pred > 0.5, "yes", "no"),
  levels = c("no", "yes")
)
head(xgboost_pred2)
head(test_label)
caret::confusionMatrix(
  data = xgboost_pred2,
  reference = actual,
  positive = positive,
  dnn = c("Predicted", "Actual")
)
```