-
Notifications
You must be signed in to change notification settings - Fork 1
/
Market basket analysis.R
80 lines (58 loc) · 2.84 KB
/
Market basket analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
library(arules)
library(arulesViz)
library(dplyr)
library(plyr)
library(lubridate)
library(ggplot2)
library(knitr)
library(RColorBrewer)
setwd("D:\\Programming\\R")
df <- read.csv("Online Retail.csv")
head(df)
df <- df[complete.cases(df), ] # Drop missing values
df %>% mutate(Description = as.factor(Description),
Country = as.factor(Country)) # Change Description and Country columns to factors
df$Date <- as.Date(df$InvoiceDate) # Change InvoiceDate to Date datatype
df$InvoiceDate <- as.Date(df$InvoiceDate)
TransTime<- format(as.POSIXct(df$InvoiceDate),"%H:%M:%S") # Extract time from the InvoiceDate column
InvoiceNo <- as.numeric(as.character(df$InvoiceNo)) # Convert InvoiceNo into numeric
cbind(df, TransTime, InvoiceNo) # Add new columns to original dataframe
glimpse(df)
# Group by invoice number and combine order item strings with a comma
transactionData <- ddply(df,c("InvoiceNo","Date"),
function(df1)paste(df1$Description,collapse = ","))
transactionData$InvoiceNo <- NULL # Don't need these columns
transactionData$Date <- NULL
colnames(transactionData) <- c("items")
head(transactionData)
write.csv(transactionData,"market_basket_transactions.csv", quote = FALSE, row.names = TRUE)
# MBA analysis
# From package arules
tr <- read.transactions('market_basket_transactions.csv', format = 'basket', sep=',')
summary(tr)
itemFrequencyPlot(tr,topN=20,type="absolute",col=brewer.pal(8,'Pastel2'), main="Absolute Item Frequency Plot")
# Generate the a priori rules
association.rules <- apriori(tr, parameter = list(supp=0.001, conf=0.8,maxlen=10))
summary(association.rules)
inspect(association.rules[1:10]) # Top 10 association rules
# Select rules which are subsets of larger rules -> Remove rows where the sums of the subsets are > 1
subset.rules <- which(colSums(is.subset(association.rules, association.rules)) > 1) # get subset rules in vector
# What did customers buy before buying "METAL"
metal.association.rules <- apriori(tr, parameter = list(supp=0.001, conf=0.8),appearance = list(default="lhs",rhs="METAL"))
inspect(head(metal.association.rules))
# What did customers buy after buying "METAL"
metal.association.rules2 <- apriori(tr, parameter = list(supp=0.001, conf=0.8),appearance = list(lhs="METAL",default="rhs"))
inspect(head(metal.association.rules2))
# Plotting
# Filter rules with confidence greater than 0.4 or 40%
subRules<-association.rules[quality(association.rules)$confidence>0.4]
#Plot SubRules
plot(subRules)
# Top 10 rules viz
top10subRules <- head(subRules, n = 10, by = "confidence")
plot(top10subRules, method = "graph", engine = "htmlwidget")
# Filter top 20 rules with highest lift
# Paralell Coordinates plot - visualize which products along with which items cause what kind of sales.
# Closer arrows re bought together
subRules2<-head(subRules, n=20, by="lift")
plot(subRules2, method="paracoord")