-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathRules.R
94 lines (63 loc) · 2.56 KB
/
Rules.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/usr/bin/Rscript
# This script requires that historical.R be run first so that
# the historical data has been prepared
# Heavily penalize scientific notation so numbers print in fixed notation
options(scipen=999)
# arules supplies discretize(), apriori(), is.subset() and inspect() used below
library(arules)
# smbinning supplies smbinning(), used below to bin numeric features against lsFlag
library(smbinning)
# Move into the project directory. The location is chosen by probing for
# known home directories in order; the Windows path is the fallback when
# neither Linux home directory exists.
setwd(
  if (file.exists("/home/john")) {
    "/home/john/Dropbox/pls"
  } else if (file.exists("/home/user")) {
    "/home/user/pls"
  } else {
    "C:/Users/john/Dropbox/pls"
  }
)
# Restore the objects saved by the historical-data step; the code below
# expects this to define `notes`.
load("data/filterNotes.rda")

# Keep only complete notes whose status is Fully_Paid or Charged_Off.
# which() discards rows where the condition is NA, as subset() would.
finished <- with(
  notes,
  complete == TRUE &
    (loan_status == "Fully_Paid" | loan_status == "Charged_Off")
)
data <- notes[which(finished), , drop = FALSE]

# Drop the status levels that no longer occur after the filter above
data$loan_status <- factor(droplevels(data$loan_status))

# Restrict to notes with an interest rate above 15
data <- data[which(data[["intRate"]] > 15), , drop = FALSE]

# Features used to build the rules
features <- c(
  "loanAmount", "term", "intRate", "installment", "empLength",
  "homeOwnership", "annualInc", "purpose", "addrState",
  "dti", "delinq2Yrs", "ficoRangeLow",
  "inqLast6Mths", "openAcc", "pubRec", "revolBal", "revolUtil", "totalAcc",
  "earliestCrLineMonths", "installmentIncomeRatio", "revolBalAnnualIncRatio"
)

# Keep just the features plus the outcome, then drop rows with any NA
data <- data[c(features, "loan_status")]
data <- na.omit(data)
# Formula with loan_status as the response and every feature as a term
fmla <- as.formula(
  sprintf("loan_status ~ %s", paste(features, collapse = " + "))
)

# Loan status flag: 0 for bad notes (Charged_Off or Default), 1 otherwise
data$lsFlag <- as.integer(
  data$loan_status != "Charged_Off" & data$loan_status != "Default"
)
# Discretize every non-factor feature with respect to loan status so that
# the data passed to apriori() below is categorical.
#
# For each non-factor column:
#   * fewer than 10 distinct values -> 4 clusters via discretize()
#   * otherwise try smbinning() against lsFlag (p = 0.05) and cut the column
#     at the returned cut points, bounded by the column's min and max
#   * if smbinning() cannot produce bins -> same 4-cluster fallback
for (col in features) {
  values <- data[[col]]
  if (is.factor(values)) {
    next
  }

  binning <- NULL
  if (length(unique(values)) >= 10) {
    binning <- smbinning(data, y = "lsFlag", x = col, p = 0.05)
  }

  # smbinning() reports failure by returning a character message instead of
  # its usual result list. The exact wording differs between package versions
  # ("No Bins", "No significant splits", ...), so test the type rather than
  # one specific string; otherwise binning$cuts below would fail on a string.
  if (is.null(binning) || is.character(binning)) {
    data[[col]] <- discretize(values, method = "cluster", categories = 4)
  } else {
    # unique() guards against a cut point equal to the min or max
    cut_points <- unique(c(min(values), binning$cuts, max(values)))
    data[[col]] <- cut(values, cut_points,
      dig.lab = 10, include.lowest = TRUE, right = TRUE
    )
  }
}

# The helper flag was only needed for binning; remove it before rule mining
data$lsFlag <- NULL
# rules <- apriori(data)
# inspect(rules)

# Mine association rules whose right-hand side is a loan_status outcome;
# every other item may only appear on the left-hand side.
rules <- apriori(data,
  parameter = list(supp = 0.05, conf = 0.85),
  appearance = list(
    rhs = c("loan_status=Charged_Off", "loan_status=Fully_Paid"),
    default = "lhs"
  ),
  control = list(verbose = FALSE)
)

# Prune redundant rules: a rule is flagged when any rule that precedes it in
# `rules` is a subset of it (only the strict upper triangle is counted).
# as.matrix() is needed because recent arules versions return a sparse
# pattern matrix from is.subset(), which cannot take the NA assignment below;
# on a dense logical matrix it is a no-op.
# NOTE(review): pruning runs on apriori()'s output order, before the sort by
# lift below -- confirm that is the intended precedence.
subset.matrix <- as.matrix(is.subset(rules, rules))
subset.matrix[lower.tri(subset.matrix, diag = TRUE)] <- NA
redundant <- colSums(subset.matrix, na.rm = TRUE) >= 1
rules.pruned <- rules[!redundant]
rules <- rules.pruned

# Show the surviving rules ordered by lift
rules.sorted <- sort(rules, by = "lift")
inspect(rules.sorted)