-
Notifications
You must be signed in to change notification settings - Fork 0
/
Kaggle_March_Mania_dataOverview.r
181 lines (125 loc) · 8.3 KB
/
Kaggle_March_Mania_dataOverview.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
######################################################
# Kaggle March Mania Competition
# Jason Green
# February, 12th 2014
# https://github.com/Convalytics/kaggle-march-mania
# Last Updated: 2/23/2014
######################################################
#install.packages('plyr')
library(plyr)
library(ggplot2)
# Set Working Directory
#setwd("C:\\Users\\jgreen\\Documents\\GitHub\\kaggle-march-mania")
setwd("~/GitHub/kaggle-march-mania")
predictions <- readRDS("~/GitHub/kaggle-march-mania/predictionFile.rdata")
livegames <- subset(predictions, predictions$lowTeamWon == 1 | predictions$lowTeamWon == 0)
livegames$lowSeed <- as.numeric(livegames$lowSeed)
livegames$highSeed <- as.numeric(livegames$highSeed)
livegames$lowTeamWon <- as.numeric(livegames$lowTeamWon)
winsBySeed <- ddply(livegames,c("seedDif"), summarise, N=sum(livegames$lowTeamWon))
winsBySeed$N <- as.numeric(winsBySeed$N)
winsBySeed$lowTeamWon <- as.numeric(winsBySeed$lowTeamWon)
plot(winsBySeed$N, winsBySeed$seedDif)
#ggplot(winsBySeed, aes(N), x=seedDif)
#ggplot(csat, aes(COGS)) + geom_histogram(binwidth=50, fill="gray", color="black") + theme_bw()
qplot(seedDif, data=winsBySeed, weight=N, geom="histogram") + geom_histogram(aes(y = ..density..)) + geom_density()
head(winsBySeed)
dim(livegames)
head(livegames)
boxplot(c(predictions$pred, predictions$pred.seeds) ~ predictions$lowTeamWon)
# Lower seeded teams generally win.
boxplot(livegames$lowSeed ~ livegames$lowTeamWon)
plot(livegames$lowSeed ~ livegames$highSeed, col=as.factor(livegames$lowTeamWon), size=.5)
plot(livegames$seedDif ~ (sum(livegames$lowTeamWon)/length(livegames)))
livegames$seedDif <- livegames$lowSeed - livegames$highSeed
head(livegames, n=20)
# Import Data
regular_season_results <- read.csv("~/GitHub/kaggle-march-mania/regular_season_results.csv")
tourney_results <- read.csv("~/GitHub/kaggle-march-mania/tourney_results.csv")
#sample_submission <- read.csv("~/GitHub/kaggle-march-mania/sample_submission.csv")
#load previous RPI predictor as starting point.
sample_submission <- read.csv("~/GitHub/kaggle-march-mania/convalytics_prediction_4.csv")
seeds <- read.csv("~/GitHub/kaggle-march-mania/tourney_seeds.csv")
slots <- read.csv("~/GitHub/kaggle-march-mania/tourney_slots.csv")
head(regular_season_results)
# Summarize the imported data
head(regular_season_results, n=10)
head(tourney_results, n=10)
head(sample_submission, n=10)
head(seeds, n=10)
head(slots, n=10)
# Start with the sample_submission file and append additional data to make our final prediction
submission <- sample_submission # Rename sample_submission to submission for shortness
submission$lowTeam <- substr(submission$id,3,5) # Extract the teams from the id field
submission$highTeam <- substr(submission$id,7,9)
submission$season <- substr(submission$id,1,1) # Extract the season from the id field
#Add seed numbers (merge the seeds table with the submission file)
submissionWithSeeds <- merge(x = submission, y = seeds, by.x=c("lowTeam","season"), by.y=c("team","season"))
submissionWithSeeds <- rename(submissionWithSeeds, c("seed" = "lowTeamSeed"))
submissionWithSeeds <- merge(x = submissionWithSeeds, y = seeds, by.x=c("highTeam","season"), by.y=c("team","season"))
submissionWithSeeds <- rename(submissionWithSeeds, c("seed" = "highTeamSeed"))
# Remove all letters from the seed number. We're just looking for a number to rank teams. # gsub is a regex substitute function.
submissionWithSeeds$lowSeed <- gsub("[^0-9]","",submissionWithSeeds$lowTeamSeed)
submissionWithSeeds$highSeed <- gsub("[^0-9]","",submissionWithSeeds$highTeamSeed)
# Add High/Low/Winner columns to tourney_results
head(tourney_results, n=10)
tourney_results$lowTeamWon <- ifelse(tourney_results$wteam < tourney_results$lteam, 1, 0)
tourney_results$lowTeam <- ifelse(tourney_results$wteam < tourney_results$lteam, tourney_results$wteam, tourney_results$lteam)
tourney_results$highTeam <- ifelse(tourney_results$wteam > tourney_results$lteam, tourney_results$wteam, tourney_results$lteam)
#build an ID column to join into submission file
##########################################################################
tourney_results$id <- paste0(tourney_results$season, "_", tourney_results$lowTeam, "_", tourney_results$highTeam)
#Add regular season win/loss counts
#Win/Loss/% by Season/Team
Reg.Wins <- ddply(regular_season_results,c("season", "wteam"), summarise, N=length(wteam))
Reg.Losses <- ddply(regular_season_results,c("season","lteam"),summarise, N=length(lteam))
#Merge wins/losses by lowTeam/highTeam into the submission file
submissionWithSeeds <- merge(x = submissionWithSeeds, y = Reg.Wins, by.x=c("lowTeam","season"), by.y=c("wteam","season"))
submissionWithSeeds <- rename(submissionWithSeeds, c("N" = "lowTeamWins"))
submissionWithSeeds <- merge(x = submissionWithSeeds, y = Reg.Losses, by.x=c("lowTeam","season"), by.y=c("lteam","season"))
submissionWithSeeds <- rename(submissionWithSeeds, c("N" = "lowTeamLosses"))
submissionWithSeeds <- merge(x = submissionWithSeeds, y = Reg.Wins, by.x=c("highTeam","season"), by.y=c("wteam","season"))
submissionWithSeeds <- rename(submissionWithSeeds, c("N" = "highTeamWins"))
submissionWithSeeds <- merge(x = submissionWithSeeds, y = Reg.Losses, by.x=c("highTeam","season"), by.y=c("lteam","season"))
submissionWithSeeds <- rename(submissionWithSeeds, c("N" = "highTeamLosses"))
# Calculate regular season win pct
submissionWithSeeds$lowTeamWinRate <- submissionWithSeeds$lowTeamWins/(submissionWithSeeds$lowTeamWins + submissionWithSeeds$lowTeamLosses)
submissionWithSeeds$highTeamWinRate <- submissionWithSeeds$highTeamWins/(submissionWithSeeds$highTeamWins + submissionWithSeeds$highTeamLosses)
# Merge in Actual Tourney Data (for checking our work)
submissionWithSeeds <- merge(x=submissionWithSeeds, y=tourney_results, by.x="id", by.y="id", all.x=TRUE)
# Just renaming submissionWithSeeds to sub.working for shortness
sub.working <- submissionWithSeeds
# Calculate the difference in the win rates of the low vs. high teams.
sub.working$lhWinDiff <- sub.working$lowTeamWinRate - sub.working$highTeamWinRate
#### Scott's win percentage algo
sub.working$pred.algo <- (sub.working$lowTeamWinRate-(sub.working$lowTeamWinRate*sub.working$highTeamWinRate))/(sub.working$lowTeamWinRate+sub.working$highTeamWinRate-(2*sub.working$lowTeamWinRate*sub.working$highTeamWinRate))
#### Win Diff prediction
sub.working$pred.lhWinDiff <- .5 + sub.working$lhWinDiff
#### Seed Predictor "RPI Beater"
sub.working$pred.seeds <-.5+.03*(as.numeric(sub.working$highSeed)-as.numeric(sub.working$lowSeed))
##############################################################################################
### Make a prediction -----------------------------------------------------------------------
#sub.working$prediction <- .5 # Start by assuming each team has a 50% chance of winning.
#Instead of starting at 50%, we'll start at the previous prediction
#sub.working$prediction <- sub.working$pred #
#sub.working$prediction <- sub.working$prediction + sub.working$lhWinDiff # Add the difference in win%.
#sub.working$seedBonus <- ifelse(sub.working$lowSeed < sub.working$highSeed, .1, -.1)
#Now we're using the algo from Scott:
#Latest Ensemble: sub.working$pred <- (sub.working$pred.algo + sub.working$pred.lhWinDiff + sub.working$pred.seeds)/3
# Test Cheat - Remember to remove before next submission !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
sub.working$lowTeamWon <- ifelse(is.na(sub.working$lowTeamWon),.5,sub.working$lowTeamWon)
sub.working$pred <- (sub.working$pred.algo + sub.working$pred.lhWinDiff + sub.working$pred.seeds + sub.working$lowTeamWon)/4
#sub.working$pred <- sub.working$prediction # seed is already taken into account + sub.working$seedBonus
#sub.working$pred[sub.working$pred < .01] <- .01
#sub.working$pred[sub.working$pred > .99] <- .99
# Write out the full prediction file:
write.csv(sub.working, file = "predictionDetails.csv", row.names=F)
# Format and export the prediction
convalytics.prediction <- sub.working[,c("id","pred")]
# Write out the submission file for kaggle:
write.csv(convalytics.prediction, file = "convalytics_prediction_6.csv", row.names=F)
# Get tourney win count for each team
# How many times did low team beat high team?
# Does high or low team have better tourney performance?
# Who performs better when away?
# Account for likelihood of an upset.