Issue4 modeling (#6)
* Begin merging Lahman and FanGraphs #4

* Fix discrepancy in OBP calculation #4

The issue was that when I loaded Lahman, saved it to CSV, and reloaded the same dataset, HBP came back as NA. This meant my calculated OBP was slightly off from the provided values. The solution was to use Lahman::Batting directly instead of read_csv(lahman_data.csv). (A minimal sketch of the calculation appears after this list.)

* Create training and test sets #4

* Save player sample mean RMSE #4

* Include OLS model of 5 year lagged OBP #4

* Look at variables selected for each lambda #4

* Create quartiles time series plot #4

* Add PA filtering condition for plots #4

* Set PA minimums for inclusion in training set #4

* Set training set 1975-2017, validation set 2018-2020 #4

* Use LASSO for finding most important variables #4

* Include cur_BB in training #4

* Run spline model #4

* Get RMSE on test set #4

* Calculate NA statistics for provided dataset #4

* Save OLS residual plot #4

* Update NA comment #4

* Remove current OBP from LASSO #4

* Improve spline elbow plot #4

* Comment reference URLs #4

* Make ridge plot for 1970-1999 #4

* Add links to packages in README #4

* Alphabetize packages #4

* Add pacman to README packages #4

* Include code file in README #4

* Adjust filtering condition from OBP to PA for lag plots and ridge plots #4

* Adjust x axis for OBP ridge to facilitate comparison #4
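
To make the HBP fix described above concrete (second bullet), here is a minimal sketch of the OBP calculation and the NA behaviour involved. It is an illustration against the standard `Lahman::Batting` columns, not the repository's code; the object names (`obp_check`, `OBP_raw`, `OBP_safe`) and the `coalesce()` comparison are ours.

```r
# Illustrative only. Column names follow Lahman::Batting: H, BB, HBP, AB, SF.
library(dplyr)

obp_check <- Lahman::Batting %>%
  mutate(
    # OBP = (H + BB + HBP) / (AB + BB + HBP + SF).
    # If HBP (or SF) is NA, as happened after the CSV round trip, the
    # expression no longer reproduces the provided OBP.
    OBP_raw  = (H + BB + HBP) / (AB + BB + HBP + SF),
    # For comparison: treat missing HBP/SF as zero.
    OBP_safe = (H + BB + coalesce(HBP, 0L)) /
               (AB + BB + coalesce(HBP, 0L) + coalesce(SF, 0L))
  )
```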
nateschor authored Feb 6, 2023
1 parent f6bb1f9 commit 9cc9769
Showing 23 changed files with 639 additions and 61 deletions.
25 changes: 24 additions & 1 deletion README.md
@@ -1 +1,24 @@
# OBP
# OBP

## Goal: Predict OBP with the smallest RMSE possible for the 2021 season

To run: in `code/`, run the scripts 01_* through 09_* in order. If you do not have the `pacman` package, install it first; `pacman` will then handle installing any additional packages you do not have installed.

Additional packages used:
* [broom](https://cran.r-project.org/web/packages/broom/vignettes/broom.html)
* [DataExplorer](https://cran.r-project.org/web/packages/DataExplorer/vignettes/dataexplorer-intro.html)
* [ggridges](https://cran.r-project.org/package=ggridges/vignettes/introduction.html)
* [ggthemes](https://yutannihilation.github.io/allYourFigureAreBelongToUs/ggthemes/)
* [glmnetUtils](https://cran.r-project.org/web/packages/glmnetUtils/vignettes/intro.html)
* [here](https://cran.r-project.org/web/packages/here/vignettes/here.html)
* [Lahman](https://cran.r-project.org/web/packages/Lahman/Lahman.pdf)
* [pacman](https://cran.r-project.org/web/packages/pacman/readme/README.html)
* [skimr](https://cran.r-project.org/web/packages/skimr/vignettes/skimr.html)
* [tictoc](https://cran.r-project.org/package=tictoc)
* [tidylog](https://cran.r-project.org/web/packages/tidylog/readme/README.html)
* [tidymodels](https://www.tidymodels.org/)
* [tidyverse](https://www.tidyverse.org/)
* [tsibble](https://tsibble.tidyverts.org/)

### Session Information:
* R version 4.0.3
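
For reference, a minimal sketch of the `pacman` bootstrap described in the run instructions above; the package set shown here is abbreviated, and each script's own `p_load()` call is the authoritative list.

```r
# Install pacman once if it is missing, then let it install/load the rest.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
pacman::p_load(tidyverse, tidylog, here, Lahman, tsibble)
```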
14 changes: 5 additions & 9 deletions code/01_prepare_lahman_data.R
@@ -11,8 +11,8 @@ pacman::p_load(
path_batting <- here("data/lahman/raw/batting.csv")
path_people <- here("data/lahman/raw/people.csv")

write_csv(Batting, path_batting)
write_csv(People, path_people)
write_csv(Lahman::Batting, path_batting)
write_csv(Lahman::People, path_people)

# FanGraphs ---------------------------------------------------------------

@@ -47,21 +47,17 @@ df_fg_longer %>%

# Lahman ------------------------------------------------------------------

df_batting <- read_csv(path_batting) %>%
df_batting <- Lahman::Batting %>%
select(playerID, yearID, stint, G:GIDP) %>%
glimpse()

df_batting %>%
group_by(playerID, yearID, stint) # confirm that dataset is at the player-year-stint level

df_people <- read_csv(path_people) %>%
df_people <- Lahman::People %>%
select(playerID, bbrefID, nameFirst, nameLast) %>%
glimpse()

df_crosswalk <- read_csv(here("data/sfbb_player_id_crosswalk.csv")) %>% # https://www.smartfantasybaseball.com/tools/
select(bbrefID = "BREFID", fgID = "IDFANGRAPHS") %>%
glimpse()

df_batting_aggregated <- df_batting %>%
group_by(playerID, yearID) %>%
summarize(
@@ -84,7 +80,7 @@ df_batting_stats <- df_batting_aggregated %>%
) %>%
glimpse()

df_tsibble <- as_tsibble(df_batting_stats, key = bbrefID, index = yearID) %>%
df_tsibble <- as_tsibble(df_batting_stats, key = bbrefID, index = yearID) %>% # https://tsibble.tidyverts.org/
fill_gaps() %>% # make missing seasons explicit instead of implicit so that previous row = last played season
arrange(bbrefID, yearID) %>%
group_by(bbrefID)
133 changes: 83 additions & 50 deletions code/02_plot_OBP.R
@@ -3,21 +3,82 @@ pacman::p_load(
tidyverse,
tidylog,
here,
ggridges
ggridges,
ggthemes
)

df_lahman <- read_csv(here("data/lahman/derived/df_batting_lag5.csv")) %>%
glimpse()


# Quartiles over time -----------------------------------------------------

df_quartiles_pre_2020 <- df_lahman %>%
filter(yearID <= 2019) %>%
filter(cur_PA >= 100) %>%
group_by(yearID) %>%
summarize(
`25th` = quantile(cur_OBP, probs = .25, na.rm = TRUE),
`50th` = quantile(cur_OBP, probs = .5, na.rm = TRUE),
`75th` = quantile(cur_OBP, probs = .75, na.rm = TRUE),
) %>%
pivot_longer(., cols = -yearID, names_to = "Quartile", values_to = "OBP") %>%
mutate(
Quartile = factor(Quartile, levels = c("75th", "50th", "25th"))
)

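# 2020 was a shortened (60-game) season, hence the lower PA cutoff below (30 vs. 100)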
df_quartiles_2020 <- df_lahman %>%
filter(yearID == 2020) %>%
filter(cur_PA >= 30) %>%
group_by(yearID) %>%
summarize(
`25th` = quantile(cur_OBP, probs = .25, na.rm = TRUE),
`50th` = quantile(cur_OBP, probs = .5, na.rm = TRUE),
`75th` = quantile(cur_OBP, probs = .75, na.rm = TRUE),
) %>%
pivot_longer(., cols = -yearID, names_to = "Quartile", values_to = "OBP") %>%
mutate(
Quartile = factor(Quartile, levels = c("75th", "50th", "25th"))
)

df_quartiles <- bind_rows(
df_quartiles_pre_2020,
df_quartiles_2020
)

p_quartiles <- ggplot(df_quartiles, aes(x = yearID, y = OBP, color = Quartile)) +
geom_line(size = 1) +
geom_point(size = 1.5) +
scale_x_continuous(breaks = seq(1870, 2020, 10)) +
scale_color_ptol() +
geom_vline(xintercept = 1970, linetype = "dashed") +
theme_minimal() +
labs(
x = "Season"
) +
theme(
panel.grid = element_blank()
)

ggsave(plot = p_quartiles, filename = here("report/figures/quartiles_ts.png"))

# Lag Plots ---------------------------------------------------------------

df_lag_plot <- df_lahman %>%
filter(yearID %in% 2015:2020) %>%
filter(
if_all(contains("OBP"), ~ between(., .001, .999))
) %>%
df_lag_plot_pre_2020 <- df_lahman %>%
filter(yearID %in% 1970:2019) %>%
filter(cur_PA >= 100) %>%
select(bbrefID, yearID, contains("OBP"))

df_lag_plot_2020 <- df_lahman %>%
filter(yearID == 2020) %>%
filter(cur_PA >= 30) %>%
select(bbrefID, yearID, contains("OBP"))

df_lag_plot <- bind_rows(
df_lag_plot_pre_2020,
df_lag_plot_2020
)

Plot_Lags <- function(plotting_data, num_lag, save = FALSE) {

x_axis <- paste0("lagged_OBP_", num_lag)
@@ -27,7 +88,7 @@ Plot_Lags <- function(plotting_data, num_lag, save = FALSE) {
geom_point(alpha = .3, size = 1.5) +
geom_smooth(se = FALSE, color = "#4477AA", size = 1) +
geom_abline(slope = 1, linetype = "dashed", color = "#CC6677", size = 1) +
scale_x_continuous(limits = c(0, .7), expand = expansion(0, 0)) +
scale_x_continuous(limits = c(0, .7), expand = expansion(.05, 0)) +
scale_y_continuous(limits = c(0, .7), expand = expansion(0, 0)) +
labs(
x = x_axis_title,
@@ -45,6 +106,9 @@ Plot_Lags <- function(plotting_data, num_lag, save = FALSE) {

ggsave(plot = p, filename = path_plot)

} else {

return(p)
}

}
@@ -54,76 +118,47 @@ v_lags <- 1:5
walk(v_lags, ~ Plot_Lags(df_lag_plot, ., save = TRUE))


# 2019 vs. 2020 to see if relationship persists ---------------------------

df_2020 <- df_lahman %>%
filter(yearID == 2020) %>%
select(bbrefID, yearID, cur_OBP, lagged_OBP_1) %>%
filter(
if_all(contains("OBP"), ~ between(., .001, .999))
)

p <- ggplot(data = df_2020, aes(x = lagged_OBP_1, y = cur_OBP)) +
geom_point(alpha = .3, size = 1.5) +
geom_smooth(se = FALSE, color = "#4477AA", size = 1) +
geom_abline(slope = 1, linetype = "dashed", color = "#CC6677", size = 1) +
scale_x_continuous(limits = c(0, .7), expand = expansion(0, 0)) +
scale_y_continuous(limits = c(0, .7), expand = expansion(0, 0)) +
labs(
x = "2019 OBP",
y = "2020 OBP"
) +
theme_minimal() +
theme(
panel.grid.minor = element_blank()
)

ggsave(plot = p, filename = here("report/figures/2019_2020_lag.png"))

# Ridge Plots -------------------------------------------------------------

median_2019 <- df_ridge_plot %>%
filter(yearID == 2019) %>%
pull(cur_OBP) %>%
median(., na.rm = TRUE)

Plot_Ridges <- function(start_year, save = FALSE) {
Plot_Ridges <- function(start_year, end_year, save = FALSE) {

df_ridge_plot <- df_lahman %>%
filter(yearID %in% start_year:2020) %>%
filter(between(cur_OBP, .001, .999)) %>%
filter(yearID %in% start_year:end_year) %>%
filter(cur_PA > 30) %>%
select(bbrefID, yearID, cur_OBP)

p <- ggplot(df_ridge_plot, aes(x = cur_OBP, y = factor(yearID))) +
geom_density_ridges(fill = "#4477AA") +
scale_x_continuous(expand = expansion(0, 0)) +
scale_x_continuous(expand = expansion(0, 0), breaks = c(0, .2, .4, .6)) +
scale_y_discrete(expand = expansion(0, 0)) +
labs(
x = "OBP",
y = "Season"
) +
geom_vline(xintercept = median_2019, linetype = "dashed", color = "#CC6677", size = 1) +
theme_minimal()

if(save == TRUE) {

file_name <- paste0("ridge_", start_year, "_2020.png")
file_name <- paste0("ridge_", start_year, "_", end_year, ".png")
path_plot <- here("report/figures/", file_name)

ggsave(plot = p, filename = path_plot)

} else{

return(p)
}

}

v_ridge_start_years <- c(2000, 2013, 2016)

walk(v_ridge_start_years, ~ Plot_Ridges(., save = TRUE))
v_ridge_start_years <- c(1970, 2000)
v_ridge_end_years <- c(1999, 2020)

walk2(v_ridge_start_years, v_ridge_end_years, ~ Plot_Ridges(.x, .y, save = TRUE))

df_ridge_plot <- df_lahman %>%
filter(yearID %in% 2016:2020) %>%
filter(between(cur_OBP, .001, .999)) %>%
filter(cur_PA >= 30) %>%
transmute(
bbrefID,
yearID,
@@ -141,5 +176,3 @@ p <- ggplot(df_ridge_plot, aes(x = OBP_per_PA, y = factor(yearID))) +
theme_minimal()

ggsave(plot = p, filename = here("report/figures/ridge_2016_2020_OBP_per_PA.png"))


33 changes: 32 additions & 1 deletion code/03_perform_EDA.R
@@ -9,7 +9,7 @@ pacman::p_load(


df_lahman <- read_csv(here("data/lahman/derived/df_batting_lag5.csv")) %>%
filter(between(yearID, 1995, 2020)) %>%
filter(between(yearID, 1970, 2020)) %>%
glimpse()

skim(df_lahman)
@@ -27,3 +27,34 @@ df_lahman %>%
select(contains("_")) %>%
na.omit() %>%
DataExplorer::plot_prcomp(., variance_cap = .60)

df_fg <- read_csv("data/fangraphs/raw/obp.csv") %>%
mutate(
playerid = as.character(playerid)
) %>%
select(-OBP_21, -PA_21)


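# Count missing values in each column of the FanGraphs data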
df_fg %>%
map_dbl(~ sum(is.na(.)))

skimr::skim(df_fg)

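# Players with OBP missing in more than one season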
df_fg %>%
select(playerid, contains("OBP")) %>%
pivot_longer(-playerid) %>%
group_by(playerid) %>%
summarize(
num_missing = sum(is.na(value))
) %>%
filter(num_missing >1)

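# Players clearing the PA minimums: at least 100 PA in 2016-2019 and 30 in shortened 2020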
df_fg %>%
select(playerid, num_range("PA_", 16:20)) %>%
filter(
PA_16 >= 100,
PA_17 >= 100,
PA_18 >= 100,
PA_19 >= 100,
PA_20 >= 30,
)
