diff --git a/README.md b/README.md index 8ab6137..d6a3f54 100644 --- a/README.md +++ b/README.md @@ -1 +1,24 @@ -# OBP \ No newline at end of file +# OBP + +## Goal: Predict OBP with the smallest RMSE possible for 2021 season + +To run: In `code/`, run the scripts 01_* through 09_* in order. If you do not have the `pacman` package you will need to install it, and then `pacman` will handle installing any additional packages you do not have installed + +Additional packages used: +* [broom](https://cran.r-project.org/web/packages/broom/vignettes/broom.html) +* [DataExplorer](https://cran.r-project.org/web/packages/DataExplorer/vignettes/dataexplorer-intro.html) +* [ggridges](https://cran.r-project.org/package=ggridges/vignettes/introduction.html) +* [ggthemes](https://yutannihilation.github.io/allYourFigureAreBelongToUs/ggthemes/) +* [glmnetUtils](https://cran.r-project.org/web/packages/glmnetUtils/vignettes/intro.html) +* [here](https://cran.r-project.org/web/packages/here/vignettes/here.html) +* [Lahman](https://cran.r-project.org/web/packages/Lahman/Lahman.pdf) +* [pacman](https://cran.r-project.org/web/packages/pacman/readme/README.html) +* [skimr](https://cran.r-project.org/web/packages/skimr/vignettes/skimr.html) +* [tictoc](https://cran.r-project.org/package=tictoc) +* [tidylog](https://cran.r-project.org/web/packages/tidylog/readme/README.html) +* [tidymodels](https://www.tidymodels.org/) +* [tidyverse](https://www.tidyverse.org/) +* [tsibble](https://tsibble.tidyverts.org/) + +### Session Information: +* R version 4.0.3 diff --git a/code/01_prepare_lahman_data.R b/code/01_prepare_lahman_data.R index 2f75b2f..ebd9c82 100644 --- a/code/01_prepare_lahman_data.R +++ b/code/01_prepare_lahman_data.R @@ -11,8 +11,8 @@ pacman::p_load( path_batting <- here("data/lahman/raw/batting.csv") path_people <- here("data/lahman/raw/people.csv") -write_csv(Batting, path_batting) -write_csv(People, path_people) +write_csv(Lahman::Batting, path_batting) +write_csv(Lahman::People, path_people) # FanGraphs --------------------------------------------------------------- @@ -47,21 +47,17 @@ df_fg_longer %>% # Lahman ------------------------------------------------------------------ -df_batting <- read_csv(path_batting) %>% +df_batting <- Lahman::Batting %>% select(playerID, yearID, stint, G:GIDP) %>% glimpse() df_batting %>% group_by(playerID, yearID, stint) # confirm that dataset is at the player-year-stint level -df_people <- read_csv(path_people) %>% +df_people <- Lahman::People %>% select(playerID, bbrefID, nameFirst, nameLast) %>% glimpse() -df_crosswalk <- read_csv(here("data/sfbb_player_id_crosswalk.csv")) %>% # https://www.smartfantasybaseball.com/tools/ - select(bbrefID = "BREFID", fgID = "IDFANGRAPHS") %>% - glimpse() - df_batting_aggregated <- df_batting %>% group_by(playerID, yearID) %>% summarize( @@ -84,7 +80,7 @@ df_batting_stats <- df_batting_aggregated %>% ) %>% glimpse() -df_tsibble <- as_tsibble(df_batting_stats, key = bbrefID, index = yearID) %>% +df_tsibble <- as_tsibble(df_batting_stats, key = bbrefID, index = yearID) %>% # https://tsibble.tidyverts.org/ fill_gaps() %>% # make missing seasons explicit instead of implicit so that previous row = last played season arrange(bbrefID, yearID) %>% group_by(bbrefID) diff --git a/code/02_plot_OBP.R b/code/02_plot_OBP.R index 34aedeb..06d6e1f 100644 --- a/code/02_plot_OBP.R +++ b/code/02_plot_OBP.R @@ -3,21 +3,82 @@ pacman::p_load( tidyverse, tidylog, here, - ggridges + ggridges, + ggthemes ) df_lahman <- read_csv(here("data/lahman/derived/df_batting_lag5.csv")) %>% glimpse() + +# Quartiles over time ----------------------------------------------------- + +df_quartiles_pre_2020 <- df_lahman %>% + filter(yearID <= 2019) %>% + filter(cur_PA >= 100) %>% + group_by(yearID) %>% + summarize( + `25th` = quantile(cur_OBP, probs = .25, na.rm = TRUE), + `50th` = quantile(cur_OBP, probs = .5, na.rm = TRUE), + `75th` = quantile(cur_OBP, probs = .75, na.rm = TRUE), + ) %>% + pivot_longer(., cols = -yearID, names_to = "Quartile", values_to = "OBP") %>% + mutate( + Quartile = factor(Quartile, levels = c("75th", "50th", "25th")) + ) + +df_quartiles_2020 <- df_lahman %>% + filter(yearID == 2020) %>% + filter(cur_PA >= 30) %>% + group_by(yearID) %>% + summarize( + `25th` = quantile(cur_OBP, probs = .25, na.rm = TRUE), + `50th` = quantile(cur_OBP, probs = .5, na.rm = TRUE), + `75th` = quantile(cur_OBP, probs = .75, na.rm = TRUE), + ) %>% + pivot_longer(., cols = -yearID, names_to = "Quartile", values_to = "OBP") %>% + mutate( + Quartile = factor(Quartile, levels = c("75th", "50th", "25th")) + ) + +df_quartiles <- bind_rows( + df_quartiles_pre_2020, + df_quartiles_2020 +) + +p_quartiles <- ggplot(df_quartiles, aes(x = yearID, y = OBP, color = Quartile)) + + geom_line(size = 1) + + geom_point(size = 1.5) + + scale_x_continuous(breaks = seq(1870, 2020, 10)) + + scale_color_ptol() + + geom_vline(xintercept = 1970, linetype = "dashed") + + theme_minimal() + + labs( + x = "Season" + ) + + theme( + panel.grid = element_blank() + ) + +ggsave(plot = p_quartiles, filename = here("report/figures/quartiles_ts.png")) + # Lag Plots --------------------------------------------------------------- -df_lag_plot <- df_lahman %>% - filter(yearID %in% 2015:2020) %>% - filter( - if_all(contains("OBP"), ~ between(., .001, .999)) - ) %>% +df_lag_plot_pre_2020 <- df_lahman %>% + filter(yearID %in% 1970:2019) %>% + filter(cur_PA >= 100) %>% select(bbrefID, yearID, contains("OBP")) +df_lag_plot_2020 <- df_lahman %>% + filter(yearID == 2020) %>% + filter(cur_PA >= 30) %>% + select(bbrefID, yearID, contains("OBP")) + +df_lag_plot <- bind_rows( + df_lag_plot_pre_2020, + df_lag_plot_2020 +) + Plot_Lags <- function(plotting_data, num_lag, save = FALSE) { x_axis <- paste0("lagged_OBP_", num_lag) @@ -27,7 +88,7 @@ Plot_Lags <- function(plotting_data, num_lag, save = FALSE) { geom_point(alpha = .3, size = 1.5) + geom_smooth(se = FALSE, color = "#4477AA", size = 1) + geom_abline(slope = 1, linetype = "dashed", color = "#CC6677", size = 1) + - scale_x_continuous(limits = c(0, .7), expand = expansion(0, 0)) + + scale_x_continuous(limits = c(0, .7), expand = expansion(.05, 0)) + scale_y_continuous(limits = c(0, .7), expand = expansion(0, 0)) + labs( x = x_axis_title, @@ -45,6 +106,9 @@ Plot_Lags <- function(plotting_data, num_lag, save = FALSE) { ggsave(plot = p, filename = path_plot) + } else { + + return(p) } } @@ -54,76 +118,47 @@ v_lags <- 1:5 walk(v_lags, ~ Plot_Lags(df_lag_plot, ., save = TRUE)) -# 2019 vs. 2020 to see if relationship persists --------------------------- - -df_2020 <- df_lahman %>% - filter(yearID == 2020) %>% - select(bbrefID, yearID, cur_OBP, lagged_OBP_1) %>% - filter( - if_all(contains("OBP"), ~ between(., .001, .999)) - ) - -p <- ggplot(data = df_2020, aes(x = lagged_OBP_1, y = cur_OBP)) + - geom_point(alpha = .3, size = 1.5) + - geom_smooth(se = FALSE, color = "#4477AA", size = 1) + - geom_abline(slope = 1, linetype = "dashed", color = "#CC6677", size = 1) + - scale_x_continuous(limits = c(0, .7), expand = expansion(0, 0)) + - scale_y_continuous(limits = c(0, .7), expand = expansion(0, 0)) + - labs( - x = "2019 OBP", - y = "2020 OBP" - ) + - theme_minimal() + - theme( - panel.grid.minor = element_blank() - ) - -ggsave(plot = p, filename = here("report/figures/2019_2020_lag.png")) - # Ridge Plots ------------------------------------------------------------- -median_2019 <- df_ridge_plot %>% - filter(yearID == 2019) %>% - pull(cur_OBP) %>% - median(., na.rm = TRUE) - -Plot_Ridges <- function(start_year, save = FALSE) { +Plot_Ridges <- function(start_year, end_year, save = FALSE) { df_ridge_plot <- df_lahman %>% - filter(yearID %in% start_year:2020) %>% - filter(between(cur_OBP, .001, .999)) %>% + filter(yearID %in% start_year:end_year) %>% + filter(cur_PA > 30) %>% select(bbrefID, yearID, cur_OBP) p <- ggplot(df_ridge_plot, aes(x = cur_OBP, y = factor(yearID))) + geom_density_ridges(fill = "#4477AA") + - scale_x_continuous(expand = expansion(0, 0)) + + scale_x_continuous(expand = expansion(0, 0), breaks = c(0, .2, .4, .6)) + scale_y_discrete(expand = expansion(0, 0)) + labs( x = "OBP", y = "Season" ) + - geom_vline(xintercept = median_2019, linetype = "dashed", color = "#CC6677", size = 1) + theme_minimal() if(save == TRUE) { - file_name <- paste0("ridge_", start_year, "_2020.png") + file_name <- paste0("ridge_", start_year, "_", end_year, ".png") path_plot <- here("report/figures/", file_name) ggsave(plot = p, filename = path_plot) + } else{ + + return(p) } } -v_ridge_start_years <- c(2000, 2013, 2016) - -walk(v_ridge_start_years, ~ Plot_Ridges(., save = TRUE)) +v_ridge_start_years <- c(1970, 2000) +v_ridge_end_years <- c(1999, 2020) +walk2(v_ridge_start_years, v_ridge_end_years, ~ Plot_Ridges(.x, .y, save = TRUE)) df_ridge_plot <- df_lahman %>% filter(yearID %in% 2016:2020) %>% - filter(between(cur_OBP, .001, .999)) %>% + filter(cur_PA >= 30) %>% transmute( bbrefID, yearID, @@ -141,5 +176,3 @@ p <- ggplot(df_ridge_plot, aes(x = OBP_per_PA, y = factor(yearID))) + theme_minimal() ggsave(plot = p, filename = here("report/figures/ridge_2016_2020_OBP_per_PA.png")) - - diff --git a/code/03_perform_EDA.R b/code/03_perform_EDA.R index 967633e..06a5414 100644 --- a/code/03_perform_EDA.R +++ b/code/03_perform_EDA.R @@ -9,7 +9,7 @@ pacman::p_load( df_lahman <- read_csv(here("data/lahman/derived/df_batting_lag5.csv")) %>% - filter(between(yearID, 1995, 2020)) %>% + filter(between(yearID, 1970, 2020)) %>% glimpse() skim(df_lahman) @@ -27,3 +27,34 @@ df_lahman %>% select(contains("_")) %>% na.omit() %>% DataExplorer::plot_prcomp(., variance_cap = .60) + +df_fg <- read_csv("data/fangraphs/raw/obp.csv") %>% + mutate( + playerid = as.character(playerid) + ) %>% + select(-OBP_21, -PA_21) + + +df_fg %>% + map_dbl(~ sum(is.na(.))) + +skimr::skim(df) + +df_fg %>% + select(playerid, contains("OBP")) %>% + pivot_longer(-playerid) %>% + group_by(playerid) %>% + summarize( + num_missing = sum(is.na(value)) + ) %>% + filter(num_missing >1) + +df_fg %>% + select(playerid, num_range("PA_", 16:20)) %>% + filter( + PA_16 >= 100, + PA_17 >= 100, + PA_18 >= 100, + PA_19 >= 100, + PA_20 >= 30, + ) diff --git a/code/04_prepare_data_splits.R b/code/04_prepare_data_splits.R new file mode 100644 index 0000000..a1063d5 --- /dev/null +++ b/code/04_prepare_data_splits.R @@ -0,0 +1,154 @@ + +pacman::p_load( + tidyverse, + tidylog, + here, + Lahman +) + +df_lahman <- read_csv(here("data/lahman/derived/df_batting_lag5.csv")) %>% + glimpse() + +df_fg <- read_csv(here("data/fangraphs/raw/obp.csv")) %>% + select(-birth_date) %>% + rename( + "fgID" = playerid + ) %>% + glimpse() + +df_crosswalk <- read_csv(here("data/sfbb_player_id_crosswalk.csv")) %>% # "https://www.smartfantasybaseball.com/tools/" + select(bbrefID = "BREFID", fgID = "IDFANGRAPHS", PLAYERNAME) %>% + glimpse() + +df_fg_OBP_longer <- df_fg %>% + select(Name, fgID, starts_with("OBP")) %>% + pivot_longer(., starts_with("OBP"), names_to = "yearID", values_to = "OBP", names_prefix = "OBP_") %>% + mutate( + yearID = 2000 + as.integer(yearID) + ) %>% + glimpse() + +df_fg_PA_longer <- df_fg %>% + select(Name, fgID, starts_with("PA")) %>% + pivot_longer(., starts_with("PA"), names_to = "yearID", values_to = "PA", names_prefix = "PA_") %>% + mutate( + yearID = 2000 + as.integer(yearID) + ) %>% + glimpse() + +df_fg_longer <- inner_join(df_fg_OBP_longer, df_fg_PA_longer, by = c("fgID", "yearID", "Name")) %>% + mutate( + fgID = as.character(fgID) + ) + +df_fg_longer %>% + group_by(fgID, yearID) # confirm that dataset is now at the player-year level + +df_fg_crosswalk <- df_fg_longer %>% + left_join(., df_crosswalk, by = "fgID") + +df_fg_crosswalk %>% + select(Name, PLAYERNAME, everything()) %>% + distinct(Name, .keep_all = TRUE) # confirm that for players with bbrefID they are merged correctly with fgID + + +df_missing_bbrefid <- df_fg_longer %>% + left_join(., df_crosswalk, by = "fgID") %>% + filter(is.na(bbrefID)) + + +df_missing_bbrefid_filled <- df_missing_bbrefid %>% + select(Name, fgID, bbrefID) %>% + distinct(Name, .keep_all = TRUE) %>% + print() %>% + mutate( + bbrefID = c( # manually add missing bbref IDs from bbref URL + "marchra01", + "deverjo01", + "fargajo01", + "waltodo01", + "craigwi01", + "ramoshe01", + "palacjo01", + "mazeipa01", + "riverse01", + "gittech01", + "celesgi01", + "godoyjo01", + "hagerja01", + "heathni01", + "deichgr01" + ) + ) %>% + print() %>% + select(-Name) + +df_missing_merged <- df_missing_bbrefid %>% + select(-bbrefID) %>% + left_join(., df_missing_bbrefid_filled, by = "fgID") + +df_fg_missing_resolved <- df_fg_crosswalk %>% + filter(!is.na(bbrefID)) %>% + bind_rows(df_missing_merged) + +df_fg_missing_resolved %>% + group_by(Name, yearID) %>% + count(., sort = TRUE) %>% + filter(n > 1) # hanlde Ohtani showing up as pitcher and hitter + +df_fg_cleaned <- df_fg_missing_resolved %>% + distinct(bbrefID, yearID, .keep_all = TRUE) # remove duplicated Ohtani entry + +df_fg_cleaned %>% + group_by(fgID) # confirm there are still 572 players + +df_fg_cleaned %>% + group_by(fgID, yearID) # confirm data at year-playerid level + +df_fg_cleaned %>% + left_join(., df_lahman, by = c("bbrefID", "yearID")) %>% + transmute( + Name, + bbrefID, + yearID, + OBP, + cur_OBP, + delta_OBP = OBP - cur_OBP + ) %>% + pull(delta_OBP) %>% + qplot() + +df_test <- left_join(df_fg_cleaned, df_lahman, by = c("bbrefID", "yearID")) %>% + filter(yearID == 2021) %>% + select(Name, fgID, bbrefID, yearID, OBP, PA, starts_with("lagged_")) %>% + glimpse() + +write_csv(df_test, here("data/modeling/df_test.csv")) + +df_training_2020 <- df_lahman %>% + filter(yearID == 2020) %>% + filter(cur_PA >= 30) %>% + mutate( + Name = paste(nameFirst, nameLast) + ) %>% + select(Name, bbrefID, yearID, cur_OBP, starts_with("lagged_")) %>% + glimpse() + +df_training_1975_2019 <- df_lahman %>% + filter(between(yearID, 1975, 2019)) %>% + filter(cur_PA >= 100) %>% + mutate( + Name = paste(nameFirst, nameLast) + ) %>% + select(Name, bbrefID, yearID, cur_OBP, cur_BB, starts_with("lagged_")) %>% + glimpse() + +df_training <- bind_rows( + df_training_1975_2019, + df_training_2020 +) + +df_training %>% + group_by(yearID, bbrefID) + +write_csv(df_training, here("data/modeling/df_training.csv")) diff --git a/code/05_generate_sample_mean_predictions.R b/code/05_generate_sample_mean_predictions.R new file mode 100644 index 0000000..7214bfd --- /dev/null +++ b/code/05_generate_sample_mean_predictions.R @@ -0,0 +1,42 @@ + +pacman::p_load( + tidyverse, + tidylog, + here, + yardstick +) + +df_training_raw <- read_csv(here("data/modeling/df_training.csv")) %>% + glimpse() + +df_training <- df_training_raw %>% + filter(between(yearID, 1975, 2017)) %>% + select(bbrefID, yearID, cur_OBP) %>% + group_by(bbrefID) %>% + summarize( + fitted_OBP = mean(cur_OBP, na.rm = TRUE) + ) %>% + print() + +median_obp_2013_2017 <- df_training_raw %>% + filter(between(yearID, 2013, 2017)) %>% + pull(cur_OBP) %>% + median(., na.rm = TRUE) + +df_validation <- df_training_raw %>% + filter(yearID %in% 2018:2020) %>% + select(bbrefID, yearID, cur_OBP) %>% + left_join(., df_training, by = "bbrefID") %>% + mutate( + fitted_OBP = coalesce(fitted_OBP, median_obp_2013_2017) # for players with no obserations before 2018, replace with league median OBP 2013-2017 + ) %>% + print() + +rmse_vec( # https://yardstick.tidymodels.org/reference/rmse.html + truth = df_validation$cur_OBP, + estimate = df_validation$fitted_OBP +) + + + + diff --git a/code/06_run_OBP_OLS.R b/code/06_run_OBP_OLS.R new file mode 100644 index 0000000..d8621af --- /dev/null +++ b/code/06_run_OBP_OLS.R @@ -0,0 +1,57 @@ + +pacman::p_load( + tidyverse, + tidylog, + here, + yardstick, + broom +) + +df_training_raw <- read_csv(here("data/modeling/df_training.csv")) %>% + glimpse() + +df_training <- df_training_raw %>% + filter(between(yearID, 1975, 2017)) %>% + select(contains("OBP")) %>% + print() + +df_validation <- df_training_raw %>% + filter(yearID %in% 2018:2020) %>% + select(contains("OBP")) %>% + print() + + +model_obp <- lm(cur_OBP ~ ., data = df_training) + +df_training_fitted <- augment(model_obp) + +rmse_vec( + truth = df_training_fitted$cur_OBP, + estimate = df_training_fitted$.fitted +) + +ggplot(df_training_fitted, aes(x = cur_OBP, y = .resid)) + + geom_point() + + theme_minimal() + +df_validation_fitted <- augment(x = model_obp, newdata = df_validation) + +rmse_vec( + truth = df_validation_fitted$cur_OBP, + estimate = df_validation_fitted$.fitted, + na_rm = TRUE +) + +p_resid <- ggplot(df_validation_fitted, aes(x = cur_OBP, y = .resid)) + + geom_point() + + theme_minimal() + + labs( + x = "True OBP", + y = "OBP Residual" + ) + + theme( + panel.grid.minor = element_blank() + ) + +ggsave(plot = p_resid, filename = here("report/figures/OLS_residuals.png")) + diff --git a/code/07_run_LASSO.R b/code/07_run_LASSO.R new file mode 100644 index 0000000..8844fc5 --- /dev/null +++ b/code/07_run_LASSO.R @@ -0,0 +1,47 @@ + +pacman::p_load( + tidyverse, + tidylog, + here, + yardstick, + glmnetUtils +) + +df_training_raw <- read_csv(here("data/modeling/df_training.csv")) %>% + glimpse() + +df_training <- df_training_raw %>% + filter(yearID <= 2017) %>% + select(contains("_")) %>% + select(-cur_BB) %>% + glimpse() + +df_validation <- df_training_raw %>% + filter(yearID %in% 2018:2020) %>% + select(contains("_")) %>% + select(-cur_BB) %>% + glimpse() + +model_lasso <- glmnet(cur_OBP ~ ., data = df_training, alpha = 1) # alpha == 1 is LASSO, 0 is Ridge https://cran.r-project.org/web/packages/glmnetUtils/vignettes/intro.html + +mx_coefs <- coef(model_lasso) + +df_vars_selected_for_each_lambda <- tidy(mx_coefs) %>% + rename( + variable = row, + lambda_index = column, + coefficient = value + ) %>% + filter(variable != "(Intercept)") %>% + select(variable, lambda_index) %>% + group_by(lambda_index) %>% + mutate( + selected_vars = list(variable) + ) %>% + distinct(lambda_index, .keep_all = TRUE) %>% + select(-variable) # most important vars look to be OBP lags 1-3 and BB lags 1-3 + + + + + diff --git a/code/08_run_splines.R b/code/08_run_splines.R new file mode 100644 index 0000000..b2be4c6 --- /dev/null +++ b/code/08_run_splines.R @@ -0,0 +1,122 @@ + +pacman::p_load( + tidyverse, + tidylog, + here, + tidymodels, + tictoc, + ggthemes +) + +df_training_raw <- read_csv(here("data/modeling/df_training.csv")) %>% + glimpse() + +df_training <- df_training_raw %>% + filter(yearID <= 2017) %>% + select(bbrefID, yearID, cur_OBP, lagged_OBP_1:lagged_OBP_3, lagged_BB_1:lagged_BB_3) %>% + group_by(bbrefID) %>% + mutate( + across(contains("lagged"), ~ coalesce(., mean(., na.rm = TRUE))) # if lagged value is missing, replace with the player's mean for that value + ) %>% + ungroup() %>% + group_by(yearID) %>% + mutate( + across( + contains("lagged"), ~ coalesce(., mean(., na.rm = TRUE)) # if lagged value still missing, replace with the league avarage of the lag for that year + ) + ) %>% + ungroup() + +df_validation <- df_training_raw %>% + filter(yearID %in% 2018:2020) %>% + select(bbrefID, yearID, cur_OBP, lagged_OBP_1:lagged_OBP_3, lagged_BB_1:lagged_BB_3) %>% + group_by(bbrefID) %>% + mutate( + across(contains("lagged"), ~ coalesce(., mean(., na.rm = TRUE))) # if lagged value is missing, replace with the player's mean for that value + ) %>% + ungroup() %>% + group_by(yearID) %>% + mutate( + across( + contains("lagged"), ~ coalesce(., mean(., na.rm = TRUE)) # if lagged value still missing, replace with the league avarage of the lag for that year + ) + ) %>% + ungroup() + +Fit_Splines <- function(predictor_list, dof) { + + splines_recipe <- recipe(cur_OBP ~ ., data = df_training) %>% + update_role(bbrefID, yearID, new_role = "id") %>% + step_intercept() %>% + step_ns(predictor_list, deg_free = dof) %>% # https://recipes.tidymodels.org/reference/step_ns.html + prep(., strings_as_factors = FALSE) + + df_training_baked <- bake(splines_recipe, df_training) %>% + select(contains("_")) + + df_validation_baked <- bake(splines_recipe, df_validation) %>% + select(contains("_")) + + model_splines <- lm(cur_OBP ~ ., data = df_training_baked) + + df_training_fitted <- augment(model_splines) + + train_rmse <- rmse_vec( + df_training_fitted$cur_OBP, + df_training_fitted$.fitted + ) + + df_validation_fitted <- augment(x = model_splines, newdata = df_validation_baked) + + val_rmse <- rmse_vec( + df_validation_fitted$cur_OBP, + df_validation_fitted$.fitted + ) + + tibble( + "Spline DoF" = dof, + "Predictors" = list(predictor_list), + "Training RMSE" = train_rmse, + "Validation RMSE" = val_rmse + ) + +} + +v_spline_dof <- 2:10 + +df_OBP_2 <- map_dfr(v_spline_dof, ~ Fit_Splines(c("lagged_OBP_1", "lagged_OBP_2"), .)) +df_OBP_3 <- map_dfr(v_spline_dof, ~ Fit_Splines(c("lagged_OBP_1", "lagged_OBP_2", "lagged_OBP_3"), .)) +df_OBP_2_BB_1 <- map_dfr(v_spline_dof, ~ Fit_Splines(c("lagged_OBP_1", "lagged_OBP_2", "lagged_BB_1"), .)) +df_OBP_2_BB_2 <- map_dfr(v_spline_dof, ~ Fit_Splines(c("lagged_OBP_1", "lagged_OBP_2", "lagged_BB_1", "lagged_BB_2"), .)) +df_OBP_3_BB_2 <- map_dfr(v_spline_dof, ~ Fit_Splines(c("lagged_OBP_1", "lagged_OBP_2", "lagged_OBP_3", "lagged_BB_1", "lagged_BB_2"), .)) + +df_splines_models <- bind_rows( + df_OBP_2, + df_OBP_3, + df_OBP_2_BB_1, + df_OBP_2_BB_2, + df_OBP_3_BB_2 +) %>% + group_by(Predictors) %>% + mutate( + `Group ID` = cur_group_id() %>% as.factor() # https://stackoverflow.com/questions/39650511/r-group-by-variable-and-then-assign-a-unique-id + ) + +ggplot(df_splines_models, aes(x = `Spline DoF`, y = `Validation RMSE`, color = `Group ID`)) + + geom_line(size = 1) + + geom_point(size = 2) + + scale_color_ptol() + + scale_x_continuous(breaks = 2:10) + + labs( + x = "Spline Degrees of Freedom", + y = "Validation RMSE" + ) + + theme_minimal() + + theme( + panel.grid.minor = element_blank(), + text = element_text(size = 14) + ) + +df_splines_models %>% + filter(`Spline DoF` == 3, `Group ID` == 2) + diff --git a/code/09_get_test_performance.R b/code/09_get_test_performance.R new file mode 100644 index 0000000..d3720a8 --- /dev/null +++ b/code/09_get_test_performance.R @@ -0,0 +1,73 @@ + +pacman::p_load( + tidyverse, + tidylog, + tidymodels, + here +) + + +df_training_raw <- read_csv(here("data/modeling/df_training.csv")) %>% + glimpse() + +df_test_raw <- read_csv(here("data/modeling/df_test.csv")) %>% + select(bbrefID, yearID, cur_OBP = OBP, num_range("lagged_OBP_", 1:3)) %>% + glimpse() + +df_training <- df_training_raw %>% + select(bbrefID, yearID, cur_OBP, num_range("lagged_OBP_", 1:3)) %>% + group_by(bbrefID) %>% + mutate( + across(contains("lagged"), ~ coalesce(., mean(., na.rm = TRUE))) # if lagged value is missing, replace with the player's mean for that value + ) %>% + ungroup() %>% + group_by(yearID) %>% + mutate( + across( + contains("lagged"), ~ coalesce(., mean(., na.rm = TRUE)) # if lagged value still missing, replace with the league avarage of the lag for that year + ) + ) %>% + ungroup() %>% + glimpse() + +df_test_na_filled <- df_test_raw %>% + select(bbrefID, contains("lagged")) %>% + pivot_longer(., -bbrefID) %>% + group_by(bbrefID) %>% + mutate( + own_player_mean = coalesce(value, mean(value, na.rm = TRUE)) + ) %>% + mutate( + training_set_mean = coalesce(own_player_mean, .324) # mean of lags 1-3 in the training data + ) %>% + ungroup() %>% + select(bbrefID, name, OBP = training_set_mean) %>% + pivot_wider(., names_from = name, values_from = OBP) + +df_test <- df_test_na_filled %>% + inner_join(., df_test_raw %>% select(bbrefID, cur_OBP), by = "bbrefID") + +splines_recipe <- recipe(cur_OBP ~ ., data = df_training) %>% + update_role(bbrefID, yearID, new_role = "id") %>% + step_intercept() %>% + step_ns(c("lagged_OBP_1", "lagged_OBP_2", "lagged_OBP_3"), deg_free = 3) %>% + prep(., strings_as_factors = FALSE) + +df_training_baked <- bake(splines_recipe, df_training) %>% + select(contains("_")) + +df_test_baked <- bake(splines_recipe, df_test) %>% + select(contains("_")) + +df_final_model <- lm(cur_OBP ~ ., data = df_training_baked) + +df_test_fitted <- augment(df_final_model, newdata = df_test_baked) + +rmse_vec( + truth = df_test_fitted$cur_OBP, + estimate = df_test_fitted$.fitted +) + + + + diff --git a/report/figures/2019_2020_lag.png b/report/figures/2019_2020_lag.png index 17e6b59..5e7bd4c 100644 Binary files a/report/figures/2019_2020_lag.png and b/report/figures/2019_2020_lag.png differ diff --git a/report/figures/OLS_residuals.png b/report/figures/OLS_residuals.png new file mode 100644 index 0000000..3f2ccbe Binary files /dev/null and b/report/figures/OLS_residuals.png differ diff --git a/report/figures/lag_1.png b/report/figures/lag_1.png index 14e5a0b..55c494b 100644 Binary files a/report/figures/lag_1.png and b/report/figures/lag_1.png differ diff --git a/report/figures/lag_2.png b/report/figures/lag_2.png index f89a94d..40c171e 100644 Binary files a/report/figures/lag_2.png and b/report/figures/lag_2.png differ diff --git a/report/figures/lag_3.png b/report/figures/lag_3.png index 8b86825..5446d31 100644 Binary files a/report/figures/lag_3.png and b/report/figures/lag_3.png differ diff --git a/report/figures/lag_4.png b/report/figures/lag_4.png index ed68032..eab26a4 100644 Binary files a/report/figures/lag_4.png and b/report/figures/lag_4.png differ diff --git a/report/figures/lag_5.png b/report/figures/lag_5.png index b3b8260..ec9001a 100644 Binary files a/report/figures/lag_5.png and b/report/figures/lag_5.png differ diff --git a/report/figures/quartiles_ts.png b/report/figures/quartiles_ts.png new file mode 100644 index 0000000..dd2c0a0 Binary files /dev/null and b/report/figures/quartiles_ts.png differ diff --git a/report/figures/ridge_1970_1999.png b/report/figures/ridge_1970_1999.png new file mode 100644 index 0000000..e2730a2 Binary files /dev/null and b/report/figures/ridge_1970_1999.png differ diff --git a/report/figures/ridge_2000_2020.png b/report/figures/ridge_2000_2020.png index d5c23ef..fff4f54 100644 Binary files a/report/figures/ridge_2000_2020.png and b/report/figures/ridge_2000_2020.png differ diff --git a/report/figures/ridge_2013_2020.png b/report/figures/ridge_2013_2020.png index 7416788..5a06f97 100644 Binary files a/report/figures/ridge_2013_2020.png and b/report/figures/ridge_2013_2020.png differ diff --git a/report/figures/ridge_2016_2020.png b/report/figures/ridge_2016_2020.png index 0be54e2..fef6ba9 100644 Binary files a/report/figures/ridge_2016_2020.png and b/report/figures/ridge_2016_2020.png differ diff --git a/report/figures/ridge_2016_2020_OBP_per_PA.png b/report/figures/ridge_2016_2020_OBP_per_PA.png index f497b71..a3ba6f8 100644 Binary files a/report/figures/ridge_2016_2020_OBP_per_PA.png and b/report/figures/ridge_2016_2020_OBP_per_PA.png differ