diff --git a/README.md b/README.md
index 8ab6137..d6a3f54 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,24 @@
-# OBP
\ No newline at end of file
+# OBP
+
+## Goal: Predict OBP with the smallest RMSE possible for 2021 season
+
+To run: In `code/`, run the scripts 01_* through 09_* in order. If you do not have the `pacman` package you will need to install it, and then `pacman` will handle installing any additional packages you do not have installed 
+
+Additional packages used:
+* [broom](https://cran.r-project.org/web/packages/broom/vignettes/broom.html)
+* [DataExplorer](https://cran.r-project.org/web/packages/DataExplorer/vignettes/dataexplorer-intro.html)
+* [ggridges](https://cran.r-project.org/package=ggridges/vignettes/introduction.html)
+* [ggthemes](https://yutannihilation.github.io/allYourFigureAreBelongToUs/ggthemes/)
+* [glmnetUtils](https://cran.r-project.org/web/packages/glmnetUtils/vignettes/intro.html)
+* [here](https://cran.r-project.org/web/packages/here/vignettes/here.html)
+* [Lahman](https://cran.r-project.org/web/packages/Lahman/Lahman.pdf)
+* [pacman](https://cran.r-project.org/web/packages/pacman/readme/README.html)
+* [skimr](https://cran.r-project.org/web/packages/skimr/vignettes/skimr.html)
+* [tictoc](https://cran.r-project.org/package=tictoc)
+* [tidylog](https://cran.r-project.org/web/packages/tidylog/readme/README.html)
+* [tidymodels](https://www.tidymodels.org/)
+* [tidyverse](https://www.tidyverse.org/)
+* [tsibble](https://tsibble.tidyverts.org/)
+
+### Session Information:
+* R version 4.0.3
diff --git a/code/01_prepare_lahman_data.R b/code/01_prepare_lahman_data.R
index 2f75b2f..ebd9c82 100644
--- a/code/01_prepare_lahman_data.R
+++ b/code/01_prepare_lahman_data.R
@@ -11,8 +11,8 @@ pacman::p_load(
 path_batting <- here("data/lahman/raw/batting.csv")
 path_people <- here("data/lahman/raw/people.csv")
 
-write_csv(Batting, path_batting)
-write_csv(People, path_people)
+write_csv(Lahman::Batting, path_batting)
+write_csv(Lahman::People, path_people)
 
 # FanGraphs ---------------------------------------------------------------
 
@@ -47,21 +47,17 @@ df_fg_longer %>%
 
 # Lahman ------------------------------------------------------------------
 
-df_batting <- read_csv(path_batting) %>% 
+df_batting <- Lahman::Batting %>% 
   select(playerID, yearID, stint, G:GIDP) %>% 
   glimpse()
 
 df_batting %>% 
   group_by(playerID, yearID, stint) # confirm that dataset is at the player-year-stint level 
 
-df_people <- read_csv(path_people) %>% 
+df_people <- Lahman::People %>% 
   select(playerID, bbrefID, nameFirst, nameLast) %>% 
   glimpse()
 
-df_crosswalk <- read_csv(here("data/sfbb_player_id_crosswalk.csv")) %>% # https://www.smartfantasybaseball.com/tools/
-  select(bbrefID = "BREFID", fgID = "IDFANGRAPHS") %>% 
-  glimpse()
-
 df_batting_aggregated <- df_batting %>% 
   group_by(playerID, yearID) %>% 
   summarize(
@@ -84,7 +80,7 @@ df_batting_stats <- df_batting_aggregated %>%
   ) %>% 
   glimpse()
 
-df_tsibble <- as_tsibble(df_batting_stats, key = bbrefID, index = yearID) %>% 
+df_tsibble <- as_tsibble(df_batting_stats, key = bbrefID, index = yearID) %>% # https://tsibble.tidyverts.org/
   fill_gaps() %>% # make missing seasons explicit instead of implicit so that previous row = last played season
   arrange(bbrefID, yearID) %>% 
   group_by(bbrefID)
diff --git a/code/02_plot_OBP.R b/code/02_plot_OBP.R
index 34aedeb..06d6e1f 100644
--- a/code/02_plot_OBP.R
+++ b/code/02_plot_OBP.R
@@ -3,21 +3,82 @@ pacman::p_load(
   tidyverse,
   tidylog,
   here,
-  ggridges
+  ggridges,
+  ggthemes
 )
 
 df_lahman <- read_csv(here("data/lahman/derived/df_batting_lag5.csv")) %>% 
   glimpse()
 
+
+# Quartiles over time -----------------------------------------------------
+
+df_quartiles_pre_2020 <- df_lahman %>% 
+  filter(yearID <= 2019) %>% 
+  filter(cur_PA >= 100) %>% 
+  group_by(yearID) %>% 
+  summarize(
+    `25th` = quantile(cur_OBP, probs = .25, na.rm = TRUE),
+    `50th` = quantile(cur_OBP, probs = .5, na.rm = TRUE),
+    `75th` = quantile(cur_OBP, probs = .75, na.rm = TRUE),
+  ) %>%  
+  pivot_longer(., cols = -yearID, names_to = "Quartile", values_to = "OBP") %>% 
+  mutate(
+    Quartile = factor(Quartile, levels = c("75th", "50th", "25th"))
+  )
+
+df_quartiles_2020 <- df_lahman %>% 
+  filter(yearID == 2020) %>% 
+  filter(cur_PA >= 30) %>% 
+  group_by(yearID) %>% 
+  summarize(
+    `25th` = quantile(cur_OBP, probs = .25, na.rm = TRUE),
+    `50th` = quantile(cur_OBP, probs = .5, na.rm = TRUE),
+    `75th` = quantile(cur_OBP, probs = .75, na.rm = TRUE),
+  ) %>%  
+  pivot_longer(., cols = -yearID, names_to = "Quartile", values_to = "OBP") %>% 
+  mutate(
+    Quartile = factor(Quartile, levels = c("75th", "50th", "25th"))
+  )
+
+df_quartiles <- bind_rows(
+  df_quartiles_pre_2020,
+  df_quartiles_2020
+)
+
+p_quartiles <- ggplot(df_quartiles, aes(x = yearID, y = OBP, color = Quartile)) +
+  geom_line(size = 1) +
+  geom_point(size = 1.5) +
+  scale_x_continuous(breaks = seq(1870, 2020, 10)) +
+  scale_color_ptol() +
+  geom_vline(xintercept = 1970, linetype = "dashed") +
+  theme_minimal() +
+  labs(
+    x = "Season"
+  ) +
+  theme(
+    panel.grid = element_blank()
+  ) 
+
+ggsave(plot = p_quartiles, filename = here("report/figures/quartiles_ts.png"))
+
 # Lag Plots ---------------------------------------------------------------
 
-df_lag_plot <- df_lahman %>% 
-  filter(yearID %in% 2015:2020) %>% 
-  filter(
-    if_all(contains("OBP"), ~ between(., .001, .999))
-  ) %>% 
+df_lag_plot_pre_2020 <- df_lahman %>% 
+  filter(yearID %in% 1970:2019) %>% 
+  filter(cur_PA >= 100) %>% 
   select(bbrefID, yearID, contains("OBP"))
 
+df_lag_plot_2020 <- df_lahman %>% 
+  filter(yearID == 2020) %>% 
+  filter(cur_PA >= 30) %>% 
+  select(bbrefID, yearID, contains("OBP"))
+
+df_lag_plot <- bind_rows(
+  df_lag_plot_pre_2020,
+  df_lag_plot_2020
+)
+
 Plot_Lags <- function(plotting_data, num_lag, save = FALSE) {
   
   x_axis <- paste0("lagged_OBP_", num_lag)
@@ -27,7 +88,7 @@ Plot_Lags <- function(plotting_data, num_lag, save = FALSE) {
     geom_point(alpha = .3, size = 1.5) +
     geom_smooth(se = FALSE, color = "#4477AA", size = 1) +
     geom_abline(slope = 1, linetype = "dashed", color = "#CC6677", size = 1) +
-    scale_x_continuous(limits = c(0, .7), expand = expansion(0, 0)) +
+    scale_x_continuous(limits = c(0, .7), expand = expansion(.05, 0)) +
     scale_y_continuous(limits = c(0, .7), expand = expansion(0, 0)) +
     labs(
       x = x_axis_title,
@@ -45,6 +106,9 @@ Plot_Lags <- function(plotting_data, num_lag, save = FALSE) {
     
     ggsave(plot = p, filename = path_plot)
     
+  } else {
+    
+    return(p)
   }
   
 }
@@ -54,76 +118,47 @@ v_lags <- 1:5
 walk(v_lags, ~ Plot_Lags(df_lag_plot, ., save = TRUE))
 
 
-# 2019 vs. 2020 to see if relationship persists ---------------------------
-
-df_2020 <- df_lahman %>% 
-  filter(yearID == 2020) %>% 
-  select(bbrefID, yearID, cur_OBP, lagged_OBP_1) %>% 
-  filter(
-    if_all(contains("OBP"), ~ between(., .001, .999))
-  )
-
-p <- ggplot(data = df_2020, aes(x = lagged_OBP_1, y = cur_OBP)) +
-  geom_point(alpha = .3, size = 1.5) +
-  geom_smooth(se = FALSE, color = "#4477AA", size = 1) +
-  geom_abline(slope = 1, linetype = "dashed", color = "#CC6677", size = 1) +
-  scale_x_continuous(limits = c(0, .7), expand = expansion(0, 0)) +
-  scale_y_continuous(limits = c(0, .7), expand = expansion(0, 0)) +
-  labs(
-    x = "2019 OBP",
-    y = "2020 OBP"
-  ) +
-  theme_minimal() +
-  theme(
-    panel.grid.minor = element_blank()
-  )
-
-ggsave(plot = p, filename = here("report/figures/2019_2020_lag.png"))
-
 # Ridge Plots -------------------------------------------------------------
 
-median_2019 <- df_ridge_plot %>% 
-  filter(yearID == 2019) %>% 
-  pull(cur_OBP) %>% 
-  median(., na.rm = TRUE)
-
-Plot_Ridges <- function(start_year, save = FALSE) {
+Plot_Ridges <- function(start_year, end_year, save = FALSE) {
   
   df_ridge_plot <- df_lahman %>% 
-    filter(yearID %in% start_year:2020) %>% 
-    filter(between(cur_OBP, .001, .999)) %>% 
+    filter(yearID %in% start_year:end_year) %>% 
+    filter(cur_PA > 30) %>% 
     select(bbrefID, yearID, cur_OBP)
   
   p <- ggplot(df_ridge_plot, aes(x = cur_OBP, y = factor(yearID))) +
     geom_density_ridges(fill = "#4477AA") +
-    scale_x_continuous(expand = expansion(0, 0)) +
+    scale_x_continuous(expand = expansion(0, 0), breaks = c(0, .2, .4, .6)) +
     scale_y_discrete(expand = expansion(0, 0)) +
     labs(
       x = "OBP",
       y = "Season"
     ) +
-    geom_vline(xintercept = median_2019, linetype = "dashed", color = "#CC6677", size = 1) +
     theme_minimal()
   
   if(save == TRUE) {
     
-    file_name <- paste0("ridge_", start_year, "_2020.png")
+    file_name <- paste0("ridge_", start_year, "_", end_year, ".png")
     path_plot <- here("report/figures/", file_name)
     
     ggsave(plot = p, filename = path_plot)
     
+  } else{
+    
+    return(p)
   }
   
 }
 
-v_ridge_start_years <- c(2000, 2013, 2016)
-
-walk(v_ridge_start_years, ~ Plot_Ridges(., save = TRUE))
+v_ridge_start_years <- c(1970, 2000)
+v_ridge_end_years <- c(1999, 2020)
 
+walk2(v_ridge_start_years, v_ridge_end_years, ~ Plot_Ridges(.x, .y, save = TRUE))
 
 df_ridge_plot <- df_lahman %>% 
   filter(yearID %in% 2016:2020) %>% 
-  filter(between(cur_OBP, .001, .999)) %>% 
+  filter(cur_PA >= 30) %>% 
   transmute(
     bbrefID, 
     yearID, 
@@ -141,5 +176,3 @@ p <- ggplot(df_ridge_plot, aes(x = OBP_per_PA, y = factor(yearID))) +
   theme_minimal()
 
 ggsave(plot = p, filename = here("report/figures/ridge_2016_2020_OBP_per_PA.png"))
-
-
diff --git a/code/03_perform_EDA.R b/code/03_perform_EDA.R
index 967633e..06a5414 100644
--- a/code/03_perform_EDA.R
+++ b/code/03_perform_EDA.R
@@ -9,7 +9,7 @@ pacman::p_load(
 
 
 df_lahman <- read_csv(here("data/lahman/derived/df_batting_lag5.csv")) %>% 
-  filter(between(yearID, 1995, 2020)) %>% 
+  filter(between(yearID, 1970, 2020)) %>% 
   glimpse()
 
 skim(df_lahman)
@@ -27,3 +27,34 @@ df_lahman %>%
   select(contains("_")) %>% 
   na.omit() %>% 
   DataExplorer::plot_prcomp(., variance_cap = .60)
+
+df_fg <- read_csv("data/fangraphs/raw/obp.csv") %>% 
+  mutate(
+    playerid = as.character(playerid)
+  ) %>% 
+  select(-OBP_21, -PA_21)
+
+
+df_fg %>% 
+  map_dbl(~ sum(is.na(.)))
+
+skimr::skim(df)
+
+df_fg %>% 
+  select(playerid, contains("OBP")) %>% 
+  pivot_longer(-playerid) %>%
+  group_by(playerid) %>% 
+  summarize(
+    num_missing = sum(is.na(value))
+  ) %>% 
+  filter(num_missing >1)
+
+df_fg %>% 
+  select(playerid, num_range("PA_", 16:20)) %>% 
+  filter(
+    PA_16 >= 100,
+    PA_17 >= 100,
+    PA_18 >= 100,
+    PA_19 >= 100,
+    PA_20 >= 30,
+  )
diff --git a/code/04_prepare_data_splits.R b/code/04_prepare_data_splits.R
new file mode 100644
index 0000000..a1063d5
--- /dev/null
+++ b/code/04_prepare_data_splits.R
@@ -0,0 +1,154 @@
+
+pacman::p_load(
+  tidyverse,
+  tidylog,
+  here,
+  Lahman
+)
+
+df_lahman <- read_csv(here("data/lahman/derived/df_batting_lag5.csv")) %>% 
+  glimpse()
+
+df_fg <- read_csv(here("data/fangraphs/raw/obp.csv")) %>% 
+  select(-birth_date) %>% 
+  rename(
+    "fgID" = playerid
+  ) %>% 
+  glimpse() 
+
+df_crosswalk <- read_csv(here("data/sfbb_player_id_crosswalk.csv")) %>% # "https://www.smartfantasybaseball.com/tools/"
+  select(bbrefID = "BREFID", fgID = "IDFANGRAPHS", PLAYERNAME) %>% 
+  glimpse()
+
+df_fg_OBP_longer <- df_fg %>% 
+  select(Name, fgID, starts_with("OBP")) %>% 
+  pivot_longer(., starts_with("OBP"), names_to = "yearID", values_to = "OBP", names_prefix = "OBP_") %>% 
+  mutate(
+    yearID = 2000 + as.integer(yearID)
+  ) %>% 
+  glimpse()
+
+df_fg_PA_longer <- df_fg %>% 
+  select(Name, fgID, starts_with("PA")) %>% 
+  pivot_longer(., starts_with("PA"), names_to = "yearID", values_to = "PA", names_prefix = "PA_") %>% 
+  mutate(
+    yearID = 2000 + as.integer(yearID)
+  ) %>% 
+  glimpse()
+
+df_fg_longer <- inner_join(df_fg_OBP_longer, df_fg_PA_longer, by = c("fgID", "yearID", "Name")) %>% 
+  mutate(
+    fgID = as.character(fgID)
+  )
+
+df_fg_longer %>% 
+  group_by(fgID, yearID) # confirm that dataset is now at the player-year level
+
+df_fg_crosswalk <- df_fg_longer %>% 
+  left_join(., df_crosswalk, by = "fgID")
+
+df_fg_crosswalk %>% 
+  select(Name, PLAYERNAME, everything()) %>% 
+  distinct(Name, .keep_all = TRUE)  # confirm that for players with bbrefID they are merged correctly with fgID
+
+
+df_missing_bbrefid <- df_fg_longer %>% 
+  left_join(., df_crosswalk, by = "fgID") %>% 
+  filter(is.na(bbrefID))
+
+
+df_missing_bbrefid_filled <- df_missing_bbrefid %>% 
+  select(Name, fgID, bbrefID) %>% 
+  distinct(Name, .keep_all = TRUE) %>% 
+  print() %>% 
+  mutate(
+    bbrefID = c( # manually add missing bbref IDs from bbref URL
+      "marchra01",
+      "deverjo01",
+      "fargajo01",
+      "waltodo01",
+      "craigwi01",
+      "ramoshe01",
+      "palacjo01",
+      "mazeipa01",
+      "riverse01",
+      "gittech01",
+      "celesgi01",
+      "godoyjo01",
+      "hagerja01",
+      "heathni01",
+      "deichgr01"
+    )
+  ) %>% 
+  print() %>% 
+  select(-Name)
+
+df_missing_merged <- df_missing_bbrefid %>% 
+  select(-bbrefID) %>% 
+  left_join(., df_missing_bbrefid_filled, by = "fgID")
+
+df_fg_missing_resolved <- df_fg_crosswalk %>% 
+  filter(!is.na(bbrefID)) %>% 
+  bind_rows(df_missing_merged)
+
+df_fg_missing_resolved %>% 
+  group_by(Name, yearID) %>% 
+  count(., sort = TRUE) %>% 
+  filter(n > 1) # hanlde Ohtani showing up as pitcher and hitter
+
+df_fg_cleaned <- df_fg_missing_resolved %>% 
+  distinct(bbrefID, yearID, .keep_all = TRUE) # remove duplicated Ohtani entry
+
+df_fg_cleaned %>% 
+  group_by(fgID) # confirm there are still 572 players
+
+df_fg_cleaned %>% 
+  group_by(fgID, yearID) # confirm data at year-playerid level
+
+df_fg_cleaned %>% 
+  left_join(., df_lahman, by = c("bbrefID", "yearID")) %>% 
+  transmute(
+    Name,
+    bbrefID,
+    yearID,
+    OBP,
+    cur_OBP,
+    delta_OBP = OBP - cur_OBP
+  ) %>% 
+  pull(delta_OBP) %>% 
+  qplot()
+
+df_test <- left_join(df_fg_cleaned, df_lahman, by = c("bbrefID", "yearID")) %>% 
+  filter(yearID == 2021) %>% 
+  select(Name, fgID, bbrefID, yearID, OBP, PA, starts_with("lagged_")) %>% 
+  glimpse()
+
+write_csv(df_test, here("data/modeling/df_test.csv"))
+
+df_training_2020 <- df_lahman %>% 
+  filter(yearID == 2020) %>% 
+  filter(cur_PA >= 30) %>% 
+  mutate(
+    Name = paste(nameFirst, nameLast)
+  ) %>% 
+  select(Name, bbrefID, yearID, cur_OBP, starts_with("lagged_")) %>% 
+  glimpse()
+
+df_training_1975_2019 <- df_lahman %>% 
+  filter(between(yearID, 1975, 2019)) %>% 
+  filter(cur_PA >= 100) %>% 
+  mutate(
+    Name = paste(nameFirst, nameLast)
+  ) %>% 
+  select(Name, bbrefID, yearID, cur_OBP, cur_BB, starts_with("lagged_")) %>% 
+  glimpse()
+
+df_training <- bind_rows(
+  df_training_1975_2019,
+  df_training_2020
+)
+
+df_training %>% 
+  group_by(yearID, bbrefID)
+
+write_csv(df_training, here("data/modeling/df_training.csv"))
diff --git a/code/05_generate_sample_mean_predictions.R b/code/05_generate_sample_mean_predictions.R
new file mode 100644
index 0000000..7214bfd
--- /dev/null
+++ b/code/05_generate_sample_mean_predictions.R
@@ -0,0 +1,42 @@
+
+pacman::p_load(
+  tidyverse,
+  tidylog,
+  here,
+  yardstick
+)
+
+df_training_raw <- read_csv(here("data/modeling/df_training.csv")) %>% 
+  glimpse()
+
+df_training <- df_training_raw %>% 
+  filter(between(yearID, 1975, 2017)) %>%  
+  select(bbrefID, yearID, cur_OBP) %>% 
+  group_by(bbrefID) %>% 
+  summarize(
+    fitted_OBP = mean(cur_OBP, na.rm = TRUE)
+  ) %>% 
+  print()
+
+median_obp_2013_2017 <- df_training_raw %>% 
+  filter(between(yearID, 2013, 2017)) %>% 
+  pull(cur_OBP) %>% 
+  median(., na.rm = TRUE)
+
+df_validation <- df_training_raw %>% 
+  filter(yearID %in% 2018:2020) %>% 
+  select(bbrefID, yearID, cur_OBP) %>%  
+  left_join(., df_training, by = "bbrefID") %>% 
+  mutate(
+    fitted_OBP = coalesce(fitted_OBP, median_obp_2013_2017) # for players with no obserations before 2018, replace with league median OBP 2013-2017
+  ) %>% 
+  print()
+
+rmse_vec( # https://yardstick.tidymodels.org/reference/rmse.html
+  truth = df_validation$cur_OBP,
+  estimate = df_validation$fitted_OBP
+)
+
+
+
+
diff --git a/code/06_run_OBP_OLS.R b/code/06_run_OBP_OLS.R
new file mode 100644
index 0000000..d8621af
--- /dev/null
+++ b/code/06_run_OBP_OLS.R
@@ -0,0 +1,57 @@
+
+pacman::p_load(
+  tidyverse,
+  tidylog,
+  here,
+  yardstick,
+  broom
+)
+
+df_training_raw <- read_csv(here("data/modeling/df_training.csv")) %>% 
+  glimpse()
+
+df_training <- df_training_raw %>% 
+  filter(between(yearID, 1975, 2017)) %>% 
+  select(contains("OBP")) %>% 
+  print()
+
+df_validation <- df_training_raw %>% 
+  filter(yearID %in% 2018:2020) %>% 
+  select(contains("OBP")) %>% 
+  print()
+
+
+model_obp <- lm(cur_OBP ~ ., data = df_training)
+
+df_training_fitted <- augment(model_obp)
+
+rmse_vec(
+  truth = df_training_fitted$cur_OBP,
+  estimate = df_training_fitted$.fitted
+)
+
+ggplot(df_training_fitted, aes(x = cur_OBP, y = .resid)) +
+  geom_point() +
+  theme_minimal()
+
+df_validation_fitted <- augment(x = model_obp, newdata = df_validation)
+
+rmse_vec(
+  truth = df_validation_fitted$cur_OBP,
+  estimate = df_validation_fitted$.fitted,
+  na_rm = TRUE
+)
+
+p_resid <- ggplot(df_validation_fitted, aes(x = cur_OBP, y = .resid)) +
+  geom_point() +
+  theme_minimal() +
+  labs(
+    x = "True OBP",
+    y = "OBP Residual"
+  ) +
+  theme(
+    panel.grid.minor = element_blank()
+  ) 
+
+ggsave(plot = p_resid, filename = here("report/figures/OLS_residuals.png"))
+  
diff --git a/code/07_run_LASSO.R b/code/07_run_LASSO.R
new file mode 100644
index 0000000..8844fc5
--- /dev/null
+++ b/code/07_run_LASSO.R
@@ -0,0 +1,47 @@
+
+pacman::p_load(
+  tidyverse,
+  tidylog,
+  here,
+  yardstick,
+  glmnetUtils
+)
+
+df_training_raw <- read_csv(here("data/modeling/df_training.csv")) %>% 
+  glimpse()
+
+df_training <- df_training_raw %>% 
+  filter(yearID <= 2017) %>% 
+  select(contains("_")) %>% 
+  select(-cur_BB) %>% 
+  glimpse()
+  
+df_validation <- df_training_raw %>% 
+  filter(yearID %in% 2018:2020) %>% 
+  select(contains("_")) %>% 
+  select(-cur_BB) %>% 
+  glimpse()
+
+model_lasso <- glmnet(cur_OBP ~ ., data = df_training, alpha = 1) # alpha == 1 is LASSO, 0 is Ridge https://cran.r-project.org/web/packages/glmnetUtils/vignettes/intro.html 
+
+mx_coefs <- coef(model_lasso)
+
+df_vars_selected_for_each_lambda <- tidy(mx_coefs) %>% 
+  rename(
+    variable = row,
+    lambda_index = column,
+    coefficient = value
+  ) %>% 
+  filter(variable != "(Intercept)") %>% 
+  select(variable, lambda_index) %>% 
+  group_by(lambda_index) %>% 
+  mutate(
+    selected_vars = list(variable)
+  ) %>% 
+  distinct(lambda_index, .keep_all = TRUE) %>% 
+  select(-variable) # most important vars look to be OBP lags 1-3 and BB lags 1-3
+
+
+
+
+
diff --git a/code/08_run_splines.R b/code/08_run_splines.R
new file mode 100644
index 0000000..b2be4c6
--- /dev/null
+++ b/code/08_run_splines.R
@@ -0,0 +1,122 @@
+
+pacman::p_load(
+  tidyverse,
+  tidylog,
+  here,
+  tidymodels,
+  tictoc,
+  ggthemes
+)
+
+df_training_raw <- read_csv(here("data/modeling/df_training.csv")) %>% 
+  glimpse()
+
+df_training <- df_training_raw %>% 
+  filter(yearID <= 2017) %>% 
+  select(bbrefID, yearID, cur_OBP, lagged_OBP_1:lagged_OBP_3, lagged_BB_1:lagged_BB_3) %>% 
+  group_by(bbrefID) %>% 
+  mutate(
+    across(contains("lagged"), ~ coalesce(., mean(., na.rm = TRUE))) # if lagged value is missing, replace with the player's mean for that value
+  ) %>% 
+  ungroup() %>%
+  group_by(yearID) %>% 
+  mutate(
+    across(
+      contains("lagged"), ~ coalesce(., mean(., na.rm = TRUE)) # if lagged value still missing, replace with the league avarage of the lag for that year
+    )
+  ) %>% 
+  ungroup()
+
+df_validation <- df_training_raw %>% 
+  filter(yearID %in% 2018:2020) %>% 
+  select(bbrefID, yearID, cur_OBP, lagged_OBP_1:lagged_OBP_3, lagged_BB_1:lagged_BB_3) %>% 
+  group_by(bbrefID) %>% 
+  mutate(
+    across(contains("lagged"), ~ coalesce(., mean(., na.rm = TRUE))) # if lagged value is missing, replace with the player's mean for that value
+  ) %>% 
+  ungroup() %>%
+  group_by(yearID) %>% 
+  mutate(
+    across(
+      contains("lagged"), ~ coalesce(., mean(., na.rm = TRUE)) # if lagged value still missing, replace with the league avarage of the lag for that year
+    )
+  ) %>% 
+  ungroup()
+
+Fit_Splines <- function(predictor_list, dof) {
+  
+  splines_recipe <- recipe(cur_OBP ~ ., data = df_training) %>% 
+    update_role(bbrefID, yearID, new_role = "id") %>% 
+    step_intercept() %>% 
+    step_ns(predictor_list, deg_free = dof) %>% # https://recipes.tidymodels.org/reference/step_ns.html
+    prep(., strings_as_factors = FALSE)
+  
+  df_training_baked <- bake(splines_recipe, df_training) %>% 
+    select(contains("_"))
+  
+  df_validation_baked <- bake(splines_recipe, df_validation) %>% 
+    select(contains("_"))
+  
+  model_splines <- lm(cur_OBP ~ ., data = df_training_baked) 
+  
+  df_training_fitted <- augment(model_splines)
+  
+  train_rmse <- rmse_vec(
+    df_training_fitted$cur_OBP,
+    df_training_fitted$.fitted
+  )
+  
+  df_validation_fitted <- augment(x = model_splines, newdata = df_validation_baked)
+  
+  val_rmse <- rmse_vec(
+    df_validation_fitted$cur_OBP,
+    df_validation_fitted$.fitted
+  )
+  
+  tibble(
+    "Spline DoF" = dof,
+    "Predictors" = list(predictor_list),
+    "Training RMSE" = train_rmse,
+    "Validation RMSE"  = val_rmse 
+  )
+  
+}
+
+v_spline_dof <- 2:10
+
+df_OBP_2 <- map_dfr(v_spline_dof, ~ Fit_Splines(c("lagged_OBP_1", "lagged_OBP_2"), .))
+df_OBP_3 <- map_dfr(v_spline_dof, ~ Fit_Splines(c("lagged_OBP_1", "lagged_OBP_2", "lagged_OBP_3"), .))
+df_OBP_2_BB_1 <- map_dfr(v_spline_dof, ~ Fit_Splines(c("lagged_OBP_1", "lagged_OBP_2", "lagged_BB_1"), .))
+df_OBP_2_BB_2 <- map_dfr(v_spline_dof, ~ Fit_Splines(c("lagged_OBP_1", "lagged_OBP_2", "lagged_BB_1",  "lagged_BB_2"), .))
+df_OBP_3_BB_2 <- map_dfr(v_spline_dof, ~ Fit_Splines(c("lagged_OBP_1", "lagged_OBP_2", "lagged_OBP_3", "lagged_BB_1", "lagged_BB_2"), .))
+
+df_splines_models <- bind_rows(
+  df_OBP_2,
+  df_OBP_3,
+  df_OBP_2_BB_1,
+  df_OBP_2_BB_2,
+  df_OBP_3_BB_2
+) %>% 
+  group_by(Predictors) %>% 
+  mutate(
+    `Group ID` = cur_group_id() %>% as.factor() # https://stackoverflow.com/questions/39650511/r-group-by-variable-and-then-assign-a-unique-id
+  )
+  
+ggplot(df_splines_models, aes(x = `Spline DoF`, y = `Validation RMSE`, color = `Group ID`)) +
+  geom_line(size = 1) +
+  geom_point(size = 2) +
+  scale_color_ptol() +
+  scale_x_continuous(breaks = 2:10) +
+  labs(
+    x = "Spline Degrees of Freedom",
+    y = "Validation RMSE"
+  ) +
+  theme_minimal() +
+  theme(
+    panel.grid.minor = element_blank(),
+    text = element_text(size = 14)
+  )
+
+df_splines_models %>% 
+  filter(`Spline DoF` == 3, `Group ID` == 2)
+
diff --git a/code/09_get_test_performance.R b/code/09_get_test_performance.R
new file mode 100644
index 0000000..d3720a8
--- /dev/null
+++ b/code/09_get_test_performance.R
@@ -0,0 +1,73 @@
+
+pacman::p_load(
+  tidyverse,
+  tidylog,
+  tidymodels,
+  here
+)
+
+
+df_training_raw <- read_csv(here("data/modeling/df_training.csv")) %>% 
+  glimpse()
+
+df_test_raw <- read_csv(here("data/modeling/df_test.csv")) %>% 
+  select(bbrefID, yearID, cur_OBP = OBP, num_range("lagged_OBP_", 1:3)) %>% 
+  glimpse()
+
+df_training <- df_training_raw %>% 
+  select(bbrefID, yearID, cur_OBP, num_range("lagged_OBP_", 1:3)) %>% 
+  group_by(bbrefID) %>% 
+  mutate(
+    across(contains("lagged"), ~ coalesce(., mean(., na.rm = TRUE))) # if lagged value is missing, replace with the player's mean for that value
+  ) %>% 
+  ungroup() %>%
+  group_by(yearID) %>% 
+  mutate(
+    across(
+      contains("lagged"), ~ coalesce(., mean(., na.rm = TRUE)) # if lagged value still missing, replace with the league avarage of the lag for that year
+    )
+  ) %>% 
+  ungroup() %>% 
+  glimpse()
+
+df_test_na_filled <- df_test_raw %>% 
+  select(bbrefID, contains("lagged")) %>% 
+  pivot_longer(., -bbrefID) %>% 
+  group_by(bbrefID) %>% 
+  mutate(
+    own_player_mean = coalesce(value, mean(value, na.rm = TRUE))
+  ) %>% 
+  mutate(
+    training_set_mean = coalesce(own_player_mean, .324) # mean of lags 1-3 in the training data
+  ) %>% 
+  ungroup() %>% 
+  select(bbrefID, name, OBP = training_set_mean) %>% 
+  pivot_wider(., names_from = name, values_from = OBP)
+
+df_test <- df_test_na_filled %>% 
+  inner_join(., df_test_raw %>% select(bbrefID, cur_OBP), by = "bbrefID")
+
+splines_recipe <- recipe(cur_OBP ~ ., data = df_training) %>% 
+  update_role(bbrefID, yearID, new_role = "id") %>% 
+  step_intercept() %>% 
+  step_ns(c("lagged_OBP_1", "lagged_OBP_2", "lagged_OBP_3"), deg_free = 3) %>% 
+  prep(., strings_as_factors = FALSE)
+
+df_training_baked <- bake(splines_recipe, df_training) %>% 
+  select(contains("_"))
+
+df_test_baked <- bake(splines_recipe, df_test) %>% 
+  select(contains("_"))
+
+df_final_model <- lm(cur_OBP ~ ., data = df_training_baked)
+
+df_test_fitted <- augment(df_final_model, newdata = df_test_baked)
+
+rmse_vec(
+  truth = df_test_fitted$cur_OBP,
+  estimate = df_test_fitted$.fitted
+)
+
+
+
+
diff --git a/report/figures/2019_2020_lag.png b/report/figures/2019_2020_lag.png
index 17e6b59..5e7bd4c 100644
Binary files a/report/figures/2019_2020_lag.png and b/report/figures/2019_2020_lag.png differ
diff --git a/report/figures/OLS_residuals.png b/report/figures/OLS_residuals.png
new file mode 100644
index 0000000..3f2ccbe
Binary files /dev/null and b/report/figures/OLS_residuals.png differ
diff --git a/report/figures/lag_1.png b/report/figures/lag_1.png
index 14e5a0b..55c494b 100644
Binary files a/report/figures/lag_1.png and b/report/figures/lag_1.png differ
diff --git a/report/figures/lag_2.png b/report/figures/lag_2.png
index f89a94d..40c171e 100644
Binary files a/report/figures/lag_2.png and b/report/figures/lag_2.png differ
diff --git a/report/figures/lag_3.png b/report/figures/lag_3.png
index 8b86825..5446d31 100644
Binary files a/report/figures/lag_3.png and b/report/figures/lag_3.png differ
diff --git a/report/figures/lag_4.png b/report/figures/lag_4.png
index ed68032..eab26a4 100644
Binary files a/report/figures/lag_4.png and b/report/figures/lag_4.png differ
diff --git a/report/figures/lag_5.png b/report/figures/lag_5.png
index b3b8260..ec9001a 100644
Binary files a/report/figures/lag_5.png and b/report/figures/lag_5.png differ
diff --git a/report/figures/quartiles_ts.png b/report/figures/quartiles_ts.png
new file mode 100644
index 0000000..dd2c0a0
Binary files /dev/null and b/report/figures/quartiles_ts.png differ
diff --git a/report/figures/ridge_1970_1999.png b/report/figures/ridge_1970_1999.png
new file mode 100644
index 0000000..e2730a2
Binary files /dev/null and b/report/figures/ridge_1970_1999.png differ
diff --git a/report/figures/ridge_2000_2020.png b/report/figures/ridge_2000_2020.png
index d5c23ef..fff4f54 100644
Binary files a/report/figures/ridge_2000_2020.png and b/report/figures/ridge_2000_2020.png differ
diff --git a/report/figures/ridge_2013_2020.png b/report/figures/ridge_2013_2020.png
index 7416788..5a06f97 100644
Binary files a/report/figures/ridge_2013_2020.png and b/report/figures/ridge_2013_2020.png differ
diff --git a/report/figures/ridge_2016_2020.png b/report/figures/ridge_2016_2020.png
index 0be54e2..fef6ba9 100644
Binary files a/report/figures/ridge_2016_2020.png and b/report/figures/ridge_2016_2020.png differ
diff --git a/report/figures/ridge_2016_2020_OBP_per_PA.png b/report/figures/ridge_2016_2020_OBP_per_PA.png
index f497b71..a3ba6f8 100644
Binary files a/report/figures/ridge_2016_2020_OBP_per_PA.png and b/report/figures/ridge_2016_2020_OBP_per_PA.png differ