CI_Assignment.Rmd

---
title: "CI_Assignment"
output: html_document
---

```{r setup, include = FALSE}

knitr::opts_chunk$set(echo = TRUE)

```

## Packages 

```{r include = FALSE}

library(tidyverse)
library(ggplot2)
library(scales)
library(gridExtra)
library(corrplot)
library(Hmisc)
library(mice)
library(MatchIt)
library(boot)

```

## Flags 

```{r}

# Imputation strategy switches.
# When every switch is FALSE, rows containing NAs are dropped instead of imputed.
mean_imputation <- FALSE
median_imputation <- FALSE
mice_imputation <- TRUE

# Number of imputed datasets used by the MICE branches below
n_mice_imputations <- 5

```

## Initial Data Loading and Checks 

```{r}

# Read every raw input table from the working directory.
# read.csv() treats the first row as column names by default.

# Scotland: alcohol related hospital stays by local authority
filepath <- "scotland_hospital_visits_local_authority.csv"
scotland_hospital_visits_local_authority <- read.csv(filepath)

# Scotland: alcohol related hospital stays, national
filepath <- "scotland_hospital_visits_whole.csv"
scotland_hospital_visits_whole <- read.csv(filepath)

# Scotland: population by local authority and national
filepath <- "scotland_population_local_authority_and_whole.csv"
scotland_population_local_authority_whole <- read.csv(filepath)

# England: alcohol related hospital stays per 100k by local authority
filepath <- "england_hospital_visits_per100k_local_authority_joined.csv"
england_hospital_visits_local_authority <- read.csv(filepath)

# UK: annual population survey by local authority - 2017
filepath <- "annual_population_survey.csv"
uk_pop_survey_2017 <- read.csv(filepath)

# UK: annual hours and earnings survey by local authority - 2017
filepath <- "annual_hours_and_earnings_survey.csv"
uk_earnings_survey_2017 <- read.csv(filepath)

# UK: annual population survey by age, sex and local authority - 2017
filepath <- "annual_population_age_sex_survey.csv"
uk_pop_ages_sex_survey_2017 <- read.csv(filepath)

```

```{r}

# Preprocess data 

# Scotland alcohol related hospital visits by local authority

# Source spelling -> spelling used by the other datasets.
# Names that are not listed here are kept unchanged.
local_authority_renames <- c(
  "Edinburgh City" = "City of Edinburgh",
  "Ayrshire East" = "East Ayrshire",
  "Dunbartonshire East" = "East Dunbartonshire",
  "Lothian East" = "East Lothian",
  "Renfrewshire East" = "East Renfrewshire",
  "Ayrshire North" = "North Ayrshire",
  "Lanarkshire North" = "North Lanarkshire",
  "Borders" = "Scottish Borders",
  "Ayrshire South" = "South Ayrshire",
  "Lanarkshire South" = "South Lanarkshire",
  "Dunbartonshire West" = "West Dunbartonshire",
  "Lothian West" = "West Lothian"
)

scotland_hospital_visits_local_authority_clean <- scotland_hospital_visits_local_authority %>%
  dplyr::select(financial_year, local_authority, sg_code, stays) %>%
  dplyr::mutate(
    # The first four characters of the financial year label are its starting year
    year = as.numeric(str_sub(str_trim(financial_year), start = 1, end = 4)),
    # Look up the harmonised name; fall back to the original when there is none
    local_authority = dplyr::coalesce(
      unname(local_authority_renames[local_authority]),
      local_authority
    )
  )

# Count missing values per column
colSums(is.na(scotland_hospital_visits_local_authority_clean))

# Scotland alcohol related hospital visits, national series
scotland_hospital_visits_whole_clean <- scotland_hospital_visits_whole %>%
  dplyr::select(financial_year, grouping, stays) %>%
  dplyr::mutate(
    # The first four characters of the financial year label are its starting year
    year = as.numeric(str_sub(str_trim(financial_year), start = 1, end = 4))
  )

# Count missing values per column
colSums(is.na(scotland_hospital_visits_whole_clean))

# Scotland yearly population by local authority (national "Scotland" rows removed)
scotland_population_local_authority_clean <- scotland_population_local_authority_whole %>%
  dplyr::filter(local_authority != "Scotland") %>%
  dplyr::mutate(year = as.numeric(year))

# Count missing values per column
colSums(is.na(scotland_population_local_authority_clean))

# Check local authority names match between the hospital and population data.
# A set comparison is used because an element-wise `==` on sorted vectors
# recycles when the lengths differ and misaligns after the first mismatch.
hospital_authority_names <- unique(scotland_hospital_visits_local_authority_clean$local_authority)
population_authority_names <- unique(scotland_population_local_authority_clean$local_authority)

# TRUE when both datasets contain exactly the same names
setequal(hospital_authority_names, population_authority_names)

# Names present in only one of the two datasets (both empty when they match)
setdiff(hospital_authority_names, population_authority_names)
setdiff(population_authority_names, hospital_authority_names)

# Scotland yearly population, national rows only
scotland_population_whole_clean <- scotland_population_local_authority_whole %>%
  dplyr::filter(local_authority == "Scotland")

# Scotland alcohol related hospital stays per 100,000 population, by local authority.
# Population is attached by local authority and year before the rate is derived.
scotland_hospital_visits_local_authority_per100k <- dplyr::left_join(
  scotland_hospital_visits_local_authority_clean,
  scotland_population_local_authority_clean,
  by = c("local_authority", "year")
) %>%
  dplyr::mutate(stays_per100k = (stays * 100000) / population)
# Optional check (disabled): drop the border local authorities by appending
#   %>% dplyr::filter(local_authority != "Dumfries and Galloway" & local_authority != "Scottish Borders")

# Scotland alcohol related hospital stays per 100,000 population, national.
# Only years from 1997 onwards are kept.
scotland_hospital_visits_whole_per100k <- dplyr::left_join(
  scotland_hospital_visits_whole_clean,
  scotland_population_whole_clean,
  by = "year"
) %>%
  dplyr::mutate(stays_per100k = (stays * 100000) / population) %>%
  dplyr::filter(year >= 1997)

# England alcohol related hospital stays per 100,000 by local authority.
# Rows without a numeric rate are dropped, then only local authorities with
# at least 8 remaining observations are retained.
england_hospital_visits_local_authority_per100k <- england_hospital_visits_local_authority %>%
  dplyr::transmute(
    financial_year,
    # The first four characters of the financial year label are its starting year
    year = as.numeric(str_sub(str_trim(financial_year), start = 1, end = 4)),
    local_authority,
    stays_per100k = as.numeric(stays_per100k)
  ) %>%
  dplyr::filter(!is.na(stays_per100k)) %>%
  dplyr::group_by(local_authority) %>%
  dplyr::filter(n() >= 8) %>%
  dplyr::ungroup()

# UK annual population survey data by local authority - 2017
labour_rate_columns <- c(
  "economic_activity_rate_16to64",
  "employment_rate_16to64",
  "unemployment_rate_16to64"
)
island_authorities <- c("Orkney Islands", "Shetland Islands")

uk_pop_survey_2017_clean <- uk_pop_survey_2017 %>%
  # Coerce the rate columns to numeric; values that cannot be parsed become NA
  dplyr::mutate(dplyr::across(dplyr::all_of(labour_rate_columns), as.numeric)) %>%
  dplyr::select(-unemployment_rate_16plus) %>%
  # Economic activity rate, employment rate, and unemployment rate are imputed
  # for Orkney and Shetland using the mean of the included local authorities
  # which are also part of the wider highlands and islands area
  dplyr::mutate(
    economic_activity_rate_16to64 = if_else(
      local_authority %in% island_authorities,
      (77.3 + 84.6 + 83.7) / 3,
      economic_activity_rate_16to64
    ),
    employment_rate_16to64 = if_else(
      local_authority %in% island_authorities,
      (75.6 + 81.0 + 81.1) / 3,
      employment_rate_16to64
    ),
    unemployment_rate_16to64 = if_else(
      local_authority %in% island_authorities,
      (2.3 + 4.2 + 3.0) / 3,
      unemployment_rate_16to64
    )
  ) %>%
  # Keep only rows with no missing value in any column
  tidyr::drop_na()

# UK annual hours and earnings data by local authority - 2017
pay_columns <- c(
  "median_weekly_pay_gross",
  "median_annual_pay_gross",
  "mean_weekly_pay_gross",
  "mean_annual_pay_gross"
)

uk_earnings_survey_2017_clean <- uk_earnings_survey_2017 %>%
  dplyr::mutate(dplyr::across(dplyr::all_of(pay_columns), as.numeric)) %>%
  # Keep only rows with no missing value in any column
  tidyr::drop_na()

# UK annual population survey data by age, sex, local authority - 2017
population_columns <- c(
  "pop_all", "pop_0to15", "pop_16to64", "pop_65plus",
  "pop_all_m", "pop_0to15_m", "pop_16to64_m", "pop_65plus_m",
  "pop_all_f", "pop_0to15_f", "pop_16to64_f", "pop_65plus_f"
)

uk_pop_ages_sex_survey_2017_clean <- uk_pop_ages_sex_survey_2017 %>%
  dplyr::mutate(dplyr::across(dplyr::all_of(population_columns), as.numeric)) %>%
  # Keep only rows with no missing value in any column
  tidyr::drop_na() %>%
  # Shares are stored as proportions (0 to 1) despite the "percentage_" prefix
  dplyr::mutate(
    percentage_male = pop_all_m / (pop_all_m + pop_all_f),
    percentage_0to15 = pop_0to15 / pop_all,
    percentage_16to64 = pop_16to64 / pop_all,
    percentage_65plus = pop_65plus / pop_all
  )

# Create local authority matching dataset.
# Default (no imputation flag set): inner joins, so only local authorities with
# complete rows in all three cleaned survey tables are kept.
local_authority_characteristics <- uk_pop_survey_2017_clean %>%
  dplyr::inner_join(uk_earnings_survey_2017_clean, by = c("local_authority", "area_code")) %>%
  dplyr::inner_join(uk_pop_ages_sex_survey_2017_clean, by = c("local_authority", "area_code"))

# Builds the table that the imputation branches start from: full joins keep
# local authorities missing from some survey tables (their covariates are NA),
# rows are restricted to local authorities with hospital outcome data, and
# only the name plus the matching covariates are retained.
build_imputation_input <- function() {
  uk_pop_survey_2017_clean %>%
    dplyr::full_join(uk_earnings_survey_2017_clean, by = c("local_authority", "area_code")) %>%
    dplyr::full_join(uk_pop_ages_sex_survey_2017_clean, by = c("local_authority", "area_code")) %>%
    dplyr::filter(
      local_authority %in% unique(england_hospital_visits_local_authority_per100k$local_authority) |
        local_authority %in% unique(scotland_hospital_visits_local_authority_per100k$local_authority)
    ) %>%
    dplyr::select(c(local_authority, economic_activity_rate_16to64, employment_rate_16to64,
                    unemployment_rate_16to64, median_annual_pay_gross, percentage_male,
                    percentage_0to15, percentage_16to64, percentage_65plus))
}

if (mean_imputation) {

  # Replace missing values column by column with the column mean
  local_authority_characteristics <- data.frame(
    lapply(build_imputation_input(), function(x) impute(x, mean))
  )

}

if (median_imputation) {

  # Replace missing values column by column with the column median
  local_authority_characteristics <- data.frame(
    lapply(build_imputation_input(), function(x) impute(x, median))
  )

}

if (mice_imputation) {

  # The un-imputed table is kept under this name; the completed datasets are
  # stored separately in local_authority_characteristics_list
  local_authority_characteristics <- build_imputation_input()

  # Use MICE to impute missing values, producing n_mice_imputations datasets
  # with different imputed values
  pred_matrix <- quickpred(local_authority_characteristics)

  # The local authority name is an identifier: never use it as a predictor
  pred_matrix[, "local_authority"] <- 0

  # Predictive mean matching for every covariate; the identifier is not imputed
  methods <- rep("pmm", ncol(local_authority_characteristics))
  names(methods) <- colnames(local_authority_characteristics)
  methods["local_authority"] <- ""

  # m is tied to n_mice_imputations so the loops below can never ask for an
  # imputation that was not generated
  mice_model <- mice(local_authority_characteristics, m = n_mice_imputations, method = methods,
                     predictorMatrix = pred_matrix, seed = 12345, printFlag = TRUE)

  # One completed dataset per imputation. complete() is namespaced because
  # tidyr and mice both export a function with that name.
  local_authority_characteristics_list <- lapply(
    seq_len(n_mice_imputations),
    function(i) mice::complete(mice_model, action = i)
  )

}

# Filter England alcohol related hospital visits by local authority to only
# include local authorities present in the matching dataset
england_hospital_visits_local_authority_per100k_has_characteristics <- england_hospital_visits_local_authority_per100k %>%
  dplyr::filter(local_authority %in% unique(local_authority_characteristics$local_authority))

# Number of England local authorities with per 100k data
length(unique(england_hospital_visits_local_authority_per100k$local_authority))

# Number of England local authorities with per 100k data and matching data
length(unique(england_hospital_visits_local_authority_per100k_has_characteristics$local_authority))

# Keeps the local authorities that have hospital outcome data and adds the
# treatment indicator: 1 for Scottish local authorities, 0 otherwise.
add_treatment_indicator <- function(characteristics) {
  england_names <- unique(england_hospital_visits_local_authority_per100k$local_authority)
  scotland_names <- unique(scotland_hospital_visits_local_authority_per100k$local_authority)

  characteristics %>%
    dplyr::filter(local_authority %in% england_names | local_authority %in% scotland_names) %>%
    dplyr::mutate(treatment = if_else(local_authority %in% scotland_names, 1, 0))
}

if (!mice_imputation) {

  # Filter local authority matching dataset
  local_authority_characteristics_has_per100k <- add_treatment_indicator(local_authority_characteristics)

}

if (mice_imputation) {

  # One filtered matching dataset per imputed dataset
  local_authority_characteristics_has_per100k_list <- lapply(
    local_authority_characteristics_list[seq_len(n_mice_imputations)],
    add_treatment_indicator
  )

  # Later chunks also read the un-suffixed name; as before it refers to the
  # dataset built from the last imputation
  local_authority_characteristics_has_per100k <-
    local_authority_characteristics_has_per100k_list[[length(local_authority_characteristics_has_per100k_list)]]

}

# Number of Scotland and England local authorities with per 100k data and matching data
length(unique(local_authority_characteristics_has_per100k$local_authority))

# Create dataframe of Scotland and England outcomes (Scotland limited to 2012-2019)
hospitalisations <- scotland_hospital_visits_local_authority_per100k %>%
  dplyr::select(financial_year, year, local_authority, stays_per100k) %>%
  dplyr::filter(year >= 2012 & year <= 2019)

hospitalisations <- rbind(hospitalisations, england_hospital_visits_local_authority_per100k_has_characteristics)

```

```{r}

# Strip plots to check that the imputed values are reasonable, one per
# imputed covariate. mice_model only exists when MICE imputation was run, so
# the whole section is skipped otherwise. Plots are printed explicitly because
# values inside an if block are not auto-printed.
if (mice_imputation) {

  # Employment rate
  s_plot <- stripplot(mice_model, employment_rate_16to64, pch = 20, cex = 1.2, alpha = 0.75)

  s_plot <- update(s_plot,
                   main = "Imputed Values (Red) for Employment Rate (16 to 64)",
                   xlab = "Imputation Run (0 = Initial Dataset)",
                   ylab = "Employment Rate")

  print(s_plot)

  # Economic activity rate
  s_plot <- stripplot(mice_model, economic_activity_rate_16to64, pch = 20, cex = 1.2, alpha = 0.75)

  s_plot <- update(s_plot,
                   main = "Imputed Values (Red) for Economic Activity Rate (16 to 64)",
                   xlab = "Imputation Run (0 = Initial Dataset)",
                   ylab = "Economic Activity Rate")

  print(s_plot)

  # Unemployment rate
  s_plot <- stripplot(mice_model, unemployment_rate_16to64, pch = 20, cex = 1.2, alpha = 0.75)

  s_plot <- update(s_plot,
                   main = "Imputed Values (Red) for Unemployment Rate (16 to 64)",
                   xlab = "Imputation Run (0 = Initial Dataset)",
                   ylab = "Unemployment Rate")

  print(s_plot)

  # Median annual gross pay
  s_plot <- stripplot(mice_model, median_annual_pay_gross, pch = 20, cex = 1.2, alpha = 0.75)

  s_plot <- update(s_plot,
                   main = "Imputed Values (Red) for Median Annual Gross Income",
                   xlab = "Imputation Run (0 = Initial Dataset)",
                   ylab = "Median Annual Gross Income")

  print(s_plot)

}

```

```{r}

# Scotland alcohol related hospital visits per 100,000 by local authority:
# one line per local authority.
# Line thickness is set with `linewidth`; `size` is deprecated for lines in
# ggplot2 >= 3.4.0, which this file already requires for element_rect(linewidth =).
ggplot(data = scotland_hospital_visits_local_authority_per100k,
       aes(x = year, y = stays_per100k, colour = local_authority)) +
  geom_line(linewidth = 0.75) +
  scale_x_continuous(breaks = breaks_width(4), minor_breaks = NULL) +
  scale_y_continuous(breaks = breaks_width(100)) +
  labs(x = "Year", y = "Hospital stays per 100,000",
       title = "Alcohol Related Hospital Visits per 100,000 by Local Authority - Scotland") +
  theme(panel.border = element_rect(colour = "black", fill = NA, linewidth = 0.5),
        axis.text = element_text(color = "black", size = 10),
        axis.title = element_text(size = 12),
        plot.title = element_text(size = 12, face = "bold"),
        legend.title = element_blank())

```

```{r}

# Scotland alcohol related hospital visits per 100,000, national series.
# The title previously claimed "by Local Authority" although this plot shows
# the single national line.
# Line thickness is set with `linewidth`; `size` is deprecated for lines in
# ggplot2 >= 3.4.0.
ggplot(data = scotland_hospital_visits_whole_per100k, aes(x = year, y = stays_per100k)) +
  geom_line(linewidth = 0.75) +
  scale_x_continuous(breaks = breaks_width(4), minor_breaks = NULL) +
  scale_y_continuous(breaks = breaks_width(100), limits = c(0, 1000)) +
  labs(x = "Year", y = "Hospital stays per 100,000",
       title = "Alcohol Related Hospital Visits per 100,000 - Scotland") +
  theme(panel.border = element_rect(colour = "black", fill = NA, linewidth = 0.5),
        axis.text = element_text(color = "black", size = 10),
        axis.title = element_text(size = 12),
        plot.title = element_text(size = 12, face = "bold"),
        legend.title = element_blank())

```

```{r}

# Visualise Scotland vs England national totals, 2012-2019

# Scotland: reuse the national per 100k series, labelling it by country
scot_total <- scotland_hospital_visits_whole_per100k %>%
  dplyr::rename(country = local_authority) %>%
  dplyr::select(c(country, year, stays_per100k)) %>%
  dplyr::filter(year >= 2012 & year <= 2019)

# England: national stays per 100,000 entered by hand, one value per year
country <- rep("England", 8)
year <- c(2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019)
stays_per100k <- c(610, 620, 580, 560, 490, 490, 510, 520)

eng_total <- data.frame(country, year, stays_per100k)

scot_eng_total <- rbind(scot_total, eng_total)

# Scotland and England alcohol related hospital visits per 100,000 (national).
# Line thickness is set with `linewidth`; `size` is deprecated for lines in
# ggplot2 >= 3.4.0.
ggplot(data = scot_eng_total, aes(x = year, y = stays_per100k, colour = country)) +
  geom_line(linewidth = 0.75) +
  scale_x_continuous(breaks = breaks_width(1), minor_breaks = NULL) +
  scale_y_continuous(breaks = breaks_width(100), limits = c(0, 800)) +
  labs(x = "Year", y = "Hospital stays per 100,000",
       title = "Alcohol Related Hospital Visits per 100,000 Scotland vs England") +
  theme(panel.border = element_rect(colour = "black", fill = NA, linewidth = 0.5),
        axis.text = element_text(color = "black", size = 10),
        axis.title = element_text(size = 12),
        plot.title = element_text(size = 12, face = "bold"),
        legend.title = element_blank())

```

```{r}

# Pairwise Spearman correlations between the matching covariates, plotted to
# justify the use of Mahalanobis distance
covariate_names <- c(
  "economic_activity_rate_16to64", "employment_rate_16to64", "unemployment_rate_16to64",
  "median_annual_pay_gross", "percentage_male", "percentage_0to15",
  "percentage_16to64", "percentage_65plus"
)

covariates <- local_authority_characteristics_has_per100k[, covariate_names]

cor_matrix <- cor(covariates, method = "spearman")

# Upper triangle only, variables ordered by hierarchical clustering,
# coefficients printed in each cell
corrplot(
  cor_matrix,
  method = "color",
  type = "upper",
  order = "hclust",
  tl.col = "black",
  tl.srt = 90,
  addCoef.col = "black",
  number.cex = 1
)

```


```{r}

# Matching for the non-MICE path (a single matching dataset)
if (!mice_imputation) {

  # Nearest-neighbour matching on Mahalanobis distance over all covariates
  matchit_output <- matchit(
    treatment ~ economic_activity_rate_16to64 + employment_rate_16to64 +
      unemployment_rate_16to64 + median_annual_pay_gross + percentage_male +
      percentage_0to15 + percentage_16to64 + percentage_65plus,
    data = local_authority_characteristics_has_per100k,
    method = "nearest",
    distance = "mahalanobis"
  )

  # Rows retained by the matching
  matched_data <- match.data(matchit_output)

  # Match matrix: row names are treated units, entries are their matched controls
  match_matrix <- matchit_output$match.matrix

  # Treated/control row identifiers side by side
  matched_pairs <- data.frame(
    Treatment_ID = rownames(match_matrix),
    Control_ID = as.vector(match_matrix)
  )

  # Attach the treated unit's characteristics, then the control unit's
  matched_details <- merge(
    local_authority_characteristics_has_per100k, matched_pairs,
    by.x = "row.names", by.y = "Treatment_ID"
  )
  matched_details <- merge(
    matched_details, local_authority_characteristics_has_per100k,
    by.x = "Control_ID", by.y = "row.names",
    suffixes = c("_treat", "_control")
  )

  # Keep only the identifiers and local authority names of each pair
  matched_local_authorities <- dplyr::select(
    matched_details,
    Control_ID, Row.names, local_authority_treat, local_authority_control
  )

}

```

```{r}

# Matching for the MICE path: repeat the matching once per imputed dataset
if (mice_imputation) {

  # Preallocate one slot per imputation
  matched_local_authorities_list <- vector("list", n_mice_imputations)

  matched_data_list <- vector("list", n_mice_imputations)

  for (i in seq_len(n_mice_imputations)) {

    # Matching dataset built from the i-th imputation
    characteristics_i <- local_authority_characteristics_has_per100k_list[[i]]

    # Perform matching based on Mahalanobis distance
    matchit_output <- matchit(treatment ~ economic_activity_rate_16to64 + employment_rate_16to64 + unemployment_rate_16to64 +
                                median_annual_pay_gross + percentage_male + percentage_0to15 + percentage_16to64 +
                                percentage_65plus,
                              data = characteristics_i,
                              method = "nearest", distance = "mahalanobis")

    # Match data
    matched_data <- match.data(matchit_output)

    matched_data_list[[i]] <- matched_data

    # Match matrix: row names are treated units, entries are their matched controls
    match_matrix <- matchit_output$match.matrix

    # Create matched pairs dataframe
    matched_pairs <- data.frame(Treatment_ID = rownames(match_matrix),
                                Control_ID = as.vector(match_matrix))

    # Merge back into the data that was actually matched in this iteration
    # (previously the leftover last-imputation data frame was used for every i)
    matched_details <- merge(characteristics_i, matched_pairs, by.x = "row.names", by.y = "Treatment_ID")
    matched_details <- merge(matched_details, characteristics_i, by.x = "Control_ID", by.y = "row.names", suffixes = c("_treat", "_control"))

    # Just local authorities
    matched_local_authorities <- matched_details %>%
      dplyr::select(Control_ID, Row.names, local_authority_treat, local_authority_control)

    matched_local_authorities_list[[i]] <- matched_local_authorities

  }

}

```

```{r}

# Reduces matched data to the columns used for visualisation and turns the
# treatment indicator into a factor so it can drive the fill colour.
prepare_matched_data_vis <- function(matched) {
  matched %>%
    dplyr::select(local_authority, treatment, economic_activity_rate_16to64, employment_rate_16to64,
                  unemployment_rate_16to64, median_annual_pay_gross, percentage_male,
                  percentage_0to15, percentage_16to64, percentage_65plus) %>%
    dplyr::mutate(treatment = as.factor(treatment))
}

# Overlaid histograms of the employment rate for treatment and control groups.
employment_rate_histogram <- function(vis_data) {
  ggplot(data = vis_data, aes(x = employment_rate_16to64, y = after_stat(count), fill = treatment)) +
    geom_histogram(binwidth = 2.5, colour = "black", position = "identity", alpha = 0.3) +
    scale_fill_manual(values = c("1" = "red", "0" = "blue"),
                      labels = c("1" = "Treatment Group", "0" = "Control Group")) +
    scale_x_continuous(breaks = breaks_width(2.5), minor_breaks = NULL) +
    scale_y_continuous(breaks = breaks_width(2)) +
    labs(x = "Employment Rate Ages 16 to 64", y = "Count",
         title = "Distribution of Employment Rate Ages 16 to 64 Treatment vs Control", fill = NULL) +
    theme(panel.border = element_rect(colour = "black", fill = NA, linewidth = 0.5),
          axis.text = element_text(color = "black", size = 10),
          axis.title = element_text(size = 12),
          plot.title = element_text(size = 12, face = "bold"))
}

if (mice_imputation == TRUE) {

  # One histogram per imputed dataset; printed explicitly inside the loop
  for (i in 1:n_mice_imputations) {

    matched_data_vis <- prepare_matched_data_vis(matched_data_list[[i]])

    plot <- employment_rate_histogram(matched_data_vis)

    print(plot)

  }

} else {

  matched_data_vis <- prepare_matched_data_vis(matched_data)

  # Last expression of the branch, so the plot is displayed
  employment_rate_histogram(matched_data_vis)

}

```

```{r}

# Basic pre vs post difference in differences for the non-MICE path
if (!mice_imputation) {

  # One row per local authority, one column per year
  hospitalisations_wide <- tidyr::pivot_wider(
    dplyr::select(hospitalisations, -financial_year),
    names_from = year,
    values_from = stays_per100k
  )

  # Attach yearly outcomes for the treated unit and for its matched control;
  # the suffixes tell the two sets of year columns apart
  with_treat_outcomes <- dplyr::left_join(
    matched_local_authorities, hospitalisations_wide,
    by = c("local_authority_treat" = "local_authority")
  )
  dif_in_dif <- dplyr::left_join(
    with_treat_outcomes, hospitalisations_wide,
    by = c("local_authority_control" = "local_authority"),
    suffix = c("_treat", "_control")
  )

  # Mean outcome before (2017) and after (2019) for each group
  pre_treat <- mean(dif_in_dif[["2017_treat"]])
  post_treat <- mean(dif_in_dif[["2019_treat"]])
  pre_control <- mean(dif_in_dif[["2017_control"]])
  post_control <- mean(dif_in_dif[["2019_control"]])

  # Change in the treated group minus change in the control group
  dif_in_dif_estimate <- (post_treat - pre_treat) - (post_control - pre_control)

  dif_in_dif_estimate

}

```

```{r}

# Basic pre vs post difference in differences, repeated for every imputed dataset
if (mice_imputation) {

  # One row per local authority, one column per year. This does not depend on
  # the imputation, so it is built once instead of inside the loop.
  hospitalisations_wide <- hospitalisations %>%
    dplyr::select(-financial_year) %>%
    tidyr::pivot_wider(names_from = year, values_from = stays_per100k)

  # Preallocated results: one estimate and one joined data frame per imputation
  dif_in_dif_estimate_list <- numeric(n_mice_imputations)

  dif_in_dif_list <- vector("list", n_mice_imputations)

  for (i in seq_len(n_mice_imputations)) {

    # Attach yearly outcomes for each treated unit and its matched control
    dif_in_dif <- matched_local_authorities_list[[i]] %>%
      dplyr::left_join(hospitalisations_wide, by = c("local_authority_treat" = "local_authority")) %>%
      dplyr::left_join(hospitalisations_wide, by = c("local_authority_control" = "local_authority"), suffix = c("_treat", "_control"))

    # Pre (2017) and post (2019) treatment outcomes for the treatment and control groups
    pre_treat <- mean(dif_in_dif$`2017_treat`)
    post_treat <- mean(dif_in_dif$`2019_treat`)
    pre_control <- mean(dif_in_dif$`2017_control`)
    post_control <- mean(dif_in_dif$`2019_control`)

    # Change in the treated group minus change in the control group
    dif_in_dif_estimate <- (post_treat - pre_treat) - (post_control - pre_control)

    dif_in_dif_estimate_list[i] <- dif_in_dif_estimate

    dif_in_dif_list[[i]] <- dif_in_dif

  }

  # Last expression of the block, so the estimates are displayed
  dif_in_dif_estimate_list

}

```

```{r}

# Visual check of the parallel trends assumption for the non-MICE path
if (!mice_imputation) {

  # Reshape the matched outcomes to one row per pair, group and year, then
  # average within group and year
  hospitalisations_long <- dif_in_dif %>%
    tidyr::pivot_longer(cols = starts_with("20"),
                        names_to = "year",
                        values_to = "stays_per100k") %>%
    # The column suffix identifies the group; a typed NA keeps case_when()
    # valid on dplyr versions that require consistent output types
    dplyr::mutate(group = case_when(str_ends(year, "_treat") ~ "Treatment",
                                    str_ends(year, "_control") ~ "Control",
                                    TRUE ~ NA_character_)) %>%
    # Plot each financial year at its ending calendar year (start year + 1)
    dplyr::mutate(year = as.numeric(str_sub(str_trim(year), start = 1, end = 4)) + 1) %>%
    dplyr::group_by(group, year) %>%
    # Same grouping as the default result, stated explicitly to avoid the message
    dplyr::summarise(stays_per100k = mean(stays_per100k), .groups = "drop_last")

  # Treatment and control alcohol related hospital visits per 100,000.
  # Line thickness uses `linewidth`; `size` is deprecated for lines in ggplot2 >= 3.4.0.
  ggplot(data = hospitalisations_long, aes(x = year, y = stays_per100k, colour = group)) +
    geom_line(linewidth = 0.75) +
    geom_vline(xintercept = 2018.33, color = "red", linewidth = 0.6, alpha = 0.8) +
    annotate("text", x = 2019.05, y = 25, label = "Intervention", color = "red", size = 4) +
    geom_vline(xintercept = 2017.92, color = "blue", linewidth = 0.6, alpha = 0.8) +
    annotate("text", x = 2016.425, y = 25, label = "Intervention Announcement", color = "blue", size = 4) +
    scale_x_continuous(breaks = breaks_width(1), minor_breaks = NULL) +
    scale_y_continuous(breaks = breaks_width(100), limits = c(0, 800)) +
    labs(x = "Year", y = "Mean Hospital stays per 100,000", title = "Alcohol Related Hospital Visits per 100,000 Treatment vs Control") +
    theme(panel.border = element_rect(colour = "black", fill = NA, linewidth = 0.5), axis.text = element_text(color = "black", size = 10),
          axis.title = element_text(size = 12), plot.title = element_text(size = 12, face = "bold"), legend.title = element_blank())

}

```

```{r}

# Visual check of the parallel trends assumption, one plot per imputed dataset
if (mice_imputation) {

  for (i in seq_len(n_mice_imputations)) {

    # Reshape the matched outcomes to one row per pair, group and year, then
    # average within group and year
    hospitalisations_long <- dif_in_dif_list[[i]] %>%
      tidyr::pivot_longer(cols = starts_with("20"),
                          names_to = "year",
                          values_to = "stays_per100k") %>%
      # The column suffix identifies the group; a typed NA keeps case_when()
      # valid on dplyr versions that require consistent output types
      dplyr::mutate(group = case_when(str_ends(year, "_treat") ~ "Treatment",
                                      str_ends(year, "_control") ~ "Control",
                                      TRUE ~ NA_character_)) %>%
      # Plot each financial year at its ending calendar year (start year + 1)
      dplyr::mutate(year = as.numeric(str_sub(str_trim(year), start = 1, end = 4)) + 1) %>%
      dplyr::group_by(group, year) %>%
      # Same grouping as the default result, stated explicitly to avoid the message
      dplyr::summarise(stays_per100k = mean(stays_per100k), .groups = "drop_last")

    # Treatment and control alcohol related hospital visits per 100,000.
    # Line thickness uses `linewidth`; `size` is deprecated for lines in ggplot2 >= 3.4.0.
    plot <- ggplot(data = hospitalisations_long, aes(x = year, y = stays_per100k, colour = group)) +
      geom_line(linewidth = 0.75) +
      geom_vline(xintercept = 2018.33, color = "black", linewidth = 0.6, alpha = 0.8) +
      annotate("text", x = 2019.05, y = 25, label = "Intervention", color = "black", size = 4) +
      geom_vline(xintercept = 2017.92, color = "#333333", linewidth = 0.6, alpha = 0.8, linetype = "dashed") +
      annotate("text", x = 2016.425, y = 25, label = "Intervention Announcement", color = "#333333", size = 4) +
      scale_x_continuous(breaks = breaks_width(1), minor_breaks = NULL) +
      scale_y_continuous(breaks = breaks_width(100), limits = c(0, 800)) +
      scale_color_manual(values = c("Treatment" = "#EC4646", "Control" = "#4646EC")) +
      labs(x = "Year", y = "Mean Hospital stays per 100,000", title = "Alcohol Related Hospital Visits per 100,000 Treatment vs Control") +
      theme(panel.border = element_rect(colour = "black", fill = NA, linewidth = 0.5), axis.text = element_text(color = "black", size = 10),
            axis.title = element_text(size = 12), plot.title = element_text(size = 12, face = "bold"), legend.title = element_blank())

    # Plots are not auto-printed inside a loop
    print(plot)

  }

}

```

```{r}

# Parallel trends check for the MICE workflow: for every imputed dataset, compute the 
# mean stays per 100,000 by group and year, then plot one facet per imputation 
if (mice_imputation == TRUE) {
  
  # Reshape the i-th imputed dif in dif dataset to long format and average 
  # stays per 100,000 within each group and year 
  # i: position of the imputed dataset in dif_in_dif_list 
  # Returns a data frame with columns group, year, stays_per100k and facet (= i) 
  summarise_hospitalisations = function(i) {
    
    dif_in_dif_list[[i]] %>% 
      tidyr::pivot_longer(cols = starts_with("20"), 
                          names_to = "year", 
                          values_to = "stays_per100k") %>% 
      dplyr::mutate(group = case_when(str_ends(year, "_treat") ~ "Treatment", 
                                      str_ends(year, "_control") ~ "Control", 
                                      TRUE ~ NA)) %>% 
      # Column names start with a four digit year; 1 is added to that year for the x axis 
      dplyr::mutate(year = as.numeric(str_sub(str_trim(year), start = 1, end = 4)) + 1) %>% 
      dplyr::group_by(group, year) %>% 
      dplyr::summarise(stays_per100k = mean(stays_per100k), .groups = "drop") %>% 
      dplyr::mutate(facet = i)
    
  }
  
  # Stack the summaries so that each imputation contributes exactly one set of rows 
  hospitalisations_long = dplyr::bind_rows(lapply(seq_len(n_mice_imputations), summarise_hospitalisations))
  
  # The extra levels match the colour labels used by the vertical lines below 
  hospitalisations_long$group = factor(hospitalisations_long$group, levels = c("Treatment", "Control", "Intervention Announcement", "Intervention"))
  
  # NOTE(review): legend_order is not used by the plot below; kept in case later chunks rely on it - confirm 
  legend_order = factor(c("Treatment", "Control", "Intervention Announcement", "Intervention"), levels = c("Intervention", "Intervention Announcement", "Treatment", "Control"))
  
  # Plot parallel trends all on one graph 
  # A single colour scale is supplied (adding a second one would replace the first) 
  ggplot(data = hospitalisations_long, aes(x = year, y = stays_per100k, colour = group)) +
    geom_line(linewidth = 0.75) +
    geom_vline(aes(xintercept = 2018.33, color = "Intervention"), linewidth = 0.6, alpha = 0.8) +
    geom_vline(aes(xintercept = 2017.92, color = "Intervention Announcement"), linewidth = 0.6, alpha = 0.8, linetype = "dashed") +
    scale_x_continuous(breaks = breaks_width(1), minor_breaks = NULL) +
    scale_y_continuous(breaks = breaks_width(100), limits = c(0, 800)) +
    scale_color_manual(values = c("Treatment" = "#EC4646", "Control" = "#4646EC", "Intervention Announcement" = "#333333", "Intervention" = "black")) +
    labs(x = "Year", y = "Mean Hospital stays per 100,000", title = "Alcohol Related Hospital Visits per 100,000 Treatment vs Control",
         color = "Legend") +
    theme(panel.border = element_rect(colour = "black", fill = NA, linewidth = 0.5), 
          axis.text = element_text(color = "black", size = 8), 
          axis.text.x = element_text(angle = 90, hjust = 1), 
          axis.title = element_text(size = 12), 
          plot.title = element_text(size = 12, face = "bold"), 
          legend.title = element_blank()) +
    facet_wrap(~facet)
  
}

```

```{r}

# Two year dif in dif estimate with pretreatment placebos (no MICE imputation): 
# point estimates, bootstrap standard errors and percentile intervals, p-values and plot 
if (mice_imputation == FALSE) {
  
  # Function to compute the six two year dif in dif estimates 
  # data: data frame holding the treatment and control columns 
  # indices: row indices to use (supplied by boot() when resampling) 
  # Returns a numeric vector of length 6, one estimate per two year window 
  # NOTE(review): assumes columns 5 onwards hold the treatment years and columns 13 onwards 
  # the matching control years - confirm against how dif_in_dif is built 
  dif_in_dif_function = function(data, indices) {
    
    # Bootstrap resampling 
    data_subset = data[indices, ]
    
    # Dif in dif calculation for each placebo and actual estimate 
    vapply(1:6, function(i) {
      
      pre_treat_i = mean(data_subset[[i + 4]])
      post_treat_i = mean(data_subset[[i + 6]])
      pre_control_i = mean(data_subset[[i + 12]])
      post_control_i = mean(data_subset[[i + 14]])
      
      (post_treat_i - pre_treat_i) - (post_control_i - pre_control_i)
      
    }, numeric(1))
    
  }
  
  # Placebo test for every two year time period prior and spanning the intervention date 
  # Point estimates come from the same function as the bootstrap so the two cannot drift apart 
  dif_in_dif_2y_estimates = dif_in_dif_function(dif_in_dif, seq_len(nrow(dif_in_dif)))
  
  # Bootstrap to calculate standard errors for causal estimate and placebo causal estimates 
  set.seed(12345)
  
  bootstrap_results = boot(data = dif_in_dif, statistic = dif_in_dif_function, R = 10000)
  
  # Standard errors (one per column of bootstrap replicates) 
  boot_se = apply(bootstrap_results$t, 2, sd)
  
  # Bootstrap percentile confidence intervals (rows = estimates, columns = 2.5% and 97.5%) 
  boot_ci = t(apply(bootstrap_results$t, 2, function(x) {
    
    quantile(x, probs = c(0.025, 0.975))
    
  }))
  
  # Dataframe rows 
  year = c(2015.25, 2016.25, 2017.25, 2018.25, 2019.25, 2020.25) # End dates of years for showing treatment date against 
  lower_ci = boot_ci[, 1]
  upper_ci = boot_ci[, 2]
  se = boot_se
  
  # Create dataframe 
  dif_in_dif_2y_estimates_placebo = data.frame(year, dif_in_dif_2y_estimates, lower_ci, upper_ci, se)
  
  # Calculate two sided p-values from a normal approximation 
  dif_in_dif_2y_estimates_placebo = dif_in_dif_2y_estimates_placebo %>% 
    dplyr::mutate(p_value = 2 * pnorm(-abs(dif_in_dif_2y_estimates / se)))
  
  # Placebo test results for every two year time period prior to and spanning the intervention date 
  # The dotted line takes a single width (it was previously given two conflicting values, 1 and 0.75) 
  ggplot(data = dif_in_dif_2y_estimates_placebo, aes(x = year, y = dif_in_dif_2y_estimates)) +
    geom_line(linewidth = 1, linetype = "dotted", color = "black", alpha = 0.75) +
    geom_point(shape = 16, size = 3, color = "black") +
    geom_errorbar(aes(ymin = lower_ci, ymax = upper_ci), width = 0.1) +
    geom_hline(yintercept = 0, linetype = "dashed", color = "dark grey", linewidth = 0.75, alpha = 0.8) +
    geom_vline(xintercept = 2018.33, color = "red", linewidth = 0.6, alpha = 0.8) +
    annotate("text", x = 2018.75, y = -75.5, label = "Intervention", color = "red", size = 4) +
    geom_vline(xintercept = 2017.92, color = "blue", linewidth = 0.6, alpha = 0.8) +
    annotate("text", x = 2017.05, y = -75.5, label = "Intervention Announcement", color = "blue", size = 4) +
    scale_x_continuous(breaks = breaks_width(1), minor_breaks = NULL) +
    scale_y_continuous(breaks = breaks_width(10), limits = c(-80, 80)) +
    labs(x = "Year", y = "Dif in Dif Estimated Placebo Treatment Effects", title = "Estimated Causal Effect Including Pretreatment Placebos") +
    theme(panel.border = element_rect(colour = "black", fill = NA, linewidth = 0.5), axis.text = element_text(color = "black", size = 10),
          axis.title = element_text(size = 12), plot.title = element_text(size = 12, face = "bold"), legend.title = element_blank())
  
  # Note 2020 represents 2019/20 etc. 
  
}

```

```{r}

# One year dif in dif estimate with pretreatment placebos (no MICE imputation): 
# point estimates, bootstrap standard errors and percentile intervals, p-values and plot 
if (mice_imputation == FALSE) {
  
  # Function to compute the six one year dif in dif estimates 
  # data: data frame holding the treatment and control columns 
  # indices: row indices to use (supplied by boot() when resampling) 
  # Returns a numeric vector of length 6, one estimate per one year window 
  # NOTE(review): assumes columns 5 onwards hold the treatment years and columns 13 onwards 
  # the matching control years - confirm against how dif_in_dif is built 
  dif_in_dif_function = function(data, indices) {
    
    # Bootstrap resampling 
    data_subset = data[indices, ]
    
    # Dif in dif calculation for each placebo and actual estimate 
    vapply(1:6, function(i) {
      
      pre_treat_i = mean(data_subset[[i + 4]])
      post_treat_i = mean(data_subset[[i + 5]])
      pre_control_i = mean(data_subset[[i + 12]])
      post_control_i = mean(data_subset[[i + 13]])
      
      (post_treat_i - pre_treat_i) - (post_control_i - pre_control_i)
      
    }, numeric(1))
    
  }
  
  # Placebo test for every one year time period prior and spanning the intervention date 
  # Point estimates come from the same function as the bootstrap so the two cannot drift apart 
  dif_in_dif_1y_estimates = dif_in_dif_function(dif_in_dif, seq_len(nrow(dif_in_dif)))
  
  # Bootstrap to calculate standard errors for causal estimate and placebo causal estimates 
  set.seed(12345)
  
  bootstrap_results = boot(data = dif_in_dif, statistic = dif_in_dif_function, R = 10000)
  
  # Standard errors (one per column of bootstrap replicates) 
  boot_se = apply(bootstrap_results$t, 2, sd)
  
  # Bootstrap percentile confidence intervals (rows = estimates, columns = 2.5% and 97.5%) 
  boot_ci = t(apply(bootstrap_results$t, 2, function(x) {
    
    quantile(x, probs = c(0.025, 0.975))
    
  }))
  
  # Dataframe rows 
  year = c(2014.25, 2015.25, 2016.25, 2017.25, 2018.25, 2019.25)
  lower_ci = boot_ci[, 1]
  upper_ci = boot_ci[, 2]
  se = boot_se
  
  # Create dataframe 
  dif_in_dif_1y_estimates_placebo = data.frame(year, dif_in_dif_1y_estimates, lower_ci, upper_ci, se)
  
  # Calculate two sided p-values from a normal approximation 
  dif_in_dif_1y_estimates_placebo = dif_in_dif_1y_estimates_placebo %>% 
    dplyr::mutate(p_value = 2 * pnorm(-abs(dif_in_dif_1y_estimates / se)))
  
  # Placebo test results for every one year time period prior to and spanning the intervention date 
  # The dotted line takes a single width (it was previously given two conflicting values, 1 and 0.75) 
  # The title names the placebo estimates shown here rather than the parallel trends plot 
  ggplot(data = dif_in_dif_1y_estimates_placebo, aes(x = year, y = dif_in_dif_1y_estimates)) +
    geom_line(linewidth = 1, linetype = "dotted", color = "black", alpha = 0.75) +
    geom_point(shape = 16, size = 3, color = "black") +
    geom_errorbar(aes(ymin = lower_ci, ymax = upper_ci), width = 0.1) +
    geom_hline(yintercept = 0, linetype = "dashed", color = "dark grey", linewidth = 0.75, alpha = 0.8) +
    geom_vline(xintercept = 2018.33, color = "red", linewidth = 0.6, alpha = 0.8) +
    annotate("text", x = 2018.75, y = -57, label = "Intervention", color = "red", size = 4) +
    geom_vline(xintercept = 2017.92, color = "blue", linewidth = 0.6, alpha = 0.8) +
    annotate("text", x = 2017.05, y = -57, label = "Intervention Announcement", color = "blue", size = 4) +
    scale_x_continuous(breaks = breaks_width(1), minor_breaks = NULL) +
    scale_y_continuous(breaks = breaks_width(10), limits = c(-60, 60)) +
    labs(x = "Year", y = "Dif in Dif Estimated Placebo Treatment Effects", title = "Estimated Causal Effect 1-Year Pretreatment Placebos") +
    theme(panel.border = element_rect(colour = "black", fill = NA, linewidth = 0.5), axis.text = element_text(color = "black", size = 10),
          axis.title = element_text(size = 12), plot.title = element_text(size = 12, face = "bold"), legend.title = element_blank())
  
  # Note 2019 represents 2018/19 etc. 
  
}

```

```{r}

# Two year dif in dif estimate with pretreatment placebos for the MICE workflow: 
# per imputation point estimates and bootstrap standard errors, pooled with Rubin's Rules, then plotted 
if (mice_imputation == TRUE) {
  
  # Function to compute the six two year dif in dif estimates 
  # data: data frame holding the treatment and control columns 
  # indices: row indices to use (supplied by boot() when resampling) 
  # Returns a numeric vector of length 6, one estimate per two year window 
  # NOTE(review): assumes columns 5 onwards hold the treatment years and columns 13 onwards 
  # the matching control years - confirm against how dif_in_dif_list is built 
  dif_in_dif_function = function(data, indices) {
    
    # Bootstrap resampling 
    data_subset = data[indices, ]
    
    # Dif in dif calculation for each placebo and actual estimate 
    vapply(1:6, function(k) {
      
      pre_treat_k = mean(data_subset[[k + 4]])
      post_treat_k = mean(data_subset[[k + 6]])
      pre_control_k = mean(data_subset[[k + 12]])
      post_control_k = mean(data_subset[[k + 14]])
      
      (post_treat_k - pre_treat_k) - (post_control_k - pre_control_k)
      
    }, numeric(1))
    
  }
  
  dif_in_dif_2y_estimates_placebo_list = vector("list", n_mice_imputations)
  
  for (i in seq_len(n_mice_imputations)) {
    
    imputed_data = dif_in_dif_list[[i]]
    
    # Placebo test for every two year time period prior and spanning the intervention date 
    # Point estimates come from the same function as the bootstrap so the two cannot drift apart 
    dif_in_dif_2y_estimates = dif_in_dif_function(imputed_data, seq_len(nrow(imputed_data)))
    
    # Bootstrap to calculate standard errors for causal estimate and placebo causal estimates 
    # The seed is reset for every imputation, so each bootstrap starts from the same random state 
    set.seed(12345)
    
    bootstrap_results = boot(data = imputed_data, statistic = dif_in_dif_function, R = 10000)
    
    # Standard errors (one per column of bootstrap replicates) 
    boot_se = apply(bootstrap_results$t, 2, sd)
    
    # Bootstrap percentile confidence intervals (rows = estimates, columns = 2.5% and 97.5%) 
    boot_ci = t(apply(bootstrap_results$t, 2, function(x) {
      
      quantile(x, probs = c(0.025, 0.975))
      
    }))
    
    # Dataframe rows 
    year = c(2015.25, 2016.25, 2017.25, 2018.25, 2019.25, 2020.25) # End dates of years for showing treatment date against 
    lower_ci = boot_ci[, 1]
    upper_ci = boot_ci[, 2]
    se = boot_se
    
    # Create dataframe 
    dif_in_dif_2y_estimates_placebo = data.frame(year, dif_in_dif_2y_estimates, lower_ci, upper_ci, se)
    
    # Calculate two sided p-values from a normal approximation 
    dif_in_dif_2y_estimates_placebo = dif_in_dif_2y_estimates_placebo %>% 
      dplyr::mutate(p_value = 2 * pnorm(-abs(dif_in_dif_2y_estimates / se)))
    
    dif_in_dif_2y_estimates_placebo_list[[i]] = dif_in_dif_2y_estimates_placebo
    
  }
  
  # Stack the per imputation results, labelling each with its imputation number (as character) 
  dif_in_dif_2y_estimates_placebo_df = dplyr::bind_rows(lapply(seq_len(n_mice_imputations), function(i) {
    
    dplyr::mutate(dif_in_dif_2y_estimates_placebo_list[[i]], mice_run = as.character(i))
    
  }))
  
  # Rubin's Rules to get overall standard error, confidence interval and p-value estimates 
  # total variance = within imputation variance + (1 + 1 / m) * between imputation variance 
  # Confidence intervals and p-values use a normal approximation (1.96 and pnorm) 
  dif_in_dif_2y_estimates_overall = dif_in_dif_2y_estimates_placebo_df %>% 
    dplyr::group_by(year) %>% 
    dplyr::summarise(mean_estimate = mean(dif_in_dif_2y_estimates), 
                     within_imputation_variance = mean((se)^2), 
                     between_imputation_variance = var(dif_in_dif_2y_estimates), 
                     total_variance = within_imputation_variance + (1 + (1 / n_mice_imputations)) * between_imputation_variance, 
                     se_pooled = sqrt(total_variance), 
                     lower_ci = mean_estimate - 1.96 * se_pooled, 
                     upper_ci = mean_estimate + 1.96 * se_pooled, 
                     p_value = 2 * pnorm(-abs(mean_estimate / se_pooled)))
  
  # Placebo test results for every two year time period prior to and spanning the intervention date 
  # The dotted line takes a single width (it was previously given two conflicting values, 1 and 0.75) 
  ggplot(data = dif_in_dif_2y_estimates_overall, aes(x = year, y = mean_estimate)) +
    geom_line(linewidth = 1, linetype = "dotted", color = "black", alpha = 0.75) +
    geom_point(shape = 16, size = 3, color = "black") +
    geom_errorbar(aes(ymin = lower_ci, ymax = upper_ci), width = 0.1) +
    geom_hline(yintercept = 0, linetype = "dashed", color = "dark grey", linewidth = 0.75, alpha = 0.8) +
    geom_vline(xintercept = 2018.33, color = "red", linewidth = 0.6, alpha = 0.8) +
    annotate("text", x = 2018.75, y = -75.5, label = "Intervention", color = "red", size = 4) +
    geom_vline(xintercept = 2017.92, color = "#333333", linewidth = 0.6, alpha = 0.8, linetype = "dashed") +
    annotate("text", x = 2017.05, y = -75.5, label = "Intervention Announcement", color = "#333333", size = 4) +
    scale_x_continuous(breaks = breaks_width(1), minor_breaks = NULL) +
    scale_y_continuous(breaks = breaks_width(10), limits = c(-80, 80)) +
    labs(x = "Year", y = "Dif in Dif Estimated Placebo Treatment Effects", title = "Estimated Causal Effect 2-Year Pretreatment Placebos") +
    theme(panel.border = element_rect(colour = "black", fill = NA, linewidth = 0.5), axis.text = element_text(color = "black", size = 10),
          axis.title = element_text(size = 12), plot.title = element_text(size = 12, face = "bold"), legend.title = element_blank())
  
  # Note 2020 represents 2019/20 etc. 
  
}

```

```{r}

# One year dif in dif estimate with pretreatment placebos for the MICE workflow: 
# per imputation point estimates and bootstrap standard errors, pooled with Rubin's Rules, then plotted 
if (mice_imputation == TRUE) {
  
  # Function to compute the six one year dif in dif estimates 
  # data: data frame holding the treatment and control columns 
  # indices: row indices to use (supplied by boot() when resampling) 
  # Returns a numeric vector of length 6, one estimate per one year window 
  # The post period is one column after the pre period (offsets 4 / 5 and 12 / 13), so the 
  # bootstrap resamples the same one year statistic that is reported as the point estimate 
  # NOTE(review): assumes columns 5 onwards hold the treatment years and columns 13 onwards 
  # the matching control years - confirm against how dif_in_dif_list is built 
  dif_in_dif_function = function(data, indices) {
    
    # Bootstrap resampling 
    data_subset = data[indices, ]
    
    # Dif in dif calculation for each placebo and actual estimate 
    vapply(1:6, function(k) {
      
      pre_treat_k = mean(data_subset[[k + 4]])
      post_treat_k = mean(data_subset[[k + 5]])
      pre_control_k = mean(data_subset[[k + 12]])
      post_control_k = mean(data_subset[[k + 13]])
      
      (post_treat_k - pre_treat_k) - (post_control_k - pre_control_k)
      
    }, numeric(1))
    
  }
  
  dif_in_dif_1y_estimates_placebo_list = vector("list", n_mice_imputations)
  
  for (i in seq_len(n_mice_imputations)) {
    
    imputed_data = dif_in_dif_list[[i]]
    
    # Placebo test for every one year time period prior and spanning the intervention date 
    # Point estimates come from the same function as the bootstrap so the two cannot drift apart 
    dif_in_dif_1y_estimates = dif_in_dif_function(imputed_data, seq_len(nrow(imputed_data)))
    
    # Bootstrap to calculate standard errors for causal estimate and placebo causal estimates 
    # The seed is reset for every imputation, so each bootstrap starts from the same random state 
    set.seed(12345)
    
    bootstrap_results = boot(data = imputed_data, statistic = dif_in_dif_function, R = 10000)
    
    # Standard errors (one per column of bootstrap replicates) 
    boot_se = apply(bootstrap_results$t, 2, sd)
    
    # Bootstrap percentile confidence intervals (rows = estimates, columns = 2.5% and 97.5%) 
    boot_ci = t(apply(bootstrap_results$t, 2, function(x) {
      
      quantile(x, probs = c(0.025, 0.975))
      
    }))
    
    # Dataframe rows 
    year = c(2014.25, 2015.25, 2016.25, 2017.25, 2018.25, 2019.25) # End dates of years for showing treatment date against 
    lower_ci = boot_ci[, 1]
    upper_ci = boot_ci[, 2]
    se = boot_se
    
    # Create dataframe 
    dif_in_dif_1y_estimates_placebo = data.frame(year, dif_in_dif_1y_estimates, lower_ci, upper_ci, se)
    
    # Calculate two sided p-values from a normal approximation 
    dif_in_dif_1y_estimates_placebo = dif_in_dif_1y_estimates_placebo %>% 
      dplyr::mutate(p_value = 2 * pnorm(-abs(dif_in_dif_1y_estimates / se)))
    
    dif_in_dif_1y_estimates_placebo_list[[i]] = dif_in_dif_1y_estimates_placebo
    
  }
  
  # Stack the per imputation results, labelling each with its imputation number (as character) 
  dif_in_dif_1y_estimates_placebo_df = dplyr::bind_rows(lapply(seq_len(n_mice_imputations), function(i) {
    
    dplyr::mutate(dif_in_dif_1y_estimates_placebo_list[[i]], mice_run = as.character(i))
    
  }))
  
  # Rubin's Rules to get overall standard error, confidence interval and p-value estimates 
  # total variance = within imputation variance + (1 + 1 / m) * between imputation variance 
  # Confidence intervals and p-values use a normal approximation (1.96 and pnorm) 
  dif_in_dif_1y_estimates_overall = dif_in_dif_1y_estimates_placebo_df %>% 
    dplyr::group_by(year) %>% 
    dplyr::summarise(mean_estimate = mean(dif_in_dif_1y_estimates), 
                     within_imputation_variance = mean((se)^2), 
                     between_imputation_variance = var(dif_in_dif_1y_estimates), 
                     total_variance = within_imputation_variance + (1 + (1 / n_mice_imputations)) * between_imputation_variance, 
                     se_pooled = sqrt(total_variance), 
                     lower_ci = mean_estimate - 1.96 * se_pooled, 
                     upper_ci = mean_estimate + 1.96 * se_pooled, 
                     p_value = 2 * pnorm(-abs(mean_estimate / se_pooled)))
  
  # Placebo test results for every one year time period prior to and spanning the intervention date 
  # The dotted line takes a single width (it was previously given two conflicting values, 1 and 0.75) 
  ggplot(data = dif_in_dif_1y_estimates_overall, aes(x = year, y = mean_estimate)) +
    geom_line(linewidth = 1, linetype = "dotted", color = "black", alpha = 0.75) +
    geom_point(shape = 16, size = 3, color = "black") +
    geom_errorbar(aes(ymin = lower_ci, ymax = upper_ci), width = 0.1) +
    geom_hline(yintercept = 0, linetype = "dashed", color = "dark grey", linewidth = 0.75, alpha = 0.8) +
    geom_vline(xintercept = 2018.33, color = "red", linewidth = 0.6, alpha = 0.8) +
    annotate("text", x = 2018.75, y = -75.5, label = "Intervention", color = "red", size = 4) +
    geom_vline(xintercept = 2017.92, color = "#333333", linewidth = 0.6, alpha = 0.8, linetype = "dashed") +
    annotate("text", x = 2017.05, y = -75.5, label = "Intervention Announcement", color = "#333333", size = 4) +
    scale_x_continuous(breaks = breaks_width(1), minor_breaks = NULL) +
    scale_y_continuous(breaks = breaks_width(10), limits = c(-80, 80)) +
    labs(x = "Year", y = "Dif in Dif Estimated Placebo Treatment Effects", title = "Estimated Causal Effect 1-Year Pretreatment Placebos") +
    theme(panel.border = element_rect(colour = "black", fill = NA, linewidth = 0.5), axis.text = element_text(color = "black", size = 10),
          axis.title = element_text(size = 12), plot.title = element_text(size = 12, face = "bold"), legend.title = element_blank())
  
  # Note 2019 represents 2018/19 etc. 
  
}

```