From ebbca02123a3a127a1565090c27a5f2f98378bf9 Mon Sep 17 00:00:00 2001
From: Jaime Villacampa <jaime.villacampa@phs.scot>
Date: Fri, 9 Jul 2021 14:42:52 +0100
Subject: [PATCH] adding simd 2020, ugly but works

---
 Young people in deprived quintile.R | 106 ++++++++++++++++++----------
 1 file changed, 67 insertions(+), 39 deletions(-)

diff --git a/Young people in deprived quintile.R b/Young people in deprived quintile.R
index 1d4d872..c339f36 100644
--- a/Young people in deprived quintile.R	
+++ b/Young people in deprived quintile.R	
@@ -21,80 +21,108 @@ source("1.indicator_analysis.R") #Normal indicator functions
 #Small function to standarize each years info. Function parameters:
 #Data is for what basefile to use, list_pos is for the position of the data frame
 #simd for which simd variables-year to look at, year for what year is the data created.
-prepare_file <- function(dz_list) {
-  raw_data <<- readRDS(paste0(lookups, "Population/DZ11_pop_basefile.rds")) %>% 
-    filter(age<26 & datazone2011 %in% dz_list & year>2010) %>% group_by(year, datazone2011) %>% 
+prepare_file <- function(dz_list14, dz_list17) {
+  pop <- readRDS(paste0(lookups, "Population/DZ11_pop_basefile.rds")) %>% 
+    filter(age<26 & year>2010) %>% group_by(year, datazone2011) %>% 
     summarise(numerator = sum(denominator, na.rm= T)) %>% ungroup %>% 
     rename(datazone = datazone2011)
   
+  raw_data <<- rbind(
+    pop %>% filter((datazone %in% dz_list14 & between(year, 2011, 2016))),
+    pop %>% filter((datazone %in% dz_list17 & year > 2016)))
+  
 }
 
 ###############################################.
 ## Part 1 - Format raw data ready for analysis functions ----
 ###############################################.
 
-simd_data <- readRDS(paste0(cl_out_depr, 'DataZone2011_simd2016.rds')) %>% 
+simd_data14 <- readRDS(paste0(cl_out_depr, 'DataZone2011_simd2016.rds')) %>% 
   setNames(tolower(names(.))) %>% 
   select(datazone2011, simd2016_crime_rank, simd2016_access_rank, simd2016_inc_rank)
 
+simd_data17 <- readRDS(paste0(cl_out_depr, 'DataZone2011_simd2020v2.rds')) %>% 
+  setNames(tolower(names(.))) %>% 
+  select(datazone2011, simd2020v2_crime_rank, simd2020v2_access_rank, simd2020v2_inc_rank)
+
 # Population 25 or under
+# Selecting the population years used to create each SIMD version: 2014 for SIMD2016, 2017 for SIMD2020
 pop <- readRDS(paste0(lookups, "Population/DZ11_pop_basefile.rds")) %>% 
-  filter(year == "2014") %>% group_by(year, datazone2011) %>% 
+  filter(year %in% c("2014", "2017") & age <26) %>% 
+  group_by(year, datazone2011) %>% 
   summarise(pop = sum(denominator, na.rm= T)) %>% ungroup
 
-pop_total <- pop %>% group_by(year) %>% summarise(pop = sum(pop, na.rm= T)) %>% 
-  ungroup %>% pull(pop)
-  
-cut_breaks <- c(0, pop_total/5, pop_total/5*2, pop_total/5*3, pop_total/5*4, pop_total)
+pop14 <- pop %>% filter(year == "2014") 
+pop17 <- pop %>% filter(year == "2017") 
+
+pop_total14 <- pop14 %>% group_by(year) %>% 
+  summarise(pop = sum(pop, na.rm= T)) %>% ungroup %>% pull(pop)
 
-simd_data <- left_join(simd_data, pop, by = c("datazone2011")) %>% 
+pop_total17 <- pop17 %>% group_by(year) %>% 
+  summarise(pop = sum(pop, na.rm= T)) %>% ungroup %>% pull(pop)
+
+# Creating the population thresholds for each quintile
+cut_breaks14 <- c(0, pop_total14/5, pop_total14/5*2, pop_total14/5*3, pop_total14/5*4, pop_total14)
+cut_breaks17 <- c(0, pop_total17/5, pop_total17/5*2, pop_total17/5*3, pop_total17/5*4, pop_total17)
+
+# Preparing files for simd 2016
+simd_data14 <- left_join(simd_data14, pop14, by = c("datazone2011")) %>% 
   arrange(simd2016_crime_rank) %>% # crime pop weighted quintile
   mutate(cum_pop_crime=cumsum(pop),
-         crime_quintile = as.numeric(paste(cut(cum_pop_crime, cut_breaks, include.lowest=TRUE,
+         crime_quintile = as.numeric(paste(cut(cum_pop_crime, cut_breaks14, include.lowest=TRUE,
                                         labels=c("5", "4", "3", "2", "1"))))) %>% 
   arrange(simd2016_access_rank) %>% # access pop weighted quintile
   mutate(cum_pop_access=cumsum(pop),
-         access_quintile = as.numeric(paste(cut(cum_pop_access, cut_breaks, include.lowest=TRUE,
+         access_quintile = as.numeric(paste(cut(cum_pop_access, cut_breaks14, include.lowest=TRUE,
                                                labels=c("5", "4", "3", "2", "1"))))) %>% 
   arrange(simd2016_inc_rank) %>% # income pop weighted quintile
   mutate(cum_pop_inc=cumsum(pop),
-         inc_quintile = as.numeric(paste(cut(cum_pop_inc, cut_breaks, include.lowest=TRUE,
+         inc_quintile = as.numeric(paste(cut(cum_pop_inc, cut_breaks14, include.lowest=TRUE,
                                                labels=c("5", "4", "3", "2", "1"))))) %>% 
   select(-starts_with("cum_pop"), -starts_with("simd"))
 
-crime_dz <- simd_data %>% filter(crime_quintile == "5") %>% pull(datazone2011)
-inc_dz <- simd_data %>% filter(inc_quintile == "5") %>% pull(datazone2011)
-access_dz <- simd_data %>% filter(access_quintile == "5") %>% pull(datazone2011)
+# Preparing files for simd 2020
+simd_data17 <- left_join(simd_data17, pop17, by = c("datazone2011")) %>% 
+  arrange(simd2020v2_crime_rank) %>% # crime pop weighted quintile
+  mutate(cum_pop_crime=cumsum(pop),
+         crime_quintile = as.numeric(paste(cut(cum_pop_crime, cut_breaks17, include.lowest=TRUE,
+                                               labels=c("5", "4", "3", "2", "1"))))) %>% 
+  arrange(simd2020v2_access_rank) %>% # access pop weighted quintile
+  mutate(cum_pop_access=cumsum(pop),
+         access_quintile = as.numeric(paste(cut(cum_pop_access, cut_breaks17, include.lowest=TRUE,
+                                                labels=c("5", "4", "3", "2", "1"))))) %>% 
+  arrange(simd2020v2_inc_rank) %>% # income pop weighted quintile
+  mutate(cum_pop_inc=cumsum(pop),
+         inc_quintile = as.numeric(paste(cut(cum_pop_inc, cut_breaks17, include.lowest=TRUE,
+                                             labels=c("5", "4", "3", "2", "1"))))) %>% 
+  select(-starts_with("cum_pop"), -starts_with("simd"))
+
 
-saveRDS(prepare_file(inc_dz), paste0(data_folder, "Prepared Data/young_people_income_raw.rds"))
-saveRDS(prepare_file(crime_dz), paste0(data_folder, "Prepared Data/young_people_crime_raw.rds"))
-saveRDS(prepare_file(access_dz), paste0(data_folder, "Prepared Data/young_people_access_raw.rds"))
+crime_dz14 <- simd_data14 %>% filter(crime_quintile == "5") %>% pull(datazone2011)
+inc_dz14 <- simd_data14 %>% filter(inc_quintile == "5") %>% pull(datazone2011)
+access_dz14 <- simd_data14 %>% filter(access_quintile == "5") %>% pull(datazone2011)
+crime_dz17 <- simd_data17 %>% filter(crime_quintile == "5") %>% pull(datazone2011)
+inc_dz17 <- simd_data17 %>% filter(inc_quintile == "5") %>% pull(datazone2011)
+access_dz17 <- simd_data17 %>% filter(access_quintile == "5") %>% pull(datazone2011)
+
+saveRDS(prepare_file(inc_dz14, inc_dz17), 
+        paste0(data_folder, "Prepared Data/young_people_income_raw.rds"))
+saveRDS(prepare_file(crime_dz14, crime_dz17), 
+        paste0(data_folder, "Prepared Data/young_people_crime_raw.rds"))
+saveRDS(prepare_file(access_dz14, access_dz17), 
+        paste0(data_folder, "Prepared Data/young_people_access_raw.rds"))
 
 ###############################################.
 ## Part 2 - Calling the analysis functions ----
 ###############################################.
 ###############################################.
-# Crime 
-analyze_first(filename = "young_people_crime", geography = "datazone11", measure = "percent", 
-              yearstart = 2011, yearend = 2018, time_agg = 1, pop = "DZ11_pop_under26")
-
-analyze_second(filename = "young_people_crime", measure = "percent", time_agg = 1, 
-               ind_id = 13005, year_type = "calendar", qa = F)
+filenames <- c("young_people_crime", "young_people_access", "young_people_income")
+# Calling analyze_first once per indicator file; every other argument is shared
+mapply(analyze_first, filename = filenames, MoreArgs = list(geography = "datazone11",
+  measure = "percent", yearstart = 2011, yearend = 2019, time_agg = 1, pop = "DZ11_pop_under26"))
 
-###############################################.
-# Access 
-analyze_first(filename = "young_people_access", geography = "datazone11", measure = "percent", 
-              yearstart = 2011, yearend = 2018, time_agg = 1, pop = "DZ11_pop_under26")
-
-analyze_second(filename = "young_people_access", measure = "percent", time_agg = 1, 
-               ind_id = 13003, year_type = "calendar", qa = F)
-
-###############################################.
-# Income 
-analyze_first(filename = "young_people_income", geography = "datazone11", measure = "percent", 
-              yearstart = 2011, yearend = 2018, time_agg = 1, pop = "DZ11_pop_under26")
+mapply(analyze_second, filename = filenames, ind_id = c(13005, 13003, 13004), # ids in filenames order
+       MoreArgs = list(measure = "percent", time_agg = 1, year_type = "calendar", qa = FALSE))
 
-analyze_second(filename = "young_people_income", measure = "percent", time_agg = 1, 
-               ind_id = 13004, year_type = "calendar", qa = F)
 ## END