-
Notifications
You must be signed in to change notification settings - Fork 1
/
4.Data Quality Checks_inequalities indicators.Rmd
434 lines (373 loc) · 20.4 KB
/
4.Data Quality Checks_inequalities indicators.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
---
title: "Data Quality Checks: Inequalities"
subtitle: "Report looks for errors in files intended for deprivation tab of ScotPHO profiles tool"
output: html_document
runtime: shiny
---
```{r setup, include=FALSE}
#global options for markdown report
knitr::opts_chunk$set(echo = FALSE)
```
```{r, include=FALSE}
#read in geography code names dictionary
code_dictionary <-readRDS(paste0(data_folder,"Lookups/Geography/codedictionary.rds"))
#Read in indicator file being checked
data_indicator <- readRDS(paste0(data_folder, "Data to be checked/", filename, "_ineq.rds")) %>%
mutate(geo_type=substr(code,1,3)) #add field identifying geography type
#Add geography code names
data_indicator <- left_join(x=data_indicator, y=code_dictionary, by="code") %>%
mutate(code_name=paste0(code,": ",as.character(areaname))) #add field with geography code & name
#Test which optional geographies are present in dataset
scot <- any(substr(data_indicator$code,1,3) =="S00")
board <- any(substr(data_indicator$code,1,3) == "S08")
ca <- any(substr(data_indicator$code,1,3) == "S12")
#Read in old indicator data file from network /Shiny data folder - if one exists (new indicators won't have a historic file)
old_indicator_filepath <- paste0(data_folder, "Shiny Data/", filename, "_ineq.rds") #create value which is file path to old file
# statement that checks if there an old indicator data file of the same name exists in Shiny data folder
if(file.exists(old_indicator_filepath)) {
skip_historic_file_checks <-"n" # if historic file found then instruct this report not to skip over thse checks
old_data_indicator <- readRDS(paste0(data_folder, "Shiny Data/", filename, "_ineq.rds")) #read in file if one exists
} else {
skip_historic_file_checks <-"y" #if no historic file found then instruct this report to skip over some of the checks
old_indicator_filepath <-"No historic file available for comparison. Check filename spelling or 'shiny data' network folder if you were expecting a comparison" #if no historic file exists set value to no file available
}
# Pre-defined geographies that will be used in 'Check 3' comparing old & new file
# check_extras is an optional parameter that can be used to supply additional bespoke geography codes to check
# check_codes <- c("S00000001","S08000031","S12000036",check_extras)
```
Indicator: **`r filename`** <br>
Date: **`r format(Sys.time(), '%d %B, %Y %H:%M')`** <br>
**New** indicator file: /`r (paste0(data_folder, "Data to be checked/", filename, ".rds"))`<br>
**Old** indicator file (for comparisons): /`r (paste0(old_indicator_filepath))` <br>
------------------------------------------------------------------------------------
**Geographies present in indicator output file** <br>
NHS board: `r board` <br>
Council area: `r ca` <br>
Scotland: `r scot` <br>
------------------------------------------------------------------------------------
##**Data check 1:** <br>
##Expected Geographies
```{r include=FALSE}
#Code chunk for counting unique geographies & quintiles appear for each of the quintile types
#Within Scotland Quintiles - all geo types (Scotland, NHS Board and local authority) should be present but not necessarily all quintiles
# Within HB Quintiles - expect 14 nhs boards only, all quintiles (5 quintils + total=6) should be available
# Within LA Quintiles - expect 32 la only, all quintiles should be available
#df counts how many distinct quintiles for each geography by quint type, areaname and year - used in checks on quintiles present
quintile_checks <- data_indicator %>%
group_by(quint_type, trend_axis, code, areaname, geo_type, year) %>%
summarise(count_q=n()) %>%
ungroup()
#df counts how many distinct geographies by quint type and year - used to check which geographies are present
geo_checks <- quintile_checks %>%
group_by(quint_type, geo_type, trend_axis) %>%
summarise(count=n()) %>%
spread(geo_type, count) %>%
ungroup()
#flextable to check how many geogrpahy types are present each year for within scotland quintiles
ft_sc_quin_check <- geo_checks %>%
subset(quint_type=="sc_quin") %>%
flextable() %>%
color(~ S00!=1, ~S00, color = "red") %>% # Should be 14 NHS boards
color(~ S08!=14, ~S08, color = "red") %>% # Should be 14 NHS boards
color(~ S12!=32, ~S08, color = "red") %>%# Should be 14 NHS boards
autofit()
# flextable on hb quintiles - summary table of years and geogrpahy types present
ft_hb_quin_check <- geo_checks %>%
subset(quint_type=="hb_quin") %>%
flextable() %>%
color(~ !is.na(S00), ~S00, color = "red") %>% # Should be no records for scotland code
color(~ S08!=14, ~S08, color = "red") %>% # Expect 14 NHS boards for board quintiles
color(~ !is.na(S12), ~S12, color = "red") %>% # Should be no records for council codes
autofit()
# flextable on hb quintiles - summary table of years and geogrpahy types present
ft_ca_quin_check <- geo_checks %>%
subset(quint_type=="ca_quin") %>%
flextable() %>%
color(~ !is.na(S00), ~S00, color = "red") %>% # Should be no records for scotland code
color(~ S12!=32, ~S12, color = "red") %>% # Expect 14 NHS boards for board quintiles
color(~ !is.na(S08), ~S08, color = "red") %>% # Should be no records for council codes
autofit()
```
### Within Scotland deprivation quintiles
Within Scotland quintiles should return data for Scotland, nhs board and councils.<br>
Table below indicates which type & how many geography codes are present & highlights if this is unexpected count
```{r}
#print flextable
ft_sc_quin_check
```
### Within health board or council deprivation quintiles
Within hb/ca quintiles should only return data for either nhs board or councils.<br>
Conditional formatting will show up if distinct geography counts are not as expected
```{r}
#print flextable
ft_hb_quin_check
ft_ca_quin_check
```
------------------------------------------------------------------------------------
##Data check 2: Expected deprivation quintiles
###Within Health Board/Council Quintiles
```{r}
# if ca or hb quintile sum not equal to 6 theres an issue (since within geography deprivation shold always have 5 quintiles plus a total)
hbca_quintile_warning <- quintile_checks %>%
mutate(q_count= ifelse(quint_type!="sc_quin" && count_q!=6, TRUE, FALSE)) %>%
summarise(quint_error_total=sum(q_count,na.rm=TRUE))
#Text output when ca or hb quintiles not correct
ifelse(hbca_quintile_warning $quint_error_total == 0, "Within CA and HB quintiles all appear present :-)",
"STOP AND CHECK....looks like not all geographies have correct number of quintiles for within CA or HB quintiles")
```
###Within Scotland quintiles"
```{r}
sc_quintile_warning <- quintile_checks %>%
subset(quint_type=="sc_quin") %>%
mutate(q_count= ifelse(count_q!=6, TRUE, FALSE)) %>%
summarise(quint_error_total=sum(q_count,na.rm=TRUE))
#Text output when one or more sc quintile is missing from an NHS Board of Council Area
ifelse(sc_quintile_warning $quint_error_total == 0, "No geogrpahies are missing a Scotland quintile",
paste0("STOP and check table below!...There are ", sc_quintile_warning$quint_error_total[1]," geography/time periods where not all Scotland quintiles are present. This is most likely in small areas eg Island regions. If many regions are affected then indicator may not be suitable for publication."))
# df to checkk how many areas are missing one or more scottish quintiles
sc_quintile_check <- data_indicator %>%
subset(quint_type=="sc_quin") %>%
mutate(q=paste0("Q",as.character(quintile))) %>%
group_by(trend_axis, areaname, code,geo_type, q) %>%
summarise(count=n())%>%
spread(q, count) %>%
mutate(qsum=sum(Q1,Q2,Q3,Q4,Q5,QTotal, na.rm=TRUE)) %>%
ungroup() %>%
subset(qsum<6) %>%
arrange(geo_type,code, trend_axis)
# Flextable displying which geographies and years do not ahave all scotland quintiles represented
ft_sc_quintile_check <- sc_quintile_check %>%
select(geo_type, areaname, code,trend_axis, Q1:qsum) %>%
flextable() %>%
bg(i= ~Q5==1, j= ~Q5, bg="red") %>%
merge_v(j=c("areaname","code","geo_type")) %>%
autofit() %>%
theme_vanilla()
ft_sc_quintile_check
```
------------------------------------------------------------------------------------
<!-- HISTORICAL DATA CHECKS NEEDS WORK -->
<!-- ##Data check 3: Compare previous file -->
<!-- The table below compares the numerator, measure and CI of the latest file against the last shiny data file prepared.<br> -->
<!-- Conditional formatting will highlight when figures do not match, tolerance is to 3 decimal places <br> -->
<!-- Sometimes figures can change, e.g when the SMR records are more complete or new references files like the postcode lookups have been used.<br> -->
<!-- Use your judgement to decide if any differences are within acceptable range. -->
<!-- ```{r} -->
<!-- if(new_ind==FALSE){ -->
<!-- new_file <- data_indicator %>% subset(code %in% check_codes) -->
<!-- old_file <- old_data_indicator %>% subset(code %in% check_codes)%>% -->
<!-- select (code, year, quint_type,quintile, numerator, rate:rel_range) -->
<!-- #checks below only for matching rates/inequality estimates, not looking at CIs? -->
<!-- matched <- merge(x = new_file, y = old_file, by=c("code", "year","quint_type", "quintile")) %>% -->
<!-- mutate(rate_match = round((rate.x-rate.y)/rate.x,3), -->
<!-- sii_match = round((sii.x-sii.y)/sii.x,3), -->
<!-- rii_match = round((rii.x-rii.y)/rii.x,3), -->
<!-- par_match = round((par.x-par.y)/par.x,3), -->
<!-- year=as.factor(year), -->
<!-- sum_matches=sum(rate_match, sii_match, rii_match, par_match, na.rm=TRUE)) %>% -->
<!-- subset(sum_matches!=0) -->
<!-- if (nrow(matched) == 0) {"Old and new data files values match" -->
<!-- } else{ flextable(matched, -->
<!-- col_keys = c("code", "areaname","year","numerator.x", "numerator.y","numerator_match", -->
<!-- "rate.x", "rate.y","rate_match", -->
<!-- "sii.x","sii.y", "sii_match", -->
<!-- "rii.x","rii.y", "rii_match", -->
<!-- "par.x","par.y", "par_match")) %>% -->
<!-- set_header_labels(numerator.x="numerator_new", -->
<!-- numerator.y="numerator_old", -->
<!-- rate.x ="rate_new", -->
<!-- rate.y="rate_old", -->
<!-- sii.x="lowci_new", -->
<!-- sii.y="lowci_old", -->
<!-- rii.x="upci_new", -->
<!-- rii.y="upci_old", -->
<!-- par.x="upci_new", -->
<!-- par.y="upci_old") %>% -->
<!-- autofit()} -->
<!-- }else if(new_ind==TRUE){ -->
<!-- "New_ind parameter set to TRUE - no historic checks run" -->
<!-- } -->
<!-- ``` -->
---------------------------------------------------------------------------------
##Data check 4:
####Rates within Confidence intervals
Do any rates fall outwith the confidence limits - they shouldn't!?
```{r, echo=FALSE}
# Check no rates/percentages sit outside of CI range
confidence_check <- data_indicator %>%
mutate(ci_error = ifelse(rate<lowci | rate>upci, TRUE, FALSE),
sii_ci_error = ifelse(sii<lowci_sii | sii>upci_sii, TRUE, FALSE),
rii_ci_error = ifelse(rii<lowci_rii | rii>upci_rii, TRUE, FALSE)) %>%
summarise(ci_error_total = sum(ci_error,sii_ci_error,rii_ci_error, na.rm = TRUE))
ifelse(confidence_check$ci_error_total == 0, "No rates outwith confidence limits :-)",
"Uh-oh looks like there are some rates outwith confidence limits :(")
```
---------------------------------------------------------------------------------
##Data check 5:
#### Small numbers
Look at proportion of areas and quintiles where more than 10% of row include counts of less than 5 and therefore may not be robust.
```{r}
small_count_data <- data_indicator %>%
group_by(geo_type, quint_type, quintile, year) %>%
summarise(count=n(),u5=sum(numerator<5)) %>%
mutate(percent_U5=u5/count*100,
year =as.factor(year)) %>%
ungroup() %>%
subset(percent_U5>10)
flextable(small_count_data)
```
-------------------------------------------------------------------------------
<!-- ##Data check 6:-->
<!-- #### Check if there is a linear relationship across deprivation quintiles-->
<!-- This is a quick way to summarise if relationship with deprivation is clearly linear or not. If the relationship is--> <!-- <!-- obviously not linear then question whether measures such as SII and RII are helpful? -->
<!-- ```{r} -->
<!-- # look at which geographies codes/types have linear or non-linear patterns across deprivation quintiles -->
<!-- linear_trends <- data_indicator %>% -->
<!-- select(code, year, rate, quint_type, quintile, geo_type) %>% -->
<!-- subset(quintile != "Total") %>% -->
<!-- mutate(quintile=paste0("Q",quintile)) %>% -->
<!-- group_by(code, year, quint_type) %>% -->
<!-- spread(quintile, rate) %>% -->
<!-- mutate(pattern=case_when((Q1>Q2 & Q2>Q3 & Q3>Q4 & Q4>Q5)~"descending",(Q1<Q2 & Q2<Q3 & Q3<Q4 & Q4<Q5) ~"ascending", TRUE~"non_linear")) %>% -->
<!-- group_by(geo_type, quint_type, year, pattern) %>% -->
<!-- summarise(count=n()) %>% -->
<!-- group_by(geo_type, quint_type,year) %>% -->
<!-- mutate(sum=sum(count)) %>% -->
<!-- pivot_wider(names_from = pattern, values_from = count,values_fill=list(count=0)) %>% -->
<!-- mutate(percent_linear=100-round(non_linear/sum*100,1)) -->
<!-- ft <- linear_trends %>% -->
<!-- #select(geo_type,quint_type,year,ascending, descending, non_linear,sum,percent_linear) %>% -->
<!-- select_if(names(.) %in% c('geo_type','quint_type','year','ascending', 'descending','non_linear','sum','percent_linear')) %>% -->
<!-- flextable() %>% -->
<!-- merge_v(j=c("geo_type","quint_type")) %>% -->
<!-- autofit()%>% -->
<!-- theme_vanilla() -->
<!-- ft -->
<!-- ``` -->
-------------------------------------------------------------------------------
##Data check 7: Shiny app
```{r}
shinyApp(
ui = fluidPage(
fluidRow(
fluidRow(column(12,(p("Use dropdowns to adjust charts - remember geography code selected needs to match the quintile type")))),
column(5,selectInput("code_selected", "Area:", choices = unique(data_indicator$code_name),multiple = FALSE, selected="S00000001: Scotland")),
column(3,selectInput("quint_selected", "Quintile:", choices = unique(data_indicator$quint_type),multiple = FALSE, selected="sc_quin")),
column(3,selectInput("year_selected", "Year:", choices = unique(data_indicator$year),multiple = FALSE,selected=max(data_indicator$year)))),
fluidRow(column(8,(h3("Check annual rates by quintile and rate trends")))),
fluidRow(
column(6,plotlyOutput("rate_bar")),
column(6,plotlyOutput("rate_trend"))),
fluidRow(column(8,(h3("Check SII and RII trends")))),
fluidRow(
column(6,plotlyOutput("sii_trend")),
column(6,plotlyOutput("rii_trend"))),
fluidRow(column(8,(h3("Check PAR")))),
fluidRow(
column(6,plotlyOutput("par_bar")),
column(6,plotlyOutput("par_trend")))
),
server = function(input, output) {
#reactive simd trend data
simd_trend_data<- reactive({
rate_data <- data_indicator %>% # reactive dataset for trend chart
select(code,code_name, year, numerator, denominator, rate, quint_type, quintile, geo_type, par, rii, sii) %>%
subset(quint_type==input$quint_selected & code_name==input$code_selected) %>%
arrange(quintile) %>% #this is needed to make palette assignments work well
droplevels()
})
#reactive simd data single year
simd_singleyr_data<- reactive({
bar_rate_data <- simd_trend_data() %>% # reactive dataset for bar chart
subset(year==input$year_selected) %>%
mutate(average=rate[quintile=="Total"]) %>%
filter(quintile !="Total") %>%
droplevels()
})
# RATE-BAR CHART
output$rate_bar <- renderPlotly({
pal_rate_bar <- case_when( simd_singleyr_data()$quintile == "1" ~ '#022031', #Palette for plot
simd_singleyr_data()$quintile == "2" ~ '#313695',
simd_singleyr_data()$quintile == "3" ~ '#4575b4',
simd_singleyr_data()$quintile == "4" ~ '#74add1',
simd_singleyr_data()$quintile == "5" ~ '#abd9e9')
rate_bar <- plot_ly(data= simd_singleyr_data(), x=~quintile) %>%
#comparator line
add_trace(y = ~average, name = "Average", type = 'scatter', mode = 'lines',
line = list(color = '#FF0000'), hoverinfo="skip") %>%
#quintile bars
add_bars(y = ~rate, color = ~ quintile, marker = list(color = pal_rate_bar)) %>%
layout(title=paste0("Year: ",input$year_selected),bargap = 0.1, margin=list(b = 140),
showlegend = FALSE) %>%
config(displayModeBar = F, displaylogo = F, editable =F) # taking out toolbar
})
# RATE-LINE CHART
output$rate_trend <- renderPlotly({
#Palette for trend plot
pal_simd_trend <- case_when(simd_trend_data()$quintile == "1" ~ '#022031',
simd_trend_data()$quintile == "2" ~ '#313695',
simd_trend_data()$quintile == "3" ~ '#4575b4',
simd_trend_data()$quintile == "4" ~ '#74add1',
simd_trend_data()$quintile == "5" ~ '#abd9e9',
simd_trend_data()$quintile == "Total" ~ '#FF0000')
tooltiptext_trend <- c(paste0(simd_trend_data()$year, ": ", simd_trend_data()$rate))
#Create trend plot
rate_trend <- plot_ly(data= simd_trend_data(), x=~year,hoverinfo="text") %>%
add_lines(y = ~rate, name = ~quintile, type = 'scatter',text = tooltiptext_trend,
mode = 'lines', color = ~quintile, colors = pal_simd_trend) %>%
layout(yaxis = list(rangemode = "tozero")) # force yaxis to start at zero
})
# SII
output$sii_trend <- renderPlotly({
index_data <- simd_trend_data() %>% filter((quintile == "Total"))
tooltiptext_sii <- c(paste0(index_data$year, ": ", index_data$sii))
sii_trend <-plot_ly(data=index_data, x=~year, hoverinfo="text") %>%
add_lines(y = ~sii, name = "Absolute inequality (SII)", type = 'scatter',
text = tooltiptext_sii, mode = 'lines',
line = list(color = '#74add1')) %>%
layout(yaxis = list(rangemode = "tozero")) %>% # force yaxis to start at zero
config(displayModeBar = FALSE, displaylogo = F, editable =F) # taking out toolbar
})
#RII
output$rii_trend <- renderPlotly({
index_data <- simd_trend_data() %>% filter((quintile == "Total"))
tooltiptext_rii <- c(paste0(index_data$year, ": ", index_data$rii))
plot_ly(data=index_data, x=~year,hoverinfo="text") %>%
add_lines(y = ~rii, name = "Relative gap", type = 'scatter',
text = tooltiptext_rii, mode = 'lines',line = list(color = '#313695')) %>%
layout(yaxis = list(rangemode = "tozero")) %>% # force yaxis to start at zero
config(displayModeBar = FALSE, displaylogo = F, editable =F) # taking out toolbar
})
#PAR Barchart
output$par_bar <- renderPlotly({
par_bar_data <- simd_singleyr_data() %>%
mutate(baseline = rate[quintile == "5"],
diff_baseline = rate - rate[quintile == "5"]) %>%
droplevels()
plot_ly(data = par_bar_data, x = ~quintile,hoverinfo="text") %>%
add_bars(y = ~baseline, name= "", marker = list(color = "#4da6ff"), showlegend = FALSE) %>%
add_bars(y = ~diff_baseline, name = "Attributable to deprivation",
marker = list(color = "#ffa64d"), showlegend = FALSE) %>%
layout(title=paste0("Year: ",input$year_selected),
bargap = 0.1, barmode = 'stack', showlegend = T,
legend = list(x = 0.9, y = 0.9),
margin = list(b = 140)) %>%
config(displayModeBar = FALSE, displaylogo = F, editable =F) # taking out toolbar
})
#Par Trends
output$par_trend <- renderPlotly({
#preparing data needed
simd_partrend_data <- simd_trend_data() %>%
subset(quintile == "Total") %>%
droplevels()
par_trend_plot <- plot_ly(data=simd_partrend_data, x=~year,
hoverinfo="text") %>%
add_lines(y = ~par, type = 'scatter', mode = 'lines', line = list(color = "#4575b4")) %>%
layout(margin = list(b = 140),
yaxis = list(rangemode = "tozero")) %>% # force yaxis to start at zero) %>% #to avoid labels getting cut out
config(displayModeBar = FALSE, displaylogo = F, editable =F) # taking out toolbar
})
},# server close
options = list(height = 2000)
)
```