-
Notifications
You must be signed in to change notification settings - Fork 0
/
stats19_glm.R
59 lines (46 loc) · 2.68 KB
/
stats19_glm.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
library(tidyverse)
rm (list = ls())
load(file="Stats19_05-15_ready_31_1_2017.rda") # contains single object Stats19
## ss19 <- sample_frac(ss19, 0.01) # if we want a 1% random sample
###
# [1] "accident_index" "year" "roadtype"
# [4] "cas_severity" "cas_mode" "cas_male"
# [7] "strike_mode" "strike_male"
###
ss19 <- select(d, everything()) %>%
mutate(year=factor(year)) %>% # treat year as categorical for the moment
filter(cas_male %in% c(1, 0)) %>%
filter(cas_mode %in% c("cyclist","pedestrian", "car/taxi"))
ssg <- group_by(ss19, year, cas_mode, cas_male, cas_severity, strike_mode) %>%
summarise(count=n()) %>%
droplevels() %>%
as.data.frame() %>% # remove "grouped" class, which breaks filling with zeroes
mutate(cas_dist = ifelse(cas_mode=="pedestrian", 134.6,
ifelse(cas_mode=="cyclist", 30.9,
ifelse(cas_mode=="car/taxi", 4000.1, NA)))) %>%
mutate(strike_dist = ifelse(strike_mode=="pedestrian", 134.6,
ifelse(strike_mode=="cyclist", 30.9,
ifelse(strike_mode=="car/taxi", 2419.1, NA)))) %>%
complete(year, cas_mode, cas_male, cas_severity, strike_mode, strike_dist, cas_dist, fill=list(count=0))
fit <- glm(count ~ cas_male + cas_mode + cas_severity + strike_mode, data=ssg, family=poisson)
## note: Can't include both casualty mode and distance in the same model if distance is just a function of mode
## add predicted count with standard error
pred <- predict(fit, se.fit=TRUE, type="response")
ssg <- mutate(ssg, count_fit=pred[["fit"]], count_se=pred[["se.fit"]])
as.data.frame(ssg)[1:10,]
## Predicted counts likely biased and SEs underestimated because we are not including all relevant predictors
## So now fit a "saturated" model including all variables and their full interactions
fitsat <- glm(count ~ cas_male*cas_mode*cas_severity*strike_mode, data=ssg, family=poisson)
pred <- predict(fitsat, se.fit=TRUE, type="response")
ssg <- mutate(ssg, count_fit_sat=pred[["fit"]], count_se_sat=pred[["se.fit"]])
options(scipen=1e+07) # turn off scientific notation for small numbers
as.data.frame(ssg)[1:10,]
## fitted counts match observed counts more closely, and standard errors are bigger.
## Model comparison (lower AIC: saturated model is more efficient despite having many more parameters (df))
AIC(fit, fitsat)
## TODO
## Include distance properly
## Expert consideration of what variables/interactions should always be included
## Some kind of constraints to represent prior knowledge about what should be excluded
## Negative binomial model may fit better
## ...