-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathProcess raw data.R
148 lines (131 loc) · 6.69 KB
/
Process raw data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# When working the personal laptop
setwd('C:/Users/min/OneDrive - IIASA/STAC - Diversity Committee 2017/Family survey 2017/Result')
### Read in the raw data
raw <- read.xlsx("Family well-being survey_final.xlsx", 1)
# New name list for the variables
a <- c("id", "timestamp", "association", "gender", "tot.duration", "ftype", "sep.duration", "sep.reason.mult", "sep.reason.open", "sep.join",
"child.duration", "child.age1", "child.age2", "child.age3", "child.age4", "child.age5", "stayhome", "nursingleave",
"childcare.info1", "childcare.info2", "childcare.info3", "childcare.info4", "childcare.info5",
"childcare.exp", "fam.beihilf", "fam.bei.reason.open", "fam.bei.applic",
"child.ins.cover", "cover.reason", "pediatrician", "ped.reason.open",
"insu.type", "ins.concern.open",
"Austrian", "EU.citizen", "german", "german.study", "german.reason.mult", "german.challenge.open",
"socializing", "circle.mult", "social.challenge.open", "partner.work", "search.time", "relev.field", "searching", "voluntary",
"barrier.mult", "search.challenge.open",
"challenge.soc", "challenge.fin", "challenge.job", "challenge.child", "flextime", "telecom", "dualcareer",
"spec.measure.open", "comment.open")
names(raw) <- a
### Construct a master data table for analysis and change data types (factor -> character)
master <- raw %>% arrange(ftype) %>%
mutate_at(vars(ends_with("open")), as.character) %>%
mutate_at(vars(ends_with("mult")), as.character) %>%
mutate_at(vars(ends_with("mult")), strsplit, ";") %>%
rowwise() %>%
mutate(num_child = 5-(is.na(child.age1)+is.na(child.age2)+is.na(child.age3)+is.na(child.age4)+is.na(child.age5))) %>%
ungroup() %>%
mutate(ftype = as.factor(ftype)) %>%
mutate_at(vars(starts_with("challenge.")), function(x){factor(x, levels=c("We don't have this problem.",
"A minor problem.",
"Somewhat problematic.",
"Serious.",
"Extremely serious. I can consider leaving because of this."))}) %>%
mutate(tot.duration=factor(tot.duration, levels=c("Less than a year", "1-3 years", "3-5 years", "5-10 years", "More than 10 years"))) %>%
mutate(short = (tot.duration=="Less than a year" | tot.duration=="1-3 years")) %>% # Doesn't mean much for Austrians
mutate(origin = ifelse(Austrian!="No", 1, ifelse(EU.citizen!="No", 2, 3))) %>%
mutate(origin = factor(origin, labels=c("Austrian", "Other-EU", "Non-EU"))) %>%
select(id, ftype, origin, short, everything()) %>%
mutate(gender_factor = factor(gender, labels=c("Female employees", "Male employees", "Prefer not to say")))
### Separating the mult-type answers
mult.vars <- names(master %>% select(ends_with("mult")))
mults <- list()
for (i in 1:length(mult.vars)) {
# Unnest those grouped strings for each mult var (end up with long table)
mults[[i]] <- master %>% select(id, ends_with("mult")) %>% unnest_(mult.vars[i])
# Spread (long to wide) the table. New columns are in the order of number of answers
mults[[i]] <- mults[[i]] %>%
mutate(ord=factor(get(mult.vars[i]),
levels=names(table(get(mult.vars[i])))[order(table(get(mult.vars[i])), decreasing=TRUE)],
labels=1:length(table(get(mult.vars[i]))))) %>%
mutate_cond(is.na(ord), ord=1) %>% # To incorporate rows with NA for this mult variable
mutate(ord=paste0(mult.vars[i], str_pad(ord, 2, pad="0"))) %>% # To be used as names for the new columns
spread(ord, get(mult.vars[i]))
master <- left_join(master, mults[[i]])
}
# Erase the original mult vars
master <- select(master, -ends_with("mult"))
###Separating master based on family ftype (in case we need to treat them differently. May not be used)
# Level 3 "single parent of a 2 year old" - not branched correctly, so need to be ignored.
master.single <- master %>% filter(as.integer(ftype)==1) %>% select_if(~sum(!is.na(.)) > 0)
master.sep.family <- master %>% filter(as.integer(ftype)==2) %>% select_if(~sum(!is.na(.)) > 0)
master.fam.nokid <- master %>% filter(as.integer(ftype)==4)
master.fam.kid <- master %>% filter(as.integer(ftype)==5) %>%
mutate_cond(is.na(tot.duration), tot.duration="More than 10 years") # One specific case
master.fam <- rbind(master.fam.nokid, master.fam.kid) %>%
mutate(cat = paste0(as.integer(ftype), as.integer(origin), as.integer(short))) %>%
# mutate(short = (tot.duration=="Less than a year" | tot.duration=="1-3 years")) %>%
select_if(~sum(!is.na(.)) > 0) %>%
select(id, cat, ftype, origin, short, everything())
### Text mining?
### Will not be used - Not giving a concrete result
# library(tidytext)
# library(janeaustenr)
# library(topicmodels)
# library(mallet)
#
# word.freq <- master %>% select(comment.open) %>% unnest_tokens(word, comment.open) %>%
# anti_join(stop_words) %>%
# count(word, sort = TRUE) %>% filter(!is.na(word))
#
# word.freq %>%
# inner_join(get_sentiments("bing")) %>%
# count(sentiment)
#
# word.dtm <- word.freq %>% mutate(document=1) %>% cast_dtm(document, word, n)
#
# ap_lda <- LDA(word.dtm, k = 2, control = list(seed = 1234))
# ap_topics <- tidy(ap_lda, matrix = "beta")
#
# ap_top_terms <- ap_topics %>%
# group_by(topic) %>%
# top_n(10, beta) %>%
# ungroup() %>%
# arrange(topic, -beta)
#
# ap_top_terms %>%
# mutate(term = reorder(term, beta)) %>%
# ggplot(aes(term, beta, fill = factor(topic))) +
# geom_col(show.legend = FALSE) +
# facet_wrap(~ topic, scales = "free") +
# coord_flip()
#
# beta_spread <- ap_topics %>%
# mutate(topic = paste0("topic", topic)) %>%
# spread(topic, beta) %>%
# dplyr::filter(topic1 > .001 | topic2 > .001) %>%
# mutate(log_ratio = log2(topic2 / topic1))
#
# beta_spread %>% arrange(log_ratio) %>% slice(c(1:10,(dim(beta_spread)-9):dim(beta_spread))) %>%
# ggplot(aes(x=log_ratio, y=term, fill="lightgreen")) + geom_bar()
#
#
# # Tutorial
# tidy_books <- austen_books() %>%
# group_by(book) %>%
# mutate(linenumber = row_number(),
# chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
# ignore_case = TRUE)))) %>%
# ungroup() %>%
# unnest_tokens(word, text)
# nrcjoy <- get_sentiments("nrc") %>%
# filter(sentiment == "joy")
# tidy_books %>%
# filter(book == "Emma") %>%
# inner_join(nrcjoy) %>%
# count(word, sort = TRUE)
# tidy_books %>%
# inner_join(get_sentiments("bing")) %>%
# count(book, index = linenumber %/% 80, sentiment)
#
# ap_td <- tidy(AssociatedPress)
#
# mallet_model <- MalletLDA(num.topics = 4)