-
Notifications
You must be signed in to change notification settings - Fork 0
/
practice.Rmd
367 lines (290 loc) · 11.7 KB
/
practice.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
---
title: "Practising_w_swc"
author: "Laura"
date: "12/16/2019"
output: html_document
---
```{r setup, include=FALSE}
# Document-wide chunk defaults: source code, printed results, messages and
# warnings are all suppressed, and figures are written with the "Figs/" prefix.
knitr::opts_chunk$set(
  echo = FALSE,
  results = "hide",
  message = FALSE,
  warning = FALSE,
  fig.path = "Figs/",
  fig.width = 11
)

# Packages used by the chunks in this document.
library(tidyverse)
library(lubridate)
library(babynames)
```
```{r}
# Build a small example table, round-trip it through a CSV file, then load the
# gapminder data that the following sections work with.
cats <- tibble(
  coat = c("calico", "black", "tabby"),
  weight = c(2.1, 5.0, 3.2),
  likes_string = c(1, 0, 1)
)

# The target file is passed positionally: readr 1.4.0 deprecated the `path`
# argument of write_csv() in favour of `file`, and the positional form is
# accepted by both older and newer readr versions.
cats %>%
  write_csv("data/feline-data.csv")
read_csv("data/feline-data.csv")

gapminder <- read_csv("data/gapminder_data.csv")
str(gapminder)
colnames(gapminder)
```
### Creating Publication-Quality Graphics with ggplot2
```{r}
# Life expectancy over time: points plus one line per country, with the lines
# coloured by continent. `group` is the ggplot2 aesthetic that decides which
# rows are connected; `by` is not an aesthetic that ggplot2 defines.
gapminder %>%
  ggplot(aes(x = year, y = lifeExp, group = country)) +
  geom_point() +
  geom_line(aes(color = continent))

# Life expectancy against GDP per capita on a log10 x axis, with one linear
# fit per continent.
gapminder %>%
  ggplot(
    aes(x = gdpPercap, y = lifeExp, group = continent, color = continent)
  ) +
  geom_point(aes(shape = continent), alpha = 0.5) +
  geom_smooth(method = "lm") +
  scale_x_log10()
# Life expectancy through time for the Americas, one panel per country, with
# the x-axis labels tilted by 45 degrees.
gapminder %>%
  filter(continent == "Americas") %>%
  ggplot(mapping = aes(x = year, y = lifeExp)) +
  geom_line() +
  facet_wrap(vars(country)) +
  theme(axis.text.x = element_text(angle = 45))
# Americas panel plot with axis labels and a title, stored in a variable so it
# can be written to disk below.
lifeExp_plot <- gapminder %>%
  filter(continent == "Americas") %>%
  ggplot(aes(x = year, y = lifeExp)) +
  facet_wrap(~ country) +
  geom_line() +
  # No colour aesthetic is mapped in this plot, so no legend title is set.
  labs(
    x = "Year", # x axis title
    y = "Life expectancy", # y axis title
    title = "Figure 1" # main title of figure
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Make sure the output folder exists before saving into it.
dir.create("results", showWarnings = FALSE, recursive = TRUE)
ggsave(
  filename = "results/lifeExp.png",
  plot = lifeExp_plot,
  width = 12,
  height = 10,
  units = "cm",
  dpi = 300
)
```
```{r}
# Box plot of life expectancy for each year, one facet column per continent;
# the x-axis text and ticks are blanked out.
gapminder %>%
  ggplot(aes(x = year, y = lifeExp)) +
  geom_boxplot(aes(group = year)) +
  facet_grid(. ~ continent) +
  labs(x = "", y = "Life Expectancy") +
  theme(
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank()
  )
```
### Dataframe Manipulation with dplyr
```{r}
# Life expectancy, country and year for the rows belonging to Africa.
filter(gapminder, continent == "Africa") %>%
  select(lifeExp, country, year)

# Mean life expectancy per country sorted from highest to lowest; tail() then
# keeps the last rows of that ordering, i.e. the lowest means.
gapminder %>%
  group_by(country) %>%
  summarise(mean_life_exp = mean(lifeExp)) %>%
  arrange(desc(mean_life_exp)) %>%
  tail()
```
Calculate the average life expectancy in 2002 of 2 randomly selected countries for each continent. Then arrange the continent names in reverse order. Hint: Use the dplyr functions arrange() and sample_n(), they have similar syntax to other dplyr functions.
```{r}
# Per continent and year: the mean GDP per capita, and the mean of a projected
# GDP per capita that is multiplied by 1.5 wherever life expectancy exceeds 40.
gdp_future_bycontinents_byyear_high_lifeExp <- gapminder %>%
  mutate(
    gdp_futureExpectation = ifelse(lifeExp > 40, gdpPercap * 1.5, gdpPercap)
  ) %>%
  group_by(continent, year) %>%
  summarize(
    mean_gdpPercap = mean(gdpPercap),
    mean_gdpPercap_expected = mean(gdp_futureExpectation)
  )
gapminder %>%
  # First letter of each country name
  mutate(startsWith = substr(country, 1, 1)) %>%
  # Keep only the countries whose name begins with "A" or "Z"
  filter(startsWith %in% c("A", "Z")) %>%
  # One panel per country, lines coloured by continent
  ggplot(aes(x = year, y = lifeExp, colour = continent)) +
  geom_line() +
  facet_wrap(vars(country))
# Mean 2002 life expectancy of two randomly drawn rows per continent, listed
# in descending order of continent. The draw is random, so results differ
# between runs.
gapminder %>%
  filter(year == 2002) %>%
  group_by(continent) %>%
  sample_n(size = 2) %>%
  summarise(mean_le = mean(lifeExp)) %>%
  arrange(desc(continent))
```
### Dataframe Manipulation with tidyr
```{r}
# Wide-format gapminder table from the Software Carpentry lesson repository.
gap_wide <- read_csv(
  "https://raw.githubusercontent.com/swcarpentry/r-novice-gapminder/gh-pages/_episodes_rmd/data/gapminder_wide.csv"
)
glimpse(gap_wide)

# Stack every gdp*, lifeExp* and pop* column into label/value pairs.
gap_long <- gap_wide %>%
  pivot_longer(
    cols = c(starts_with("gdp"), starts_with("lifeExp"), starts_with("pop")),
    names_to = "obstype_year",
    values_to = "obs_value"
  )
glimpse(gap_long)

# Split the combined label into the metric name and the year; convert = TRUE
# asks separate() to type-convert the new columns.
gap_long <- gap_long %>%
  separate(obstype_year, into = c("obstype", "year"), convert = TRUE)
glimpse(gap_long)
gap_long

# Mean observed value for each continent and metric.
gap_long %>%
  group_by(continent, obstype) %>%
  summarize(mean = mean(obs_value))

# Intermediate layout: one column per metric.
gap_normal <- gap_long %>%
  pivot_wider(names_from = obstype, values_from = obs_value)
gap_normal
```
Now let’s convert the long format all the way back to the wide format. In the wide format, we will keep country and continent as ID variables and pivot the observations across the 3 metrics (pop, lifeExp, gdpPercap) and time (year). First we need to create appropriate labels for all our new variables (time*metric combinations) and we also need to unify our ID variables to simplify the process of defining gap_wide.
```{r}
gap_long

# Rebuild the metric/year labels and merge the two ID columns into one before
# spreading the values back out into columns.
gap_long %>%
  unite(col = "obstype_year", obstype, year) %>%
  unite(col = "continent_country", continent, country) %>%
  pivot_wider(names_from = obstype_year, values_from = obs_value)

# Variant that also folds the country into the column labels.
gap_long %>%
  unite(col = "country_obstype_year", country, obstype, year) %>%
  pivot_wider(names_from = country_obstype_year, values_from = obs_value)
```
### Data Analysis and Visualization in R for Ecologists
```{r}
# c() coerces mixed inputs to one common type; inspect the class of each
# combination.
num_char <- c(1, 2, 3, "a")
class(num_char)
num_logical <- c(1, 2, 3, TRUE)
class(num_logical)
char_logical <- c("a", "b", "c", TRUE)
class(char_logical)
tricky <- c(1, 2, 3, "4")
class(tricky)

heights <- c(
  63, 69, 60, 65, NA, 68, 61, 70, 61, 59, 64,
  69, 63, 63, NA, 72, 65, 64, 70, 63, 65
)
# Median height and the number of heights above 67, ignoring missing values.
median(heights, na.rm = TRUE)
sum(heights > 67, na.rm = TRUE)
```
```{r}
# Fetch the Portal survey data only when no local copy exists, so re-knitting
# does not download the file again.
if (!file.exists("data/portal_data_joined.csv")) {
  dir.create("data", showWarnings = FALSE)
  download.file(
    url = "https://ndownloader.figshare.com/files/2292169",
    destfile = "data/portal_data_joined.csv"
  )
}
surveys <- read_csv("data/portal_data_joined.csv")
glimpse(surveys)
str(surveys)

# Build the date column once, directly from the numeric components. Parsing a
# pasted "month-day-year" string with mdy() first is not needed: that result
# would be overwritten by this assignment straight away.
surveys$date <- make_date(surveys$year, surveys$month, surveys$day)

# Rows whose year/month/day combination did not produce a date.
surveys %>%
  select(month, day, year, date) %>%
  filter(is.na(date))
```
Using pipes, subset the surveys data to include animals collected before 1995 and retain only the columns year, sex, and weight.
```{r}
# Year, sex and weight for the records from before 1995.
surveys %>%
  select(year, sex, weight) %>%
  filter(year < 1995)
```
Create a new data frame from the surveys data that meets the following criteria: contains only the species_id column and a new column called hindfoot_half containing values that are half the hindfoot_length values. In this hindfoot_half column, there are no NAs and all values are less than 30.
```{r}
# Two routes to the same table: cap hindfoot_length at 60 before halving it...
mine <- surveys %>%
  filter(!is.na(hindfoot_length) & hindfoot_length < 60) %>%
  mutate(hindfoot_half = hindfoot_length / 2) %>%
  select(species_id, hindfoot_half)

# ...or halve it first and cap the halved value at 30.
surveys_hindfoot_half <- surveys %>%
  filter(!is.na(hindfoot_length)) %>%
  mutate(hindfoot_half = hindfoot_length / 2) %>%
  filter(hindfoot_half < 30) %>%
  select(species_id, hindfoot_half)

# Check that both approaches give the same result.
all.equal(mine, surveys_hindfoot_half)
```
How many animals were caught in each plot_type surveyed?
Use group_by() and summarize() to find the mean, min, and max hindfoot length for each species (using species_id). Also add the number of observations (hint: see ?n).
What was the heaviest animal measured in each year? Return the columns year, genus, species_id, and weight.
```{r}
# Heaviest record(s) of each year. Missing weights are removed before the
# per-year maximum is taken, otherwise max() would return NA for that year.
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(year) %>%
  filter(weight == max(weight)) %>%
  select(year, genus, species_id, weight) %>%
  arrange(year)

surveys

# Number of records per plot type.
surveys %>%
  count(plot_type)

# Hindfoot lengths recorded for species SS, ST and UR.
surveys %>%
  filter(species_id %in% c("SS", "ST", "UR")) %>%
  select(species_id, hindfoot_length)

# Mean, minimum, maximum and number of non-missing hindfoot lengths per
# species.
surveys %>%
  filter(!is.na(hindfoot_length)) %>%
  group_by(species_id) %>%
  summarise(
    mean = mean(hindfoot_length),
    min = min(hindfoot_length),
    max = max(hindfoot_length),
    n = n()
  )
```
Spread the surveys data frame with year as columns, plot_id as rows, and the number of genera per plot as the values. You will need to summarize before reshaping, and use the function n_distinct() to get the number of unique genera within a particular chunk of data. It’s a powerful function! See ?n_distinct for more.
```{r}
# Number of distinct genera per plot and year, reshaped so that each plot_id
# is a row and each year is a column.
swide <- surveys %>%
  group_by(year, plot_id) %>%
  summarise(unique_genera = n_distinct(genus)) %>%
  arrange(plot_id) %>%
  pivot_wider(
    names_from = year,
    values_from = unique_genera
  )
```
Now take that data frame and gather() it again (the chunk below uses pivot_longer(), the current tidyr equivalent), so each row is a unique plot_id by year combination.
```{r}
# Back to long form: one row per plot_id/year pair. Every column of swide
# other than plot_id is a year column, so the columns are selected by
# exclusion. A `1977`:`2002` range would depend on the column order produced
# by pivot_wider() and on which years the data contains.
swide %>%
  pivot_longer(
    cols = -plot_id,
    names_to = "year",
    values_to = "distinct_genera"
  )
```
The surveys data set has two measurement columns: hindfoot_length and weight. This makes it difficult to do things like look at the relationship between mean values of each measurement per year in different plot types. Let’s walk through a common solution for this type of problem. First, use gather() (the chunk below uses pivot_longer(), the current tidyr equivalent) to create a dataset where we have a key column called measurement and a value column that takes on the value of either hindfoot_length or weight. Hint: You’ll need to specify which columns are being gathered.
```{r}
# Stack the two measurement columns: `measurement` receives the original
# column name and `value` the recorded number.
survlong <- surveys %>%
  pivot_longer(
    cols = c(hindfoot_length, weight),
    names_to = "measurement",
    values_to = "value"
  )
```
With this new data set, calculate the average of each measurement in each year for each different plot_type. Then spread() them (the chunk below uses pivot_wider(), the current tidyr equivalent) into a data set with a column for hindfoot_length and weight. Hint: You only need to specify the key and value columns for spread().
```{r}
# Mean of each measurement per year and plot type, with missing values
# dropped, then spread out to one column per measurement.
survlong %>%
  filter(!is.na(value)) %>%
  group_by(year, plot_type, measurement) %>%
  summarize(mean = mean(value)) %>%
  pivot_wider(names_from = measurement, values_from = mean)
```
### Some plotting
```{r}
surveys_complete <- read_csv("data/portal_data_joined.csv")

# Number of records for every year, genus and sex combination.
yearly_sex_counts <- surveys_complete %>%
  count(year, genus, sex)

yearly_sex_counts %>%
  ggplot(aes(x = year, y = n, color = sex)) +
  geom_line() +
  facet_wrap(vars(genus)) +
  labs(
    title = "Observed genera through time",
    x = "Year of observation",
    y = "Number of individuals"
  ) +
  # theme_bw() has to come before theme(): if a theme_*() call is put at the
  # end, the other theme statement is not followed (at least not well).
  theme_bw() +
  theme(
    axis.text.x = element_text(
      colour = "grey20", size = 12, angle = 90, hjust = 0.5, vjust = 0.5
    ),
    axis.text.y = element_text(colour = "grey20", size = 12),
    text = element_text(size = 16)
  )
```
### Primers dplyr challenge
```{r}
# Rank names within each year and sex, keep the top-ranked boys' name(s) of
# every year, and count how many distinct names that is.
babynames %>%
  group_by(year, sex) %>%
  mutate(rank = min_rank(desc(n))) %>%
  filter(rank == 1 & sex == "M") %>%
  ungroup() %>%
  summarise(distinct = n_distinct(name))

# Alternative via the yearly maximum: the number of years in which each name
# had the highest n among boys.
babynames %>%
  filter(sex == "M") %>%
  group_by(year) %>%
  filter(n == max(n)) %>%
  ungroup() %>%
  count(name)
```
Which gender uses more names?
In the chunk below, calculate and then plot the number of distinct names used each year for boys and girls. Place year on the x axis, the number of distinct names on the y axis and color the lines by sex.
```{r}
# Number of distinct names per year, one line per sex.
babynames %>%
  group_by(sex, year) %>%
  summarize(distinct = n_distinct(name)) %>%
  ggplot(aes(x = year, y = distinct, colour = sex)) +
  geom_line() +
  theme_bw()
```
Let’s make sure that we’re not confounding our search with the total number of boys and girls born each year. With the chunk below, calculate and then plot over time the total number of boys and girls by year. Is the relative number of boys and girls constant?
```{r}
# Sum of n per year, one line per sex.
babynames %>%
  group_by(sex, year) %>%
  summarize(total = sum(n)) %>%
  ggplot(aes(x = year, y = total, colour = sex)) +
  geom_line()
```
Hmm. Sometimes there are more girls and sometimes more boys. In addition, the entire population has grown over time. Let’s account for this with a new metric: the average number of children per name.
If girls have a smaller number of children per name, that would imply that they use more names overall (and vice versa).
In the chunk below, calculate and plot the average number of children per name by year and sex over time. How do you interpret the results?
```{r}
# Mean of n (children per name) for every year, one line per sex.
babynames %>%
  group_by(sex, year) %>%
  summarise(average = mean(n)) %>%
  ggplot(aes(x = year, y = average, colour = sex)) +
  geom_line()
```