To view the slides below in presentation mode, open lecture.html in a web browser.
Arvind R. Subramaniam
Assistant Member
Basic Sciences Division and Computational Biology Program
Fred Hutchinson Cancer Research Center
- Lecture 13 – Working with data using R / `tidyverse`
- `Tidyverse` Functions for Working with Tabular Data
- Read tabular data into a DataFrame (tibble)
- Use the pipe `%>%` operator to chain commands
- All these commands produce the same output.
- Simple Data Manipulations – `select` columns
- Simple Data Manipulations – `filter` rows
- Simple Data Manipulations – `arrange` rows
- Use `mutate` to create new columns
- Use `mutate` to create new columns
- Use `mutate` to modify existing columns
- Use `TYPE_join` to join two data frames
- `inner_join` keeps common rows
- `left_join` keeps all rows in left data frame
- `right_join` keeps all rows in right data frame
- Use `summarize` to calculate stats across rows
- Use `summarize` to calculate stats across rows
- Use `group_by` to group subsets of rows
- Use `group_by` to group subsets of rows
- `group_by` + `summarize` for statistics by group
- `group_by` + `summarize` for statistics by group
- `%>%` enables complex data analysis pipelines
- `%>%` and `+` if you want to plot
| Import/Export | Visualize | Transform |
|---|---|---|
| read_tsv | geom_point | select |
| write_tsv | geom_line | filter |
| | facet_grid | arrange |
| | | mutate |
| | | join |
| | | group_by |
| | | summarize |
# Attach the tidyverse packages; read_tsv() and %>% below come from them.
library(tidyverse)

# Read the first example dataset from a tab-separated file.
data <- read_tsv("data/example_dataset_1.tsv")

# Show the first 5 rows with an ordinary function call ...
print(data, n = 5)

# ... and again with the pipe, which passes `data` as the first argument.
data %>%
  print(n = 5)
- `print(data, n = 5)`
- `data %>% print(., n = 5)`
- `data %>% print(n = 5)`
# Preview the first 2 rows with all columns.
data %>%
  print(n = 2)

# Keep only the listed columns, then preview the first 2 rows.
data %>%
  select(
    strain,
    mean_ratio,
    insert_sequence,
    kozak_region
  ) %>%
  print(n = 2)
# Keep only the rows whose kozak_region equals "A".
data %>%
  filter(kozak_region == "A")

# Several conditions in one filter() call: a row must satisfy all of them.
data %>%
  filter(
    kozak_region == "A",
    insert_sequence == "10×AGA"
  )

# Equivalent form: chain two filter() calls, one condition each.
data %>%
  filter(kozak_region == "A") %>%
  filter(insert_sequence == "10×AGA")
# Sort the rows by mean_ratio (arrange() sorts in ascending order by default).
data %>%
  arrange(mean_ratio)
# Replace `data` with the second example dataset; print() passes the tibble
# through, so the assignment still receives it.
data <- read_tsv("data/example_dataset_2.tsv") %>%
  print()
# A tibble: 16 x 3
#    strain  mean_yfp mean_rfp
#    <chr>      <int>    <int>
#  1 schp688     1748    20754
#  2 schp684     3294    20585
#  3 schp690     3535    20593
#  4 schp687     4658    20860
#  5 schp686     5000    21171
#  6 schp685     7379    22956
#  7 schp683     9365    23866
#  8 schp689     8693    22649
#  9 schp679     2528    19906
# 10 schp675     3687    20438
# 11 schp681     3705    20227
# 12 schp678     4378    20630
# 13 schp677     3967    20604
# 14 schp676     2657    20223
# 15 schp674     1270    20316
# 16 schp680     1117    19377
# Create a new column, mean_ratio, and store the result back into `data`.
data <- data %>%
  mutate(mean_ratio = mean_yfp / mean_rfp) %>%
  print()

# Overwrite an existing column: round mean_ratio to 2 decimal places.
# The result is only printed, not assigned, so `data` keeps the unrounded values.
data %>%
  mutate(mean_ratio = round(mean_ratio, 2)) %>%
  print()
# Read the per-strain annotations (third example dataset) and print them.
annotations <- read_tsv("data/example_dataset_3.tsv") %>%
  print()
# A tibble: 17 x 3
#    strain  insert_sequence kozak_region
#    <chr>   <chr>           <chr>
#  1 schp674 10×AAG          G
#  2 schp675 10×AAG          B
#  3 schp676 10×AAG          F
#  4 schp677 10×AAG          E
#  5 schp678 10×AAG          D
#  6 schp679 10×AAG          A
#  7 schp680 10×AAG          H
#  8 schp681 10×AAG          C
#  9 schp683 10×AGA          G
# 10 schp684 10×AGA          B
# 11 schp685 10×AGA          F
# 12 schp686 10×AGA          E
# 13 schp687 10×AGA          D
# 14 schp688 10×AGA          A
# 15 schp689 10×AGA          H
# 16 schp690 10×AGA          C
# 17 control <NA>            <NA>
# inner_join: keep only the strains present in both `data` and `annotations`.
data %>%
  inner_join(annotations, by = "strain") %>%
  print()
# A tibble: 16 x 6
#    strain  mean_yfp mean_rfp mean_ratio insert_sequence kozak_region
#    <chr>      <int>    <int>      <dbl> <chr>           <chr>
#  1 schp688     1748    20754     0.0842 10×AGA          A
#  2 schp684     3294    20585      0.160 10×AGA          B
#  3 schp690     3535    20593      0.172 10×AGA          C
#  4 schp687     4658    20860      0.223 10×AGA          D
#  5 schp686     5000    21171      0.236 10×AGA          E
#  6 schp685     7379    22956      0.321 10×AGA          F
#  7 schp683     9365    23866      0.392 10×AGA          G
#  8 schp689     8693    22649      0.384 10×AGA          H
#  9 schp679     2528    19906      0.127 10×AAG          A
# 10 schp675     3687    20438      0.180 10×AAG          B
# 11 schp681     3705    20227      0.183 10×AAG          C
# 12 schp678     4378    20630      0.212 10×AAG          D
# 13 schp677     3967    20604      0.193 10×AAG          E
# 14 schp676     2657    20223      0.131 10×AAG          F
# 15 schp674     1270    20316     0.0625 10×AAG          G
# 16 schp680     1117    19377     0.0576 10×AAG          H
# left_join: keep every row of `data` (the left table) and attach the
# matching annotation columns by strain.
data %>%
  left_join(annotations, by = "strain") %>%
  print()
# A tibble: 16 x 6
#    strain  mean_yfp mean_rfp mean_ratio insert_sequence kozak_region
#    <chr>      <int>    <int>      <dbl> <chr>           <chr>
#  1 schp688     1748    20754     0.0842 10×AGA          A
#  2 schp684     3294    20585      0.160 10×AGA          B
#  3 schp690     3535    20593      0.172 10×AGA          C
#  4 schp687     4658    20860      0.223 10×AGA          D
#  5 schp686     5000    21171      0.236 10×AGA          E
#  6 schp685     7379    22956      0.321 10×AGA          F
#  7 schp683     9365    23866      0.392 10×AGA          G
#  8 schp689     8693    22649      0.384 10×AGA          H
#  9 schp679     2528    19906      0.127 10×AAG          A
# 10 schp675     3687    20438      0.180 10×AAG          B
# 11 schp681     3705    20227      0.183 10×AAG          C
# 12 schp678     4378    20630      0.212 10×AAG          D
# 13 schp677     3967    20604      0.193 10×AAG          E
# 14 schp676     2657    20223      0.131 10×AAG          F
# 15 schp674     1270    20316     0.0625 10×AAG          G
# 16 schp680     1117    19377     0.0576 10×AAG          H
# right_join: keep every row of `annotations` (the right table); strains
# without a match in `data` get NA in the columns coming from `data`.
data %>%
  right_join(annotations, by = "strain") %>%
  print()
# Collapse all rows into a single row holding the maximum of each column.
data %>%
  summarize(
    max_yfp = max(mean_yfp),
    max_rfp = max(mean_rfp)
  ) %>%
  print()

# Identical pipeline, repeated.
data %>%
  summarize(
    max_yfp = max(mean_yfp),
    max_rfp = max(mean_rfp)
  ) %>%
  print()
Other examples of summary functions:
min() | mean() | sd() | first() | n() |
# Replace `data` with the fourth example dataset (one row per strain and
# replicate) and print its first 10 rows.
data <- read_tsv("data/example_dataset_4.tsv") %>%
  print(n = 10)
# A tibble: 74 x 4
#    strain    yfp   rfp replicate
#    <chr>   <int> <int>     <int>
#  1 schp690  3640 20944         1
#  2 schp690  3502 20881         2
#  3 schp690  3569 20063         3
#  4 schp690  3475 20773         4
#  5 schp690  3487 20307         5
#  6 schp689  9790 24399         1
#  7 schp689  9821 24932         2
#  8 schp689  9310 23007         3
#  9 schp689  6269 19075         4
# 10 schp689  8273 21835         5
# ... with 64 more rows
# Group the rows by strain and print the first 10 rows of the grouped tibble.
data %>%
  group_by(strain) %>%
  print(n = 10)
# Per-strain means: summarize() yields one row per group.
data %>%
  group_by(strain) %>%
  summarize(
    mean_yfp = mean(yfp),
    mean_rfp = mean(rfp)
  ) %>%
  print()

# Per-strain means plus standard errors, computed as sd / sqrt(group size);
# n() is the number of rows in the current group.
data %>%
  group_by(strain) %>%
  summarize(
    mean_yfp = mean(yfp),
    mean_rfp = mean(rfp),
    se_yfp = sd(yfp) / sqrt(n()),
    se_rfp = sd(rfp) / sqrt(n())
  ) %>%
  print()
# Full pipeline: per-strain means -> YFP/RFP ratio -> attach annotations.
data %>%
  group_by(strain) %>%
  summarize(
    mean_yfp = mean(yfp),
    mean_rfp = mean(rfp)
  ) %>%
  mutate(mean_ratio = mean_yfp / mean_rfp) %>%
  left_join(annotations, by = "strain") %>%
  print()

# Same pipeline, but fed into ggplot() instead of print().
# Steps before ggplot() are chained with %>%; plot layers are added with +.
data %>%
  group_by(strain) %>%
  summarize(
    mean_yfp = mean(yfp),
    mean_rfp = mean(rfp)
  ) %>%
  mutate(mean_ratio = mean_yfp / mean_rfp) %>%
  left_join(annotations, by = "strain") %>%
  ggplot(aes(
    x = kozak_region,
    y = mean_ratio,
    color = insert_sequence,
    group = insert_sequence
  )) +
  geom_line() +
  geom_point()