Skip to content

Latest commit

 

History

History
451 lines (367 loc) · 14.8 KB

lecture.org

File metadata and controls

451 lines (367 loc) · 14.8 KB

Note

To view the slides below in presentation mode, open lecture.html in a web browser.

Lecture 13 – Working with data using R / tidyverse

Arvind R. Subramaniam

Assistant Member

Basic Sciences Division and Computational Biology Program

Fred Hutchinson Cancer Research Center

Contents

Tidyverse Functions for Working with Tabular Data

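| Import/Export | Visualize  | Transform |
|---------------+------------+-----------|
| read_tsv      | geom_point | select    |
| write_tsv     | geom_line  | filter    |
|               | facet_grid | arrange   |
|               |            | mutate    |
|               |            | join      |
|               |            | group_by  |
|               |            | summarize |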

Read tabular data into a DataFrame (tibble)

library(tidyverse)

# Read the tab-separated file into a tibble named `data`.
data <- read_tsv("data/example_dataset_1.tsv")
# Show only the first 5 rows.
print(data, n = 5)

Use the pipe %>% operator to chain commands

# The pipe passes `data` as the first argument of print(),
# so this is the same call as print(data, n = 5).
data %>%
  print(n = 5)

All these commands produce the same output.

  1. print(data, n= 5)
        
  2. data %>%
        print(., n = 5)
        
  3. data %>%
        print(n = 5)
        

Simple Data Manipulations – select columns

# Show the first 2 rows with every column, for comparison.
data %>%
  print(n = 2)
# Keep only the four named columns, then show the first 2 rows.
data %>%
  select(strain, mean_ratio, insert_sequence, kozak_region) %>%
  print(n = 2)

Simple Data Manipulations – filter rows

# Keep only the rows whose kozak_region is "A".
data %>%
  filter(kozak_region == "A")
# Conditions separated by commas must all hold for a row to be kept.
data %>%
  filter(kozak_region == "A", insert_sequence == "10×AGA")
# Chaining two filter() calls applies the same two conditions in sequence.
data %>%
  filter(kozak_region == "A") %>% 
  filter(insert_sequence == "10×AGA")

Simple Data Manipulations – arrange rows

# Sort the rows by mean_ratio (ascending is arrange()'s default order).
data %>%
  arrange(mean_ratio)

Use mutate to create new columns

# Replace `data` with the second example dataset.
# print() returns its input invisibly, so the tibble is both displayed
# and assigned to `data`.
data <- read_tsv("data/example_dataset_2.tsv") %>%
    print()
# A tibble: 16 x 3
   strain  mean_yfp mean_rfp
   <chr>      <int>    <int>
 1 schp688     1748    20754
 2 schp684     3294    20585
 3 schp690     3535    20593
 4 schp687     4658    20860
 5 schp686     5000    21171
 6 schp685     7379    22956
 7 schp683     9365    23866
 8 schp689     8693    22649
 9 schp679     2528    19906
10 schp675     3687    20438
11 schp681     3705    20227
12 schp678     4378    20630
13 schp677     3967    20604
14 schp676     2657    20223
15 schp674     1270    20316
16 schp680     1117    19377

Use mutate to create new columns

# Add a mean_ratio column (mean_yfp divided by mean_rfp) and assign the
# result back to `data` so later snippets can use the new column.
data <- data %>%
  mutate(mean_ratio = mean_yfp / mean_rfp) %>%
  print()

Use mutate to modify existing columns

# Reusing an existing column name in mutate() overwrites that column:
# here mean_ratio is rounded to 2 decimal places.
# The result is only printed, not assigned, so `data` itself is unchanged.
data %>%
  mutate(mean_ratio = round(mean_ratio, 2)) %>%
  print()

Use TYPE_join (inner_join, left_join, right_join) to join two data frames

# Read the per-strain annotations (insert_sequence, kozak_region) into
# `annotations`; print() displays the tibble and passes it through to the
# assignment.
annotations <- read_tsv("data/example_dataset_3.tsv") %>%
  print()
# A tibble: 17 x 3
   strain  insert_sequence kozak_region
   <chr>   <chr>           <chr>       
 1 schp674 10×AAG          G           
 2 schp675 10×AAG          B           
 3 schp676 10×AAG          F           
 4 schp677 10×AAG          E           
 5 schp678 10×AAG          D           
 6 schp679 10×AAG          A           
 7 schp680 10×AAG          H           
 8 schp681 10×AAG          C           
 9 schp683 10×AGA          G           
10 schp684 10×AGA          B           
11 schp685 10×AGA          F           
12 schp686 10×AGA          E           
13 schp687 10×AGA          D           
14 schp688 10×AGA          A           
15 schp689 10×AGA          H           
16 schp690 10×AGA          C           
17 control <NA>            <NA>

inner_join keeps common rows

# Match rows of `data` and `annotations` on the strain column and keep only
# strains present in both; the "control" row of `annotations` has no match
# in `data` and is dropped.
data %>%
  inner_join(annotations, by = "strain") %>% 
  print()
# A tibble: 16 x 6
   strain  mean_yfp mean_rfp mean_ratio insert_sequence kozak_region
   <chr>      <int>    <int>      <dbl> <chr>           <chr>       
 1 schp688     1748    20754     0.0842 10×AGA          A           
 2 schp684     3294    20585     0.160  10×AGA          B           
 3 schp690     3535    20593     0.172  10×AGA          C           
 4 schp687     4658    20860     0.223  10×AGA          D           
 5 schp686     5000    21171     0.236  10×AGA          E           
 6 schp685     7379    22956     0.321  10×AGA          F           
 7 schp683     9365    23866     0.392  10×AGA          G           
 8 schp689     8693    22649     0.384  10×AGA          H           
 9 schp679     2528    19906     0.127  10×AAG          A           
10 schp675     3687    20438     0.180  10×AAG          B           
11 schp681     3705    20227     0.183  10×AAG          C           
12 schp678     4378    20630     0.212  10×AAG          D           
13 schp677     3967    20604     0.193  10×AAG          E           
14 schp676     2657    20223     0.131  10×AAG          F           
15 schp674     1270    20316     0.0625 10×AAG          G           
16 schp680     1117    19377     0.0576 10×AAG          H

left_join keeps all rows in left data frame

# Keep every row of the left table (`data`) and add the matching annotation
# columns, joined on strain.
data %>%
  left_join(annotations, by = "strain") %>% 
  print()
# A tibble: 16 x 6
   strain  mean_yfp mean_rfp mean_ratio insert_sequence kozak_region
   <chr>      <int>    <int>      <dbl> <chr>           <chr>       
 1 schp688     1748    20754     0.0842 10×AGA          A           
 2 schp684     3294    20585     0.160  10×AGA          B           
 3 schp690     3535    20593     0.172  10×AGA          C           
 4 schp687     4658    20860     0.223  10×AGA          D           
 5 schp686     5000    21171     0.236  10×AGA          E           
 6 schp685     7379    22956     0.321  10×AGA          F           
 7 schp683     9365    23866     0.392  10×AGA          G           
 8 schp689     8693    22649     0.384  10×AGA          H           
 9 schp679     2528    19906     0.127  10×AAG          A           
10 schp675     3687    20438     0.180  10×AAG          B           
11 schp681     3705    20227     0.183  10×AAG          C           
12 schp678     4378    20630     0.212  10×AAG          D           
13 schp677     3967    20604     0.193  10×AAG          E           
14 schp676     2657    20223     0.131  10×AAG          F           
15 schp674     1270    20316     0.0625 10×AAG          G           
16 schp680     1117    19377     0.0576 10×AAG          H

right_join keeps all rows in right data frame

# Keep every row of the right table (`annotations`), joined on strain.
# Strains missing from `data` (here "control") get NA in the columns that
# come from `data`.
data %>%
  right_join(annotations, by = "strain") %>% 
  print()

Use summarize to calculate stats across rows

# Collapse all rows into a single row holding the maximum of mean_yfp and
# the maximum of mean_rfp.
data %>%
  summarize(max_yfp = max(mean_yfp),
            max_rfp = max(mean_rfp)) %>%
  print()
  

Use summarize to calculate stats across rows

# Collapse all rows into a single row holding the maximum of mean_yfp and
# the maximum of mean_rfp.
data %>%
  summarize(max_yfp = max(mean_yfp),
            max_rfp = max(mean_rfp)) %>%
  print()
  

Other examples of summary functions:

min(), mean(), sd(), first(), n()

Use group_by to group subsets of rows

# Replace `data` with the replicate-level dataset (one row per strain and
# replicate) and show its first 10 rows.
data <- read_tsv("data/example_dataset_4.tsv") %>% 
  print(n = 10)
# A tibble: 74 x 4
   strain    yfp   rfp replicate
   <chr>   <int> <int>     <int>
 1 schp690  3640 20944         1
 2 schp690  3502 20881         2
 3 schp690  3569 20063         3
 4 schp690  3475 20773         4
 5 schp690  3487 20307         5
 6 schp689  9790 24399         1
 7 schp689  9821 24932         2
 8 schp689  9310 23007         3
 9 schp689  6269 19075         4
10 schp689  8273 21835         5
# ... with 64 more rows

Use group_by to group subsets of rows

# Group the rows by strain so that later verbs operate per strain;
# then show the first 10 rows of the grouped tibble.
data %>% 
  group_by(strain) %>%
  print(n = 10)

group_by + summarize for statistics by group

# For each strain, average yfp and rfp over its rows, giving one output row
# per strain.
data %>% 
  group_by(strain) %>%
  summarize(mean_yfp = mean(yfp), mean_rfp = mean(rfp)) %>%
  print()

group_by + summarize for statistics by group

# For each strain, compute the mean of yfp and rfp plus their standard
# errors: the standard deviation divided by the square root of n(), the
# number of rows in the current group.
data %>% 
  group_by(strain) %>%
  summarize(mean_yfp = mean(yfp), mean_rfp = mean(rfp),
            se_yfp = sd(yfp) / sqrt(n()), 
            se_rfp = sd(rfp) / sqrt(n())) %>%
  print()

%>% enables complex data analysis pipelines

# Full pipeline: average yfp and rfp per strain, compute the ratio of the
# two means, then attach each strain's annotations by joining on strain.
data %>% 
  group_by(strain) %>%
  summarize(mean_yfp = mean(yfp), mean_rfp = mean(rfp)) %>%
  mutate(mean_ratio = mean_yfp / mean_rfp) %>%
  left_join(annotations, by = "strain") %>%
  print()

%>% and + if you want to plot

# Same pipeline as before, but the final table is piped into ggplot()
# instead of print().
data %>% 
  group_by(strain) %>%
  summarize(mean_yfp = mean(yfp), mean_rfp = mean(rfp)) %>%
  mutate(mean_ratio = mean_yfp / mean_rfp) %>%
  left_join(annotations, by = "strain") %>%
  # Plot mean_ratio against kozak_region, with color and line grouping both
  # set by insert_sequence.
  ggplot(aes(x = kozak_region, y = mean_ratio, 
             color = insert_sequence, group = insert_sequence)) +
  # From here on, plot layers are combined with `+`, not `%>%`.
  geom_line() +
  geom_point()

img/complex_pipeline_example.png