practice.Rmd

---
title: "Practising_w_swc"
author: "Laura"
date: "12/16/2019"
output: html_document
---

```{r setup, include=FALSE}
# Chunk defaults for the whole document: do not echo code, hide printed
# results, messages and warnings, and write figures (11 in wide) under Figs/.
knitr::opts_chunk$set(
  echo = FALSE,
  results = "hide",
  message = FALSE,
  warning = FALSE,
  fig.path = "Figs/",
  fig.width = 11
)

# Packages used by the chunks below.
library(tidyverse)
library(lubridate)
library(babynames)
```

```{r}
# Small example table that is written to disk and read back again.
cats <- tibble(
  coat = c("calico", "black", "tabby"),
  weight = c(2.1, 5.0, 3.2),
  likes_string = c(1, 0, 1)
)

# The target file is passed positionally: readr 1.4.0 renamed write_csv()'s
# `path` argument to `file` and deprecated `path`, so an unnamed second
# argument works on both sides of that change.
cats %>%
  write_csv("data/feline-data.csv")

# Read the file back to check the round trip.
read_csv("data/feline-data.csv")

# Gapminder data used by the plotting and dplyr sections below.
gapminder <- read_csv("data/gapminder_data.csv")
str(gapminder)
colnames(gapminder)
```

### Creating Publication-Quality Graphics with ggplot2

```{r}
# Life expectancy over time: one point per observation and one line per
# country, with the lines coloured by continent. `group` is ggplot2's
# documented grouping aesthetic; `by` is not a ggplot2 aesthetic.
gapminder %>%
  ggplot(aes(year, lifeExp, group = country)) +
  geom_point() +
  geom_line(aes(color = continent))

# Life expectancy against GDP per capita on a log10 x axis with a linear fit.
# Mapping `color = continent` already splits the data by continent, so no
# separate grouping aesthetic is needed.
gapminder %>%
  ggplot(aes(gdpPercap, lifeExp, color = continent)) +
  geom_point(alpha = 0.5, aes(shape = continent)) +
  geom_smooth(method = "lm") +
  scale_x_log10()

# Life expectancy through time, one panel per country in the Americas.
gapminder %>%
  filter(continent == "Americas") %>%
  ggplot(aes(year, lifeExp)) +
  facet_wrap(~ country) +
  geom_line() +
  theme(axis.text.x = element_text(angle = 45))

# Same figure with axis titles and a main title, stored so it can be saved.
# No legend title is set because this plot maps no colour aesthetic.
lifeExp_plot <- gapminder %>%
  filter(continent == "Americas") %>%
  ggplot(aes(year, lifeExp)) +
  facet_wrap(~ country) +
  geom_line() +
  labs(
    x = "Year",             # x axis title
    y = "Life expectancy",  # y axis title
    title = "Figure 1"      # main title of figure
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Make sure the output directory exists before ggsave() writes into it.
dir.create("results", showWarnings = FALSE, recursive = TRUE)
ggsave(
  filename = "results/lifeExp.png",
  plot = lifeExp_plot,
  width = 12,
  height = 10,
  units = "cm",
  dpi = 300
)
```

```{r}
# Box plots of life expectancy, one box per year and one panel per continent.
# The x axis is left without title, tick labels or tick marks.
ggplot(gapminder, aes(x = year, y = lifeExp)) +
  geom_boxplot(aes(group = year)) +
  facet_grid(cols = vars(continent)) +
  labs(x = "", y = "Life Expectancy") +
  theme(
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank()
  )
```

### Dataframe Manipulation with dplyr

```{r}
# African rows only, reduced to three columns.
gapminder %>%
  filter(continent %in% "Africa") %>%
  select(lifeExp, country, year)

# Mean life expectancy per country, sorted from highest to lowest; tail()
# then keeps the last six rows of that ordering.
gapminder %>%
  group_by(country) %>%
  summarize(mean_life_exp = mean(lifeExp)) %>%
  arrange(desc(mean_life_exp)) %>%
  tail(n = 6)
```
Calculate the average life expectancy in 2002 of 2 randomly selected countries for each continent. Then arrange the continent names in reverse order. Hint: Use the dplyr functions arrange() and sample_n(); they have similar syntax to other dplyr functions.

```{r}

# Mean GDP per capita, observed and "expected", for every continent and year.
# The expected value scales GDP per capita by 1.5 wherever life expectancy is
# above 40 and leaves it unchanged otherwise.
gdp_future_bycontinents_byyear_high_lifeExp <- gapminder %>%
  mutate(
    gdp_futureExpectation = ifelse(lifeExp > 40, gdpPercap * 1.5, gdpPercap)
  ) %>%
  group_by(continent, year) %>%
  summarize(
    mean_gdpPercap = mean(gdpPercap),
    mean_gdpPercap_expected = mean(gdp_futureExpectation)
  ) %>%
  # Drop the grouping left over from group_by() so the stored table does not
  # carry hidden groups into later use.
  ungroup()

# Life expectancy through time for countries whose name starts with "A" or
# "Z", one panel per country.
gapminder %>%
  # First letter of each country name. The helper column is not called
  # `startsWith`, which is also the name of a base R function.
  mutate(first_letter = substr(country, start = 1, stop = 1)) %>%
  filter(first_letter %in% c("A", "Z")) %>%
  ggplot(aes(x = year, y = lifeExp, color = continent)) +
  geom_line() +
  facet_wrap(~ country)

# Mean 2002 life expectancy of two randomly drawn countries per continent,
# with the continents listed in reverse alphabetical order. The seed is fixed
# so that sample_n() draws the same countries on every knit.
set.seed(2002)
gapminder %>%
  filter(year == 2002) %>%
  group_by(continent) %>%
  sample_n(2) %>%
  summarise(mean_le = mean(lifeExp)) %>%
  arrange(desc(continent))
```

### Dataframe Manipulation with tidyr

```{r}
# Wide gapminder table: one row per country, one column per metric/year pair.
gap_wide <- read_csv("https://raw.githubusercontent.com/swcarpentry/r-novice-gapminder/gh-pages/_episodes_rmd/data/gapminder_wide.csv")

glimpse(gap_wide)

# Stack every gdp*, lifeExp* and pop* column into a name/value pair.
gap_long <- gap_wide %>%
  pivot_longer(
    cols = c(
      starts_with("gdp"),
      starts_with("lifeExp"),
      starts_with("pop")
    ),
    names_to = "obstype_year",
    values_to = "obs_value"
  )
glimpse(gap_long)

# Split the combined name into the metric and the year; convert = TRUE lets
# separate() convert the new columns to a suitable type.
gap_long <- gap_long %>%
  separate(
    col = obstype_year,
    into = c("obstype", "year"),
    convert = TRUE
  )
glimpse(gap_long)
gap_long

# Mean observed value per continent and metric.
gap_long %>%
  group_by(continent, obstype) %>%
  summarise(mean = mean(obs_value))

# Intermediate format: one column per metric, one row per country and year.
gap_normal <- gap_long %>%
  pivot_wider(
    names_from = obstype,
    values_from = obs_value
  )

gap_normal

```
Now let’s convert the long format all the way back to the wide format. In the wide format, we will keep country and continent as ID variables and pivot the observations across the 3 metrics (pop, lifeExp, gdpPercap) and time (year). First we need to create appropriate labels for all our new variables (time*metric combinations), and we also need to unify our ID variables to simplify the process of defining gap_wide.

```{r}
gap_long

# Back to the wide layout: combine metric and year into one label, combine
# continent and country into one ID column, then spread the labels out as
# columns.
gap_long %>%
  unite(col = "obstype_year", obstype, year) %>%
  unite(col = "continent_country", continent, country) %>%
  pivot_wider(
    names_from = obstype_year,
    values_from = obs_value
  )

# Variant that folds the country into the column label as well, leaving
# continent as the only remaining ID column.
gap_long %>%
  unite(col = "country_obstype_year", country, obstype, year) %>%
  pivot_wider(
    names_from = country_obstype_year,
    values_from = obs_value
  )

```

### Data Analysis and Visualization in R for Ecologists

```{r}
# c() coerces mixed inputs to a single common type; class() shows which type
# each combination ends up with.
num_char <- c(1, 2, 3, "a")
class(num_char)

num_logical <- c(1, 2, 3, TRUE)
class(num_logical)

char_logical <- c("a", "b", "c", TRUE)
class(char_logical)

tricky <- c(1, 2, 3, "4")
class(tricky)

# Heights with two missing values.
heights <- c(
  63, 69, 60, 65, NA, 68, 61, 70, 61, 59, 64,
  69, 63, 63, NA, 72, 65, 64, 70, 63, 65
)

# Median height and number of heights above 67, ignoring the missing values.
median(heights, na.rm = TRUE)
sum(heights > 67, na.rm = TRUE)
```

```{r}
# Download the Portal survey data only when no local copy exists, so that
# repeated knits do not fetch the file from the network every time.
surveys_path <- "data/portal_data_joined.csv"
if (!file.exists(surveys_path)) {
  dir.create(dirname(surveys_path), showWarnings = FALSE, recursive = TRUE)
  download.file(
    url = "https://ndownloader.figshare.com/files/2292169",
    destfile = surveys_path
  )
}

surveys <- read_csv(surveys_path)
glimpse(surveys)
str(surveys)

# Build the date column from the numeric year, month and day columns.
# (Parsing the pasted string with mdy(paste(month, day, year, sep = "-")) is an
# alternative; only one assignment is needed.)
surveys$date <- make_date(surveys$year, surveys$month, surveys$day)

# Rows for which no date could be built.
surveys %>%
  select(month, day, year, date) %>%
  filter(is.na(date))
```

Using pipes, subset the surveys data to include animals collected before 1995 and retain only the columns year, sex, and weight.

```{r}
# Animals collected before 1995, reduced to year, sex and weight.
surveys %>%
  select(year, sex, weight) %>%
  filter(year < 1995)
```
Create a new data frame from the surveys data that meets the following criteria: contains only the species_id column and a new column called hindfoot_half containing values that are half the hindfoot_length values. In this hindfoot_half column, there are no NAs and all values are less than 30.

```{r}
# First attempt: apply the cut-off to the raw length (60 is twice the limit
# of 30 on the halved value), then halve it.
mine <- surveys %>%
  filter(!is.na(hindfoot_length), hindfoot_length < 60) %>%
  transmute(species_id, hindfoot_half = hindfoot_length / 2)

# Second attempt: halve first, then apply the cut-off of 30 to the new column.
surveys_hindfoot_half <- surveys %>%
  filter(!is.na(hindfoot_length)) %>%
  transmute(species_id, hindfoot_half = hindfoot_length / 2) %>%
  filter(hindfoot_half < 30)

# Compare the two results.
all.equal(mine, surveys_hindfoot_half)
```

How many animals were caught in each plot_type surveyed?

Use group_by() and summarize() to find the mean, min, and max hindfoot length for each species (using species_id). Also add the number of observations (hint: see ?n).

What was the heaviest animal measured in each year? Return the columns year, genus, species_id, and weight.

```{r}
# Heaviest animal(s) measured in each year. Missing weights are removed
# before the per-year maximum is taken.
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(year) %>%
  filter(weight == max(weight)) %>%
  select(year, genus, species_id, weight) %>%
  arrange(year)

surveys

# Number of animals caught in each plot type.
surveys %>% count(plot_type)

# Hindfoot lengths recorded for three species IDs.
surveys %>%
  filter(species_id %in% c("SS", "ST", "UR")) %>%
  select(species_id, hindfoot_length)

# Mean, minimum and maximum hindfoot length per species, plus the number of
# non-missing measurements.
surveys %>%
  filter(!is.na(hindfoot_length)) %>%
  group_by(species_id) %>%
  summarize(
    mean = mean(hindfoot_length),
    min = min(hindfoot_length),
    max = max(hindfoot_length),
    n = n()
  )
```
Spread the surveys data frame with year as columns, plot_id as rows, and the number of genera per plot as the values. You will need to summarize before reshaping, and use the function n_distinct() to get the number of unique genera within a particular chunk of data. It’s a powerful function! See ?n_distinct for more.

```{r}
# Number of distinct genera per plot and year, reshaped so that each plot is a
# row and each year is a column.
swide <- surveys %>%
  group_by(year, plot_id) %>%
  summarize(unique_genera = n_distinct(genus)) %>%
  arrange(plot_id) %>%
  pivot_wider(
    id_cols = plot_id,
    names_from = year,
    values_from = unique_genera
  )
```


Now take that data frame and gather() it again, so each row is a unique plot_id by year combination.

```{r}
# Back to long form: one row per plot_id and year combination.
swide %>%
  pivot_longer(
    cols = `1977`:`2002`,
    names_to = "year",
    values_to = "distinct_genera"
  )
```


The surveys data set has two measurement columns: hindfoot_length and weight. This makes it difficult to do things like look at the relationship between mean values of each measurement per year in different plot types. Let’s walk through a common solution for this type of problem. First, use gather() to create a dataset where we have a key column called measurement and a value column that takes on the value of either hindfoot_length or weight. Hint: You’ll need to specify which columns are being gathered.

```{r}
# Stack the two measurement columns into a measurement/value pair.
survlong <- surveys %>%
  pivot_longer(
    cols = c(hindfoot_length, weight),
    names_to = "measurement",
    values_to = "value"
  )
```


With this new data set, calculate the average of each measurement in each year for each different plot_type. Then spread() them into a data set with a column for hindfoot_length and weight. Hint: You only need to specify the key and value columns for spread().

```{r}
# Mean of each measurement per year and plot type (missing values removed
# first), with one column per measurement.
survlong %>%
  filter(!is.na(value)) %>%
  group_by(year, plot_type, measurement) %>%
  summarise(mean = mean(value)) %>%
  pivot_wider(
    names_from = measurement,
    values_from = mean
  )
```

### Some plotting

```{r}
surveys_complete <- read_csv("data/portal_data_joined.csv")

# Number of records per year, genus and sex.
yearly_sex_counts <- surveys_complete %>%
  count(year, genus, sex)

# Counts through time, one panel per genus and one line per sex.
# theme_bw() is added before theme(): a complete theme added afterwards would
# override the text settings made in theme().
yearly_sex_counts %>%
  ggplot(aes(x = year, y = n, color = sex)) +
  geom_line() +
  facet_wrap(~ genus) +
  labs(
    title = "Observed genera through time",
    x = "Year of observation",
    y = "Number of individuals"
  ) +
  theme_bw() +
  theme(
    text = element_text(size = 16),
    axis.text.x = element_text(
      colour = "grey20", size = 12, angle = 90, hjust = 0.5, vjust = 0.5
    ),
    axis.text.y = element_text(colour = "grey20", size = 12)
  )
```

### Primers dplyr challenge

```{r}
# How many different names have been the most common boys' name of a year?
babynames %>%
  filter(sex == "M") %>%
  group_by(year) %>%
  filter(min_rank(desc(n)) == 1) %>%
  ungroup() %>%
  summarise(distinct = n_distinct(name))

# Same question answered with max(): the top boys' name(s) of each year,
# then the number of years in which each name was on top.
babynames %>%
  group_by(year) %>%
  filter(sex == "M") %>%
  filter(n == max(n)) %>%
  ungroup() %>%
  count(name)
```

Which gender uses more names?

In the chunk below, calculate and then plot the number of distinct names used each year for boys and girls. Place year on the x axis and the number of distinct names on the y axis, and color the lines by sex.

```{r}
# Number of distinct names per year, one line per sex.
babynames %>%
  group_by(sex, year) %>%
  summarise(distinct = n_distinct(name)) %>%
  ggplot(aes(x = year, y = distinct, color = sex)) +
  geom_line() +
  theme_bw()

```

Let’s make sure that we’re not confounding our search with the total number of boys and girls born each year. With the chunk below, calculate and then plot over time the total number of boys and girls by year. Is the relative number of boys and girls constant?

```{r}
# Total number of recorded children per year, one line per sex.
babynames %>%
  group_by(sex, year) %>%
  summarise(total = sum(n)) %>%
  ggplot(aes(x = year, y = total, color = sex)) +
  geom_line()
```

Hmm. Sometimes there are more girls and sometimes more boys. In addition, the entire population has grown over time. Let’s account for this with a new metric: the average number of children per name.

If girls have a smaller number of children per name, that would imply that they use more names overall (and vice versa).

In the chunk below, calculate and plot the average number of children per name by year and sex over time. How do you interpret the results?

```{r}
# Average number of children per name, by year, one line per sex.
babynames %>%
  group_by(sex, year) %>%
  summarize(average = mean(n)) %>%
  ggplot(aes(x = year, y = average, color = sex)) +
  geom_line()
```