short_report_1.Rmd

---
title: "Short Report 1 Code"
author: "Dongyoung Kim and Kunwu Lyu"
date: "`r Sys.Date()`"
output: pdf_document
---

```{r setup, include=FALSE}
# Echo the source of every chunk in the knitted document.
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse) # dplyr verbs, %>% and the other core tidyverse packages
library(ggplot2)   # plotting; also attached by library(tidyverse)
library(GGally)    # ggpairs() scatterplot matrix
library(broom)     # augment() for residuals and fitted values
library(patchwork) # `|` and `/` plot composition, plot_layout()
library(purrr)     # also attached by library(tidyverse)
```

## Data Wrangling
```{r}
# First Spotify dataset: includes the crossover variable.
# Keep each URL on a single line: a line break inside the quotes becomes part
# of the string, so a URL that wrapped when it was copied will fail to load.
spotify_data_crossover <- read.csv(
  file = "https://www.math.carleton.edu/ckelling/regression/report_cross_spotify_data.csv"
) %>%
  # categorical variables are stored as factors so lm() treats them as such
  mutate(mode = as.factor(mode),
         key = as.factor(key),
         crossover_categ = as.factor(crossover_categ))

# Second Spotify dataset: all data, crossover labelled or not.
full_musc_data <- read.csv(
  file = "https://www.math.carleton.edu/ckelling/regression/report_nocross_spotify_data.csv"
) %>%
  mutate(mode = as.factor(mode),
         key = as.factor(key))

# Artists analysed in this report; defined once and reused by the filter below.
artists <- c("Gladys Knight & The Pips", "Stevie Wonder", "The Temptations")

# Keep only the rows belonging to the selected artists.
# `.env$artists` forces the lookup to the vector defined above, so a data
# column that happened to be called `artists` could not shadow it.
spotify_data_crossover_filtered <- spotify_data_crossover %>%
  filter(artist_name %in% .env$artists)

glimpse(spotify_data_crossover_filtered)
```

## EDA
```{r}
# Scatterplot matrix of energy, loudness and danceability.
# The matrix is stored and passed to ggsave() explicitly, like the other plots
# in this chunk; ggsave() otherwise falls back to last_plot(), which should not
# be relied on to hold a ggpairs() matrix.
scat_mtrx <- ggpairs(spotify_data_crossover_filtered,
                     columns = c("energy", "loudness", "danceability"))
scat_mtrx
ggsave("scat_mtrx.png", scat_mtrx) # save as png for better importing

# Vis
# EDA with energy vs danceability, one panel per crossover category
eda_energy <- ggplot(data = spotify_data_crossover_filtered,
                     aes(x = energy, y = danceability)) +
  geom_point() +
  facet_grid(cols = vars(crossover_categ)) + # separate by crossover
  labs(x = "Energy", y = "Danceability", title = "Energy vs Danceability")

# EDA with loudness vs danceability, one panel per crossover category
eda_loudness <- ggplot(data = spotify_data_crossover_filtered,
                       aes(x = loudness, y = danceability)) +
  geom_point() +
  labs(x = "Loudness", y = "Danceability", title = "Loudness vs Danceability") +
  facet_grid(cols = vars(crossover_categ)) # separate by crossover

# Display both plots in the knitted output
eda_energy
eda_loudness

# Write both plots to disk
ggsave("eda_energy.png", eda_energy)
ggsave("eda_loudness.png", eda_loudness)
```


## MLR and Assumptions
```{r}
# MLR model: danceability on loudness, energy, crossover category, and the
# energy-by-crossover interaction (`energy * crossover_categ` expands to both
# main effects plus their interaction).
danceability_lm <- lm(danceability ~ loudness + energy * crossover_categ,
                      data = spotify_data_crossover_filtered)
summary(danceability_lm)

# Assumptions checking
# augment() appends .fitted, .resid, etc.; passing `data` keeps every original
# column (artist_name, album_release_year, ...) for the diagnostics below.
danceability_lm_aug <- augment(danceability_lm,
                               data = spotify_data_crossover_filtered)

# Scatter the model residuals against one column of the augmented data.
#   aug_data: data frame returned by augment(), containing `.resid`
#   x_var:    unquoted column to place on the x-axis
#   x_label:  label for the x-axis
# Returns a ggplot object with a dashed reference line at zero.
plot_residuals <- function(aug_data, x_var, x_label) {
  ggplot(aug_data, aes(x = {{ x_var }}, y = .resid)) +
    geom_point() +
    geom_hline(yintercept = 0, linetype = "dashed") +
    labs(x = x_label, y = "residuals", title = "Residual Plot")
}

# Residual plots against each numeric predictor and against the fitted values
danceability_lm_res1 <- plot_residuals(danceability_lm_aug, energy,
                                       "Energy (std)")
danceability_lm_res2 <- plot_residuals(danceability_lm_aug, loudness,
                                       "Loudness (dB)")
danceability_lm_res3 <- plot_residuals(danceability_lm_aug, .fitted,
                                       "Predicted Danceability")

# Normal Q-Q plot of the residuals
danceability_lm_qq <- ggplot(danceability_lm_aug, aes(sample = .resid)) +
  geom_qq() +
  geom_qq_line() +
  labs(y = "Sample Quantiles", x = "Normal Quantiles")

# 2 x 2 layout: predictor residual plots on top, fitted/Q-Q plots below
combined_plot <- (danceability_lm_res1 | danceability_lm_res2) /
  (danceability_lm_res3 | danceability_lm_qq)
res_qq <- combined_plot +
  plot_layout(guides = "collect") +
  plot_annotation(title = "Residual Plot and Normal Q-Q Plot of Spotify MLR")
res_qq
ggsave("res+qq.png", res_qq)

# Shared theme tweak: tilt x-axis text so long artist names and years fit
rotated_x_text <- theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Cluster independence: residuals grouped by artist
cluster_dep <- ggplot(danceability_lm_aug, aes(x = artist_name, y = .resid)) +
  geom_boxplot() +
  labs(y = "resid", x = "Artist") +
  rotated_x_text

# Serial independence, w/ boxplot (displayed only, not saved)
ggplot(danceability_lm_aug,
       aes(x = as.factor(album_release_year), y = .resid)) +
  geom_boxplot() +
  labs(y = "resid", x = "Year") +
  rotated_x_text

# Serial independence, w/ scatterplot (better visuals)
serial_dep <- ggplot(danceability_lm_aug,
                     aes(x = album_release_year, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  # geom_smooth(method = "lm", se = TRUE) +
  labs(y = "resid", x = "Year") +
  rotated_x_text

# Side-by-side independence diagnostics
independence_check <- cluster_dep | serial_dep
independence_check
ggsave("independence.png", independence_check)
```


## R^2
```{r}
# Nested-model comparison ----
# Reduced model: the full model `danceability_lm` without crossover_categ and
# its interaction with energy.
danceability_lm_red <- lm(
  danceability ~ loudness + energy,
  data = spotify_data_crossover_filtered
)

# Compare the reduced and full fits; anova() reports the analysis-of-variance
# table for the two nested models.
anova(danceability_lm_red, danceability_lm)
```