-
Notifications
You must be signed in to change notification settings - Fork 0
/
short_report_1.Rmd
152 lines (126 loc) · 4.95 KB
/
short_report_1.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
---
title: "Short Report 1 Code"
author: "Dongyoung Kim and Kunwu Lyu"
date: "`r Sys.Date()`"
output: pdf_document
---
```{r setup, include=FALSE}
# Default chunk option for the whole document: echo = TRUE
knitr::opts_chunk$set(echo = TRUE)
# Packages used by the chunks below
library(tidyverse) # this report uses %>%, mutate(), filter(), glimpse(), ggplot()
library(ggplot2)
library(GGally) # ggpairs() scatterplot matrix
library(broom) # augment() to attach residuals and fitted values
library(patchwork) # `|` and `/` plot composition, plot_layout(), plot_annotation()
library(purrr) # NOTE(review): no purrr-specific call is visible in this file
```
## Data Wrangling
```{r}
# First Spotify dataset: tracks that carry the crossover variable.
# mode, key and crossover_categ are converted to factors.
spotify_data_crossover <- read.csv(
  file = "https://www.math.carleton.edu/ckelling/regression/report_cross_spotify_data.csv"
) %>%
  mutate(
    mode = as.factor(mode),
    key = as.factor(key),
    crossover_categ = as.factor(crossover_categ)
  )

# Second Spotify dataset: all tracks, crossover-labelled or not.
full_musc_data <- read.csv(
  file = "https://www.math.carleton.edu/ckelling/regression/report_nocross_spotify_data.csv"
) %>%
  mutate(
    mode = as.factor(mode),
    key = as.factor(key)
  )

# Keep only the artists analysed in this report.
artists <- c("Gladys Knight & The Pips", "Stevie Wonder", "The Temptations")
spotify_data_crossover_filtered <- spotify_data_crossover %>%
  # .env$ forces the lookup of `artists` in the calling environment, so a data
  # column that happened to be called `artists` could not shadow the vector.
  filter(artist_name %in% .env$artists)

glimpse(spotify_data_crossover_filtered)
```
## EDA
```{r}
# Scatterplot matrix of energy, loudness and danceability.
# The matrix is stored and passed to ggsave() explicitly: without a `plot`
# argument ggsave() writes last_plot(), which is not guaranteed to be the
# ggpairs() matrix.
scat_mtrx <- ggpairs(
  spotify_data_crossover_filtered,
  columns = c("energy", "loudness", "danceability")
)
scat_mtrx
ggsave("scat_mtrx.png", scat_mtrx) # save as png for better importing

# Danceability against energy, one panel per crossover category
eda_energy <- ggplot(
  data = spotify_data_crossover_filtered,
  aes(x = energy, y = danceability)
) +
  geom_point() +
  facet_grid(cols = vars(crossover_categ)) +
  labs(x = "Energy", y = "Danceability", title = "Energy vs Danceability")

# Danceability against loudness, one panel per crossover category
eda_loudness <- ggplot(
  data = spotify_data_crossover_filtered,
  aes(x = loudness, y = danceability)
) +
  geom_point() +
  labs(
    x = "Loudness", y = "Danceability",
    title = "Loudness vs Danceability"
  ) +
  facet_grid(cols = vars(crossover_categ))

# Show both plots in the report, then write them to disk
eda_energy
eda_loudness
ggsave("eda_energy.png", eda_energy)
ggsave("eda_loudness.png", eda_loudness)
```
## MLR and Assumptions
```{r}
# MLR model: danceability on loudness, energy, crossover category and the
# energy-by-crossover interaction
danceability_lm <- lm(
  danceability ~ loudness + energy * crossover_categ,
  data = spotify_data_crossover_filtered
)
summary(danceability_lm)

# Assumption checks ----
# Attach .fitted and .resid to the original rows so residuals can be plotted
# against any column of the data.
danceability_lm_aug <- augment(
  danceability_lm,
  data = spotify_data_crossover_filtered
)

# Scatter the residuals against one column of the augmented data, with a
# dashed reference line at zero.
#   aug_data: data frame that has a .resid column
#   x_col:    name of the column to put on the x axis (string)
#   x_label:  x-axis label
plot_residuals <- function(aug_data, x_col, x_label) {
  ggplot(aug_data, aes(x = .data[[x_col]], y = .resid)) +
    geom_point() +
    geom_hline(yintercept = 0, linetype = "dashed") +
    labs(x = x_label, y = "residuals", title = "Residual Plot")
}

# Residuals against each numeric predictor and against the fitted values
danceability_lm_res1 <- plot_residuals(
  danceability_lm_aug, "energy", "Energy (std)"
)
danceability_lm_res2 <- plot_residuals(
  danceability_lm_aug, "loudness", "Loudness (dB)"
)
danceability_lm_res3 <- plot_residuals(
  danceability_lm_aug, ".fitted", "Predicted Danceability"
)

# Normal Q-Q plot of the residuals
danceability_lm_qq <- ggplot(danceability_lm_aug, aes(sample = .resid)) +
  geom_qq() +
  geom_qq_line() +
  labs(y = "Sample Quantiles", x = "Normal Quantiles")

# 2 x 2 patchwork layout: predictor residual plots on top, fitted-value
# residual plot and Q-Q plot below
combined_plot <- (danceability_lm_res1 | danceability_lm_res2) /
  (danceability_lm_res3 | danceability_lm_qq)
res_qq <- combined_plot +
  plot_layout(guides = "collect") +
  plot_annotation(title = "Residual Plot and Normal Q-Q Plot of Spotify MLR")
res_qq
ggsave("res+qq.png", res_qq)

# Cluster independence: residuals grouped by artist
cluster_dep <- ggplot(danceability_lm_aug, aes(x = artist_name, y = .resid)) +
  geom_boxplot() +
  labs(y = "resid", x = "Artist") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Serial independence, boxplot per release year (shown in the report only)
ggplot(
  danceability_lm_aug,
  aes(x = as.factor(album_release_year), y = .resid)
) +
  geom_boxplot() +
  labs(y = "resid", x = "Year") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Serial independence, scatterplot against release year (better visuals)
serial_dep <- ggplot(
  danceability_lm_aug,
  aes(x = album_release_year, y = .resid)
) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(y = "resid", x = "Year") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Artist and year checks side by side
independence_check <- cluster_dep | serial_dep
independence_check
ggsave("independence.png", independence_check)
```
## R^2
```{r}
# Reduced model: loudness and energy only, without crossover_categ or its
# interaction with energy. anova() then compares it against the full model
# danceability_lm fitted earlier in the report.
danceability_lm_red <- lm(
  danceability ~ loudness + energy,
  data = spotify_data_crossover_filtered
)
anova(danceability_lm_red, danceability_lm)
```