-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCGW_KW_test.Rmd
117 lines (85 loc) · 3.31 KB
/
CGW_KW_test.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
---
title: "CGW_Kruskal-Wallis_test"
author: "eqmh"
date: '2022-03-29'
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
## Load libraries
```{r include=FALSE}
library(lubridate)
library(reshape2)
library(tidyverse)
```
## Read in data
```{r include=TRUE}
data_tbl <- read.csv("All CGW surface data_v2.csv")
month <- month(as.POSIXlt(data_tbl$Date, format="%m/%d/%Y"))
year <- year(as.POSIXlt(data_tbl$Date, format="%m/%d/%Y"))
variable <- data_tbl$Salinity
df <- data.frame(month, year, variable)
# remove 9999 values
df <- df[df$variable !=9999, ]
```
## Subsetting data for a selected years
```{r include=TRUE}
first_year <- 2016:2017
second_year <- 2020:2021
df_filt_first <- df[df$year == first_year, ]
df_filt_second <- df[df$year == second_year, ]
# checking distribution (normal or non-normal). If p < 0.01 data is not normally distributed
shapiro.test(df_filt_first$variable)
shapiro.test(df_filt_second$variable)
```
## Boxplotting monthly data from a selected year
```{r include=TRUE}
# Plot data for the first selected year
mo_1 <- month.name[df_filt_first$month]
mo_data_1 <- data.frame("month" = mo_1, "salinity" = df_filt_first$variable)
mo_data_1$month <- factor(mo_data_1$month, levels = month.name)
p1 <- ggplot(mo_data_1, aes(x = month, y = salinity)) +
geom_boxplot() +
ggtitle(paste(as.character(first_year[1]), "-", as.character(first_year[2]))) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
p1
# Plot data for the second selected year
mo_2 <- month.name[df_filt_second$month]
mo_data_2 <- data.frame("month" = mo_2, "salinity" = df_filt_second$variable)
mo_data_2$month <- factor(mo_data_2$month, levels = month.name)
p2 <- ggplot(mo_data_2, aes(x = month, y = salinity)) +
geom_boxplot() +
ggtitle(paste(as.character(second_year[1]), "-", as.character(second_year[2]))) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
p2
```
## Run Kruskal-Wallis test
```{r include=TRUE}
# For entire time periods: first versus second selected periods (p < 0.01 indicates periods are different)
kw_test <- kruskal.test(list(df_filt_first$variable, df_filt_second$variable))
print(kw_test)
# Compare month of the first versus months of the second periods (e.g. Jan 2016-2017 vs Jan 2020-2021; p < 0.01 indicates periods are different)
kw_mo_p <- matrix(ncol = 1, nrow = 12)
kw_mo_chi <- matrix(ncol = 1, nrow = 12)
for (j in 1:12){
df_mo_first <- df_filt_first[df_filt_first$month == j, ]
df_mo_second <- df_filt_second[df_filt_second$month == j, ]
kw_test_mo <- kruskal.test(list(df_mo_first$variable, df_mo_second$variable))
kw_mo_p[j, ] <- kw_test_mo$p.value
kw_mo_chi[j, ] <- kw_test_mo$statistic
}
kw_mo <- data.frame("chi-squared" = kw_mo_chi, "p_value" = kw_mo_p)
# Plot Chi-squared to identify months that are statistically different between sampling periods (check months for which p < 0.05)
q <- ggplot(data = kw_mo, aes(x = 1:12, y = chi.squared)) +
geom_line() +
scale_x_continuous(breaks = seq(0, 12, by = 1)) +
labs(x = "month", y = "Chi-squared value")
q
# Plot p values to identify months that are statistically different between sampling periods
qq <- ggplot(data = kw_mo, aes(x = 1:12, y = p_value)) +
geom_line() +
scale_x_continuous(breaks = seq(0, 12, by = 1)) +
labs(x = "month", y = "p value")
qq
```