-
Notifications
You must be signed in to change notification settings - Fork 0
/
puberty_cca.Rmd
236 lines (171 loc) · 8.95 KB
/
puberty_cca.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
---
title: "puberty_cca"
author: "Theresa Cheng"
date: "3/29/2019"
output: html_document
---
```{r setup and load data, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
# load packages, install as needed
packages = c("tidyr", "dplyr", "ggplot2", "skimr", "knitr", "PMA", "psych", "candisc", "caret")
package.check <- lapply(packages, FUN = function(x) {
if (!require(x, character.only = TRUE)) {
install.packages(x, dependencies = TRUE)
library(x, character.only = TRUE) }})
puberty_EM <- read.csv("/Volumes/psych-cog/dsnlab/TAG/projects/W1_puberty_cca/puberty_df_EM_imputed1.csv")
# acquire hormone only dataframe
hormones <- puberty_EM[, c("mean_saliva_EST", "mean_saliva_TEST", "mean_saliva_DHEA","hair_EST_pgug","hair_TEST_pgmg", "hair_DHEA_pgmg")] #
# acquire physical maturation self report variables only
physical_mat <- puberty_EM[, c("pds1_height", "pds2_hair", "pds3_skin", "pds4_breasts", "pbip1_breasts", "pbip2_pubic_hair")] # "pds6_period",
#physical_mat <- as.data.frame(lapply(physical_mat, factor, ordered = TRUE)) # recode all of these as factors
rm(packages, package.check)
```
```{r inspect univariate descriptive stats, echo = FALSE}
# univariate descriptive statistics
skim(hormones)
skim(physical_mat)
skim(puberty_EM$ageS2)
```
Examining univariate descriptive statistics of our dataset verifies that there is no missing data due to EM imputation at a prior step. Transformed hormone distributions are approximately normal, and we should verify whether the lower and upper limits are within biologically plausible ranges (did not set bounds during imputation). Ignoring the middle column of the PDS data (because its responses have four levels rather than five), the physical maturation data is approximately normal as well. In contrast, the age distribution is fairly flat, verifying that we sampled fairly evenly across the age range.
```{r inspect bivariate descriptive stats, echo = FALSE}
# hormone to hormone
cor_matrix_h <- cor(hormones)
corrplot::corrplot(cor_matrix_h, type = "upper",
tl.col = "black", tl.srt = 45)
# physical maturation to physical maturation
cor_matrix_pm <- cor(physical_mat, method = "spearman") # ordinal data
corrplot::corrplot(cor_matrix_pm, type = "upper",
tl.col = "black", tl.srt = 45)
# hormones to physical maturation
cor_matrix_h_pm <- cor(hormones, physical_mat)
corrplot::corrplot(cor_matrix_h_pm, tl.col = "black", tl.srt = 45)
```
Examining correlation matrices suggests that the high correlations between the physical maturation measures (especially those between the PDS items related to hair growth and breast development, along with similar PBIP measures) might problematic in terms of collinearity. Additionally, the fairly low correlations between some of the hair hormone measures (especially the negative correlations with DHEA) may add noise to the model.
```{r cca functions}
# Modified from Dinga et al., 2018
select_and_cca_fit <- function(X, Y){
#select
correlations <- cor(Y, X, method = "spearman")
correlations <- apply(correlations, 2, function(x){max(abs(x))})
corr.threshold <- sort(correlations, decreasing = T)
selected.X <- correlations >= corr.threshold
selected.X <- X[,selected.X]
#cca fit
cca_model <- candisc::cancor(selected.X, Y)
#return fitted model containing canonical correlations and wilks lambdas
return(cca_model)
}
predict.cancor <- function(cancor.obj, X, Y){
X_pred <- as.matrix(X) %*% cancor.obj$coef$X
Y_pred <- as.matrix(Y) %*% cancor.obj$coef$Y
XY_pred <- list(X_pred, Y_pred)
names(XY_pred) <- c("X_pred", "Y_pred")
return(XY_pred)
}
cca_cv <- function(hormone_variables, phys_mat_variables, age){
n_folds <- 10
folds <- createFolds(as.factor(age), n_folds, list=F)
results_cancor <- list()
for (fold in 1:n_folds) {
# create training and test set
train_hormones <- hormone_variables[folds != fold,]
train_phys_mat <- phys_mat_variables[folds != fold,]
test_hormones <- hormone_variables[folds == fold,]
test_phys_mat <- phys_mat_variables[folds == fold,]
# fit on training set
cancor.fit <- select_and_cca_fit(train_hormones, train_phys_mat)
# predict on test set
XY_pred_cancor <- predict.cancor(cancor.fit,
test_hormones[,cancor.fit$names$X],
test_phys_mat)
results_cancor[[fold]] <- diag(cor(XY_pred_cancor[[1]],
XY_pred_cancor[[2]]))
}
return(do.call(rbind, results_cancor))
}
```
```{r run with candisc functions}
physical_mat_pds <- physical_mat[, c("pds1_height", "pds2_hair", "pds3_skin", "pds4_breasts" )]
physical_mat_pbip <- physical_mat[, c("pds1_height", "pbip2_pubic_hair", "pds3_skin", "pbip1_breasts")]
# run once with pds hair and breast measures
cca_model <- select_and_cca_fit(hormones, physical_mat)
cca_model$cancor
canonical.variates <- predict.cancor(cca_model,
hormones[,cca_model$names$X],
physical_mat)
cca_x_loadings <- cor(hormones, canonical.variates$X_pred)
cca_x_loadings
cca_y_loadings <- cor(physical_mat, canonical.variates$Y_pred)
cca_y_loadings
cor_matrix_pm
```
Using all of the data, the canonical loadings of breast development onto the second variate are both positively and negatively associated with breast development. It seems likely that multicollinearity is an issue. We note particularly high correlations between pds2_hair and pbip2_pubic_hair, and pds4_breasts and pbip1_breasts. For now, we take the approach of running everything with PDS only, and then re-running with pbip in order to compare how using difference questionnaires for those two measures impacts the canonical loadings.
```{r}
# run once with pds hair and breast measures
cca_model_pds <- select_and_cca_fit(hormones, physical_mat_pds)
cca_model_pds$cancor
canonical.variates <- predict.cancor(cca_model_pds,
hormones[,cca_model_pds$names$X],
physical_mat_pds)
cca_x_loadings_pds <- cor(hormones, canonical.variates$X_pred)
cca_x_loadings_pds
cca_y_loadings_pds <- cor(physical_mat_pds, canonical.variates$Y_pred)
cca_y_loadings_pds
par(mfrow=c(1,2))
plot(canonical.variates$X_pred[,1],
canonical.variates$Y_pred[,1],
bty='n',
xlab='Hormone canonical variate 1',
ylab='Physical mat pds canonical variate 1')
round(cca_model_pds$cancor[1], 2) # doesn't work, but r = .55
plot(canonical.variates$X_pred[,2],
canonical.variates$Y_pred[,2],
bty='n',
xlab='Hormone canonical variate 2',
ylab='Physical mat pds canonical variate 2')
round(cca_model_pds$cancor[2], 2) # doesn't work, but r = .35
real_model_pds <- cca_model_pds
real_results_cancor <- real_model_pds$cancor
real_results_wilks_pds <- Wilks(real_model_pds)
real_results_wilks_pds
```
The first canonical variate seems to explain significant variation in the data, but none of the others. Examining both the x and y loadings suggests that this is a general puberty factor.
```{r cca with pbip hair and breast measures}
# run once with pbip hair and breast measures
cca_model_pbip <- select_and_cca_fit(hormones, physical_mat[, c("pds1_height", "pbip2_pubic_hair", "pds3_skin", "pbip1_breasts" )])
cca_model_pbip$cancor
canonical.variates <- predict.cancor(cca_model_pbip,
hormones[,cca_model_pbip$names$X],
physical_mat_pbip)
cca_x_loadings_pbip <- cor(hormones, canonical.variates$X_pred)
cca_x_loadings_pbip
cca_y_loadings_pbip <- cor(physical_mat_pbip, canonical.variates$Y_pred)
cca_y_loadings_pbip
par(mfrow=c(1,2))
plot(canonical.variates$X_pred[,1],
canonical.variates$Y_pred[,1],
bty='n',
xlab='Hormone canonical variate 1',
ylab='Physical mat pbip canonical variate 1')
round(cca_model_pbip$cancor[1], 2) # doesn't work, but r = .55
plot(canonical.variates$X_pred[,2],
canonical.variates$Y_pred[,2],
bty='n',
xlab='Hormone canonical variate 2',
ylab='Physical mat pbip canonical variate 2')
round(cca_model_pbip$cancor[2], 2) # doesn't work, but r = .35
real_model <- cca_model_pbip
real_results_cancor <- real_model$cancor
real_results_wilks_pbip <- Wilks(real_model)
```
```{r run cca in pma}
#using PMA package
# use permutations to identify best L1 bound for x and z
perm.out <- CCA.permute(hormones, physical_mat, typex = "standard", typez = "standard", nperms = 100, penaltyxs=seq(.1,1,len=10), penaltyzs=seq(.1,1,len=10))
print(perm.out)
plot(perm.out)
cca_output <- CCA(x = hormones, z = physical_mat, typex = "standard", typez = "standard", penaltyx = .9, penaltyz = .9, K = 3)
# note: in examining the pma package documentation and the linked paper (Tibshirani and Wang), it seems that the "ordered" type doesn't deal with ordinal data, but rather with columns that have some kind of order to them, as in genes on a chromosome as in comparative genomic hybridization data.
# okay so damn. PMA offers penalized CCA options but no Wilks lambda test. candisc offers the reverse. PMA *DOES* have a permutation test and seems to handle the multicollinearity a bit better, but it only ever shows the FIRST variate.
```
# next examine CCA with ordinal variables? package homals may be a place to start. but it seems like the next direction may be