9.CCA.Rmd

---
title: "CCA"
author: "Erica Ryu"
date: "10/4/2022"
output:
  html_document:
    df_print: paged
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

# 9. CCA

The purpose of this script is to conduct correspondence analysis (CA) on the lifestyle survey data and canonical correspondence analysis (CCA) to identify which specific lifestyle factors are associated with the microbiome.

## load packages
```{r}
library(phyloseq)
library(FactoMineR)
library(factoextra)
library(vegan)
library(dplyr)
library(ggplot2)
library(tidyr)
```

## load data
```{r}
# read the saved phyloseq object
phyloseq_complete <- readRDS(file = "output/ps_complete.rds")

# one phyloseq object per extraction kit (stored in the Condition column)
psoil <- subset_samples(phyloseq_complete, Condition == "Psoil")
qiagen <- subset_samples(phyloseq_complete, Condition == "Qiagen")

# alpha diversity table
rfxn_micro <- read.csv(file = "output/rarefaction_edit03292024_final.csv", header = TRUE)
```

## set colors
```{r}
# base palette of four colours; the five-colour palette appends one more
fourcolors <- c("darkslateblue", "deepskyblue", "lightblue3", "lightsalmon")
fivecolors <- c(fourcolors, "firebrick")
```

## set up function
```{r}
# Bray-Curtis PCoA of a phyloseq object.
# Counts are converted to per-sample proportions, na.omit() is applied to the
# OTU table, and the ordination is returned invisibly (as the original
# trailing assignment did).
beta_ordinate <- function(physeq){
  rel_abund <- transform_sample_counts(physeq, function(counts) counts/sum(counts))
  rel_abund@otu_table <- na.omit(rel_abund@otu_table)
  pcoa_result <- ordinate(rel_abund, method = "PCoA", distance = "bray")
  invisible(pcoa_result)
}

# set up beta diversity data for plotting
#
# beta:   ordination result; the first four columns of `beta$vectors` are used
# physeq: object whose `@sam_data$Lifestyle` holds one label per sample; labels
#         are paired with the rows of `beta$vectors` by position
# returns: data frame with the four axes as numeric columns plus a `lifestyle`
#          factor (levels: Foragers, Recently Settled, Agriculturalists,
#          Expats, American Industrial); row names come from `beta$vectors`
plot_beta <- function(beta, physeq){
  # Build the data frame straight from the numeric matrix. cbind()-ing the axes
  # with a character vector would coerce every axis to a character string
  # (losing precision), and a factor would be bound as its integer codes.
  df_PCOA <- as.data.frame(beta$vectors[, 1:4, drop = FALSE])

  # extract lifestyle column from metadata
  lifestyle <- as.character(physeq@sam_data$Lifestyle)
  # change industrial to American industrial
  lifestyle <- gsub("Industrial", "American Industrial", lifestyle)
  # add space to Recently Settled
  lifestyle <- gsub("RecentlySettled", "Recently Settled", lifestyle)

  df_PCOA$lifestyle <- factor(
    lifestyle,
    levels = c("Foragers", "Recently Settled", "Agriculturalists", "Expats", "American Industrial")
  )
  df_PCOA
}
```

## set up metadata for analysis
```{r}
# Build the numeric survey table used for correspondence analysis:
# take the Qiagen sample metadata, drop the column ranges that are not
# analysed, fill sporadic NAs, and convert the remaining columns to numeric.
CA_meta <- as.data.frame(qiagen@sam_data)
CA_meta <- subset(CA_meta, Condition == "Qiagen")

# round-trip through a matrix so that all columns share a single type
df_CA <- as.data.frame(as.matrix(CA_meta))
df_CA_num <- dplyr::select(df_CA, "SEX2":"RHR")
dim(df_CA_num)

## remove factors and samples with lots of NAs
# filter out Industrial samples due to no survey data
df_CA_num$Lifestyle <- df_CA$Lifestyle
wo_euro <- subset(df_CA_num, Lifestyle != "Industrial")

# remove columns specific to women (pregnancy, menstruation, etc)
wo_mens <- dplyr::select(wo_euro, -c("MENS2":"MISC2"))

# remove columns for bitter taste perception
wo_bitter <- dplyr::select(wo_mens, -c("BTP_CTRL2":"BTP_SBZ2"))

# remove columns for vitals
wo_vital <- dplyr::select(wo_bitter, -c("HGT2":"RHR"))

# remove geographical info
wo_geo <- dplyr::select(wo_vital, -c("LAT2":"ALT2"))

## change sporadic NAs
# change NAs in EXER_FREQ2
wo_geo$EXER_FREQ2[is.na(wo_geo$EXER_FREQ2)] <- 0

# change NAs in SICK_LOC2
wo_geo$SICK_LOC2[is.na(wo_geo$SICK_LOC2)] <- 0

# change NAs in BRUSH_FREQ2
wo_geo$BRUSH_FREQ2[is.na(wo_geo$BRUSH_FREQ2)] <- 0

# change NAs in BLKT2
blkt <- wo_geo
blkt$BLKT2[is.na(blkt$BLKT2)] <- 0

# change NA in HS
household <- blkt
household$HS2[is.na(household$HS2)] <- 2

# final df for CA
df_CA_wo_lifestyle <- dplyr::select(household, -c("Lifestyle"))

# Convert column by column into a copy of the data frame. Unlike
# as.data.frame(sapply(...)), this keeps the row names and still yields a
# data frame of the right shape when only one row is present.
df_CA_final <- df_CA_wo_lifestyle
df_CA_final[] <- lapply(df_CA_wo_lifestyle, as.numeric)
dim(df_CA_final)
```

## make labels human readable
```{r}
# human-readable labels, listed in the same order as the columns of df_CA_final
readable_names <- c(
  "sex", "literacy", "location", "drinking_water", "fuel", "kitchen_loc",
  "toilet", "grain", "Sisnu", "fish", "meat", "black_tea", "milk_tea", "soda",
  "milk", "yogurt", "yogurt_freq", "fermented", "ferm_freq", "food_source",
  "scarcity", "smoking", "tobacco", "alcohol", "exercise", "exercise_freq",
  "sick_checkup", "checkup_loc", "health_travel", "meds", "ayurvedic",
  "brushing", "brushing_freq", "age", "education", "household", "children"
)

# The relabelling is positional. Fail loudly if the number of columns has
# drifted from the number of labels, rather than ending up with NA names.
stopifnot(length(readable_names) == ncol(df_CA_final))
colnames(df_CA_final) <- readable_names
```

## CA
```{r}
# copy the row names over from the pre-conversion data frame
rownames(df_CA_final) <- rownames(df_CA_wo_lifestyle)

# correspondence analysis keeping 37 dimensions
set.seed(100)
ca_res <- CA(df_CA_final, ncp = 37, graph = TRUE)
print(ca_res)

# eigenvalues and scree plot, to see what share of variance each axis explains
eig.val <- get_eigenvalue(ca_res)
fviz_screeplot(ca_res, addlabels = TRUE, ylim = c(0, 20))

# average-eigenvalue thresholds above which an axis should be kept
row_max <- 1 / (nrow(df_CA_final) - 1)
col_max <- 1 / (ncol(df_CA_final) - 1)
```

## determine specific contributions to each axis
```{r}
# top ten column contributions to dimension 1
dim1_contrib <- fviz_contrib(ca_res, choice = "col", axes = 1, top = 10)
ggsave(filename = "figures/dim1_contrib_small.pdf", plot = dim1_contrib, width = 4, height = 2)

# top ten column contributions to dimension 2
dim2_contrib <- fviz_contrib(ca_res, choice = "col", axes = 2, top = 10)
ggsave(filename = "figures/dim2_contrib_small.pdf", plot = dim2_contrib, width = 4, height = 2)
```

## plot CA
```{r}
# sample (row) coordinates on the first four CA dimensions
row_axes <- as.data.frame(get_ca_row(ca_res)$coord[, c(1, 2, 3, 4)])
colnames(row_axes) <- c("Dim1", "Dim2", "Dim3", "Dim4")

# lifestyle group derived from the sample name; the first matching rule wins
# and anything unmatched is "Recently Settled"
row_axes$lifestyle <- dplyr::case_when(
  grepl("CHE", row.names(row_axes)) ~ "Foragers",
  (grepl("NEW00", row.names(row_axes)) |
     grepl("NEW10", row.names(row_axes)) |
     grepl("THA", row.names(row_axes))) ~ "Agriculturalists",
  (grepl("NEW01", row.names(row_axes)) |
     grepl("NEW11", row.names(row_axes))) ~ "Expats",
  TRUE ~ "Recently Settled"
)
row_axes$lifestyle <- factor(
  row_axes$lifestyle,
  levels = c("Foragers", "Recently Settled", "Agriculturalists", "Expats")
)

# survey-variable (column) coordinates on the same four dimensions
col_axes <- as.data.frame(get_ca_col(ca_res)$coord[, c(1, 2, 3, 4)])
colnames(col_axes) <- c("Dim1", "Dim2", "Dim3", "Dim4")

# labels are pushed slightly outwards from their points for better visualization
scaling_factor <- 1.07

# only label factors that are in the top 10
labelled_factors <- c(
  "Sisnu", "literacy", "fuel", "health_travel", "education", "smoking",
  "scarcity", "ayurvedic", "children", "grain", "alcohol", "tobacco", "sex",
  "location", "yogurt_freq"
)
CA_labels <- col_axes[labelled_factors, ]

# biplot: coloured closed circles are individuals, black open circles are
# metadata variables
lifestyle_groups <- c("Foragers", "Recently Settled", "Agriculturalists", "Expats")
CA_plot <- ggplot(row_axes, aes(x = Dim1, y = Dim2)) +
  geom_point(aes(color = lifestyle), shape = 16, size = 2) +
  scale_color_manual(
    name = NULL,
    values = fourcolors,
    breaks = lifestyle_groups,
    labels = lifestyle_groups
  ) +
  geom_point(
    data = col_axes,
    aes(x = Dim1, y = Dim2),
    shape = 21, color = "black"
  ) +
  geom_text(
    data = CA_labels,
    aes(
      x = Dim1 * scaling_factor,
      y = Dim2 * scaling_factor,
      label = rownames(CA_labels)
    ),
    size = 3, color = "black"
  ) +
  labs(
    title = "CA",
    x = "CA1 (15.88%)",
    y = "CA2 (12.97%)"
  )

ggsave(filename = "figures/ca_plot.pdf", plot = CA_plot, width = 8, height = 6)
# shape legend (metadata) added in illustrator
```

## plot CA axes
```{r}
# Violin + box plots of the CA1 and CA2 sample coordinates per lifestyle group.
# The `color = black` argument previously passed to ggplot() was removed: it is
# not a ggplot() parameter and refers to an object named `black` that this
# script never defines.

# CA1 by lifestyle
ca1_axes <- ggplot(row_axes, aes(x = lifestyle, y = as.numeric(Dim1), group = lifestyle)) +
  geom_violin(alpha = 0.8, aes(fill = lifestyle)) +
  geom_boxplot(width = 0.1, color = "black", alpha = 0.2, outlier.shape = NA) +
  scale_y_continuous(breaks = seq(-1, 1, 0.2), labels = scales::comma) +
  scale_fill_manual(name = NULL,
                    values = fivecolors,
                    breaks = c("Foragers", "Recently Settled", "Agriculturalists", "Expats"),
                    labels = c("Foragers", "Recently Settled", "Agriculturalists", "Expats")) +
  geom_jitter(size = 1, col = "darkgreen", width = 0.1) +
  labs(x = "Lifestyle", y = "CA1 (15.88%)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 0.5, vjust = 0.5)) +
  theme(legend.position = "none")

ggsave(filename = "figures/ca1_axes.pdf", plot = ca1_axes, width = 4, height = 3)

# CA2 by lifestyle
ca2_axes <- ggplot(row_axes, aes(x = lifestyle, y = as.numeric(Dim2), group = lifestyle)) +
  geom_violin(alpha = 0.8, aes(fill = lifestyle)) +
  geom_boxplot(width = 0.1, color = "black", alpha = 0.2, outlier.shape = NA) +
  scale_y_continuous(breaks = seq(-1, 1, 0.2), labels = scales::comma) +
  scale_fill_manual(name = NULL,
                    values = fivecolors,
                    breaks = c("Foragers", "Recently Settled", "Agriculturalists", "Expats"),
                    labels = c("Foragers", "Recently Settled", "Agriculturalists", "Expats")) +
  geom_jitter(size = 1, col = "darkgreen", width = 0.1) +
  labs(x = "Lifestyle", y = "CA2 (12.97%)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 0.5, vjust = 0.5)) +
  theme(legend.position = "none")

ggsave(filename = "figures/ca2_axes.pdf", plot = ca2_axes, width = 4, height = 3)
```

## assess if CA axes are correlated with PCoA axes
```{r}
# CA sample coordinates: rows sorted by sample ID, first four dimensions kept
ca.data <- as.data.frame(ca_res$row$coord)
ca.data$SampleID <- factor(row.names(ca.data))
ca.data <- ca.data[order(ca.data$SampleID), c(1, 2, 3, 4)]
colnames(ca.data) <- c("CA1", "CA2", "CA3", "CA4")
dim(ca.data)

# PCoA coordinates for the same samples
pcoa_corr <- subset_samples(qiagen, Lifestyle != "Industrial") # remove Europeans due to no survey data
bray_ordinate_corr <- beta_ordinate(pcoa_corr)
bray_plot_corr <- plot_beta(bray_ordinate_corr, pcoa_corr)
bray_plot_corr_num <- as.data.frame(sapply(bray_plot_corr, as.numeric))
rownames(bray_plot_corr_num) <- rownames(bray_plot_corr)
colnames(bray_plot_corr_num) <- c("PCoA1", "PCoA2", "PCoA3", "PCoA4", "lifestyle")
dim(bray_plot_corr_num)

# join the two coordinate tables on sample name and re-derive the lifestyle
# group from that name (first matching rule wins)
pcoa_ca <- merge(ca.data, bray_plot_corr_num, by = "row.names")
pcoa_ca$lifestyle <- dplyr::case_when(
  grepl("CHE", pcoa_ca$Row.names) ~ "Foragers",
  (grepl("NEW00", pcoa_ca$Row.names) |
     grepl("NEW10", pcoa_ca$Row.names) |
     grepl("THA", pcoa_ca$Row.names)) ~ "Agriculturalists",
  (grepl("NEW01", pcoa_ca$Row.names) |
     grepl("NEW11", pcoa_ca$Row.names)) ~ "Expats",
  TRUE ~ "Recently Settled"
)

# scatter of one PCoA axis (x) against one CA axis (y) with a fitted glm line;
# y_sign = -1 mirrors the CA axis in the plot only
scatter_pcoa_ca <- function(x_axis, y_axis, y_sign = 1) {
  groups <- c("Foragers", "Recently Settled", "Agriculturalists", "Expats")
  ggplot(pcoa_ca, aes(x = .data[[x_axis]], y = .data[[y_axis]] * y_sign)) +
    geom_point(shape = 21, color = "black", size = 3, aes(fill = lifestyle)) +
    scale_fill_manual(name = NULL, values = fourcolors, breaks = groups, labels = groups) +
    geom_smooth(method = "glm") +
    labs(title = paste(x_axis, "vs", y_axis), x = x_axis, y = y_axis)
}

### plot and calculate correlations between axes
## axes 1
pcoa1_ca1 <- scatter_pcoa_ca("PCoA1", "CA1")
ggsave(filename = "figures/pcoa1_ca1_small.pdf", plot = pcoa1_ca1, width = 4, height = 2)
cor.test(pcoa_ca$PCoA1, pcoa_ca$CA1, alternative = "two.sided", method=c("spearman"))

## axes 2 (CA2 is mirrored in the plot)
pcoa2_ca2 <- scatter_pcoa_ca("PCoA2", "CA2", y_sign = -1)
ggsave(filename = "figures/pcoa2_ca2_small.pdf", plot = pcoa2_ca2, width = 4, height = 2)
cor.test(pcoa_ca$PCoA2, pcoa_ca$CA2, alternative = "two.sided", method=c("spearman"))

## axes 3 (CA3 is mirrored in the plot)
pcoa3_ca3 <- scatter_pcoa_ca("PCoA3", "CA3", y_sign = -1)
cor.test(pcoa_ca$PCoA3, pcoa_ca$CA3, alternative = "two.sided", method=c("spearman"))
ggsave(filename = "figures/pcoa3_ca3_small.pdf", plot = pcoa3_ca3, width = 4, height = 2)

## CA1 vs PCoA2
pcoa2_ca1 <- scatter_pcoa_ca("PCoA2", "CA1")
cor.test(pcoa_ca$PCoA2, pcoa_ca$CA1, alternative = "two.sided", method=c("spearman"))
ggsave(filename = "figures/pcoa2_ca1_small.pdf", plot = pcoa2_ca1, width = 4, height = 2)

## CA2 vs PCoA1
pcoa1_ca2 <- scatter_pcoa_ca("PCoA1", "CA2")
cor.test(pcoa_ca$PCoA1, pcoa_ca$CA2, alternative = "two.sided", method=c("spearman"))
ggsave(filename = "figures/pcoa1_ca2_small.pdf", plot = pcoa1_ca2, width = 4, height = 2)
```

## CA axes vs alpha diversity
```{r}
# subset alpha diversity to qiagen samples
qiagen_rfxn <- subset(rfxn_micro, Condition == "Qiagen")

# subset to shannon and faiths
shannon_qiagen <- subset(qiagen_rfxn, measure == "Shannon")
faiths_qiagen <- subset(qiagen_rfxn, measure == "Faiths")

# remove europeans from alpha diversity
shannon_CA <- shannon_qiagen[!grepl("EUR", shannon_qiagen$SampleID), ]
faiths_CA <- faiths_qiagen[!grepl("EUR", faiths_qiagen$SampleID), ]

# Keep the alpha mean and lifestyle columns and label each row with the CA
# sample it belongs to.
#   alpha_df: alpha diversity rows with SampleID, mean and Lifestyle columns
#   ca_ids:   row names of the CA coordinate table
# Rows are matched to `ca_ids` by SampleID, so the pairing does not depend on
# the two tables being sorted the same way. If some CA row names are not found
# among the SampleID values, the previous positional pairing is kept and a
# warning is raised so the row order can be verified.
align_alpha_to_ca <- function(alpha_df, ca_ids) {
  picked <- alpha_df[, c("mean", "Lifestyle")]
  idx <- match(ca_ids, as.character(alpha_df$SampleID))
  if (anyNA(idx)) {
    warning(
      "SampleID values do not all match the CA row names; ",
      "pairing alpha diversity with CA rows by position - verify the row order",
      call. = FALSE
    )
  } else {
    picked <- picked[idx, , drop = FALSE]
  }
  rownames(picked) <- ca_ids
  picked
}

# select alpha mean value and lifestyle
shannon_CA <- align_alpha_to_ca(shannon_CA, rownames(ca.data))
faiths_CA <- align_alpha_to_ca(faiths_CA, rownames(ca.data))

shannon_CA_comb <- merge(shannon_CA, ca.data, by = "row.names")
faiths_CA_comb <- merge(faiths_CA, ca.data, by = "row.names")

# scatter of an alpha diversity mean (x) against one CA axis (y) with a fitted
# glm line
#   alpha_comb: merged alpha/CA table
#   alpha_name: "Shannon" or "Faiths", used in the title and x label
#   ca_axis:    name of the CA column to plot, e.g. "CA1"
plot_alpha_vs_ca <- function(alpha_comb, alpha_name, ca_axis) {
  ggplot(alpha_comb, aes(x = mean, y = .data[[ca_axis]])) +
    geom_point(shape = 21, color = "black", size = 3, aes(fill = Lifestyle)) +
    scale_fill_manual(name = NULL,
                      values = fourcolors,
                      breaks = c("Foragers", "RecentlySettled", "Agriculturalists", "Expats"),
                      labels = c("Foragers", "Recently Settled", "Agriculturalists", "Expats")) +
    geom_smooth(method = "glm") +
    labs(title = paste(alpha_name, "Alpha vs", ca_axis),
         x = paste(alpha_name, "Alpha"),
         y = ca_axis)
}

### correlations between Shannon and CA
## shannon and CA1
shannon_ca1 <- plot_alpha_vs_ca(shannon_CA_comb, "Shannon", "CA1")
ggsave(filename = "figures/shannon_ca1_small.pdf", plot = shannon_ca1, width = 4, height = 2)
cor.test(shannon_CA_comb$mean, shannon_CA_comb$CA1, alternative = "two.sided", method=c("spearman"))

## shannon and CA2
shannon_ca2 <- plot_alpha_vs_ca(shannon_CA_comb, "Shannon", "CA2")
ggsave(filename = "figures/shannon_ca2_small.pdf", plot = shannon_ca2, width = 4, height = 2)
cor.test(shannon_CA_comb$mean, shannon_CA_comb$CA2, alternative = "two.sided", method=c("spearman"))

## faiths and CA1
faiths_ca1 <- plot_alpha_vs_ca(faiths_CA_comb, "Faiths", "CA1")
ggsave(filename = "figures/faiths_ca1_small.pdf", plot = faiths_ca1, width = 4, height = 2)
cor.test(faiths_CA_comb$mean, faiths_CA_comb$CA1, alternative = "two.sided", method=c("spearman"))

## faiths and CA2
faiths_ca2 <- plot_alpha_vs_ca(faiths_CA_comb, "Faiths", "CA2")
ggsave(filename = "figures/faiths_ca2_small.pdf", plot = faiths_ca2, width = 4, height = 2)
cor.test(faiths_CA_comb$mean, faiths_CA_comb$CA2, alternative = "two.sided", method=c("spearman"))
```

## prep taxa table and survey data for CCA
```{r}
### prep taxa table
# remove Industrial samples from the qiagen phyloseq object
phyloseq_wo_euro <- subset_samples(qiagen, Lifestyle != "Industrial")

# collapse to genus level (taxa without a genus assignment are kept)
phy_genus <- tax_glom(phyloseq_wo_euro, "Genus", NArm = FALSE)
phy_genus

# convert to relative abundance
phy_genus_RA <- transform_sample_counts(phy_genus, function(otu) otu/sum(otu))

# extract taxa table
df_OTU_CCA <- as.data.frame(as.matrix(phy_genus_RA@otu_table))

# log transform
log_df_OTU_CCA <- decostand(df_OTU_CCA, "log")

## add taxa names
# get taxa table
CCA_tax_table <- as.data.frame(as.matrix(phy_genus_RA@tax_table))

# change NA to unclassified taxon
CCA_tax_table$Genus <- replace_na(CCA_tax_table$Genus, "_unclassified")

# fill each missing rank with the rank above it, so an unclassified genus can
# be labelled with the lowest rank that is known
CCA_tax_table$Phylum <- ifelse(is.na(CCA_tax_table$Phylum), CCA_tax_table$Kingdom, CCA_tax_table$Phylum)
CCA_tax_table$Class <- ifelse(is.na(CCA_tax_table$Class), CCA_tax_table$Phylum, CCA_tax_table$Class)
CCA_tax_table$Order <- ifelse(is.na(CCA_tax_table$Order), CCA_tax_table$Class, CCA_tax_table$Order)
CCA_tax_table$Family <- ifelse(is.na(CCA_tax_table$Family), CCA_tax_table$Order, CCA_tax_table$Family)

CCA_tax_table$Genus <- ifelse(CCA_tax_table$Genus == "_unclassified",
                              paste0(CCA_tax_table$Family, CCA_tax_table$Genus),
                              CCA_tax_table$Genus)

# isolate to genera
CCA_genera_table <- subset(CCA_tax_table, select = c(Genus))

# add taxonomic info to taxa table (transposed first, then merged on row names)
CCA_OTU_log_t <- t(log_df_OTU_CCA)
CCA_otu_tax <- merge(CCA_OTU_log_t, CCA_genera_table, by = 0, all = TRUE)

# rename NA_classified
CCA_otu_tax$Genus <- gsub("NA_unclassified", "Unclassified", CCA_otu_tax$Genus)

# make Genus the rownames and remove extra columns.
# make.unique() leaves already-unique labels untouched and appends a numeric
# suffix to repeats, so two taxa that end up with the same label no longer
# abort the script with a duplicate row.names error.
rownames(CCA_otu_tax) <- make.unique(CCA_otu_tax$Genus)
CCA_otu_tax <- subset(CCA_otu_tax, select = -c(Genus, Row.names))

# transpose back
CCA_otu_tax_t <- t(CCA_otu_tax)

### prep survey data
df_CA_factor <- mutate_if(df_CA_final, is.numeric, as.factor) # convert to factor
```

## CCA with the factors selected from CA1 and CA2
```{r}
# constrained ordination of the taxa table on the selected survey factors
set.seed(101)
cca_model_CA1_CA2 <- cca(
  CCA_otu_tax_t ~ alcohol + tobacco + smoking + scarcity + sex + location +
    yogurt_freq + literacy + Sisnu + grain + fuel + health_travel +
    education + ayurvedic + children,
  data = df_CA_factor
)

# variance inflation factors of the constraints
vif.cca(cca_model_CA1_CA2)

# model summary
cca_model_CA1_CA2

# test significance of the model: overall, per term, and per axis
anova.cca(cca_model_CA1_CA2)
anova.cca(cca_model_CA1_CA2, by = "terms")
anova.cca(cca_model_CA1_CA2, by = "axis")
```

## prep for CCA with CA1 final plot 
```{r}
# Site scores on CCA1 and CCA2. They are read with scores() rather than from
# summary()$sites: both use the same default scaling, and scores() is the
# supported accessor.
# NOTE(review): newer vegan releases reportedly no longer include scores in
# summary() - confirm against the installed version.
df_sites <- as.data.frame(vegan::scores(cca_model_CA1_CA2, display = "sites", choices = 1:2))
# biplot scores of the environment variables, used for the arrows
df_environ <- as.data.frame(vegan::scores(cca_model_CA1_CA2, display = "bp"))

# percentage of variance explained by the first and second axis
# (summary() is computed once and reused)
cca_importance <- summary(cca_model_CA1_CA2)$cont$importance
cca1_varex <- round(cca_importance[2, 1] * 100, 2)
cca2_varex <- round(cca_importance[2, 2] * 100, 2)

# add lifestyle info to sites, derived from the sample name (first match wins)
df_sites$lifestyle <- dplyr::case_when(
  grepl("CHE", row.names(df_sites)) ~ "Foragers",
  (grepl("NEW00", row.names(df_sites)) |
     grepl("NEW10", row.names(df_sites)) |
     grepl("THA", row.names(df_sites))) ~ "Agriculturalists",
  (grepl("NEW01", row.names(df_sites)) |
     grepl("NEW11", row.names(df_sites))) ~ "Expats",
  TRUE ~ "Recently Settled"
)

# fix environmental labels (positional: one label per row of df_environ)
rownames(df_environ) <- c(
  "alcohol", "tobacco", "smoking", "scarcity", "sex", "location",
  "yogurt once month", "yogurt twice month", "yogurt twice week",
  "literacy", "Sisnu", "grain", "fuel elec/gas", "fuel biogas",
  "health_travel", "education 2-6", "education 7-11", "education 12+",
  "unknown ayurvedic", "ayurvedic 0-6mo", "ayurvedic 6-12 mo",
  "1-2 children", "3-5 children", "6+ children"
)

# filter to only significant factors for visualization
df_environ_fig <- df_environ[c("alcohol", "smoking", "location", "Sisnu", "grain"), ]

scaling_factor_CCA <- 2.5 # arrows are stretched for better visualization

CCA_sig_plot <- ggplot(data = df_sites) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_vline(xintercept = 0, linetype = "dashed") +
  coord_fixed() +
  # site points
  geom_point(shape = 21, color = c("black"), size = 3,
             aes(x = CCA1, y = CCA2, fill = lifestyle)) +
  scale_fill_manual(name = NULL,
                    values = fourcolors,
                    breaks = c("Foragers", "Recently Settled", "Agriculturalists", "Expats"),
                    labels = c("Foragers", "Recently Settled", "Agriculturalists", "Expats")) +
  stat_ellipse(level = 0.7, aes(x = CCA1, y = CCA2, color = lifestyle), type = "norm") +
  scale_color_manual(name = NULL,
                     values = fourcolors,
                     breaks = c("Foragers", "Recently Settled", "Agriculturalists", "Expats"),
                     labels = c("Foragers", "Recently Settled", "Agriculturalists", "Expats")) +
  # environmental variable arrows, drawn from the origin
  geom_segment(data = df_environ_fig,
               aes(x = 0,
                   xend = CCA1 * scaling_factor_CCA,
                   y = 0,
                   yend = CCA2 * scaling_factor_CCA),
               color = "firebrick",
               arrow = arrow(length = unit(0.01, "npc"))) +
  # environmental variable text, justified away from the origin at the arrow tip
  geom_text(data = df_environ_fig, size = 3,
            aes(x = CCA1 * scaling_factor_CCA,
                y = CCA2 * scaling_factor_CCA,
                label = rownames(df_environ_fig),
                hjust = 0.5 * (1 - sign(CCA1)),
                vjust = 0.5 * (1 - sign(CCA2))),
            color = "firebrick") +
  theme_bw() +
  # x and y axis titles
  labs(x = paste0("CCA1 (", cca1_varex, " %)"),
       y = paste0("CCA2 (", cca2_varex, " %)"))

ggsave(filename = "figures/CCA.pdf", plot = CCA_sig_plot, width = 5, height = 4)
```

## check to see if CA axes follow the lifestyle trend
```{r}
# Jonckheere-Terpstra tests of the CA sample coordinates (Dim1, Dim2) across
# the groups of the `lifestyle` factor in row_axes.
# NOTE(review): the "CCA without variable selection" chunk further down calls
# vegan's cca()/vif.cca() after DescTools has been attached here; given the
# warning below, confirm those calls still work in a top-to-bottom run.
library(DescTools) # this MUST be loaded after running permanova and all other vegan functions, because loading both DescTools and vegan causes vegan to stop working

# Dim1 vs lifestyle
JonckheereTerpstraTest(as.numeric(Dim1) ~ lifestyle, data = row_axes)
# Dim2 vs lifestyle
JonckheereTerpstraTest(as.numeric(Dim2) ~ lifestyle, data = row_axes)
```

## CCA without variable selection
```{r}
# CCA on every survey variable (no variable selection)
set.seed(101)
cca_model <- cca(CCA_otu_tax_t ~ alcohol + tobacco + smoking + scarcity + sex + location + yogurt_freq + literacy + Sisnu + grain + fuel + health_travel + education + ayurvedic + drinking_water + kitchen_loc + toilet + fish + meat + black_tea + milk_tea + soda + milk + yogurt + fermented + ferm_freq + food_source + exercise + exercise_freq + sick_checkup + checkup_loc + meds + brushing + brushing_freq + age + household + children, data = df_CA_factor)

# variance inflation factors of the full model fitted above
# (previously this inspected cca_model_CA1_CA2, the reduced model, by mistake)
vif.cca(cca_model)
```
```