-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLakeO_Analyses_2019-2021.R
7650 lines (6963 loc) · 375 KB
/
LakeO_Analyses_2019-2021.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
###### BATCH CORRECTION & ASSOCIATED ANALYSES ######
## First had to go through and manually assign batches to the samples within the
## metadata file (based on mapping files)
## 10 KNOWN SEQUENCING RUNS IN TOTAL (an unknown sequence run making 11 "UNK")
###### SET WORKING DIRECTORY AND SEED ####
## NOTE(review): an absolute setwd() path makes the script machine-specific;
## consider an RStudio project or the here package instead.
setwd("F:/Paise_Thesis/LakeO_Data/2019-2021_LakeO_Data/Analyses/LakeO_BatchCorrected/Analyses_Corrected")
#or setwd("/Volumes/PaiseSSD-T7/Paise_Thesis/LakeO_Data/2019-2021_LakeO_Data/Analyses/LakeO_BatchCorrected/Analyses_Corrected") for use on the lab computer
set.seed(1998) # fixed seed so the permutation tests below are reproducible
###### Packages ######
library(vegan)      # community ecology: decostand, vegdist, betadisper, anosim
library(ggplot2)
library(tidyverse)
library(reshape2)
library(BiocManager)
library(MMUPHin)    # batch-effect correction (adjust_batch)
## one-time setup: updating BiocManager and installing MMUPHin (kept commented out)
# if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# BiocManager::install(version = "3.16")
# BiocManager::install("MMUPHin")
###### Creating relative abundance data ######
set.seed(1998)
## Feature table as exported: rows = ASVs, columns = samples (integer counts)
dat<-read.csv("feature_Y123_nobcmASVs-nobelow10korDupes.csv", header=TRUE, row.names = 1)
dat<-data.matrix(dat)
typeof(dat) #"integer"
dat <- t(dat) # transpose so samples are rows and ASVs are columns
row.names(dat) # row names should now be the sample names
metadata <- read.csv("Metadata-Diversity_BATCH.csv", header = TRUE, row.names = 1)
typeof(metadata) ## "list"
dat <- as.data.frame(dat)
typeof(dat)
## Keep only samples present in BOTH tables, in the same order, so that
## feature rows and metadata rows line up for the analyses below
common.rownames <- intersect(rownames(dat), rownames(metadata))
dat <- dat[common.rownames,]
metadata <- metadata[common.rownames,]
all.equal(rownames(dat),rownames(metadata))
otu.abund<-which(colSums(dat)>2) # drop singletons and doubletons
dat.dom<-dat[,otu.abund] #dominant taxa
dat.pa<-decostand(dat.dom, method ="pa") #presence/absence data
## keep ASVs detected in MORE than 1% of samples (0.01 * n samples); 8,340 taxa retained
dat.otus.01per<-which(colSums(dat.pa) > (0.01*nrow(dat.pa)))
dat.01per<-dat.dom[,dat.otus.01per]
## keep ASVs detected in more than 0.1% of samples; 44,623 taxa retained
## (looser cutoff keeps more ASVs - includes more "microdiversity")
dat.otus.001per<-which(colSums(dat.pa) > (0.001*nrow(dat.pa)))
dat.001per<-dat.dom[,dat.otus.001per]
dat.ra<-decostand(dat.01per, method = "total") # per-sample relative abundance of the >1% table
###### ANOSIM by Sequencing Batch ######
set.seed(1998)
##requires dat.ra (relative abundance table) created in the section above
##create Bray-Curtis dissimilarity distance matrix between samples
ra.bc.dist<-vegdist(dat.ra, method = "bray")
##betadisper calculates dispersion (variances) within each group
dis.Batch <- betadisper(ra.bc.dist,metadata$Batch)
##permutest determines if the variances differ by groups (If differences are SIGNIFICANT - use ANOSIM
## if not use PERMANOVA (adonis)) -- ANOSIM is less sensitive to heterogeneous dispersion
permutest(dis.Batch, pairwise=TRUE, permutations=999)
##pasted output from the run below (for the record):
# Df Sum Sq Mean Sq F N.Perm Pr(>F)
# Groups 10 1.0196 0.101957 10.832 999 0.001 *** SIGNIFICANT - USE ANOSIM!!
# Residuals 530 4.9886 0.009413
# ---
# Pairwise comparisons:
# (Observed p-value below diagonal, permuted p-value above diagonal)
# ELIZA2 ELIZA23 ELIZA3 LO22 LO310 LO8382 NOAA PAIS1 PAIS2 PAIS3 UNK
# ELIZA2 2.5800e-01 2.2800e-01 4.0000e-03 1.0000e-03 2.8000e-02 7.0300e-01 5.4400e-01 2.0000e-02 2.0000e-03 0.001
# ELIZA23 2.4836e-01 2.9000e-02 4.5000e-02 1.0000e-03 9.7000e-02 3.1600e-01 4.7000e-01 1.6100e-01 4.9000e-02 0.001
# ELIZA3 1.9314e-01 2.3483e-02 3.0000e-03 1.0000e-03 6.0000e-03 4.5400e-01 1.1300e-01 3.0000e-03 1.0000e-03 0.001
# LO22 4.2982e-03 4.4251e-02 1.9370e-03 1.1800e-01 9.9300e-01 7.5000e-02 1.0000e-02 5.1900e-01 7.1500e-01 0.005
# LO310 1.4327e-04 5.5957e-04 8.7490e-06 1.2846e-01 1.8300e-01 3.0000e-03 1.0000e-03 3.1000e-02 4.1000e-02 0.966
# LO8382 2.2967e-02 9.5642e-02 4.8732e-03 9.9098e-01 1.8031e-01 1.1600e-01 3.4000e-02 6.0500e-01 7.5900e-01 0.029
# NOAA 7.1430e-01 3.1669e-01 4.7882e-01 7.6250e-02 3.0428e-03 1.0361e-01 5.3600e-01 1.1600e-01 7.0000e-02 0.001
# PAIS1 5.5294e-01 4.9884e-01 9.5982e-02 8.2362e-03 2.4554e-04 3.9403e-02 5.4000e-01 5.3000e-02 4.0000e-03 0.001
# PAIS2 1.9845e-02 1.6321e-01 2.9914e-03 5.0465e-01 2.4943e-02 5.7184e-01 1.0857e-01 4.4072e-02 7.0500e-01 0.002
# PAIS3 1.6726e-03 4.7249e-02 1.0649e-03 6.9056e-01 3.8967e-02 7.4158e-01 7.1715e-02 3.8772e-03 7.0437e-01 0.001
# UNK 6.1433e-14 3.1874e-10 7.8632e-11 4.5319e-03 9.6747e-01 2.3258e-02 3.6162e-05 3.0384e-14 5.3552e-05 3.8664e-05
##ANOSIM - determining if the differences between two or more groups are significant.
## The ANOSIM statistic “R” compares the mean of ranked dissimilarities between groups to
## the mean of ranked dissimilarities within groups. An R value close to “1" suggests
## dissimilarity between groups while an R value close to “0” suggests an even distribution of
## high and low ranks within and between groups”
## the higher the R value, the more dissimilar your groups are in terms of microbial community composition.
anosim(ra.bc.dist, metadata$Batch, permutations = 999)
# ANOSIM statistic R: 0.1486
# Significance: 0.001
## re-run with more permutations for a finer p-value resolution
anosim(ra.bc.dist, metadata$Batch, permutations = 9999)
# ANOSIM statistic R: 0.1486
# Significance: 0.0001
## Conclusion? There are significantly weak differences between batches so the
## data needs to be batch corrected and ALL analyses redone.
###### BATCH CORRECTION ######
set.seed(1998)
library(MMUPHin)
library(vegan)
## Loading in feature- and metadata (fresh load; independent of sections above)
dat <- read.csv("feature_Y123_nobcmASVs-nobelow10korDupes.csv", header=TRUE, row.names = 1)
dat <- data.matrix(dat)
typeof(dat) #"integer"
dat <- t(dat) #transposing so samples are rows, ASVs are columns
row.names(dat) # row names should now be the sample names
metadata <- read.csv("Metadata-Diversity_BATCH.csv", header = TRUE, row.names = 1)
typeof(metadata) ## "list"
dat <- as.data.frame(dat)
typeof(dat)
## Align feature table and metadata on their shared samples, same order
common.rownames <- intersect(rownames(dat), rownames(metadata))
dat <- dat[common.rownames,]
metadata <- metadata[common.rownames,]
all.equal(rownames(dat),rownames(metadata)) #TRUE
## Batch Correction (following Harvard/MMUPHin tutorial)
#looking at how many samples are in each batch
table(metadata$Batch)
# ELIZA2 ELIZA23 ELIZA3 LO22 LO310 LO8382 NOAA PAIS1 PAIS2 PAIS3 UNK
# 62 50 11 38 20 20 6 98 40 72 124
#Adjusting (removing) batch effect with adjust_batch()
#adjust_batch expects taxa as rows in the feature table and samples as rows in metadata,
#hence the t(dat) below; feature table should be a matrix, metadata a dataframe
fit_adjust_batch <- adjust_batch(feature_abd = t(dat),
batch = "Batch",
data = metadata)
Lake_abd_adj <- fit_adjust_batch$feature_abd_adj #now adjusted feature table MATRIX (taxa x samples)
Lake_abd_adj <- as.data.frame(Lake_abd_adj) #converting to data frame
write.csv(Lake_abd_adj, "feature_Y123_ADJUSTED.csv") #saving as csv for all downstream analyses
###### Creating a rarefaction curve on the read counts ######
## Plots per-sample rarefaction curves (# ASVs observed vs. # reads sampled)
## for the blank-filtered feature table.
library(vegan)
#load in data with NO blank samples or blank ASVs
rardat<-read.csv("feature_Y123_noblanksorbASVs.csv", header=TRUE, row.names=1, sep=',')
#samples arrive in columns; rarecurve() needs samples in rows, so transpose
trans.rardat <- t(rardat)
## check file to make sure it worked
trans.rardat[1:5,1:5] #shows rows 1 through 5 and the samples should now be the rows
##making the transformed data matrix into main
rardat <- trans.rardat
##t() returns a matrix; convert back into a data frame
rardat <-as.data.frame(rardat)
#check data file to make sure it looks okay (interactive viewer)
View(rardat)
rowSums(rardat) #per-sample read totals
#### Creating the rarefaction curve
#count the number of ASVs within each sample
S <- specnumber(rardat)
raremax <- min(rowSums(rardat)) ## smallest per-sample read count (0 in this dataset)
#creating color palette for curve
colors() ## lists the color names that are built into R (interactive aid)
cc <- palette()
palette(c(cc,"purple","brown")) ## extending the color ramp for the plot
cc <- palette()
#plotting the rarefaction curves
## rarecurve auto-removes samples that have no reads
## NOTE(review): removed the original `pars <- expand.grid()` line -- it
## created an empty data frame that nothing referenced.
Hklim <- rarecurve(rardat, step = 2000, sample=raremax, col = cc, label = TRUE, main="Rarefaction Curve for Lake O read counts",
cex= 0.14, cex.axis= 0.7, cex.lab= 1, xlim=c(0,100000), xlab = "# of Reads", ylab = "# of ASVs", tidy = TRUE)
#### ####
###### ANALYSES ON BATCH CORRECTED DATA ######
###### SET WORKING DIRECTORY AND SEED ####
setwd("F:/Paise_Thesis/LakeO_Data/2019-2021_LakeO_Data/Analyses/LakeO_BatchCorrected/Analyses_Corrected")
#or setwd("/Volumes/PaiseSSD-T7/Paise_Thesis/LakeO_Data/2019-2021_LakeO_Data/Analyses/LakeO_BatchCorrected/Analyses_Corrected")
#for use on the lab computer
set.seed(1998) # re-seed so analyses below are reproducible when run standalone
###### PACKAGES ######
## Install-if-missing guards run BEFORE the library() calls. In the original
## script the install lines came AFTER library(), so a missing package made
## library() error out before its install line could ever run, and the
## installs re-ran unconditionally on every execution of the script.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
bioc.pkgs <- c("DESeq2", "lefser", "ALDEx2", "ANCOMBC", "phyloseq",
               "microbiome", "microbiomeutilities")
for (pkg in bioc.pkgs) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    BiocManager::install(pkg)
  }
}
##Had to install using binaries (3/9/23 on iMAC) -- only re-run when missing
if (!requireNamespace("tibble", quietly = TRUE)) {
  install.packages("tibble", type = "binary")
}
if (!requireNamespace("Hmisc", quietly = TRUE)) {
  install.packages("Hmisc", type = "binary")
}
## Attach every package used by the analyses below
library(phyloseq)
library(vegan)
library(ggplot2)
library(tidyverse)
library(RVAideMemoire)
library(DESeq2)
library(corrplot)
library(multcompView)
library(pgirmess)
library(data.table)
library(microbiome)
library(BiocManager)
library(ggthemes)
library(gplots)
library(RColorBrewer)
library(cooccur)
library(visNetwork)
library(Hmisc)
library(cowplot)
library(reshape2)
library(sjmisc)
library(MASS)
library(scales)
library(forcats)
library(leaflet)
library(eulerr)
library(microbiomeutilities)
## Notes on packages:
# pgirmess = Kruskal-Wallis Test
# RVAideMemoire = PERMANOVA
# cowplot = making multiple plots using ggplots objects
###### GENERATING CITATIONS FOR R AND R PACKAGES ######
## Print the citation for base R, then one citation per package used in the
## manuscript. Explicit print() matches the auto-printing of the original
## top-level calls.
print(citation()) #retrieves the citation for R
cited.pkgs <- c("ggplot2", "phyloseq", "vegan", "microbiome", "MMUPHin",
                "pgirmess", "multcompView", "RVAideMemoire")
for (pkg in cited.pkgs) {
  print(citation(pkg))
}
###### Prepping data for analyses #####
### import feature-table data (the batch-ADJUSTED table written above) ###
##change to csv or import as a tsv using read.table function
dat<-read.csv("feature_Y123_ADJUSTED.csv", header=TRUE, row.names = 1) ## do not add "header =" or "row.names =" for merging
# 561 samples; 65294 taxa
dat<-data.matrix(dat) ##if data is not recognized as a data.frame numeric
typeof(dat) #"integer"
#check data file to make sure it looks okay
#as you can see the samples are in columns and need to be in the rows so we need to flip or transpose the file
#transpose the data to rows
trans.dat <- t(dat)
## check file to make sure it worked
trans.dat[1:5,1:5] #shows rows 1 through 5 and the samples should now be the rows
##set transposed data to main data variable
dat <-trans.dat
row.names(dat) # row names should now be the sample names
### import metadata ###
###(if you intend to do any statistical analyses in R)
##If not skip to refining and normalizing steps
metadata <- read.csv("Metadata-Diversity_BATCH.csv", header = TRUE, row.names = 1)
##should read "list"
typeof(metadata) ## "list"
dat <- as.data.frame(dat) ## had to change dat back into a data frame to check for matching rows
typeof(dat) ## "list"
##check to make sure the sample names match and are correct
common.rownames <- intersect(rownames(dat), rownames(metadata))
##541 rows are in common (20 S80 samples NOT included)
##if there are any rows that do not match, they will not be included in the statistical analysis or relative abundance tables
dat <- dat[common.rownames,]
metadata <- metadata[common.rownames,]
##check that all rows match (same names, same order)
all.equal(rownames(dat),rownames(metadata)) #TRUE so yes they all match
dat[1:5,1:3] ## double-checking that everything looks good
##merging the working feature and taxonomy tables
## BUG FIX: `dat` at this point is samples x taxa with SAMPLE names as
## rownames, so it has no "FeatureID" column and merge(by = "FeatureID")
## errored. Transpose back to taxa x samples and promote the ASV rownames
## into a FeatureID column before joining with the taxonomy table.
feat <- as.data.frame(t(dat))
feat$FeatureID <- rownames(feat)
tax <- read.csv("taxonomy_Y123_edited&cleaned.csv")
## full outer join: keep ASVs present in either table
feattax <- merge.data.frame(feat, tax, by= "FeatureID", all.x=TRUE, all.y = TRUE)
write.csv(feattax, "feat-tax_Y123_cleaned.csv", row.names = FALSE)
## CONTINUE HERE IF YOU ARE IGNORING METADATA ###
## refining and normalizing data #
##remove singletons and doubletons - ASVs that only show up once or twice
##this can be modified or removed if desired. Depends on what you want to know
library(vegan)
otu.abund<-which(colSums(dat)>2)
dat.dom<-dat[,otu.abund] #46838 taxa
##all this will get rid of ASVs that appear in less than a certain percent of samples
##this is not always something that you should do depending on your question.
dat.pa<-decostand(dat.dom, method ="pa") #"pa" = standardization method that scales your data to presence/absence (0/1)
##keep ASVs detected in MORE than 1% of samples (0.01 * n samples) ***
dat.otus.01per<-which(colSums(dat.pa) > (0.01*nrow(dat.pa)))
dat.01per<-dat.dom[,dat.otus.01per]
# 8,340 taxa
write.csv(as.data.frame(t(dat.01per)), "feature_Y123_0.01per.csv")
##keep ASVs detected in more than 0.1% of samples ---> keeps more ASVs - includes more "micro-diversity"
dat.otus.001per<-which(colSums(dat.pa) > (0.001*nrow(dat.pa)))
dat.001per<-dat.dom[,dat.otus.001per]
# 46,838 taxa
## relative abundance --> normalization ##
dat.ra<-decostand(dat.01per, method = "total") #"total" = standardization method that divides your data by margin total (def. margin = 1)
##export relative abundance table(s)
write.csv(dat.ra, "relative-abundance_Y123.csv")
## SHORTCUT WITH NO EXPLANATIONS
## re-creating relative abundance table (same steps as "Prepping data" +
## "refining and normalizing" above, condensed; run this after a restart
## to rebuild dat, metadata, dat.01per, dat.001per and dat.ra)
set.seed(1998)
dat<-read.csv("feature_Y123_ADJUSTED.csv", header=TRUE, row.names = 1)
dat<-data.matrix(dat)
typeof(dat)
dat <- t(dat)
row.names(dat)
metadata <- read.csv("Metadata-Diversity_BATCH.csv", header = TRUE, row.names = 1)
typeof(metadata)
dat <- as.data.frame(dat)
typeof(dat)
common.rownames <- intersect(rownames(dat), rownames(metadata))
dat <- dat[common.rownames,]
metadata <- metadata[common.rownames,]
all.equal(rownames(dat),rownames(metadata))
otu.abund<-which(colSums(dat)>2)
dat.dom<-dat[,otu.abund]
dat.pa<-decostand(dat.dom, method ="pa")
dat.otus.01per<-which(colSums(dat.pa) > (0.01*nrow(dat.pa)))
dat.01per<-dat.dom[,dat.otus.01per]
dat.otus.001per<-which(colSums(dat.pa) > (0.001*nrow(dat.pa)))
dat.001per<-dat.dom[,dat.otus.001per]
dat.ra<-decostand(dat.01per, method = "total")
###### Merging relative abundance with taxonomy and getting averages ######
## Attach taxonomy columns to each year's relative-abundance table
## (left join on the shared FeatureID column)
Yr1 <- read.csv("Year1_RA.csv")
Yr2 <- read.csv("Year2_RA.csv")
Yr3 <- read.csv("Year3_RA.csv")
tax <- read.csv("taxonomy_Y123_edited&cleaned.csv")
Yr1t <- merge.data.frame(Yr1,tax,by= "FeatureID", all.x = TRUE)
Yr2t <- merge.data.frame(Yr2,tax,by= "FeatureID", all.x = TRUE)
Yr3t <- merge.data.frame(Yr3,tax,by= "FeatureID", all.x = TRUE)
## NOTE(review): these OVERWRITE the input CSVs with the taxonomy columns
## appended -- re-running this section on the merged files will duplicate
## the taxonomy columns.
write.csv(Yr1t, "Year1_RA.csv")
write.csv(Yr2t, "Year2_RA.csv")
write.csv(Yr3t, "Year3_RA.csv")
### Average and St.dev abundance of each phylum in each year
library(tidyverse)
## Year 1
#first merge data with matching taxonomy (section above) and load csv
Yr1 <- read.csv("Year1_RA.csv", row.names = 1)
#Sum relative abundance by phylum within each sample (columns = samples)
physumY1 <- Yr1 %>%
  group_by(Phylum) %>%
  summarise(across(where(is.numeric), sum))
#Per-phylum mean and standard deviation across samples (drop the Phylum column)
Y1mean <- apply(physumY1[,-1], 1, mean, na.rm=TRUE)
Y1std <- apply(physumY1[,-1], 1, sd, na.rm=TRUE)
# BUG FIX: the original used as.data.frame(cbind(physumY1$Phylum, ...)),
# which coerced the numeric mean/sd columns to character (cbind with a
# character vector builds a character matrix). data.frame() keeps them
# numeric, so the exported CSV holds numbers, not strings.
Y1avsd <- data.frame(Phylum = physumY1$Phylum,
                     Average = Y1mean,
                     Stand.Dev = Y1std)
write.csv(Y1avsd, "Year1_AvSD-UPDATED.csv")
#Extract top 10 phyla and save as csv
# NOTE(review): top10phy.names.Y1 is defined much LATER in this script (the
# phyloseq "Top 10 phyla each year" section) -- run that section first.
top101 <- names(top10phy.names.Y1)
Y1avsd10 <- filter(Y1avsd, Phylum %in% top101)
write.csv(Y1avsd10, "Year1_AvSD_TOP10-UPDATED.csv")
## Year 2
Yr2 <- read.csv("Year2_RA.csv", row.names = 1)
#Sum relative abundance by phylum within each sample (columns = samples)
physumY2 <- Yr2 %>%
  group_by(Phylum) %>%
  summarise(across(where(is.numeric), sum))
#Per-phylum mean and standard deviation across samples (drop the Phylum column)
Y2mean <- apply(physumY2[,-1], 1, mean, na.rm=TRUE)
Y2std <- apply(physumY2[,-1], 1, sd, na.rm=TRUE)
# BUG FIX: as.data.frame(cbind(...)) coerced Average/Stand.Dev to character;
# data.frame() keeps them numeric.
Y2avsd <- data.frame(Phylum = physumY2$Phylum,
                     Average = Y2mean,
                     Stand.Dev = Y2std)
write.csv(Y2avsd, "Year2_AvSD-UPDATED.csv")
#Extract top 10 phyla and save as csv
# NOTE(review): top10phy.names.Y2 is defined later in this script -- run the
# phyloseq "Top 10 phyla each year" section first.
top102 <- names(top10phy.names.Y2)
Y2avsd10 <- filter(Y2avsd, Phylum %in% top102)
write.csv(Y2avsd10, "Year2_AvSD_TOP10-UPDATED.csv")
## Year 3
Yr3 <- read.csv("Year3_RA.csv", row.names = 1)
#Sum relative abundance by phylum within each sample (columns = samples)
physumY3 <- Yr3 %>%
  group_by(Phylum) %>%
  summarise(across(where(is.numeric), sum))
#Per-phylum mean and standard deviation across samples (drop the Phylum column)
Y3mean <- apply(physumY3[,-1], 1, mean, na.rm=TRUE)
Y3std <- apply(physumY3[,-1], 1, sd, na.rm=TRUE)
# BUG FIX: as.data.frame(cbind(...)) coerced Average/Stand.Dev to character;
# data.frame() keeps them numeric.
Y3avsd <- data.frame(Phylum = physumY3$Phylum,
                     Average = Y3mean,
                     Stand.Dev = Y3std)
write.csv(Y3avsd, "Year3_AvSD-UPDATED.csv")
#Extract top 10 phyla and save as csv
# NOTE(review): top10phy.names.Y3 is defined later in this script -- run the
# phyloseq "Top 10 phyla each year" section first.
top103 <- names(top10phy.names.Y3)
Y3avsd10 <- filter(Y3avsd, Phylum %in% top103)
write.csv(Y3avsd10, "Year3_AvSD_TOP10-UPDATED.csv")
# Merge all years together and save as csv
#Original lists
#put all three per-year average/sd data frames into one list
Y123avstd <- list(Y1avsd, Y2avsd, Y3avsd)
#full join across the list keeps every phylum seen in ANY year (NA where absent)
all <- Y123avstd %>% reduce(full_join, by='Phylum')
#renaming the joined columns (full_join suffixes them .x/.y by position)
colnames(all)[2] ="Y1mean"
colnames(all)[3] ="Y1std"
colnames(all)[4] ="Y2mean"
colnames(all)[5] ="Y2std"
colnames(all)[6] ="Y3mean"
colnames(all)[7] ="Y3std"
#Top 10 lists: same merge for the per-year top-10 subsets
Y123avstd10 <- list(Y1avsd10, Y2avsd10, Y3avsd10)
top10 <- Y123avstd10 %>% reduce(full_join, by='Phylum')
colnames(top10)[2] ="Y1mean"
colnames(top10)[3] ="Y1std"
colnames(top10)[4] ="Y2mean"
colnames(top10)[5] ="Y2std"
colnames(top10)[6] ="Y3mean"
colnames(top10)[7] ="Y3std"
#Save as csvs
write.csv(all, "Year123_AvSD.csv")
write.csv(top10, "Year123_AvSD_TOP10.csv")
###### Separating feature table by Station (CSVs) ######
## One taxa-x-samples data frame per monitoring station, selected by the
## station prefix of the sample rownames of dat.ra. assign() creates the
## same variables (CLV, KISS, L1, ...) the original one-liners did.
station.prefixes <- c(CLV = "^CLV10A", KISS = "^KISSR0.0", L1 = "^L001",
                      L4 = "^L004", L5 = "^L005", L6 = "^L006",
                      L7 = "^L007", L8 = "^L008", LZ2 = "^LZ2_",
                      Z25A = "^LZ25A", Z30 = "^LZ30", Z40 = "^LZ40",
                      PALM = "^PALMOUT", PEL = "^PELBAY3",
                      POLE3S = "^POLE3S", PO = "^POLESOUT",
                      RIT = "^RITTAE2", S308 = "^S308",
                      S77 = "^S77", S79 = "^S79")
for (station in names(station.prefixes)) {
  matching.rows <- grep(station.prefixes[[station]], rownames(dat.ra))
  assign(station, as.data.frame(t(dat.ra[matching.rows, ])))
}
#S80 not included in adjusted dataset
###### Separating feature table by Year then Station (CSVs) ######
## Sample names end in _19/_20/_21 = Year 1/2/3; subset, transpose to
## taxa x samples, and export one feature table per year
dat1 <- as.data.frame(t(dat.ra[grep("_19$", rownames(dat.ra)),]))
dat2 <- as.data.frame(t(dat.ra[grep("_20$", rownames(dat.ra)),]))
dat3 <- as.data.frame(t(dat.ra[grep("_21$", rownames(dat.ra)),]))
write.csv(dat1,"feature_Y1r_ADJUSTED.csv")
write.csv(dat2,"feature_Y2r_ADJUSTED.csv")
write.csv(dat3,"feature_Y3r_ADJUSTED.csv")
## transpose back to samples x taxa so the station grep below can match rownames
dat1 <- as.data.frame(t(dat1))
dat2 <- as.data.frame(t(dat2))
dat3 <- as.data.frame(t(dat3))
#Year 1 / Year 2 / Year 3 Stations
## The original repeated the same 20 grep/transpose assignments once per
## year, overwriting the station variables each pass (so after this chunk
## the variables hold the YEAR 3 subsets, exactly as before). A nested loop
## with assign() reproduces that assignment sequence verbatim.
yr.station.prefixes <- c(CLV = "^CLV10A", KISS = "^KISSR0.0", L1 = "^L001",
                         L4 = "^L004", L5 = "^L005", L6 = "^L006",
                         L7 = "^L007", L8 = "^L008", LZ2 = "^LZ2_",
                         Z25A = "^LZ25A", Z30 = "^LZ30", Z40 = "^LZ40",
                         PALM = "^PALMOUT", PEL = "^PELBAY3",
                         POLE3S = "^POLE3S", PO = "^POLESOUT",
                         RIT = "^RITTAE2", S308 = "^S308",
                         S77 = "^S77", S79 = "^S79")
for (year.dat in list(dat1, dat2, dat3)) {
  for (station in names(yr.station.prefixes)) {
    keep <- grep(yr.station.prefixes[[station]], rownames(year.dat))
    assign(station, as.data.frame(t(year.dat[keep, ])))
  }
}
###### TOP 10 TAXA BAR CHART - ALL YEARS TOGETHER ######
## Build a phyloseq object from the relative-abundance table, taxonomy,
## and metadata, then plot the top 10 phyla as a stacked bar chart by Zone.
asvdat <- as.data.frame(t(dat.ra))
taxdat <- read.csv("taxonomy_Y123_edited&cleaned.csv", header = TRUE, row.names = 1)
meta <- read.csv("Metadata-Diversity_BATCH.csv", header = TRUE, row.names = 1)
asvmat <- data.matrix(asvdat)
taxmat <- as.matrix(taxdat) # use as.matrix NOT data.matrix as data.matrix would convert the taxonomy strings into numbers
ASV <- otu_table(asvmat, taxa_are_rows = TRUE)
TAX <- tax_table(taxmat)
META <- sample_data(meta)
#Merging metadata, taxonomy, and ASV tables into one phyloseq object
physeq <- phyloseq(ASV,TAX,META)
#Use transform functions from microbiome package
transform <- microbiome::transform
#Convert counts to compositional (per-sample relative) abundances
physeq_transform <- transform(physeq, "compositional")
ASV # 8,340 taxa & 541 samples
TAX # 8,340 taxa by 7 tax. ranks
META # 541 samples by 42 sample variables
### Basic stats of seq. reads
#Check number of microbes observed in each sample
sample_sums(physeq)
##Basic stats for reads of samples
sum(sample_sums(physeq))
#Total reads = 24,093,755
mean(sample_sums(physeq))
#Mean = 44,535.59
min(sample_sums(physeq))
#Min= 10,029
max(sample_sums(physeq))
#Max = 193,655
sd(sample_sums(physeq))
#Stan.Dev = 24,782.95
ntaxa(physeq)
#Total ASVs = 65,294
physeq
# phyloseq-class experiment-level object
# otu_table() OTU Table: [ 8340 taxa and 541 samples ]
# sample_data() Sample Data: [ 541 samples by 42 sample variables ]
# tax_table() Taxonomy Table: [ 8340 taxa by 7 taxonomic ranks ]
##Retrieves the unique taxonomic ranks observed in the data set
##[#] = rank index (1 = Domain ... 7 = Species, i.e. DPCOFGS)
get_taxa_unique(physeq, taxonomic.rank=rank_names(physeq)[7], errorIfNULL=TRUE)
##counts recorded from running the line above at each rank index:
#Unique Domains = 4
#Unique Phyla = 56
#Unique Classes = 142
#Unique Orders = 351
#Unique Families = 508
#Unique Genera = 728
#Unique Species = 317
## make sure there is a phyloseq object which includes the data, metadata, and taxonomy ##
## Aggregating (summing) ASVs up to higher taxonomic levels
phyPhy <- aggregate_taxa(physeq, 'Phylum')
phyClass <- aggregate_taxa(physeq, 'Class')
phyOrd <- aggregate_taxa(physeq, 'Order')
phyGen <- aggregate_taxa(physeq, 'Genus')
LakeOPhy <- as.data.frame(taxa_sums(phyPhy))
LakeOClass <- as.data.frame(taxa_sums(phyClass))
LakeOOrd <- as.data.frame(taxa_sums(phyOrd))
LakeOGenus <- as.data.frame(taxa_sums(phyGen))
#Saving each table as CSV
write.csv(LakeOPhy, "LakeOPhylaTotals.csv")
write.csv(LakeOClass, "LakeOClassesTotals.csv")
write.csv(LakeOOrd, "LakeOOrdersTotals.csv")
write.csv(LakeOGenus, "LakeOGeneraTotals.csv")
## Top 10 Phyla
#Sum compositional abundance within each Phylum, sort descending, keep top 10
top10phy.names <- sort(tapply(taxa_sums(physeq_transform), tax_table(physeq_transform)[, "Phylum"], sum), TRUE)[1:10]
## write.csv(top10phy.names, "Top10PhylaLakeO.csv")
# Proteobacteria Bacteroidota Cyanobacteria Actinobacteriota Verrucomicrobiota Planctomycetota Acidobacteriota
# 121.550676 110.168874 81.682736 57.976055 38.301827 34.610471 15.164802
# Bdellovibrionota Chloroflexi Gemmatimonadota
# 14.615002 11.278973 9.640009
#Cut down the physeq data to only the top 10 Phyla
top10phyla <- subset_taxa(physeq_transform, Phylum %in% names(top10phy.names))
## Plotting taxa stacked bar based on Zone, faceted by Year
LakePhylaZ <- plot_bar(top10phyla, x="Zone", y="Abundance", fill="Phylum")
LakePhylaZ <- LakePhylaZ +
geom_bar(aes(fill=Phylum), stat="identity", position="fill", width = 0.96)+ #width=0.96 removes any space between bars
ggtitle("Top 10 Phyla in Lake Okeechobee by Zone - March 2019 to October 2021")+
facet_grid(.~Year, scales = "free",
labeller = as_labeller(c('1'='Year 1 (2019)',
'2'='Year 2 (2020)',
'3'='Year 3 (2021)')))+ #scales=free -> allows ggplot to change the axes for the data shown in each facet
theme_light()+ #labeller -> changing the labels of the grid
theme(axis.text.x = element_text(angle = 90, vjust = 0.28))+ #vjust= moves the x-axis text labels
theme(plot.title = element_text(color="navyblue", size=14, face="bold.italic", hjust = 0.5))+ #hjust= 0.5 centers the title
theme(legend.title = element_text(face="italic"))
##Changing the color (by changing the default in ggplot2 [from HELP])
LakeOTop10 <- c("#2bcaf4","#24630e","#edc427","#1f60aa","#333333",
"#41ea27","#806bb4","#5f421b","#f08539","#ff9eed")
## listed by phyla in alphabetical order
## with_options temporarily overrides ggplot2's discrete palettes while printing
withr::with_options(list(ggplot2.discrete.fill = LakeOTop10, ggplot2.discrete.colour = LakeOTop10),print(LakePhylaZ))
###### Top 10 phyla each year (CSVs) ######
#Subsetting original ASV table by year (sample names end in _19/_20/_21)
Y1r <- dat.ra[grep("_19$", rownames(dat.ra)),]
Y2r <- dat.ra[grep("_20$", rownames(dat.ra)),]
Y3r <- dat.ra[grep("_21$", rownames(dat.ra)),]
# BUG FIX: the original wrote t(Y1), t(Y2), t(Y3) -- objects never defined
# anywhere in this script; the year subsets created above are Y1r/Y2r/Y3r.
write.csv(t(Y1r), "Year1_RA.csv")
write.csv(t(Y2r), "Year2_RA.csv")
write.csv(t(Y3r), "Year3_RA.csv")
# OR
#Load in data if already exported to CSVs
Y1r <- read.csv("Year1_RA.csv", row.names = 1)
Y2r <- read.csv("Year2_RA.csv", row.names = 1)
Y3r <- read.csv("Year3_RA.csv", row.names = 1)
#Top 10 phyla in each year
##Year 1
# Build a phyloseq object for Year 1 only: the Year-1 ASV relative-abundance
# table plus the shared taxonomy and sample-metadata files.
asvdat <- Y1r
taxdat <- read.csv("taxonomy_Y123_edited&cleaned.csv", header = TRUE, row.names = 1)
meta <- read.csv("Metadata-Diversity_BATCH.csv", header = TRUE, row.names = 1)
# Taxonomy must go through as.matrix -- data.matrix would coerce taxon names to numbers
phyY1 <- phyloseq(otu_table(data.matrix(asvdat), taxa_are_rows = TRUE),
                  tax_table(as.matrix(taxdat)),
                  sample_data(meta))
# Pin transform to the microbiome package's version (base R also defines transform())
transform <- microbiome::transform
phyY1_transform <- transform(phyY1, "compositional")
### Assigning Top 10 Phyla
# Sum relative abundance within each phylum, sort largest first, keep the top 10
top10phy.names.Y1 <- sort(tapply(taxa_sums(phyY1_transform),
                                 tax_table(phyY1_transform)[, "Phylum"], sum),
                          decreasing = TRUE)[1:10]
top10phy.names.Y1
# Proteobacteria Bacteroidota Cyanobacteria Actinobacteriota Planctomycetota Verrucomicrobiota Bdellovibrionota
# 37.118712 34.048403 18.633005 16.562391 11.084878 10.877789 5.230602
# Acidobacteriota Chloroflexi Crenarchaeota
# 4.481436 3.249468 2.864711
# Keep only the ASVs belonging to those 10 phyla
top10phylaY1 <- subset_taxa(phyY1_transform, Phylum %in% names(top10phy.names.Y1))
# Save the phylum names and proportions to CSV
topphylaY1 <- data.frame(Abundance = top10phy.names.Y1)
write.csv(topphylaY1, "Top10Phyla_Year1.csv")
##Year 2
# Same pipeline as Year 1, applied to the Year-2 ASV table.
asvdat <- Y2r
taxdat <- read.csv("taxonomy_Y123_edited&cleaned.csv", header = TRUE, row.names = 1)
meta <- read.csv("Metadata-Diversity_BATCH.csv", header = TRUE, row.names = 1)
# as.matrix keeps the taxonomy as character; data.matrix would turn names into numbers
phyY2 <- phyloseq(otu_table(data.matrix(asvdat), taxa_are_rows = TRUE),
                  tax_table(as.matrix(taxdat)),
                  sample_data(meta))
phyY2_transform <- transform(phyY2, "compositional") # transform = microbiome::transform
### Assigning Top 10 Phyla
# Per-phylum relative-abundance totals, largest first, top 10 kept
top10phy.names.Y2 <- sort(tapply(taxa_sums(phyY2_transform),
                                 tax_table(phyY2_transform)[, "Phylum"], sum),
                          decreasing = TRUE)[1:10]
# Restrict the phyloseq object to those phyla
top10phylaY2 <- subset_taxa(phyY2_transform, Phylum %in% names(top10phy.names.Y2))
# Export names and proportions
topphylaY2 <- data.frame(Abundance = top10phy.names.Y2)
write.csv(topphylaY2, "Top10Phyla_Year2.csv")
##Year 3
# Same pipeline again for the Year-3 ASV table.
asvdat <- Y3r
taxdat <- read.csv("taxonomy_Y123_edited&cleaned.csv", header = TRUE, row.names = 1)
meta <- read.csv("Metadata-Diversity_BATCH.csv", header = TRUE, row.names = 1)
# as.matrix (not data.matrix) so taxonomy stays character
phyY3 <- phyloseq(otu_table(data.matrix(asvdat), taxa_are_rows = TRUE),
                  tax_table(as.matrix(taxdat)),
                  sample_data(meta))
transform <- microbiome::transform # base R also has transform(); use microbiome's
phyY3_transform <- transform(phyY3, "compositional")
### Assigning Top 10 Phyla
# Per-phylum relative-abundance totals, largest first, top 10 kept
top10phy.names.Y3 <- sort(tapply(taxa_sums(phyY3_transform),
                                 tax_table(phyY3_transform)[, "Phylum"], sum),
                          decreasing = TRUE)[1:10]
# Restrict the phyloseq object to those phyla
top10phylaY3 <- subset_taxa(phyY3_transform, Phylum %in% names(top10phy.names.Y3))
# Export names and proportions
topphylaY3 <- data.frame(Abundance = top10phy.names.Y3)
write.csv(topphylaY3, "Top10Phyla_Year3.csv")
###### Top 10 by Stations (CSVs) - ALL YEARS TOGETHER ######
## Use sample name order from Metadata file to keep samples in chronological order
#Note: psmelt() turns phyloseq object into a large dataframe that is in LONG format
## CLV10A
# Phyloseq object for station CLV10A (all years) from the station ASV table
# plus the shared taxonomy and metadata files.
asvdat <- CLV
taxdat <- read.csv("taxonomy_Y123_edited&cleaned.csv", header = TRUE, row.names = 1)
meta <- read.csv("Metadata-Diversity_BATCH.csv", header = TRUE, row.names = 1)
# as.matrix for taxonomy: data.matrix would coerce taxon names to numbers
phyCLV <- phyloseq(otu_table(data.matrix(asvdat), taxa_are_rows = TRUE),
                   tax_table(as.matrix(taxdat)),
                   sample_data(meta))
transform <- microbiome::transform # pin to microbiome's transform()
phyCLV_transform <- transform(phyCLV, "compositional")
### Assigning Top 10 Phyla
# Per-phylum relative-abundance totals, largest first, top 10 kept
top10phy.names.CLV <- sort(tapply(taxa_sums(phyCLV_transform),
                                  tax_table(phyCLV_transform)[, "Phylum"], sum),
                           decreasing = TRUE)[1:10]
# Restrict the phyloseq object to those phyla
top10phylaCLV <- subset_taxa(phyCLV_transform, Phylum %in% names(top10phy.names.CLV))
# Export names and proportions
topphylaCLV <- data.frame(Abundance = top10phy.names.CLV)
write.csv(topphylaCLV, "Top10Phyla_CLV.csv")
## KISSR0.0 - (Firmicutes removed-> KISSR0.0_3_20)
# Phyloseq object for station KISSR0.0 from its ASV table plus shared
# taxonomy and metadata.
asvdat <- KISS
taxdat <- read.csv("taxonomy_Y123_edited&cleaned.csv", header = TRUE, row.names = 1)
meta <- read.csv("Metadata-Diversity_BATCH.csv", header = TRUE, row.names = 1)
# as.matrix for taxonomy: data.matrix would coerce taxon names to numbers
phyKISS <- phyloseq(otu_table(data.matrix(asvdat), taxa_are_rows = TRUE),
                    tax_table(as.matrix(taxdat)),
                    sample_data(meta))
transform <- microbiome::transform # pin to microbiome's transform()
phyKISS_transform <- transform(phyKISS, "compositional")
## Top 10 Phyla
# Per-phylum relative-abundance totals, largest first, top 10 kept
top10phy.names.KISS <- sort(tapply(taxa_sums(phyKISS_transform),
                                   tax_table(phyKISS_transform)[, "Phylum"], sum),
                            decreasing = TRUE)[1:10]
# Restrict the phyloseq object to those phyla
top10phylaKISS <- subset_taxa(phyKISS_transform, Phylum %in% names(top10phy.names.KISS))
# Export names and proportions
topphylaKISS <- data.frame(Abundance = top10phy.names.KISS)
write.csv(topphylaKISS, "Top10Phyla_KISS.csv")
## L001
# Phyloseq object for station L001 from its ASV table plus shared
# taxonomy and metadata.
asvdat <- L1
taxdat <- read.csv("taxonomy_Y123_edited&cleaned.csv", header = TRUE, row.names = 1)
meta <- read.csv("Metadata-Diversity_BATCH.csv", header = TRUE, row.names = 1)
# as.matrix for taxonomy: data.matrix would coerce taxon names to numbers
phyL1 <- phyloseq(otu_table(data.matrix(asvdat), taxa_are_rows = TRUE),
                  tax_table(as.matrix(taxdat)),
                  sample_data(meta))
transform <- microbiome::transform # pin to microbiome's transform()
phyL1_transform <- transform(phyL1, "compositional")
## Top 10 Phyla
# Per-phylum relative-abundance totals, largest first, top 10 kept
top10phy.names.L1 <- sort(tapply(taxa_sums(phyL1_transform),
                                 tax_table(phyL1_transform)[, "Phylum"], sum),
                          decreasing = TRUE)[1:10]
# Restrict the phyloseq object to those phyla
top10phylaL1 <- subset_taxa(phyL1_transform, Phylum %in% names(top10phy.names.L1))
# Export names and proportions
topphylaL1 <- data.frame(Abundance = top10phy.names.L1)
write.csv(topphylaL1, "Top10Phyla_L001.csv")
## L004
# Phyloseq object for station L004 from its ASV table plus shared
# taxonomy and metadata.
asvdat <- L4
taxdat <- read.csv("taxonomy_Y123_edited&cleaned.csv", header = TRUE, row.names = 1)
meta <- read.csv("Metadata-Diversity_BATCH.csv", header = TRUE, row.names = 1)
# as.matrix for taxonomy: data.matrix would coerce taxon names to numbers
phyL4 <- phyloseq(otu_table(data.matrix(asvdat), taxa_are_rows = TRUE),
                  tax_table(as.matrix(taxdat)),
                  sample_data(meta))
transform <- microbiome::transform # pin to microbiome's transform()
phyL4_transform <- transform(phyL4, "compositional")
## Top 10 Phyla
# Per-phylum relative-abundance totals, largest first, top 10 kept
top10phy.names.L4 <- sort(tapply(taxa_sums(phyL4_transform),
                                 tax_table(phyL4_transform)[, "Phylum"], sum),
                          decreasing = TRUE)[1:10]
# Restrict the phyloseq object to those phyla
top10phylaL4 <- subset_taxa(phyL4_transform, Phylum %in% names(top10phy.names.L4))
# Export names and proportions
topphylaL4 <- data.frame(Abundance = top10phy.names.L4)
write.csv(topphylaL4, "Top10Phyla_L004.csv")
## L005 (Firmicutes removed-> L005_3_20)
# Phyloseq object for station L005 from its ASV table plus shared
# taxonomy and metadata.
asvdat <- L5
taxdat <- read.csv("taxonomy_Y123_edited&cleaned.csv", header = TRUE, row.names = 1)
meta <- read.csv("Metadata-Diversity_BATCH.csv", header = TRUE, row.names = 1)
# as.matrix for taxonomy: data.matrix would coerce taxon names to numbers
phyL5 <- phyloseq(otu_table(data.matrix(asvdat), taxa_are_rows = TRUE),
                  tax_table(as.matrix(taxdat)),
                  sample_data(meta))
transform <- microbiome::transform # pin to microbiome's transform()
phyL5_transform <- transform(phyL5, "compositional")
## Top 10 Phyla
# Per-phylum relative-abundance totals, largest first, top 10 kept
top10phy.names.L5 <- sort(tapply(taxa_sums(phyL5_transform),
                                 tax_table(phyL5_transform)[, "Phylum"], sum),
                          decreasing = TRUE)[1:10]
# Restrict the phyloseq object to those phyla
top10phylaL5 <- subset_taxa(phyL5_transform, Phylum %in% names(top10phy.names.L5))
# Export names and proportions
topphylaL5 <- data.frame(Abundance = top10phy.names.L5)
write.csv(topphylaL5, "Top10Phyla_L005.csv")
## L006
# Phyloseq object for station L006 from its ASV table plus shared
# taxonomy and metadata.
asvdat <- L6
taxdat <- read.csv("taxonomy_Y123_edited&cleaned.csv", header = TRUE, row.names = 1)
meta <- read.csv("Metadata-Diversity_BATCH.csv", header = TRUE, row.names = 1)
# as.matrix for taxonomy: data.matrix would coerce taxon names to numbers
phyL6 <- phyloseq(otu_table(data.matrix(asvdat), taxa_are_rows = TRUE),
                  tax_table(as.matrix(taxdat)),
                  sample_data(meta))
phyL6_transform <- transform(phyL6, "compositional") # transform = microbiome::transform
## Top 10 Phyla
# Per-phylum relative-abundance totals, largest first, top 10 kept
top10phy.names.L6 <- sort(tapply(taxa_sums(phyL6_transform),
                                 tax_table(phyL6_transform)[, "Phylum"], sum),
                          decreasing = TRUE)[1:10]
# Restrict the phyloseq object to those phyla
top10phylaL6 <- subset_taxa(phyL6_transform, Phylum %in% names(top10phy.names.L6))
# Export names and proportions
topphylaL6 <- data.frame(Abundance = top10phy.names.L6)
write.csv(topphylaL6, "Top10Phyla_L006.csv")
## L007
# Phyloseq object for station L007 from its ASV table plus shared
# taxonomy and metadata.
asvdat <- L7
taxdat <- read.csv("taxonomy_Y123_edited&cleaned.csv", header = TRUE, row.names = 1)
meta <- read.csv("Metadata-Diversity_BATCH.csv", header = TRUE, row.names = 1)
# as.matrix for taxonomy: data.matrix would coerce taxon names to numbers
phyL7 <- phyloseq(otu_table(data.matrix(asvdat), taxa_are_rows = TRUE),
                  tax_table(as.matrix(taxdat)),
                  sample_data(meta))
phyL7_transform <- transform(phyL7, "compositional") # transform = microbiome::transform
## Top 10 Phyla
# Per-phylum relative-abundance totals, largest first, top 10 kept
top10phy.names.L7 <- sort(tapply(taxa_sums(phyL7_transform),
                                 tax_table(phyL7_transform)[, "Phylum"], sum),
                          decreasing = TRUE)[1:10]
# Restrict the phyloseq object to those phyla
top10phylaL7 <- subset_taxa(phyL7_transform, Phylum %in% names(top10phy.names.L7))
# Export names and proportions
topphylaL7 <- data.frame(Abundance = top10phy.names.L7)
write.csv(topphylaL7, "Top10Phyla_L007.csv")
## L008
# Phyloseq object for station L008 from its ASV table plus shared
# taxonomy and metadata.
asvdat <- L8
taxdat <- read.csv("taxonomy_Y123_edited&cleaned.csv", header = TRUE, row.names = 1)
meta <- read.csv("Metadata-Diversity_BATCH.csv", header = TRUE, row.names = 1)
# as.matrix for taxonomy: data.matrix would coerce taxon names to numbers
phyL8 <- phyloseq(otu_table(data.matrix(asvdat), taxa_are_rows = TRUE),
                  tax_table(as.matrix(taxdat)),
                  sample_data(meta))
phyL8_transform <- transform(phyL8, "compositional") # transform = microbiome::transform
## Top 10 Phyla
# Per-phylum relative-abundance totals, largest first, top 10 kept
top10phy.names.L8 <- sort(tapply(taxa_sums(phyL8_transform),
                                 tax_table(phyL8_transform)[, "Phylum"], sum),
                          decreasing = TRUE)[1:10]
# Restrict the phyloseq object to those phyla
top10phylaL8 <- subset_taxa(phyL8_transform, Phylum %in% names(top10phy.names.L8))
# Export names and proportions
topphylaL8 <- data.frame(Abundance = top10phy.names.L8)
write.csv(topphylaL8, "Top10Phyla_L008.csv")
## LZ25A
# Phyloseq object for station LZ25A from its ASV table plus shared
# taxonomy and metadata.
asvdat <- Z25A
taxdat <- read.csv("taxonomy_Y123_edited&cleaned.csv", header = TRUE, row.names = 1)
meta <- read.csv("Metadata-Diversity_BATCH.csv", header = TRUE, row.names = 1)
# as.matrix for taxonomy: data.matrix would coerce taxon names to numbers
phy25A <- phyloseq(otu_table(data.matrix(asvdat), taxa_are_rows = TRUE),
                   tax_table(as.matrix(taxdat)),
                   sample_data(meta))
transform <- microbiome::transform # pin to microbiome's transform()
phy25A_transform <- transform(phy25A, "compositional")
## Top 10 Phyla
# Per-phylum relative-abundance totals, largest first, top 10 kept
top10phy.names.25A <- sort(tapply(taxa_sums(phy25A_transform),
                                  tax_table(phy25A_transform)[, "Phylum"], sum),
                           decreasing = TRUE)[1:10]
# Restrict the phyloseq object to those phyla
top10phyla25A <- subset_taxa(phy25A_transform, Phylum %in% names(top10phy.names.25A))
# Export names and proportions
topphyla25A <- data.frame(Abundance = top10phy.names.25A)
write.csv(topphyla25A, "Top10Phyla_LZ25A.csv")
## LZ2 (Firmicutes contam. removed LZ2_3_20)
# Phyloseq object for station LZ2 from its ASV table plus shared
# taxonomy and metadata.
asvdat <- LZ2
taxdat <- read.csv("taxonomy_Y123_edited&cleaned.csv", header = TRUE, row.names = 1)
meta <- read.csv("Metadata-Diversity_BATCH.csv", header = TRUE, row.names = 1)
# as.matrix for taxonomy: data.matrix would coerce taxon names to numbers
phyLZ2 <- phyloseq(otu_table(data.matrix(asvdat), taxa_are_rows = TRUE),
                   tax_table(as.matrix(taxdat)),
                   sample_data(meta))
phyLZ2_transform <- transform(phyLZ2, "compositional") # transform = microbiome::transform
## Top 10 Phyla
# Per-phylum relative-abundance totals, largest first, top 10 kept
top10phy.names.LZ2 <- sort(tapply(taxa_sums(phyLZ2_transform),
                                  tax_table(phyLZ2_transform)[, "Phylum"], sum),
                           decreasing = TRUE)[1:10]
# Restrict the phyloseq object to those phyla
top10phylaLZ2 <- subset_taxa(phyLZ2_transform, Phylum %in% names(top10phy.names.LZ2))
# Export names and proportions
topphylaLZ2 <- data.frame(Abundance = top10phy.names.LZ2)
write.csv(topphylaLZ2, "Top10Phyla_LZ2.csv")
## LZ30
asvdat <- Z30
taxdat <- read.csv("taxonomy_Y123_edited&cleaned.csv", header = TRUE, row.names = 1)
meta <- read.csv("Metadata-Diversity_BATCH.csv", header = TRUE, row.names = 1)
asvmat <- data.matrix(asvdat)
taxmat <- as.matrix(taxdat) # use as.matrix NOT as.data.matrix as the data will convert the data into numbers
ASV <- otu_table(asvmat, taxa_are_rows = TRUE)