I need to do some analysis on an output table from the calculation of alpha diversity (within-sample diversity) for some ecological samples from a program.
As the original data table is too large, I am only showing a part of the data here, which corresponds to the chao1 and par data frames mentioned in the scripts below, respectively:
list(sequence.depth = c(10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
217, 217, 217, 217, 217, 217, 217, 217, 217, 217, 424, 424, 424,
424, 424, 424, 424, 424, 424, 424), iteration = c(0, 1, 2, 3,
4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4,
5, 6, 7, 8, 9), sample1 = c(13, 55, 55, 23, 13, 55, 23, 29, 55,
23, 623.5, 440, 545.7142857, 353, 515.4615385, 767.8, 602.2,
354.3333333, 349.25, 508.7692308, 782.7142857, 648.3571429, 555.0344828,
662.4166667, 651.037037, 491.0285714, 621.2857143, 600.7, 536.5714286,
683.5517241), sample2 = c(23, 23, 55, 55, 55, 16, 23, 55, 13,
55, 603, 408.5, 422.0769231, 328.4, 304.5, 379.2142857, 375,
469, 483.1111111, 384.75, 807.8333333, 793, 545.75, 542.8, 599.1153846,
555.1363636, 688, 655.5652174, 491.2758621, 673.1363636), sample3 = c(55,
55, 55, 55, 55, 55, 55, 55, 55, 55, 942.6470588, 763.1818182,
757.7142857, 817.5652174, 951.1176471, 622.3846154, 1065.75,
1436.1, 771.4545455, 1240.428571, 1091.622222, 1305.571429, 1029.6,
1349, 1282.357143, 1120.139535, 1362.153846, 1324.447368, 1151.923077,
990.3958333), sample4 = c(55, 55, 55, 23, 55, 55, 55, 23, 55,
55, 1134.4, 1245.2, 1233.4, 1112.066667, 680.2916667, 910.25,
1216.071429, 1168.5, 853.4736842, 1180.5625, 1142.130435, 1438.157895,
1074.222222, 1223.976744, 1062.163265, 1346.25, 1245.434783,
1330.073171, 1134.022222, 1395.833333), sample5 = c(55, 55, 55,
55, 55, 55, 29, 55, 55, 55, 711.55, 713.8947368, 656.3157895,
695.8947368, 679.1052632, 880.0588235, 812.5555556, 959.5714286,
551.4347826, 619.6363636, 895.5227273, 876.9772727, 884.2790698,
895.8139535, 840.7659574, 882.1363636, 922.0238095, 894.5227273,
859, 895.8139535), sample6 = c(55, 55, 55, 55, 55, 55, 55, 55,
55, 55, 508.6071429, 852.0588235, 867, 773.0526316, 786.5, 1064.2,
699.6363636, 892.1764706, 702.25, 948.8333333, 1139.4, 892.5294118,
1331.684211, 1068, 1297.676471, 1024.27907, 1096.142857, 1148.227273,
1031.046512, 935.5957447), sample7 = c(23, 13, 9, 29, 55, 8,
29, 22, 8.5, 7, 241.6, 576.5, 140.3333333, 412.5, 370, 192.5454545,
123.1111111, 263, 164.3333333, 223.5, 326.5, 442.3333333, 297.6875,
448.2727273, 345.5, 640.3333333, 439.1, 359.6, 356.0714286, 292.8333333
), sample8 = c(22, 23, 23, 29, 23, 23, 55, 23, 29, 23, 582.0909091,
333.5263158, 657, 337.8666667, 372.0714286, 470.5, 594.3, 553.0909091,
368.55, 544.8, 689.3571429, 626.24, 941.5, 649.9642857, 651.5,
632.4347826, 653.7777778, 767.7727273, 674.3913043, 713.7727273
), sample9 = c(55, 23, 23, 55, 23, 23, 55, 55, 55, 55, 368, 445.1666667,
669.0909091, 339.6521739, 335.3, 635.4615385, 382.1578947, 538.6666667,
425.3684211, 459.3125, 583.3636364, 624.1935484, 590.0909091,
600, 731.2222222, 677.2758621, 619.1818182, 592, 747.7307692,
619.5), sample10 = c(55, 23, 55, 55, 55, 23, 55, 13, 23, 55,
1184.153846, 820.5, 971.0769231, 793.6470588, 1051.4, 811.5,
540.64, 705.5555556, 880.0588235, 906.0666667, 1263.6, 956.2380952,
1036.108108, 1013.029412, 1228.794118, 1005.263158, 937.4186047,
1104.333333, 898.1162791, 1071.135135), sample11 = c(23, 23,
55, 55, 23, 55, 55, 55, 55, 55, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA))
structure(list(plate = structure(c(2L, 2L, 5L, 5L, 5L, 5L, 4L,
2L, 3L, 1L, 1L), .Label = c("Plate1", "Plate2", "Plate3", "Plate4",
"Plate5"), class = "factor"), sequence_run = structure(c(8L,
10L, 6L, 5L, 4L, 3L, 2L, 9L, 11L, 1L, 7L), .Label = c("Run1",
"Run10", "Run11", "Run12", "Run13", "Run14", "Run2", "Run4",
"Run5", "Run6", "Run7"), class = "factor"), type = structure(c(2L,
2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("flower",
"leaf"), class = "factor"), environment = structure(c(1L, 1L,
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), .Label = c("field1", "field2",
"field3"), class = "factor"), sample = structure(c(1L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 11L, 2L, 3L), .Label = c("sample1", "sample10",
"sample11", "sample2", "sample3", "sample4", "sample5", "sample6",
"sample7", "sample8", "sample9"), class = "factor")), .Names = c("plate",
"sequence_run", "type", "environment", "sample"), class = "data.frame", row.names = c(NA,
-11L))
The analysis I want to do is to partition the variation in the alpha diversity to the variables plate, sequence_run, sample_type, and environment, and I plan to use permutation-based multiple regression in R.
My original questions: I just cannot visualize in my mind how I should prepare this table for R so that I can do the analysis. So my question is, how should I organize this output table for analysis in R? And is there a way to deal with NAs in the 'aovp' function?
Please note that the iterations from 0 to 9 at the Nth sequence depth is due to the random sampling of N sequences from that sample to calculate alpha diversity for 10 times by the program. If there were not enough sequences in that sample, the program generated an 'NA'.
Thanks for your help!
update: Thanks, Gray. When I first asked this question, I didn't have a clue. I have tried the scripts shown here to organize it into a very long data frame with each row representing the alpha diversity measure for each sample at each iteration and sequence depth, along with the other four parameters (sequence_run, plate, type, and environment). But I cannot do aovp on it, as it is too big. So I doubt if this is the right way to organize the data for the analysis I proposed to do:
library(lmPerm)
library(reshape2)
library(plyr)
chao1=read.table(file.choose(),head=T,sep='\t')
par=read.table(file.choose(),head=T)
#make a new column in par so that par$all contains
#all the variable names and sample names
par$all=paste0(par$plate,'x',par$sequence_run,'x',par$type,'x',par$environment,'x',par$sample)
name1=as.vector(par$all)
colnames(chao1)[3:dim(chao1)[2]]=name1
#use melt to make a stacked table
t3=melt(chao1,'sequence.depth' )
t4=t3[31:dim(t3)[1],]
colnames(t4)=c('sequence.depth','all','div')
#use strsplit to get variables for each sample from t4$all
#and make a new data frame named alpha to store the diversity
#and variables/sample names
list=strsplit(as.character(t4$all),'x')
df=ldply(list)
colnames(df)=c('plate','run','type','env','sample')
alpha=data.frame(t4[,1],t4[,3],df)
colnames(alpha)[1:2]=c('depth','div')
#do aovp on the data frame alpha
mod1=aovp(alpha$div~alpha$env+alpha$plate+alpha$run+alpha$type)
#Error: cannot allocate vector of size 76.4 Gb
#this error was for the original table, not the part of data shown here
Updates again: The above scripts ran fine with the included example data. However, I got the memory error when using all the real data. So I believe the way I organized the alpha diversity data into this long data frame is not efficient, and I may be wrong in dealing with the iterations and NAs. Could someone help me with it? Thanks.