# ============================================================ # Tutorial on drawing an NMDS plot using ggplot2 # by Umer Zeeshan Ijaz (http://userweb.eng.gla.ac.uk/umer.ijaz) # ============================================================= abund_table<-read.csv("SPE_pitlatrine.csv",row.names=1,check.names=FALSE) #Transpose the data to have sample names on rows abund_table<-t(abund_table) meta_table<-read.csv("ENV_pitlatrine.csv",row.names=1,check.names=FALSE) #Just a check to ensure that the samples in meta_table are in the same order as in abund_table meta_table<-meta_table[rownames(abund_table),] #Get grouping information grouping_info<-data.frame(row.names=rownames(abund_table),t(as.data.frame(strsplit(rownames(abund_table),"_")))) # > head(grouping_info) # X1 X2 X3 # T_2_1 T 2 1 # T_2_10 T 2 10 # T_2_12 T 2 12 # T_2_2 T 2 2 # T_2_3 T 2 3 # T_2_6 T 2 6 #Load vegan library library(vegan) #Get MDS stats sol<-metaMDS(abund_table,distance = "bray", k = 2, trymax = 50) #Make a new data frame, and put country, latrine, and depth information there, to be useful for coloring, and shape of points NMDS=data.frame(x=sol$point[,1],y=sol$point[,2],Country=as.factor(grouping_info[,1]),Latrine=as.factor(grouping_info[,2]),Depth=as.factor(grouping_info[,3])) #Get spread of points based on countries plot.new() ord<-ordiellipse(sol, as.factor(grouping_info[,1]) ,display = "sites", kind ="sd", conf = 0.95, label = T) dev.off() #Reference: http://stackoverflow.com/questions/13794419/plotting-ordiellipse-function-from-vegan-package-onto-nmds-plot-created-in-ggplo #Data frame df_ell contains values to show ellipses. It is calculated with function veganCovEllipse which is hidden in vegan package. This function is applied to each level of NMDS (group) and it uses also function cov.wt to calculate covariance matrix. veganCovEllipse<-function (cov, center = c(0, 0), scale = 1, npoints = 100) { theta <- (0:npoints) * 2 * pi/npoints Circle <- cbind(cos(theta), sin(theta)) t(center + scale * t(Circle %*% chol(cov))) } #Generate ellipse points df_ell <- data.frame() for(g in levels(NMDS$Country)){ if(g!="" && (g %in% names(ord))){ df_ell <- rbind(df_ell, cbind(as.data.frame(with(NMDS[NMDS$Country==g,], veganCovEllipse(ord[[g]]$cov,ord[[g]]$center,ord[[g]]$scale))) ,Country=g)) } } # > head(df_ell) # NMDS1 NMDS2 Country # 1 1.497379 -0.7389216 T # 2 1.493876 -0.6800680 T # 3 1.483383 -0.6196981 T # 4 1.465941 -0.5580502 T # 5 1.441619 -0.4953674 T # 6 1.410512 -0.4318972 T #Generate mean values from NMDS plot grouped on Countries NMDS.mean=aggregate(NMDS[,1:2],list(group=NMDS$Country),mean) # > NMDS.mean # group x y # 1 T -0.2774564 -0.2958445 # 2 V 0.1547353 0.1649902 #Now do the actual plotting library(ggplot2) shape_values<-seq(1,11) p<-ggplot(data=NMDS,aes(x,y,colour=Country)) p<-p+ annotate("text",x=NMDS.mean$x,y=NMDS.mean$y,label=NMDS.mean$group,size=4) p<-p+ geom_path(data=df_ell, aes(x=NMDS1, y=NMDS2), size=1, linetype=2) p<-p+geom_point(aes(shape=Depth))+scale_shape_manual(values=shape_values)+theme_bw() pdf("NMDS.pdf") print(p) dev.off()
# ============================================================ # Tutorial on plotting significant taxa and environmental variables on an NMDS plot using ggplot2 # by Umer Zeeshan Ijaz (http://userweb.eng.gla.ac.uk/umer.ijaz) # ============================================================= library(vegan) library(ggplot2) library(grid) # This R script is an extension of vegan library's bioenv() # function and uses the bio.env() and bio.step() of # http://menugget.blogspot.co.uk/2011/06/clarke-and-ainsworths-bioenv-and-bvstep.html # The original author suggested these functions to overcome # the inflexibility of the bioenv() function which uses # a similarity matrix based on normalized "euclidean" distance. # The new functions are given below and implement the following algorithms: # Clarke, K. R & Ainsworth, M. 1993. A method of linking multivariate community structure to environmental variables. Marine Ecology Progress Series, 92, 205-219. # Clarke, K. R., Gorley, R. N., 2001. PRIMER v5: User Manual/Tutorial. PRIMER-E, Plymouth, UK. # Clarke, K. R., Warwick, R. M., 2001. Changes in Marine Communities: An Approach to Statistical Analysis and Interpretation, 2nd edition. PRIMER-E Ltd, Plymouth, UK. # Clarke, K. R., Warwick, R. M., 1998. Quantifying structural redundancy in ecological communities. Oecologia, 113:278-289. bv.step <- function(fix.mat, var.mat, fix.dist.method="bray", var.dist.method="euclidean", correlation.method="spearman", scale.fix=FALSE, scale.var=TRUE, max.rho=0.95, min.delta.rho=0.001, random.selection=TRUE, prop.selected.var=0.2, num.restarts=10, var.always.include=NULL, var.exclude=NULL, output.best=10 ){ if(dim(fix.mat)[1] != dim(var.mat)[1]){stop("fixed and variable matrices must have the same number of rows")} if(sum(var.always.include %in% var.exclude) > 0){stop("var.always.include and var.exclude share a variable")} require(vegan) if(scale.fix){fix.mat<-scale(fix.mat)}else{fix.mat<-fix.mat} if(scale.var){var.mat<-scale(var.mat)}else{var.mat<-var.mat} fix.dist <- vegdist(as.matrix(fix.mat), method=fix.dist.method) #an initial removal phase var.dist.full <- vegdist(as.matrix(var.mat), method=var.dist.method) full.cor <- suppressWarnings(cor.test(fix.dist, var.dist.full, method=correlation.method))$estimate var.comb <- combn(1:ncol(var.mat), ncol(var.mat)-1) RES <- data.frame(var.excl=rep(NA,ncol(var.comb)), n.var=ncol(var.mat)-1, rho=NA) for(i in 1:dim(var.comb)[2]){ var.dist <- vegdist(as.matrix(var.mat[,var.comb[,i]]), method=var.dist.method) temp <- suppressWarnings(cor.test(fix.dist, var.dist, method=correlation.method)) RES$var.excl[i] <- c(1:ncol(var.mat))[-var.comb[,i]] RES$rho[i] <- temp$estimate } delta.rho <- RES$rho - full.cor exclude <- sort(unique(c(RES$var.excl[which(abs(delta.rho) < min.delta.rho)], var.exclude))) if(random.selection){ num.restarts=num.restarts prop.selected.var=prop.selected.var prob<-rep(1,ncol(var.mat)) if(prop.selected.var< 1){ prob[exclude]<-0 } n.selected.var <- min(sum(prob),prop.selected.var*dim(var.mat)[2]) } else { num.restarts=1 prop.selected.var=1 prob<-rep(1,ncol(var.mat)) n.selected.var <- min(sum(prob),prop.selected.var*dim(var.mat)[2]) } RES_TOT <- c() for(i in 1:num.restarts){ step=1 RES <- data.frame(step=step, step.dir="F", var.incl=NA, n.var=0, rho=0) attr(RES$step.dir, "levels") <- c("F","B") best.comb <- which.max(RES$rho) best.rho <- RES$rho[best.comb] delta.rho <- Inf selected.var <- sort(unique(c(sample(1:dim(var.mat)[2], n.selected.var, prob=prob), var.always.include))) while(best.rho < max.rho & delta.rho > 
min.delta.rho & RES$n.var[best.comb] < length(selected.var)){ #forward step step.dir="F" step=step+1 var.comb <- combn(selected.var, RES$n.var[best.comb]+1, simplify=FALSE) if(RES$n.var[best.comb] == 0){ var.comb.incl<-1:length(var.comb) } else { var.keep <- as.numeric(unlist(strsplit(RES$var.incl[best.comb], ","))) temp <- NA*1:length(var.comb) for(j in 1:length(temp)){ temp[j] <- all(var.keep %in% var.comb[[j]]) } var.comb.incl <- which(temp==1) } RES.f <- data.frame(step=rep(step, length(var.comb.incl)), step.dir=step.dir, var.incl=NA, n.var=RES$n.var[best.comb]+1, rho=NA) for(f in 1:length(var.comb.incl)){ var.incl <- var.comb[[var.comb.incl[f]]] var.incl <- var.incl[order(var.incl)] var.dist <- vegdist(as.matrix(var.mat[,var.incl]), method=var.dist.method) temp <- suppressWarnings(cor.test(fix.dist, var.dist, method=correlation.method)) RES.f$var.incl[f] <- paste(var.incl, collapse=",") RES.f$rho[f] <- temp$estimate } last.F <- max(which(RES$step.dir=="F")) RES <- rbind(RES, RES.f[which.max(RES.f$rho),]) best.comb <- which.max(RES$rho) delta.rho <- RES$rho[best.comb] - best.rho best.rho <- RES$rho[best.comb] if(best.comb == step){ while(best.comb == step & RES$n.var[best.comb] > 1){ #backward step step.dir="B" step <- step+1 var.keep <- as.numeric(unlist(strsplit(RES$var.incl[best.comb], ","))) var.comb <- combn(var.keep, RES$n.var[best.comb]-1, simplify=FALSE) RES.b <- data.frame(step=rep(step, length(var.comb)), step.dir=step.dir, var.incl=NA, n.var=RES$n.var[best.comb]-1, rho=NA) for(b in 1:length(var.comb)){ var.incl <- var.comb[[b]] var.incl <- var.incl[order(var.incl)] var.dist <- vegdist(as.matrix(var.mat[,var.incl]), method=var.dist.method) temp <- suppressWarnings(cor.test(fix.dist, var.dist, method=correlation.method)) RES.b$var.incl[b] <- paste(var.incl, collapse=",") RES.b$rho[b] <- temp$estimate } RES <- rbind(RES, RES.b[which.max(RES.b$rho),]) best.comb <- which.max(RES$rho) best.rho<- RES$rho[best.comb] } } else { break() } } RES_TOT <- rbind(RES_TOT, RES[2:dim(RES)[1],]) print(paste(round((i/num.restarts)*100,3), "% finished")) } RES_TOT <- unique(RES_TOT[,3:5]) if(dim(RES_TOT)[1] > output.best){ order.by.best <- RES_TOT[order(RES_TOT$rho, decreasing=TRUE)[1:output.best],] } else { order.by.best <- RES_TOT[order(RES_TOT$rho, decreasing=TRUE), ] } rownames(order.by.best)<-NULL order.by.i.comb <- c() for(i in 1:length(selected.var)){ f1 <- which(RES_TOT$n.var==i) f2 <- which.max(RES_TOT$rho[f1]) order.by.i.comb <- rbind(order.by.i.comb, RES_TOT[f1[f2],]) } rownames(order.by.i.comb)<-NULL if(length(exclude)<1){var.exclude=NULL} else {var.exclude=exclude} out <- list( order.by.best=order.by.best, order.by.i.comb=order.by.i.comb, best.model.vars=paste(colnames(var.mat)[as.numeric(unlist(strsplit(order.by.best$var.incl[1], ",")))], collapse=","), best.model.rho=order.by.best$rho[1], var.always.include=var.always.include, var.exclude=var.exclude ) out } bio.env <- function(fix.mat, var.mat, fix.dist.method="bray", var.dist.method="euclidean", correlation.method="spearman", scale.fix=FALSE, scale.var=TRUE, output.best=10, var.max=ncol(var.mat) ){ if(dim(fix.mat)[1] != dim(var.mat)[1]){stop("fixed and variable matrices must have the same number of rows")} if(var.max > dim(var.mat)[2]){stop("var.max cannot be larger than the number of variables (columns) in var.mat")} require(vegan) combn.sum <- sum(factorial(ncol(var.mat))/(factorial(1:var.max)*factorial(ncol(var.mat)-1:var.max))) if(scale.fix){fix.mat<-scale(fix.mat)}else{fix.mat<-fix.mat} 
if(scale.var){var.mat<-scale(var.mat)}else{var.mat<-var.mat} fix.dist <- vegdist(fix.mat, method=fix.dist.method) RES_TOT <- c() best.i.comb <- c() iter <- 0 for(i in 1:var.max){ var.comb <- combn(1:ncol(var.mat), i, simplify=FALSE) RES <- data.frame(var.incl=rep(NA, length(var.comb)), n.var=i, rho=0) for(f in 1:length(var.comb)){ iter <- iter+1 var.dist <- vegdist(as.matrix(var.mat[,var.comb[[f]]]), method=var.dist.method) temp <- suppressWarnings(cor.test(fix.dist, var.dist, method=correlation.method)) RES$var.incl[f] <- paste(var.comb[[f]], collapse=",") RES$rho[f] <- temp$estimate if(iter %% 100 == 0){print(paste(round(iter/combn.sum*100, 3), "% finished"))} } order.rho <- order(RES$rho, decreasing=TRUE) best.i.comb <- c(best.i.comb, RES$var.incl[order.rho[1]]) if(length(order.rho) > output.best){ RES_TOT <- rbind(RES_TOT, RES[order.rho[1:output.best],]) } else { RES_TOT <- rbind(RES_TOT, RES) } } rownames(RES_TOT)<-NULL if(dim(RES_TOT)[1] > output.best){ order.by.best <- order(RES_TOT$rho, decreasing=TRUE)[1:output.best] } else { order.by.best <- order(RES_TOT$rho, decreasing=TRUE) } OBB <- RES_TOT[order.by.best,] rownames(OBB) <- NULL order.by.i.comb <- match(best.i.comb, RES_TOT$var.incl) OBC <- RES_TOT[order.by.i.comb,] rownames(OBC) <- NULL out <- list( order.by.best=OBB, order.by.i.comb=OBC, best.model.vars=paste(colnames(var.mat)[as.numeric(unlist(strsplit(OBB$var.incl[1], ",")))], collapse=",") , best.model.rho=OBB$rho[1] ) out } abund_table<-read.csv("SPE_pitlatrine.csv",row.names=1,check.names=FALSE) #Transpose the data to have sample names on rows abund_table<-t(abund_table) meta_table<-read.csv("ENV_pitlatrine.csv",row.names=1,check.names=FALSE) #Just a check to ensure that the samples in meta_table are in the same order as in abund_table meta_table<-meta_table[rownames(abund_table),] #Get grouping information grouping_info<-data.frame(row.names=rownames(abund_table),t(as.data.frame(strsplit(rownames(abund_table),"_")))) # > head(grouping_info) # X1 X2 X3 # T_2_1 T 2 1 # T_2_10 T 2 10 # T_2_12 T 2 12 # T_2_2 T 2 2 # T_2_3 T 2 3 # T_2_6 T 2 6 #Parameters cmethod<-"pearson" #Correlation method to use: pearson, pearman, kendall fmethod<-"bray" #Fixed distance method: euclidean, manhattan, gower, altGower, canberra, bray, kulczynski, morisita,horn, binomial, and cao vmethod<-"bray" #Variable distance method: euclidean, manhattan, gower, altGower, canberra, bray, kulczynski, morisita,horn, binomial, and cao nmethod<-"bray" #NMDS distance method: euclidean, manhattan, gower, altGower, canberra, bray, kulczynski, morisita,horn, binomial, and cao res <- bio.env(wisconsin(abund_table), meta_table,fix.dist.method=fmethod, var.dist.method=vmethod, correlation.method=cmethod, scale.fix=FALSE, scale.var=TRUE) #Get the 10 best subset of environmental variables envNames<-colnames(meta_table) bestEnvFit<-"" for(i in (1:length(res$order.by.best$var.incl))) { bestEnvFit[i]<-paste(paste(envNames[as.numeric(unlist(strsplit(res$order.by.best$var.incl[i], split=",")))],collapse=' + '), " = ",res$order.by.best$rho[i],sep="") } bestEnvFit<-data.frame(bestEnvFit) colnames(bestEnvFit)<-"Best combination of environmental variables with similarity score" #> bestEnvFit #Best combination of environmental variables with similarity score #1 Carbo = 0.112933470129642 #2 Temp + VS + Carbo = 0.0850146558049758 #3 Temp + Carbo = 0.0836568099887996 #4 Temp = 0.0772024618792259 #5 perCODsbyt = 0.076360226020659 #6 TS + CODs + perCODsbyt + Prot + Carbo = 0.0752573390422113 #7 Temp + CODt + CODs + perCODsbyt + 
NH4 = 0.0749607646516179 #8 Temp + TS + VFA + perCODsbyt + NH4 + Prot + Carbo = 0.0718412745558854 #9 pH + Temp + TS + Carbo = 0.0693960850973541 #10 Temp + NH4 + Carbo = 0.0687961955200212 res.bv.step.biobio <- bv.step(wisconsin(abund_table), wisconsin(abund_table), fix.dist.method=fmethod, var.dist.method=vmethod,correlation.method=cmethod, scale.fix=FALSE, scale.var=FALSE, max.rho=0.95, min.delta.rho=0.001, random.selection=TRUE, prop.selected.var=0.3, num.restarts=10, output.best=10, var.always.include=NULL) #Get the 10 best subset of taxa taxaNames<-colnames(abund_table) bestTaxaFit<-"" for(i in (1:length(res.bv.step.biobio$order.by.best$var.incl))) { bestTaxaFit[i]<-paste(paste(taxaNames[as.numeric(unlist(strsplit(res.bv.step.biobio$order.by.best$var.incl[i], split=",")))],collapse=' + '), " = ",res.bv.step.biobio$order.by.best$rho[i],sep="") } bestTaxaFit<-data.frame(bestTaxaFit) colnames(bestTaxaFit)<-"Best combination of taxa with similarity score" #> bestTaxaFit #Best combination of taxa with similarity score #1 Bacilli + Bacteroidia + Chrysiogenetes + Clostridia + Dehalococcoidetes + Fibrobacteria + Flavobacteria + Fusobacteria + Gammaproteobacteria + Methanomicrobia + Mollicutes + Opitutae + Synergistia + Thermomicrobia + Unknown = 0.900998476657462 #2 Bacilli + Bacteroidia + Clostridia + Dehalococcoidetes + Fibrobacteria + Flavobacteria + Fusobacteria + Gammaproteobacteria + Methanomicrobia + Mollicutes + Opitutae + Synergistia + Thermomicrobia + Unknown = 0.899912718316228 #3 Bacilli + Bacteroidia + Clostridia + Dehalococcoidetes + Fibrobacteria + Flavobacteria + Fusobacteria + Gammaproteobacteria + Methanomicrobia + Mollicutes + Opitutae + Synergistia + Thermomicrobia = 0.896821772937576 #4 Clostridia + Dehalococcoidetes + Deltaproteobacteria + Epsilonproteobacteria + Erysipelotrichi + Flavobacteria + Fusobacteria + Methanobacteria + Mollicutes + Opitutae + Planctomycetacia + Sphingobacteria + Subdivision3 + Synergistia + Thermomicrobia = 0.892670058822226 #5 Bacilli + Bacteroidia + Clostridia + Dehalococcoidetes + Fibrobacteria + Flavobacteria + Fusobacteria + Gammaproteobacteria + Methanomicrobia + Opitutae + Synergistia + Thermomicrobia = 0.892533063335985 #6 Clostridia + Dehalococcoidetes + Deltaproteobacteria + Epsilonproteobacteria + Erysipelotrichi + Flavobacteria + Fusobacteria + Methanobacteria + Mollicutes + Opitutae + Planctomycetacia + Sphingobacteria + Subdivision3 + Synergistia = 0.891217789278463 #7 Clostridia + Dehalococcoidetes + Deltaproteobacteria + Epsilonproteobacteria + Erysipelotrichi + Flavobacteria + Fusobacteria + Mollicutes + Opitutae + Planctomycetacia + Sphingobacteria + Subdivision3 + Synergistia = 0.888669881927483 #8 Bacilli + Bacteroidia + Clostridia + Dehalococcoidetes + Fibrobacteria + Flavobacteria + Fusobacteria + Gammaproteobacteria + Opitutae + Synergistia + Thermomicrobia = 0.887052815492516 #9 Clostridia + Dehalococcoidetes + Deltaproteobacteria + Epsilonproteobacteria + Erysipelotrichi + Flavobacteria + Fusobacteria + Opitutae + Planctomycetacia + Sphingobacteria + Subdivision3 + Synergistia = 0.885880090785632 #10 Acidobacteria_Gp4 + Actinobacteria + Bacilli + Betaproteobacteria + Clostridia + Dehalococcoidetes + Erysipelotrichi + Fibrobacteria + Lentisphaeria + Methanomicrobia + Opitutae + Sphingobacteria + Thermomicrobia + Unknown = 0.882956559638505 #Generate NMDS plot MDS_res=metaMDS(abund_table, distance = nmethod, k = 2, trymax = 50) bio.keep <- as.numeric(unlist(strsplit(res.bv.step.biobio$order.by.best$var.incl[1], ","))) 
bio.fit <- envfit(MDS_res, abund_table[,bio.keep,drop=F], perm = 999) #> bio.fit # #***VECTORS # #NMDS1 NMDS2 r2 Pr(>r) #Bacilli 0.94632 0.32322 0.0423 0.167 #Bacteroidia -0.24478 -0.96958 0.0383 0.218 #Chrysiogenetes -0.22674 0.97395 0.0078 0.660 #Clostridia -0.98067 -0.19567 0.1092 0.017 * #Dehalococcoidetes -0.79160 0.61103 0.1421 0.009 ** #Fibrobacteria -0.90885 -0.41712 0.1320 0.010 ** #Flavobacteria 0.56629 0.82421 0.1746 0.003 ** #Fusobacteria 0.26601 -0.96397 0.0284 0.333 #Gammaproteobacteria 0.67008 0.74229 0.2024 0.001 *** #Methanomicrobia -0.99912 -0.04188 0.0602 0.120 #Mollicutes 0.99885 -0.04798 0.0052 0.769 #Opitutae 0.55774 0.83002 0.2033 0.002 ** #Synergistia -0.99732 0.07322 0.2079 0.002 ** #Thermomicrobia 0.38169 0.92429 0.2514 0.001 *** #Unknown -0.99232 -0.12373 0.1570 0.009 ** #--- #Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 #Permutation: free #Number of permutations: 999 #use the best set of environmental variables in env.keep eval(parse(text=paste("env.keep <- c(",res$order.by.best$var.incl[1],")",sep=""))) env.fit <- envfit(MDS_res, meta_table[,env.keep,drop=F], perm = 999) #Get site information df<-scores(MDS_res,display=c("sites")) #Add grouping information df<-data.frame(df,Type=grouping_info[rownames(df),1]) #Get the vectors for bioenv.fit df_biofit<-scores(bio.fit,display=c("vectors")) df_biofit<-df_biofit*vegan:::ordiArrowMul(df_biofit) df_biofit<-as.data.frame(df_biofit) #Get the vectors for env.fit df_envfit<-scores(env.fit,display=c("vectors")) df_envfit<-df_envfit*vegan:::ordiArrowMul(df_envfit) df_envfit<-as.data.frame(df_envfit) #Draw samples p<-ggplot() p<-p+geom_point(data=df,aes(NMDS1,NMDS2,colour=Type)) #Draw taxas p<-p+geom_segment(data=df_biofit, aes(x = 0, y = 0, xend = NMDS1, yend = NMDS2), arrow = arrow(length = unit(0.2, "cm")),color="#808080",alpha=0.5) p<-p+geom_text(data=as.data.frame(df_biofit*1.1),aes(NMDS1, NMDS2, label = rownames(df_biofit)),color="#808080",alpha=0.5) #Draw environmental variables p<-p+geom_segment(data=df_envfit, aes(x = 0, y = 0, xend = NMDS1, yend = NMDS2), arrow = arrow(length = unit(0.2, "cm")),color="#4C005C",alpha=0.5) p<-p+geom_text(data=as.data.frame(df_envfit*1.1),aes(NMDS1, NMDS2, label = rownames(df_envfit)),color="#4C005C",alpha=0.5) p<-p+theme_bw() pdf("NMDS_bioenv.pdf") print(p) dev.off()
# ============================================================ # Tutorial on drawing a CCA plot with significant environmental variables using ggplot2 # by Umer Zeeshan Ijaz (http://userweb.eng.gla.ac.uk/umer.ijaz) # ============================================================= library(vegan) library(grid) abund_table<-read.csv("SPE_pitlatrine.csv",row.names=1,check.names=FALSE) #Transpose the data to have sample names on rows abund_table<-t(abund_table) meta_table<-read.csv("ENV_pitlatrine.csv",row.names=1,check.names=FALSE) #Just a check to ensure that the samples in meta_table are in the same order as in abund_table meta_table<-meta_table[rownames(abund_table),] #Filter out any samples taxas that have zero entries abund_table<-subset(abund_table,rowSums(abund_table)!=0) #Convert to relative frequencies abund_table<-abund_table/rowSums(abund_table) #Use adonis to find significant environmental variables abund_table.adonis <- adonis(abund_table ~ ., data=meta_table) #> abund_table.adonis #Call: # adonis(formula = abund_table ~ ., data = meta_table) #Permutation: free #Number of permutations: 999 # #Terms added sequentially (first to last) # #Df SumsOfSqs MeanSqs F.Model R2 Pr(>F) #pH 1 0.2600 0.26002 2.1347 0.01974 0.067 . #Temp 1 1.6542 1.65417 13.5806 0.12559 0.001 *** #TS 1 0.8298 0.82983 6.8128 0.06300 0.001 *** #VS 1 0.3143 0.31434 2.5807 0.02387 0.038 * #VFA 1 0.2280 0.22796 1.8715 0.01731 0.109 #CODt 1 0.1740 0.17400 1.4285 0.01321 0.209 #CODs 1 0.0350 0.03504 0.2877 0.00266 0.909 #perCODsbyt 1 0.1999 0.19986 1.6409 0.01517 0.148 #NH4 1 0.1615 0.16154 1.3262 0.01226 0.236 #Prot 1 0.3657 0.36570 3.0024 0.02777 0.024 * #Carbo 1 0.5444 0.54439 4.4693 0.04133 0.003 ** #Residuals 69 8.4045 0.12180 0.63809 #Total 80 13.1713 1.00000 #--- # Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 #Extract the best variables bestEnvVariables<-rownames(abund_table.adonis$aov.tab)[abund_table.adonis$aov.tab$"Pr(>F)"<=0.01] #Last two are NA entries, so we have to remove them bestEnvVariables<-bestEnvVariables[!is.na(bestEnvVariables)] #We are now going to use only those environmental variables in cca that were found significant eval(parse(text=paste("sol <- cca(abund_table ~ ",do.call(paste,c(as.list(bestEnvVariables),sep=" + ")),",data=meta_table)",sep=""))) #You can use the following to use all the environmental variables #sol<-cca(abund_table ~ ., data=meta_table) scrs<-scores(sol,display=c("sp","wa","lc","bp","cn")) #Check the attributes # > attributes(scrs) # $names # [1] "species" "sites" "constraints" "biplot" # [5] "centroids" #Extract site data first df_sites<-data.frame(scrs$sites,t(as.data.frame(strsplit(rownames(scrs$sites),"_")))) colnames(df_sites)<-c("x","y","Country","Latrine","Depth") #Draw sites p<-ggplot() p<-p+geom_point(data=df_sites,aes(x,y,colour=Country)) #Draw biplots multiplier <- vegan:::ordiArrowMul(scrs$biplot) # Reference: http://www.inside-r.org/packages/cran/vegan/docs/envfit # The printed output of continuous variables (vectors) gives the direction cosines # which are the coordinates of the heads of unit length vectors. In plot these are # scaled by their correlation (square root of the column r2) so that "weak" predictors # have shorter arrows than "strong" predictors. You can see the scaled relative lengths # using command scores. The plotted (and scaled) arrows are further adjusted to the # current graph using a constant multiplier: this will keep the relative r2-scaled # lengths of the arrows but tries to fill the current plot. 
You can see the multiplier # using vegan:::ordiArrowMul(result_of_envfit), and set it with the argument arrow.mul. df_arrows<- scrs$biplot*multiplier colnames(df_arrows)<-c("x","y") df_arrows=as.data.frame(df_arrows) p<-p+geom_segment(data=df_arrows, aes(x = 0, y = 0, xend = x, yend = y), arrow = arrow(length = unit(0.2, "cm")),color="#808080",alpha=0.5) p<-p+geom_text(data=as.data.frame(df_arrows*1.1),aes(x, y, label = rownames(df_arrows)),color="#808080",alpha=0.5) # Draw species df_species<- as.data.frame(scrs$species) colnames(df_species)<-c("x","y") # Either choose text or points #p<-p+geom_text(data=df_species,aes(x,y,label=rownames(df_species))) #p<-p+geom_point(data=df_species,aes(x,y,shape="Species"))+scale_shape_manual("",values=2) p<-p+theme_bw() pdf("CCA.pdf") print(p) dev.off()
# ============================================================ # Tutorial on drawing a richness plot using ggplot2 # by Umer Zeeshan Ijaz (http://userweb.eng.gla.ac.uk/umer.ijaz) # ============================================================= abund_table<-read.csv("SPE_pitlatrine.csv",row.names=1,check.names=FALSE) #Transpose the data to have sample names on rows abund_table<-t(abund_table) meta_table<-read.csv("ENV_pitlatrine.csv",row.names=1,check.names=FALSE) #Filter out samples with fewer counts abund_table<-abund_table[rowSums(abund_table)>200,] #Extract the corresponding meta_table for the samples in abund_table meta_table<-meta_table[rownames(abund_table),] #Load vegan library library(vegan) # Calculate species richness N <- rowSums(abund_table) S <- specnumber(abund_table) S.rar <-rarefy(abund_table, min(N)) # Regression of S.rar against meta_table S.lm <- lm(S.rar ~ ., data = meta_table) summary(S.lm) #Format the data for ggplot df_S<-NULL for (i in 1:dim(meta_table)[2]){ tmp<-data.frame(row.names=NULL,Sample=names(S.rar),Richness=S.rar,Env=meta_table[,i],Label=rep(colnames(meta_table)[i],length(meta_table[,i]))) if (is.null(df_S)){ df_S=tmp }else{ df_S<-rbind(df_S,tmp) } } #Get grouping information grouping_info<-data.frame(row.names=rownames(df_S),t(as.data.frame(strsplit(as.character(df_S[,"Sample"]),"_")))) colnames(grouping_info)<-c("Countries","Latrine","Depth") #Merge this information with df_S df_S<-cbind(df_S,grouping_info) # > head(df_S) # Sample Richness Env Label Countries Latrine Depth # 1 T_2_1 8.424693 7.82 pH T 2 1 # 2 T_2_12 4.389534 8.84 pH T 2 12 # 3 T_2_2 10.141087 6.49 pH T 2 2 # 4 T_2_3 10.156289 6.46 pH T 2 3 # 5 T_2_6 10.120478 7.69 pH T 2 6 # 6 T_2_9 11.217727 7.60 pH T 2 9 #We now make sure there are no factors in df_S df_S$Label<-as.character(df_S$Label) df_S$Countries<-as.character(df_S$Countries) df_S$Latrine<-as.character(df_S$Latrine) df_S$Depth<-as.character(df_S$Depth) #We need a Pvalue formatter formatPvalues <- function(pvalue) { ra<-"" if(pvalue <= 0.1) ra<-"." 
if(pvalue <= 0.05) ra<-"*" if(pvalue <= 0.01) ra<-"**" if(pvalue <= 0.001) ra<-"***" return(ra) } #Now use ddply to get correlations library(plyr) cors<-ddply(df_S,.(Label), summarise, cor=round(cor(Richness,Env),2), sig=formatPvalues(cor.test(Richness,Env)$p.value)) # > head(cors) # Label cor sig # 1 Carbo -0.59 *** # 2 CODs -0.46 *** # 3 CODt -0.08 # 4 NH4 -0.43 *** # 5 perCODsbyt -0.60 *** # 6 pH 0.05 #We have to do a trick to assign labels as facet_wrap doesn't have an explicit labeller #Reference 1: http://stackoverflow.com/questions/19282897/how-to-add-expressions-to-labels-in-facet-wrap #Reference 2: http://stackoverflow.com/questions/11979017/changing-facet-label-to-math-formula-in-ggplot2 facet_wrap_labeller <- function(gg.plot,labels=NULL) { #works with R 3.0.1 and ggplot2 0.9.3.1 require(gridExtra) g <- ggplotGrob(gg.plot) gg <- g$grobs strips <- grep("strip_t", names(gg)) for(ii in seq_along(labels)) { modgrob <- getGrob(gg[[strips[ii]]], "strip.text", grep=TRUE, global=TRUE) gg[[strips[ii]]]$children[[modgrob$name]] <- editGrob(modgrob,label=labels[ii]) } g$grobs <- gg class(g) = c("arrange", "ggplot",class(g)) g } #Now convert your labels wrap_labels<-do.call(paste,c(cors[c("Label")],"(",cors[c("cor")]," ",cors[c("sig")],")",sep="")) p<-ggplot(df_S,aes(Env,Richness)) + geom_point(aes(colour=Countries)) + geom_smooth(,method="lm", size=1, se=T) + facet_wrap( ~ Label , scales="free", ncol=3) +theme_bw() p<-facet_wrap_labeller(p,labels=wrap_labels) pdf("Richness.pdf",width=14,height=14) print(p) dev.off()
# ============================================================ # Tutorial on drawing a heatmap using ggplot2 # by Umer Zeeshan Ijaz (http://userweb.eng.gla.ac.uk/umer.ijaz) # ============================================================= abund_table<-read.csv("SPE_pitlatrine.csv",row.names=1,check.names=FALSE) # Transpose the data to have sample names on rows abund_table<-t(abund_table) # Convert to relative frequencies abund_table <- abund_table/rowSums(abund_table) library(reshape2) df<-melt(abund_table) colnames(df)<-c("Samples","Species","Value") library(plyr) library(scales) # We are going to apply transformation to our data to make it # easier on eyes #df<-ddply(df,.(Samples),transform,rescale=scale(Value)) df<-ddply(df,.(Samples),transform,rescale=sqrt(Value)) # Plot heatmap p <- ggplot(df, aes(Species, Samples)) + geom_tile(aes(fill = rescale),colour = "white") + scale_fill_gradient(low = "white",high = "steelblue")+ scale_x_discrete(expand = c(0, 0)) + scale_y_discrete(expand = c(0, 0)) + theme(legend.position = "none",axis.ticks = element_blank(),axis.text.x = element_text(angle = 90, hjust = 1,size=5),axis.text.y = element_text(size=5)) pdf("Heatmap.pdf") print(p) dev.off()
# ============================================================ # Tutorial on drawing a taxa plot using ggplot2 # by Umer Zeeshan Ijaz (http://userweb.eng.gla.ac.uk/umer.ijaz) # ============================================================= abund_table<-read.csv("SPE_pitlatrine.csv",row.names=1,check.names=FALSE) #Transpose the data to have sample names on rows abund_table<-t(abund_table) meta_table<-read.csv("ENV_pitlatrine.csv",row.names=1,check.names=FALSE) #Just a check to ensure that the samples in meta_table are in the same order as in abund_table meta_table<-meta_table[rownames(abund_table),] #Get grouping information grouping_info<-data.frame(row.names=rownames(abund_table),t(as.data.frame(strsplit(rownames(abund_table),"_")))) # > head(grouping_info) # X1 X2 X3 # T_2_1 T 2 1 # T_2_10 T 2 10 # T_2_12 T 2 12 # T_2_2 T 2 2 # T_2_3 T 2 3 # T_2_6 T 2 6 #Apply proportion normalisation x<-abund_table/rowSums(abund_table) x<-x[,order(colSums(x),decreasing=TRUE)] #Extract list of top N Taxa N<-21 taxa_list<-colnames(x)[1:N] #remove "__Unknown__" and add it to others taxa_list<-taxa_list[!grepl("Unknown",taxa_list)] N<-length(taxa_list) #Generate a new table with everything added to Others new_x<-data.frame(x[,colnames(x) %in% taxa_list],Others=rowSums(x[,!colnames(x) %in% taxa_list])) #You can change the Type=grouping_info[,1] should you desire any other grouping of panels df<-NULL for (i in 1:dim(new_x)[2]){ tmp<-data.frame(row.names=NULL,Sample=rownames(new_x),Taxa=rep(colnames(new_x)[i],dim(new_x)[1]),Value=new_x[,i],Type=grouping_info[,1]) if(i==1){df<-tmp} else {df<-rbind(df,tmp)} } colours <- c("#F0A3FF", "#0075DC", "#993F00","#4C005C","#2BCE48","#FFCC99","#808080","#94FFB5","#8F7C00","#9DCC00","#C20088","#003380","#FFA405","#FFA8BB","#426600","#FF0010","#5EF1F2","#00998F","#740AFF","#990000","#FFFF00"); library(ggplot2) p<-ggplot(df,aes(Sample,Value,fill=Taxa))+geom_bar(stat="identity")+facet_grid(. ~ Type, drop=TRUE,scale="free",space="free_x") p<-p+scale_fill_manual(values=colours[1:(N+1)]) p<-p+theme_bw()+ylab("Proportions") p<-p+ scale_y_continuous(expand = c(0,0))+theme(strip.background = element_rect(fill="gray85"))+theme(panel.margin = unit(0.3, "lines")) p<-p+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) pdf("TAXAplot.pdf",height=6,width=21) print(p) dev.off()
# ============================================================ # Tutorial on drawing a correlation map using ggplot2 # by Umer Zeeshan Ijaz (http://userweb.eng.gla.ac.uk/umer.ijaz) # ============================================================= abund_table<-read.csv("SPE_pitlatrine.csv",row.names=1,check.names=FALSE) #Transpose the data to have sample names on rows abund_table<-t(abund_table) meta_table<-read.csv("ENV_pitlatrine.csv",row.names=1,check.names=FALSE) #Filter out samples with fewer counts abund_table<-abund_table[rowSums(abund_table)>200,] #Extract the corresponding meta_table for the samples in abund_table meta_table<-meta_table[rownames(abund_table),] #You can use sel_env to specify the variables you want to use and sel_env_label to specify the labes for the pannel sel_env<-c("pH","Temp","TS","VS","VFA","CODt","CODs","perCODsbyt","NH4","Prot","Carbo") sel_env_label <- list( 'pH'="PH", 'Temp'="Temperature", 'TS'="TS", 'VS'="VS", 'VFA'="VFA", 'CODt'="CODt", 'CODs'="CODs", 'perCODsbyt'="%CODs/t", 'NH4'="NH4", 'Prot'="Protein", 'Carbo'="Carbon" ) sel_env_label<-t(as.data.frame(sel_env_label)) sel_env_label<-as.data.frame(sel_env_label) colnames(sel_env_label)<-c("Trans") sel_env_label$Trans<-as.character(sel_env_label$Trans) #Now get a filtered table based on sel_env meta_table_filtered<-meta_table[,sel_env] abund_table_filtered<-abund_table[rownames(meta_table_filtered),] #Apply normalisation (either use relative or log-relative transformation) #x<-abund_table_filtered/rowSums(abund_table_filtered) x<-log((abund_table_filtered+1)/(rowSums(abund_table_filtered)+dim(abund_table_filtered)[2])) x<-x[,order(colSums(x),decreasing=TRUE)] #Extract list of top N Taxa N<-51 taxa_list<-colnames(x)[1:N] #remove "__Unknown__" and add it to others taxa_list<-taxa_list[!grepl("Unknown",taxa_list)] N<-length(taxa_list) x<-data.frame(x[,colnames(x) %in% taxa_list]) y<-meta_table_filtered #Get grouping information grouping_info<-data.frame(row.names=rownames(abund_table),t(as.data.frame(strsplit(rownames(abund_table),"_")))) # > head(grouping_info) # X1 X2 X3 # T_2_1 T 2 1 # T_2_10 T 2 10 # T_2_12 T 2 12 # T_2_2 T 2 2 # T_2_3 T 2 3 # T_2_6 T 2 6 #Let us group on countries groups<-grouping_info[,1] #You can use kendall, spearman, or pearson below: method<-"kendall" #Now calculate the correlation between individual Taxa and the environmental data df<-NULL for(i in colnames(x)){ for(j in colnames(y)){ for(k in unique(groups)){ a<-x[groups==k,i,drop=F] b<-y[groups==k,j,drop=F] tmp<-c(i,j,cor(a[complete.cases(b),],b[complete.cases(b),],use="everything",method=method),cor.test(a[complete.cases(b),],b[complete.cases(b),],method=method)$p.value,k) if(is.null(df)){ df<-tmp } else{ df<-rbind(df,tmp) } } } } df<-data.frame(row.names=NULL,df) colnames(df)<-c("Taxa","Env","Correlation","Pvalue","Type") df$Pvalue<-as.numeric(as.character(df$Pvalue)) df$AdjPvalue<-rep(0,dim(df)[1]) df$Correlation<-as.numeric(as.character(df$Correlation)) #You can adjust the p-values for multiple comparison using Benjamini & Hochberg (1995): # 1 -> donot adjust # 2 -> adjust Env + Type (column on the correlation plot) # 3 -> adjust Taxa + Type (row on the correlation plot for each type) # 4 -> adjust Taxa (row on the correlation plot) # 5 -> adjust Env (panel on the correlation plot) adjustment_label<-c("NoAdj","AdjEnvAndType","AdjTaxaAndType","AdjTaxa","AdjEnv") adjustment<-5 if(adjustment==1){ df$AdjPvalue<-df$Pvalue } else if (adjustment==2){ for(i in unique(df$Env)){ for(j in unique(df$Type)){ sel<-df$Env==i & df$Type==j 
df$AdjPvalue[sel]<-p.adjust(df$Pvalue[sel],method="BH") } } } else if (adjustment==3){ for(i in unique(df$Taxa)){ for(j in unique(df$Type)){ sel<-df$Taxa==i & df$Type==j df$AdjPvalue[sel]<-p.adjust(df$Pvalue[sel],method="BH") } } } else if (adjustment==4){ for(i in unique(df$Taxa)){ sel<-df$Taxa==i df$AdjPvalue[sel]<-p.adjust(df$Pvalue[sel],method="BH") } } else if (adjustment==5){ for(i in unique(df$Env)){ sel<-df$Env==i df$AdjPvalue[sel]<-p.adjust(df$Pvalue[sel],method="BH") } } #Now we generate the labels for signifant values df$Significance<-cut(df$AdjPvalue, breaks=c(-Inf, 0.001, 0.01, 0.05, Inf), label=c("***", "**", "*", "")) #We ignore NAs df<-df[complete.cases(df),] #We want to reorganize the Env data based on they appear df$Env<-factor(df$Env,as.character(df$Env)) #We use the function to change the labels for facet_grid in ggplot2 Env_labeller <- function(variable,value){ return(sel_env_label[as.character(value),"Trans"]) } p <- ggplot(aes(x=Type, y=Taxa, fill=Correlation), data=df) p <- p + geom_tile() + scale_fill_gradient2(low="#2C7BB6", mid="white", high="#D7191C") p<-p+theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust=0.5)) p<-p+geom_text(aes(label=Significance), color="black", size=3)+labs(y=NULL, x=NULL, fill=method) p<-p+facet_grid(. ~ Env, drop=TRUE,scale="free",space="free_x",labeller=Env_labeller) pdf(paste("Correlation_",adjustment_label[adjustment],".pdf",sep=""),height=8,width=22) print(p) dev.off()
# ============================================================ # Tutorial on drawing an NMDS plot with environmental contours using ggplot2 # by Umer Zeeshan Ijaz (http://userweb.eng.gla.ac.uk/umer.ijaz) # ============================================================= abund_table<-read.csv("SPE_pitlatrine.csv",row.names=1,check.names=FALSE) #Transpose the data to have sample names on rows abund_table<-t(abund_table) meta_table<-read.csv("ENV_pitlatrine.csv",row.names=1,check.names=FALSE) #Just a check to ensure that the samples in meta_table are in the same order as in abund_table meta_table<-meta_table[rownames(abund_table),] #Get grouping information grouping_info<-data.frame(row.names=rownames(abund_table),t(as.data.frame(strsplit(rownames(abund_table),"_")))) # > head(grouping_info) # X1 X2 X3 # T_2_1 T 2 1 # T_2_10 T 2 10 # T_2_12 T 2 12 # T_2_2 T 2 2 # T_2_3 T 2 3 # T_2_6 T 2 6 #Let us group on countries groups<-grouping_info[,1] library(vegan) #Get MDS stats sol<-metaMDS(abund_table,distance = "bray", k = 2, trymax = 50) #We use meta_table$Temp to plot temperature values on the plot. You can select #any other variable #> names(meta_table) #[1] "pH" "Temp" "TS" "VS" "VFA" "CODt" #[7] "CODs" "perCODsbyt" "NH4" "Prot" "Carbo" #Reference:http://oliviarata.wordpress.com/2014/07/17/ordinations-in-ggplot2-v2-ordisurf/ ordi<-ordisurf(sol,meta_table$Temp,plot = FALSE, bs="ds") ordi.grid <- ordi$grid #extracts the ordisurf object str(ordi.grid) #it's a list though - cannot be plotted as is ordi.mite <- expand.grid(x = ordi.grid$x, y = ordi.grid$y) #get x and ys ordi.mite$z <- as.vector(ordi.grid$z) #unravel the matrix for the z scores ordi.mite.na <- data.frame(na.omit(ordi.mite)) #gets rid of the nas NMDS=data.frame(x=sol$point[,1],y=sol$point[,2],Type=groups) p<-ggplot()+ stat_contour(data = ordi.mite.na, aes(x = x, y = y, z = z, colour = ..level..),positon="identity")+ #can change the binwidth depending on how many contours you want geom_point(data=NMDS,aes(x,y,fill=Type),pch=21,size=3,colour=NA) p<-p+scale_colour_continuous(high = "darkgreen", low = "darkolivegreen1") #here we set the high and low of the colour scale. Can delete to go back to the standard blue, or specify others p<-p+labs(colour = "Temperature") #another way to set the labels, in this case, for the colour legend p<-p+theme_bw() #p<-p+theme(legend.key = element_blank(), #removes the box around each legend item # legend.position = "bottom", #legend at the bottom # legend.direction = "horizontal", # legend.box = "horizontal", # legend.box.just = "centre") pdf("NMDS_ordisurf.pdf",width=10,height=10) print(p) dev.off()
# ============================================================ # Tutorial on finding significant taxa and then plotting them using ggplot2 # by Umer Zeeshan Ijaz (http://userweb.eng.gla.ac.uk/umer.ijaz) # ============================================================= library(reshape) library(ggplot2) #Load the abundance table abund_table<-read.csv("SPE_pitlatrine.csv",row.names=1,check.names=FALSE) #Transpose the data to have sample names on rows abund_table<-t(abund_table) #Get grouping information grouping_info<-data.frame(row.names=rownames(abund_table),t(as.data.frame(strsplit(rownames(abund_table),"_")))) # > head(grouping_info) # X1 X2 X3 # T_2_1 T 2 1 # T_2_10 T 2 10 # T_2_12 T 2 12 # T_2_2 T 2 2 # T_2_3 T 2 3 # T_2_6 T 2 6 #Use countries as grouping information groups<-as.factor(grouping_info[,1]) #Apply normalisation (either use relative or log-relative transformation) #data<-abund_table/rowSums(abund_table) data<-log((abund_table+1)/(rowSums(abund_table)+dim(abund_table)[2])) data<-as.data.frame(data) #Reference: http://www.bigre.ulb.ac.be/courses/statistics_bioinformatics/practicals/microarrays_berry_2010/berry_feature_selection.html kruskal.wallis.alpha=0.01 kruskal.wallis.table <- data.frame() for (i in 1:dim(data)[2]) { ks.test <- kruskal.test(data[,i], g=groups) # Store the result in the data frame kruskal.wallis.table <- rbind(kruskal.wallis.table, data.frame(id=names(data)[i], p.value=ks.test$p.value )) # Report number of values tested cat(paste("Kruskal-Wallis test for ",names(data)[i]," ", i, "/", dim(data)[2], "; p-value=", ks.test$p.value,"\n", sep="")) } kruskal.wallis.table$E.value <- kruskal.wallis.table$p.value * dim(kruskal.wallis.table)[1] kruskal.wallis.table$FWER <- pbinom(q=0, p=kruskal.wallis.table$p.value, size=dim(kruskal.wallis.table)[1], lower.tail=FALSE) kruskal.wallis.table <- kruskal.wallis.table[order(kruskal.wallis.table$p.value, decreasing=FALSE), ] kruskal.wallis.table$q.value.factor <- dim(kruskal.wallis.table)[1] / 1:dim(kruskal.wallis.table)[1] kruskal.wallis.table$q.value <- kruskal.wallis.table$p.value * kruskal.wallis.table$q.value.factor pdf("KW_correction.pdf") plot(kruskal.wallis.table$p.value, kruskal.wallis.table$E.value, main='Multitesting corrections', xlab='Nominal p-value', ylab='Multitesting-corrected statistics', log='xy', col='blue', panel.first=grid(col='#BBBBBB',lty='solid')) lines(kruskal.wallis.table$p.value, kruskal.wallis.table$FWER, pch=20,col='darkgreen', type='p' ) lines(kruskal.wallis.table$p.value, kruskal.wallis.table$q.value, pch='+',col='darkred', type='p' ) abline(h=kruskal.wallis.alpha, col='red', lwd=2) legend('topleft', legend=c('E-value', 'p-value', 'q-value'), col=c('blue', 'darkgreen','darkred'), lwd=2,bg='white',bty='o') dev.off() last.significant.element <- max(which(kruskal.wallis.table$q.value <= kruskal.wallis.alpha)) selected <- 1:last.significant.element diff.cat.factor <- kruskal.wallis.table$id[selected] diff.cat <- as.vector(diff.cat.factor) print(kruskal.wallis.table[selected,]) #Now we plot taxa significantly different between the categories df<-NULL for(i in diff.cat){ tmp<-data.frame(data[,i],groups,rep(paste(i," q = ",round(kruskal.wallis.table[kruskal.wallis.table$id==i,"q.value"],5),sep=""),dim(data)[1])) if(is.null(df)){df<-tmp} else { df<-rbind(df,tmp)} } colnames(df)<-c("Value","Type","Taxa") p<-ggplot(df,aes(Type,Value,colour=Type))+ylab("Log-relative normalised") p<-p+geom_boxplot()+geom_jitter()+theme_bw()+ facet_wrap( ~ Taxa , scales="free", ncol=3) 
p<-p+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) pdf("KW_significant.pdf",width=10,height=14) print(p) dev.off()
# ============================================================ # Tutorial on finding significant taxa and then plotting them using ggplot2 # by Umer Zeeshan Ijaz (http://userweb.eng.gla.ac.uk/umer.ijaz) # ============================================================= library(ggplot2) library(DESeq2) #Load the abundance table abund_table<-read.csv("SPE_pitlatrine.csv",row.names=1,check.names=FALSE) #Transpose the data to have sample names on rows abund_table<-t(abund_table) #Get grouping information grouping_info<-data.frame(row.names=rownames(abund_table),t(as.data.frame(strsplit(rownames(abund_table),"_")))) # > head(grouping_info) # X1 X2 X3 # T_2_1 T 2 1 # T_2_10 T 2 10 # T_2_12 T 2 12 # T_2_2 T 2 2 # T_2_3 T 2 3 # T_2_6 T 2 6 #We will convert our table to DESeqDataSet object countData = round(as(abund_table, "matrix"), digits = 0) # We will add 1 to the countData otherwise DESeq will fail with the error: # estimating size factors # Error in estimateSizeFactorsForMatrix(counts(object), locfunc = locfunc, : # every gene contains at least one zero, cannot compute log geometric means countData<-(t(countData+1)) dds <- DESeqDataSetFromMatrix(countData, grouping_info, as.formula(~ X1)) #Reference:https://github.com/MadsAlbertsen/ampvis/blob/master/R/amp_test_species.R #Differential expression analysis based on the Negative Binomial (a.k.a. Gamma-Poisson) distribution #Some reason this doesn't work: data_deseq_test = DESeq(dds, test="wald", fitType="parametric") data_deseq_test = DESeq(dds) ## Extract the results res = results(data_deseq_test, cooksCutoff = FALSE) res_tax = cbind(as.data.frame(res), as.matrix(countData[rownames(res), ]), OTU = rownames(res)) sig = 0.001 fold = 0 plot.point.size = 2 label=T tax.display = NULL tax.aggregate = "OTU" res_tax_sig = subset(res_tax, padj < sig & fold < abs(log2FoldChange)) res_tax_sig <- res_tax_sig[order(res_tax_sig$padj),] ## Plot the data ### MA plot res_tax$Significant <- ifelse(rownames(res_tax) %in% rownames(res_tax_sig) , "Yes", "No") res_tax$Significant[is.na(res_tax$Significant)] <- "No" p1 <- ggplot(data = res_tax, aes(x = baseMean, y = log2FoldChange, color = Significant)) + geom_point(size = plot.point.size) + scale_x_log10() + scale_color_manual(values=c("black", "red")) + labs(x = "Mean abundance", y = "Log2 fold change")+theme_bw() if(label == T){ if (!is.null(tax.display)){ rlab <- data.frame(res_tax, Display = apply(res_tax[,c(tax.display, tax.aggregate)], 1, paste, collapse="; ")) } else { rlab <- data.frame(res_tax, Display = res_tax[,tax.aggregate]) } p1 <- p1 + geom_text(data = subset(rlab, Significant == "Yes"), aes(label = Display), size = 4, vjust = 1) } pdf("NB_MA.pdf") print(p1) dev.off() res_tax_sig_abund = cbind(as.data.frame(countData[rownames(res_tax_sig), ]), OTU = rownames(res_tax_sig), padj = res_tax[rownames(res_tax_sig),"padj"]) #Apply normalisation (either use relative or log-relative transformation) #data<-abund_table/rowSums(abund_table) data<-log((abund_table+1)/(rowSums(abund_table)+dim(abund_table)[2])) data<-as.data.frame(data) #Now we plot taxa significantly different between the categories df<-NULL for(i in res_tax[rownames(res_tax_sig),"OTU"]){ tmp<-data.frame(data[,i],groups,rep(paste(i," padj = ",round(res_tax[i,"padj"],5),sep=""),dim(data)[1])) if(is.null(df)){df<-tmp} else { df<-rbind(df,tmp)} } colnames(df)<-c("Value","Type","Taxa") p<-ggplot(df,aes(Type,Value,colour=Type))+ylab("Log-relative normalised") p<-p+geom_boxplot()+geom_jitter()+theme_bw()+ facet_wrap( ~ Taxa , scales="free", ncol=3) 
p<-p+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) pdf("NB_significant.pdf",width=10,height=14) print(p) dev.off()
# ============================================================ # Tutorial on hierarchical clustering and plotting samples using ggplot2 # by Umer Zeeshan Ijaz (http://userweb.eng.gla.ac.uk/umer.ijaz) # ============================================================= library(ggplot2) library(ggdendro) #Load the abundance table abund_table<-read.csv("SPE_pitlatrine.csv",row.names=1,check.names=FALSE) #Transpose the data to have sample names on rows abund_table<-t(abund_table) #Get grouping information grouping_info<-data.frame(row.names=rownames(abund_table),t(as.data.frame(strsplit(rownames(abund_table),"_")))) # > head(grouping_info) # X1 X2 X3 # T_2_1 T 2 1 # T_2_10 T 2 10 # T_2_12 T 2 12 # T_2_2 T 2 2 # T_2_3 T 2 3 # T_2_6 T 2 6 betad<-vegdist(abund_table,method="bray") # Use Adonis to test for overall differences res_adonis <- adonis(betad ~ X1, grouping_info) #Cluster the samples hc <- hclust(betad) #We will color the labels according to countries(group_info[,1]) hc_d <- dendro_data(as.dendrogram(hc)) hc_d$labels$Type<-grouping_info[as.character(hc_d$labels$label),1] #Coloring function gg_color_hue<-function(n){ hues=seq(15,375,length=n+1) hcl(h=hues,l=65,c=100)[1:n] } cols=gg_color_hue(length(unique(hc_d$labels$Type))) hc_d$labels$color=cols[hc_d$labels$Type] ## Plot clusters p1 <- ggplot(data = segment(hc_d)) + geom_segment(aes(x=x, y=y, xend=xend, yend=yend)) + coord_flip() + scale_x_discrete(labels=label(hc_d)$label) + ylab("Distance (beta diversity = bray)") + theme_bw()+ theme(axis.text.y = element_text(color = hc_d$labels$color), axis.title.y = element_blank()) p1 <- p1 + geom_point(data=hc_d$label, aes(x = x, y = y, color = Type), inherit.aes =F, alpha = 0) p1 <- p1 + guides(colour = guide_legend(override.aes = list(size=3, alpha = 1)))+ scale_color_manual(values = cols) pdf("Cluster.pdf",height=10) print(p1) dev.off()
# ============================================================ # Tutorial on normalisation of abundance tables based on Susan Holmes's paper # by Umer Zeeshan Ijaz (http://userweb.eng.gla.ac.uk/umer.ijaz) # ============================================================= library(phyloseq) library(DESeq) library(edgeR) #===Normalization functions===# #Reference: http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1003531 edgeRnorm = function(physeq, ...){ # Enforce orientation. if( !taxa_are_rows(physeq) ){ physeq <- t(physeq) } x = as(otu_table(physeq), "matrix") # See if adding a single observation, 1, # everywhere (so not zeros) prevents errors # without needing to borrow and modify # calcNormFactors (and its dependent functions) # It did. This fixed all problems. # Can the 1 be reduced to something smaller and still work? x = x + 1 # Now turn into a DGEList y = edgeR::DGEList(counts=x, remove.zeros=TRUE) # Perform edgeR-encoded normalization, using the specified method (...) z = edgeR::calcNormFactors(y, ...) # A check that we didn't divide by zero inside `calcNormFactors` if( !all(is.finite(z$samples$norm.factors)) ){ stop("Something wrong with edgeR::calcNormFactors on this data, non-finite $norm.factors") } return(z) } deseq_varstab = function(physeq, sampleConditions=rep("A", nsamples(physeq)), ...){ # Enforce orientation. if( !taxa_are_rows(physeq) ){ physeq <- t(physeq) } x = as(otu_table(physeq), "matrix") # The same tweak as for edgeR to avoid NaN problems # that cause the workflow to stall/crash. x = x + 1 cds = newCountDataSet(x, sampleConditions) # First estimate library size factors cds = estimateSizeFactors(cds) # Variance estimation, passing along additional options cds = estimateDispersions(cds, ...) # Determine which column(s) have the dispersion estimates dispcol = grep("disp\\_", colnames(fData(cds))) # Enforce that there are no infinite values in the dispersion estimates if( any(!is.finite(fData(cds)[, dispcol])) ){ fData(cds)[which(!is.finite(fData(cds)[, dispcol])), dispcol] <- 0.0 } vsmat = exprs(varianceStabilizingTransformation(cds)) otu_table(physeq) <- otu_table(vsmat, taxa_are_rows=TRUE) return(physeq) } proportion = function(physeq){ # Normalize total sequences represented normf = function(x, tot=max(sample_sums(physeq))){ tot*x/sum(x) } physeq = transform_sample_counts(physeq, normf) # Scale by dividing each variable by its standard deviation. 
#physeq = transform_sample_counts(physeq, function(x) x/sd(x)) # Center by subtracting the median #physeq = transform_sample_counts(physeq, function(x) (x-median(x))) return(physeq) } randomsubsample = function(physeq, smalltrim=0.15, replace=TRUE){ # Set the minimum value as the smallest library quantile, n`smalltrim` samplemin = sort(sample_sums(physeq))[-(1:floor(smalltrim*nsamples(physeq)))][1] physeqr = rarefy_even_depth(physeq, samplemin, rngseed=TRUE, replace=replace, trimOTUs=TRUE) return(physeqr) } #/===Normalization functions===# abund_table<-read.csv("SPE_pitlatrine.csv",row.names=1,check.names=FALSE) #Transpose the data to have sample names on rows abund_table<-t(abund_table) #We first need to convert table in phyloseq format abund_table_phyloseq=phyloseq(otu_table(abund_table, taxa_are_rows = FALSE)) #Additionally we give it grouping data grouping_info<-data.frame(row.names=rownames(abund_table),t(as.data.frame(strsplit(rownames(abund_table),"_")))) # > head(grouping_info) # X1 X2 X3 # T_2_1 T 2 1 # T_2_10 T 2 10 # T_2_12 T 2 12 # T_2_2 T 2 2 # T_2_3 T 2 3 # T_2_6 T 2 6 sampleData=sample_data(grouping_info) abund_table_phyloseq=merge_phyloseq(abund_table_phyloseq, sampleData) #> abund_table_phyloseq #phyloseq-class experiment-level object #otu_table() OTU Table: [ 52 taxa and 81 samples ] #sample_data() Sample Data: [ 81 samples by 3 sample variables ] #Now apply normalisation methods: #Proportions a<-proportion(abund_table_phyloseq) abund_table_N_proportions<-as.data.frame(otu_table(a)) #Rarefy b<-randomsubsample(abund_table_phyloseq, smalltrim=0.15, replace=FALSE) abund_table_phyloseq_N_rarefied<-as.data.frame(t(otu_table(b))) # method="TMM" is the weighted trimmed mean of M-values # (to the reference) proposed by Robinson and Oshlack (2010), # where the weights are from the delta method on Binomial data. c<-edgeRnorm(abund_table_phyloseq,method="TMM") abund_table_N_edgeRTMM<-as.data.frame(t(c$counts)) # method="RLE" is the scaling factor method proposed by # Anders and Huber (2010). We call it "relative log expression", # as median library is calculated from the geometric mean of all # columns and the median ratio of each sample to the median library # is taken as the scale factor. d<-edgeRnorm(abund_table_phyloseq,method="RLE") abund_table_N_edgeRRLE<-as.data.frame(t(d$counts)) # method="upperquartile" is the upper-quartile normalization method # of Bullard et al (2010), in which the scale factors are calculated # from the 75% quantile of the counts for each library, after removing # transcripts which are zero in all libraries. This idea is generalized # here to allow scaling by any quantile of the distributions. e<-edgeRnorm(abund_table_phyloseq,method="upperquartile") #UQ-logFC method abund_table_N_edgeRupperquartile<-as.data.frame(t(e$counts)) # DESeq variance stabilizing transformations f<-deseq_varstab(abund_table_phyloseq, method="blind", sharingMode="maximum", fitType="local") abund_table_N_varstab<-as.data.frame(t(otu_table(f)))
# ============================================================ # Tutorial on fuzzy set ordination # by Umer Zeeshan Ijaz (http://userweb.eng.gla.ac.uk/umer.ijaz) # ============================================================= library(fso) library(ggplot2) library(vegan) #===dependencies===# #We need a Pvalue formatter formatPvalues <- function(pvalue) { ra<-"" if(pvalue <= 0.1) ra<-"." if(pvalue <= 0.05) ra<-"*" if(pvalue <= 0.01) ra<-"**" if(pvalue <= 0.001) ra<-"***" return(ra) } #Reference: http://www.nku.edu/~boycer/fso/ sim.binary <- function (df, method = NULL, diag=FALSE, upper=FALSE){ df <- as.matrix(df) a <- df %*% t(df) b <- df %*% (1 - t(df)) c <- (1 - df) %*% t(df) d <- ncol(df) - a - b - c #1=Jaccard, 2=Baroni-Urbani & Buser, 3=Kulczynski, 4=Ochiai, 5=Sorensen if (method == 1) { sim <- a/(a + b + c) } else if (method == 2) { sim <- (a + sqrt(a*d))/(a + b + c + sqrt(a*d)) } else if (method == 3) { sim <- 0.5* (a/(a + b) + a/(a + c)) } else if (method == 4) { sim <- a/sqrt((a + b) * (a + c)) } else if (method == 5) { sim <- 2 * a/(2 * a + b + c) } sim2 <- sim[row(sim) > col(sim)] class(sim2) <- "dist" attr(sim2, "Labels") <- dimnames(df)[[1]] attr(sim2, "Diag") <- diag attr(sim2, "Upper") <- upper attr(sim2, "Size") <- nrow(df) attr(sim2, "call") <- match.call() return(sim2) } sim.abund <- function (df, method = NULL, diag = FALSE, upper = FALSE) { METHODS <- c("Baroni-Urbani & Buser", "Horn", "Yule (Modified)") if (!inherits(df, "data.frame")) stop("df is not a data.frame") if (any(df < 0)) stop("non negative value expected in df") if (is.null(method)) { cat("1 = Baroni-Urbani & Buser\n") cat("2 = Horn\n") cat("3 = Yule\n") cat("Select an integer (1-3): ") method <- as.integer(readLines(n = 1)) } df <- as.matrix(df) sites <- nrow(df) species <- ncol(df) sim <- array(0, c(as.integer(sites),as.integer(sites))) spmax <- apply(df,2,max) if (method == 1) { #compute similarities (Baroni-Urbani & Buser) for (x in 1:sites) { for (y in 1:sites) { h1 <- 0 h2 <- 0 h3 <- 0 for (i in 1:species) { h1 <- h1 + min(df[x,i],df[y,i]) h2 <- h2 + max(df[x,i],df[y,i]) h3 <- h3 + spmax[i] - max(df[x,i],df[y,i]) } numer <- h1 + sqrt(h1*h3) denom <- h2 + sqrt(h1*h3) sim[x,y] <- ifelse(identical(denom,0), 0, numer/denom) } } } else if (method == 2) { #compute similarities (Horn) for (x in 1:sites) { for (y in 1:sites) { h1 <- 0 h2 <- 0 h3 <- 0 for (i in 1:species) { if((df[x,i] + df[y,i]) > 0) h1 <- h1 + (df[x,i] + df[y,i]) * log10(df[x,i] + df[y,i]) if(df[x,i] > 0) h2 <- h2 + df[x,i] * log10(df[x,i]) if(df[y,i] > 0) h3 <- h3 + df[y,i] * log10(df[y,i]) } x.sum <- sum(df[x,]) y.sum <- sum(df[y,]) xy.sum <- x.sum + y.sum if (identical(xy.sum, 0)) (sim[x,y] <- 0) else (sim[x,y] <- (h1 - h2 - h3)/(xy.sum * log10(xy.sum)-x.sum * log10(x.sum) - y.sum * log10(y.sum))) if (sim[x,y] < 1.0e-10) sim[x,y] <- 0 } } } else if (method == 3) { #compute similarities (Yule) for (x in 1:sites) { for (y in 1:sites) { h1 <- 0 h2 <- 0 h3 <- 0 h4 <- 0 for (i in 1:species) { h1 <- h1 + min(df[x,i], df[y,i]) h2 <- h2 + max(df[x,i], df[y,i]) - df[y,i] h3 <- h3 + max(df[x,i], df[y,i]) - df[x,i] h4 <- h4 + spmax[i] - max(df[x,i], df[y,i]) } numer <- sqrt(h1*h4) denom <- sqrt(h1*h4) + sqrt(h2*h3) sim[x,y] <- ifelse(identical(denom,0), 0, numer/denom) } } } else stop("Non convenient method") sim >- t(sim) sim2 <- sim[row(sim) > col(sim)] attr(sim2, "Size") <- sites attr(sim2, "Labels") <- dimnames(df)[[1]] attr(sim2, "Diag") <- diag attr(sim2, "Upper") <- upper attr(sim2, "method") <- METHODS[method] attr(sim2, "call") <- 
match.call() class(sim2) <- "dist" return(sim2) } st.acr <- function(sim, diag=FALSE, upper = FALSE) { library(vegan) dis <- 1 - sim #use "shortest" or "extended" edis <- as.matrix(stepacross(dis, path = "shortest", toolong = 1)) sim <- 1 - edis amax <- max(sim) amin <- min(sim) sim <- (sim-amin)/(amax-amin) sim2 <- sim[row(sim) > col(sim)] attr(sim2, "Size") <- nrow(sim) attr(sim2, "Labels") <- dimnames(sim)[[1]] attr(sim2, "Diag") <- diag attr(sim2, "Upper") <- upper attr(sim2, "call") <- match.call() class(sim2) <- "dist" return(sim2) } #/===dependencies===# #===generateFSO===# #Example usage: # data<- otu_table #(N X P) # Env<- env_table #(N X Q) # group<-as.character(group_categories) #(N X 1) # method<-2 # 1 = Baroni-Urbani & Buser, 2 = Horn, 3 = Yule # indices<-c(1,2) #Specify column no. of Env table; indices<-NULL will take all columns # generateFSO(data,Env,group,method,indices=NULL,"FSO") generateFSO<-function(data,Env,group,method,indices,filename){ if(is.null(indices)){ indices<-seq(1:dim(Env)[2]) } cat("Calculating dissimilarity matrix\n") sim <- sim.abund(data,method=method) dis.ho <- 1 - sim df<-NULL for(i in names(Env)[indices]){ cat(paste("Processing",i,"\n")) param.fso<-fso(Env[,i],dis.ho,permute=1000) tmp<-data.frame(mu=param.fso$mu,param=param.fso$data, group=as.factor(group),label=rep(paste(i,"(",round(param.fso$r,2)," ",formatPvalues(param.fso$p),")",sep=""),dim(data)[1])) if(is.null(df)){df<-tmp} else {df<-rbind(df,tmp)} } pdf(paste(filename,".pdf",sep=""),height=14,width=14) p <- ggplot(df, aes(param, mu)) + geom_point(aes(colour = group)) +geom_smooth(,method="lm", size=1, se=T) +theme_bw()+ facet_wrap( ~ label , scales="free", ncol=3) print(p) dev.off() whole.fso<-fso(as.formula(paste("~",paste(lapply(indices,function(x) paste("Env[,",x,"]",sep="")),collapse="+"))),dis=dis.ho,data=Env,permute=1000) whole.fso$var<-names(Env)[indices] print(summary(whole.fso)) } #===/generateFSO===# #===generateMFSO===# #Example usage: # data<- otu_table #(N X P) # Env<- env_table #(N X Q) # group<-as.character(group_categories) #(N X 1) # method<-2 # 1 = Baroni-Urbani & Buser, 2 = Horn, 3 = Yule # indices<-c(1,2) #Specify column no. 
#===generateMFSO===#
#Example usage:
# data<- otu_table #(N X P)
# Env<- env_table #(N X Q)
# group<-as.character(group_categories) #(N X 1)
# method<-2 # 1 = Baroni-Urbani & Buser, 2 = Horn, 3 = Yule
# indices<-c(1,2) #Specify column no. of Env table; indices<-NULL will take all columns
# step_across<-TRUE #Step-across correction might improve the ordination
# generateMFSO(data,Env,group,method,indices=NULL,step_across=TRUE,"MFSO")
generateMFSO<-function(data,Env,group,method,indices,step_across=FALSE,filename){
  if(is.null(indices)){
    indices<-seq(1:dim(Env)[2])
  }
  cat("Calculating dissimilarity matrix\n")
  sim <- sim.abund(data,method=method)
  if(step_across){sim <- st.acr(sim)}
  dis.ho <- 1 - sim
  whole.mfso<-mfso(as.formula(paste("~",paste(lapply(indices,function(x) paste("Env[,",x,"]",sep="")),collapse="+"))),dis=dis.ho,data=Env,scaling=2,permute=1000)
  whole.mfso$var<-names(Env)[indices]
  names(whole.mfso$data)<-names(Env)[indices]
  #Plot every pair of mfso axes (mu values) against each other
  df<-NULL
  for (i in 1:(ncol(whole.mfso$mu) - 1)) {
    for (j in (i + 1):ncol(whole.mfso$mu)) {
      cat(paste("Processing",names(whole.mfso$data)[i]," and ",names(whole.mfso$data)[j],"\n"))
      tmp<-data.frame(x=whole.mfso$mu[, i],y=whole.mfso$mu[, j],group=as.factor(group),
                      label=rep(paste("x=mu(",names(whole.mfso$data)[i],")",", y=mu(",names(whole.mfso$data)[j],")",sep=""),dim(data)[1]))
      if(is.null(df)){df<-tmp} else {df<-rbind(df,tmp)}
    }
  }
  pdf(paste(filename,"_mu.pdf",sep=""),height=35,width=14)
  p <- ggplot(df, aes(x, y)) +
    geom_point(aes(colour = group)) +
    geom_smooth(method="lm", size=1, se=TRUE) +  #fixed: stray leading comma removed
    theme_bw() +
    facet_wrap( ~ label, scales="free", ncol=4)
  print(p)
  dev.off()
  #Correlation between ordination distances and the original dissimilarities
  tmp <- as.numeric(whole.mfso$notmis)
  dis <- as.matrix(dis.ho)[tmp, tmp]
  a <- dist(whole.mfso$mu)
  y <- as.dist(dis)
  pdf(paste(filename,"_correlation.pdf",sep=""))
  plot(y, a, ylab = "Ordination Distance", xlab = "Matrix Dissimilarity", pch=16, col=rgb(0, 0, 0, 0.3))
  text(min(y), max(a), paste("r = ", format(cor(y, a), digits = 3)), pos = 4)
  dev.off()
  print(summary(whole.mfso))
}
#===/generateMFSO===#
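
#--- Illustrative aside (not part of the original tutorial) ---
#step_across=TRUE re-estimates long dissimilarities as shortest paths through
#intermediate samples using vegan's stepacross(), which is what st.acr() wraps
#(a commented sketch; abund_table is only loaded below):
# dis <- 1 - sim.abund(as.data.frame(abund_table), method=2)
# dis.ext <- stepacross(dis, path="shortest", toolong=1)  #same call as in st.acr()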
abund_table<-read.csv("SPE_pitlatrine.csv",row.names=1,check.names=FALSE)
#Transpose the data to have sample names on rows
abund_table<-t(abund_table)
meta_table<-read.csv("ENV_pitlatrine.csv",row.names=1,check.names=FALSE)
#Just a check to ensure that the samples in meta_table are in the same order as in abund_table
meta_table<-meta_table[rownames(abund_table),]
#Get grouping information
grouping_info<-data.frame(row.names=rownames(abund_table),t(as.data.frame(strsplit(rownames(abund_table),"_"))))
# > head(grouping_info)
#        X1 X2 X3
# T_2_1   T  2  1
# T_2_10  T  2 10
# T_2_12  T  2 12
# T_2_2   T  2  2
# T_2_3   T  2  3
# T_2_6   T  2  6

generateFSO(as.data.frame(abund_table),meta_table,grouping_info[,1],2,NULL,"FSO")
# Fuzzy set statistics
#
#    col   variable         r     p         d
# 1   11      Carbo 0.7140058 0.001 0.6943693
# 2    2       Temp 0.6202771 0.001 0.7671225
# 3   10       Prot 0.4659610 0.001 0.5658457
# 4    9        NH4 0.3819395 0.001 0.5115750
# 5    3         TS 0.3653211 0.002 0.5182094
# 6    8 perCODsbyt 0.3338634 0.004 0.5630355
# 7    5        VFA 0.3289921 0.004 0.4786655
# 8    6       CODt 0.3126350 0.003 0.8194612
# 9    4         VS 0.2768421 0.006 0.8264804
# 10   7       CODs 0.2717105 0.007 0.5133256
# 11   1         pH 0.1052026 0.153 0.5623866
#
# Correlations of fuzzy sets
#
#                    pH       Temp         TS          VS        VFA       CODt
# pH          1.0000000 -0.7386860  0.9647511 -0.53378849 -0.9269124 -0.7363250
# Temp       -0.7386860  1.0000000 -0.7662551 -0.13447847  0.8173559  0.1627051
# TS          0.9647511 -0.7662551  1.0000000 -0.52324123 -0.9783639 -0.7529399
# VS         -0.5337885 -0.1344785 -0.5232412  1.00000000  0.4241976  0.9459625
# VFA        -0.9269124  0.8173559 -0.9783639  0.42419761  1.0000000  0.6793673
# CODt       -0.7363250  0.1627051 -0.7529399  0.94596254  0.6793673  1.0000000
# CODs       -0.9336815  0.7106207 -0.9818299  0.57275257  0.9839748  0.7936315
# perCODsbyt -0.8789800  0.9163897 -0.9321846  0.22357388  0.9708713  0.5029946
# NH4        -0.9456583  0.8863344 -0.9595074  0.32493987  0.9686050  0.5826021
# Prot       -0.9656359  0.6903348 -0.9890375  0.61667884  0.9636521  0.8233212
# Carbo      -0.8189456  0.9819195 -0.8547344  0.02029398  0.9018883  0.3181941
#                  CODs perCODsbyt        NH4       Prot       Carbo
# pH         -0.9336815 -0.8789800 -0.9456583 -0.9656359 -0.81894558
# Temp        0.7106207  0.9163897  0.8863344  0.6903348  0.98191952
# TS         -0.9818299 -0.9321846 -0.9595074 -0.9890375 -0.85473437
# VS          0.5727526  0.2235739  0.3249399  0.6166788  0.02029398
# VFA         0.9839748  0.9708713  0.9686050  0.9636521  0.90188831
# CODt        0.7936315  0.5029946  0.5826021  0.8233212  0.31819413
# CODs        1.0000000  0.9212524  0.9342960  0.9858916  0.81594415
# perCODsbyt  0.9212524  1.0000000  0.9704735  0.8894673  0.96957738
# NH4         0.9342960  0.9704735  1.0000000  0.9362088  0.94362521
# Prot        0.9858916  0.8894673  0.9362088  1.0000000  0.79378163
# Carbo       0.8159442  0.9695774  0.9436252  0.7937816  1.00000000

generateMFSO(as.data.frame(abund_table),meta_table,grouping_info[,1],2,indices=NULL,step_across=TRUE,"MFSO")
# Statistics for multidimensional fuzzy set ordination
#
#      variable cumulative_r     increment p-value       gamma
# 1          pH    0.5747713  5.747713e-01   0.847 1.000000000
# 2        Temp    0.8611518  2.863805e-01   0.196 0.312136124
# 3          TS    0.8664168  5.265049e-03   0.725 0.027349087
# 4          VS    0.8696778  3.261003e-03   0.775 0.124736229
# 5         VFA    0.8715679  1.890017e-03   0.773 0.012001049
# 6        CODt    0.8757048  4.136928e-03   0.591 0.050717093
# 7        CODs    0.8756964 -8.426964e-06   0.718 0.001398279
# 8  perCODsbyt    0.8761959  4.995468e-04   0.732 0.003167491
# 9         NH4    0.8767607  5.648345e-04   0.515 0.008190486
# 10       Prot    0.8774340  6.732944e-04   0.645 0.004340200
# 11      Carbo    0.8775444  1.104193e-04   0.707 0.001035478
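
#--- Illustrative aside (not part of the original tutorial) ---
#Reading the FSO statistics above: Carbo (r=0.71) and Temp (r=0.62) track the
#community structure most closely, while pH (p=0.153) is the only variable that
#is not significant. If the fso object were captured rather than only printed
#inside generateFSO() (hypothetical variable whole.fso), the ranking could be
#tabulated directly:
# data.frame(variable=whole.fso$var, r=whole.fso$r, p=whole.fso$p)[order(-whole.fso$r),]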
# ============================================================
# Tutorial on Unifrac distances for OTU tables using phyloseq
# by Umer Zeeshan Ijaz (http://userweb.eng.gla.ac.uk/umer.ijaz)
# =============================================================
library(ggplot2)
#Reference: http://joey711.github.io/phyloseq-demo/phyloseq-demo.html
#Reference: https://rdp.cme.msu.edu/tutorials/stats/using_rdp_output_with_phyloseq.html
#Reference: http://www.daijiang.name/en/2014/05/17/phylogenetic-functional-beta-diversity/
#Reference: http://www.wernerlab.org/teaching/qiime/overview/d
#Reference: http://joey711.github.io/waste-not-supplemental/

# Question:
# Given an OTU table (All_Good_P2_C03.csv) and the corresponding OTU sequences (All_Good_P2_C03.fa), how do you generate a phylogenetic tree?
#
# Solution:
#
# STEP 1: Aligning sequences
# We'll be using a reference alignment to align our sequences. In QIIME, this reference alignment is core_set_aligned.fasta.imputed,
# and QIIME already knows where it is. The following is done using PyNAST, though alignment can also be done with MUSCLE and Infernal (http://qiime.org/scripts/align_seqs.html).
#
# $ align_seqs.py -i All_Good_P2_C03.fa -o alignment/
#
# STEP 2: Filtering alignment
# This alignment contains lots of gaps, and it includes hypervariable regions that make it difficult to build an accurate tree, so we
# filter it. Filtering an alignment of 16S rRNA gene sequences can involve a Lane mask; in QIIME, the Lane mask for the GG core set is lanemask_in_1s_and_0s.
#
# $ filter_alignment.py -i alignment/All_Good_P2_C03_aligned.fasta -o alignment
#
# STEP 3: Make tree
# The make_phylogeny.py script uses FastTree, an approximately-maximum-likelihood program with a good model of evolution for 16S rRNA gene sequences.
#
# $ make_phylogeny.py -i alignment/All_Good_P2_C03_aligned_pfiltered.fasta -o All_Good_P2_C03.tre
#
# STEP 4: Generate taxonomy using the RDP Classifier
#
# $ java -Xmx1g -jar $RDP_PATH/classifier.jar classify -f filterbyconf -o All_Good_P2_C03_Assignments.txt All_Good_P2_C03.fa
#
# STEP 5: Finally, generate a csv file that can be imported in R
#
# $ <All_Good_P2_C03_Assignments.txt awk -F"\t" 'BEGIN{print "OTUs,Domain,Phylum,Class,Order,Family,Genus"}{gsub(" ","_",$0);gsub("\"","",$0);print $1","$3","$6","$9","$12","$15","$18}' > All_Good_P2_C03_Taxonomy.csv
#

abund_table<-read.csv("All_Good_P2_C03.csv",row.names=1,check.names=FALSE)
#Transpose the data to have sample names on rows
abund_table<-t(abund_table)
meta_table<-read.csv("ENV_pitlatrine.csv",row.names=1,check.names=FALSE)
#Get grouping information
grouping_info<-data.frame(row.names=rownames(meta_table),t(as.data.frame(strsplit(rownames(meta_table),"_"))))
colnames(grouping_info)<-c("Country","Latrine","Depth")
meta_table<-data.frame(meta_table,grouping_info)
#Filter out samples not present in meta_table
abund_table<-abund_table[rownames(abund_table) %in% rownames(meta_table),]

#Load the tree using the ape package
library(ape)
OTU_tree <- read.tree("All_Good_P2_C03.tre")
#Now load the taxonomy
OTU_taxonomy<-read.csv("All_Good_P2_C03_Taxonomy.csv",row.names=1,check.names=FALSE)

library(phyloseq)
#Convert the data to phyloseq format
OTU = otu_table(as.matrix(abund_table), taxa_are_rows = FALSE)
TAX = tax_table(as.matrix(OTU_taxonomy))
SAM = sample_data(meta_table)
physeq<-merge_phyloseq(phyloseq(OTU, TAX),SAM,OTU_tree)

#Plot richness
p<-plot_richness(physeq, x = "Country", color = "Depth")
p<-p+theme_bw()
pdf("phyloseq_richness.pdf",width=14)
print(p)
dev.off()
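
#--- Illustrative aside (not part of the original tutorial) ---
#plot_richness() draws several alpha-diversity measures by default; its
#measures argument restricts the panels, e.g. (a sketch on the same object):
# p<-plot_richness(physeq, x="Country", color="Depth", measures=c("Shannon","Simpson"))+theme_bw()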
#Now take the subset of the data for latrines at depth 1 and for Prevotellaceae
physeq_subset<-subset_taxa(subset_samples(physeq,Depth=="1"),Family=="Prevotellaceae")
p <- plot_tree(physeq_subset, color = "Country", label.tips = "Genus", size = "abundance",text.size=2)
pdf("phyloseq_tree.pdf",width=8)
print(p)
dev.off()

#We now take the 500 most abundant taxa
physeq_subset<-prune_taxa(names(sort(taxa_sums(physeq), TRUE)[1:500]), physeq)
#Plot heatmap
p<-plot_heatmap(physeq_subset, method=NULL, sample.label="Country", taxa.label="Class")
pdf("phyloseq_heatmap.pdf",width=8)
print(p)
dev.off()
#Plot bar
p<-plot_bar(physeq_subset, x="Country", fill="Phylum")+theme_bw()
pdf("phyloseq_bar.pdf",width=8)
print(p)
dev.off()
#Make a separate panel for each country
p<-plot_bar(physeq_subset, x="Phylum", fill="Phylum",facet_grid=~Country)+theme_bw()
p<-p+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))
pdf("phyloseq_bar_separate.pdf",width=8)
print(p)
dev.off()

#Make a PCoA ordination plot based on abundance-weighted unifrac distances with the following commands
ord <- ordinate(physeq_subset, method="PCoA", distance="unifrac", weighted=TRUE)
p <- plot_ordination(physeq_subset, ord, color="Country",title="Phyloseq's Weighted Unifrac")
p <- p + geom_point(size=5) + theme_bw()
pdf("phyloseq_unifrac.pdf",width=8)
print(p)
dev.off()

#The GUniFrac package can also be used to calculate unifrac distances and has additional features.
#Unifrac distances are traditionally calculated on either presence/absence data or abundance data.
#The former can be affected by PCR and sequencing errors that lead to a high number of spurious and
#usually rare OTUs, while the latter can give undue weight to the more abundant OTUs.
#GUniFrac's methods include a parameter alpha that controls the weight given to abundant OTUs,
#as well as a means of adjusting variances.
library(GUniFrac)
library(phangorn)
#The function GUniFrac requires a rooted tree but, unlike phyloseq's ordination function,
#will not try to root an unrooted one. We therefore apply mid-point rooting with the
#midpoint function from the phangorn package
unifracs <- GUniFrac(as.matrix(otu_table(physeq_subset)), midpoint(phy_tree(physeq_subset)), alpha = c(0, 0.5, 1))$unifracs
#We can extract a variety of distance matrices with different weightings.
dw <- unifracs[, , "d_1"]    # Weighted UniFrac
du <- unifracs[, , "d_UW"]   # Unweighted UniFrac
dv <- unifracs[, , "d_VAW"]  # Variance-adjusted weighted UniFrac
d0 <- unifracs[, , "d_0"]    # GUniFrac with alpha 0
d5 <- unifracs[, , "d_0.5"]  # GUniFrac with alpha 0.5
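
#--- Illustrative aside (not part of the original tutorial) ---
#Sanity check (a sketch): the GUniFrac weighted matrix d_1 should agree closely
#with phyloseq's own weighted UniFrac on the same data:
# dw_phyloseq <- UniFrac(physeq_subset, weighted=TRUE)
# cor(as.dist(dw), dw_phyloseq)  #expect a value near 1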
#Use vegan's cmdscale function (classical multidimensional scaling, i.e. PCoA) to ordinate each distance matrix.
pcoa <- cmdscale(dw, k = nrow(as.matrix(otu_table(physeq_subset))) - 1, eig = TRUE, add = TRUE)
p<-plot_ordination(physeq_subset, pcoa, color="Country", title="GUniFrac Weighted Unifrac") + geom_point(size=5)+ theme_bw()
pdf("GUniFrac_weighted.pdf",width=8)
print(p)
dev.off()
pcoa <- cmdscale(du, k = nrow(as.matrix(otu_table(physeq_subset))) - 1, eig = TRUE, add = TRUE)
p<-plot_ordination(physeq_subset, pcoa, color="Country", title="GUniFrac Unweighted UniFrac") + geom_point(size=5)+ theme_bw()
pdf("GUniFrac_unweighted.pdf",width=8)
print(p)
dev.off()
pcoa <- cmdscale(dv, k = nrow(as.matrix(otu_table(physeq_subset))) - 1, eig = TRUE, add = TRUE)
p<-plot_ordination(physeq_subset, pcoa, color="Country", title="GUniFrac Variance adjusted weighted UniFrac") + geom_point(size=5)+ theme_bw()
pdf("GUniFrac_variance.pdf",width=8)
print(p)
dev.off()
pcoa <- cmdscale(d0, k = nrow(as.matrix(otu_table(physeq_subset))) - 1, eig = TRUE, add = TRUE)
p<-plot_ordination(physeq_subset, pcoa, color="Country", title="GUniFrac with alpha 0") + geom_point(size=5)+ theme_bw()
pdf("GUniFrac_alpha0.pdf",width=8)
print(p)
dev.off()
pcoa <- cmdscale(d5, k = nrow(as.matrix(otu_table(physeq_subset))) - 1, eig = TRUE, add = TRUE)
p<-plot_ordination(physeq_subset, pcoa, color="Country", title="GUniFrac with alpha 0.5") + geom_point(size=5)+ theme_bw()
pdf("GUniFrac_alpha0.5.pdf",width=8)
print(p)
dev.off()

#Permanova - distance-based multivariate analysis of variance
adonis(as.dist(d5) ~ as.matrix(sample_data(physeq_subset)[,"Country"]))
# > adonis(as.dist(d5) ~ as.matrix(sample_data(physeq_subset)[,"Country"]))
#
# Call:
# adonis(formula = as.dist(d5) ~ as.matrix(sample_data(physeq_subset)[, "Country"]))
#
# Permutation: free
# Number of permutations: 999
#
# Terms added sequentially (first to last)
#
#                                                     Df SumsOfSqs MeanSqs F.Model     R2 Pr(>F)
# as.matrix(sample_data(physeq_subset)[, "Country"])   1    3.7531  3.7531  14.558 0.1556  0.001 ***
# Residuals                                           79   20.3666  0.2578         0.8444
# Total                                               80   24.1198                 1.0000
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#Each distance measure is most powerful in detecting only a certain scenario. When multiple distance
#matrices are available, separate tests using each distance matrix will lead to loss of power due to
#multiple-testing correction. Combining the distance matrices in a single test improves power.
#PermanovaG combines multiple distance matrices by taking the maximum of the pseudo-F statistics
#over the distance matrices. Significance is assessed by permutation.
PermanovaG(unifracs[, , c("d_0", "d_0.5", "d_1")] ~ as.matrix(sample_data(physeq_subset)[,"Country"]))
# > PermanovaG(unifracs[, , c("d_0", "d_0.5", "d_1")] ~ as.matrix(sample_data(physeq_subset)[,"Country"]))
# $aov.tab
#                                                     F.Model p.value
# as.matrix(sample_data(physeq_subset)[, "Country"]) 14.55806   0.001

#Make a network based on GUniFrac with alpha 0.5
p<-plot_net(physeq_subset, distance=as.dist(d5), maxdist = 0.4, color = "Country", shape="Latrine")
p<-p + scale_shape_manual(values=seq(1,15))
pdf("phyloseq_network.pdf",width=8)
print(p)
dev.off()
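
#--- Illustrative aside (not part of the original tutorial) ---
#adonis assumes comparable multivariate dispersion across groups; vegan's
#betadisper()/permutest() are the standard companion check (a sketch on d5):
# mod <- betadisper(as.dist(d5), factor(get_variable(physeq_subset, "Country")))
# permutest(mod)  #a significant result would point to unequal dispersions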
#Reference: http://joey711.github.io/waste-not-supplemental/dispersion-survey/dispersion.html
#Justification that microbiome count data from amplicon sequencing experiments can be treated
#with the same procedures as differential-expression data from RNA-Seq experiments: microbiome
#data is overdispersed, that is, it has an effectively non-zero value for the dispersion
#parameter in the Negative Binomial generalization of Poisson discrete counts.

#UZI has modified deseq_var from the reference above as there was a bug
deseq_var = function(physeq, bioreps) {
  require("DESeq")
  require("plyr")
  # physeq = GP
  # bioreps = 'BioReps'
  biorep_levels = levels(factor(get_variable(physeq, bioreps)))
  ldfs = lapply(biorep_levels, function(L, physeq, bioreps) {
    # L = biorep_levels[1]
    whsa = as(get_variable(physeq, bioreps), "character") %in% L
    # Skip if there are 2 or fewer instances among BioReps; we probably don't want
    # to assume very much from estimates of variance for which N <= 2
    if (sum(whsa) <= 2) {
      return(NULL)
    }
    ps1 = prune_samples(whsa, physeq)
    # Prune the 'zero' OTUs from this sample set.
    ps1 = prune_taxa(taxa_sums(ps1) > 5, ps1)  #fixed: prune ps1, not the full physeq, as the comment intends
    # Coerce to matrix, round up to nearest integer just in case
    x = ceiling(as(otu_table(ps1), "matrix"))
    # Add 1 to everything to avoid log-0 problems
    x = x + 1
    #UZI put t(x) instead of x in the following (DESeq expects OTUs on rows; taxa_are_rows is FALSE here)
    cds = newCountDataSet(countData = t(x), conditions = get_variable(ps1, bioreps))  #fixed: conditions taken from the pruned ps1 so lengths match
    # Estimate size factors
    cds = estimateSizeFactors(cds, locfunc = median)
    # Estimate dispersions, DESeq
    cds = estimateDispersions(cds, method = "pooled", sharingMode = "fit-only", fitType = "local")
    # Get the scale-normalized mean and variance
    df = getBaseMeansAndVariances(counts(cds), sizeFactors(cds))
    colnames(df) <- c("Mean", "Variance")
    ## Calculate Mean and Variance for each OTU after rarefying.
    ## Use a non-random definition of 'rarefying'
    ps1r = transform_sample_counts(ps1, function(x, Smin = min(sample_sums(ps1), na.rm = TRUE)) {
      round(x * Smin/sum(x), digits = 0)
    })
    ps1r = prune_taxa(taxa_sums(ps1r) > 5, ps1r)
    # Taxa are on columns here (taxa_are_rows=FALSE), so summarise over columns
    rdf = data.frame(Mean = apply(otu_table(ps1r), 2, mean), Variance = apply(otu_table(ps1r), 2, var))
    # Return results, including cds and 'rarefied' mean-var
    return(list(df = df, cds = cds, rdf = rdf))
  }, physeq, bioreps)
  names(ldfs) <- biorep_levels
  # Remove the elements that were skipped.
  ldfs <- ldfs[!sapply(ldfs, is.null)]
  # Group into data.frames of variance estimates
  df = ldply(ldfs, function(L) {
    data.frame(L$df, OTUID = rownames(L$df))
  })
  rdf = ldply(ldfs, function(L) {
    data.frame(L$rdf, OTUID = rownames(L$rdf))
  })
  colnames(df)[1] <- bioreps
  colnames(rdf)[1] <- bioreps
  # Remove values that are zero for both
  df <- df[!(df$Mean == 0 & df$Variance == 0), ]
  rdf <- rdf[!(rdf$Mean == 0 & rdf$Variance == 0), ]
  # Create a `data.frame` of the DESeq-predicted mean-variance relationship
  deseqfitdf = ldply(ldfs, function(x) {
    df = x$df
    xg = seq(range(df$Mean)[1], range(df$Mean)[2], length.out = 100)
    fitVar = xg + fitInfo(x$cds)$dispFun(xg) * xg^2
    return(data.frame(Mean = xg, Variance = fitVar))
  })
  colnames(deseqfitdf)[1] <- bioreps
  return(list(datadf = df, fitdf = deseqfitdf, raredf = rdf))
}
restrovar = deseq_var(physeq, "Country")

#UZI has taken facet_wrap(~BioReps) out from the following
plot_dispersion <- function(datadf, fitdf, title = "Microbiome Counts, Common-Scale",
                            xbreaks = c(10, 1000, 1e+05)) {
  require("ggplot2")
  p = ggplot(datadf, aes(Mean, Variance)) + geom_point(alpha = 0.25, size = 1.5)
  p = p + scale_y_log10()
  p = p + scale_x_log10(breaks = xbreaks, labels = xbreaks)
  # Poisson expectation (Mean == Variance) as a reference line
  p = p + geom_abline(intercept = 0, slope = 1, linetype = 1, size = 0.5, color = "gray60")
  # DESeq's fitted mean-variance relationship
  p = p + geom_line(data = fitdf, color = "red", linetype = 1, size = 0.5)
  p = p + ggtitle(title)
  return(p)
}
p<-plot_dispersion(restrovar$datadf, restrovar$fitdf)+facet_wrap(~Country)+theme_bw()
pdf("phyloseq_dispersion.pdf",width=8)
print(p)
dev.off()
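
#--- Illustrative aside (not part of the original tutorial) ---
#deseq_var() also returns the mean/variance table computed after (non-random)
#rarefying in $raredf; the same helper can plot it for comparison (a sketch;
#note the red line is still the DESeq fit from the unrarefied counts):
# p<-plot_dispersion(restrovar$raredf, restrovar$fitdf, title="Microbiome Counts, Rarefied")+facet_wrap(~Country)+theme_bw()
# pdf("phyloseq_dispersion_rarefied.pdf",width=8)
# print(p)
# dev.off()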