#!/bin/Rscript
# ***************************************************************
# Name:      taxo_NMDS_bioenv.R
# Purpose:   This R script is an extension of vegan library's bioenv()
#		         function and uses the bio.env() and bio.step() of
#		         http://menugget.blogspot.co.uk/2011/06/clarke-and-ainsworths-bioenv-and-bvstep.html
#		         The original author suggested these functions to overcome
#		         the inflexibility of the bioenv() function which uses
#		         a similarity matrix based on normalized "euclidean" distance.
#		         This script finds the best subset of species and environmental variables and plots them on NMDS plot
# Version:   0.2
# History:   0.1 to 0.2 Automatically remove NA entries
# Authors:   Umer Zeeshan Ijaz (Umer.Ijaz@glasgow.ac.uk)
#                 http://userweb.eng.gla.ac.uk/umer.ijaz
#            Christopher Quince (Christopher.Quince@glasgow.ac.uk)
#                 http://userweb.eng.gla.ac.uk/christopher.quince
# Created:   2012-09-10
# License:   Copyright (c) 2012 Computational Microbial Genomics Group, University of Glasgow, UK
# 
#            This program is free software: you can redistribute it and/or modify
#            it under the terms of the GNU General Public License as published by
#            the Free Software Foundation, either version 3 of the License, or
#            (at your option) any later version.
#
#            This program is distributed in the hope that it will be useful,
#            but WITHOUT ANY WARRANTY; without even the implied warranty of
#            MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#            GNU General Public License for more details.
#
#            You should have received a copy of the GNU General Public License
#            along with this program.  If not, see <http://www.gnu.org/licenses/>.
# **************************************************************/
suppressPackageStartupMessages(library("optparse"))
suppressPackageStartupMessages(library("calibrate"))
suppressPackageStartupMessages(library("vegan"))
suppressPackageStartupMessages(library("RColorBrewer"))
suppressPackageStartupMessages(library("gplots"))

#specify desired options in a list
option_list <- list(
make_option("--sfile", action="store",default=NULL, help="Species abundance file"),
make_option("--efile", action="store",default=NULL, help="environmental file"),
make_option("--opath", action="store",default=NULL, help="Output path"),
make_option("--fsize", action="store", default="1.2", help="Font size [default %default]"),
make_option("--width", type="integer",default=800, help="Width of jpeg files [default %default]"),
make_option("--height", type="integer",default=800, help="Height of jpeg files [default %default]"),
make_option("--mreads", type="integer",default=200, help="Minimum reads [default %default]"),
make_option("--nrestarts", type="integer",default=10, help="Number of restarts [default %default]"),
make_option("--ccol", type="integer",default=1, help="Minimum reads [default %default]"),
make_option("--correlation", type="integer",default=1, help="Correlation to use: 1=pearson, 2=spearman, 3=kendall [default %default]"),
make_option("--fmethod", type="integer",default=6, help="Fixed distance method: 1=euclidean, 2=manhattan, 3=gower, 4=altGower, 5=canberra, 6=bray, 7=kulczynski, 8=morisita,9=horn, 10=binomial, 11=cao [default %default]"),
make_option("--vmethod", type="integer",default=6, help="Variable distance method: 1=euclidean, 2=manhattan, 3=gower, 4=altGower, 5=canberra, 6=bray, 7=kulczynski, 8=morisita,9=horn, 10=binomial, 11=cao [default %default]"),
make_option("--nmethod", type="integer",default=6, help="NMDS distance method: 1=euclidean, 2=manhattan, 3=gower, 4=altGower, 5=canberra, 6=bray, 7=kulczynski, 8=morisita,9=horn, 10=binomial, 11=cao [default %default]"),
make_option("--dmode", action="store_true",default=FALSE, help="Site names display modes [default %default]")
)

#get command line options
opt<-parse_args(OptionParser(usage="%prog [options] file", option_list=option_list))

if(is.null(opt$sfile) || is.null(opt$efile))
quit()

#generate colors function
generateColors <- function(siteNames,column) {
    
    #generate the indices
    colInd<-""
    
    if(column==0){
        colInd<-rep("blue",length(siteNames))
    }
    else{
        #first split the names with respect to underscore and construct a parsedString matrix
        for(i in seq(1:length(siteNames)))
        {
            tmp<-t(data.matrix(unlist(strsplit(siteNames[i],"_"))))
            if(column>length(tmp)){
                column<-length(tmp)
            }
            if(i==1){
                parsedString<-tmp
            }
            else{
                parsedString<-rbind(parsedString,tmp)
            }
        }
        #generate template colors
        crp<- colorRampPalette(c("blue", "#007FFF", "cyan","#7FFF7F", "yellow", "#FF7F00", "red"),space = "Lab")(length(unique(parsedString[,column])))        
        for(i in seq(1:length(siteNames)))
        {
            colInd[i]<-crp[match(parsedString[i,column],unique(parsedString[,column]))]
        }
    }
    return(colInd)
}


#Importing data now
AS_C <-read.csv(opt$sfile,header=TRUE,row.names=1)
Env <- read.csv(opt$efile,header=TRUE,row.names=1)

#transpose the data
AS <- t(AS_C)

#filter the data for reads smaller than a particular number
AS<-AS[rowSums(AS)>opt$mreads,]
Env<- Env[rownames(AS),]

#use complete.cases function to filter out those rows that have NA entry in Env
Env<-Env[complete.cases(Env),]
AS<-AS[rownames(Env),]


# The new functions are given below and implement the following algorithms: 
# Clarke, K. R & Ainsworth, M. 1993. A method of linking multivariate community structure to environmental variables. Marine Ecology Progress Series, 92, 205-219.
# Clarke, K. R., Gorley, R. N., 2001. PRIMER v5: User Manual/Tutorial. PRIMER-E, Plymouth, UK.
# Clarke, K. R., Warwick, R. M., 2001. Changes in Marine Communities: An Approach to Statistical Analysis and Interpretation, 2nd edition. PRIMER-E Ltd, Plymouth, UK.
# Clarke, K. R., Warwick, R. M., 1998. Quantifying structural redundancy in ecological communities. Oecologia, 113:278-289. 

bv.step <- function(fix.mat, var.mat, 
                    fix.dist.method="bray", var.dist.method="euclidean", correlation.method="spearman",
                    scale.fix=FALSE, scale.var=TRUE,
                    max.rho=0.95,
                    min.delta.rho=0.001,
                    random.selection=TRUE,
                    prop.selected.var=0.2,
                    num.restarts=10,
                    var.always.include=NULL,
                    var.exclude=NULL,
                    output.best=10
){
  
  if(dim(fix.mat)[1] != dim(var.mat)[1]){stop("fixed and variable matrices must have the same number of rows")}
  if(sum(var.always.include %in% var.exclude) > 0){stop("var.always.include and var.exclude share a variable")}
  require(vegan)
  
  if(scale.fix){fix.mat<-scale(fix.mat)}else{fix.mat<-fix.mat}
  if(scale.var){var.mat<-scale(var.mat)}else{var.mat<-var.mat}
  
  fix.dist <- vegdist(as.matrix(fix.mat), method=fix.dist.method)
  
  #an initial removal phase
  var.dist.full <- vegdist(as.matrix(var.mat), method=var.dist.method)
  full.cor <- suppressWarnings(cor.test(fix.dist, var.dist.full, method=correlation.method))$estimate
  var.comb <- combn(1:ncol(var.mat), ncol(var.mat)-1)
  RES <- data.frame(var.excl=rep(NA,ncol(var.comb)), n.var=ncol(var.mat)-1, rho=NA)
  for(i in 1:dim(var.comb)[2]){
    var.dist <- vegdist(as.matrix(var.mat[,var.comb[,i]]), method=var.dist.method)
    temp <- suppressWarnings(cor.test(fix.dist, var.dist, method=correlation.method))
    RES$var.excl[i] <- c(1:ncol(var.mat))[-var.comb[,i]]
    RES$rho[i] <- temp$estimate
  }
  delta.rho <- RES$rho - full.cor
  exclude <- sort(unique(c(RES$var.excl[which(abs(delta.rho) < min.delta.rho)], var.exclude)))
  
  if(random.selection){
    num.restarts=num.restarts
    prop.selected.var=prop.selected.var
    prob<-rep(1,ncol(var.mat))
    if(prop.selected.var< 1){
      prob[exclude]<-0
    }
    n.selected.var <- min(sum(prob),prop.selected.var*dim(var.mat)[2])
  } else {
    num.restarts=1
    prop.selected.var=1  
    prob<-rep(1,ncol(var.mat))
    n.selected.var <- min(sum(prob),prop.selected.var*dim(var.mat)[2])
  }
  
  RES_TOT <- c()
  for(i in 1:num.restarts){
    step=1
    RES <- data.frame(step=step, step.dir="F", var.incl=NA, n.var=0, rho=0)
    attr(RES$step.dir, "levels") <- c("F","B")
    best.comb <- which.max(RES$rho)
    best.rho <- RES$rho[best.comb]
    delta.rho <- Inf
    selected.var <- sort(unique(c(sample(1:dim(var.mat)[2], n.selected.var, prob=prob), var.always.include)))
    while(best.rho < max.rho & delta.rho > min.delta.rho & RES$n.var[best.comb] < length(selected.var)){
      #forward step
      step.dir="F"
      step=step+1
      var.comb <- combn(selected.var, RES$n.var[best.comb]+1, simplify=FALSE)
      if(RES$n.var[best.comb] == 0){
        var.comb.incl<-1:length(var.comb)
      } else {
        var.keep <- as.numeric(unlist(strsplit(RES$var.incl[best.comb], ",")))
        temp <- NA*1:length(var.comb)
        for(j in 1:length(temp)){
          temp[j] <- all(var.keep %in% var.comb[[j]]) 
        }
        var.comb.incl <- which(temp==1)
      }
      
      RES.f <- data.frame(step=rep(step, length(var.comb.incl)), step.dir=step.dir, var.incl=NA, n.var=RES$n.var[best.comb]+1, rho=NA)
      for(f in 1:length(var.comb.incl)){
        var.incl <- var.comb[[var.comb.incl[f]]]
        var.incl <- var.incl[order(var.incl)]
        var.dist <- vegdist(as.matrix(var.mat[,var.incl]), method=var.dist.method)
        temp <- suppressWarnings(cor.test(fix.dist, var.dist, method=correlation.method))
        RES.f$var.incl[f] <- paste(var.incl, collapse=",")
        RES.f$rho[f] <- temp$estimate
      }
      
      last.F <- max(which(RES$step.dir=="F"))
      RES <- rbind(RES, RES.f[which.max(RES.f$rho),])
      best.comb <- which.max(RES$rho)
      delta.rho <- RES$rho[best.comb] - best.rho 
      best.rho <- RES$rho[best.comb]
      
      if(best.comb == step){
        while(best.comb == step & RES$n.var[best.comb] > 1){
          #backward step
          step.dir="B"
          step <- step+1
          var.keep <- as.numeric(unlist(strsplit(RES$var.incl[best.comb], ",")))
          var.comb <- combn(var.keep, RES$n.var[best.comb]-1, simplify=FALSE)
          RES.b <- data.frame(step=rep(step, length(var.comb)), step.dir=step.dir, var.incl=NA, n.var=RES$n.var[best.comb]-1, rho=NA)
          for(b in 1:length(var.comb)){
            var.incl <- var.comb[[b]]
            var.incl <- var.incl[order(var.incl)]
            var.dist <- vegdist(as.matrix(var.mat[,var.incl]), method=var.dist.method)
            temp <- suppressWarnings(cor.test(fix.dist, var.dist, method=correlation.method))
            RES.b$var.incl[b] <- paste(var.incl, collapse=",")
            RES.b$rho[b] <- temp$estimate
          }
          RES <- rbind(RES, RES.b[which.max(RES.b$rho),])
          best.comb <- which.max(RES$rho)
          best.rho<- RES$rho[best.comb]
        }
      } else {
        break()
      }
      
    }
    
    RES_TOT <- rbind(RES_TOT, RES[2:dim(RES)[1],])
    print(paste(round((i/num.restarts)*100,3), "% finished"))
  }
  
  RES_TOT <- unique(RES_TOT[,3:5])
  
  
  if(dim(RES_TOT)[1] > output.best){
    order.by.best <- RES_TOT[order(RES_TOT$rho, decreasing=TRUE)[1:output.best],]
  } else {
    order.by.best <-  RES_TOT[order(RES_TOT$rho, decreasing=TRUE), ]
  }
  rownames(order.by.best)<-NULL
  
  order.by.i.comb <- c()
  for(i in 1:length(selected.var)){
    f1 <- which(RES_TOT$n.var==i)
    f2 <- which.max(RES_TOT$rho[f1])
    order.by.i.comb <- rbind(order.by.i.comb, RES_TOT[f1[f2],])
  }
  rownames(order.by.i.comb)<-NULL
  
  if(length(exclude)<1){var.exclude=NULL} else {var.exclude=exclude}
  out <- list(
    order.by.best=order.by.best,
    order.by.i.comb=order.by.i.comb,
    best.model.vars=paste(colnames(var.mat)[as.numeric(unlist(strsplit(order.by.best$var.incl[1], ",")))], collapse=","),
    best.model.rho=order.by.best$rho[1],
    var.always.include=var.always.include,
    var.exclude=var.exclude
  )
  out
  
}

bio.env <- function(fix.mat, var.mat, 
                    fix.dist.method="bray", var.dist.method="euclidean", correlation.method="spearman",
                    scale.fix=FALSE, scale.var=TRUE,
                    output.best=10,
                    var.max=ncol(var.mat)
){
  if(dim(fix.mat)[1] != dim(var.mat)[1]){stop("fixed and variable matrices must have the same number of rows")}
  if(var.max > dim(var.mat)[2]){stop("var.max cannot be larger than the number of variables (columns) in var.mat")}
  
  require(vegan)
  
  combn.sum <- sum(factorial(ncol(var.mat))/(factorial(1:var.max)*factorial(ncol(var.mat)-1:var.max)))
  
  if(scale.fix){fix.mat<-scale(fix.mat)}else{fix.mat<-fix.mat}
  if(scale.var){var.mat<-scale(var.mat)}else{var.mat<-var.mat}
  fix.dist <- vegdist(fix.mat, method=fix.dist.method)
  RES_TOT <- c()
  best.i.comb <- c()
  iter <- 0
  for(i in 1:var.max){
    var.comb <- combn(1:ncol(var.mat), i, simplify=FALSE)
    RES <- data.frame(var.incl=rep(NA, length(var.comb)), n.var=i, rho=0)
    for(f in 1:length(var.comb)){
      iter <- iter+1
      var.dist <- vegdist(as.matrix(var.mat[,var.comb[[f]]]), method=var.dist.method)
      temp <- suppressWarnings(cor.test(fix.dist, var.dist, method=correlation.method))
      RES$var.incl[f] <- paste(var.comb[[f]], collapse=",")
      RES$rho[f] <- temp$estimate
      if(iter %% 100 == 0){print(paste(round(iter/combn.sum*100, 3), "% finished"))}
    }
    
    order.rho <- order(RES$rho, decreasing=TRUE)
    best.i.comb <- c(best.i.comb, RES$var.incl[order.rho[1]])
    if(length(order.rho) > output.best){
      RES_TOT <- rbind(RES_TOT, RES[order.rho[1:output.best],])
    } else {
      RES_TOT <- rbind(RES_TOT, RES)
    }
  }
  rownames(RES_TOT)<-NULL
  
  if(dim(RES_TOT)[1] > output.best){
    order.by.best <- order(RES_TOT$rho, decreasing=TRUE)[1:output.best]
  } else {
    order.by.best <- order(RES_TOT$rho, decreasing=TRUE)
  }
  OBB <- RES_TOT[order.by.best,]
  rownames(OBB) <- NULL
  
  order.by.i.comb <- match(best.i.comb, RES_TOT$var.incl)
  OBC <- RES_TOT[order.by.i.comb,]
  rownames(OBC) <- NULL
  
  out <- list(
    order.by.best=OBB,
    order.by.i.comb=OBC,
    best.model.vars=paste(colnames(var.mat)[as.numeric(unlist(strsplit(OBB$var.incl[1], ",")))], collapse=",") ,
    best.model.rho=OBB$rho[1]
  )
  out
}

fmethod<-switch(opt$fmethod,"euclidean","manhattan","gower","altGower","canberra","bray","kulczynski","morisita","horn","binomial","cao")
vmethod<-switch(opt$vmethod,"euclidean","manhattan","gower","altGower","canberra","bray","kulczynski","morisita","horn","binomial","cao")
nmethod<-switch(opt$nmethod,"euclidean","manhattan","gower","altGower","canberra","bray","kulczynski","morisita","horn","binomial","cao")
correlation<-switch(opt$correlation,"pearson","spearman","kendall")

res <- bio.env(wisconsin(AS), Env,fix.dist.method=fmethod, var.dist.method=vmethod, correlation.method=correlation,
               scale.fix=FALSE, scale.var=TRUE) 

#Save the 10 best environmental data correlations to file
envNames<-colnames(Env)
bestEnvFit<-""
for(i in (1:length(res$order.by.best$var.incl)))
{
  bestEnvFit[i]<-paste(paste(envNames[as.numeric(unlist(strsplit(res$order.by.best$var.incl[i], split=",")))],collapse=' + '), " = ",res$order.by.best$rho[i],sep="")
}
bestEnvFit<-data.frame(bestEnvFit)
colnames(bestEnvFit)<-"Best combination of environmental variables with similarity score"
write.table(bestEnvFit, file=paste(opt$opath,substr(basename(opt$sfile),1,nchar(basename(opt$sfile))-4),"_BIOENV_ENV.csv",sep=""),sep=",",quote=FALSE)


res.bv.step.biobio <- bv.step(wisconsin(AS), wisconsin(AS), 
                              fix.dist.method=fmethod, var.dist.method=vmethod,correlation.method=correlation,
                              scale.fix=FALSE, scale.var=FALSE, 
                              max.rho=0.95, min.delta.rho=0.001,
                              random.selection=TRUE,
                              prop.selected.var=0.3,
                              num.restarts=opt$nrestarts,
                              output.best=10,
                              var.always.include=NULL) 

#Save the 10 best species abundance data correlations to file
taxaNames<-colnames(AS)
bestTaxaFit<-""
for(i in (1:length(res.bv.step.biobio$order.by.best$var.incl)))
{
  bestTaxaFit[i]<-paste(paste(taxaNames[as.numeric(unlist(strsplit(res.bv.step.biobio$order.by.best$var.incl[i], split=",")))],collapse=' + '), " = ",res.bv.step.biobio$order.by.best$rho[i],sep="")
}
bestTaxaFit<-data.frame(bestTaxaFit)
colnames(bestTaxaFit)<-"Best combination of taxa with similarity score"
write.table(bestTaxaFit, file=paste(opt$opath,substr(basename(opt$sfile),1,nchar(basename(opt$sfile))-4),"_BIOENV_TAXA.csv",sep=""),sep=",",quote=FALSE)


MDS_res=metaMDS(AS, distance = nmethod, k = 2, trymax = 50)
MDS_res.nmds.plot <- plot(MDS_res, dis = "sites", type = "n")

bio.keep <- as.numeric(unlist(strsplit(res.bv.step.biobio$order.by.best$var.incl[1], ",")))
bio.fit <- envfit(MDS_res, AS[,bio.keep], perm = 999) 

#write the biofit data to file
biodata<-data.frame(data.frame(bio.fit$vectors$r),data.frame(bio.fit$vectors$pvals))
colnames(biodata)<-c("Correlation^2","P-values")
write.table(biodata, file=paste(opt$opath,substr(basename(opt$sfile),1,nchar(basename(opt$sfile))-4),"_BIOENV_BESTTAXA.csv",sep=""),sep=",",quote=FALSE)


#use the best set of environmental variables in env.keep
eval(parse(text=paste("env.keep <- c(",res$order.by.best$var.incl[1],")",sep="")))

env.fit <- envfit(MDS_res, Env[,env.keep], perm = 999) 

#write the envdata to a file
envdata<-data.frame(data.frame(env.fit$vectors$r),data.frame(env.fit$vectors$pvals))
colnames(envdata)<-c("Correlation^2","P-values")
write.table(envdata, file=paste(opt$opath,substr(basename(opt$sfile),1,nchar(basename(opt$sfile))-4),"_BIOENV_BESTENV.csv",sep=""),sep=",",quote=FALSE)


# save in a file
jpeg(filename = paste(opt$opath,substr(basename(opt$sfile),1,nchar(basename(opt$sfile))-4),"_BIOENV_NMDS_ENV.jpg",sep=""),width = as.numeric(opt$width), height = as.numeric(opt$height), quality=100)

par(mai=c(2,2,1,0.2)
    ,bg="white")

# first plot
plot(MDS_res$points, t="n",
cex.axis=as.numeric(opt$fsize)*0.6,
cex.lab=as.numeric(opt$fsize),
cex=as.numeric(opt$fsize)*0.7,
cex.main=as.numeric(opt$fsize),
xlim=c(min(MDS_res.nmds.plot$sites[,1])-1,max(MDS_res.nmds.plot$sites[,1]))+0.5,ylim=c(min(MDS_res.nmds.plot$sites[,2]),max(MDS_res.nmds.plot$sites[,2])),
xlab="NMDS1", ylab="NMDS2")
points(MDS_res.nmds.plot$sites[,1],MDS_res.nmds.plot$sites[,2],xlim=c(min(MDS_res.nmds.plot$sites[,1])-1,max(MDS_res.nmds.plot$sites[,1]))+0.5,ylim=c(min(MDS_res.nmds.plot$sites[,2]),max(MDS_res.nmds.plot$sites[,2])),col=generateColors(rownames(AS),opt$ccol), pch = 19,cex=as.numeric(opt$fsize)*0.7)
if(opt$dmode){
    textxy(MDS_res.nmds.plot$sites[,1],MDS_res.nmds.plot$sites[,2],rownames(MDS_res.nmds.plot$sites),cx=as.numeric(opt$fsize)*0.6)
}
plot(env.fit, col="black", 
lwd=as.numeric(opt$fsize)*1.5,
cex.axis=as.numeric(opt$fsize)*0.6,
cex.lab=as.numeric(opt$fsize),
cex=as.numeric(opt$fsize),
cex.main=as.numeric(opt$fsize))
dev.off()

# save in a file
jpeg(filename = paste(opt$opath,substr(basename(opt$sfile),1,nchar(basename(opt$sfile))-4),"_BIOENV_NMDS_TAXA.jpg",sep=""),width = as.numeric(opt$width), height = as.numeric(opt$height), quality=100)

par(mai=c(2,2,1,0.2)
,bg="white")

#second plot
plot(MDS_res$points, t="n",
cex.axis=as.numeric(opt$fsize)*0.6,
cex.lab=as.numeric(opt$fsize),
cex=as.numeric(opt$fsize)*0.7,
cex.main=as.numeric(opt$fsize),
xlim=c(min(MDS_res.nmds.plot$sites[,1])-1,max(MDS_res.nmds.plot$sites[,1]))+0.5,ylim=c(min(MDS_res.nmds.plot$sites[,2]),max(MDS_res.nmds.plot$sites[,2])),
xlab="NMDS1", ylab="NMDS2")
points(MDS_res.nmds.plot$sites[,1],MDS_res.nmds.plot$sites[,2],xlim=c(min(MDS_res.nmds.plot$sites[,1])-1,max(MDS_res.nmds.plot$sites[,1]))+0.5,ylim=c(min(MDS_res.nmds.plot$sites[,2]),max(MDS_res.nmds.plot$sites[,2])),col=generateColors(rownames(AS),opt$ccol), pch = 19,cex=as.numeric(opt$fsize)*0.7)
if(opt$dmode){
    textxy(MDS_res.nmds.plot$sites[,1],MDS_res.nmds.plot$sites[,2],rownames(MDS_res.nmds.plot$sites),cx=as.numeric(opt$fsize)*0.6)
}
plot(bio.fit,col="black", 
lwd=as.numeric(opt$fsize)*1.5,
cex.axis=as.numeric(opt$fsize)*0.6,
cex.lab=as.numeric(opt$fsize),
cex=as.numeric(opt$fsize),
cex.main=as.numeric(opt$fsize))
dev.off()