#!/bin/Rscript # *************************************************************** # Name: taxo_NMDS_bioenv.R # Purpose: This R script is an extension of vegan library's bioenv() # function and uses the bio.env() and bio.step() of # http://menugget.blogspot.co.uk/2011/06/clarke-and-ainsworths-bioenv-and-bvstep.html # The original author suggested these functions to overcome # the inflexibility of the bioenv() function which uses # a similarity matrix based on normalized "euclidean" distance. # This script finds the best subset of species and environmental variables and plots them on NMDS plot # Version: 0.2 # History: 0.1 to 0.2 Automatically remove NA entries # Authors: Umer Zeeshan Ijaz (Umer.Ijaz@glasgow.ac.uk) # http://userweb.eng.gla.ac.uk/umer.ijaz # Christopher Quince (Christopher.Quince@glasgow.ac.uk) # http://userweb.eng.gla.ac.uk/christopher.quince # Created: 2012-09-10 # License: Copyright (c) 2012 Computational Microbial Genomics Group, University of Glasgow, UK # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . # **************************************************************/ suppressPackageStartupMessages(library("optparse")) suppressPackageStartupMessages(library("calibrate")) suppressPackageStartupMessages(library("vegan")) suppressPackageStartupMessages(library("RColorBrewer")) suppressPackageStartupMessages(library("gplots")) #specify desired options in a list option_list <- list( make_option("--sfile", action="store",default=NULL, help="Species abundance file"), make_option("--efile", action="store",default=NULL, help="environmental file"), make_option("--opath", action="store",default=NULL, help="Output path"), make_option("--fsize", action="store", default="1.2", help="Font size [default %default]"), make_option("--width", type="integer",default=800, help="Width of jpeg files [default %default]"), make_option("--height", type="integer",default=800, help="Height of jpeg files [default %default]"), make_option("--mreads", type="integer",default=200, help="Minimum reads [default %default]"), make_option("--nrestarts", type="integer",default=10, help="Number of restarts [default %default]"), make_option("--ccol", type="integer",default=1, help="Minimum reads [default %default]"), make_option("--correlation", type="integer",default=1, help="Correlation to use: 1=pearson, 2=spearman, 3=kendall [default %default]"), make_option("--fmethod", type="integer",default=6, help="Fixed distance method: 1=euclidean, 2=manhattan, 3=gower, 4=altGower, 5=canberra, 6=bray, 7=kulczynski, 8=morisita,9=horn, 10=binomial, 11=cao [default %default]"), make_option("--vmethod", type="integer",default=6, help="Variable distance method: 1=euclidean, 2=manhattan, 3=gower, 4=altGower, 5=canberra, 6=bray, 7=kulczynski, 8=morisita,9=horn, 10=binomial, 11=cao [default %default]"), make_option("--nmethod", type="integer",default=6, help="NMDS distance method: 1=euclidean, 2=manhattan, 3=gower, 4=altGower, 5=canberra, 6=bray, 7=kulczynski, 8=morisita,9=horn, 10=binomial, 11=cao [default %default]"), make_option("--dmode", action="store_true",default=FALSE, help="Site names display modes [default %default]") ) #get command line options opt<-parse_args(OptionParser(usage="%prog [options] file", option_list=option_list)) if(is.null(opt$sfile) || is.null(opt$efile)) quit() #generate colors function generateColors <- function(siteNames,column) { #generate the indices colInd<-"" if(column==0){ colInd<-rep("blue",length(siteNames)) } else{ #first split the names with respect to underscore and construct a parsedString matrix for(i in seq(1:length(siteNames))) { tmp<-t(data.matrix(unlist(strsplit(siteNames[i],"_")))) if(column>length(tmp)){ column<-length(tmp) } if(i==1){ parsedString<-tmp } else{ parsedString<-rbind(parsedString,tmp) } } #generate template colors crp<- colorRampPalette(c("blue", "#007FFF", "cyan","#7FFF7F", "yellow", "#FF7F00", "red"),space = "Lab")(length(unique(parsedString[,column]))) for(i in seq(1:length(siteNames))) { colInd[i]<-crp[match(parsedString[i,column],unique(parsedString[,column]))] } } return(colInd) } #Importing data now AS_C <-read.csv(opt$sfile,header=TRUE,row.names=1) Env <- read.csv(opt$efile,header=TRUE,row.names=1) #transpose the data AS <- t(AS_C) #filter the data for reads smaller than a particular number AS<-AS[rowSums(AS)>opt$mreads,] Env<- Env[rownames(AS),] #use complete.cases function to filter out those rows that have NA entry in Env Env<-Env[complete.cases(Env),] AS<-AS[rownames(Env),] # The new functions are given below and implement the following algorithms: # Clarke, K. R & Ainsworth, M. 1993. A method of linking multivariate community structure to environmental variables. Marine Ecology Progress Series, 92, 205-219. # Clarke, K. R., Gorley, R. N., 2001. PRIMER v5: User Manual/Tutorial. PRIMER-E, Plymouth, UK. # Clarke, K. R., Warwick, R. M., 2001. Changes in Marine Communities: An Approach to Statistical Analysis and Interpretation, 2nd edition. PRIMER-E Ltd, Plymouth, UK. # Clarke, K. R., Warwick, R. M., 1998. Quantifying structural redundancy in ecological communities. Oecologia, 113:278-289. bv.step <- function(fix.mat, var.mat, fix.dist.method="bray", var.dist.method="euclidean", correlation.method="spearman", scale.fix=FALSE, scale.var=TRUE, max.rho=0.95, min.delta.rho=0.001, random.selection=TRUE, prop.selected.var=0.2, num.restarts=10, var.always.include=NULL, var.exclude=NULL, output.best=10 ){ if(dim(fix.mat)[1] != dim(var.mat)[1]){stop("fixed and variable matrices must have the same number of rows")} if(sum(var.always.include %in% var.exclude) > 0){stop("var.always.include and var.exclude share a variable")} require(vegan) if(scale.fix){fix.mat<-scale(fix.mat)}else{fix.mat<-fix.mat} if(scale.var){var.mat<-scale(var.mat)}else{var.mat<-var.mat} fix.dist <- vegdist(as.matrix(fix.mat), method=fix.dist.method) #an initial removal phase var.dist.full <- vegdist(as.matrix(var.mat), method=var.dist.method) full.cor <- suppressWarnings(cor.test(fix.dist, var.dist.full, method=correlation.method))$estimate var.comb <- combn(1:ncol(var.mat), ncol(var.mat)-1) RES <- data.frame(var.excl=rep(NA,ncol(var.comb)), n.var=ncol(var.mat)-1, rho=NA) for(i in 1:dim(var.comb)[2]){ var.dist <- vegdist(as.matrix(var.mat[,var.comb[,i]]), method=var.dist.method) temp <- suppressWarnings(cor.test(fix.dist, var.dist, method=correlation.method)) RES$var.excl[i] <- c(1:ncol(var.mat))[-var.comb[,i]] RES$rho[i] <- temp$estimate } delta.rho <- RES$rho - full.cor exclude <- sort(unique(c(RES$var.excl[which(abs(delta.rho) < min.delta.rho)], var.exclude))) if(random.selection){ num.restarts=num.restarts prop.selected.var=prop.selected.var prob<-rep(1,ncol(var.mat)) if(prop.selected.var< 1){ prob[exclude]<-0 } n.selected.var <- min(sum(prob),prop.selected.var*dim(var.mat)[2]) } else { num.restarts=1 prop.selected.var=1 prob<-rep(1,ncol(var.mat)) n.selected.var <- min(sum(prob),prop.selected.var*dim(var.mat)[2]) } RES_TOT <- c() for(i in 1:num.restarts){ step=1 RES <- data.frame(step=step, step.dir="F", var.incl=NA, n.var=0, rho=0) attr(RES$step.dir, "levels") <- c("F","B") best.comb <- which.max(RES$rho) best.rho <- RES$rho[best.comb] delta.rho <- Inf selected.var <- sort(unique(c(sample(1:dim(var.mat)[2], n.selected.var, prob=prob), var.always.include))) while(best.rho < max.rho & delta.rho > min.delta.rho & RES$n.var[best.comb] < length(selected.var)){ #forward step step.dir="F" step=step+1 var.comb <- combn(selected.var, RES$n.var[best.comb]+1, simplify=FALSE) if(RES$n.var[best.comb] == 0){ var.comb.incl<-1:length(var.comb) } else { var.keep <- as.numeric(unlist(strsplit(RES$var.incl[best.comb], ","))) temp <- NA*1:length(var.comb) for(j in 1:length(temp)){ temp[j] <- all(var.keep %in% var.comb[[j]]) } var.comb.incl <- which(temp==1) } RES.f <- data.frame(step=rep(step, length(var.comb.incl)), step.dir=step.dir, var.incl=NA, n.var=RES$n.var[best.comb]+1, rho=NA) for(f in 1:length(var.comb.incl)){ var.incl <- var.comb[[var.comb.incl[f]]] var.incl <- var.incl[order(var.incl)] var.dist <- vegdist(as.matrix(var.mat[,var.incl]), method=var.dist.method) temp <- suppressWarnings(cor.test(fix.dist, var.dist, method=correlation.method)) RES.f$var.incl[f] <- paste(var.incl, collapse=",") RES.f$rho[f] <- temp$estimate } last.F <- max(which(RES$step.dir=="F")) RES <- rbind(RES, RES.f[which.max(RES.f$rho),]) best.comb <- which.max(RES$rho) delta.rho <- RES$rho[best.comb] - best.rho best.rho <- RES$rho[best.comb] if(best.comb == step){ while(best.comb == step & RES$n.var[best.comb] > 1){ #backward step step.dir="B" step <- step+1 var.keep <- as.numeric(unlist(strsplit(RES$var.incl[best.comb], ","))) var.comb <- combn(var.keep, RES$n.var[best.comb]-1, simplify=FALSE) RES.b <- data.frame(step=rep(step, length(var.comb)), step.dir=step.dir, var.incl=NA, n.var=RES$n.var[best.comb]-1, rho=NA) for(b in 1:length(var.comb)){ var.incl <- var.comb[[b]] var.incl <- var.incl[order(var.incl)] var.dist <- vegdist(as.matrix(var.mat[,var.incl]), method=var.dist.method) temp <- suppressWarnings(cor.test(fix.dist, var.dist, method=correlation.method)) RES.b$var.incl[b] <- paste(var.incl, collapse=",") RES.b$rho[b] <- temp$estimate } RES <- rbind(RES, RES.b[which.max(RES.b$rho),]) best.comb <- which.max(RES$rho) best.rho<- RES$rho[best.comb] } } else { break() } } RES_TOT <- rbind(RES_TOT, RES[2:dim(RES)[1],]) print(paste(round((i/num.restarts)*100,3), "% finished")) } RES_TOT <- unique(RES_TOT[,3:5]) if(dim(RES_TOT)[1] > output.best){ order.by.best <- RES_TOT[order(RES_TOT$rho, decreasing=TRUE)[1:output.best],] } else { order.by.best <- RES_TOT[order(RES_TOT$rho, decreasing=TRUE), ] } rownames(order.by.best)<-NULL order.by.i.comb <- c() for(i in 1:length(selected.var)){ f1 <- which(RES_TOT$n.var==i) f2 <- which.max(RES_TOT$rho[f1]) order.by.i.comb <- rbind(order.by.i.comb, RES_TOT[f1[f2],]) } rownames(order.by.i.comb)<-NULL if(length(exclude)<1){var.exclude=NULL} else {var.exclude=exclude} out <- list( order.by.best=order.by.best, order.by.i.comb=order.by.i.comb, best.model.vars=paste(colnames(var.mat)[as.numeric(unlist(strsplit(order.by.best$var.incl[1], ",")))], collapse=","), best.model.rho=order.by.best$rho[1], var.always.include=var.always.include, var.exclude=var.exclude ) out } bio.env <- function(fix.mat, var.mat, fix.dist.method="bray", var.dist.method="euclidean", correlation.method="spearman", scale.fix=FALSE, scale.var=TRUE, output.best=10, var.max=ncol(var.mat) ){ if(dim(fix.mat)[1] != dim(var.mat)[1]){stop("fixed and variable matrices must have the same number of rows")} if(var.max > dim(var.mat)[2]){stop("var.max cannot be larger than the number of variables (columns) in var.mat")} require(vegan) combn.sum <- sum(factorial(ncol(var.mat))/(factorial(1:var.max)*factorial(ncol(var.mat)-1:var.max))) if(scale.fix){fix.mat<-scale(fix.mat)}else{fix.mat<-fix.mat} if(scale.var){var.mat<-scale(var.mat)}else{var.mat<-var.mat} fix.dist <- vegdist(fix.mat, method=fix.dist.method) RES_TOT <- c() best.i.comb <- c() iter <- 0 for(i in 1:var.max){ var.comb <- combn(1:ncol(var.mat), i, simplify=FALSE) RES <- data.frame(var.incl=rep(NA, length(var.comb)), n.var=i, rho=0) for(f in 1:length(var.comb)){ iter <- iter+1 var.dist <- vegdist(as.matrix(var.mat[,var.comb[[f]]]), method=var.dist.method) temp <- suppressWarnings(cor.test(fix.dist, var.dist, method=correlation.method)) RES$var.incl[f] <- paste(var.comb[[f]], collapse=",") RES$rho[f] <- temp$estimate if(iter %% 100 == 0){print(paste(round(iter/combn.sum*100, 3), "% finished"))} } order.rho <- order(RES$rho, decreasing=TRUE) best.i.comb <- c(best.i.comb, RES$var.incl[order.rho[1]]) if(length(order.rho) > output.best){ RES_TOT <- rbind(RES_TOT, RES[order.rho[1:output.best],]) } else { RES_TOT <- rbind(RES_TOT, RES) } } rownames(RES_TOT)<-NULL if(dim(RES_TOT)[1] > output.best){ order.by.best <- order(RES_TOT$rho, decreasing=TRUE)[1:output.best] } else { order.by.best <- order(RES_TOT$rho, decreasing=TRUE) } OBB <- RES_TOT[order.by.best,] rownames(OBB) <- NULL order.by.i.comb <- match(best.i.comb, RES_TOT$var.incl) OBC <- RES_TOT[order.by.i.comb,] rownames(OBC) <- NULL out <- list( order.by.best=OBB, order.by.i.comb=OBC, best.model.vars=paste(colnames(var.mat)[as.numeric(unlist(strsplit(OBB$var.incl[1], ",")))], collapse=",") , best.model.rho=OBB$rho[1] ) out } fmethod<-switch(opt$fmethod,"euclidean","manhattan","gower","altGower","canberra","bray","kulczynski","morisita","horn","binomial","cao") vmethod<-switch(opt$vmethod,"euclidean","manhattan","gower","altGower","canberra","bray","kulczynski","morisita","horn","binomial","cao") nmethod<-switch(opt$nmethod,"euclidean","manhattan","gower","altGower","canberra","bray","kulczynski","morisita","horn","binomial","cao") correlation<-switch(opt$correlation,"pearson","spearman","kendall") res <- bio.env(wisconsin(AS), Env,fix.dist.method=fmethod, var.dist.method=vmethod, correlation.method=correlation, scale.fix=FALSE, scale.var=TRUE) #Save the 10 best environmental data correlations to file envNames<-colnames(Env) bestEnvFit<-"" for(i in (1:length(res$order.by.best$var.incl))) { bestEnvFit[i]<-paste(paste(envNames[as.numeric(unlist(strsplit(res$order.by.best$var.incl[i], split=",")))],collapse=' + '), " = ",res$order.by.best$rho[i],sep="") } bestEnvFit<-data.frame(bestEnvFit) colnames(bestEnvFit)<-"Best combination of environmental variables with similarity score" write.table(bestEnvFit, file=paste(opt$opath,substr(basename(opt$sfile),1,nchar(basename(opt$sfile))-4),"_BIOENV_ENV.csv",sep=""),sep=",",quote=FALSE) res.bv.step.biobio <- bv.step(wisconsin(AS), wisconsin(AS), fix.dist.method=fmethod, var.dist.method=vmethod,correlation.method=correlation, scale.fix=FALSE, scale.var=FALSE, max.rho=0.95, min.delta.rho=0.001, random.selection=TRUE, prop.selected.var=0.3, num.restarts=opt$nrestarts, output.best=10, var.always.include=NULL) #Save the 10 best species abundance data correlations to file taxaNames<-colnames(AS) bestTaxaFit<-"" for(i in (1:length(res.bv.step.biobio$order.by.best$var.incl))) { bestTaxaFit[i]<-paste(paste(taxaNames[as.numeric(unlist(strsplit(res.bv.step.biobio$order.by.best$var.incl[i], split=",")))],collapse=' + '), " = ",res.bv.step.biobio$order.by.best$rho[i],sep="") } bestTaxaFit<-data.frame(bestTaxaFit) colnames(bestTaxaFit)<-"Best combination of taxa with similarity score" write.table(bestTaxaFit, file=paste(opt$opath,substr(basename(opt$sfile),1,nchar(basename(opt$sfile))-4),"_BIOENV_TAXA.csv",sep=""),sep=",",quote=FALSE) MDS_res=metaMDS(AS, distance = nmethod, k = 2, trymax = 50) MDS_res.nmds.plot <- plot(MDS_res, dis = "sites", type = "n") bio.keep <- as.numeric(unlist(strsplit(res.bv.step.biobio$order.by.best$var.incl[1], ","))) bio.fit <- envfit(MDS_res, AS[,bio.keep], perm = 999) #write the biofit data to file biodata<-data.frame(data.frame(bio.fit$vectors$r),data.frame(bio.fit$vectors$pvals)) colnames(biodata)<-c("Correlation^2","P-values") write.table(biodata, file=paste(opt$opath,substr(basename(opt$sfile),1,nchar(basename(opt$sfile))-4),"_BIOENV_BESTTAXA.csv",sep=""),sep=",",quote=FALSE) #use the best set of environmental variables in env.keep eval(parse(text=paste("env.keep <- c(",res$order.by.best$var.incl[1],")",sep=""))) env.fit <- envfit(MDS_res, Env[,env.keep], perm = 999) #write the envdata to a file envdata<-data.frame(data.frame(env.fit$vectors$r),data.frame(env.fit$vectors$pvals)) colnames(envdata)<-c("Correlation^2","P-values") write.table(envdata, file=paste(opt$opath,substr(basename(opt$sfile),1,nchar(basename(opt$sfile))-4),"_BIOENV_BESTENV.csv",sep=""),sep=",",quote=FALSE) # save in a file jpeg(filename = paste(opt$opath,substr(basename(opt$sfile),1,nchar(basename(opt$sfile))-4),"_BIOENV_NMDS_ENV.jpg",sep=""),width = as.numeric(opt$width), height = as.numeric(opt$height), quality=100) par(mai=c(2,2,1,0.2) ,bg="white") # first plot plot(MDS_res$points, t="n", cex.axis=as.numeric(opt$fsize)*0.6, cex.lab=as.numeric(opt$fsize), cex=as.numeric(opt$fsize)*0.7, cex.main=as.numeric(opt$fsize), xlim=c(min(MDS_res.nmds.plot$sites[,1])-1,max(MDS_res.nmds.plot$sites[,1]))+0.5,ylim=c(min(MDS_res.nmds.plot$sites[,2]),max(MDS_res.nmds.plot$sites[,2])), xlab="NMDS1", ylab="NMDS2") points(MDS_res.nmds.plot$sites[,1],MDS_res.nmds.plot$sites[,2],xlim=c(min(MDS_res.nmds.plot$sites[,1])-1,max(MDS_res.nmds.plot$sites[,1]))+0.5,ylim=c(min(MDS_res.nmds.plot$sites[,2]),max(MDS_res.nmds.plot$sites[,2])),col=generateColors(rownames(AS),opt$ccol), pch = 19,cex=as.numeric(opt$fsize)*0.7) if(opt$dmode){ textxy(MDS_res.nmds.plot$sites[,1],MDS_res.nmds.plot$sites[,2],rownames(MDS_res.nmds.plot$sites),cx=as.numeric(opt$fsize)*0.6) } plot(env.fit, col="black", lwd=as.numeric(opt$fsize)*1.5, cex.axis=as.numeric(opt$fsize)*0.6, cex.lab=as.numeric(opt$fsize), cex=as.numeric(opt$fsize), cex.main=as.numeric(opt$fsize)) dev.off() # save in a file jpeg(filename = paste(opt$opath,substr(basename(opt$sfile),1,nchar(basename(opt$sfile))-4),"_BIOENV_NMDS_TAXA.jpg",sep=""),width = as.numeric(opt$width), height = as.numeric(opt$height), quality=100) par(mai=c(2,2,1,0.2) ,bg="white") #second plot plot(MDS_res$points, t="n", cex.axis=as.numeric(opt$fsize)*0.6, cex.lab=as.numeric(opt$fsize), cex=as.numeric(opt$fsize)*0.7, cex.main=as.numeric(opt$fsize), xlim=c(min(MDS_res.nmds.plot$sites[,1])-1,max(MDS_res.nmds.plot$sites[,1]))+0.5,ylim=c(min(MDS_res.nmds.plot$sites[,2]),max(MDS_res.nmds.plot$sites[,2])), xlab="NMDS1", ylab="NMDS2") points(MDS_res.nmds.plot$sites[,1],MDS_res.nmds.plot$sites[,2],xlim=c(min(MDS_res.nmds.plot$sites[,1])-1,max(MDS_res.nmds.plot$sites[,1]))+0.5,ylim=c(min(MDS_res.nmds.plot$sites[,2]),max(MDS_res.nmds.plot$sites[,2])),col=generateColors(rownames(AS),opt$ccol), pch = 19,cex=as.numeric(opt$fsize)*0.7) if(opt$dmode){ textxy(MDS_res.nmds.plot$sites[,1],MDS_res.nmds.plot$sites[,2],rownames(MDS_res.nmds.plot$sites),cx=as.numeric(opt$fsize)*0.6) } plot(bio.fit,col="black", lwd=as.numeric(opt$fsize)*1.5, cex.axis=as.numeric(opt$fsize)*0.6, cex.lab=as.numeric(opt$fsize), cex=as.numeric(opt$fsize), cex.main=as.numeric(opt$fsize)) dev.off()