#!/bin/Rscript # *************************************************************** # Name: taxo_spatial_temporal_trends.R # Purpose: This script plots figures for temporal as well as spatial species abundance data. # The species are ranked in order of decreasing frequency occurrence for sample, # with sample names above each column. The lines connect the species between the samples. # Line color is assigned based on the ranked frequency occurrence values for species in the # first sample to allow better identification of a species across different samples. # Line width is also in proportion to the starting frequency occurrence value for each species at # each sample step. The legend on the bottom indicates the frequency occurrence values for # different line widths. # There are two pre-processing steps provided in this script (--pmethod), you can either obtain relative abundances # or rarefy the data to minimum rowsums as provided by the rrarefy() function in vegan's package # Furthermore, using --ntaxa switch, you can restrict the figures to display only most abundant taxa # If there exist subgroups within the dataset, then you can parse the site names and specify the column (--ccol) to # form the group by specifying underscore as a delimeter. # After having generated the figures, if you want to draw only specific taxa then provide there names # in a comma separated manner using --flist switch. For example --flist="C1815,C1161,C1474" # Version: 0.1 # Authors: Umer Zeeshan Ijaz (Umer.Ijaz@glasgow.ac.uk) # http://userweb.eng.gla.ac.uk/umer.ijaz # Created: 2013-04-03 # License: Copyright (c) 2013 Computational Microbial Genomics Group, University of Glasgow, UK # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . # **************************************************************/ suppressPackageStartupMessages(library("optparse")) suppressPackageStartupMessages(library("vegan")) suppressPackageStartupMessages(library("RColorBrewer")) suppressPackageStartupMessages(library("scales")) #specify desired options in a list option_list <- list( make_option("--ifile", action="store",default=NULL, help="input CSV file"), make_option("--opath", action="store",default=NULL, help="output path"), make_option("--pmethod", type="integer",default=1, help="1 = relative abundance, 2 = rarefy to minimum rowsum [default %default]"), make_option("--offset", action="store", default="0.05", help="taxa offset (useful when the names dont fit in the figure) [default %default]"), make_option("--mreads", type="integer",default=200, help="minimum reads [default %default]"), make_option("--ntaxa", type="integer",default=50, help="number of most abundant taxa [default %default]"), make_option("--dim", type="integer",default=1200, help="dimensions of output files [default %default]"), make_option("--ccol", type="integer",default=0, help="column number using underscore as delimeter [default %default]"), make_option("--flist", action="store",default=NULL, help="list of taxa to be drawn"), make_option("--npoints", type="integer",default=5, help="number of spatial/temporal points to connect [default %default]")) #get command line options opt<-parse_args(OptionParser(usage="%prog [options] file", option_list=option_list)) if(is.null(opt$ifile)) quit() #first remove the previously generated files if(is.null(opt$opath)){ files<-list.files(path=".",pattern="*.jpg$") } else { files<-list.files(path=opt$opath,pattern="*.jpg$") } for(i in seq(1:length(files))) { file.remove(paste(opt$opath,files[i],sep="")) } #import species abundance data sp.dat <- read.csv(opt$ifile,header=TRUE,row.names=1) #site names parsing function groupNames <- function(siteNames,column) { #generate the indices colInd<-"" if(column==0){ colInd<-rep(1,length(siteNames)) } else{ #first split the names with respect to underscore and construct a parsedString matrix for(i in seq(1:length(siteNames))) { tmp<-t(data.matrix(unlist(strsplit(siteNames[i],"_")))) if(column>length(tmp)){ column<-length(tmp) } if(i==1){ parsedString<-tmp } else{ parsedString<-rbind(parsedString,tmp) } } for(i in seq(1:length(siteNames))) { colInd[i]<-as.numeric(match(parsedString[i,column],unique(parsedString[,column]))) } } return(colInd) } # plotting function # description of arguments are as follows: # x data frame or matrix with input data, first column is time step # x.locs minimum and maximum x coordinates for plotting region, from 0–1 # y.locs minimum and maximum y coordinates for plotting region, from 0–1 # steps character string of time steps to include in plot, default all # sp.names character string of species connections to display, default all # dt.tx logical value indicating if time steps are indicated in the plot # rsc logical value indicating if line widths are scaled proportionally to their value # ln.st numeric value for distance of lines from text labels, distance is determined automatically if not provided # rs.ln two-element numeric vector for rescaling line widths, defaults to one value if one element is supplied, default 3 to 15 # ln.cl character string indicating color of lines, can use multiple colors or input to brewer.pal, default ‘RdYlGn’ # alpha.val numeric value (0–1) indicating transparency of connections, default 0.7 # leg logical value for plotting legend, default T, values are for the original data frame, no change for species or step subsets # ... additional arguments to plot # ref: http://beckmw.wordpress.com/2013/04/01/a-nifty-line-plot-to-visualize-multivariate-time-series/ plot.qual<-function(x,x.locs=c(0.05,0.95),y.locs=c(0,1),steps=NULL,sp.names=NULL,dt.tx=T,rsc=T, ln.st=NULL,rs.ln=c(3,15),ln.cl='RdYlGn',alpha=0.7,leg=T,...){ if(length(x.locs) != 2 | length(y.locs) != 2) stop('x and y dimensions must be two-element vectors') if(x.locs[1]<0 | x.locs[2]>1 | y.locs[1]<0 | y.locs[2]>1) stop('x and y dimensions must in range of 0--1') dim.x<-c(0,1) #plot dims x dim.y<-c(0,1) #plot dims y wrn.val<-F x[,1]<-as.character(x[,1]) tot.sp<-ncol(x)-1 sp.col<-2:ncol(x) #rescale if T, sort legend for later sp.orig<-x[,sp.col] if(length(rs.ln)==1) rsc<-F if(rsc) x[,sp.col]<-rescale(x[,sp.col],rs.ln) if(rsc==F & leg) leg<-F #no legend if line widths aren't rescaled #reorder species columns, add rank as integer first.ord<-order(x[1,sp.col],decreasing=T) x[,sp.col]<-x[,sp.col][,first.ord] names(x)[sp.col]<-names(x)[sp.col][first.ord] names(x)[sp.col]<-paste0(1:tot.sp,sep='. ',names(x)[sp.col]) #list of spp by date, arranged in decreasing order for each date dt.dat.srt<-vector('list',nrow(x)) names(dt.dat.srt)<-x[,1] for(val in 1:nrow(x)){ tmp<-t(x[val,sp.col]) tmp<-tmp[order(tmp,decreasing=T),,drop=F] dt.dat.srt[[val]]<-tmp } #initiate plot object plot(dim.x,dim.y,type='n',axes=F,xlab='',ylab='',...) #subset for steps, if provided if(!is.null(steps)) dt.dat.srt<-dt.dat.srt[names(dt.dat.srt) %in% steps] #plot legend if(leg){ y.locs[1]<-0.05*diff(y.locs)+y.locs[1] leg.txt<-format(round(seq(min(sp.orig),max(sp.orig),length=5),2),nsmall=2,digits=2) leg.wds<-seq(rs.ln[1],rs.ln[2],length=5) legend('bottom',(y.locs[1]-y.olds)/2,col=alpha('black',alpha),lwd=leg.wds,legend=leg.txt,bty='n', horiz=T) } #x locations x.vals<-rep(seq(x.locs[1],x.locs[2],length=length(dt.dat.srt)),each=tot.sp) x.vals<-split(x.vals,x.vals) #y locations, rearranged in loop, exception if dates are plotted if(dt.tx) y.vals<-rev(seq(y.locs[1],y.locs[2],length=tot.sp+1))[-1] else y.vals<-rev(seq(y.locs[1],y.locs[2],length=tot.sp)) #get line colors if(length(ln.cl)==1) if(ln.cl %in% row.names(brewer.pal.info)){ pal.num<-brewer.pal.info[row.names(brewer.pal.info) == ln.cl,1] ln.cl<-brewer.pal(pal.num,ln.cl) } line.cols<-alpha(colorRampPalette(ln.cl)(tot.sp),alpha) #define distance of lines from labels if(is.null(ln.st)){ str.max<-max(strwidth(row.names(dt.dat.srt[[1]]))) if(diff(x.locs)-length(dt.dat.srt)*str.max < 0){ warning('not enough space for lines between columns') wrn.val<-T } else ln.st<-0.2*str.max + str.max/2 } for(val in 1:(length(dt.dat.srt)-1)){ #temp data to plot plt.tmp<-dt.dat.srt[c(val,val+1)] x.tmp<-x.vals[c(val,val+1)] #plot temp text for column text(x.tmp[[1]],y.vals,row.names(plt.tmp[[1]])) if(val == length(dt.dat.srt)-1){ text(x.tmp[[2]],y.vals,row.names(plt.tmp[[2]])) if(dt.tx){ dt.txt<-substitute(italic(x),list(x=names(plt.tmp)[2])) text(unique(x.tmp[[2]]),y.locs[2],dt.txt) } } if(dt.tx){ dt.txt<-substitute(italic(x),list(x=names(plt.tmp)[1])) text(unique(x.tmp[[1]]),y.locs[2],dt.txt) } srt.ln.y<-match(row.names(plt.tmp[[1]]),row.names(plt.tmp[[2]])) #if no line rescale, use first element of rs.ln if(rsc) lwd.val<-plt.tmp[[1]][,1] else lwd.val<-rep(rs.ln[1],tot.sp) #vector for species selection of line segments if(is.null(sp.names)) sel.sp<-rep(T,tot.sp) else{ sel.names<-unlist(lapply(strsplit(row.names(plt.tmp[[1]]),' '),function(x) x[2])) sel.sp<-(sel.names %in% sp.names) } #for lines if(!wrn.val) segments( x.tmp[[1]][sel.sp]+ln.st, y.vals[sel.sp], x.tmp[[2]][sel.sp]-ln.st, y.vals[srt.ln.y][sel.sp], col=line.cols[sel.sp], lwd=lwd.val[sel.sp] ) #resort color vector for next colummn srt.cl.y<-match(row.names(plt.tmp[[2]]),row.names(plt.tmp[[1]])) line.cols<-line.cols[srt.cl.y] } } #remove spaces from site names rownames(sp.dat) <- sub(" ", "_", rownames(sp.dat)) rownames(sp.dat) <- sub(" ", "_", rownames(sp.dat)) sp.dat<-t(sp.dat) #remove sites with minimum reads less than a specified number sp.dat<-sp.dat[rowSums(sp.dat)>opt$mreads,] #preprocess the data if(opt$pmethod==1){ #sp.dat<-exp(log((sp.dat+1)/(rowSums(sp.dat)+dim(sp.dat)[2]))) sp.dat<-sp.dat/rowSums(sp.dat) } else if(opt$pmethod==2){ sp.dat<-rrarefy(sp.dat,min(rowSums(sp.dat))) } #order the data based on colsums sp.dat<-sp.dat[,order(colSums(sp.dat),decreasing=TRUE)] #extract most abundant taxa sp.dat<-sp.dat[,1:min(opt$ntaxa,dim(sp.dat)[2])] #extract grouping information by parsing site names indices<-groupNames(rownames(sp.dat), opt$ccol) figureNumber<-1 for(i in seq(unique(indices))){ #extract site names based on unique indices extractedSamples<-rownames(sp.dat)[indices==i] if(length(extractedSamples)>1){ nameString<-NULL for(j in seq(1:length(extractedSamples))) { nameString<-rbind(nameString,extractedSamples[j]) #for avoiding congestion we need to split the sites to display only number of points if(j%%min(length(extractedSamples),opt$npoints)==0){ if(length(nameString)>1){ sp2.dat<-sp.dat[rownames(sp.dat) %in% nameString,] sp2.dat<-sp2.dat[order(rownames(sp2.dat)),] jpeg(filename = paste(opt$opath,substr(basename(opt$ifile),1,nchar(basename(opt$ifile))-4),"_STT_",figureNumber,".jpg",sep=""),width =max(as.numeric(opt$dim),as.numeric(opt$dim)*(length(nameString)%/%5)) , height = as.numeric(opt$dim), quality=100) par(mar=numeric(4),family='serif') if(is.null(opt$flist)){ plot.qual(data.frame(rownames(sp2.dat),sp2.dat),rs.ln=c(3,15),x.locs=c(as.numeric(opt$offset),1-as.numeric(opt$offset))) } else{ filteredList<-unlist(strsplit(opt$flist,split=",")) plot.qual(data.frame(rownames(sp2.dat),sp2.dat),rs.ln=c(3,15),sp.names=filteredList,x.locs=c(as.numeric(opt$offset),1-as.numeric(opt$offset))) } dev.off() figureNumber<-figureNumber+1 } nameString<-NULL; } } } else { #these are groups with only single sample and are excluded from drawing them cat(paste("\n",extractedSamples," is excluded as a singleton...",sep="")) } }