#!/bin/Rscript
# ***************************************************************
# Name:      taxo_spatial_temporal_trends.R
# Purpose:   This script plots figures for temporal as well as spatial species abundance data. 
#            The species are ranked in order of decreasing frequency occurrence for sample, 
#            with sample names above each column. The lines connect the species between the samples. 
#            Line color is assigned based on the ranked frequency occurrence values for species in the 
#            first sample to allow better identification of a species across different samples. 
#            Line width is also in proportion to the starting frequency occurrence value for each species at 
#            each sample step. The legend on the bottom indicates the frequency occurrence values for 
#            different line widths. 
#            There are two pre-processing steps provided in this script (--pmethod), you can either obtain relative abundances
#            or rarefy the data to minimum rowsums as provided by the rrarefy() function in vegan's package
#            Furthermore, using --ntaxa switch, you can restrict the figures to display only most abundant taxa
#            If there exist subgroups within the dataset, then you can parse the site names and specify the column (--ccol) to
#            form the group by specifying underscore as a delimeter.
#            After having generated the figures, if you want to draw only specific taxa then provide there names 
#            in a comma separated manner using --flist switch. For example --flist="C1815,C1161,C1474"
# Version:   0.1
# Authors:   Umer Zeeshan Ijaz (Umer.Ijaz@glasgow.ac.uk)
#                 http://userweb.eng.gla.ac.uk/umer.ijaz
# Created:   2013-04-03
# License:   Copyright (c) 2013 Computational Microbial Genomics Group, University of Glasgow, UK
# 
#            This program is free software: you can redistribute it and/or modify
#            it under the terms of the GNU General Public License as published by
#            the Free Software Foundation, either version 3 of the License, or
#            (at your option) any later version.
#
#            This program is distributed in the hope that it will be useful,
#            but WITHOUT ANY WARRANTY; without even the implied warranty of
#            MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#            GNU General Public License for more details.
#
#            You should have received a copy of the GNU General Public License
#            along with this program.  If not, see <http://www.gnu.org/licenses/>.
# **************************************************************/

suppressPackageStartupMessages(library("optparse"))
suppressPackageStartupMessages(library("vegan"))
suppressPackageStartupMessages(library("RColorBrewer"))
suppressPackageStartupMessages(library("scales"))

#specify desired options in a list
option_list <- list(
  make_option("--ifile", action="store",default=NULL, help="input CSV file"),
  make_option("--opath", action="store",default=NULL, help="output path"),
  make_option("--pmethod", type="integer",default=1, help="1 = relative abundance, 2 = rarefy to minimum rowsum [default %default]"),
  make_option("--offset", action="store", default="0.05", help="taxa offset (useful when the names dont fit in the figure) [default %default]"),
  make_option("--mreads", type="integer",default=200, help="minimum reads [default %default]"),
  make_option("--ntaxa", type="integer",default=50, help="number of most abundant taxa [default %default]"),
  make_option("--dim", type="integer",default=1200, help="dimensions of output files [default %default]"),
  make_option("--ccol", type="integer",default=0, help="column number using underscore as delimeter [default %default]"),
  make_option("--flist", action="store",default=NULL, help="list of taxa to be drawn"),
  make_option("--npoints", type="integer",default=5, help="number of spatial/temporal points to connect [default %default]"))

#get command line options
opt<-parse_args(OptionParser(usage="%prog [options] file", option_list=option_list))

if(is.null(opt$ifile))
  quit()

#first remove the previously generated files
if(is.null(opt$opath)){
  files<-list.files(path=".",pattern="*.jpg$")
} else {
  files<-list.files(path=opt$opath,pattern="*.jpg$")
}
for(i in seq(1:length(files)))
{
  file.remove(paste(opt$opath,files[i],sep=""))
}

#import species abundance data
sp.dat <- read.csv(opt$ifile,header=TRUE,row.names=1)

#site names parsing function
groupNames <- function(siteNames,column) {
  
  #generate the indices
  colInd<-""
  
  if(column==0){
    colInd<-rep(1,length(siteNames))
  }
  else{
    #first split the names with respect to underscore and construct a parsedString matrix
    for(i in seq(1:length(siteNames)))
    {
      tmp<-t(data.matrix(unlist(strsplit(siteNames[i],"_"))))
      if(column>length(tmp)){
        column<-length(tmp)
      }
      if(i==1){
        parsedString<-tmp
      }
      else{
        parsedString<-rbind(parsedString,tmp)
      }
    }
    for(i in seq(1:length(siteNames)))
    {
      colInd[i]<-as.numeric(match(parsedString[i,column],unique(parsedString[,column])))
    }
  }
  return(colInd)
}

# plotting function
# description of arguments are as follows:
# x         data frame or matrix with input data, first column is time step
# x.locs 	  minimum and maximum x coordinates for plotting region, from 0–1
# y.locs 	  minimum and maximum y coordinates for plotting region, from 0–1
# steps 	  character string of time steps to include in plot, default all
# sp.names 	character string of species connections to display, default all
# dt.tx 	  logical value indicating if time steps are indicated in the plot
# rsc 	    logical value indicating if line widths are scaled proportionally to their value
# ln.st 	  numeric value for distance of lines from text labels, distance is determined automatically if not provided
# rs.ln 	  two-element numeric vector for rescaling line widths, defaults to one value if one element is supplied, default 3 to 15
# ln.cl 	  character string indicating color of lines, can use multiple colors or input to brewer.pal, default ‘RdYlGn’
# alpha.val numeric value (0–1) indicating transparency of connections, default 0.7
# leg 	    logical value for plotting legend, default T, values are for the original data frame, no change for species or step subsets
# ... 	    additional arguments to plot
# ref: http://beckmw.wordpress.com/2013/04/01/a-nifty-line-plot-to-visualize-multivariate-time-series/
plot.qual<-function(x,x.locs=c(0.05,0.95),y.locs=c(0,1),steps=NULL,sp.names=NULL,dt.tx=T,rsc=T,
                    ln.st=NULL,rs.ln=c(3,15),ln.cl='RdYlGn',alpha=0.7,leg=T,...){
  if(length(x.locs) != 2 | length(y.locs) != 2)
    stop('x and y dimensions must be two-element vectors')
  if(x.locs[1]<0 | x.locs[2]>1 | y.locs[1]<0 | y.locs[2]>1)
    stop('x and y dimensions must in range of 0--1')
  dim.x<-c(0,1) #plot dims x
  dim.y<-c(0,1) #plot dims y
  wrn.val<-F
  x[,1]<-as.character(x[,1])
  tot.sp<-ncol(x)-1
  sp.col<-2:ncol(x)
  #rescale if T, sort legend for later
  sp.orig<-x[,sp.col]
  if(length(rs.ln)==1) rsc<-F
  if(rsc) x[,sp.col]<-rescale(x[,sp.col],rs.ln)
  if(rsc==F & leg) leg<-F #no legend if line widths aren't rescaled
  #reorder species columns, add rank as integer
  first.ord<-order(x[1,sp.col],decreasing=T)
  x[,sp.col]<-x[,sp.col][,first.ord]
  names(x)[sp.col]<-names(x)[sp.col][first.ord]
  names(x)[sp.col]<-paste0(1:tot.sp,sep='. ',names(x)[sp.col])
  #list of spp by date, arranged in decreasing order for each date
  dt.dat.srt<-vector('list',nrow(x))
  names(dt.dat.srt)<-x[,1]
  for(val in 1:nrow(x)){
    tmp<-t(x[val,sp.col])
    tmp<-tmp[order(tmp,decreasing=T),,drop=F]
    dt.dat.srt[[val]]<-tmp
  }
  #initiate plot object
  plot(dim.x,dim.y,type='n',axes=F,xlab='',ylab='',...)
  #subset for steps, if provided
  if(!is.null(steps)) dt.dat.srt<-dt.dat.srt[names(dt.dat.srt) %in% steps]
  #plot legend
  if(leg){
    y.locs[1]<-0.05*diff(y.locs)+y.locs[1]
    leg.txt<-format(round(seq(min(sp.orig),max(sp.orig),length=5),2),nsmall=2,digits=2)
    leg.wds<-seq(rs.ln[1],rs.ln[2],length=5)
    legend('bottom',(y.locs[1]-y.olds)/2,col=alpha('black',alpha),lwd=leg.wds,legend=leg.txt,bty='n',
           horiz=T)
  }  
  #x locations
  x.vals<-rep(seq(x.locs[1],x.locs[2],length=length(dt.dat.srt)),each=tot.sp)
  x.vals<-split(x.vals,x.vals)
  #y locations, rearranged in loop, exception if dates are plotted
  if(dt.tx) y.vals<-rev(seq(y.locs[1],y.locs[2],length=tot.sp+1))[-1]
  else y.vals<-rev(seq(y.locs[1],y.locs[2],length=tot.sp))
  #get line colors
  if(length(ln.cl)==1)
    if(ln.cl %in% row.names(brewer.pal.info)){
      pal.num<-brewer.pal.info[row.names(brewer.pal.info) == ln.cl,1]
      ln.cl<-brewer.pal(pal.num,ln.cl)
    }
  line.cols<-alpha(colorRampPalette(ln.cl)(tot.sp),alpha)
  #define distance of lines from labels
  if(is.null(ln.st)){
    str.max<-max(strwidth(row.names(dt.dat.srt[[1]])))
    if(diff(x.locs)-length(dt.dat.srt)*str.max < 0){
      warning('not enough space for lines between columns')
      wrn.val<-T
    }
    else
      ln.st<-0.2*str.max + str.max/2
  }
  for(val in 1:(length(dt.dat.srt)-1)){
    #temp data to plot
    plt.tmp<-dt.dat.srt[c(val,val+1)]
    x.tmp<-x.vals[c(val,val+1)]
    #plot temp text for column
    text(x.tmp[[1]],y.vals,row.names(plt.tmp[[1]]))
    if(val == length(dt.dat.srt)-1){
      text(x.tmp[[2]],y.vals,row.names(plt.tmp[[2]]))
      if(dt.tx){
        dt.txt<-substitute(italic(x),list(x=names(plt.tmp)[2]))
        text(unique(x.tmp[[2]]),y.locs[2],dt.txt)
      }
    }	
    if(dt.tx){
      dt.txt<-substitute(italic(x),list(x=names(plt.tmp)[1]))
      text(unique(x.tmp[[1]]),y.locs[2],dt.txt)
    }
    srt.ln.y<-match(row.names(plt.tmp[[1]]),row.names(plt.tmp[[2]]))
    #if no line rescale, use first element of rs.ln
    if(rsc) lwd.val<-plt.tmp[[1]][,1]
    else lwd.val<-rep(rs.ln[1],tot.sp)
    #vector for species selection of line segments
    if(is.null(sp.names)) sel.sp<-rep(T,tot.sp)
    else{
      sel.names<-unlist(lapply(strsplit(row.names(plt.tmp[[1]]),' '),function(x) x[2]))
      sel.sp<-(sel.names %in% sp.names)
    }
    #for lines
    if(!wrn.val)
      segments(
        x.tmp[[1]][sel.sp]+ln.st,
        y.vals[sel.sp],
        x.tmp[[2]][sel.sp]-ln.st,
        y.vals[srt.ln.y][sel.sp],
        col=line.cols[sel.sp],
        lwd=lwd.val[sel.sp]
      )
    #resort color vector for next colummn
    srt.cl.y<-match(row.names(plt.tmp[[2]]),row.names(plt.tmp[[1]]))
    line.cols<-line.cols[srt.cl.y]
  }
}

#remove spaces from site names
rownames(sp.dat) <- sub(" ", "_", rownames(sp.dat))
rownames(sp.dat) <- sub(" ", "_", rownames(sp.dat))
sp.dat<-t(sp.dat)

#remove sites with minimum reads less than a specified number
sp.dat<-sp.dat[rowSums(sp.dat)>opt$mreads,]

#preprocess the data
if(opt$pmethod==1){
  #sp.dat<-exp(log((sp.dat+1)/(rowSums(sp.dat)+dim(sp.dat)[2]))) 
  sp.dat<-sp.dat/rowSums(sp.dat)
  } else if(opt$pmethod==2){
  sp.dat<-rrarefy(sp.dat,min(rowSums(sp.dat)))  
}

#order the data based on colsums
sp.dat<-sp.dat[,order(colSums(sp.dat),decreasing=TRUE)]

#extract most abundant taxa
sp.dat<-sp.dat[,1:min(opt$ntaxa,dim(sp.dat)[2])]

#extract grouping information by parsing site names
indices<-groupNames(rownames(sp.dat), opt$ccol)

figureNumber<-1

for(i in seq(unique(indices))){

  #extract site names based on unique indices
  extractedSamples<-rownames(sp.dat)[indices==i]

  if(length(extractedSamples)>1){
    nameString<-NULL
    for(j in seq(1:length(extractedSamples)))
    {
      nameString<-rbind(nameString,extractedSamples[j])
     
      #for avoiding congestion we need to split the sites to display only number of points      
      if(j%%min(length(extractedSamples),opt$npoints)==0){
        if(length(nameString)>1){
          sp2.dat<-sp.dat[rownames(sp.dat) %in% nameString,]
          sp2.dat<-sp2.dat[order(rownames(sp2.dat)),]
          
          jpeg(filename = paste(opt$opath,substr(basename(opt$ifile),1,nchar(basename(opt$ifile))-4),"_STT_",figureNumber,".jpg",sep=""),width =max(as.numeric(opt$dim),as.numeric(opt$dim)*(length(nameString)%/%5)) , height = as.numeric(opt$dim), quality=100)
          par(mar=numeric(4),family='serif')
          if(is.null(opt$flist)){
            plot.qual(data.frame(rownames(sp2.dat),sp2.dat),rs.ln=c(3,15),x.locs=c(as.numeric(opt$offset),1-as.numeric(opt$offset)))
          }
          else{
            filteredList<-unlist(strsplit(opt$flist,split=","))
            plot.qual(data.frame(rownames(sp2.dat),sp2.dat),rs.ln=c(3,15),sp.names=filteredList,x.locs=c(as.numeric(opt$offset),1-as.numeric(opt$offset)))
          }
          dev.off()
          figureNumber<-figureNumber+1
          }
        nameString<-NULL;
      }
    }
  } else {
    #these are groups with only single sample and are excluded from drawing them
    cat(paste("\n",extractedSamples," is excluded as a singleton...",sep=""))
    }
  
}