#!/bin/Rscript
# ***************************************************************
# Name:      taxo_richness_plot.R
# Purpose:   This script generates multiple subplots for all the environmental 
#            parameters against species richness in a single plot.
#            The species richness is rarefied to the minimum sample numbers and a 
#            correlation test is performed between the rarefied richness and the 
#            environmental parameter. The resulting correlation and their significance 
#            is drawn on top of each subplot. Currently, it has support for three 
#            correlation measures:
#                 -Pearson
#                 -Spearman
#                 -Kendall
#            Furthermore this script also generates a  log file that 
#            contains the summary stats of regression of rarefied richness against 
#            environmental parameters. The last column contains the P-values and if 
#            significant, it indicates that the richness is affected by this environmental 
#            parameter
# Version:   0.2
# History:   0.1 to 0.2 Automatically remove NA entries
# Authors:   Umer Zeeshan Ijaz (Umer.Ijaz@glasgow.ac.uk)
#                 http://userweb.eng.gla.ac.uk/umer.ijaz
#            Christopher Quince (Christopher.Quince@glasgow.ac.uk)
#                 http://userweb.eng.gla.ac.uk/christopher.quince
# Created:   2012-09-10
# License:   Copyright (c) 2012 Computational Microbial Genomics Group, University of Glasgow, UK
# 
#            This program is free software: you can redistribute it and/or modify
#            it under the terms of the GNU General Public License as published by
#            the Free Software Foundation, either version 3 of the License, or
#            (at your option) any later version.
#
#            This program is distributed in the hope that it will be useful,
#            but WITHOUT ANY WARRANTY; without even the implied warranty of
#            MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#            GNU General Public License for more details.
#
#            You should have received a copy of the GNU General Public License
#            along with this program.  If not, see <http://www.gnu.org/licenses/>.
# **************************************************************/
suppressPackageStartupMessages(library("optparse"))
suppressPackageStartupMessages(library("mgcv"))
suppressPackageStartupMessages(library("vegan"))
suppressPackageStartupMessages(library("RColorBrewer"))
suppressPackageStartupMessages(library("gplots"))

#specify desired options in a list
option_list <- list(
make_option("--sfile", action="store",default=NULL, help="Species abundance file"),
make_option("--efile", action="store",default=NULL, help="environmental file"),
make_option("--opath", action="store",default=NULL, help="Output path"),
make_option("--fsize", action="store", default="1.2", help="Font size [default %default]"),
make_option("--width", type="integer",default=800, help="Width of jpeg files [default %default]"),
make_option("--height", type="integer",default=800, help="Height of jpeg files [default %default]"),
make_option("--correlation", type="integer",default=1, help="Correlation to use: 1=pearson, 2=spearman, 3=kendall [default %default]"),
make_option("--mreads", type="integer",default=200, help="Minimum reads [default %default]"),
make_option("--ccol", type="integer",default=1, help="Minimum reads [default %default]")
)

#get command line options
opt<-parse_args(OptionParser(usage="%prog [options] file", option_list=option_list))

if(is.null(opt$sfile) || is.null(opt$efile))
quit()

formatPvalues <- function(pvalue) {
  ra<-""
  if(pvalue <= 0.1) ra<-"."
  if(pvalue <= 0.05) ra<-"*"
  if(pvalue <= 0.01) ra<-"**"
  if(pvalue <= 0.001) ra<-"***"
  return(ra)
}

#generate colors function
generateColors <- function(siteNames,column) {
    
    #generate the indices
    colInd<-""
    
    if(column==0){
        colInd<-rep("blue",length(siteNames))
    }
    else{
        #first split the names with respect to underscore and construct a parsedString matrix
        for(i in seq(1:length(siteNames)))
        {
            tmp<-t(data.matrix(unlist(strsplit(siteNames[i],"_"))))
            if(column>length(tmp)){
                column<-length(tmp)
            }
            if(i==1){
                parsedString<-tmp
            }
            else{
                parsedString<-rbind(parsedString,tmp)
            }
        }
        #generate template colors
        crp<- colorRampPalette(c("blue", "#007FFF", "cyan","#7FFF7F", "yellow", "#FF7F00", "red"),space = "Lab")(length(unique(parsedString[,column])))        
        for(i in seq(1:length(siteNames)))
        {
            colInd[i]<-crp[match(parsedString[i,column],unique(parsedString[,column]))]
        }
    }
    return(colInd)
}


#Importing data now
AS_C <-read.csv(opt$sfile,header=TRUE,row.names=1)
Env <- read.csv(opt$efile,header=TRUE,row.names=1)

#transpose the data
AS <- t(AS_C)

#filter the data for reads smaller than a particular number
AS<-AS[rowSums(AS)>opt$mreads,]
Env<- Env[rownames(AS),]

#use complete.cases function to filter out those rows that have NA entry in Env
Env<-Env[complete.cases(Env),]
AS<-AS[rownames(Env),]


# Calculate species richness
N <- rowSums(AS)
S <- specnumber(AS)
S.rar <-rarefy(AS, min(N))

S.lm <- lm(S.rar ~ ., data = Env)
summarystats<-summary(S.lm)
capture.output(summary(S.lm), file=paste(opt$opath,substr(basename(opt$sfile),1,nchar(basename(opt$sfile))-4),"_RP.txt",sep=""))

method<-switch(opt$correlation,"pearson","spearman","kendall")

jpeg(filename = paste(opt$opath,substr(basename(opt$sfile),1,nchar(basename(opt$sfile))-4),"_RP.jpg",sep=""),width = as.numeric(opt$width), height = as.numeric(opt$height), quality=100)

par( mfrow=c(ceiling(ncol(Env)/4),4)
     ,mai=c(1,1,1,0.2)
     ,bg="white")

#draw species richness plots
for (e in seq(1,ncol(Env))){
  correlationTest<-cor.test(Env[,e],S.rar,method=method)
  plot(Env[,e],S.rar, xlab = colnames(Env)[e],
       ylab = "richness",
       cex.axis=as.numeric(opt$fsize)*0.6,
       cex.lab=as.numeric(opt$fsize),
       main=paste(sprintf("%.3f\n", correlationTest$estimate),formatPvalues(pvalue=correlationTest$p.value),sep=""),
       cex=as.numeric(opt$fsize),
       cex.main=as.numeric(opt$fsize),
       col=generateColors(rownames(AS),opt$ccol),pch=16)
  
}
dev.off(which=dev.cur())