#!/usr/bin/Rscript
# ***************************************************************
# Name:         clust_validity.R
# Purpose:      This script takes a csv file of n d-dimensional features, performs k-means or
#               dp-means (bayesian nonparametric k-means) clustering and chooses the optimum
#               number of clusters based on either of the implemented internal clustering validation
#               indices. Additionally, if the csv file contains a column titled "True_Clusters"
#               containing the true cluster membership for each object, you can use it to validate
#               clustering performance using several external clustering indices.
#
#               Format of the csv file (with header names):
#                       ,V1,V2,...,True_Clusters
#                       ,,,...,
#                       ,,,...,
#                       ...
#                       ,,,...,
#
#               If the data is 2-dimensional then the script also generates
#               a 2D plot of objects and colors each object based on its
#               cluster membership. The resulting plot is saved as <input file name>.jpg
#
#               Moreover, the clustering results are saved as <input file name>_processed.csv
#               by appending cluster assignments as a column.
#
#               To see what indices are supported, try running this script on the command line as:
#                       Rscript clust_validity.R -h
#
# Acknowledgements:
#               This script was written under the supervision of Christopher Quince in the following hackathon:
#                       Event title: ProBin: Probabilistic binning for metagenome contigs
#                       Location: Instituto Gulbenkian de Ciência, Lisbon, Portugal
#                       Dates: from 24-06-2013 to 28-06-2013
#               organized by the European Union's Earth System Science and Environmental Management ES1103 COST Action
#               ("Microbial ecology & the earth system: collaborating for insight and success with the new generation of sequencing tools").
#               This work would not have been possible, were it not for the useful discussions with other participants of the hackathon, namely,
#                       Nick Loman
#                       Joshua Quick
#                       Brynjar Smari Bjarnason
#                       Johannes Alneberg
#                       Anders Anderson
#                       Ino de Bruijn
# Version:      0.1
# Authors:      Umer Zeeshan Ijaz (Umer.Ijaz@glasgow.ac.uk)
#               http://userweb.eng.gla.ac.uk/umer.ijaz/index.htm
# Created:      2012-07-09
# License:      Copyright (c) 2013 Computational Microbial Genomics Group, University of Glasgow, UK
#
#               This program is free software: you can redistribute it and/or modify
#               it under the terms of the GNU General Public License as published by
#               the Free Software Foundation, either version 3 of the License, or
#               (at your option) any later version.
#
#               This program is distributed in the hope that it will be useful,
#               but WITHOUT ANY WARRANTY; without even the implied warranty of
#               MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#               GNU General Public License for more details.
#
#               You should have received a copy of the GNU General Public License
#               along with this program. If not, see <http://www.gnu.org/licenses/>.
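#
# Example invocations (the input file name "toy2d.csv" and the chosen options below are
# purely illustrative; substitute your own csv file and parameter ranges):
#               Rscript clust_validity.R --file toy2d.csv --algo kmeans \
#                       --paramMin 2 --paramMax 10 --internalCriteria Calinski_Harabasz
#               Rscript clust_validity.R --file toy2d.csv --algo dpmeans \
#                       --paramMin 10 --paramMax 100 --lambdaScale 10 --internalCriteria EAC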
# **************************************************************/

suppressWarnings(suppressPackageStartupMessages(library("optparse")))
suppressWarnings(suppressPackageStartupMessages(library("clusterCrit")))
suppressWarnings(suppressPackageStartupMessages(library("RColorBrewer")))
suppressWarnings(suppressPackageStartupMessages(library("mclust")))
suppressWarnings(suppressPackageStartupMessages(library("cluster")))

supportedInternalCriteria<-getCriteriaNames(TRUE)
supportedInternalCriteria<-c(supportedInternalCriteria,"EAC")
supportedExternalCriteria<-getCriteriaNames(FALSE)
supportedExternalCriteria<-c(supportedExternalCriteria,"ARI")
supportedAlgorithms<-c("kmeans","dpmeans")

#specify desired options in a list
option_list <- list(
    make_option("--file", action="store", default=NULL,
        help="Input csv file"),
    make_option("--algo", action="store", default="kmeans",
        help=paste("[default %default]",paste("\n\t\tOptions:\n\t\t\t",paste(supportedAlgorithms,collapse="\n\t\t\t"),collapse=""),collapse="")),
    make_option("--paramMin", type="integer", default=2,
        help="kmeans: minimum number of clusters, dpmeans: minimum value of lambda [default %default]"),
    make_option("--paramMax", type="integer", default=50,
        help="kmeans: maximum number of clusters, dpmeans: maximum value of lambda [default %default]"),
    make_option("--nOrderings", type="integer", default=5,
        help="dpmeans: number of orderings for each value of lambda [default %default]"),
    make_option("--lambdaScale", action="store", default="10",
        help="dpmeans: lambda/scale [default %default]"),
    make_option("--iterations", type="integer", default=200,
        help="Iterations in EAC [default %default]"),
    make_option("--internalCriteria", action="store", default="EAC",
        help=paste("[default %default]",paste("\n\t\tOptions:\n\t\t\t",paste(supportedInternalCriteria,collapse="\n\t\t\t"),collapse=""),collapse="")),
    make_option("--externalCriteria", action="store", default="ARI",
        help=paste("[default %default]",paste("\n\t\tOptions:\n\t\t\t",paste(supportedExternalCriteria,collapse="\n\t\t\t"),collapse=""),collapse="")),
    make_option("--width", type="integer", default=800,
        help="Width of jpeg files [default %default]"),
    make_option("--height", type="integer", default=800,
        help="Height of jpeg files [default %default]"),
    make_option("--fsize", action="store", default="2",
        help="Font size [default %default]")
    )

#get command line options
opt<-parse_args(OptionParser(usage="%prog [options] file", option_list=option_list))

#sanity checks
if(is.null(opt$file)){
    cat("--file not supplied. Quitting now!\n")
    quit()
}
if((!opt$internalCriteria %in% supportedInternalCriteria) || (!opt$externalCriteria %in% supportedExternalCriteria)){
    cat("--internalCriteria/--externalCriteria incorrect. Quitting now!\n")
    quit()
}
if(!opt$algo %in% supportedAlgorithms){
    cat("--algo incorrect. Quitting now!\n")
    quit()
}
#Reference: http://arxiv.org/pdf/1111.0352.pdf
dp.means <- function(data, lambda = 1, max.iterations = 100, tolerance = 10e-3) {
    n <- nrow(data)
    k <- 1
    assignments <- rep(1, n)
    mu<-matrix(colMeans(data),ncol=ncol(data))
    colnames(mu)<-colnames(data)

    converged <- FALSE
    iteration <- 0
    ss.old <- Inf
    ss.new <- Inf

    while (!converged && iteration < max.iterations) {
        iteration <- iteration + 1
        #assignment step: give each object its nearest centre, or open a new cluster
        #when the nearest centre is further away than lambda
        for (i in 1:n) {
            distances <- rep(NA, k)
            for (j in 1:k) {
                distances[j] <- sum((data[i,]-mu[j,])^2)
            }
            if (min(distances,na.rm=TRUE) > lambda) {
                k <- k + 1
                assignments[i] <- k
                mu<-rbind(mu,data[i,])
            } else {
                assignments[i] <- which(distances == min(distances,na.rm=TRUE))[1]
            }
        }
        #update step: recompute each centre from the objects currently assigned to it
        #(only for clusters that actually have members)
        for (j in 1:k) {
            if (sum(assignments == j) > 0) {
                mu[j,]<-colMeans(data[assignments==j,,drop=FALSE])
            }
        }
        #convergence check on the change in total within-cluster sum of squares
        ss.new <- 0
        for (i in 1:n) {
            ss.new <- ss.new + sum((data[i,]-mu[assignments[i],])^2)
        }
        ss.change <- ss.old - ss.new
        ss.old <- ss.new
        if (!is.nan(ss.change) && ss.change < tolerance) {
            converged <- TRUE
        }
    }
    centers <- data.frame(mu)
    return(list("centers" = centers, "assignments" = assignments, "k" = k, "iterations" = iteration))
}

# Reference: A. L. N. Fred and A. K. Jain. Data clustering using evidence accumulation. In Proc. ICPR, 2002.
# Also, blog post: http://things-about-r.tumblr.com/post/43894016034/the-wisdom-of-crowds-clustering-using-evidence
createCoAssocMatrix <- function(Iter, rangeK, dataSet, cMethod="kmeans") {
    nV <- dim(dataSet)[1]
    CoAssoc <- matrix(rep(0, nV * nV), nrow = nV)

    for (j in 1:Iter) {
        cat(paste("\rComputing \"Evidence Accumulation-based Clustering\" (Iteration:",j,"/",Iter,")",sep=""))
        jK <- sample(c(rangeK[1]:rangeK[2]), 1, replace = FALSE)
        jSpecCl <- NULL
        if(cMethod=="kmeans"){
            jSpecCl <- kmeans(x = dataSet, centers = jK)$cluster
        }else if (cMethod=="dpmeans"){
            #Ordering matters in dpmeans and so we will permute data and use forward_map to map from original indices to permuted
            #indices and use reverse_map to map from permuted indices to original indices
            forward_map<-data.frame(seq(1,nrow(dataSet)),sample(seq(1,nrow(dataSet)),nrow(dataSet),replace=FALSE,prob=NULL))
            reverse_map<-forward_map[order(forward_map[,2]),]
            jSpecCl<-as.integer(dp.means(dataSet[forward_map[,2],],jK/as.numeric(opt$lambdaScale))$assignments)[reverse_map[,1]]
        }
        #accumulate co-association evidence: pairs of objects that fall in the same cluster
        #in this run gain 1/Iter towards their pairwise similarity
        CoAssoc_j <- matrix(rep(0, nV * nV), nrow = nV)
        for (i in unique(jSpecCl)) {
            indVenues <- which(jSpecCl == i)
            CoAssoc_j[indVenues, indVenues] <- CoAssoc_j[indVenues, indVenues] + (1/Iter)
        }
        CoAssoc <- CoAssoc + CoAssoc_j
    }
    return(CoAssoc)
}

eac <- function(Iter, rangeK, dataset, hcMethod = "single", cMethod="kmeans") {
    CoAssocSim <- createCoAssocMatrix(Iter, rangeK, dataset, cMethod)
    #transform from similarity into distance matrix
    CoAssocDist <- 1 - CoAssocSim
    hclustM <- hclust(as.dist(CoAssocDist), method = hcMethod)
    #determine the cut at the largest jump in the merge heights
    cutValue <- hclustM$height[which.max(diff(hclustM$height))]
    return(cutree(hclustM, h = cutValue))
}
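# Illustrative (commented-out) sketch of how the helpers above can be called directly;
# the data frame name "toy" is hypothetical and this block is not executed by the script:
#   toy <- data.frame(V1 = rnorm(60), V2 = rnorm(60))
#   fit <- dp.means(toy, lambda = 2)                              # list with $centers, $assignments, $k, $iterations
#   consensus <- eac(Iter = 20, rangeK = c(2, 6), dataset = toy)  # consensus labels via evidence accumulation + hclust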
#import data
data <-read.csv(opt$file,header=TRUE,row.names=1,check.names=FALSE)

trueClusters<-NULL
if("True_Clusters" %in% colnames(data)) {
    trueClusters<-data[,"True_Clusters"]
    data<-data[,colnames(data)!="True_Clusters"]
}

if(opt$internalCriteria=="EAC") {
    set.seed(1234)
    data$Predicted_Clusters<- eac(Iter = as.numeric(opt$iterations), rangeK = c(opt$paramMin, opt$paramMax), dataset = data, hcMethod = "single", cMethod=opt$algo)
}else if(opt$internalCriteria %in% getCriteriaNames(TRUE)) {
    set.seed(1234)
    #for dpmeans, ordering is very important, so we will introduce replicates for each lambda based on opt$nOrderings
    vals<-NULL
    dpmean_clusters<-NULL
    if(opt$algo=="kmeans"){
        vals<-matrix(rep(NA,opt$paramMax-(opt$paramMin-1)),ncol=1,dimnames=list(c(),c(opt$internalCriteria)))
    }else if(opt$algo=="dpmeans"){
        #dpmean_clusters holds the cluster assignments for every (lambda, ordering) pair we have tried
        dpmean_clusters<-matrix(rep(NA,(opt$paramMax-(opt$paramMin-1))*opt$nOrderings*nrow(data)),ncol=nrow(data))
        vals<-matrix(rep(NA,(opt$paramMax-(opt$paramMin-1))*opt$nOrderings*2),ncol=2,dimnames=list(c(),c("Lambda",opt$internalCriteria)))
    }

    for(k in opt$paramMin:opt$paramMax){
        if(opt$algo=="kmeans"){
            cat(paste("\rComputing \"",opt$internalCriteria,"\" (K:",k,")",sep=""))
            cl<-kmeans(data,k)
            vals[(k - (as.numeric(opt$paramMin)-1)), 1] <- as.numeric(intCriteria(as.matrix(data), cl$cluster, opt$internalCriteria))
        }else if (opt$algo=="dpmeans"){
            for(h in 1:opt$nOrderings){
                cat(paste("\rComputing \"",opt$internalCriteria,"\" (Lambda:",k/as.numeric(opt$lambdaScale),",Ordering:",h,")",sep=""))
                #ordering matters in dpmeans and so we will permute data and use forward_map to map from original indices to permuted
                #indices and use reverse_map to map from permuted indices to original indices
                forward_map<-data.frame(seq(1,nrow(data)),sample(seq(1,nrow(data)),nrow(data),replace=FALSE,prob=NULL))
                reverse_map<-forward_map[order(forward_map[,2]),]
                dpmean_clusters[((k - opt$paramMin)*(opt$nOrderings))+h,] <- as.integer(dp.means(data[forward_map[,2],],k/as.numeric(opt$lambdaScale))$assignments)[reverse_map[,1]]
                #if only 1 cluster is returned, we do not evaluate the internal criterion and store Inf instead
                if(length(unique(dpmean_clusters[((k - opt$paramMin)*(opt$nOrderings))+h,]))>1){
                    vals[((k - opt$paramMin)*(opt$nOrderings))+h, 1] <- k/as.numeric(opt$lambdaScale)
                    vals[((k - opt$paramMin)*(opt$nOrderings))+h, 2] <- as.numeric(intCriteria(as.matrix(data), dpmean_clusters[((k - opt$paramMin)*(opt$nOrderings))+h,], opt$internalCriteria))
                }else{
                    vals[((k - opt$paramMin)*(opt$nOrderings))+h, 1] <- k/as.numeric(opt$lambdaScale)
                    vals[((k - opt$paramMin)*(opt$nOrderings))+h, 2] <- Inf
                }
            }
        }
    }

    if(opt$algo=="kmeans"){
        vals<-data.frame(K=c(opt$paramMin:opt$paramMax),vals)
        cat(paste("\nInternal clustering index (\"",opt$internalCriteria,"\") for different values of K is as follows:\n",sep=""))
        print(vals,quote=FALSE,row.names=FALSE)
        chosen_k<-vals[bestCriterion(vals[,2],opt$internalCriteria),"K"]
        cat(paste("\nOptimum cluster size is ",chosen_k,sep=""))
        data$Predicted_Clusters<-kmeans(data,chosen_k)$cluster
    }else if (opt$algo=="dpmeans"){
        vals<-data.frame(vals)
        colnames(vals)<-c("Lambda",opt$internalCriteria)
        cat(paste("\nInternal clustering index (\"",opt$internalCriteria,"\") for different values of Lambda is as follows (Inf for cases when only one cluster is obtained):\n",sep=""))
        print(vals,quote=FALSE,row.names=FALSE)
        #the following code caters to the Inf cases when only 1 cluster exists. We remove those cases from
        #both dpmean_clusters and vals, because most internal criteria fail when the dataset has only 1 cluster
        dpmean_clusters<-dpmean_clusters[apply(vals, 1, function(x) all(is.finite(x))),]
        vals<-vals[apply(vals, 1, function(x) all(is.finite(x))),]
        ind<-bestCriterion(vals[,2],opt$internalCriteria)
        chosen_lambda<-vals[ind,"Lambda"]
        data$Predicted_Clusters<-dpmean_clusters[ind,]
        cat(paste("\nOptimum cluster size is ",max(unique(dpmean_clusters[ind,]))," for lambda=",chosen_lambda,sep=""))
    }
}

#calculate external clustering index if "True_Clusters" column exists
if(!is.null(trueClusters)){
    if(opt$externalCriteria=="ARI"){
        val<-adjustedRandIndex(trueClusters,data$Predicted_Clusters)
        cat(paste("\nExternal clustering index (\"",opt$externalCriteria,"\") is ",val,"\n",sep=""))
    }else if(opt$externalCriteria %in% getCriteriaNames(FALSE)) {
        val<-extCriteria(trueClusters,data$Predicted_Clusters,opt$externalCriteria)
        cat(paste("\nExternal clustering index (\"",opt$externalCriteria,"\") is ",val,"\n",sep=""))
    }
}

#plot results only if the data is 2-dimensional (two feature columns plus the appended
#Predicted_Clusters column) and save the plot in a .jpg file
if(dim(data)[2]==3){
    cat(paste("Data is 2-dimensional, saving plot as ",substr(basename(opt$file),1,nchar(basename(opt$file))-4),".jpg\n",sep=""))
    jpeg(filename = paste(substr(basename(opt$file),1,nchar(basename(opt$file))-4),".jpg",sep=""),width = as.numeric(opt$width), height = as.numeric(opt$height), quality=100)
    par(mai=c(1,1,1,0.2), bg="white")
    jet.colors<-colorRampPalette(c("#00007F", "blue", "#007FFF", "cyan","#7FFF7F", "yellow", "#FF7F00", "red", "#7F0000"))(max(data$Predicted_Clusters))
    plot(data$V1,data$V2,
        cex.axis=as.numeric(opt$fsize)*0.6,
        cex.lab=as.numeric(opt$fsize),
        cex=as.numeric(opt$fsize)*0.7,
        col=jet.colors[data$Predicted_Clusters],pch=16,xlab="V1",ylab="V2")
    garbage <- dev.off()
}

#save information about predicted clusters in a _processed.csv file
cat(paste("\nSaving cluster assignments as ",substr(basename(opt$file),1,nchar(basename(opt$file))-4),"_processed.csv\n",sep=""))
if(!is.null(trueClusters)){
    trueClusters<-data.frame(trueClusters)
    colnames(trueClusters)<-c("True_Clusters")
    write.csv(data.frame(data,trueClusters),file=paste(substr(basename(opt$file),1,nchar(basename(opt$file))-4),"_processed.csv",sep=""),quote=FALSE)
} else {
    write.csv(data,file=paste(substr(basename(opt$file),1,nchar(basename(opt$file))-4),"_processed.csv",sep=""),quote=FALSE)
}
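
# A minimal, hypothetical way to create a toy input file for trying this script out
# (run separately in an interactive R session; names and values are illustrative only):
#   set.seed(1)
#   toy <- data.frame(V1 = c(rnorm(50, 0), rnorm(50, 5), rnorm(50, 10)),
#                     V2 = c(rnorm(50, 0), rnorm(50, 5), rnorm(50, 10)),
#                     True_Clusters = rep(1:3, each = 50))
#   write.csv(toy, "toy2d.csv")
# and then, for example:
#   Rscript clust_validity.R --file toy2d.csv --algo dpmeans --paramMin 10 --paramMax 100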