#!/usr/bin/Rscript
# ***************************************************************
# Name:         clust_validity.R
# Purpose:      This script takes a csv file of n d-dimensional features, performs k-means or
#               dp-means (bayesian nonparametric k-means) clustering and chooses the optimum
#               number of clusters based on either of the implemented internal clustering validation
#               indices. Additionally, if the csv file contains a column titled "True_Clusters"
#               containing the true cluster membership for each object, you can use it to validate
#               clustering performance using several external clustering indices.
#
#               Format of the csv file (with header names):
#                       ,V1,V2,...,True_Clusters
#                       ,,,...,
#                       ,,,...,
#                       ...
#                       ,,,...,
#
#               If the data is 2-dimensional then the script also generates
#               a 2D plot of objects and colors each object based on its
#               cluster membership. The resulting plot is saved as <input file name>.jpg
#
#               Moreover, the clustering results are saved as <input file name>_processed.csv
#               by appending cluster assignments as a column.
#
#               To see what indices are supported, try running this script on the command line as:
#                       Rscript clust_validity.R -h
#
# Acknowledgements:
#               This script was written under the supervision of Christopher Quince in the following hackathon:
#                       Event title: ProBin: Probabilistic binning for metagenome contigs
#                       Location: Instituto Gulbenkian de Ciência, Lisbon, Portugal
#                       Dates: from 24-06-2013 to 28-06-2013
#               organized by the European Union's Earth System Science and Environmental Management ES1103 COST Action
#               ("Microbial ecology & the earth system: collaborating for insight and success with the new generation of sequencing tools").
#               This work would not have been possible, were it not for the useful discussions with other participants of the hackathon, namely,
#                       Nick Loman
#                       Joshua Quick
#                       Brynjar Smari Bjarnason
#                       Johannes Alneberg
#                       Anders Anderson
#                       Ino de Bruijn
# Version:      0.1
# Authors:      Umer Zeeshan Ijaz (Umer.Ijaz@glasgow.ac.uk)
#               http://userweb.eng.gla.ac.uk/umer.ijaz/index.htm
# Created:      2012-07-09
# License:      Copyright (c) 2013 Computational Microbial Genomics Group, University of Glasgow, UK
#
#               This program is free software: you can redistribute it and/or modify
#               it under the terms of the GNU General Public License as published by
#               the Free Software Foundation, either version 3 of the License, or
#               (at your option) any later version.
#
#               This program is distributed in the hope that it will be useful,
#               but WITHOUT ANY WARRANTY; without even the implied warranty of
#               MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#               GNU General Public License for more details.
#
#               You should have received a copy of the GNU General Public License
#               along with this program. If not, see <http://www.gnu.org/licenses/>.
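#
# Example invocations (the input file name "toy2d.csv" and the chosen options below are
# purely illustrative; substitute your own csv file and parameter ranges):
#               Rscript clust_validity.R --file toy2d.csv --algo kmeans \
#                       --paramMin 2 --paramMax 10 --internalCriteria Calinski_Harabasz
#               Rscript clust_validity.R --file toy2d.csv --algo dpmeans \
#                       --paramMin 10 --paramMax 100 --lambdaScale 10 --internalCriteria EAC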
# **************************************************************/

suppressWarnings(suppressPackageStartupMessages(library("optparse")))
suppressWarnings(suppressPackageStartupMessages(library("clusterCrit")))
suppressWarnings(suppressPackageStartupMessages(library("RColorBrewer")))
suppressWarnings(suppressPackageStartupMessages(library("mclust")))
suppressWarnings(suppressPackageStartupMessages(library("cluster")))

supportedInternalCriteria<-getCriteriaNames(TRUE)
supportedInternalCriteria<-c(supportedInternalCriteria,"EAC")
supportedExternalCriteria<-getCriteriaNames(FALSE)
supportedExternalCriteria<-c(supportedExternalCriteria,"ARI")
supportedAlgorithms<-c("kmeans","dpmeans")

#specify desired options in a list
option_list <- list(
    make_option("--file", action="store", default=NULL,
        help="Input csv file"),
    make_option("--algo", action="store", default="kmeans",
        help=paste("[default %default]",paste("\n\t\tOptions:\n\t\t\t",paste(supportedAlgorithms,collapse="\n\t\t\t"),collapse=""),collapse="")),
    make_option("--paramMin", type="integer", default=2,
        help="kmeans: minimum number of clusters, dpmeans: minimum value of lambda [default %default]"),
    make_option("--paramMax", type="integer", default=50,
        help="kmeans: maximum number of clusters, dpmeans: maximum value of lambda [default %default]"),
    make_option("--nOrderings", type="integer", default=5,
        help="dpmeans: number of orderings for each value of lambda [default %default]"),
    make_option("--lambdaScale", action="store", default="10",
        help="dpmeans: lambda/scale [default %default]"),
    make_option("--iterations", type="integer", default=200,
        help="Iterations in EAC [default %default]"),
    make_option("--internalCriteria", action="store", default="EAC",
        help=paste("[default %default]",paste("\n\t\tOptions:\n\t\t\t",paste(supportedInternalCriteria,collapse="\n\t\t\t"),collapse=""),collapse="")),
    make_option("--externalCriteria", action="store", default="ARI",
        help=paste("[default %default]",paste("\n\t\tOptions:\n\t\t\t",paste(supportedExternalCriteria,collapse="\n\t\t\t"),collapse=""),collapse="")),
    make_option("--width", type="integer", default=800,
        help="Width of jpeg files [default %default]"),
    make_option("--height", type="integer", default=800,
        help="Height of jpeg files [default %default]"),
    make_option("--fsize", action="store", default="2",
        help="Font size [default %default]")
    )

#get command line options
opt<-parse_args(OptionParser(usage="%prog [options] file", option_list=option_list))

#sanity checks
if(is.null(opt$file)){
    cat("--file not supplied. Quitting now!\n")
    quit()
}
if((!opt$internalCriteria %in% supportedInternalCriteria) || (!opt$externalCriteria %in% supportedExternalCriteria)){
    cat("--internalCriteria/--externalCriteria incorrect. Quitting now!\n")
    quit()
}
if(!opt$algo %in% supportedAlgorithms){
    cat("--algo incorrect. Quitting now!\n")
    quit()
}
#Reference: http://arxiv.org/pdf/1111.0352.pdf
dp.means <- function(data, lambda = 1, max.iterations = 100, tolerance = 10e-3) {
    n <- nrow(data)
    k <- 1
    assignments <- rep(1, n)
    mu<-matrix(colMeans(data),ncol=ncol(data))
    colnames(mu)<-colnames(data)

    converged <- FALSE
    iteration <- 0
    ss.old <- Inf
    ss.new <- Inf

    while (!converged && iteration < max.iterations) {
        iteration <- iteration + 1
        #assignment step: give each object its nearest centre, or open a new cluster
        #when the nearest centre is further away than lambda
        for (i in 1:n) {
            distances <- rep(NA, k)
            for (j in 1:k) {
                distances[j] <- sum((data[i,]-mu[j,])^2)
            }
            if (min(distances,na.rm=TRUE) > lambda) {
                k <- k + 1
                assignments[i] <- k
                mu<-rbind(mu,data[i,])
            } else {
                assignments[i] <- which(distances == min(distances,na.rm=TRUE))[1]
            }
        }
        #update step: recompute each centre from the objects currently assigned to it
        #(only for clusters that actually have members)
        for (j in 1:k) {
            if (sum(assignments == j) > 0) {
                mu[j,]<-colMeans(data[assignments==j,,drop=FALSE])
            }
        }
        #convergence check on the change in total within-cluster sum of squares
        ss.new <- 0
        for (i in 1:n) {
            ss.new <- ss.new + sum((data[i,]-mu[assignments[i],])^2)
        }
        ss.change <- ss.old - ss.new
        ss.old <- ss.new
        if (!is.nan(ss.change) && ss.change < tolerance) {
            converged <- TRUE
        }
    }
    centers <- data.frame(mu)
    return(list("centers" = centers, "assignments" = assignments, "k" = k, "iterations" = iteration))
}

# Reference: A. L. N. Fred and A. K. Jain. Data clustering using evidence accumulation. In Proc. ICPR, 2002.
# Also, blog post: http://things-about-r.tumblr.com/post/43894016034/the-wisdom-of-crowds-clustering-using-evidence
createCoAssocMatrix <- function(Iter, rangeK, dataSet, cMethod="kmeans") {
    nV <- dim(dataSet)[1]
    CoAssoc <- matrix(rep(0, nV * nV), nrow = nV)

    for (j in 1:Iter) {
        cat(paste("\rComputing \"Evidence Accumulation-based Clustering\" (Iteration:",j,"/",Iter,")",sep=""))
        jK <- sample(c(rangeK[1]:rangeK[2]), 1, replace = FALSE)
        jSpecCl <- NULL
        if(cMethod=="kmeans"){
            jSpecCl <- kmeans(x = dataSet, centers = jK)$cluster
        }else if (cMethod=="dpmeans"){
            #Ordering matters in dpmeans and so we will permute data and use forward_map to map from original indices to permuted
            #indices and use reverse_map to map from permuted indices to original indices
            forward_map<-data.frame(seq(1,nrow(dataSet)),sample(seq(1,nrow(dataSet)),nrow(dataSet),replace=FALSE,prob=NULL))
            reverse_map<-forward_map[order(forward_map[,2]),]
            jSpecCl<-as.integer(dp.means(dataSet[forward_map[,2],],jK/as.numeric(opt$lambdaScale))$assignments)[reverse_map[,1]]
        }
        #accumulate co-association evidence: pairs of objects that fall in the same cluster
        #in this run gain 1/Iter towards their pairwise similarity
        CoAssoc_j <- matrix(rep(0, nV * nV), nrow = nV)
        for (i in unique(jSpecCl)) {
            indVenues <- which(jSpecCl == i)
            CoAssoc_j[indVenues, indVenues] <- CoAssoc_j[indVenues, indVenues] + (1/Iter)
        }
        CoAssoc <- CoAssoc + CoAssoc_j
    }
    return(CoAssoc)
}

eac <- function(Iter, rangeK, dataset, hcMethod = "single", cMethod="kmeans") {
    CoAssocSim <- createCoAssocMatrix(Iter, rangeK, dataset, cMethod)
    #transform from similarity into distance matrix
    CoAssocDist <- 1 - CoAssocSim
    hclustM <- hclust(as.dist(CoAssocDist), method = hcMethod)
    #determine the cut at the largest jump in the merge heights
    cutValue <- hclustM$height[which.max(diff(hclustM$height))]
    return(cutree(hclustM, h = cutValue))
}
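# Illustrative (commented-out) sketch of how the helpers above can be called directly;
# the data frame name "toy" is hypothetical and this block is not executed by the script:
#   toy <- data.frame(V1 = rnorm(60), V2 = rnorm(60))
#   fit <- dp.means(toy, lambda = 2)                              # list with $centers, $assignments, $k, $iterations
#   consensus <- eac(Iter = 20, rangeK = c(2, 6), dataset = toy)  # consensus labels via evidence accumulation + hclust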
#import data
data <-read.csv(opt$file,header=TRUE,row.names=1,check.names=FALSE)

trueClusters<-NULL
if("True_Clusters" %in% colnames(data)) {
    trueClusters<-data[,"True_Clusters"]
    data<-data[,colnames(data)!="True_Clusters"]
}

if(opt$internalCriteria=="EAC") {
    set.seed(1234)
    data$Predicted_Clusters<- eac(Iter = as.numeric(opt$iterations), rangeK = c(opt$paramMin, opt$paramMax), dataset = data, hcMethod = "single", cMethod=opt$algo)
}else if(opt$internalCriteria %in% getCriteriaNames(TRUE)) {
    set.seed(1234)
    #for dpmeans, ordering is very important, so we will introduce replicates for each lambda based on opt$nOrderings
    vals<-NULL
    dpmean_clusters<-NULL
    if(opt$algo=="kmeans"){
        vals<-matrix(rep(NA,opt$paramMax-(opt$paramMin-1)),ncol=1,dimnames=list(c(),c(opt$internalCriteria)))
    }else if(opt$algo=="dpmeans"){
        #dpmean_clusters holds the cluster assignments for every (lambda, ordering) pair we have tried
        dpmean_clusters<-matrix(rep(NA,(opt$paramMax-(opt$paramMin-1))*opt$nOrderings*nrow(data)),ncol=nrow(data))
        vals<-matrix(rep(NA,(opt$paramMax-(opt$paramMin-1))*opt$nOrderings*2),ncol=2,dimnames=list(c(),c("Lambda",opt$internalCriteria)))
    }

    for(k in opt$paramMin:opt$paramMax){
        if(opt$algo=="kmeans"){
            cat(paste("\rComputing \"",opt$internalCriteria,"\" (K:",k,")",sep=""))
            cl<-kmeans(data,k)
            vals[(k - (as.numeric(opt$paramMin)-1)), 1] <- as.numeric(intCriteria(as.matrix(data), cl$cluster, opt$internalCriteria))
        }else if (opt$algo=="dpmeans"){
            for(h in 1:opt$nOrderings){
                cat(paste("\rComputing \"",opt$internalCriteria,"\" (Lambda:",k/as.numeric(opt$lambdaScale),",Ordering:",h,")",sep=""))
                #ordering matters in dpmeans and so we will permute data and use forward_map to map from original indices to permuted
                #indices and use reverse_map to map from permuted indices to original indices
                forward_map<-data.frame(seq(1,nrow(data)),sample(seq(1,nrow(data)),nrow(data),replace=FALSE,prob=NULL))
                reverse_map<-forward_map[order(forward_map[,2]),]
                dpmean_clusters[((k - opt$paramMin)*(opt$nOrderings))+h,] <- as.integer(dp.means(data[forward_map[,2],],k/as.numeric(opt$lambdaScale))$assignments)[reverse_map[,1]]
                #if only 1 cluster is returned, we do not evaluate the internal criterion and store Inf instead
                if(length(unique(dpmean_clusters[((k - opt$paramMin)*(opt$nOrderings))+h,]))>1){
                    vals[((k - opt$paramMin)*(opt$nOrderings))+h, 1] <- k/as.numeric(opt$lambdaScale)
                    vals[((k - opt$paramMin)*(opt$nOrderings))+h, 2] <- as.numeric(intCriteria(as.matrix(data), dpmean_clusters[((k - opt$paramMin)*(opt$nOrderings))+h,], opt$internalCriteria))
                }else{
                    vals[((k - opt$paramMin)*(opt$nOrderings))+h, 1] <- k/as.numeric(opt$lambdaScale)
                    vals[((k - opt$paramMin)*(opt$nOrderings))+h, 2] <- Inf
                }
            }
        }
    }

    if(opt$algo=="kmeans"){
        vals<-data.frame(K=c(opt$paramMin:opt$paramMax),vals)
        cat(paste("\nInternal clustering index (\"",opt$internalCriteria,"\") for different values of K is as follows:\n",sep=""))
        print(vals,quote=FALSE,row.names=FALSE)
        chosen_k<-vals[bestCriterion(vals[,2],opt$internalCriteria),"K"]
        cat(paste("\nOptimum cluster size is ",chosen_k,sep=""))
        data$Predicted_Clusters<-kmeans(data,chosen_k)$cluster
    }else if (opt$algo=="dpmeans"){
        vals<-data.frame(vals)
        colnames(vals)<-c("Lambda",opt$internalCriteria)
        cat(paste("\nInternal clustering index (\"",opt$internalCriteria,"\") for different values of Lambda is as follows (Inf for cases when only one cluster is obtained):\n",sep=""))
        print(vals,quote=FALSE,row.names=FALSE)
        #the following code caters to the Inf cases when only 1 cluster exists. We remove those cases from
        #both dpmean_clusters and vals, because most internal criteria fail when the dataset has only 1 cluster
        dpmean_clusters<-dpmean_clusters[apply(vals, 1, function(x) all(is.finite(x))),]
        vals<-vals[apply(vals, 1, function(x) all(is.finite(x))),]
        ind<-bestCriterion(vals[,2],opt$internalCriteria)
        chosen_lambda<-vals[ind,"Lambda"]
        data$Predicted_Clusters<-dpmean_clusters[ind,]
        cat(paste("\nOptimum cluster size is ",max(unique(dpmean_clusters[ind,]))," for lambda=",chosen_lambda,sep=""))
    }
}

#calculate external clustering index if "True_Clusters" column exists
if(!is.null(trueClusters)){
    if(opt$externalCriteria=="ARI"){
        val<-adjustedRandIndex(trueClusters,data$Predicted_Clusters)
        cat(paste("\nExternal clustering index (\"",opt$externalCriteria,"\") is ",val,"\n",sep=""))
    }else if(opt$externalCriteria %in% getCriteriaNames(FALSE)) {
        val<-extCriteria(trueClusters,data$Predicted_Clusters,opt$externalCriteria)
        cat(paste("\nExternal clustering index (\"",opt$externalCriteria,"\") is ",val,"\n",sep=""))
    }
}

#plot results only if the data is 2-dimensional (two feature columns plus the appended
#Predicted_Clusters column) and save the plot in a .jpg file
if(dim(data)[2]==3){
    cat(paste("Data is 2-dimensional, saving plot as ",substr(basename(opt$file),1,nchar(basename(opt$file))-4),".jpg\n",sep=""))
    jpeg(filename = paste(substr(basename(opt$file),1,nchar(basename(opt$file))-4),".jpg",sep=""),width = as.numeric(opt$width), height = as.numeric(opt$height), quality=100)
    par(mai=c(1,1,1,0.2), bg="white")
    jet.colors<-colorRampPalette(c("#00007F", "blue", "#007FFF", "cyan","#7FFF7F", "yellow", "#FF7F00", "red", "#7F0000"))(max(data$Predicted_Clusters))
    plot(data$V1,data$V2,
        cex.axis=as.numeric(opt$fsize)*0.6,
        cex.lab=as.numeric(opt$fsize),
        cex=as.numeric(opt$fsize)*0.7,
        col=jet.colors[data$Predicted_Clusters],pch=16,xlab="V1",ylab="V2")
    garbage <- dev.off()
}

#save information about predicted clusters in a _processed.csv file
cat(paste("\nSaving cluster assignments as ",substr(basename(opt$file),1,nchar(basename(opt$file))-4),"_processed.csv\n",sep=""))
if(!is.null(trueClusters)){
    trueClusters<-data.frame(trueClusters)
    colnames(trueClusters)<-c("True_Clusters")
    write.csv(data.frame(data,trueClusters),file=paste(substr(basename(opt$file),1,nchar(basename(opt$file))-4),"_processed.csv",sep=""),quote=FALSE)
} else {
    write.csv(data,file=paste(substr(basename(opt$file),1,nchar(basename(opt$file))-4),"_processed.csv",sep=""),quote=FALSE)
}
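
# A minimal, hypothetical way to create a toy input file for trying this script out
# (run separately in an interactive R session; names and values are illustrative only):
#   set.seed(1)
#   toy <- data.frame(V1 = c(rnorm(50, 0), rnorm(50, 5), rnorm(50, 10)),
#                     V2 = c(rnorm(50, 0), rnorm(50, 5), rnorm(50, 10)),
#                     True_Clusters = rep(1:3, each = 50))
#   write.csv(toy, "toy2d.csv")
# and then, for example:
#   Rscript clust_validity.R --file toy2d.csv --algo dpmeans --paramMin 10 --paramMax 100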