#!/bin/Rscript
# ***************************************************************
# Name:      remove_colinear_terms.R
# Purpose:   Uses a step-wise function to iteratively remove colinear variables by calculating 
#            Variance Inflation Factor (VIF) of terms (columns) against each other. A VIF for 
#            single explanatory variable is obtained using the R-squared value of regression of 
#            that variable against all other explanatory variables and is defined as:
#                 VIF_j = 1/(1-R_j^2)
#            This is particularly useful as it improves regression.
#            For example,
#     
#     Proof-of-principle (as suggested by original author):
#                 library(MASS)
#                 library(clusterGeneration)
#                 #simulate explanatory variables
#                 set.seed(2)
#                 num.vars<-15
#                 num.obs<-200
#                 cov.mat<-genPositiveDefMat(num.vars,covMethod=c("unifcorrmat"))$Sigma
#                 rand.vars<-mvrnorm(num.obs,rep(0,num.vars),Sigma=cov.mat)
#
#                 #create response variable from exp vars
#                 parms<-runif(num.vars,-10,10)
#                 y<-rand.vars %*% matrix(parms) + rnorm(num.obs,sd=20)
#                 x<-rand.vars
#
#                 #simple lm for all variables
#                 lm.dat<-data.frame(y,x)
#                 form.in<-paste('y ~',paste(names(lm.dat)[-1],collapse='+'))
#                 mod1<-lm(form.in,data=lm.dat)
#                 summary(mod1)
#                 remove collinear variables and recreate lm   
#                 terms_to_keep<-vif_func(in_frame=x,thresh=5,trace=T)
#                 form.in<-paste('y ~',paste(terms_to_keep,collapse='+'))
#                 mod2<-lm(form.in,data=lm.dat)
#                 summary(mod2)
#
#     Results:
#       > summary(mod1)
#       Call:
#         lm(formula = form.in, data = lm.dat)
#       Residuals:
#       Min      1Q  Median      3Q     Max 
#       -65.855 -13.150   0.647  13.548  53.344 
#
#       Coefficients:
#       Estimate Std. Error t value Pr(>|t|)   
#       (Intercept)    0.631      1.489   0.424  0.67233   
#       X1             6.941      2.328   2.981  0.00326 **
#       X2            -6.714      3.447  -1.948  0.05296 . 
#       X3             2.479      1.694   1.463  0.14522   
#       X4             4.830      5.262   0.918  0.35994   
#       X5            -2.580      1.898  -1.359  0.17567   
#       X6            12.882      8.824   1.460  0.14601   
#       X7            -7.992      6.058  -1.319  0.18873   
#       X8            -3.416      7.953  -0.429  0.66807   
#       X9            -7.579      3.219  -2.355  0.01958 * 
#       X10            5.434      5.810   0.935  0.35083   
#       X11           10.182      3.564   2.857  0.00477 **
#       X12           -2.191      3.253  -0.674  0.50144   
#       X13            3.351      1.458   2.298  0.02270 * 
#       X14            1.953      3.562   0.548  0.58424   
#       X15           -3.780      3.961  -0.954  0.34121   
#       ---
#       Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 
#       Residual standard error: 20.38 on 184 degrees of freedom
#       Multiple R-squared: 0.738,  Adjusted R-squared: 0.7166 
#       F-statistic: 34.54 on 15 and 184 DF,  p-value: < 2.2e-16 
#
#       > summary(mod2)
#       Call:
#         lm(formula = form.in, data = lm.dat)
#       Residuals:
#       Min      1Q  Median      3Q     Max 
#       -75.990 -13.618   2.032  14.895  61.410 
#
#       Coefficients:
#       Estimate Std. Error t value Pr(>|t|)    
#       (Intercept)  -0.3005     1.6164  -0.186 0.852713    
#       X1            7.4327     1.0822   6.868 9.23e-11 ***
#       X2           -2.8343     0.8170  -3.469 0.000647 ***
#       X3            4.3586     0.9171   4.753 3.98e-06 ***
#       X4            4.9693     1.4336   3.466 0.000654 ***
#       X5           -0.3864     0.8548  -0.452 0.651773    
#       X7           -7.4712     1.1967  -6.243 2.78e-09 ***
#       X9           -7.6892     0.9040  -8.506 5.56e-15 ***
#       X11           6.1382     0.8500   7.221 1.24e-11 ***
#       X12          -5.8016     1.0739  -5.403 1.97e-07 ***
#       X13           2.3902     0.7002   3.414 0.000785 ***
#       X15          -3.3389     1.4321  -2.332 0.020786 *  
#       ---
#       Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 
#       Residual standard error: 22.25 on 188 degrees of freedom
#       Multiple R-squared: 0.6808,  Adjusted R-squared: 0.6621 
#       F-statistic: 36.45 on 11 and 188 DF,  p-value: < 2.2e-16 
#
#       We see an increase in the number of variables that are significantly related to the response variable.
#
# Ref:       http://www.r-bloggers.com/collinearity-and-stepwise-vif-selection/
# Version:   0.1
# Authors:   Umer Zeeshan Ijaz (Umer.Ijaz@glasgow.ac.uk)
#                 http://userweb.eng.gla.ac.uk/umer.ijaz
# Created:   2013-03-07
# License:   Copyright (c) 2013 Computational Microbial Genomics Group, University of Glasgow, UK
# 
#            This program is free software: you can redistribute it and/or modify
#            it under the terms of the GNU General Public License as published by
#            the Free Software Foundation, either version 3 of the License, or
#            (at your option) any later version.
#
#            This program is distributed in the hope that it will be useful,
#            but WITHOUT ANY WARRANTY; without even the implied warranty of
#            MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#            GNU General Public License for more details.
#
#            You should have received a copy of the GNU General Public License
#            along with this program.  If not, see .
# **************************************************************/
suppressPackageStartupMessages(library("optparse"))
suppressPackageStartupMessages(library("fmsb"))
#specify desired options in a list
option_list <- list(
  make_option("--ifile", action="store",default=NULL, help="input CSV file"),
  make_option("--opath", action="store",default=NULL, help="Output path"),
  make_option("--ofile", action="store",default=NULL, help="output CSV file"),
  make_option("--threshold", type="integer",default=10, help="Threshold for VIF. If VIF is more than 10, multicolinearity is strongly suggested [default %default]"),
  make_option("--transpose", action="store_true",default=FALSE, help="Transpose the matrix? [default %default]"),
  make_option("--verbose", action="store_true",default=FALSE, help="Show the output of the function [default %default]")
)
vif_func<-function(in_frame,thresh=10,trace=T){
  
  if(class(in_frame) != 'data.frame') in_frame<-data.frame(in_frame)
  
  #get initial vif value for all comparisons of variables
  vif_init<-NULL
  for(val in names(in_frame)){
    form_in<-formula(paste(val,' ~ .'))
    vif_init<-rbind(vif_init,c(val,VIF(lm(form_in,data=in_frame))))
  }
  vif_max<-max(as.numeric(vif_init[,2]))
  
  if(vif_max < thresh){
    if(trace==T){ #print output of each iteration
      prmatrix(vif_init,collab=c('VAR','VIF'),rowlab=rep('',nrow(vif_init)),quote=F)
      cat('\n')
      cat(paste('All variables have VIF < ', thresh,', max VIF ',round(vif_max,2), sep=''),'\n\n')
    }
    return(names(in_frame))
  }
  else{
    
    in_dat<-in_frame
    
    #backwards selection of explanatory variables, stops when all VIF values are below 'thresh'
    while(vif_max >= thresh){
      
      vif_vals<-NULL
      
      for(val in names(in_dat)){
        form_in<-formula(paste(val,' ~ .'))
        vif_add<-VIF(lm(form_in,data=in_dat))
        vif_vals<-rbind(vif_vals,c(val,vif_add))
      }
      max_row<-which(vif_vals[,2] == max(as.numeric(vif_vals[,2])))[1]
      
      vif_max<-as.numeric(vif_vals[max_row,2])
      
      if(vif_max