#!/bin/Rscript
# ***************************************************************
# Name: remove_colinear_terms.R
# Purpose: Uses a step-wise function to iteratively remove colinear variables by calculating
# Variance Inflation Factor (VIF) of terms (columns) against each other. A VIF for
# single explanatory variable is obtained using the R-squared value of regression of
# that variable against all other explanatory variables and is defined as:
# VIF_j = 1/(1-R_j^2)
# This is particularly useful as it improves regression.
# For example,
#
# Proof-of-principle (as suggested by original author):
# library(MASS)
# library(clusterGeneration)
# #simulate explanatory variables
# set.seed(2)
# num.vars<-15
# num.obs<-200
# cov.mat<-genPositiveDefMat(num.vars,covMethod=c("unifcorrmat"))$Sigma
# rand.vars<-mvrnorm(num.obs,rep(0,num.vars),Sigma=cov.mat)
#
# #create response variable from exp vars
# parms<-runif(num.vars,-10,10)
# y<-rand.vars %*% matrix(parms) + rnorm(num.obs,sd=20)
# x<-rand.vars
#
# #simple lm for all variables
# lm.dat<-data.frame(y,x)
# form.in<-paste('y ~',paste(names(lm.dat)[-1],collapse='+'))
# mod1<-lm(form.in,data=lm.dat)
# summary(mod1)
# remove collinear variables and recreate lm
# terms_to_keep<-vif_func(in_frame=x,thresh=5,trace=T)
# form.in<-paste('y ~',paste(terms_to_keep,collapse='+'))
# mod2<-lm(form.in,data=lm.dat)
# summary(mod2)
#
# Results:
# > summary(mod1)
# Call:
# lm(formula = form.in, data = lm.dat)
# Residuals:
# Min 1Q Median 3Q Max
# -65.855 -13.150 0.647 13.548 53.344
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 0.631 1.489 0.424 0.67233
# X1 6.941 2.328 2.981 0.00326 **
# X2 -6.714 3.447 -1.948 0.05296 .
# X3 2.479 1.694 1.463 0.14522
# X4 4.830 5.262 0.918 0.35994
# X5 -2.580 1.898 -1.359 0.17567
# X6 12.882 8.824 1.460 0.14601
# X7 -7.992 6.058 -1.319 0.18873
# X8 -3.416 7.953 -0.429 0.66807
# X9 -7.579 3.219 -2.355 0.01958 *
# X10 5.434 5.810 0.935 0.35083
# X11 10.182 3.564 2.857 0.00477 **
# X12 -2.191 3.253 -0.674 0.50144
# X13 3.351 1.458 2.298 0.02270 *
# X14 1.953 3.562 0.548 0.58424
# X15 -3.780 3.961 -0.954 0.34121
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Residual standard error: 20.38 on 184 degrees of freedom
# Multiple R-squared: 0.738, Adjusted R-squared: 0.7166
# F-statistic: 34.54 on 15 and 184 DF, p-value: < 2.2e-16
#
# > summary(mod2)
# Call:
# lm(formula = form.in, data = lm.dat)
# Residuals:
# Min 1Q Median 3Q Max
# -75.990 -13.618 2.032 14.895 61.410
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) -0.3005 1.6164 -0.186 0.852713
# X1 7.4327 1.0822 6.868 9.23e-11 ***
# X2 -2.8343 0.8170 -3.469 0.000647 ***
# X3 4.3586 0.9171 4.753 3.98e-06 ***
# X4 4.9693 1.4336 3.466 0.000654 ***
# X5 -0.3864 0.8548 -0.452 0.651773
# X7 -7.4712 1.1967 -6.243 2.78e-09 ***
# X9 -7.6892 0.9040 -8.506 5.56e-15 ***
# X11 6.1382 0.8500 7.221 1.24e-11 ***
# X12 -5.8016 1.0739 -5.403 1.97e-07 ***
# X13 2.3902 0.7002 3.414 0.000785 ***
# X15 -3.3389 1.4321 -2.332 0.020786 *
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Residual standard error: 22.25 on 188 degrees of freedom
# Multiple R-squared: 0.6808, Adjusted R-squared: 0.6621
# F-statistic: 36.45 on 11 and 188 DF, p-value: < 2.2e-16
#
# We see an increase in the number of variables that are significantly related to the response variable.
#
# Ref: http://www.r-bloggers.com/collinearity-and-stepwise-vif-selection/
# Version: 0.1
# Authors: Umer Zeeshan Ijaz (Umer.Ijaz@glasgow.ac.uk)
# http://userweb.eng.gla.ac.uk/umer.ijaz
# Created: 2013-03-07
# License: Copyright (c) 2013 Computational Microbial Genomics Group, University of Glasgow, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see .
# **************************************************************/
suppressPackageStartupMessages(library("optparse"))
suppressPackageStartupMessages(library("fmsb"))
#specify desired options in a list
option_list <- list(
make_option("--ifile", action="store",default=NULL, help="input CSV file"),
make_option("--opath", action="store",default=NULL, help="Output path"),
make_option("--ofile", action="store",default=NULL, help="output CSV file"),
make_option("--threshold", type="integer",default=10, help="Threshold for VIF. If VIF is more than 10, multicolinearity is strongly suggested [default %default]"),
make_option("--transpose", action="store_true",default=FALSE, help="Transpose the matrix? [default %default]"),
make_option("--verbose", action="store_true",default=FALSE, help="Show the output of the function [default %default]")
)
vif_func<-function(in_frame,thresh=10,trace=T){
if(class(in_frame) != 'data.frame') in_frame<-data.frame(in_frame)
#get initial vif value for all comparisons of variables
vif_init<-NULL
for(val in names(in_frame)){
form_in<-formula(paste(val,' ~ .'))
vif_init<-rbind(vif_init,c(val,VIF(lm(form_in,data=in_frame))))
}
vif_max<-max(as.numeric(vif_init[,2]))
if(vif_max < thresh){
if(trace==T){ #print output of each iteration
prmatrix(vif_init,collab=c('VAR','VIF'),rowlab=rep('',nrow(vif_init)),quote=F)
cat('\n')
cat(paste('All variables have VIF < ', thresh,', max VIF ',round(vif_max,2), sep=''),'\n\n')
}
return(names(in_frame))
}
else{
in_dat<-in_frame
#backwards selection of explanatory variables, stops when all VIF values are below 'thresh'
while(vif_max >= thresh){
vif_vals<-NULL
for(val in names(in_dat)){
form_in<-formula(paste(val,' ~ .'))
vif_add<-VIF(lm(form_in,data=in_dat))
vif_vals<-rbind(vif_vals,c(val,vif_add))
}
max_row<-which(vif_vals[,2] == max(as.numeric(vif_vals[,2])))[1]
vif_max<-as.numeric(vif_vals[max_row,2])
if(vif_max