#!/bin/Rscript # *************************************************************** # Name: remove_colinear_terms.R # Purpose: Uses a step-wise function to iteratively remove colinear variables by calculating # Variance Inflation Factor (VIF) of terms (columns) against each other. A VIF for # single explanatory variable is obtained using the R-squared value of regression of # that variable against all other explanatory variables and is defined as: # VIF_j = 1/(1-R_j^2) # This is particularly useful as it improves regression. # For example, # # Proof-of-principle (as suggested by original author): # library(MASS) # library(clusterGeneration) # #simulate explanatory variables # set.seed(2) # num.vars<-15 # num.obs<-200 # cov.mat<-genPositiveDefMat(num.vars,covMethod=c("unifcorrmat"))$Sigma # rand.vars<-mvrnorm(num.obs,rep(0,num.vars),Sigma=cov.mat) # # #create response variable from exp vars # parms<-runif(num.vars,-10,10) # y<-rand.vars %*% matrix(parms) + rnorm(num.obs,sd=20) # x<-rand.vars # # #simple lm for all variables # lm.dat<-data.frame(y,x) # form.in<-paste('y ~',paste(names(lm.dat)[-1],collapse='+')) # mod1<-lm(form.in,data=lm.dat) # summary(mod1) # remove collinear variables and recreate lm # terms_to_keep<-vif_func(in_frame=x,thresh=5,trace=T) # form.in<-paste('y ~',paste(terms_to_keep,collapse='+')) # mod2<-lm(form.in,data=lm.dat) # summary(mod2) # # Results: # > summary(mod1) # Call: # lm(formula = form.in, data = lm.dat) # Residuals: # Min 1Q Median 3Q Max # -65.855 -13.150 0.647 13.548 53.344 # # Coefficients: # Estimate Std. Error t value Pr(>|t|) # (Intercept) 0.631 1.489 0.424 0.67233 # X1 6.941 2.328 2.981 0.00326 ** # X2 -6.714 3.447 -1.948 0.05296 . # X3 2.479 1.694 1.463 0.14522 # X4 4.830 5.262 0.918 0.35994 # X5 -2.580 1.898 -1.359 0.17567 # X6 12.882 8.824 1.460 0.14601 # X7 -7.992 6.058 -1.319 0.18873 # X8 -3.416 7.953 -0.429 0.66807 # X9 -7.579 3.219 -2.355 0.01958 * # X10 5.434 5.810 0.935 0.35083 # X11 10.182 3.564 2.857 0.00477 ** # X12 -2.191 3.253 -0.674 0.50144 # X13 3.351 1.458 2.298 0.02270 * # X14 1.953 3.562 0.548 0.58424 # X15 -3.780 3.961 -0.954 0.34121 # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # Residual standard error: 20.38 on 184 degrees of freedom # Multiple R-squared: 0.738, Adjusted R-squared: 0.7166 # F-statistic: 34.54 on 15 and 184 DF, p-value: < 2.2e-16 # # > summary(mod2) # Call: # lm(formula = form.in, data = lm.dat) # Residuals: # Min 1Q Median 3Q Max # -75.990 -13.618 2.032 14.895 61.410 # # Coefficients: # Estimate Std. Error t value Pr(>|t|) # (Intercept) -0.3005 1.6164 -0.186 0.852713 # X1 7.4327 1.0822 6.868 9.23e-11 *** # X2 -2.8343 0.8170 -3.469 0.000647 *** # X3 4.3586 0.9171 4.753 3.98e-06 *** # X4 4.9693 1.4336 3.466 0.000654 *** # X5 -0.3864 0.8548 -0.452 0.651773 # X7 -7.4712 1.1967 -6.243 2.78e-09 *** # X9 -7.6892 0.9040 -8.506 5.56e-15 *** # X11 6.1382 0.8500 7.221 1.24e-11 *** # X12 -5.8016 1.0739 -5.403 1.97e-07 *** # X13 2.3902 0.7002 3.414 0.000785 *** # X15 -3.3389 1.4321 -2.332 0.020786 * # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # Residual standard error: 22.25 on 188 degrees of freedom # Multiple R-squared: 0.6808, Adjusted R-squared: 0.6621 # F-statistic: 36.45 on 11 and 188 DF, p-value: < 2.2e-16 # # We see an increase in the number of variables that are significantly related to the response variable. # # Ref: http://www.r-bloggers.com/collinearity-and-stepwise-vif-selection/ # Version: 0.1 # Authors: Umer Zeeshan Ijaz (Umer.Ijaz@glasgow.ac.uk) # http://userweb.eng.gla.ac.uk/umer.ijaz # Created: 2013-03-07 # License: Copyright (c) 2013 Computational Microbial Genomics Group, University of Glasgow, UK # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . # **************************************************************/ suppressPackageStartupMessages(library("optparse")) suppressPackageStartupMessages(library("fmsb")) #specify desired options in a list option_list <- list( make_option("--ifile", action="store",default=NULL, help="input CSV file"), make_option("--opath", action="store",default=NULL, help="Output path"), make_option("--ofile", action="store",default=NULL, help="output CSV file"), make_option("--threshold", type="integer",default=10, help="Threshold for VIF. If VIF is more than 10, multicolinearity is strongly suggested [default %default]"), make_option("--transpose", action="store_true",default=FALSE, help="Transpose the matrix? [default %default]"), make_option("--verbose", action="store_true",default=FALSE, help="Show the output of the function [default %default]") ) vif_func<-function(in_frame,thresh=10,trace=T){ if(class(in_frame) != 'data.frame') in_frame<-data.frame(in_frame) #get initial vif value for all comparisons of variables vif_init<-NULL for(val in names(in_frame)){ form_in<-formula(paste(val,' ~ .')) vif_init<-rbind(vif_init,c(val,VIF(lm(form_in,data=in_frame)))) } vif_max<-max(as.numeric(vif_init[,2])) if(vif_max < thresh){ if(trace==T){ #print output of each iteration prmatrix(vif_init,collab=c('VAR','VIF'),rowlab=rep('',nrow(vif_init)),quote=F) cat('\n') cat(paste('All variables have VIF < ', thresh,', max VIF ',round(vif_max,2), sep=''),'\n\n') } return(names(in_frame)) } else{ in_dat<-in_frame #backwards selection of explanatory variables, stops when all VIF values are below 'thresh' while(vif_max >= thresh){ vif_vals<-NULL for(val in names(in_dat)){ form_in<-formula(paste(val,' ~ .')) vif_add<-VIF(lm(form_in,data=in_dat)) vif_vals<-rbind(vif_vals,c(val,vif_add)) } max_row<-which(vif_vals[,2] == max(as.numeric(vif_vals[,2])))[1] vif_max<-as.numeric(vif_vals[max_row,2]) if(vif_max