#!/usr/bin/perl
# ***************************************************************
# Name:      collateGCMSResults.pl
# Purpose:   This script collates multiple text files produced by a GC-MS machine, each
# 	     residing in its own subfolder.
#   	     
# 	     Format of a typical file is:
# <FILE>
#  Data Path : C:\msdchem\1\data\Gu-VFA\2013-09\20130904\
#  Data File : 00701019.d
#  Signal(s) : FID1A.ch
#  Acq On    :  4 Sep 2013  19:24
#  Operator  : Gu
#  Sample    : ADR2 7 19/08
#  Misc      :
#  ALS Vial  : 7   Sample Multiplier: 1
#
#  Integration File: autoint1_20140904.e
#  Quant Time: Feb 27 17:33:41 2014
#  Quant Method : C:\msdchem\1\methods\GU-VFA1.M
#  Quant Title  :
#  QLast Update : Thu Feb 27 17:24:49 2014
#  Response via : Initial Calibration
#  Integrator: ChemStation
#
#  Volume Inj.  :
#  Signal Phase :
#  Signal Info  :
#
#          Compound                     R.T.       Response    Conc Units
#   ---------------------------------------------------------------------------
#
#   Target Compounds
#   1)     Ethanol                     0.000              0    N.D.  mg/L
#   2)     Acetate                     6.893         166198   39.167 mg/L
#   3)     Propionate                  7.449          46075    6.931 mg/L
#   4)     isoButyrate                 7.623          27925    3.233 mg/L
#   5)     n-Butyrate                  8.008          12058    1.427 mg/L
#   6)     isoValerate                 8.251          69083    7.135 mg/L
#   7)     n-Valerate                  8.645           7929    0.861 mg/L
#   8)     isoCaproate                 9.006          47784    4.942 mg/L
#   9)     n-Caproate                  9.245          65775    7.206 mg/L
#   ---------------------------------------------------------------------------
#</FILE>
#
#           Description:
#           1) We do two passes, first pass to get all possible compounds, and second pass to populate the
#              frequency table
# 	    2) We match lines that contain "mg/L" (the compound rows, which end in that unit) using $line=~/mg\/L/
# 	    3) We use a substring of directory names as sample names using substr($sample_terms[$i],-5,-2)
# 	    4) N.D. gets replaced by 0 in the frequency table
#
# Version:   0.1
# Authors:   Umer Zeeshan Ijaz (Umer.Ijaz@glasgow.ac.uk)
#                 http://userweb.eng.gla.ac.uk/umer.ijaz/index.htm
# Created:   2014-03-05
# License:   Copyright (c) 2014 Computational Microbial Genomics Group, University of Glasgow, UK
#
#            This program is free software: you can redistribute it and/or modify
#            it under the terms of the GNU General Public License as published by
#            the Free Software Foundation, either version 3 of the License, or
#            (at your option) any later version.
#
#            This program is distributed in the hope that it will be useful,
#            but WITHOUT ANY WARRANTY; without even the implied warranty of
#            MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#            GNU General Public License for more details.
#
#            You should have received a copy of the GNU General Public License
#            along with this program.  If not, see <http://www.gnu.org/licenses/>.
# **************************************************************/
use warnings;
use strict;
use File::Slurp qw(read_dir);
use Getopt::Long;

# Parse the command-line options into %opts:
#   --folder  / -f : root folder whose sub-folders hold the report files
#   --pattern / -p : regular expression a file name must match to be collated
my %opts; #store the input arguments
my $options_ok = GetOptions(\%opts,
        'folder|f=s',
        'pattern|p=s',
);

# Both options are mandatory. The usage text is written to STDERR and the
# exit status is non-zero: the collated table is written to STDOUT (normally
# redirected to a file), so a bad invocation must neither end up inside that
# file nor look like a successful run.
if (!$options_ok || !defined $opts{"folder"} || !defined $opts{"pattern"}) {
    print STDERR <<EOF;
Usage:
To collate results:
        perl collateGCMSResults.pl -f <folder_name> -p <pattern> > <output>
For example,
	perl collateGCMSResults.pl -f /home/projectx -p epatemp 
EOF
    exit 1;
}



my $root=$opts{"folder"};
my $pattern=$opts{"pattern"};
my $line;          # current input line (also used by the second pass)
my @tokens;        # whitespace-separated fields of $line (also used by the second pass)
my @unique_terms;  # compound names, in order of first appearance
my @sample_terms;  # sub-folder names holding at least one matching file

#First pass to collect all the terms
my %seen_term;     # compound name => true once it is in @unique_terms
for my $dir (grep { -d "$root/$_" } read_dir($root)) {
    my @files = grep { /$pattern/ } read_dir("$root/$dir");
    next unless @files;

    # Register each sub-folder exactly once, even when several of its files
    # match the pattern; otherwise the table would get duplicate columns for
    # the same sample.
    push @sample_terms, $dir;

    for my $file (@files) {
        my $path = "$root/$dir/$file";
        open(my $fh, '<', $path) or die "Can't open $path: $!\n";
        while ($line = <$fh>) {
            chomp($line);
            next unless $line =~ /mg\/L/;

            # Data rows start with whitespace, so splitting on /\s+/ yields an
            # empty first field, then the row number, then the compound name
            # at index 2.
            @tokens = split(/\s+/, $line);
            next unless defined $tokens[2];

            # Hash membership is an exact string comparison and replaces the
            # deprecated smartmatch operator.
            push @unique_terms, $tokens[2] unless $seen_term{ $tokens[2] }++;
        }
        close($fh);
    }
}

# Build the (compounds x samples) grid with every cell preset to the string
# "0"; cells that the second pass never fills keep that value.
my @frequency_table = ();
for my $term_row (0 .. $#unique_terms) {
    $frequency_table[$term_row][$_] = "0" for 0 .. $#sample_terms;
}


#Second pass to populate @frequency_table

# Map each compound name to its row and each sub-folder name to its column
# once, instead of scanning both arrays for every data line. When a name
# occurs more than once, the first position wins.
my %row_of_term;
for my $i (0 .. $#unique_terms) {
    next unless defined $unique_terms[$i];
    $row_of_term{ $unique_terms[$i] } = $i
        unless exists $row_of_term{ $unique_terms[$i] };
}
my %column_of_sample;
for my $j (0 .. $#sample_terms) {
    $column_of_sample{ $sample_terms[$j] } = $j
        unless exists $column_of_sample{ $sample_terms[$j] };
}

for my $dir (grep { -d "$root/$_" } read_dir($root)) {
    my $index_samples = $column_of_sample{$dir};

    # All matching files of one sub-folder write into the same column, so a
    # value read from a later file overwrites one read from an earlier file.
    for my $file (grep { /$pattern/ } read_dir("$root/$dir")) {
        my $path = "$root/$dir/$file";
        open(my $fh, '<', $path) or die "Can't open $path: $!\n";
        while ($line = <$fh>) {
            chomp($line);
            next unless $line =~ /mg\/L/;

            # Leading whitespace makes field 0 empty; the compound name is at
            # index 2 and the concentration is the second-to-last field (the
            # last one being the unit).
            @tokens = split(/\s+/, $line);
            next unless defined $tokens[2];

            my $index_terms = $row_of_term{ $tokens[2] };
            next unless defined $index_terms && defined $index_samples;

            # "N.D." is stored as 0.
            my $concentration = $tokens[-2];
            $frequency_table[$index_terms][$index_samples] =
                ($concentration =~ /N\.D\./) ? 0.0 : $concentration;
        }
        close($fh);
    }
}

#Now generate the frequency table
# Header row: the literal "Samples" followed by one short label per sample.
print "Samples";
foreach my $sample_name (@sample_terms) {
    # The label is the slice of the sub-folder name that starts 5 characters
    # from the end and stops before the last 2 characters.
    print "," . substr($sample_name, -5, -2);
}
print "\n";

# One CSV row per compound: its name, then its value in every sample column.
foreach my $row (0 .. $#unique_terms) {
    print $unique_terms[$row];
    foreach my $col (0 .. $#sample_terms) {
        print ",", $frequency_table[$row][$col];
    }
    print "\n";
}

