#!/usr/bin/perl
# ***************************************************************
# Name:      collateResults.pl
# Purpose:   This script collates the results produced by AMPLICONprocessing and TAXAassign
#	     Say you have a Main_Folder in which each subfolder contains a CSV file.
#	     Running with -f path_to_Main_Folder -p _PHYLUM.csv collates the contents of the
#	     *_PHYLUM.csv files found in the subfolders into a single table.
#
#	     BEFORE:
#            
#            	Main_Folder/Folder_1/*_PHYLUM.csv:
#
#            	species_A,2
#            	species_B,4
#            	species_C,5
#
#            	Main_Folder/Folder_2/*_PHYLUM.csv:
#
#            	species_A,3
#            	species_D,5
#
#	     AFTER:
#
#	     	Samples,Folder_1,Folder_2
#	     	species_A,2,3
#	     	species_B,4,0
#	     	species_C,5,0
#	     	species_D,0,5
#
#		
# Version:   0.1
# Authors:   Umer Zeeshan Ijaz (Umer.Ijaz@glasgow.ac.uk)
#                 http://userweb.eng.gla.ac.uk/umer.ijaz/index.htm
# Created:   2013-08-1
# License:   Copyright (c) 2013 Computational Microbial Genomics Group, University of Glasgow, UK
#
#            This program is free software: you can redistribute it and/or modify
#            it under the terms of the GNU General Public License as published by
#            the Free Software Foundation, either version 3 of the License, or
#            (at your option) any later version.
#
#            This program is distributed in the hope that it will be useful,
#            but WITHOUT ANY WARRANTY; without even the implied warranty of
#            MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#            GNU General Public License for more details.
#
#            You should have received a copy of the GNU General Public License
#            along with this program.  If not, see <http://www.gnu.org/licenses/>.
# **************************************************************/
use warnings;
use strict;
use File::Slurp qw(read_dir);
use Getopt::Long;

# Command-line options: --folder/-f and --pattern/-p. Both take a string value
# and both are required.
my %opts; #store the input arguments
my $options_ok = GetOptions(\%opts,
        'folder|f=s',
        'pattern|p=s',
);

# Show the usage text when option parsing failed or a required option is missing.
# The text goes to STDERR and the exit status is non-zero, so the message never
# ends up inside a redirected result file and callers can detect the misuse.
if (!$options_ok || !defined $opts{"folder"} || !defined $opts{"pattern"}) {
print STDERR <<EOF;
Usage:
To collate results:
        perl collateResults.pl -f <folder_name> -p <pattern> > <output>
For example,
	perl collateResults.pl -f /home/projectx -p _PHYLUM.csv 
EOF
        exit 1;
}



my $root=$opts{"folder"};
my $pattern=$opts{"pattern"};
my $line;
my @tokens;
my @unique_terms;   # row labels, in the order they are first encountered
my @sample_terms;   # column labels (subfolder names)
my %seen_terms;     # terms already stored in @unique_terms
#First pass to collect all the terms
# Every subfolder of $root is scanned for files whose name matches $pattern.
# $pattern is used as a regular expression, so characters such as "." keep
# their regex meaning.
for my $dir (grep { -d "$root/$_" } read_dir($root)) {
    for my $file (grep { /$pattern/ } read_dir($root."/".$dir)){
        # One column label is recorded per matching file, so a subfolder that
        # holds several matching files is listed once for each of them.
        push @sample_terms, $dir;
        my $path = $root."/".$dir."/".$file;
        open(my $fh, '<', $path) or die "Can't open $path: $!\n";
        while ($line=<$fh>){
            chomp($line);
            @tokens = split(/,/,$line);
            # A blank line splits into an empty list and carries no term
            next unless defined $tokens[0];
            # The hash lookup keeps the terms unique without rescanning
            # @unique_terms for every input line
            push @unique_terms, $tokens[0] unless $seen_terms{$tokens[0]}++;
        }
        close($fh);
    }
}
#initialise @frequency_table
# One row (array reference) per unique term, each holding one cell per sample.
# Every cell starts as the string "0", so a term that is absent from a sample
# is reported as zero in the final table.
my @frequency_table = map { [ ("0") x scalar(@sample_terms) ] } @unique_terms;

#Second pass to populate @frequency_table
# Lookup tables that map a term to its row and a sample name to its column in
# @frequency_table, so each CSV line is resolved without scanning the arrays.
# When a name occurs more than once, its first position is used.
my %row_of_term;
for my $i (0..$#unique_terms) {
    my $term = $unique_terms[$i];
    next unless defined $term;
    $row_of_term{$term} = $i unless exists $row_of_term{$term};
}
my %column_of_sample;
for my $j (0..$#sample_terms) {
    my $sample = $sample_terms[$j];
    $column_of_sample{$sample} = $j unless exists $column_of_sample{$sample};
}

for my $dir (grep { -d "$root/$_" } read_dir($root)) {
    # The column depends only on the subfolder, so look it up once per folder
    my $index_samples = $column_of_sample{$dir};
    for my $file (grep { /$pattern/ } read_dir($root."/".$dir)){
        my $path = $root."/".$dir."/".$file;
        open(my $fh, '<', $path) or die "Can't open $path: $!\n";
        while (my $record = <$fh>){
            chomp($record);
            my @fields = split(/,/,$record);
            # Blank lines and lines without a count column carry no value;
            # the cell keeps its initial "0"
            next unless defined $fields[0] && defined $fields[1];
            my $index_terms = $row_of_term{$fields[0]};

            if(defined $index_terms && defined $index_samples) {
                $frequency_table[$index_terms][$index_samples]=$fields[1];
            }
        }
        close($fh);
    }
}

#Now generate the frequency table
# Header row: the literal label "Samples" followed by one column name per sample.
print "Samples,", join(",", @sample_terms), "\n";
# One CSV row per term: the term itself, then its value for every sample column.
for my $row (0 .. $#unique_terms) {
    my @cells = map { $frequency_table[$row][$_] } 0 .. $#sample_terms;
    print join(",", $unique_terms[$row], @cells), "\n";
}

