#!/usr/bin/python
# ***************************************************************
# Name:      METAmock
# Purpose:   Metagenomics analysis pipeline that maps reads against
#	     the reference database and generates
#	         -Depth of coverage
#		 -Breadth of coverage
#		 -Proportion of reads mapping to each reference genome
#	
#	     The program requires the reference database to be indexed by bwa
#	     and requires the following programs to be installed:
#		 -bwa
#		 -samtools
#		 -bedtools
#
#	     The first public release (version 0.1) was developed 
#	     and completed by Umer Zeeshan Ijaz under the supervision 
#	     of Christopher Quince in the following hackathon:
#
#	     Event title: Integrative mixture models for enterotypes
#	     Location: University of Glasgow & The Burn (Scotland)
#	     Dates: from 12-08-2013 to 16-08-2013
#
#	     organized by European Union's Earth System Science and Environmental 
#	     Management ES1103 COST Action ("Microbial ecology & the earth system: 
#	     collaborating for insight and success with the new generation of sequencing tools").
#	     This work would not have been possible, were it not for the useful discussions 
#	     with other participants of the hackathon, namely,
#
#	     Keith Harris
#	     Leo Lahti
#	     Jarkko Salojarvi
#	     Karoline Faust    
#		
# Version:   0.1
# Authors:   Umer Zeeshan Ijaz (Umer.Ijaz@glasgow.ac.uk)
#                 http://userweb.eng.gla.ac.uk/umer.ijaz
# Created:   2012-08-16
# License:   Copyright (c) 2013 Computational Microbial Genomics Group, University of Glasgow, UK
#
#            This program is free software: you can redistribute it and/or modify
#            it under the terms of the GNU General Public License as published by
#            the Free Software Foundation, either version 3 of the License, or
#            (at your option) any later version.
#
#            This program is distributed in the hope that it will be useful,
#            but WITHOUT ANY WARRANTY; without even the implied warranty of
#            MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#            GNU General Public License for more details.
#
#            You should have received a copy of the GNU General Public License
#            along with this program.  If not, see <http://www.gnu.org/licenses/>.
# **************************************************************/
import os,sys,os.path
import time,datetime
import subprocess
from optparse import OptionParser
from optparse import Option, OptionValueError
import csv

import multiprocessing
from textwrap import dedent
from itertools import izip_longest

#Ref:http://www.ngcrawford.com/2012/03/29/python-multiprocessing-large-files/
def process_chunk(d):
	"""Replace this with your own function that processes data one line at a time"""
 	d = d.strip()
	return d
 
def grouper(n, iterable, padvalue=None):
	"""grouper(3, 'abcdefg', 'x') -->('a','b','c'), ('d','e','f'), ('g','x','x')"""
	return izip_longest(*[iter(iterable)]*n, fillvalue=padvalue)


#Ref:http://preshing.com/20110924/timing-your-code-using-pythons-with-statement
class Timer:    
    def __enter__(self):
        self.start = time.clock()
        return self

    def __exit__(self, *args):
        self.end = time.clock()
        self.interval = self.end - self.start

#Ref:http://stackoverflow.com/questions/4109436/processing-multiple-values-for-one-single-option-using-getopt-optparse
class MultipleOption(Option):
    ACTIONS = Option.ACTIONS + ("extend",)
    STORE_ACTIONS = Option.STORE_ACTIONS + ("extend",)
    TYPED_ACTIONS = Option.TYPED_ACTIONS + ("extend",)
    ALWAYS_TYPED_ACTIONS = Option.ALWAYS_TYPED_ACTIONS + ("extend",)

    def take_action(self, action, dest, opt, value, values, parser):
        if action == "extend":
            values.ensure_value(dest, []).append(value)
        else:
            Option.take_action(self, action, dest, opt, value, values, parser)

def print_time_stamp(message):
    print datetime.datetime.fromtimestamp(time.time()).strftime('[%Y-%m-%d %H:%M:%S] ')+message

#check if the program exists and return it's location, otherwise return None
def which(program):
    def is_exe(fpath):
        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)

    fpath, fname = os.path.split(program)
    if fpath:
        if is_exe(program):
            return program
    else:
        for path in os.environ["PATH"].split(os.pathsep):
            path = path.strip('"')
            exe_file = os.path.join(path, program)
            if is_exe(exe_file):
                return exe_file

    return None

def print_prog_status(prog,loc):
    if loc != None:
	print 'Checking for \'' + prog + '\': found '+ loc
    else:
	print 'Checking for \'' + prog + '\': ERROR - could not find \''+prog+'\''
	print 'Exiting.'
	sys.exit(1)
def run_prog(prog):
    p = subprocess.Popen(prog, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
#    for line in iter(p.stdout.readline, b''):
#    	print(">>> " + line.rstrip())
    retval = p.wait()

def write_dict_file_tsv(mydict,filename):
    writer = csv.writer(open(filename, 'wb'),delimiter='\t')
    for key, value in mydict.items():
    	writer.writerow([key, value])	

def read_dict_file_tsv(filename):
    reader = csv.reader(open(filename, 'rb'),delimiter='\t')
    mydict = dict(x for x in reader)
    return mydict

def main():
    parser = OptionParser(option_class=MultipleOption,usage="usage: %prog [options] filename",
                          version="%prog 0.1")
    parser.add_option("-q","--query_reads",
                       action="append",
                       dest="query_reads",
                       help="specify single or paired-end reads in fastq format. For each file, use -q, e.g., -q R1.fastq -q R2.fastq")    
    parser.add_option("-d","--reference_database",
                       action="store",
		       default=None,
                       dest="reference_database",
                       help="reference database")
    parser.add_option("-o","--output_name",
                       action="store",
                       dest="output_name",
		       default=None,
                       help="name under which the files are stored")
    parser.add_option("-t","--threads",
                       action="store",
                       dest="bwa_threads",
                       default=8,
                       help="number of threads to use in bwa")

    if len(sys.argv) == 1:
        parser.parse_args(['--help'])

    options, args = parser.parse_args()

    if options.reference_database==None or options.output_name==None:
    	print 'ERROR - missing arguments (possibly -o, -d)'
	print 'Exiting.'
        sys.exit(1)	

    PROG_BWA=which('bwa')
    PROG_SAMTOOLS=which('samtools')
    PROG_BEDTOOLS=which('bedtools')

    print "This is METAmock v0.1. Copyright (c) 2013 Computational Microbial Genomics Group, University of Glasgow, UK"
    #check programs 
    print_prog_status("bwa",PROG_BWA)
    print_prog_status("samtools",PROG_SAMTOOLS)
    print_prog_status("bedtools",PROG_BEDTOOLS)

    if os.path.exists(options.output_name+'.sam') != True:
    	if len(options.query_reads)==2:
		print_time_stamp('Aligning '+ os.path.basename(options.query_reads[0])+ " and "+os.path.basename(options.query_reads[1])+' against '+os.path.basename(options.reference_database) + ' using bwa on '+str(options.bwa_threads)+ ' threads')
    		cmd='bwa mem -t '+str(options.bwa_threads)+' '+options.reference_database+' '+' '.join(options.query_reads)+' > '+options.output_name+'.sam'
		run_prog(cmd)
		print_time_stamp('Generated '+options.output_name+'.sam'+' successfully!')
	else:
		print 'Pipeline is only tested for paired-end reads so exiting now to avoid problems in downstream analysis'
		sys.exit(1)

    if os.path.exists(options.output_name+'.bam') != True:
    	print_time_stamp('Converting '+options.output_name+'.sam'+' to '+options.output_name+'.bam')
	cmd='samtools view -h -b -S '+options.output_name+'.sam'+' > '+options.output_name+'.bam'
        run_prog(cmd)
	print_time_stamp('Generated '+options.output_name+'.bam'+' successfully!')

    if os.path.exists(options.output_name+'_mapped.bam') != True:
	print_time_stamp('Extracting mapped reads from bam file')
        cmd='samtools view -b -F 4 '+ options.output_name+'.bam' + ' > '+ options.output_name+'_mapped.bam'
        run_prog(cmd)
	print_time_stamp('Generated '+options.output_name+'_mapped.bam'+' successfully!')

    if os.path.exists('lengths.genome') != True:
        print_time_stamp('Extracting length of reference genomes')
	cmd='samtools view -H '+options.output_name+'_mapped.bam'+' | perl -ne \'if ($_ =~ m/^\@SQ/) { print $_ }\' | perl -ne \'if ($_ =~ m/SN:(.+)\s+LN:(\d+)/) { print $1, \"\\t\", $2, \"\\n\"}\' > lengths.genome'
        run_prog(cmd)
        print_time_stamp('Generated lengths.genome successfully!')

    if os.path.exists(options.output_name+'_mapped_sorted.bam') != True:
        print_time_stamp('Sorting mapped bam file')
	cmd='samtools sort -m 1000000000 '+options.output_name+'_mapped.bam'+' '+ options.output_name+'_mapped_sorted'
        run_prog(cmd)
        print_time_stamp('Generated '+options.output_name+'_mapped_sorted.bam'+' successfully!')

    if os.path.exists(options.output_name+'_perbasecoverage.tsv') != True:
        print_time_stamp('Generating per-base coverage for each reference genome')
        cmd='bedtools genomecov -ibam '+options.output_name+'_mapped_sorted.bam'+' -d -g lengths.genome  > '+options.output_name+'_perbasecoverage.tsv'
	run_prog(cmd)
        print_time_stamp('Generated '+options.output_name+'_perbasecoverage.tsv'+' successfully!')

    print_time_stamp('Generating pipeline statistics')
    genomes_length={}
    with open("lengths.genome") as f:
        for line in f:
                line=line.rstrip()
                d=line.split('\t')
                genomes_length[d[0]]=int(d[1])

    mean_genome_coverage={}
    proportion_genome_covered={}	
    with Timer() as t:
	with open(options.output_name+'_perbasecoverage.tsv') as f:
        	for line in f:
                	line=line.rstrip()
                	d=line.split('\t')		
			mean_genome_coverage[d[0]]=int(d[2])+mean_genome_coverage.get(d[0],0)
			proportion_genome_covered[d[0]]=int(int(d[2])>0)+proportion_genome_covered.get(d[0],0)

    p = subprocess.Popen('samtools view -c '+options.output_name+'.bam', shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    total_reads=None
    for line in iter(p.stdout.readline, b''):
    	total_reads=line.rstrip()
    retval=p.wait()
    cmd='samtools view '+options.output_name+'_mapped_sorted.bam'+' | awk \'{print $3}\' | sort | uniq -c | awk \'{print $2\"\\t\"$1}\' >'+options.output_name+'_readscount.tsv'
    run_prog(cmd)

    proportion_genome_reads=read_dict_file_tsv(options.output_name+'_readscount.tsv')	
    for key in genomes_length.keys():
	mean_genome_coverage[key]=float(mean_genome_coverage[key])/float(genomes_length[key])
        proportion_genome_covered[key]=float(proportion_genome_covered[key])/float(genomes_length[key])
	proportion_genome_reads[key]=float(proportion_genome_reads[key])/float(total_reads)

    write_dict_file_tsv(mean_genome_coverage,options.output_name+'_MEAN-GENOME-COVERAGE.tsv')
    write_dict_file_tsv(proportion_genome_covered,options.output_name+'_PROPORTION-GENOME-COVERED.tsv')
    write_dict_file_tsv(proportion_genome_reads,options.output_name+'_PROPORTION-GENOME-READS.tsv')

    print_time_stamp('Following files were generated successfully:')
    print(options.output_name+'_MEAN-GENOME-COVERAGE.tsv')
    print(options.output_name+'_PROPORTION-GENOME-COVERED.tsv')
    print(options.output_name+'_PROPORTION-GENOME-READS.tsv')


if __name__ == '__main__':
    main()