#!/usr/bin/python
# ***************************************************************
# Name:      AMPLImock
# Purpose:   Amplicon analysis pipeline that maps reads against
#	     the reference database and generates statistics
# Version:   0.1
# Authors:   Umer Zeeshan Ijaz (Umer.Ijaz@glasgow.ac.uk)
#                 http://userweb.eng.gla.ac.uk/umer.ijaz
# Created:   2012-08-18
# License:   Copyright (c) 2013 Computational Microbial Genomics Group, University of Glasgow, UK
#
#            This program is free software: you can redistribute it and/or modify
#            it under the terms of the GNU General Public License as published by
#            the Free Software Foundation, either version 3 of the License, or
#            (at your option) any later version.
#
#            This program is distributed in the hope that it will be useful,
#            but WITHOUT ANY WARRANTY; without even the implied warranty of
#            MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#            GNU General Public License for more details.
#
#            You should have received a copy of the GNU General Public License
#            along with this program.  If not, see <http://www.gnu.org/licenses/>.
# **************************************************************/
import os,sys,os.path
import time,datetime
import subprocess
from optparse import OptionParser
from optparse import Option, OptionValueError
import csv
import re
import pylab
from Bio import SeqIO
from Bio.SeqUtils import GC
import math

import multiprocessing
from textwrap import dedent
from itertools import izip_longest

#global dict variable to store values
pipeline_statistics={}

#Ref:http://stackoverflow.com/questions/5319922/python-check-if-word-is-in-a-string
def findWholeWord(w):
    #return re.compile(r'\b({0})\b'.format(w), flags=re.IGNORECASE).search
    return re.compile(r'({0})'.format(w), flags=re.IGNORECASE).search	

#Ref:http://www.ngcrawford.com/2012/03/29/python-multiprocessing-large-files/
def process_chunk(d):
	"""Replace this with your own function that processes data one line at a time"""
 	d = d.strip()
	return d
 
def grouper(n, iterable, padvalue=None):
	"""grouper(3, 'abcdefg', 'x') -->('a','b','c'), ('d','e','f'), ('g','x','x')"""
	return izip_longest(*[iter(iterable)]*n, fillvalue=padvalue)


#Ref:http://preshing.com/20110924/timing-your-code-using-pythons-with-statement
class Timer:    
    def __enter__(self):
        self.start = time.clock()
        return self

    def __exit__(self, *args):
        self.end = time.clock()
        self.interval = self.end - self.start

#Ref:http://stackoverflow.com/questions/4109436/processing-multiple-values-for-one-single-option-using-getopt-optparse
class MultipleOption(Option):
    ACTIONS = Option.ACTIONS + ("extend",)
    STORE_ACTIONS = Option.STORE_ACTIONS + ("extend",)
    TYPED_ACTIONS = Option.TYPED_ACTIONS + ("extend",)
    ALWAYS_TYPED_ACTIONS = Option.ALWAYS_TYPED_ACTIONS + ("extend",)

    def take_action(self, action, dest, opt, value, values, parser):
        if action == "extend":
            values.ensure_value(dest, []).append(value)
        else:
            Option.take_action(self, action, dest, opt, value, values, parser)

def print_time_stamp(message):
    print datetime.datetime.fromtimestamp(time.time()).strftime('[%Y-%m-%d %H:%M:%S] ')+message

#check if the program exists and return it's location, otherwise return None
def which(program):
    def is_exe(fpath):
        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)

    fpath, fname = os.path.split(program)
    if fpath:
        if is_exe(program):
            return program
    else:
        for path in os.environ["PATH"].split(os.pathsep):
            path = path.strip('"')
            exe_file = os.path.join(path, program)
            if is_exe(exe_file):
                return exe_file

    return None

def print_cond(cond,true_message,false_message):
    if cond:
    	print true_message
    else:
	print false_message	

def print_prog_status(prog,loc):
    if loc != None:
	print 'Checking for \'' + prog + '\': found '+ loc
    else:
	print 'Checking for \'' + prog + '\': ERROR - could not find \''+prog+'\''
	print 'Exiting.'
	sys.exit(1)
def run_prog(prog):
#    p = subprocess.Popen(prog, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    p = subprocess.Popen(prog, shell=True, stdout=None, stderr=None)	
#    for line in iter(p.stdout.readline, b''):
#    	print(">>> " + line.rstrip())
    retval = p.wait()

def write_dict_file_tsv(mydict,filename):
    writer = csv.writer(open(filename, 'wb'),delimiter='\t')
    for key, value in mydict.items():
    	writer.writerow([key,value])	

def read_dict_file_tsv(filename):
    reader = csv.reader(open(filename, 'rb'),delimiter='\t')
    mydict = dict(x for x in reader)
    return mydict

def write_dict_file_csv(mydict,filename):
    writer = csv.writer(open(filename, 'wb'),delimiter=',')
    for key, value in mydict.items():
        writer.writerow([key,value])

def read_dict_file_csv(filename):
    reader = csv.reader(open(filename, 'rb'),delimiter=',')
    mydict = dict(x for x in reader)
    return mydict

def extract_filename(filename,returnCompletePath):
    root, ext = os.path.splitext(filename)
    if returnCompletePath:
	return root
    else:
	return os.path.basename(root)

#convert fastq files to fasta
def convert_fastq_to_fasta(input_file_name,output_file_name):
    print_time_stamp('Converting '+input_file_name+' to '+output_file_name)
    SeqIO.convert(input_file_name, "fastq", output_file_name, "fasta")
    print_time_stamp('Generated '+output_file_name)	

#paired-end reads quality trimming using sickle
def quality_trim_reads_pe(input_paired_end_1,input_paired_end_2,output_paired_end_1,output_paired_end_2,output_singlet,quality_type):
    print_time_stamp('Trimming '+input_paired_end_1+' and '+input_paired_end_2+' using quality type as \''+quality_type+'\', quality score of 20, and minimum of 10 bp to keep')
    cmd='sickle pe -f '+input_paired_end_1+' -r '+input_paired_end_2+' -t '+quality_type+' -o '+output_paired_end_1+' -p '+output_paired_end_2+' -s '+output_singlet+' -q 20 -l 10'
    run_prog(cmd)
    print_time_stamp('Generated '+output_paired_end_1+' ,'+output_paired_end_2+' ,and '+output_singlet)	

#single-end reads quality trimming using sickle
def quality_trim_reads_se(input_single_end,output_single_end,quality_type):
    print_time_stamp('Trimming '+input_single_end+' using quality type as \''+quality_type+'\', quality score of 20, and minimum of 10 bp to keep')
    cmd='sickle se -f '+input_single_end+' -t '+quality_type+' -o '+output_single_end+' -q 20 -l 10'
    run_prog(cmd)
    print_time_stamp('Generated '+output_single_end)

#phred quality score plot
def generate_quality_plots(input_fastq_files,output_png_image):
    pylab.clf()
    print_time_stamp('Plotting PHRED quality scores for '+' and '.join(input_fastq_files))
    for k in xrange(len(input_fastq_files)):
    	filename = input_fastq_files[k]
  	pylab.subplot(1, len(input_fastq_files), k)
  	for i,record in enumerate(SeqIO.parse(filename, "fastq")):
      		if i >= 50 : break #trick!
      	pylab.plot(record.letter_annotations["phred_quality"])
  	pylab.ylim(0,45)
  	pylab.ylabel("PHRED quality score")
  	pylab.xlabel("Position")
    pylab.savefig(output_png_image)
    print_time_stamp('Generated '+output_png_image)

#length histogram 
def generate_length_histogram(input_fasta_file,output_png_image):
    pylab.clf()
    print_time_stamp('Plotting sequences lengths histogram for '+input_fasta_file)	
    sizes = [len(rec) for rec in SeqIO.parse(input_fasta_file, "fasta")]
    pylab.hist(sizes, bins=20)
    pylab.title("%i sequences\nLengths %i to %i" \
          % (len(sizes),min(sizes),max(sizes)))
    pylab.xlabel("Sequence length (bp)")
    pylab.ylabel("Count")
    pylab.savefig(output_png_image)
    print_time_stamp('Generated '+output_png_image)			    

#GC percentage plot
def generate_GC_perc_plot(input_fasta_file,output_png_image):
    pylab.clf()
    print_time_stamp('Plotting GC percentage plot for '+input_fasta_file)
    gc_values = sorted(GC(rec.seq) for rec in SeqIO.parse(input_fasta_file, "fasta"))	
    pylab.plot(gc_values)
    pylab.title("%i sequences\nGC%% %0.1f to %0.1f" \
          % (len(gc_values),min(gc_values),max(gc_values)))
    pylab.xlabel("Sequences")
    pylab.ylabel("GC%")
    pylab.savefig(output_png_image)
    print_time_stamp('Generated '+output_png_image)

def usearch(input_fasta_file,reference_database,matching_id,alnout_file,userout_file,output_matched_fasta_file,output_notmatched_fasta_file,threads):
    print_time_stamp('Searching '+input_fasta_file+ ' against '+reference_database+ ' using usearch on '+str(threads)+ ' threads and matching identity of ' + str(matching_id))
    cmd='usearch -usearch_global '+input_fasta_file+' -db '+reference_database+' -id '+str(matching_id)+' -alnout '+alnout_file+' -strand plus -userout '+userout_file+' -userfields '+ 'query+target+id+alnlen+mism+opens+qlo+qhi+tlo+thi+evalue+bits -matched '+output_matched_fasta_file+' -notmatched '+output_notmatched_fasta_file + ' -threads '+str(threads)
    run_prog(cmd)
    print_time_stamp('Generated '+alnout_file+ ', '+userout_file+', '+output_matched_fasta_file+', and '+output_notmatched_fasta_file) 	

def usearch_both(input_fasta_file,reference_database,matching_id,alnout_file,userout_file,output_matched_fasta_file,output_notmatched_fasta_file,threads):
    print_time_stamp('Searching '+input_fasta_file+ ' against '+reference_database+ ' using usearch on '+str(threads)+ ' threads and matching identity of ' + str(matching_id))
    cmd='usearch -usearch_global '+input_fasta_file+' -db '+reference_database+' -id '+str(matching_id)+' -alnout '+alnout_file+' -strand both -userout '+userout_file+' -userfields '+ 'query+target+id+alnlen+mism+opens+qlo+qhi+tlo+thi+evalue+bits -matched '+output_matched_fasta_file+' -notmatched '+output_notmatched_fasta_file + ' -threads '+str(threads)
    run_prog(cmd)
    print_time_stamp('Generated '+alnout_file+ ', '+userout_file+', '+output_matched_fasta_file+', and '+output_notmatched_fasta_file)
	
def fastq_statistics(fastq_file,prefix_string):
    cmd='cat '+fastq_file+' | awk \'((NR-2)%4==0){read=$1;total++;count[read]++}END{for(read in count){if(!max||count[read]>max) {max=count[read];maxRead=read};if(count[read]==1){unique++}};print total,unique,unique*100/total,maxRead,count[maxRead],count[maxRead]*100/total}\''
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    record=None
    for line in iter(p.stdout.readline, b''):
        record=line.rstrip()
    retval=p.wait()
    d=record.split()		
    pipeline_statistics[prefix_string+'_TOTAL_READS']=d[0]
    pipeline_statistics[prefix_string+'_UNIQUE_READS']=d[1]
    pipeline_statistics[prefix_string+'_UNIQUE_READS_PERCENTAGE']=d[2]			
    pipeline_statistics[prefix_string+'_MOST_ABUNDANT_SEQUENCE']=d[3]
    pipeline_statistics[prefix_string+'_MOST_ABUNDANT_SEQUENCE_READS']=d[4]
    pipeline_statistics[prefix_string+'_MOST_ABUNDANT_SEQUENCE_READS_PERCENTAGE']=d[5]
    print_time_stamp(fastq_file+' has following stats:')
    print 'Total reads:'+d[0]
    print 'Unique reads:'+d[1]
    print 'Unique reads percentage:'+d[2]
    print 'Most abundant sequence:'+d[3]
    print 'Most abundant sequence reads:'+d[4]
    print 'Most abundant sequence read percentage:'+d[5]	
		

def fasta_statistics(fasta_file,prefix_string,has_size_header):
    if has_size_header:
    	cmd='awk -F\"size=\" \'/>/{gsub(\";$\",\"\",$2);sum+=$2} END {print sum}\' '+fasta_file
    else:
	cmd='grep -c \">\" '+fasta_file
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    record=None
    for line in iter(p.stdout.readline, b''):
        record=line.rstrip()
    retval=p.wait()
    pipeline_statistics[prefix_string+'_TOTAL_READS']=record	
    print_time_stamp(fasta_file+' has '+record+' reads')


def usearch_statistics(userout_file,mapping_dict,prefix_string):
    print_time_stamp('Computing usearch statistics for '+userout_file)
    freq_dict={}	
    total_records=0	
    mean_identity=0.0
    with open(userout_file) as f:
                for line in f:
                        line=line.rstrip()
                        d=line.split('\t')
			size_match=re.findall(r';size=(\w.*?);',d[0])
			res=mapping_dict.get(d[1], None)
			if res==None:
				if size_match:
					freq_dict[d[1]]=int(size_match[0])+freq_dict.get(d[1],0)
				else:
					freq_dict[d[1]]=1+freq_dict.get(d[1],0)	
			else:
				if size_match:
					freq_dict[res]=int(size_match[0])+freq_dict.get(res,0)
				else:
					freq_dict[res]=1+freq_dict.get(res,0)
			if size_match:
				total_records=total_records+int(size_match[0])
				mean_identity=mean_identity+(float(size_match[0])*float(d[2]))
			else:
                                total_records=total_records+1
                                mean_identity=mean_identity+float(d[2])				

    mean_identity=mean_identity/total_records	
    entropy=0.0
    simpson=1.0
    inverse_simpson=0.0;
    for key in freq_dict.keys():
	if freq_dict[key]>0:
		freq_dict[key]=float(freq_dict[key])/float(total_records)
		entropy=entropy+(-freq_dict[key]*math.log(freq_dict[key]))
		simpson=simpson-math.pow(freq_dict[key],2)
		inverse_simpson=inverse_simpson+math.pow(freq_dict[key],2)
    inverse_simpson=1/inverse_simpson		    
    write_dict_file_csv(freq_dict,extract_filename(userout_file,False)+'_freq.csv')
    print_time_stamp('Generated '+extract_filename(userout_file,False)+'_freq.csv. Mean identity is '+str(mean_identity)+', simpson index is '+str(simpson)+ ', inverse simpson is '+str(inverse_simpson) + ', entropy is '+str(entropy))
    pipeline_statistics[prefix_string+'_MEAN_IDENT']=str(mean_identity)
    pipeline_statistics[prefix_string+'_SIMPSON']=str(simpson)
    pipeline_statistics[prefix_string+'_INVERSE_SIMPSON']=str(inverse_simpson)
    pipeline_statistics[prefix_string+'_ENTROPY']=str(entropy)	


def usearch_derep_remove_chimeras(input_fasta_file,threads,output_nonchimera_fasta_file,output_chimera_fasta_file,flag):
    if flag==0:
    	print_time_stamp('Dereplicating duplicate sequences in '+input_fasta_file+' using usearch on '+str(threads)+' threads, annotating with cluster sizes, and sorting by cluster size')
    	cmd='usearch -derep_fulllength '+input_fasta_file+' -sizeout -output '+extract_filename(input_fasta_file,False)+'_derep.fasta'+' -threads '+str(threads)
    	run_prog(cmd)
    	print_time_stamp('Generated '+extract_filename(input_fasta_file,False)+'_derep.fasta')
    else:
	print_time_stamp('Copying reconstructed sequences to '+extract_filename(input_fasta_file,False)+'_derep.fasta')
	cmd='cp '+input_fasta_file+' '+extract_filename(input_fasta_file,False)+'_derep.fasta'
	run_prog(cmd)
	print_time_stamp('Generated '+extract_filename(input_fasta_file,False)+'_derep.fasta')

    print_time_stamp('Sorting reads in '+extract_filename(input_fasta_file,False)+'_derep.fasta'+' by abundance using usearch')
    cmd='usearch -sortbysize '+extract_filename(input_fasta_file,False)+'_derep.fasta' +' -output '+extract_filename(input_fasta_file,False)+'_derep_sorted.fasta'
    run_prog(cmd)	
    print_time_stamp('Generated '+extract_filename(input_fasta_file,False)+'_derep_sorted.fasta')
    print_time_stamp('Detecting De novo chimeras in '+ extract_filename(input_fasta_file,False)+'_derep_sorted.fasta' + '  using UCHIME')
    cmd='usearch -uchime_denovo '+extract_filename(input_fasta_file,False)+'_derep_sorted.fasta'+ ' -nonchimeras '+output_nonchimera_fasta_file+ ' -chimeras '+output_chimera_fasta_file
    run_prog(cmd)
    print_time_stamp('Generated '+output_nonchimera_fasta_file+' and '+output_chimera_fasta_file)		 	

def pandaseq(forward_fastq_file,reverse_fastq_file,forward_primer,reverse_primer,overlap_base_pairs,eliminate_n,threads,overlap_fastq_file):
    if forward_primer==None or reverse_primer==None:
	if eliminate_n:
		print_time_stamp('Running pandaseq on '+str(threads)+' threads with eliminating undetermined nucleotide sequences in overlap and minimum overlap of '+str(overlap_base_pairs)+'bps')
		cmd='pandaseq -f '+forward_fastq_file+' -r '+reverse_fastq_file+' -B -F -d bfsrk -N -o '+str(overlap_base_pairs)+ ' -T '+str(threads)+' > '+overlap_fastq_file
		run_prog(cmd)	
	else:
		print_time_stamp('Running pandaseq on '+str(threads)+' threads without eliminating undetermined nucleotide sequences in overlap and minimum overlap of '+str(overlap_base_pairs)+'bps')
                cmd='pandaseq -f '+forward_fastq_file+' -r '+reverse_fastq_file+' -B -F -d bfsrk -o '+str(overlap_base_pairs)+ ' -T '+str(threads)+' > '+overlap_fastq_file
                run_prog(cmd)		
    else:
	if eliminate_n:
		print_time_stamp('Running pandaseq on '+str(threads)+' threads with eliminating undetermined nucleotide sequences in overlap, minimum overlap of '+str(overlap_base_pairs)+'bps ,'+forward_primer+ ' as forward primer, and '+reverse_primer+ ' as reverse primer')
                cmd='pandaseq -f '+forward_fastq_file+' -r '+reverse_fastq_file+' -p '+forward_primer+' -q '+reverse_primer+' -B -F -d bfsrk -N -o '+str(overlap_base_pairs)+ ' -T '+str(threads)+' > '+overlap_fastq_file
                run_prog(cmd)
	else:   
		print_time_stamp('Running pandaseq on '+str(threads)+' threads without eliminating undetermined nucleotide sequences in overlap, minimum overlap of '+str(overlap_base_pairs)+'bps ,'+forward_primer+ ' as forward primer, and '+reverse_primer+ ' as reverse primer') 
                cmd='pandaseq -f '+forward_fastq_file+' -r '+reverse_fastq_file+' -p '+forward_primer+' -q '+reverse_primer+' -B -F -d bfsrk -o '+str(overlap_base_pairs)+ ' -T '+str(threads)+' > '+overlap_fastq_file
                run_prog(cmd)	
    print_time_stamp('Generated '+overlap_fastq_file)

def main():
    parser = OptionParser(option_class=MultipleOption,usage="usage: %prog [options] filename",
                          version="%prog 0.1")
    parser.add_option("-q","--query_reads",
                       action="append",
                       dest="query_reads",
                       help="specify single or paired-end reads in fasta/fastq format. For each file, use -q, e.g., -q R1.fastq -q R2.fastq")    
    parser.add_option("-p","--primers",
                       action="append",
                       dest="primers",
                       help="specify forward and reverse primers, for each primer use -p, e.g., -p CAGTGTCAGTTGA -p TCATATCTAATCTATATCGA. The first primer will be considered the forward primer and the second primer will be considered the reverse primer")
    parser.add_option("-d","--reference_database_forward",
                       action="store",
		       default=None,
                       dest="reference_database_forward",
                       help="reference database forward")
    parser.add_option("-e","--reference_database_reverse",
                       action="store",
                       default=None,
                       dest="reference_database_reverse",
                       help="reference database reverse")
    parser.add_option("-l","--list_file",
                       action="store",
                       default=None,
                       dest="list_file",
                       help="list file that contains the mapping from multiple 16S copies to genome")
    parser.add_option("-f","--full_length_amplicons_file",
                       action="store",
                       default=None,
                       dest="full_length_amplicons_file",
                       help="sequence file that contains reconstructed full length amplicons using 3rd party software such as emirge")
    parser.add_option("-t","--quality_type",
                       action="store",
                       default="sanger",
                       dest="quality_type",
                       help="Type of quality values (solexa (CASAVA < 1.3), illumina (CASAVA 1.3 to 1.7), sanger (which is CASAVA >= 1.8) (Default:sanger)")
    parser.add_option("-u","--use_quality_trimming",
                       action="store_true",
                       default=False,
                       dest="use_quality_trimming",
                       help="flag to allow quality trimming for fastq files (Default:False)")
    parser.add_option("-n","--eliminate_n",
                       action="store_true",
                       default=False,
                       dest="eliminate_n",
                       help="flag to eliminate all sequences with unknown nucleotides in the output of pandaseq (Default:False)")
    parser.add_option("-o","--overlap_base_pairs",
                       action="store",
                       dest="overlap_base_pairs",
		       default=50,
                       help="minimum number of overlapping base pairs to consider (Default:50)")
    parser.add_option("-s","--threads",
                       action="store",
                       dest="threads",
                       default=10,
                       help="number of threads to use in usearch and pandaseq (Default:10)")	


    if len(sys.argv) == 1:
        parser.parse_args(['--help'])

    options, args = parser.parse_args()


    PROG_SICKLE=which('sickle')
    PROG_USEARCH=which('usearch')
    PROG_PANDASEQ=which('pandaseq')
    PROG_FASTQC=which('fastqc')

    print "This is AMPLImock v0.1. Copyright (c) 2013 Computational Microbial Genomics Group, University of Glasgow, UK"
    #check programs 
    print_prog_status("sickle",PROG_SICKLE)
    print_prog_status("usearch",PROG_USEARCH)
    print_prog_status("pandaseq",PROG_PANDASEQ)

    #program parameters
    AMPLIMOCK_DIR = os.path.dirname(os.path.realpath(__file__)) + os.sep
    CURRENT_DIR = os.getcwd()	
    PAIRED_END_1=None
    PANDASEQ_PAIRED_END_1=None
    PAIRED_END_1_IS_FASTA=None
    PAIRED_END_1_IS_TRIMMED=None
    PAIRED_END_2=None
    PANDASEQ_PAIRED_END_2=None
    PAIRED_END_2_IS_FASTA=None
    PAIRED_END_2_IS_TRIMMED=None
    SINGLE_END=None
    SINGLE_END_IS_FASTA=None
    SINGLE_END_IS_TRIMMED=None	
    PERFORM_OVERLAPPING=None
    PERFORM_QUALITY_TRIMMING=None
    ELIMINATE_N=None
    MAPPING_FILE=None
    FORWARD_REFERENCE=None
    REVERSE_REFERENCE=None
    QUALITY_TYPE=None
    FORWARD_PRIMER=None
    REVERSE_PRIMER=None
    FULL_LENGTH_AMPLICONS_FILE=None
    OVERLAP_BASE_PAIRS=None
    OVERLAP_FILE=None
    THREADS=None
			
    #sanity checks and assignments	
    if options.query_reads==None:
        print 'ERROR - query reads not specified. Use -q to specify them'
        print 'Exiting.'
        sys.exit(1)    
    
    if options.reference_database_forward==None:
        print 'ERROR - forward reference database not specified. Use -d to specify it'
        print 'Exiting.'
        sys.exit(1)
    else:
	FORWARD_REFERENCE=options.reference_database_forward
	print 'Forward reference database:'+FORWARD_REFERENCE

    if options.reference_database_reverse==None:
        print 'ERROR - reverse reference database not specified. Use -e to specify it'
        print 'Exiting.'
        sys.exit(1)
    else:
        REVERSE_REFERENCE=options.reference_database_reverse
	print 'Reverse reference database:'+REVERSE_REFERENCE

    if options.list_file==None:
        print 'ERROR - list file containing mapping from multiple 16S genes not specified. Use -l to specify it'
        print 'Exiting.'
        sys.exit(1)
    else:
        MAPPING_FILE=options.list_file
        print 'List file containing mapping from multiple 16S genes:'+MAPPING_FILE
		
    if len(options.query_reads)==2:
    	PAIRED_END_1=options.query_reads[0]
	PAIRED_END_2=options.query_reads[1]
	if findWholeWord('fasta$')(PAIRED_END_1) or findWholeWord('fna$')(PAIRED_END_1) or findWholeWord('fa$')(PAIRED_END_1):
        	PAIRED_END_1_IS_FASTA=True
	else:
		PAIRED_END_1_IS_FASTA=False
        if findWholeWord('fasta$')(PAIRED_END_2) or findWholeWord('fna$')(PAIRED_END_2) or findWholeWord('fa$')(PAIRED_END_2):
                PAIRED_END_2_IS_FASTA=True
        else:
                PAIRED_END_2_IS_FASTA=False
	
	if findWholeWord("trim")(PAIRED_END_1):
                PAIRED_END_1_IS_TRIMMED=True
        else:
                PAIRED_END_1_IS_TRIMMED=False
        if findWholeWord("trim")(PAIRED_END_2):
                PAIRED_END_2_IS_TRIMMED=True
        else:
                PAIRED_END_2_IS_TRIMMED=False	
	if options.use_quality_trimming and not (PAIRED_END_1_IS_TRIMMED or PAIRED_END_2_IS_TRIMMED) and not (PAIRED_END_1_IS_FASTA or PAIRED_END_2_IS_FASTA):
        	PERFORM_QUALITY_TRIMMING=True
	else:
		PERFORM_QUALITY_TRIMMING=False
	PERFORM_OVERLAPPING=True
	OVERLAP_BASE_PAIRS=options.overlap_base_pairs

	print 'Forward reads:'+PAIRED_END_1
	print 'Reverse reads:'+PAIRED_END_2
	print_cond(PAIRED_END_1_IS_FASTA,"Query files are in fasta format","Query files are in fastq format") 
	print_cond(PERFORM_QUALITY_TRIMMING,"Query files will be trimmed","Query files will not be trimmed")
    elif len(options.query_reads)==1:
	SINGLE_END=options.query_reads[0]
        if findWholeWord('fasta$')(SINGLE_END) or findWholeWord('fna$')(SINGLE_END) or findWholeWord('fa$')(SINGLE_END):
                SINGLE_END_IS_FASTA=True
        else:
                SINGLE_END_IS_FASTA=False 
        if findWholeWord('trim')(SINGLE_END):
                SINGLE_END_IS_TRIMMED=True
        else:
                SINGLE_END_IS_TRIMMED=False
	if options.use_quality_trimming and not SINGLE_END_IS_TRIMMED and not SINGLE_END_IS_FASTA:
                PERFORM_QUALITY_TRIMMING=True
        else:
                PERFORM_QUALITY_TRIMMING=False
	PERFORM_OVERLAPPING=False

        print 'Single reads:'+SINGLE_END
        print_cond(SINGLE_END_IS_FASTA,"Query file is in fasta format","Query file is in fastq format")
	print_cond(PERFORM_QUALITY_TRIMMING,"Query file will be trimmed","Query file will not be trimmed")
    else:
    	print 'ERROR - more files than what is supported in -q'
	print 'Exiting.'
	sys.exit(1)

    if options.primers!=None:
	if len(options.primers)==2:
		FORWARD_PRIMER=options.primers[0]
		REVERSE_PRIMER=options.primers[1]
		print 'Forward primer:'+FORWARD_PRIMER
		print 'Reverse primer:'+REVERSE_PRIMER
	else:
        	print 'ERROR - only specified one primer. use -p to specify the other one'
        	print 'Exiting.'
		sys.exit(1)			

    if ((options.quality_type == 'solexa') or (options.quality_type == 'illumina') or (options.quality_type == 'sanger')):
	QUALITY_TYPE=options.quality_type
    else:
        print 'ERROR - unrecognized quality type in -t. Use sanger, solexa, or illumina'
        print 'Exiting.'
	sys.exit(1)		

    ELIMINATE_N=options.eliminate_n
    THREADS=options.threads	

    if options.full_length_amplicons_file!=None:
	FULL_LENGTH_AMPLICONS_FILE=options.full_length_amplicons_file
	PERFORM_OVERLAPPING=False
    else:
	PERFORM_OVERLAPPING=True
	OVERLAP_BASE_PAIRS=options.overlap_base_pairs

    print "Processing the data in "+CURRENT_DIR	
    mapping_dict=read_dict_file_csv(MAPPING_FILE)

    if SINGLE_END==None:	
    	if PERFORM_QUALITY_TRIMMING:
		quality_trim_reads_pe(PAIRED_END_1,PAIRED_END_2,extract_filename(PAIRED_END_1,False)+'_trim.fastq',extract_filename(PAIRED_END_2,False)+'_trim.fastq',extract_filename(PAIRED_END_1,False)+'_singlet.fastq',QUALITY_TYPE)	
		PAIRED_END_1=extract_filename(PAIRED_END_1,False)+'_trim.fastq'
		PAIRED_END_2=extract_filename(PAIRED_END_2,False)+'_trim.fastq'	

	PANDASEQ_PAIRED_END_1=PAIRED_END_1
	PANDASEQ_PAIRED_END_2=PAIRED_END_2

	if not (PAIRED_END_1_IS_FASTA or PAIRED_END_2_IS_FASTA):
        	fastq_statistics(PAIRED_END_1,'FORWARD')
        	fastq_statistics(PAIRED_END_2,'REVERSE')	
		generate_quality_plots([PAIRED_END_1],extract_filename(PAIRED_END_1,False)+'_phred-quality.png')
		generate_quality_plots([PAIRED_END_2],extract_filename(PAIRED_END_2,False)+'_phred-quality.png')
		convert_fastq_to_fasta(PAIRED_END_1,extract_filename(PAIRED_END_1,False)+'.fasta')
		convert_fastq_to_fasta(PAIRED_END_2,extract_filename(PAIRED_END_2,False)+'.fasta')
		PAIRED_END_1=extract_filename(PAIRED_END_1,False)+'.fasta'
		PAIRED_END_2=extract_filename(PAIRED_END_2,False)+'.fasta'
	
	generate_length_histogram(PAIRED_END_1,extract_filename(PAIRED_END_1,False)+'_length-histogram.png')
	generate_length_histogram(PAIRED_END_2,extract_filename(PAIRED_END_2,False)+'_length-histogram.png')
	generate_GC_perc_plot(PAIRED_END_1,extract_filename(PAIRED_END_1,False)+'_GC-percentage.png')	
	generate_GC_perc_plot(PAIRED_END_2,extract_filename(PAIRED_END_2,False)+'_GC-percentage.png')

	usearch(PAIRED_END_1,FORWARD_REFERENCE,0.95,extract_filename(PAIRED_END_1,False)+'.aln',extract_filename(PAIRED_END_1,False)+'.ualn',extract_filename(PAIRED_END_1,False)+'_matched.fasta',extract_filename(PAIRED_END_1,False)+'_notmatched.fasta',THREADS)
	fasta_statistics(extract_filename(PAIRED_END_1,False)+'_matched.fasta','FORWARD_MATCHED',False)
	fasta_statistics(extract_filename(PAIRED_END_1,False)+'_notmatched.fasta','FORWARD_NOTMATCHED',False)
	usearch(PAIRED_END_2,REVERSE_REFERENCE,0.95,extract_filename(PAIRED_END_2,False)+'.aln',extract_filename(PAIRED_END_2,False)+'.ualn',extract_filename(PAIRED_END_2,False)+'_matched.fasta',extract_filename(PAIRED_END_2,False)+'_notmatched.fasta',THREADS)
        fasta_statistics(extract_filename(PAIRED_END_2,False)+'_matched.fasta','REVERSE_MATCHED',False)
        fasta_statistics(extract_filename(PAIRED_END_2,False)+'_notmatched.fasta','REVERSE_NOTMATCHED',False)

	usearch_statistics(extract_filename(PAIRED_END_1,False)+'.ualn',mapping_dict,'FORWARD_MATCHED')
	usearch_statistics(extract_filename(PAIRED_END_2,False)+'.ualn',mapping_dict,'REVERSE_MATCHED')

	if PERFORM_OVERLAPPING and not(PAIRED_END_1_IS_FASTA or PAIRED_END_2_IS_FASTA):
		OVERLAP_FILE=extract_filename(PAIRED_END_1,False)+'_overlap.fastq'
		pandaseq(PANDASEQ_PAIRED_END_1,PANDASEQ_PAIRED_END_2,FORWARD_PRIMER,REVERSE_PRIMER,OVERLAP_BASE_PAIRS,ELIMINATE_N,THREADS,OVERLAP_FILE)	
		fastq_statistics(OVERLAP_FILE,'OVERLAP')
		convert_fastq_to_fasta(OVERLAP_FILE,extract_filename(OVERLAP_FILE,False)+'.fasta')
		OVERLAP_FILE=extract_filename(OVERLAP_FILE,False)+'.fasta'
        	generate_length_histogram(OVERLAP_FILE,extract_filename(OVERLAP_FILE,False)+'_length-histogram.png')
        	generate_GC_perc_plot(OVERLAP_FILE,extract_filename(OVERLAP_FILE,False)+'_GC-percentage.png')
        	usearch(OVERLAP_FILE,FORWARD_REFERENCE,0.95,extract_filename(OVERLAP_FILE,False)+'.aln',extract_filename(OVERLAP_FILE,False)+'.ualn',extract_filename(OVERLAP_FILE,False)+'_matched.fasta',extract_filename(OVERLAP_FILE,False)+'_notmatched.fasta',THREADS)
        	fasta_statistics(extract_filename(OVERLAP_FILE,False)+'_matched.fasta','OVERLAP_MATCHED',False)
        	fasta_statistics(extract_filename(OVERLAP_FILE,False)+'_notmatched.fasta','OVERLAP_NOTMATCHED',False)
        	usearch_statistics(extract_filename(OVERLAP_FILE,False)+'.ualn',mapping_dict,'OVERLAP_MATCHED')
		usearch_derep_remove_chimeras(OVERLAP_FILE,THREADS,extract_filename(OVERLAP_FILE,False)+'_derep_sorted_nonchimera.fasta',extract_filename(OVERLAP_FILE,False)+'_derep_sorted_chimera.fasta',0)

	else:
		OVERLAP_FILE=FULL_LENGTH_AMPLICONS_FILE
		usearch_derep_remove_chimeras(OVERLAP_FILE,THREADS,extract_filename(OVERLAP_FILE,False)+'_derep_sorted_nonchimera.fasta',extract_filename(OVERLAP_FILE,False)+'_derep_sorted_chimera.fasta',1)
	usearch(extract_filename(OVERLAP_FILE,False)+'_derep_sorted_nonchimera.fasta',FORWARD_REFERENCE,0.97,extract_filename(OVERLAP_FILE,False)+'_derep_sorted_nonchimera.aln',extract_filename(OVERLAP_FILE,False)+'_derep_sorted_nonchimera.ualn',extract_filename(OVERLAP_FILE,False)+'_derep_sorted_nonchimera_matched.fasta',extract_filename(OVERLAP_FILE,False)+'_derep_sorted_nonchimera_notmatched.fasta',THREADS)
	usearch_statistics(extract_filename(OVERLAP_FILE,False)+'_derep_sorted_nonchimera.ualn',mapping_dict,'OVERLAP_DEREP_NONCHIM_MATCHED')
        fasta_statistics(extract_filename(OVERLAP_FILE,False)+'_derep_sorted_nonchimera.fasta','OVERLAP_DEREP_NONCHIM',True)
        fasta_statistics(extract_filename(OVERLAP_FILE,False)+'_derep_sorted_chimera.fasta','OVERLAP_DEREP_CHIM',True)
        fasta_statistics(extract_filename(OVERLAP_FILE,False)+'_derep_sorted_nonchimera_matched.fasta','OVERLAP_DEREP_NONCHIM_MATCHED',True)
        fasta_statistics(extract_filename(OVERLAP_FILE,False)+'_derep_sorted_nonchimera_notmatched.fasta','OVERLAP_DEREP_NONCHIM_NOTMATCHED',True) 			
	
    else:
	if PERFORM_QUALITY_TRIMMING:
		quality_trim_reads_se(SINGLE_END,extract_filename(SINGLE_END,False)+'_trim.fastq',QUALITY_TYPE)
		SINGLE_END=extract_filename(SINGLE_END,False)+'_trim.fastq'
		generate_quality_plots([SINGLE_END],extract_filename(SINGLE_END,False)+'_phred-quality.png')
	
	if not SINGLE_END_IS_FASTA:
		fastq_statistics(SINGLE_END,'SINGLE')
		convert_fastq_to_fasta(SINGLE_END,extract_filename(SINGLE_END,False)+'.fasta')
		SINGLE_END=extract_filename(SINGLE_END,False)+'.fasta'
	else:
		fasta_statistics(SINGLE_END,'SINGLE',False)	
	generate_length_histogram(SINGLE_END,extract_filename(SINGLE_END,False)+'_length-histogram.png')	
	generate_GC_perc_plot(SINGLE_END,extract_filename(SINGLE_END,False)+'_GC-percentage.png')
	usearch_both(SINGLE_END,FORWARD_REFERENCE,0.95,extract_filename(SINGLE_END,False)+'.aln',extract_filename(SINGLE_END,False)+'.ualn',extract_filename(SINGLE_END,False)+'_matched.fasta',extract_filename(SINGLE_END,False)+'_notmatched.fasta',THREADS)
        fasta_statistics(extract_filename(SINGLE_END,False)+'_matched.fasta','SINGLE_MATCHED',False)
        fasta_statistics(extract_filename(SINGLE_END,False)+'_notmatched.fasta','SINGLE_NOTMATCHED',False)		
	usearch_statistics(extract_filename(SINGLE_END,False)+'.ualn',mapping_dict,'SINGLE')

    print_time_stamp('Saving pipeline statistics in AMPLImock_stats.csv. Here is the summary of results:')
    write_dict_file_csv(pipeline_statistics,'AMPLImock_stats.csv')	
    for key in sorted(pipeline_statistics.keys()):
	print key+','+pipeline_statistics[key]	
if __name__ == '__main__':
    main()