#!/usr/bin/python
# ***************************************************************
# Name:      blast_concat_name_taxa.py
# Purpose:   This scripts takes a blast output file, extracts gid
#	     finds the associated organism name and appends that along with taxa id
#	     at the end of each record
#
#            This script can filter blast files generated through the
#            the following command:
#
#            blastn -db <path_to_nt> -query <fasta_file>
#            -out <blast_file> -outfmt "6" -num_threads 8
#            -max_target_seqs 100/1/500
#
# Version:   0.1
# Authors:   Umer Zeeshan Ijaz (Umer.Ijaz@glasgow.ac.uk)
#                 http://userweb.eng.gla.ac.uk/umer.ijaz
# Created:   2012-11-27
# License:   Copyright (c) 2012 Computational Microbial Genomics Group, University of Glasgow, UK
#
#            This program is free software: you can redistribute it and/or modify
#            it under the terms of the GNU General Public License as published by
#            the Free Software Foundation, either version 3 of the License, or
#            (at your option) any later version.
#
#            This program is distributed in the hope that it will be useful,
#            but WITHOUT ANY WARRANTY; without even the implied warranty of
#            MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#            GNU General Public License for more details.
#
#            You should have received a copy of the GNU General Public License
#            along with this program.  If not, see <http://www.gnu.org/licenses/>.
# **************************************************************/     

import re,urllib,sys,time,getopt
from xml.dom import minidom

def usage():
	print 'Usage:'
	print '\tpython blast_concat_name_taxa.py -b <blast_file> > <output_file>'
def main(argv):
	blast_file=''
	output_flag=1
	try:
		opts, args = getopt.getopt(argv,"hb:",["blast_file="])
	except getopt.GetoptError:
		usage()
		exit(2)
	for opt, arg in opts:
		if opt == '-h':
			usage()
			exit()
      		elif opt in ("-b", "--blast_file"):
        		blast_file = arg
	if blast_file=="":
		usage()
		exit()			
    
	params={
		'db':'nucleotide',
    	}

	#check if it is a single record or a file
	if re.findall(r'gi\|(\w.*?)\|',blast_file):
		title=''
		taxid=''
		gid=int(blast_file.split("|")[1])
		params['id']=gid
		url='http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?'+ urllib.urlencode(params)
		data=urllib.urlopen(url).read()
		#parse XML output 
		xmldoc=minidom.parseString(data);
		for ind in range(0,xmldoc.getElementsByTagName('Item').length-1):
			if xmldoc.getElementsByTagName('Item')[ind].attributes.item(1).value=='Title':
				title=xmldoc.getElementsByTagName('Item')[ind].childNodes[0].nodeValue
			if xmldoc.getElementsByTagName('Item')[ind].attributes.item(1).value=='TaxId':
				taxid=xmldoc.getElementsByTagName('Item')[ind].childNodes[0].nodeValue
		if output_flag==1:
			print blast_file.rstrip('\n')+"\t"+re.sub('\s+','_',title)+"\t"+taxid
		else:
			print blast_file.rstrip('\n')+"\t"+re.sub('\s+','_',title).split("_")[0]+"_"+re.sub('\s+','_',title).split("_")[1]+"\t"+taxid
	else:
		ins=open(blast_file,"r")	
		for line in ins:
               		title=''
                	taxid=''
    			gid=int(line.split("|")[1])
    			params['id']=gid
    			url='http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?'+ urllib.urlencode(params)
    			data=urllib.urlopen(url).read()
			#parse XML output 
    			xmldoc=minidom.parseString(data);
			for ind in range(0,xmldoc.getElementsByTagName('Item').length-1):
				if xmldoc.getElementsByTagName('Item')[ind].attributes.item(1).value=='Title':
					title=xmldoc.getElementsByTagName('Item')[ind].childNodes[0].nodeValue
                                if xmldoc.getElementsByTagName('Item')[ind].attributes.item(1).value=='TaxId':
                                        taxid=xmldoc.getElementsByTagName('Item')[ind].childNodes[0].nodeValue			
			if output_flag==1:
				print line.rstrip('\n')+"\t"+re.sub('\s+','_',title)+"\t"+taxid
			else:
				print line.rstrip('\n')+"\t"+re.sub('\s+','_',title).split("_")[0]+"_"+re.sub('\s+','_',title).split("_")[1]+"\t"+taxid
		ins.close()

if __name__ == '__main__':
	main(sys.argv[1:])