#!/usr/bin/python # *************************************************************** # Name: blast_concat_name_taxa.py # Purpose: This scripts takes a blast output file, extracts gid # finds the associated organism name and appends that along with taxa id # at the end of each record # # This script can filter blast files generated through the # the following command: # # blastn -db -query # -out -outfmt "6" -num_threads 8 # -max_target_seqs 100/1/500 # # Version: 0.1 # Authors: Umer Zeeshan Ijaz (Umer.Ijaz@glasgow.ac.uk) # http://userweb.eng.gla.ac.uk/umer.ijaz # Created: 2012-11-27 # License: Copyright (c) 2012 Computational Microbial Genomics Group, University of Glasgow, UK # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . # **************************************************************/ import re,urllib,sys,time,getopt from xml.dom import minidom def usage(): print 'Usage:' print '\tpython blast_concat_name_taxa.py -b > ' def main(argv): blast_file='' output_flag=1 try: opts, args = getopt.getopt(argv,"hb:",["blast_file="]) except getopt.GetoptError: usage() exit(2) for opt, arg in opts: if opt == '-h': usage() exit() elif opt in ("-b", "--blast_file"): blast_file = arg if blast_file=="": usage() exit() params={ 'db':'nucleotide', } #check if it is a single record or a file if re.findall(r'gi\|(\w.*?)\|',blast_file): title='' taxid='' gid=int(blast_file.split("|")[1]) params['id']=gid url='http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?'+ urllib.urlencode(params) data=urllib.urlopen(url).read() #parse XML output xmldoc=minidom.parseString(data); for ind in range(0,xmldoc.getElementsByTagName('Item').length-1): if xmldoc.getElementsByTagName('Item')[ind].attributes.item(1).value=='Title': title=xmldoc.getElementsByTagName('Item')[ind].childNodes[0].nodeValue if xmldoc.getElementsByTagName('Item')[ind].attributes.item(1).value=='TaxId': taxid=xmldoc.getElementsByTagName('Item')[ind].childNodes[0].nodeValue if output_flag==1: print blast_file.rstrip('\n')+"\t"+re.sub('\s+','_',title)+"\t"+taxid else: print blast_file.rstrip('\n')+"\t"+re.sub('\s+','_',title).split("_")[0]+"_"+re.sub('\s+','_',title).split("_")[1]+"\t"+taxid else: ins=open(blast_file,"r") for line in ins: title='' taxid='' gid=int(line.split("|")[1]) params['id']=gid url='http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?'+ urllib.urlencode(params) data=urllib.urlopen(url).read() #parse XML output xmldoc=minidom.parseString(data); for ind in range(0,xmldoc.getElementsByTagName('Item').length-1): if xmldoc.getElementsByTagName('Item')[ind].attributes.item(1).value=='Title': title=xmldoc.getElementsByTagName('Item')[ind].childNodes[0].nodeValue if xmldoc.getElementsByTagName('Item')[ind].attributes.item(1).value=='TaxId': taxid=xmldoc.getElementsByTagName('Item')[ind].childNodes[0].nodeValue if output_flag==1: print line.rstrip('\n')+"\t"+re.sub('\s+','_',title)+"\t"+taxid else: print line.rstrip('\n')+"\t"+re.sub('\s+','_',title).split("_")[0]+"_"+re.sub('\s+','_',title).split("_")[1]+"\t"+taxid ins.close() if __name__ == '__main__': main(sys.argv[1:])