#!/usr/bin/python # *************************************************************** # Name: genbank_enzymes.py # Purpose: This script extracts contigs (contig_name.[gbk/fasta]) from a given genbank annotation file # that contain the enzymes provided in a list (-l switch) to the # script. The list file can have both EC numbers and enzyme # names, for example # # # 1.2.1.26 # 1.1.1.46 # 1.1.1.10 # 4.1.1.28 # catalase # # # Furthermore, it generates contig_name.pdf that displays the enzyme on a genome diagram # I am using the following scheme: # hypothetical protein <- orange boxes # any other CDS with known resolution <- light blue boxes # enzymes that match the EC numbers/names in the list <- red arrows # enzymes that dont match the EC numbers/names in the list <- green arrows # # # Please note that the script uses rest-style KEGG API (http://www.kegg.jp/kegg/docs/keggapi.html) # to search the enzyme names. # Version: 0.1 # Authors: Umer Zeeshan Ijaz (Umer.Ijaz@glasgow.ac.uk) # http://userweb.eng.gla.ac.uk/umer.ijaz # Last Modified: 2014-03-23 # License: Copyright (c) 2014 Computational Microbial Genomics Group, University of Glasgow, UK # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . # ************************************************************** import sys, getopt,os from Bio import SeqIO from reportlab.lib import colors from reportlab.lib.units import cm from Bio.Graphics import GenomeDiagram import urllib import subprocess import hashlib def run_prog(prog): p = subprocess.Popen(prog, shell=True, stdout=None, stderr=None) retval = p.wait() def usage(): print 'Usage:' print '\tpython genbank_enzymes.py -g -o -l [OPTIONS]' print '\nOptions:' print '\t-s (--font_size) Size of labels on diagrams (Default:12)' print '\t-f (--fragments) Number of fragments to split the genome diagram (Default:1)' print '\t-n (--name_length) Maximum length of names for output files (Default:10)' def main(argv): gbkfile = '' outputfolder='' listfile='' fontsize=12 fragments=1 namelength=10 try: opts, args =getopt.getopt(argv,"hg:o:l:s:f:n:",["gbk_file=","output_folder=","list_file=","font_size=","fragments=","name_length="]) except getopt.GetoptError: usage() sys.exit(2) for opt, arg in opts: if opt == '-h': usage() sys.exit() elif opt in ("-g", "--gbk_file"): gbkfile = arg elif opt in ("-o", "--output_folder"): outputfolder = arg elif opt in ("-l", "--list_file"): listfile=arg elif opt in ("-s", "--font_size"): fontsize=int(arg) elif opt in ("-f", "--fragments"): fragments=int(arg) elif opt in ("-n", "--name_length"): namelength=int(arg) if gbkfile=='' or outputfolder=='' or listfile=='': usage() sys.exit(2) ec_numbers_list=[i.strip() for i in open(listfile).readlines()] ec_numbers_list=map(lambda each:each.lower(), ec_numbers_list) if os.path.isdir(outputfolder): print "ERROR - Folder "+outputfolder+" already exists. Remove it and try again!" sys.exit(2) else: run_prog("mkdir "+outputfolder) for r in SeqIO.parse(gbkfile, "genbank"): foundInContigFlag=False tmp_id=r.id tmp_name=r.name #Assign random shorter names to contigs as BIOPYTHON sometimes complains if the name is longer r.name=hashlib.md5(r.id+r.name).hexdigest()[0:10] r.id=r.name gd_diagram = GenomeDiagram.Diagram(r.id) gd_track_for_features = gd_diagram.new_track(1, name="Annotated Features") gd_feature_set = gd_track_for_features.new_set() if len(r.features) > 1: for feature in r.features: foundInFeatureFlag=False if feature.qualifiers.get("EC_number",None)!=None: tmp_numbers=feature.qualifiers["EC_number"] matched_name='' for i in tmp_numbers: url="http://rest.kegg.jp/find/enzyme/"+i returned_ec_names='' try: returned_ec_names=urllib.urlopen(url).read().split("\t")[1].rstrip().split(";") returned_ec_names=[i.lstrip().rstrip() for i in returned_ec_names] returned_ec_names=map(lambda each:each.lower(), returned_ec_names) returned_ec_names=",".join(returned_ec_names) except: print "Doesn't seem like the rest service returned anything for "+i for j in ec_numbers_list: if j in returned_ec_names: if matched_name=='': matched_name=j else: matched_name=matched_name+','+j foundInContigFlag=True foundInFeatureFlag=True if i in ec_numbers_list: if matched_name=='': matched_name=i else: matched_name=matched_name+','+i foundInContigFlag=True foundInFeatureFlag=True if foundInFeatureFlag==True: print "Found "+matched_name+" in "+tmp_name tmp_gene_name='' if feature.qualifiers.get("gene",None)!=None: tmp_gene_name=feature.qualifiers["gene"] feature.qualifiers["gene"][0]=feature.qualifiers["gene"][0]+'('+matched_name+')' else: feature.qualifiers["gene"]=[matched_name] gd_feature_set.add_feature(feature,color=colors.red,label=True,sigil="ARROW",label_size=fontsize) feature.qualifiers["gene"]=tmp_gene_name else: gd_feature_set.add_feature(feature,color=colors.green,label=True,sigil="ARROW",label_size=fontsize) elif feature.type!="source": if feature.qualifiers.get("product",None)!=None: if "hypothetical protein" in feature.qualifiers["product"]: gd_feature_set.add_feature(feature,color=colors.orange,label=False,label_size=fontsize) else: gd_feature_set.add_feature(feature,color=colors.lightblue,label=True,label_size=fontsize) if foundInContigFlag: r.id=tmp_id r.name=tmp_name gd_diagram.draw(format="linear",orientation="landscape",pagesize='A4',fragments=fragments,start=0,end=len(r)) try: SeqIO.write(r, outputfolder+"/"+r.name+".gbk", "genbank") SeqIO.write(r, outputfolder+"/"+r.name+".fasta", "fasta") gd_diagram.write(outputfolder+"/"+r.name+".pdf","PDF") print "\tGenerated "+outputfolder+"/"+r.name+".gbk, "+outputfolder+"/"+r.name+".fasta, and "+outputfolder+"/"+r.name+".pdf" except: run_prog("rm "+outputfolder+"/"+r.name+".gbk") r.id=r.id[0:namelength] r.name=r.name[0:namelength] SeqIO.write(r, outputfolder+"/"+r.name+".gbk", "genbank") SeqIO.write(r, outputfolder+"/"+r.name+".fasta", "fasta") gd_diagram.write(outputfolder+"/"+r.name+".pdf","PDF") print "\tGenerated "+outputfolder+"/"+r.name+".gbk, "+outputfolder+"/"+r.name+".fasta, and "+outputfolder+"/"+r.name+".pdf" if __name__ == "__main__": main(sys.argv[1:])