#!/usr/bin/python
# ***************************************************************
# Name: genbank_enzymes.py
# Purpose: This script extracts contigs (contig_name.[gbk/fasta]) from a given genbank annotation file
# that contain the enzymes provided in a list (-l switch) to the
# script. The list file can have both EC numbers and enzyme
# names, for example
#
#
# 1.2.1.26
# 1.1.1.46
# 1.1.1.10
# 4.1.1.28
# catalase
#
#
# Furthermore, it generates contig_name.pdf that displays the enzyme on a genome diagram
# I am using the following scheme:
# hypothetical protein <- orange boxes
# any other CDS with known resolution <- light blue boxes
# enzymes that match the EC numbers/names in the list <- red arrows
# enzymes that dont match the EC numbers/names in the list <- green arrows
#
#
# Please note that the script uses rest-style KEGG API (http://www.kegg.jp/kegg/docs/keggapi.html)
# to search the enzyme names.
# Version: 0.1
# Authors: Umer Zeeshan Ijaz (Umer.Ijaz@glasgow.ac.uk)
# http://userweb.eng.gla.ac.uk/umer.ijaz
# Last Modified: 2014-03-23
# License: Copyright (c) 2014 Computational Microbial Genomics Group, University of Glasgow, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see .
# **************************************************************
import sys, getopt,os
from Bio import SeqIO
from reportlab.lib import colors
from reportlab.lib.units import cm
from Bio.Graphics import GenomeDiagram
import urllib
import subprocess
import hashlib
def run_prog(prog):
p = subprocess.Popen(prog, shell=True, stdout=None, stderr=None)
retval = p.wait()
def usage():
print 'Usage:'
print '\tpython genbank_enzymes.py -g -o -l [OPTIONS]'
print '\nOptions:'
print '\t-s (--font_size) Size of labels on diagrams (Default:12)'
print '\t-f (--fragments) Number of fragments to split the genome diagram (Default:1)'
print '\t-n (--name_length) Maximum length of names for output files (Default:10)'
def main(argv):
gbkfile = ''
outputfolder=''
listfile=''
fontsize=12
fragments=1
namelength=10
try:
opts, args =getopt.getopt(argv,"hg:o:l:s:f:n:",["gbk_file=","output_folder=","list_file=","font_size=","fragments=","name_length="])
except getopt.GetoptError:
usage()
sys.exit(2)
for opt, arg in opts:
if opt == '-h':
usage()
sys.exit()
elif opt in ("-g", "--gbk_file"):
gbkfile = arg
elif opt in ("-o", "--output_folder"):
outputfolder = arg
elif opt in ("-l", "--list_file"):
listfile=arg
elif opt in ("-s", "--font_size"):
fontsize=int(arg)
elif opt in ("-f", "--fragments"):
fragments=int(arg)
elif opt in ("-n", "--name_length"):
namelength=int(arg)
if gbkfile=='' or outputfolder=='' or listfile=='':
usage()
sys.exit(2)
ec_numbers_list=[i.strip() for i in open(listfile).readlines()]
ec_numbers_list=map(lambda each:each.lower(), ec_numbers_list)
if os.path.isdir(outputfolder):
print "ERROR - Folder "+outputfolder+" already exists. Remove it and try again!"
sys.exit(2)
else:
run_prog("mkdir "+outputfolder)
for r in SeqIO.parse(gbkfile, "genbank"):
foundInContigFlag=False
tmp_id=r.id
tmp_name=r.name
#Assign random shorter names to contigs as BIOPYTHON sometimes complains if the name is longer
r.name=hashlib.md5(r.id+r.name).hexdigest()[0:10]
r.id=r.name
gd_diagram = GenomeDiagram.Diagram(r.id)
gd_track_for_features = gd_diagram.new_track(1, name="Annotated Features")
gd_feature_set = gd_track_for_features.new_set()
if len(r.features) > 1:
for feature in r.features:
foundInFeatureFlag=False
if feature.qualifiers.get("EC_number",None)!=None:
tmp_numbers=feature.qualifiers["EC_number"]
matched_name=''
for i in tmp_numbers:
url="http://rest.kegg.jp/find/enzyme/"+i
returned_ec_names=''
try:
returned_ec_names=urllib.urlopen(url).read().split("\t")[1].rstrip().split(";")
returned_ec_names=[i.lstrip().rstrip() for i in returned_ec_names]
returned_ec_names=map(lambda each:each.lower(), returned_ec_names)
returned_ec_names=",".join(returned_ec_names)
except:
print "Doesn't seem like the rest service returned anything for "+i
for j in ec_numbers_list:
if j in returned_ec_names:
if matched_name=='':
matched_name=j
else:
matched_name=matched_name+','+j
foundInContigFlag=True
foundInFeatureFlag=True
if i in ec_numbers_list:
if matched_name=='':
matched_name=i
else:
matched_name=matched_name+','+i
foundInContigFlag=True
foundInFeatureFlag=True
if foundInFeatureFlag==True:
print "Found "+matched_name+" in "+tmp_name
tmp_gene_name=''
if feature.qualifiers.get("gene",None)!=None:
tmp_gene_name=feature.qualifiers["gene"]
feature.qualifiers["gene"][0]=feature.qualifiers["gene"][0]+'('+matched_name+')'
else:
feature.qualifiers["gene"]=[matched_name]
gd_feature_set.add_feature(feature,color=colors.red,label=True,sigil="ARROW",label_size=fontsize)
feature.qualifiers["gene"]=tmp_gene_name
else:
gd_feature_set.add_feature(feature,color=colors.green,label=True,sigil="ARROW",label_size=fontsize)
elif feature.type!="source":
if feature.qualifiers.get("product",None)!=None:
if "hypothetical protein" in feature.qualifiers["product"]:
gd_feature_set.add_feature(feature,color=colors.orange,label=False,label_size=fontsize)
else:
gd_feature_set.add_feature(feature,color=colors.lightblue,label=True,label_size=fontsize)
if foundInContigFlag:
r.id=tmp_id
r.name=tmp_name
gd_diagram.draw(format="linear",orientation="landscape",pagesize='A4',fragments=fragments,start=0,end=len(r))
try:
SeqIO.write(r, outputfolder+"/"+r.name+".gbk", "genbank")
SeqIO.write(r, outputfolder+"/"+r.name+".fasta", "fasta")
gd_diagram.write(outputfolder+"/"+r.name+".pdf","PDF")
print "\tGenerated "+outputfolder+"/"+r.name+".gbk, "+outputfolder+"/"+r.name+".fasta, and "+outputfolder+"/"+r.name+".pdf"
except:
run_prog("rm "+outputfolder+"/"+r.name+".gbk")
r.id=r.id[0:namelength]
r.name=r.name[0:namelength]
SeqIO.write(r, outputfolder+"/"+r.name+".gbk", "genbank")
SeqIO.write(r, outputfolder+"/"+r.name+".fasta", "fasta")
gd_diagram.write(outputfolder+"/"+r.name+".pdf","PDF")
print "\tGenerated "+outputfolder+"/"+r.name+".gbk, "+outputfolder+"/"+r.name+".fasta, and "+outputfolder+"/"+r.name+".pdf"
if __name__ == "__main__":
main(sys.argv[1:])