#!/usr/bin/python
# ***************************************************************
# Name: PROKKA_CDD.py
# Purpose: This script integrates with PROKKA and generates [Cdd/Cog/Kog/Pfam/Prk/Smart/Tigr] assignments for the protein sequences.
# Version: 0.1
# Authors: Umer Zeeshan Ijaz (Umer.Ijaz@glasgow.ac.uk)
# http://userweb.eng.gla.ac.uk/umer.ijaz
# Created: 2014-01-25
# License: Copyright (c) 2014 Computational Microbial Genomics Group, University of Glasgow, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see .
# **************************************************************/
import sys, getopt, urllib
from xml.dom import minidom
from BCBio import GFF
def get_record_from_cdd(query):
# We need CDD accession to Cog/Kog/Pfam/Prk/Smart/Tigr accession mapping. For this we will use NCBI eutils and parse the returned XML
# file. For example,
#
# http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=cdd&id=223855
#
# returns the following XML record
#
#
#
# 223855
# - COG0784
# - CheY
# - FOG: CheY-like receiver [Signal transduction mechanisms]
# - 0
# - 0
#
#
params = {
'db':'cdd',
}
params['id'] = query
url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?' + urllib.urlencode(params)
data = urllib.urlopen(url).read()
xmldoc = minidom.parseString(data)
items=xmldoc.getElementsByTagName("Item")
r={}
for i in range(items.length):
try:
r[items[i].getAttribute('Name')]=items[i].firstChild.data
except:
r[items[i].getAttribute('Name')]=''
return r
def usage():
print '\n'.join([
'Usage:',
'\t./PROKKA_CDD.py -g -b ',
'',
'Optional parameters:',
'\t-s (--scovs-threshold)\t\tsubject coverage threshold (Default:60)',
'\t-p (--pident-threshold)\t\tpident threshold (Default:0)',
'',
'Example usage:',
'',
'\tStep 1: Run PROKKA_XXXXXXXX.faa with rpsblast against the [Cdd/Cog/Kog/Pfam/Prk/Smart/Tigr] database',
'\twith following format:',
'\t\t\trpsblast -query PROKKA_XXXXXXXX.faa -db [Cdd/Cog/Kog/Pfam/Prk/Smart/Tigr] -evalue 0.00001',
'\t\t\t-outfmt \"6 qseqid sseqid evalue pident score qstart qend',
'\t\t\tsstart send length slen\" -out blast_output.out',
'',
'\tStep 2: Run this script to generate COG anotations:',
'\t\t\t./PROKKA_CDD.py -g PROKKA_XXXXXXXX.gff -b blast_output.out',
'\t\t\t > annotations.tsv',
'',
'\tThe output is a tab-delimited file with following fields:',
'\t\t#Query\t\t\twith concatenated PROKKA CDS Ids using _ as delimeter',
'\t\tHit\t\t\tfrom CDD database using NCBI\'s eutils ',
'\t\tE-value\t\t\tfrom rpsblast',
'\t\tIdentity\t\tfrom rpsblast',
'\t\tScore\t\t\tfrom rpsblast',
'\t\tQuery-start\t\tfrom rpsblast',
'\t\tQuery-end\t\tfrom rpsblast',
'\t\tHit-start\t\tfrom rpsblast',
'\t\tHit-end\t\t\tfrom rpsblast',
'\t\tHit-length\t\tfrom rpsblast',
'\t\tAbstract\t\tfrom CDD database using NCBI\'s eutils',
'\t\tTitle\t\t\tfrom CDD database using NCBI\'s eutils',
'\t\tComments\t\tCDS to contig mapping (from GFF file)',
'',
'Refer to rpsblast tutorial: http://www2.warwick.ac.uk/fac/sci/moac/people/students/peter_cock/python/rpsblast/'])
def main(argv):
# = Parameters to set ============== #
RPSBLAST_SCOVS_THRESHOLD=60.0
RPSBLAST_PIDENT_THRESHOLD=0.0
RPSBLAST_QSEQID_FIELD=0
RPSBLAST_SSEQID_FIELD=1
RPSBLAST_EVALUE_FIELD=2
RPSBLAST_PIDENT_FIELD=3
RPSBLAST_SCORE_FIELD=4
RPSBLAST_QSTART_FIELD=5
RPSBLAST_QEND_FIELD=6
RPSBLAST_SSTART_FIELD=7
RPSBLAST_SEND_FIELD=8
RPSBLAST_LENGTH_FIELD=9
RPSBLAST_SLEN_FIELD=10
# = /Parameters to set ============= #
gfffile = ''
blastoutfile=''
try:
opts, args = getopt.getopt(argv,"hg:b:s:p:",["gfffile=","blastoutfile=","scovs-threshold=","pident-threshold="])
except getopt.GetoptError:
usage()
sys.exit(2)
for opt, arg in opts:
if opt == '-h':
usage()
sys.exit()
elif opt in ("-g", "--gfffile"):
gfffile = arg
elif opt in ("-b", "--blastoutfile"):
blastoutfile = arg
elif opt in ("-s", "--scovs-threshold"):
RPSBLAST_SCOVS_THRESHOLD = float(arg)
elif opt in ("-p", "--pident-threshold"):
RPSBLAST_PIDENT_THRESHOLD = float(arg)
if (gfffile =='' or blastoutfile == ''):
usage()
sys.exit()
featureid_locations={}
limits=dict(gff_type=["gene","mRNA","CDS"])
in_handle=open(gfffile)
for rec in GFF.parse(in_handle,limit_info=limits):
for feature in rec.features:
if str(feature.location.strand)!="-1":
featureid_locations[feature.id]=[rec.id,str(feature.location.start),str(feature.location.end),'+']
else:
featureid_locations[feature.id]=[rec.id,str(feature.location.start),str(feature.location.end),'-']
in_handle.close()
print '#Query\tHit\tE-value\tIdentity\tScore\tQuery-start\tQuery-end\tHit-start\tHit-end\tHit-length\tAbstract\tTitle\tComments'
in_handle=open(blastoutfile)
for line in in_handle:
record=line.split("\t")
if (float(record[RPSBLAST_PIDENT_FIELD])>= RPSBLAST_PIDENT_THRESHOLD and ((float(abs(int(record[RPSBLAST_SEND_FIELD])-int(record[RPSBLAST_SSTART_FIELD]))+1)/float(record[RPSBLAST_SLEN_FIELD]))*100.0)>= RPSBLAST_SCOVS_THRESHOLD):
cddrecord=get_record_from_cdd(record[RPSBLAST_SSEQID_FIELD].split('|')[2])
featureidlocrecord=featureid_locations[record[RPSBLAST_QSEQID_FIELD]]
print ( featureidlocrecord[0]+'_'+record[RPSBLAST_QSEQID_FIELD][7:]+'\t'+
cddrecord['Accession']+'\t'+
record[RPSBLAST_EVALUE_FIELD]+'\t'+
record[RPSBLAST_PIDENT_FIELD]+'\t'+
record[RPSBLAST_SCORE_FIELD]+'\t'+
record[RPSBLAST_QSTART_FIELD]+'\t'+
record[RPSBLAST_QEND_FIELD]+'\t'+
record[RPSBLAST_SSTART_FIELD]+'\t'+
record[RPSBLAST_SEND_FIELD]+'\t'+
record[RPSBLAST_LENGTH_FIELD]+'\t'+
cddrecord.get('Abstract','')+'\t'+
cddrecord.get('Title','')+'\t'+
'['+featureidlocrecord[1]+','+featureidlocrecord[2]+']('+featureidlocrecord[3]+')'
)
in_handle.close()
if __name__ == "__main__":
main(sys.argv[1:])