#!/usr/bin/python # *************************************************************** # Name: KO2MODULEclusters2.py # Purpose: This script takes a KO x Clusters (contigs for a genome) abundance file and uses # http://www.genome.jp/kegg-bin/find_module_object # to generate a MODULE x Clusters file that contains the # proportions of completed modules. You can use this script with the clusters # obtained from CONCOCT software: https://github.com/BinPro/CONCOCT # Note that you need to have an active internet connection for the script # to work # Version: 0.2 # History: 0.1 - 0.2 Optimised module definition resolution # Authors: Umer Zeeshan Ijaz (Umer.Ijaz@glasgow.ac.uk) # http://userweb.eng.gla.ac.uk/umer.ijaz # Created: 2014-03-15 # License: Copyright (c) 2015 Environmental'Omics Group, University of Glasgow, UK # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . # **************************************************************/ import urllib import urllib2 import datetime,time import getopt import sys import re from bs4 import BeautifulSoup def usage(): print 'Usage:' print '\tpython KO2MODULEclusters2.py -i -o [OPTIONS]' print 'Options:' print '\t-d Turn debugging on' def main(argv): input_file='' output_file='' debug=0 try: opts, args =getopt.getopt(argv,"hi:o:d",["input_file=","output_file=","debug"]) except getopt.GetoptError: usage() sys.exit(2) for opt, arg in opts: if opt == '-h': usage() sys.exit() elif opt in ("-i", "--input_file"): input_file = arg elif opt in ("-o", "--output_file"): output_file = arg elif opt in ("-d", "--debug"): debug=1 if input_file=='' or output_file=='': usage() sys.exit(2) print_time_stamp("Loading "+input_file) ins=open(input_file,"r") record_count=0 extracted_KOs=[] extracted_clusters=[] clusters_KOs={} for line in ins: line=line.rstrip('\n').rstrip('\r') record=line.split(",") if record_count>0: extracted_KOs.append(record[0].replace('ko:','')) for i in range(len(record[1:])): clusters_KOs[extracted_clusters[i]+":"+extracted_KOs[record_count-1]]=float(record[i+1]) else: extracted_clusters=record[1:] record_count=record_count+1 ins.close() clusters_modules={} extracted_modules=[] for i in extracted_clusters: ind=1 record_string=[] for j in extracted_KOs: if (clusters_KOs[i+":"+j]>0.0): record_string.append(i+"_"+str(ind)+"\t"+j) ind=ind+1 returned_modules={} print_time_stamp("Uploading KOs from " + i + " to http://www.genome.jp/kegg-bin/find_module_object" ) if(debug): print_time_stamp("Uploaded \n"+"\n".join(record_string)) returned_modules=generate_modules_blocks_missing("\n".join(record_string),debug) print_time_stamp("Extracted Modules for " + i) for key in returned_modules.keys(): clusters_modules[i+":"+key]=returned_modules[key] extracted_modules.append(key) extracted_modules=list(set(extracted_modules)) extracted_modules_total_blocks={} print_time_stamp("Resolving definitions for returned modules") for i in extracted_modules: tmp="" try: tmp=urllib.urlopen('http://togows.dbcls.jp/entry/orthology/'+i+'/definition').read().rstrip('\n') if(debug): print_time_stamp("http://togows.dbcls.jp/entry/orthology/"+i+"/definition" +"\n"+tmp) except urllib2.HTTPError, error: print "ERROR: code=%s message=%s" % (error.code, error.msg) sys.exit(1) # The regular expression given below calculates total blocks for a given module # The M number entry is defined by a logical expression of K numbers (and other M numbers), # allowing automatic evaluation of whether the gene set is complete, i.e., the module is # present, in a given genome. A space or a plus sign represents an AND operation, and a # comma sign represents an OR operation in this expression. A plus sign is used for a # molecular complex and a minus sign designates an optional item in the complex. extracted_modules_total_blocks[i]=len(re.sub(r"\([\w\+\-\,\s]+?\)","AND",re.sub(r" \(.*?\)"," AND",re.sub(r"\(.*?\) ","AND ",re.sub(r" \(.*?\) "," AND ",re.sub(r"\([\w\+\-\,\s]+?\)","AND",re.sub(r" \([\w\+\-\,]+?\)"," AND",re.sub(r"\([\w\+\-\,]+?\) ","AND ",re.sub(r" \([\w\-\+\,]+?\) "," AND ",tmp)))))))).split(" ")) print_time_stamp("Saving "+output_file) out=open(output_file,'w') out.write("Clusters,"+",".join(extracted_clusters)+"\n") for j in extracted_modules: out.write(j) for i in extracted_clusters: if clusters_modules.get(i+":"+j,None)==None: out.write(",0.0") else: out.write(","+str((-clusters_modules.get(i+":"+j,0.0)+float(extracted_modules_total_blocks.get(j,0.0)))/float(extracted_modules_total_blocks.get(j,0.0)))) out.write("\n") out.close() def print_time_stamp(message): sys.stderr.write(datetime.datetime.fromtimestamp(time.time()).strftime('[%Y-%m-%d %H:%M:%S] ')+message+"\n") def generate_modules_blocks_missing(record,debug): url = 'http://www.genome.jp/kegg-bin/find_module_object' user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' post_data={} post_data['unclassified']=record post_data['mode']="complete+ng1+ng2" headers = { 'User-Agent' : user_agent } data = urllib.urlencode(post_data) req = urllib2.Request(url, data, headers) response="" try: response = urllib2.urlopen(req) except urllib2.HTTPError, error: print "ERROR: code=%s message=%s" % (error.code, error.msg) sys.exit(1) the_page = response.read() soup = BeautifulSoup(the_page) modules_found=[] for i in soup.findAll(href=re.compile("/kegg-bin/show_module")): modules_found.append(i.string.encode('ascii','ignore')) modules_block_missing=[] returned_data={} ind=0 for i in soup.findAll(text=re.compile('\)\xa0\xa0\(')): if re.match('.*complete.*',i): modules_block_missing.append(0) else: modules_block_missing.append(int(re.match('.*([1-2]) block.*',i).groups(1)[0].encode('ascii','ignore'))) if(debug): print_time_stamp("Missing "+str(modules_block_missing[-1])+ " blocks for " + modules_found[ind]) returned_data[modules_found[ind]]=modules_block_missing[-1] ind=ind+1 return returned_data if __name__ == '__main__': main(sys.argv[1:])