#!/usr/bin/python
# ***************************************************************
# Name:      processPROT.py
# Purpose:   This script reads the statistics file generated by ProteinPilot
#            and uses the Uniprot IDs to extract detailed records from the
#            ExPASy server
# Version:   0.1
# Authors:   Umer Zeeshan Ijaz (Umer.Ijaz@glasgow.ac.uk)
#                 http://userweb.eng.gla.ac.uk/umer.ijaz
# Last Modified: 2014-07-5
# License:   Copyright (c) 2014 Computational Microbial Genomics Group, University of Glasgow, UK
#
#            This program is free software: you can redistribute it and/or modify
#            it under the terms of the GNU General Public License as published by
#            the Free Software Foundation, either version 3 of the License, or
#            (at your option) any later version.
#
#            This program is distributed in the hope that it will be useful,
#            but WITHOUT ANY WARRANTY; without even the implied warranty of
#            MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#            GNU General Public License for more details.
#
#            You should have received a copy of the GNU General Public License
#            along with this program.  If not, see <http://www.gnu.org/licenses/>.
# **************************************************************

import getopt
import sys
from optparse import Option
from optparse import OptionParser
from optparse import OptionValueError

import xlrd
from Bio import ExPASy
from Bio import SwissProt


# Ref: http://stackoverflow.com/questions/4109436/processing-multiple-values-for-one-single-option-using-getopt-optparse
class MultipleOption(Option):
    """optparse Option subclass that registers an additional "extend" action.

    The "extend" action appends each supplied value to a list stored under
    the option's destination, creating the list on first use.  All other
    actions are delegated to the stock ``Option`` implementation.
    """

    ACTIONS = Option.ACTIONS + ("extend",)
    STORE_ACTIONS = Option.STORE_ACTIONS + ("extend",)
    TYPED_ACTIONS = Option.TYPED_ACTIONS + ("extend",)
    ALWAYS_TYPED_ACTIONS = Option.ALWAYS_TYPED_ACTIONS + ("extend",)

    def take_action(self, action, dest, opt, value, values, parser):
        """Handle "extend" locally and defer every other action to Option."""
        if action == "extend":
            values.ensure_value(dest, []).append(value)
        else:
            Option.take_action(self, action, dest, opt, value, values, parser)


def insert_uniprot_details_dict(dict, key):
    """Fetch the Swiss-Prot record for ``key`` and cache its details.

    Nothing is done when ``dict`` already holds an entry for ``key``, so each
    accession is downloaded at most once.  Retrieval is best effort: if the
    download or the parsing fails, six "__NA__" placeholders are stored and a
    warning is written to stderr.

    Args:
        dict: mapping of accession -> list of six detail fields (gene name,
            organism, taxonomy ids, organism classification, description,
            sequence).  Updated in place.  (The name shadows the builtin but
            is kept so existing callers are unaffected.)
        key: accession passed to ``ExPASy.get_sprot_raw``.
    """
    if dict.get(key) is not None:
        return

    try:
        # The download is inside the try block so that an unreachable server
        # or an unknown accession degrades to placeholders instead of
        # aborting the whole run.
        handle = ExPASy.get_sprot_raw(key)
        try:
            record = SwissProt.read(handle)
        finally:
            handle.close()
        details = [
            record.gene_name,
            record.organism,
            ":".join(record.taxonomy_id),
            ":".join(record.organism_classification),
            record.description,
            record.sequence,
        ]
    except Exception as err:
        # "except Exception" lets KeyboardInterrupt/SystemExit propagate,
        # unlike a bare "except:".
        sys.stderr.write(
            "Warning: could not retrieve record for %s: %s\n" % (key, err)
        )
        details = ["__NA__"] * 6

    dict[key] = details


def insert_protein_uniprot_dict(dict, key, value):
    """Record ``value`` under ``key``, keeping insertion order and no duplicates.

    Args:
        dict: mapping of protein key -> list of accessions.  Updated in place.
        key: protein key the accession belongs to.
        value: accession to add if it is not already listed for ``key``.
    """
    accessions = dict.setdefault(key, [])
    if value not in accessions:
        accessions.append(value)


def update_dicts(prot_uniprot, uniprot_det, key, rep_value):
    """Register every accession found in ``rep_value`` for protein ``key``.

    ``rep_value`` is split on ";" into entries and each entry on "|"; the
    second "|"-separated field of an entry is taken as the accession.
    Entries without a second field (for example the empty string left by a
    trailing ";") are skipped rather than raising IndexError.

    Args:
        prot_uniprot: mapping of protein key -> list of accessions.
        uniprot_det: mapping of accession -> detail fields.
        key: protein key the accessions belong to.
        rep_value: accession string read from the spreadsheet.
    """
    for entry in rep_value.split(";"):
        fields = entry.split("|")
        if len(fields) < 2:
            continue
        accession = fields[1].strip()
        if not accession:
            continue
        insert_protein_uniprot_dict(prot_uniprot, key, accession)
        insert_uniprot_details_dict(uniprot_det, accession)


def main(argv=None):
    """Print one tab-separated line of details per protein/accession pair.

    For every excel file given with -i, rows of the first sheet are read
    (skipping the first row) until a row with an empty accession cell is
    met.  Proteins with at least MIN_PROT_PEPTIDES rows are printed in sorted
    key order, where the key is "<file number>_<protein number>".

    Args:
        argv: command-line arguments without the program name; defaults to
            ``sys.argv[1:]`` when omitted or None.
    """
    if argv is None:
        argv = sys.argv[1:]

    parser = OptionParser(option_class=MultipleOption,
                          usage="usage: %prog [options] filename",
                          version="%prog 0.1")
    parser.add_option("-i", "--excel_file",
                      action="append",
                      dest="excel_file",
                      help="specify one or multiple excel files with -i switch, e.g. -i A.xls -i B.xls")

    if not argv:
        # No arguments at all: show the help text (optparse exits afterwards).
        parser.parse_args(['--help'])

    options, _ = parser.parse_args(argv)
    if not options.excel_file:
        # Arguments were given but none of them was -i.
        parser.error("at least one excel file must be specified with -i")

    total_peptide_seqs_dict = {}
    protein_uniprot_dict = {}
    uniprot_details_dict = {}

    REP_ACCESSION_COL = 3   # zero-based column holding the accession string
    PROTEIN_N_COL = 2       # zero-based column holding the protein number
    MIN_PROT_PEPTIDES = 2   # minimum number of rows for a protein to be printed

    for file_number, path in enumerate(options.excel_file, 1):
        excel_data = xlrd.open_workbook(path)
        peptide_sheet = excel_data.sheet_by_index(0)
        for row_index in range(1, peptide_sheet.nrows):
            val = peptide_sheet.cell(row_index, REP_ACCESSION_COL).value
            if val == '':
                # The first empty accession cell ends the data in this sheet.
                break
            protein_number = int(peptide_sheet.cell(row_index, PROTEIN_N_COL).value)
            key = str(file_number) + "_" + str(protein_number)
            update_dicts(protein_uniprot_dict, uniprot_details_dict, key, val)
            total_peptide_seqs_dict[key] = total_peptide_seqs_dict.get(key, 0) + 1

    for key in sorted(total_peptide_seqs_dict):
        if total_peptide_seqs_dict[key] < MIN_PROT_PEPTIDES:
            continue
        # .get(): a protein whose rows held no usable accession has no entry.
        for accession in protein_uniprot_dict.get(key, []):
            print(key + "\t" + accession + "\t" + "\t".join(uniprot_details_dict[accession]))


if __name__ == "__main__":
    main(sys.argv[1:])