#!/usr/bin/python
# ***************************************************************
# Name:      processPROT.py
# Purpose:   This script reads the statistics files generated by ProteinPilot
#            and uses the UniProt IDs to retrieve detailed records from the
#            ExPASy server
# Version:   0.1
# Authors:   Umer Zeeshan Ijaz (Umer.Ijaz@glasgow.ac.uk)
#                 http://userweb.eng.gla.ac.uk/umer.ijaz
# Last Modified:   2014-07-5
# License:   Copyright (c) 2014 Computational Microbial Genomics Group, University of Glasgow, UK
#
#            This program is free software: you can redistribute it and/or modify
#            it under the terms of the GNU General Public License as published by
#            the Free Software Foundation, either version 3 of the License, or
#            (at your option) any later version.
#
#            This program is distributed in the hope that it will be useful,
#            but WITHOUT ANY WARRANTY; without even the implied warranty of
#            MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#            GNU General Public License for more details.
#
#            You should have received a copy of the GNU General Public License
#            along with this program.  If not, see <http://www.gnu.org/licenses/>.
# **************************************************************

import getopt,sys,xlrd
from Bio import ExPASy
from Bio import SwissProt
from optparse import OptionParser
from optparse import Option, OptionValueError


#Ref:http://stackoverflow.com/questions/4109436/processing-multiple-values-for-one-single-option-using-getopt-optparse
class MultipleOption(Option):
    """optparse ``Option`` subclass that adds an ``"extend"`` action.

    The action is registered in every action group so optparse treats it
    as a typed, value-storing action.  Each occurrence of an option using
    it appends the supplied value to the list stored under ``dest``.
    """

    # Register "extend" alongside the built-in actions.
    ACTIONS = Option.ACTIONS + ("extend",)
    STORE_ACTIONS = Option.STORE_ACTIONS + ("extend",)
    TYPED_ACTIONS = Option.TYPED_ACTIONS + ("extend",)
    ALWAYS_TYPED_ACTIONS = Option.ALWAYS_TYPED_ACTIONS + ("extend",)

    def take_action(self, action, dest, opt, value, values, parser):
        """Handle ``"extend"`` here; hand every other action to ``Option``."""
        if action != "extend":
            Option.take_action(self, action, dest, opt, value, values, parser)
            return
        # ensure_value creates the list on first use and returns it.
        collected = values.ensure_value(dest, [])
        collected.append(value)

def insert_uniprot_details_dict(dict, key):
    """Fetch the Swiss-Prot record for *key* and cache its details in *dict*.

    Nothing is done when *key* already has an entry, so each accession is
    requested from ExPASy at most once.  Otherwise ``dict[key]`` is set to a
    six-item list: gene name, organism, taxonomy ids joined with ":",
    organism classification joined with ":", description and sequence.
    If the record cannot be fetched or parsed, all six items are the
    placeholder string "__NA__".

    NOTE: the parameter name shadows the builtin ``dict``; it is kept so
    existing callers keep working.
    """
    if dict.get(key) is not None:
        return

    try:
        # The fetch itself can fail (network error, unknown accession), so
        # it sits inside the try together with the parse.
        handle = ExPASy.get_sprot_raw(key)
        try:
            record = SwissProt.read(handle)
        finally:
            handle.close()
        details = [
            record.gene_name,
            record.organism,
            ":".join(record.taxonomy_id),
            ":".join(record.organism_classification),
            record.description,
            record.sequence,
        ]
    except Exception:
        # Best-effort lookup: record a placeholder instead of aborting the
        # run.  KeyboardInterrupt/SystemExit are deliberately not caught.
        details = ["__NA__"] * 6

    dict[key] = details

def insert_protein_uniprot_dict(dict, key, value):
    """Record *value* under *key* in *dict*, keeping each value only once.

    ``dict`` maps a key to a list of values in first-seen order.  A key
    with no entry (or an entry of ``None``) gets a new one-item list; a
    value already present in the list is not added again.
    """
    known = dict.get(key)
    if known is None:
        # First value seen for this key.
        dict[key] = [value]
        return
    if value in known:
        return
    known.append(value)

def update_dicts(prot_uniprot, uniprot_det, key, rep_value):
    """Register every accession found in *rep_value* under *key*.

    *rep_value* is split on ";" into entries, and each entry is split on
    "|"; the second "|"-separated field is used as the accession.  Each
    accession is added to ``prot_uniprot[key]`` and its details are looked
    up into ``uniprot_det``.

    Entries with no second field (for example the empty entry left by a
    trailing ";") or with a blank accession are skipped rather than
    raising ``IndexError``.
    """
    for entry in rep_value.split(";"):
        fields = entry.split("|")
        if len(fields) < 2:
            continue
        accession = fields[1].strip()
        if not accession:
            continue
        insert_protein_uniprot_dict(prot_uniprot, key, accession)
        insert_uniprot_details_dict(uniprot_det, accession)
def main(argv):
    """Read the given Excel files and print one tab-separated line per accession.

    *argv* is the argument list without the program name.  With no
    arguments the help text is printed and optparse exits; if arguments
    are given but no ``-i`` file, the parser reports a usage error.

    For every workbook (numbered from 1 in the order given) the first
    sheet is read from the second row on, stopping at the first row whose
    accession cell is empty.  Rows are grouped under the key
    "<file number>_<protein number>".  Keys seen on at least
    ``MIN_PROT_PEPTIDES`` rows are printed in sorted key order, one line
    per accession: the key, the accession and the detail fields stored
    for that accession.
    """
    parser = OptionParser(option_class=MultipleOption,
                          usage="usage: %prog [options] filename",
                          version="%prog 0.1")
    parser.add_option("-i", "--excel_file",
                      action="append",
                      dest="excel_file",
                      help="specify one or multiple excel files with -i switch, e.g. -i A.xls -i B.xls")
    if not argv:
        # optparse prints the help text and exits here.
        parser.parse_args(['--help'])

    options, args = parser.parse_args(argv)
    if not options.excel_file:
        # Without this check the loop below would fail on a missing file list.
        parser.error("at least one excel file must be given with -i")

    total_peptide_seqs_dict = {}   # key -> number of rows seen for that key
    protein_uniprot_dict = {}      # key -> list of accessions
    uniprot_details_dict = {}      # accession -> list of detail fields

    # Zero-based column indices into the first sheet.
    REP_ACCESSION_COL = 3
    PROTEIN_N_COL = 2
    # Minimum number of rows a key needs before it is printed.
    MIN_PROT_PEPTIDES = 2

    for file_number, path in enumerate(options.excel_file, 1):
        workbook = xlrd.open_workbook(path)
        peptide_sheet = workbook.sheet_by_index(0)
        # Row 0 is skipped (presumably a header row).
        for row_index in range(1, peptide_sheet.nrows):
            accession_value = peptide_sheet.cell(row_index, REP_ACCESSION_COL).value
            if accession_value == '':
                # An empty accession cell ends the data for this sheet.
                break
            protein_number = int(peptide_sheet.cell(row_index, PROTEIN_N_COL).value)
            key = str(file_number) + "_" + str(protein_number)
            update_dicts(protein_uniprot_dict, uniprot_details_dict, key, accession_value)
            total_peptide_seqs_dict[key] = total_peptide_seqs_dict.get(key, 0) + 1

    for key in sorted(total_peptide_seqs_dict):
        if total_peptide_seqs_dict[key] < MIN_PROT_PEPTIDES:
            continue
        # .get: a key may have no accessions recorded for it.
        for accession in protein_uniprot_dict.get(key, []):
            # Single-argument print() behaves the same on Python 2 and 3.
            print(key + "\t" + accession + "\t" + "\t".join(uniprot_details_dict[accession]))
				

# Run main() with the command-line arguments (program name stripped) only
# when this file is executed as a script, not when it is imported.
if __name__=="__main__":
	main(sys.argv[1:])