#!/usr/bin/python
# ***************************************************************
# Name:      extract_fasta_swissprot.py
# Purpose:   This script takes a swissprot file in gz format
#	     and extracts a fasta file by matching a search pattern 
# Version:   0.1
# Authors:   Umer Zeeshan Ijaz (Umer.Ijaz@glasgow.ac.uk)
#                 http://userweb.eng.gla.ac.uk/umer.ijaz
# Created:   2012-10-20
# License:   Copyright (c) 2012 Computational Microbial Genomics Group, University of Glasgow, UK
#
#            This program is free software: you can redistribute it and/or modify
#            it under the terms of the GNU General Public License as published by
#            the Free Software Foundation, either version 3 of the License, or
#            (at your option) any later version.
#
#            This program is distributed in the hope that it will be useful,
#            but WITHOUT ANY WARRANTY; without even the implied warranty of
#            MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#            GNU General Public License for more details.
#
#            You should have received a copy of the GNU General Public License
#            along with this program.  If not, see <http://www.gnu.org/licenses/>.
# **************************************************************/

import gzip,re,sys,getopt
from Bio import SwissProt

def usage():
    print 'Example usages:'
    print '\tpython extract_fasta_swissprot.py -i uniprot_sprot.dat.gz -p "ATP16" -f 5 -m 4 > ATP16.fasta'
    print '\tpython extract_fasta_swissprot.py -i uniprot_sprot.dat.gz -p "Acetyl-CoA hydrolase" > Acetyl_CoA_hydrolase.fasta'
    print '\tpython extract_fasta_swissprot.py -i uniprot_sprot.dat.gz -p "Candida albicans" -f 2 > Candida_albicans.fasta'
    print 'Parameters:'
    print '\t -i (--input)\tCompressed SwissProt file'
    print '\t -p (--pattern)\tSearch pattern'
    print '\t -m (--format)\tHeader format'
    print 'Available search types:'
    print '\t-f 1\tProtein Names (including Alternative Name) [DEFAULT]'
    print '\t-f 2\tOrganism Name'
    print '\t-f 3\tKeywords'
    print '\t-f 4\tComments'
    print '\t-f 5\tGene Name'
    print '\t-f 6\tAccession Number'
    print '\t-f 7\tOrganism Classification'
    print 'Available header formats:'
    print '\t-m 1\t>Accession IDs | Organism Name [DEFAULT]'
    print '\t-m 2\t>Accesion IDs | Gene Name | Organism Name'
    print '\t-m 3\t>Entry ID'
    print '\t-m 4\t>Entry ID | Taxonomy IDs'

def split_str_into_len(s, l=60):
    """ Split a string into chunks of length l """
    return [s[i:i+l] for i in range(0, len(s), l)]

def main(argv):
    filename=''
    search_pattern=''
    what_to_search=1
    header_format=1
    try:
        opts, args = getopt.getopt(argv,"hi:p:f:m:",["input=","pattern=","flag=","format="])
    except getopt.GetoptError:
        usage()
        exit(2)
    for opt, arg in opts:
        if opt == '-h':
            usage()
            exit(2)
        elif opt in ("-i", "--input"):
            filename=arg
        elif opt in ("-p", "--pattern"):
            search_pattern=arg
	elif opt in ("-f", "--flag"):
	    what_to_search=int(arg)
	elif opt in ("-m", "--format"):
	    header_format=int(arg)

    if filename=="" or search_pattern=="":
        usage()
        exit(2)

    handle=gzip.open(filename)

    for record in SwissProt.parse(handle):
	found=0
	if what_to_search==1:
		line=record.description
        	full_matches=re.findall(r'Full=(\w.*?);',line)
        	for m in full_matches:
            		res=re.findall(search_pattern,m)
            		if res:
                		found=1
	elif what_to_search==2:
		line=record.organism
		res=re.findall(search_pattern,line)
		if res:
			found=1
	elif what_to_search==3:
		line=record.keywords
		for l in line:
			res=re.findall(search_pattern,l)
			if res:
				found=1
	elif what_to_search==4:
		line=record.comments
		for l in line:
			res=re.findall(search_pattern,l)
			if res:
				found=1
	elif what_to_search==5:
		line=record.gene_name
		res=re.findall(search_pattern,line)
		if res:
			found=1
	elif what_to_search==6:
		line=record.accessions
		for l in line:
			res=re.findall(search_pattern,l)
			if res:
				found=1
	elif what_to_search==7:
		line=record.organism_classification
		for l in line:
			res=re.findall(search_pattern,l)
			if res:
				found=1
	else:
		print 'Invalid flag'
		usage()
		exit(2)

        if found==1:
            id_list=':'.join(str(x) for x in record.accessions)
            organism_match=re.findall(r'(\w.*?) \(',record.organism)
	    gene_name_match=re.findall(r'Name=(\w.*?);',record.gene_name)
	    if header_format==1:			
            	if organism_match:
                	print '>'+id_list+'|'+re.sub('\s+','_',organism_match[0])
            	else:
                	print '>'+id_list
	    elif header_format==2:
		if organism_match:
			if gene_name_match:
				print '>'+id_list+'|'+gene_name_match[0]+'|'+re.sub('\s+','_',organism_match[0])
        		else:
				print '>'+id_list+'|'+re.sub('\s+','_',organism_match[0])
		else:
			if gene_name_match:
				print '>'+id_list+'|'+gene_name_match[0]
			else:
				print '>'+id_list
            elif header_format==3:
		print '>'+record.entry_name
	    elif header_format==4:
			if len(record.taxonomy_id)!=0:
				taxonomy_id_list=':'.join(str(x) for x in record.taxonomy_id)
				print '>'+record.entry_name+'|'+taxonomy_id_list
			else:
				print '>'+record.entry_name
	    else:
		print '>'+record.entry_name
	    for seq in split_str_into_len(record.sequence,60):
                print seq

if __name__ == '__main__':
    main(sys.argv[1:])