#!/usr/bin/python # *************************************************************** # Name: extract_fasta_swissprot.py # Purpose: This script takes a swissprot file in gz format # and extracts a fasta file by matching a search pattern # Version: 0.1 # Authors: Umer Zeeshan Ijaz (Umer.Ijaz@glasgow.ac.uk) # http://userweb.eng.gla.ac.uk/umer.ijaz # Created: 2012-10-20 # License: Copyright (c) 2012 Computational Microbial Genomics Group, University of Glasgow, UK # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . # **************************************************************/ import gzip,re,sys,getopt from Bio import SwissProt def usage(): print 'Example usages:' print '\tpython extract_fasta_swissprot.py -i uniprot_sprot.dat.gz -p "ATP16" -f 5 -m 4 > ATP16.fasta' print '\tpython extract_fasta_swissprot.py -i uniprot_sprot.dat.gz -p "Acetyl-CoA hydrolase" > Acetyl_CoA_hydrolase.fasta' print '\tpython extract_fasta_swissprot.py -i uniprot_sprot.dat.gz -p "Candida albicans" -f 2 > Candida_albicans.fasta' print 'Parameters:' print '\t -i (--input)\tCompressed SwissProt file' print '\t -p (--pattern)\tSearch pattern' print '\t -m (--format)\tHeader format' print 'Available search types:' print '\t-f 1\tProtein Names (including Alternative Name) [DEFAULT]' print '\t-f 2\tOrganism Name' print '\t-f 3\tKeywords' print '\t-f 4\tComments' print '\t-f 5\tGene Name' print '\t-f 6\tAccession Number' print '\t-f 7\tOrganism Classification' print 'Available header formats:' print '\t-m 1\t>Accession IDs | Organism Name [DEFAULT]' print '\t-m 2\t>Accesion IDs | Gene Name | Organism Name' print '\t-m 3\t>Entry ID' print '\t-m 4\t>Entry ID | Taxonomy IDs' def split_str_into_len(s, l=60): """ Split a string into chunks of length l """ return [s[i:i+l] for i in range(0, len(s), l)] def main(argv): filename='' search_pattern='' what_to_search=1 header_format=1 try: opts, args = getopt.getopt(argv,"hi:p:f:m:",["input=","pattern=","flag=","format="]) except getopt.GetoptError: usage() exit(2) for opt, arg in opts: if opt == '-h': usage() exit(2) elif opt in ("-i", "--input"): filename=arg elif opt in ("-p", "--pattern"): search_pattern=arg elif opt in ("-f", "--flag"): what_to_search=int(arg) elif opt in ("-m", "--format"): header_format=int(arg) if filename=="" or search_pattern=="": usage() exit(2) handle=gzip.open(filename) for record in SwissProt.parse(handle): found=0 if what_to_search==1: line=record.description full_matches=re.findall(r'Full=(\w.*?);',line) for m in full_matches: res=re.findall(search_pattern,m) if res: found=1 elif what_to_search==2: line=record.organism res=re.findall(search_pattern,line) if res: found=1 elif what_to_search==3: line=record.keywords for l in line: res=re.findall(search_pattern,l) if res: found=1 elif what_to_search==4: line=record.comments for l in line: res=re.findall(search_pattern,l) if res: found=1 elif what_to_search==5: line=record.gene_name res=re.findall(search_pattern,line) if res: found=1 elif what_to_search==6: line=record.accessions for l in line: res=re.findall(search_pattern,l) if res: found=1 elif what_to_search==7: line=record.organism_classification for l in line: res=re.findall(search_pattern,l) if res: found=1 else: print 'Invalid flag' usage() exit(2) if found==1: id_list=':'.join(str(x) for x in record.accessions) organism_match=re.findall(r'(\w.*?) \(',record.organism) gene_name_match=re.findall(r'Name=(\w.*?);',record.gene_name) if header_format==1: if organism_match: print '>'+id_list+'|'+re.sub('\s+','_',organism_match[0]) else: print '>'+id_list elif header_format==2: if organism_match: if gene_name_match: print '>'+id_list+'|'+gene_name_match[0]+'|'+re.sub('\s+','_',organism_match[0]) else: print '>'+id_list+'|'+re.sub('\s+','_',organism_match[0]) else: if gene_name_match: print '>'+id_list+'|'+gene_name_match[0] else: print '>'+id_list elif header_format==3: print '>'+record.entry_name elif header_format==4: if len(record.taxonomy_id)!=0: taxonomy_id_list=':'.join(str(x) for x in record.taxonomy_id) print '>'+record.entry_name+'|'+taxonomy_id_list else: print '>'+record.entry_name else: print '>'+record.entry_name for seq in split_str_into_len(record.sequence,60): print seq if __name__ == '__main__': main(sys.argv[1:])