#!/usr/bin/python # *************************************************************** # Name: CLUSThack.py # Purpose: This work was developed in the event, # ClusterHack: Efficient OTU clustering, alignment and noise # removal for short read amplicons, August 27th-29th, University of Glasgow # attended by, # Umer Zeeshan Ijaz (Umer.Ijaz@glasgow.ac.uk) # Jiajie Zhang (bestzhangjiajie@gmail.com) # Christopher Quince (quince@civil.gla.ac.uk) # Frederic Mahe (mahe@rhrk.uni-kl.de) # Torbjorn Rognes (torognes@ifi.uio.no) # # The purpose of this module is to test qgram based distance matrix and use it in # hierarchical clustering to cluster similar sequences together. # The program outputs a tab-delimited list of clusters along with the assignments (sequence headers). # # Version: 0.1 basic hierarchical clustering on qgram based distance matrix # Authors: Umer Zeeshan Ijaz (Umer.Ijaz@glasgow.ac.uk) # http://userweb.eng.gla.ac.uk/umer.ijaz # Last Modified: 2013-09-16 # License: Copyright (c) 2013 Computational Microbial Genomics Group, University of Glasgow, UK # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . # ************************************************************** import CLUSThack,sys,os from optparse import OptionParser from Bio import SeqIO import matplotlib.pyplot as plt from scipy.cluster.hierarchy import linkage from scipy.cluster.hierarchy import dendrogram from scipy.cluster.hierarchy import fcluster from scipy.spatial.distance import squareform import operator def extract_filename(filename,returnCompletePath): root, ext = os.path.splitext(filename) if returnCompletePath: return root else: return os.path.basename(root) def find(lst, a): result = [] for i, x in enumerate(lst): if x==a: result.append(i) return result def augmented_dendrogram(*args, **kwargs): ddata = dendrogram(*args, **kwargs) if not kwargs.get('no_plot', False): for i, d in zip(ddata['icoord'], ddata['dcoord']): x = 0.5 * sum(i[1:3]) y = d[1] plt.plot(x, y, 'ro') plt.annotate("%.3g" % y, (x, y), xytext=(0, -8), textcoords='offset points', va='top', ha='center') return ddata def main(): parser = OptionParser(usage="usage: %prog [options] filename", version="%prog 0.2") parser.add_option("-f","--file", action="store", dest="file", help="provide file with sequences in FASTA format") parser.add_option("-t","--threads", action="store", default=8, dest="threads", help="number of threads to use while calculating qgrams (Default = 8)") parser.add_option("-c","--cutoff", action="store", default=None, dest="cutoff", help="cutoff for the tree (Default= 0.5*max(height))") parser.add_option("-s","--save_dendrogram", action="store_true", default=False, dest="save_dendrogram", help="save dendrogram as .pdf (Default: False)") if len(sys.argv) == 1: parser.parse_args(['--help']) options, args = parser.parse_args() sequence_list=[] sequence_name=[] for seq_record in SeqIO.parse(options.file, "fasta"): sequence_name.append(str(seq_record.id)) sequence_list.append(str(seq_record.seq)) distance_matrix=CLUSThack.qgram(sequence_list,10,2,int(options.threads),None) distance_matrix=squareform(distance_matrix) linkage_matrix=linkage(distance_matrix) cutoff=0 if options.cutoff==None: cutoff = 0.5*max(linkage_matrix[:,2]) else: cutoff = options.cutoff clusters=fcluster(linkage_matrix,cutoff,'distance') for i in set(clusters): print 'Clust_'+str(i)+'\t'+';'.join([sequence_name[j] for j in find(clusters,i)]) if options.save_dendrogram: labels=range(len(sequence_list)) labels=[sequence_name[i] for i in labels] plt.subplot(1,1,1) plt.axhline(y=cutoff,linewidth=4, color='r') plt.title("Sequence clusters") augmented_dendrogram(linkage_matrix,color_threshold=cutoff,labels=labels,show_leaf_counts=True) fig = plt.gcf() fig.set_size_inches(len(sequence_list)/4.0,10) fig.tight_layout() plt.savefig(extract_filename(options.file,False)+'.pdf',dpi=100,format='pdf') if __name__ == '__main__': main()