#!/usr/bin/python # *************************************************************** # Name: METAmock # Purpose: Metagenomics analysis pipeline that maps reads against # the reference database and generates # -Depth of coverage # -Breadth of coverage # -Proportion of reads mapping to each reference genome # # The program requires the reference database to be indexed by bwa # and requires the following programs to be installed: # -bwa # -samtools # -bedtools # # The first public release (version 0.1) was developed # and completed by Umer Zeeshan Ijaz under the supervision # of Christopher Quince in the following hackathon: # # Event title: Integrative mixture models for enterotypes # Location: University of Glasgow & The Burn (Scotland) # Dates: from 12-08-2013 to 16-08-2013 # # organized by European Union's Earth System Science and Environmental # Management ES1103 COST Action ("Microbial ecology & the earth system: # collaborating for insight and success with the new generation of sequencing tools"). # This work would not have been possible, were it not for the useful discussions # with other participants of the hackathon, namely, # # Keith Harris # Leo Lahti # Jarkko Salojarvi # Karoline Faust # # Version: 0.1 # Authors: Umer Zeeshan Ijaz (Umer.Ijaz@glasgow.ac.uk) # http://userweb.eng.gla.ac.uk/umer.ijaz # Created: 2012-08-16 # License: Copyright (c) 2013 Computational Microbial Genomics Group, University of Glasgow, UK # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . # **************************************************************/ import os,sys,os.path import time,datetime import subprocess from optparse import OptionParser from optparse import Option, OptionValueError import csv import multiprocessing from textwrap import dedent from itertools import izip_longest #Ref:http://www.ngcrawford.com/2012/03/29/python-multiprocessing-large-files/ def process_chunk(d): """Replace this with your own function that processes data one line at a time""" d = d.strip() return d def grouper(n, iterable, padvalue=None): """grouper(3, 'abcdefg', 'x') -->('a','b','c'), ('d','e','f'), ('g','x','x')""" return izip_longest(*[iter(iterable)]*n, fillvalue=padvalue) #Ref:http://preshing.com/20110924/timing-your-code-using-pythons-with-statement class Timer: def __enter__(self): self.start = time.clock() return self def __exit__(self, *args): self.end = time.clock() self.interval = self.end - self.start #Ref:http://stackoverflow.com/questions/4109436/processing-multiple-values-for-one-single-option-using-getopt-optparse class MultipleOption(Option): ACTIONS = Option.ACTIONS + ("extend",) STORE_ACTIONS = Option.STORE_ACTIONS + ("extend",) TYPED_ACTIONS = Option.TYPED_ACTIONS + ("extend",) ALWAYS_TYPED_ACTIONS = Option.ALWAYS_TYPED_ACTIONS + ("extend",) def take_action(self, action, dest, opt, value, values, parser): if action == "extend": values.ensure_value(dest, []).append(value) else: Option.take_action(self, action, dest, opt, value, values, parser) def print_time_stamp(message): print datetime.datetime.fromtimestamp(time.time()).strftime('[%Y-%m-%d %H:%M:%S] ')+message #check if the program exists and return it's location, otherwise return None def which(program): def is_exe(fpath): return os.path.isfile(fpath) and os.access(fpath, os.X_OK) fpath, fname = os.path.split(program) if fpath: if is_exe(program): return program else: for path in os.environ["PATH"].split(os.pathsep): path = path.strip('"') exe_file = os.path.join(path, program) if is_exe(exe_file): return exe_file return None def print_prog_status(prog,loc): if loc != None: print 'Checking for \'' + prog + '\': found '+ loc else: print 'Checking for \'' + prog + '\': ERROR - could not find \''+prog+'\'' print 'Exiting.' sys.exit(1) def run_prog(prog): p = subprocess.Popen(prog, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # for line in iter(p.stdout.readline, b''): # print(">>> " + line.rstrip()) retval = p.wait() def write_dict_file_tsv(mydict,filename): writer = csv.writer(open(filename, 'wb'),delimiter='\t') for key, value in mydict.items(): writer.writerow([key, value]) def read_dict_file_tsv(filename): reader = csv.reader(open(filename, 'rb'),delimiter='\t') mydict = dict(x for x in reader) return mydict def main(): parser = OptionParser(option_class=MultipleOption,usage="usage: %prog [options] filename", version="%prog 0.1") parser.add_option("-q","--query_reads", action="append", dest="query_reads", help="specify single or paired-end reads in fastq format. For each file, use -q, e.g., -q R1.fastq -q R2.fastq") parser.add_option("-d","--reference_database", action="store", default=None, dest="reference_database", help="reference database") parser.add_option("-o","--output_name", action="store", dest="output_name", default=None, help="name under which the files are stored") parser.add_option("-t","--threads", action="store", dest="bwa_threads", default=8, help="number of threads to use in bwa") if len(sys.argv) == 1: parser.parse_args(['--help']) options, args = parser.parse_args() if options.reference_database==None or options.output_name==None: print 'ERROR - missing arguments (possibly -o, -d)' print 'Exiting.' sys.exit(1) PROG_BWA=which('bwa') PROG_SAMTOOLS=which('samtools') PROG_BEDTOOLS=which('bedtools') print "This is METAmock v0.1. Copyright (c) 2013 Computational Microbial Genomics Group, University of Glasgow, UK" #check programs print_prog_status("bwa",PROG_BWA) print_prog_status("samtools",PROG_SAMTOOLS) print_prog_status("bedtools",PROG_BEDTOOLS) if os.path.exists(options.output_name+'.sam') != True: if len(options.query_reads)==2: print_time_stamp('Aligning '+ os.path.basename(options.query_reads[0])+ " and "+os.path.basename(options.query_reads[1])+' against '+os.path.basename(options.reference_database) + ' using bwa on '+str(options.bwa_threads)+ ' threads') cmd='bwa mem -t '+str(options.bwa_threads)+' '+options.reference_database+' '+' '.join(options.query_reads)+' > '+options.output_name+'.sam' run_prog(cmd) print_time_stamp('Generated '+options.output_name+'.sam'+' successfully!') else: print 'Pipeline is only tested for paired-end reads so exiting now to avoid problems in downstream analysis' sys.exit(1) if os.path.exists(options.output_name+'.bam') != True: print_time_stamp('Converting '+options.output_name+'.sam'+' to '+options.output_name+'.bam') cmd='samtools view -h -b -S '+options.output_name+'.sam'+' > '+options.output_name+'.bam' run_prog(cmd) print_time_stamp('Generated '+options.output_name+'.bam'+' successfully!') if os.path.exists(options.output_name+'_mapped.bam') != True: print_time_stamp('Extracting mapped reads from bam file') cmd='samtools view -b -F 4 '+ options.output_name+'.bam' + ' > '+ options.output_name+'_mapped.bam' run_prog(cmd) print_time_stamp('Generated '+options.output_name+'_mapped.bam'+' successfully!') if os.path.exists('lengths.genome') != True: print_time_stamp('Extracting length of reference genomes') cmd='samtools view -H '+options.output_name+'_mapped.bam'+' | perl -ne \'if ($_ =~ m/^\@SQ/) { print $_ }\' | perl -ne \'if ($_ =~ m/SN:(.+)\s+LN:(\d+)/) { print $1, \"\\t\", $2, \"\\n\"}\' > lengths.genome' run_prog(cmd) print_time_stamp('Generated lengths.genome successfully!') if os.path.exists(options.output_name+'_mapped_sorted.bam') != True: print_time_stamp('Sorting mapped bam file') cmd='samtools sort -m 1000000000 '+options.output_name+'_mapped.bam'+' '+ options.output_name+'_mapped_sorted' run_prog(cmd) print_time_stamp('Generated '+options.output_name+'_mapped_sorted.bam'+' successfully!') if os.path.exists(options.output_name+'_perbasecoverage.tsv') != True: print_time_stamp('Generating per-base coverage for each reference genome') cmd='bedtools genomecov -ibam '+options.output_name+'_mapped_sorted.bam'+' -d -g lengths.genome > '+options.output_name+'_perbasecoverage.tsv' run_prog(cmd) print_time_stamp('Generated '+options.output_name+'_perbasecoverage.tsv'+' successfully!') print_time_stamp('Generating pipeline statistics') genomes_length={} with open("lengths.genome") as f: for line in f: line=line.rstrip() d=line.split('\t') genomes_length[d[0]]=int(d[1]) mean_genome_coverage={} proportion_genome_covered={} with Timer() as t: with open(options.output_name+'_perbasecoverage.tsv') as f: for line in f: line=line.rstrip() d=line.split('\t') mean_genome_coverage[d[0]]=int(d[2])+mean_genome_coverage.get(d[0],0) proportion_genome_covered[d[0]]=int(int(d[2])>0)+proportion_genome_covered.get(d[0],0) p = subprocess.Popen('samtools view -c '+options.output_name+'.bam', shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) total_reads=None for line in iter(p.stdout.readline, b''): total_reads=line.rstrip() retval=p.wait() cmd='samtools view '+options.output_name+'_mapped_sorted.bam'+' | awk \'{print $3}\' | sort | uniq -c | awk \'{print $2\"\\t\"$1}\' >'+options.output_name+'_readscount.tsv' run_prog(cmd) proportion_genome_reads=read_dict_file_tsv(options.output_name+'_readscount.tsv') for key in genomes_length.keys(): mean_genome_coverage[key]=float(mean_genome_coverage[key])/float(genomes_length[key]) proportion_genome_covered[key]=float(proportion_genome_covered[key])/float(genomes_length[key]) proportion_genome_reads[key]=float(proportion_genome_reads[key])/float(total_reads) write_dict_file_tsv(mean_genome_coverage,options.output_name+'_MEAN-GENOME-COVERAGE.tsv') write_dict_file_tsv(proportion_genome_covered,options.output_name+'_PROPORTION-GENOME-COVERED.tsv') write_dict_file_tsv(proportion_genome_reads,options.output_name+'_PROPORTION-GENOME-READS.tsv') print_time_stamp('Following files were generated successfully:') print(options.output_name+'_MEAN-GENOME-COVERAGE.tsv') print(options.output_name+'_PROPORTION-GENOME-COVERED.tsv') print(options.output_name+'_PROPORTION-GENOME-READS.tsv') if __name__ == '__main__': main()