#!/usr/bin/python # *************************************************************** # Name: AMPLImock # Purpose: Amplicon analysis pipeline that maps reads against # the reference database and generates statistics # Version: 0.1 # Authors: Umer Zeeshan Ijaz (Umer.Ijaz@glasgow.ac.uk) # http://userweb.eng.gla.ac.uk/umer.ijaz # Created: 2012-08-18 # License: Copyright (c) 2013 Computational Microbial Genomics Group, University of Glasgow, UK # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . # **************************************************************/ import os,sys,os.path import time,datetime import subprocess from optparse import OptionParser from optparse import Option, OptionValueError import csv import re import pylab from Bio import SeqIO from Bio.SeqUtils import GC import math import multiprocessing from textwrap import dedent from itertools import izip_longest #global dict variable to store values pipeline_statistics={} #Ref:http://stackoverflow.com/questions/5319922/python-check-if-word-is-in-a-string def findWholeWord(w): #return re.compile(r'\b({0})\b'.format(w), flags=re.IGNORECASE).search return re.compile(r'({0})'.format(w), flags=re.IGNORECASE).search #Ref:http://www.ngcrawford.com/2012/03/29/python-multiprocessing-large-files/ def process_chunk(d): """Replace this with your own function that processes data one line at a time""" d = d.strip() return d def grouper(n, iterable, padvalue=None): """grouper(3, 'abcdefg', 'x') -->('a','b','c'), ('d','e','f'), ('g','x','x')""" return izip_longest(*[iter(iterable)]*n, fillvalue=padvalue) #Ref:http://preshing.com/20110924/timing-your-code-using-pythons-with-statement class Timer: def __enter__(self): self.start = time.clock() return self def __exit__(self, *args): self.end = time.clock() self.interval = self.end - self.start #Ref:http://stackoverflow.com/questions/4109436/processing-multiple-values-for-one-single-option-using-getopt-optparse class MultipleOption(Option): ACTIONS = Option.ACTIONS + ("extend",) STORE_ACTIONS = Option.STORE_ACTIONS + ("extend",) TYPED_ACTIONS = Option.TYPED_ACTIONS + ("extend",) ALWAYS_TYPED_ACTIONS = Option.ALWAYS_TYPED_ACTIONS + ("extend",) def take_action(self, action, dest, opt, value, values, parser): if action == "extend": values.ensure_value(dest, []).append(value) else: Option.take_action(self, action, dest, opt, value, values, parser) def print_time_stamp(message): print datetime.datetime.fromtimestamp(time.time()).strftime('[%Y-%m-%d %H:%M:%S] ')+message #check if the program exists and return it's location, otherwise return None def which(program): def is_exe(fpath): return os.path.isfile(fpath) and os.access(fpath, os.X_OK) fpath, fname = os.path.split(program) if fpath: if is_exe(program): return program else: for path in os.environ["PATH"].split(os.pathsep): path = path.strip('"') exe_file = os.path.join(path, program) if is_exe(exe_file): return exe_file return None def print_cond(cond,true_message,false_message): if cond: print true_message else: print false_message def print_prog_status(prog,loc): if loc != None: print 'Checking for \'' + prog + '\': found '+ loc else: print 'Checking for \'' + prog + '\': ERROR - could not find \''+prog+'\'' print 'Exiting.' sys.exit(1) def run_prog(prog): # p = subprocess.Popen(prog, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) p = subprocess.Popen(prog, shell=True, stdout=None, stderr=None) # for line in iter(p.stdout.readline, b''): # print(">>> " + line.rstrip()) retval = p.wait() def write_dict_file_tsv(mydict,filename): writer = csv.writer(open(filename, 'wb'),delimiter='\t') for key, value in mydict.items(): writer.writerow([key,value]) def read_dict_file_tsv(filename): reader = csv.reader(open(filename, 'rb'),delimiter='\t') mydict = dict(x for x in reader) return mydict def write_dict_file_csv(mydict,filename): writer = csv.writer(open(filename, 'wb'),delimiter=',') for key, value in mydict.items(): writer.writerow([key,value]) def read_dict_file_csv(filename): reader = csv.reader(open(filename, 'rb'),delimiter=',') mydict = dict(x for x in reader) return mydict def extract_filename(filename,returnCompletePath): root, ext = os.path.splitext(filename) if returnCompletePath: return root else: return os.path.basename(root) #convert fastq files to fasta def convert_fastq_to_fasta(input_file_name,output_file_name): print_time_stamp('Converting '+input_file_name+' to '+output_file_name) SeqIO.convert(input_file_name, "fastq", output_file_name, "fasta") print_time_stamp('Generated '+output_file_name) #paired-end reads quality trimming using sickle def quality_trim_reads_pe(input_paired_end_1,input_paired_end_2,output_paired_end_1,output_paired_end_2,output_singlet,quality_type): print_time_stamp('Trimming '+input_paired_end_1+' and '+input_paired_end_2+' using quality type as \''+quality_type+'\', quality score of 20, and minimum of 10 bp to keep') cmd='sickle pe -f '+input_paired_end_1+' -r '+input_paired_end_2+' -t '+quality_type+' -o '+output_paired_end_1+' -p '+output_paired_end_2+' -s '+output_singlet+' -q 20 -l 10' run_prog(cmd) print_time_stamp('Generated '+output_paired_end_1+' ,'+output_paired_end_2+' ,and '+output_singlet) #single-end reads quality trimming using sickle def quality_trim_reads_se(input_single_end,output_single_end,quality_type): print_time_stamp('Trimming '+input_single_end+' using quality type as \''+quality_type+'\', quality score of 20, and minimum of 10 bp to keep') cmd='sickle se -f '+input_single_end+' -t '+quality_type+' -o '+output_single_end+' -q 20 -l 10' run_prog(cmd) print_time_stamp('Generated '+output_single_end) #phred quality score plot def generate_quality_plots(input_fastq_files,output_png_image): pylab.clf() print_time_stamp('Plotting PHRED quality scores for '+' and '.join(input_fastq_files)) for k in xrange(len(input_fastq_files)): filename = input_fastq_files[k] pylab.subplot(1, len(input_fastq_files), k) for i,record in enumerate(SeqIO.parse(filename, "fastq")): if i >= 50 : break #trick! pylab.plot(record.letter_annotations["phred_quality"]) pylab.ylim(0,45) pylab.ylabel("PHRED quality score") pylab.xlabel("Position") pylab.savefig(output_png_image) print_time_stamp('Generated '+output_png_image) #length histogram def generate_length_histogram(input_fasta_file,output_png_image): pylab.clf() print_time_stamp('Plotting sequences lengths histogram for '+input_fasta_file) sizes = [len(rec) for rec in SeqIO.parse(input_fasta_file, "fasta")] pylab.hist(sizes, bins=20) pylab.title("%i sequences\nLengths %i to %i" \ % (len(sizes),min(sizes),max(sizes))) pylab.xlabel("Sequence length (bp)") pylab.ylabel("Count") pylab.savefig(output_png_image) print_time_stamp('Generated '+output_png_image) #GC percentage plot def generate_GC_perc_plot(input_fasta_file,output_png_image): pylab.clf() print_time_stamp('Plotting GC percentage plot for '+input_fasta_file) gc_values = sorted(GC(rec.seq) for rec in SeqIO.parse(input_fasta_file, "fasta")) pylab.plot(gc_values) pylab.title("%i sequences\nGC%% %0.1f to %0.1f" \ % (len(gc_values),min(gc_values),max(gc_values))) pylab.xlabel("Sequences") pylab.ylabel("GC%") pylab.savefig(output_png_image) print_time_stamp('Generated '+output_png_image) def usearch(input_fasta_file,reference_database,matching_id,alnout_file,userout_file,output_matched_fasta_file,output_notmatched_fasta_file,threads): print_time_stamp('Searching '+input_fasta_file+ ' against '+reference_database+ ' using usearch on '+str(threads)+ ' threads and matching identity of ' + str(matching_id)) cmd='usearch -usearch_global '+input_fasta_file+' -db '+reference_database+' -id '+str(matching_id)+' -alnout '+alnout_file+' -strand plus -userout '+userout_file+' -userfields '+ 'query+target+id+alnlen+mism+opens+qlo+qhi+tlo+thi+evalue+bits -matched '+output_matched_fasta_file+' -notmatched '+output_notmatched_fasta_file + ' -threads '+str(threads) run_prog(cmd) print_time_stamp('Generated '+alnout_file+ ', '+userout_file+', '+output_matched_fasta_file+', and '+output_notmatched_fasta_file) def usearch_both(input_fasta_file,reference_database,matching_id,alnout_file,userout_file,output_matched_fasta_file,output_notmatched_fasta_file,threads): print_time_stamp('Searching '+input_fasta_file+ ' against '+reference_database+ ' using usearch on '+str(threads)+ ' threads and matching identity of ' + str(matching_id)) cmd='usearch -usearch_global '+input_fasta_file+' -db '+reference_database+' -id '+str(matching_id)+' -alnout '+alnout_file+' -strand both -userout '+userout_file+' -userfields '+ 'query+target+id+alnlen+mism+opens+qlo+qhi+tlo+thi+evalue+bits -matched '+output_matched_fasta_file+' -notmatched '+output_notmatched_fasta_file + ' -threads '+str(threads) run_prog(cmd) print_time_stamp('Generated '+alnout_file+ ', '+userout_file+', '+output_matched_fasta_file+', and '+output_notmatched_fasta_file) def fastq_statistics(fastq_file,prefix_string): cmd='cat '+fastq_file+' | awk \'((NR-2)%4==0){read=$1;total++;count[read]++}END{for(read in count){if(!max||count[read]>max) {max=count[read];maxRead=read};if(count[read]==1){unique++}};print total,unique,unique*100/total,maxRead,count[maxRead],count[maxRead]*100/total}\'' p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) record=None for line in iter(p.stdout.readline, b''): record=line.rstrip() retval=p.wait() d=record.split() pipeline_statistics[prefix_string+'_TOTAL_READS']=d[0] pipeline_statistics[prefix_string+'_UNIQUE_READS']=d[1] pipeline_statistics[prefix_string+'_UNIQUE_READS_PERCENTAGE']=d[2] pipeline_statistics[prefix_string+'_MOST_ABUNDANT_SEQUENCE']=d[3] pipeline_statistics[prefix_string+'_MOST_ABUNDANT_SEQUENCE_READS']=d[4] pipeline_statistics[prefix_string+'_MOST_ABUNDANT_SEQUENCE_READS_PERCENTAGE']=d[5] print_time_stamp(fastq_file+' has following stats:') print 'Total reads:'+d[0] print 'Unique reads:'+d[1] print 'Unique reads percentage:'+d[2] print 'Most abundant sequence:'+d[3] print 'Most abundant sequence reads:'+d[4] print 'Most abundant sequence read percentage:'+d[5] def fasta_statistics(fasta_file,prefix_string,has_size_header): if has_size_header: cmd='awk -F\"size=\" \'/>/{gsub(\";$\",\"\",$2);sum+=$2} END {print sum}\' '+fasta_file else: cmd='grep -c \">\" '+fasta_file p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) record=None for line in iter(p.stdout.readline, b''): record=line.rstrip() retval=p.wait() pipeline_statistics[prefix_string+'_TOTAL_READS']=record print_time_stamp(fasta_file+' has '+record+' reads') def usearch_statistics(userout_file,mapping_dict,prefix_string): print_time_stamp('Computing usearch statistics for '+userout_file) freq_dict={} total_records=0 mean_identity=0.0 with open(userout_file) as f: for line in f: line=line.rstrip() d=line.split('\t') size_match=re.findall(r';size=(\w.*?);',d[0]) res=mapping_dict.get(d[1], None) if res==None: if size_match: freq_dict[d[1]]=int(size_match[0])+freq_dict.get(d[1],0) else: freq_dict[d[1]]=1+freq_dict.get(d[1],0) else: if size_match: freq_dict[res]=int(size_match[0])+freq_dict.get(res,0) else: freq_dict[res]=1+freq_dict.get(res,0) if size_match: total_records=total_records+int(size_match[0]) mean_identity=mean_identity+(float(size_match[0])*float(d[2])) else: total_records=total_records+1 mean_identity=mean_identity+float(d[2]) mean_identity=mean_identity/total_records entropy=0.0 simpson=1.0 inverse_simpson=0.0; for key in freq_dict.keys(): if freq_dict[key]>0: freq_dict[key]=float(freq_dict[key])/float(total_records) entropy=entropy+(-freq_dict[key]*math.log(freq_dict[key])) simpson=simpson-math.pow(freq_dict[key],2) inverse_simpson=inverse_simpson+math.pow(freq_dict[key],2) inverse_simpson=1/inverse_simpson write_dict_file_csv(freq_dict,extract_filename(userout_file,False)+'_freq.csv') print_time_stamp('Generated '+extract_filename(userout_file,False)+'_freq.csv. Mean identity is '+str(mean_identity)+', simpson index is '+str(simpson)+ ', inverse simpson is '+str(inverse_simpson) + ', entropy is '+str(entropy)) pipeline_statistics[prefix_string+'_MEAN_IDENT']=str(mean_identity) pipeline_statistics[prefix_string+'_SIMPSON']=str(simpson) pipeline_statistics[prefix_string+'_INVERSE_SIMPSON']=str(inverse_simpson) pipeline_statistics[prefix_string+'_ENTROPY']=str(entropy) def usearch_derep_remove_chimeras(input_fasta_file,threads,output_nonchimera_fasta_file,output_chimera_fasta_file,flag): if flag==0: print_time_stamp('Dereplicating duplicate sequences in '+input_fasta_file+' using usearch on '+str(threads)+' threads, annotating with cluster sizes, and sorting by cluster size') cmd='usearch -derep_fulllength '+input_fasta_file+' -sizeout -output '+extract_filename(input_fasta_file,False)+'_derep.fasta'+' -threads '+str(threads) run_prog(cmd) print_time_stamp('Generated '+extract_filename(input_fasta_file,False)+'_derep.fasta') else: print_time_stamp('Copying reconstructed sequences to '+extract_filename(input_fasta_file,False)+'_derep.fasta') cmd='cp '+input_fasta_file+' '+extract_filename(input_fasta_file,False)+'_derep.fasta' run_prog(cmd) print_time_stamp('Generated '+extract_filename(input_fasta_file,False)+'_derep.fasta') print_time_stamp('Sorting reads in '+extract_filename(input_fasta_file,False)+'_derep.fasta'+' by abundance using usearch') cmd='usearch -sortbysize '+extract_filename(input_fasta_file,False)+'_derep.fasta' +' -output '+extract_filename(input_fasta_file,False)+'_derep_sorted.fasta' run_prog(cmd) print_time_stamp('Generated '+extract_filename(input_fasta_file,False)+'_derep_sorted.fasta') print_time_stamp('Detecting De novo chimeras in '+ extract_filename(input_fasta_file,False)+'_derep_sorted.fasta' + ' using UCHIME') cmd='usearch -uchime_denovo '+extract_filename(input_fasta_file,False)+'_derep_sorted.fasta'+ ' -nonchimeras '+output_nonchimera_fasta_file+ ' -chimeras '+output_chimera_fasta_file run_prog(cmd) print_time_stamp('Generated '+output_nonchimera_fasta_file+' and '+output_chimera_fasta_file) def pandaseq(forward_fastq_file,reverse_fastq_file,forward_primer,reverse_primer,overlap_base_pairs,eliminate_n,threads,overlap_fastq_file): if forward_primer==None or reverse_primer==None: if eliminate_n: print_time_stamp('Running pandaseq on '+str(threads)+' threads with eliminating undetermined nucleotide sequences in overlap and minimum overlap of '+str(overlap_base_pairs)+'bps') cmd='pandaseq -f '+forward_fastq_file+' -r '+reverse_fastq_file+' -B -F -d bfsrk -N -o '+str(overlap_base_pairs)+ ' -T '+str(threads)+' > '+overlap_fastq_file run_prog(cmd) else: print_time_stamp('Running pandaseq on '+str(threads)+' threads without eliminating undetermined nucleotide sequences in overlap and minimum overlap of '+str(overlap_base_pairs)+'bps') cmd='pandaseq -f '+forward_fastq_file+' -r '+reverse_fastq_file+' -B -F -d bfsrk -o '+str(overlap_base_pairs)+ ' -T '+str(threads)+' > '+overlap_fastq_file run_prog(cmd) else: if eliminate_n: print_time_stamp('Running pandaseq on '+str(threads)+' threads with eliminating undetermined nucleotide sequences in overlap, minimum overlap of '+str(overlap_base_pairs)+'bps ,'+forward_primer+ ' as forward primer, and '+reverse_primer+ ' as reverse primer') cmd='pandaseq -f '+forward_fastq_file+' -r '+reverse_fastq_file+' -p '+forward_primer+' -q '+reverse_primer+' -B -F -d bfsrk -N -o '+str(overlap_base_pairs)+ ' -T '+str(threads)+' > '+overlap_fastq_file run_prog(cmd) else: print_time_stamp('Running pandaseq on '+str(threads)+' threads without eliminating undetermined nucleotide sequences in overlap, minimum overlap of '+str(overlap_base_pairs)+'bps ,'+forward_primer+ ' as forward primer, and '+reverse_primer+ ' as reverse primer') cmd='pandaseq -f '+forward_fastq_file+' -r '+reverse_fastq_file+' -p '+forward_primer+' -q '+reverse_primer+' -B -F -d bfsrk -o '+str(overlap_base_pairs)+ ' -T '+str(threads)+' > '+overlap_fastq_file run_prog(cmd) print_time_stamp('Generated '+overlap_fastq_file) def main(): parser = OptionParser(option_class=MultipleOption,usage="usage: %prog [options] filename", version="%prog 0.1") parser.add_option("-q","--query_reads", action="append", dest="query_reads", help="specify single or paired-end reads in fasta/fastq format. For each file, use -q, e.g., -q R1.fastq -q R2.fastq") parser.add_option("-p","--primers", action="append", dest="primers", help="specify forward and reverse primers, for each primer use -p, e.g., -p CAGTGTCAGTTGA -p TCATATCTAATCTATATCGA. The first primer will be considered the forward primer and the second primer will be considered the reverse primer") parser.add_option("-d","--reference_database_forward", action="store", default=None, dest="reference_database_forward", help="reference database forward") parser.add_option("-e","--reference_database_reverse", action="store", default=None, dest="reference_database_reverse", help="reference database reverse") parser.add_option("-l","--list_file", action="store", default=None, dest="list_file", help="list file that contains the mapping from multiple 16S copies to genome") parser.add_option("-f","--full_length_amplicons_file", action="store", default=None, dest="full_length_amplicons_file", help="sequence file that contains reconstructed full length amplicons using 3rd party software such as emirge") parser.add_option("-t","--quality_type", action="store", default="sanger", dest="quality_type", help="Type of quality values (solexa (CASAVA < 1.3), illumina (CASAVA 1.3 to 1.7), sanger (which is CASAVA >= 1.8) (Default:sanger)") parser.add_option("-u","--use_quality_trimming", action="store_true", default=False, dest="use_quality_trimming", help="flag to allow quality trimming for fastq files (Default:False)") parser.add_option("-n","--eliminate_n", action="store_true", default=False, dest="eliminate_n", help="flag to eliminate all sequences with unknown nucleotides in the output of pandaseq (Default:False)") parser.add_option("-o","--overlap_base_pairs", action="store", dest="overlap_base_pairs", default=50, help="minimum number of overlapping base pairs to consider (Default:50)") parser.add_option("-s","--threads", action="store", dest="threads", default=10, help="number of threads to use in usearch and pandaseq (Default:10)") if len(sys.argv) == 1: parser.parse_args(['--help']) options, args = parser.parse_args() PROG_SICKLE=which('sickle') PROG_USEARCH=which('usearch') PROG_PANDASEQ=which('pandaseq') PROG_FASTQC=which('fastqc') print "This is AMPLImock v0.1. Copyright (c) 2013 Computational Microbial Genomics Group, University of Glasgow, UK" #check programs print_prog_status("sickle",PROG_SICKLE) print_prog_status("usearch",PROG_USEARCH) print_prog_status("pandaseq",PROG_PANDASEQ) #program parameters AMPLIMOCK_DIR = os.path.dirname(os.path.realpath(__file__)) + os.sep CURRENT_DIR = os.getcwd() PAIRED_END_1=None PANDASEQ_PAIRED_END_1=None PAIRED_END_1_IS_FASTA=None PAIRED_END_1_IS_TRIMMED=None PAIRED_END_2=None PANDASEQ_PAIRED_END_2=None PAIRED_END_2_IS_FASTA=None PAIRED_END_2_IS_TRIMMED=None SINGLE_END=None SINGLE_END_IS_FASTA=None SINGLE_END_IS_TRIMMED=None PERFORM_OVERLAPPING=None PERFORM_QUALITY_TRIMMING=None ELIMINATE_N=None MAPPING_FILE=None FORWARD_REFERENCE=None REVERSE_REFERENCE=None QUALITY_TYPE=None FORWARD_PRIMER=None REVERSE_PRIMER=None FULL_LENGTH_AMPLICONS_FILE=None OVERLAP_BASE_PAIRS=None OVERLAP_FILE=None THREADS=None #sanity checks and assignments if options.query_reads==None: print 'ERROR - query reads not specified. Use -q to specify them' print 'Exiting.' sys.exit(1) if options.reference_database_forward==None: print 'ERROR - forward reference database not specified. Use -d to specify it' print 'Exiting.' sys.exit(1) else: FORWARD_REFERENCE=options.reference_database_forward print 'Forward reference database:'+FORWARD_REFERENCE if options.reference_database_reverse==None: print 'ERROR - reverse reference database not specified. Use -e to specify it' print 'Exiting.' sys.exit(1) else: REVERSE_REFERENCE=options.reference_database_reverse print 'Reverse reference database:'+REVERSE_REFERENCE if options.list_file==None: print 'ERROR - list file containing mapping from multiple 16S genes not specified. Use -l to specify it' print 'Exiting.' sys.exit(1) else: MAPPING_FILE=options.list_file print 'List file containing mapping from multiple 16S genes:'+MAPPING_FILE if len(options.query_reads)==2: PAIRED_END_1=options.query_reads[0] PAIRED_END_2=options.query_reads[1] if findWholeWord('fasta$')(PAIRED_END_1) or findWholeWord('fna$')(PAIRED_END_1) or findWholeWord('fa$')(PAIRED_END_1): PAIRED_END_1_IS_FASTA=True else: PAIRED_END_1_IS_FASTA=False if findWholeWord('fasta$')(PAIRED_END_2) or findWholeWord('fna$')(PAIRED_END_2) or findWholeWord('fa$')(PAIRED_END_2): PAIRED_END_2_IS_FASTA=True else: PAIRED_END_2_IS_FASTA=False if findWholeWord("trim")(PAIRED_END_1): PAIRED_END_1_IS_TRIMMED=True else: PAIRED_END_1_IS_TRIMMED=False if findWholeWord("trim")(PAIRED_END_2): PAIRED_END_2_IS_TRIMMED=True else: PAIRED_END_2_IS_TRIMMED=False if options.use_quality_trimming and not (PAIRED_END_1_IS_TRIMMED or PAIRED_END_2_IS_TRIMMED) and not (PAIRED_END_1_IS_FASTA or PAIRED_END_2_IS_FASTA): PERFORM_QUALITY_TRIMMING=True else: PERFORM_QUALITY_TRIMMING=False PERFORM_OVERLAPPING=True OVERLAP_BASE_PAIRS=options.overlap_base_pairs print 'Forward reads:'+PAIRED_END_1 print 'Reverse reads:'+PAIRED_END_2 print_cond(PAIRED_END_1_IS_FASTA,"Query files are in fasta format","Query files are in fastq format") print_cond(PERFORM_QUALITY_TRIMMING,"Query files will be trimmed","Query files will not be trimmed") elif len(options.query_reads)==1: SINGLE_END=options.query_reads[0] if findWholeWord('fasta$')(SINGLE_END) or findWholeWord('fna$')(SINGLE_END) or findWholeWord('fa$')(SINGLE_END): SINGLE_END_IS_FASTA=True else: SINGLE_END_IS_FASTA=False if findWholeWord('trim')(SINGLE_END): SINGLE_END_IS_TRIMMED=True else: SINGLE_END_IS_TRIMMED=False if options.use_quality_trimming and not SINGLE_END_IS_TRIMMED and not SINGLE_END_IS_FASTA: PERFORM_QUALITY_TRIMMING=True else: PERFORM_QUALITY_TRIMMING=False PERFORM_OVERLAPPING=False print 'Single reads:'+SINGLE_END print_cond(SINGLE_END_IS_FASTA,"Query file is in fasta format","Query file is in fastq format") print_cond(PERFORM_QUALITY_TRIMMING,"Query file will be trimmed","Query file will not be trimmed") else: print 'ERROR - more files than what is supported in -q' print 'Exiting.' sys.exit(1) if options.primers!=None: if len(options.primers)==2: FORWARD_PRIMER=options.primers[0] REVERSE_PRIMER=options.primers[1] print 'Forward primer:'+FORWARD_PRIMER print 'Reverse primer:'+REVERSE_PRIMER else: print 'ERROR - only specified one primer. use -p to specify the other one' print 'Exiting.' sys.exit(1) if ((options.quality_type == 'solexa') or (options.quality_type == 'illumina') or (options.quality_type == 'sanger')): QUALITY_TYPE=options.quality_type else: print 'ERROR - unrecognized quality type in -t. Use sanger, solexa, or illumina' print 'Exiting.' sys.exit(1) ELIMINATE_N=options.eliminate_n THREADS=options.threads if options.full_length_amplicons_file!=None: FULL_LENGTH_AMPLICONS_FILE=options.full_length_amplicons_file PERFORM_OVERLAPPING=False else: PERFORM_OVERLAPPING=True OVERLAP_BASE_PAIRS=options.overlap_base_pairs print "Processing the data in "+CURRENT_DIR mapping_dict=read_dict_file_csv(MAPPING_FILE) if SINGLE_END==None: if PERFORM_QUALITY_TRIMMING: quality_trim_reads_pe(PAIRED_END_1,PAIRED_END_2,extract_filename(PAIRED_END_1,False)+'_trim.fastq',extract_filename(PAIRED_END_2,False)+'_trim.fastq',extract_filename(PAIRED_END_1,False)+'_singlet.fastq',QUALITY_TYPE) PAIRED_END_1=extract_filename(PAIRED_END_1,False)+'_trim.fastq' PAIRED_END_2=extract_filename(PAIRED_END_2,False)+'_trim.fastq' PANDASEQ_PAIRED_END_1=PAIRED_END_1 PANDASEQ_PAIRED_END_2=PAIRED_END_2 if not (PAIRED_END_1_IS_FASTA or PAIRED_END_2_IS_FASTA): fastq_statistics(PAIRED_END_1,'FORWARD') fastq_statistics(PAIRED_END_2,'REVERSE') generate_quality_plots([PAIRED_END_1],extract_filename(PAIRED_END_1,False)+'_phred-quality.png') generate_quality_plots([PAIRED_END_2],extract_filename(PAIRED_END_2,False)+'_phred-quality.png') convert_fastq_to_fasta(PAIRED_END_1,extract_filename(PAIRED_END_1,False)+'.fasta') convert_fastq_to_fasta(PAIRED_END_2,extract_filename(PAIRED_END_2,False)+'.fasta') PAIRED_END_1=extract_filename(PAIRED_END_1,False)+'.fasta' PAIRED_END_2=extract_filename(PAIRED_END_2,False)+'.fasta' generate_length_histogram(PAIRED_END_1,extract_filename(PAIRED_END_1,False)+'_length-histogram.png') generate_length_histogram(PAIRED_END_2,extract_filename(PAIRED_END_2,False)+'_length-histogram.png') generate_GC_perc_plot(PAIRED_END_1,extract_filename(PAIRED_END_1,False)+'_GC-percentage.png') generate_GC_perc_plot(PAIRED_END_2,extract_filename(PAIRED_END_2,False)+'_GC-percentage.png') usearch(PAIRED_END_1,FORWARD_REFERENCE,0.95,extract_filename(PAIRED_END_1,False)+'.aln',extract_filename(PAIRED_END_1,False)+'.ualn',extract_filename(PAIRED_END_1,False)+'_matched.fasta',extract_filename(PAIRED_END_1,False)+'_notmatched.fasta',THREADS) fasta_statistics(extract_filename(PAIRED_END_1,False)+'_matched.fasta','FORWARD_MATCHED',False) fasta_statistics(extract_filename(PAIRED_END_1,False)+'_notmatched.fasta','FORWARD_NOTMATCHED',False) usearch(PAIRED_END_2,REVERSE_REFERENCE,0.95,extract_filename(PAIRED_END_2,False)+'.aln',extract_filename(PAIRED_END_2,False)+'.ualn',extract_filename(PAIRED_END_2,False)+'_matched.fasta',extract_filename(PAIRED_END_2,False)+'_notmatched.fasta',THREADS) fasta_statistics(extract_filename(PAIRED_END_2,False)+'_matched.fasta','REVERSE_MATCHED',False) fasta_statistics(extract_filename(PAIRED_END_2,False)+'_notmatched.fasta','REVERSE_NOTMATCHED',False) usearch_statistics(extract_filename(PAIRED_END_1,False)+'.ualn',mapping_dict,'FORWARD_MATCHED') usearch_statistics(extract_filename(PAIRED_END_2,False)+'.ualn',mapping_dict,'REVERSE_MATCHED') if PERFORM_OVERLAPPING and not(PAIRED_END_1_IS_FASTA or PAIRED_END_2_IS_FASTA): OVERLAP_FILE=extract_filename(PAIRED_END_1,False)+'_overlap.fastq' pandaseq(PANDASEQ_PAIRED_END_1,PANDASEQ_PAIRED_END_2,FORWARD_PRIMER,REVERSE_PRIMER,OVERLAP_BASE_PAIRS,ELIMINATE_N,THREADS,OVERLAP_FILE) fastq_statistics(OVERLAP_FILE,'OVERLAP') convert_fastq_to_fasta(OVERLAP_FILE,extract_filename(OVERLAP_FILE,False)+'.fasta') OVERLAP_FILE=extract_filename(OVERLAP_FILE,False)+'.fasta' generate_length_histogram(OVERLAP_FILE,extract_filename(OVERLAP_FILE,False)+'_length-histogram.png') generate_GC_perc_plot(OVERLAP_FILE,extract_filename(OVERLAP_FILE,False)+'_GC-percentage.png') usearch(OVERLAP_FILE,FORWARD_REFERENCE,0.95,extract_filename(OVERLAP_FILE,False)+'.aln',extract_filename(OVERLAP_FILE,False)+'.ualn',extract_filename(OVERLAP_FILE,False)+'_matched.fasta',extract_filename(OVERLAP_FILE,False)+'_notmatched.fasta',THREADS) fasta_statistics(extract_filename(OVERLAP_FILE,False)+'_matched.fasta','OVERLAP_MATCHED',False) fasta_statistics(extract_filename(OVERLAP_FILE,False)+'_notmatched.fasta','OVERLAP_NOTMATCHED',False) usearch_statistics(extract_filename(OVERLAP_FILE,False)+'.ualn',mapping_dict,'OVERLAP_MATCHED') usearch_derep_remove_chimeras(OVERLAP_FILE,THREADS,extract_filename(OVERLAP_FILE,False)+'_derep_sorted_nonchimera.fasta',extract_filename(OVERLAP_FILE,False)+'_derep_sorted_chimera.fasta',0) else: OVERLAP_FILE=FULL_LENGTH_AMPLICONS_FILE usearch_derep_remove_chimeras(OVERLAP_FILE,THREADS,extract_filename(OVERLAP_FILE,False)+'_derep_sorted_nonchimera.fasta',extract_filename(OVERLAP_FILE,False)+'_derep_sorted_chimera.fasta',1) usearch(extract_filename(OVERLAP_FILE,False)+'_derep_sorted_nonchimera.fasta',FORWARD_REFERENCE,0.97,extract_filename(OVERLAP_FILE,False)+'_derep_sorted_nonchimera.aln',extract_filename(OVERLAP_FILE,False)+'_derep_sorted_nonchimera.ualn',extract_filename(OVERLAP_FILE,False)+'_derep_sorted_nonchimera_matched.fasta',extract_filename(OVERLAP_FILE,False)+'_derep_sorted_nonchimera_notmatched.fasta',THREADS) usearch_statistics(extract_filename(OVERLAP_FILE,False)+'_derep_sorted_nonchimera.ualn',mapping_dict,'OVERLAP_DEREP_NONCHIM_MATCHED') fasta_statistics(extract_filename(OVERLAP_FILE,False)+'_derep_sorted_nonchimera.fasta','OVERLAP_DEREP_NONCHIM',True) fasta_statistics(extract_filename(OVERLAP_FILE,False)+'_derep_sorted_chimera.fasta','OVERLAP_DEREP_CHIM',True) fasta_statistics(extract_filename(OVERLAP_FILE,False)+'_derep_sorted_nonchimera_matched.fasta','OVERLAP_DEREP_NONCHIM_MATCHED',True) fasta_statistics(extract_filename(OVERLAP_FILE,False)+'_derep_sorted_nonchimera_notmatched.fasta','OVERLAP_DEREP_NONCHIM_NOTMATCHED',True) else: if PERFORM_QUALITY_TRIMMING: quality_trim_reads_se(SINGLE_END,extract_filename(SINGLE_END,False)+'_trim.fastq',QUALITY_TYPE) SINGLE_END=extract_filename(SINGLE_END,False)+'_trim.fastq' generate_quality_plots([SINGLE_END],extract_filename(SINGLE_END,False)+'_phred-quality.png') if not SINGLE_END_IS_FASTA: fastq_statistics(SINGLE_END,'SINGLE') convert_fastq_to_fasta(SINGLE_END,extract_filename(SINGLE_END,False)+'.fasta') SINGLE_END=extract_filename(SINGLE_END,False)+'.fasta' else: fasta_statistics(SINGLE_END,'SINGLE',False) generate_length_histogram(SINGLE_END,extract_filename(SINGLE_END,False)+'_length-histogram.png') generate_GC_perc_plot(SINGLE_END,extract_filename(SINGLE_END,False)+'_GC-percentage.png') usearch_both(SINGLE_END,FORWARD_REFERENCE,0.95,extract_filename(SINGLE_END,False)+'.aln',extract_filename(SINGLE_END,False)+'.ualn',extract_filename(SINGLE_END,False)+'_matched.fasta',extract_filename(SINGLE_END,False)+'_notmatched.fasta',THREADS) fasta_statistics(extract_filename(SINGLE_END,False)+'_matched.fasta','SINGLE_MATCHED',False) fasta_statistics(extract_filename(SINGLE_END,False)+'_notmatched.fasta','SINGLE_NOTMATCHED',False) usearch_statistics(extract_filename(SINGLE_END,False)+'.ualn',mapping_dict,'SINGLE') print_time_stamp('Saving pipeline statistics in AMPLImock_stats.csv. Here is the summary of results:') write_dict_file_csv(pipeline_statistics,'AMPLImock_stats.csv') for key in sorted(pipeline_statistics.keys()): print key+','+pipeline_statistics[key] if __name__ == '__main__': main()