Shell Utilities for QC

by Umer Zeeshan Ijaz and Chris Quince

We have written shell utilities that emulate the functionality of FastQC, a popular software that allows quality control checks on raw sequence data coming from high throughput sequencing pipelines. The sheer strength of these shell utilities comes from the powerful perl one-liners that have resulted in very compact scripts. For visualisation, we use gplot, a wrapper around gnuplot, that facilitates us to pipe quantitative data directly into gnuplot without the need of generating temporary files.

Dependencies

emboss, Download from here
gplot, Install using git clone https://github.com/RhysU/gplot.git
Statistics::Descriptive, Install using sudo perl -MCPAN -e shell install Statistics::Descriptive

Available Utilities

perbase_quality_FASTQ.sh: Per-base quality score for FASTQ file
average_quality_hist_FASTQ.sh: Average quality distribution for FASTQ file
duplication_hist_FAST[Q/A].sh: Duplication distribution for FASTQ/FASTA file
length_distribution_FAST[Q/A].sh: Length distribution for FASTQ/FASTA file
average_GC_hist_FAST[Q/A].sh: Average GC distribution for FASTQ/FASTA file
perbase_seqcontent_FAST[Q/A].sh: Per-base sequence content for FASTQ/FASTA file
perbase_GCcontent_FAST[Q/A].sh: Per-base GC content for FASTQ/FASTA file
perbase_ATcontent_FAST[Q/A].sh: Per-base AT content for FASTQ/FASTA file
perbase_Ncontent_FAST[Q/A].sh: Per-base N content for FASTQ/FASTA file
top_kmer_FAST[Q/A].sh: Top N Kmers for FASTQ/FASTA file

perbase_quality_FASTQ.sh: Per-base quality score for FASTQ file

Usage: ./perbase_quality_FASTQ.sh file.fastq
#!/bin/bash
gplot -e -x "Quality scores across all bases" using 1:2:3 with yerrorbars ::: <(cat $1 | \
	perl -MStatistics::Descriptive -lne 'push @a, $_; @a = @a[@a-4..$#a]; if ($. % 4 == 0){
		chomp($a[3]);
		$max_j=0;
		$j=0;
		++$i;
		map{$r{$i.":".++$j}=(ord($_)-33)}split("",$a[3]);
		if($j>$max_j){
			$max_j=$j;}}
		}{ 
		for($jj=1;$jj<=$max_j;$jj++){
			@m=();
			for($ii=1;$ii<=$i;$ii++){
				unless(not defined $r{$ii.":".$jj})
					{push @m,$r{$ii.":".$jj}}}
		$s=Statistics::Descriptive::Full->new();
		$s->add_data(@m); 
		print($jj."\t".$s->mean()."\t".$s->standard_deviation())}')
Output: ./perbase_quality_FASTQ.sh forward.fastq

average_quality_hist_FASTQ.sh: Average quality distribution for FASTQ file

Usage: ./average_quality_hist_FASTQ.sh file.fastq histogram_bin_size
#!/bin/bash
gplot -x "Average Quality Distribution" -H $2 using '(bin($1,bw)):(1.0)' smooth frequency w boxes ::: <(cat $1 | \
	perl -lne 'push @a, $_; @a = @a[@a-4..$#a]; if ($. % 4 == 0){
		chomp($a[3]);
		$s=0;
		map{$s+=ord($_)-33}split("",$a[3]);
		print $s/$a[3]=~y===c}')
Output: ./average_quality_hist_FASTQ.sh forward.fastq 0.125

duplication_hist_FAST[Q/A].sh: Duplication distribution for FASTQ/FASTA file

Usage: ./duplication_hist_FASTQ.sh file.fastq histogram_bin_size
#!/bin/bash
gplot -x "Duplication Distribution" -H $2 using '(bin($1,bw)):(1.0)' smooth frequency w boxes ::: <(cat $1 | \
	perl -ne 'push @a, $_; @a = @a[@a-4..$#a]; if ($. % 4 == 0){
		chomp($a[1]);
		$d{uc($a[1])}++;} 
		END { 
		foreach my $n (sort{ $d{$b} <=> $d{$a} } keys %d){
			print $d{$n}."\n"}}')
Usage ./duplication_hist_FASTA.sh file.fasta histogram_bin_size
#!/bin/bash
gplot -x "Duplication Distribution" -H $2 using '(bin($1,bw)):(1.0)' smooth frequency w boxes ::: <(cat $1 | \
	perl -pe '/^>/?s/^>/\n>/:s/\s*$// if$.>1' | \
		perl -ne 'push @a, $_; @a = @a[@a-2..$#a]; if ($. % 2 == 0){
			chomp($a[1]);
			$d{uc($a[1])}++;} 
			END { 
			foreach my $n (sort{ $d{$b} <=> $d{$a} } keys %d){
				print $d{$n}."\n"}}')
Output: ./duplication_hist_FASTQ.sh forward.fastq 2

length_distribution_FAST[Q/A].sh: Length distribution for FASTQ/FASTA file

Usage: ./length_distribution_FASTQ.sh file.fastq histogram_bin_size
#!/bin/bash
gplot -x "Length Distribution" -H $2 using '(bin($1,bw)):(1.0)' smooth frequency w boxes ::: <(cat $1 | \
	perl -lne 'push @a, $_; @a = @a[@a-4..$#a]; if ($. % 4 == 0){
		chomp($a[1]);
		print ($a[1]=~y===c)}')
Usage ./length_distribution_FASTA.sh file.fasta histogram_bin_size
#!/bin/bash
gplot -x "Length Distribution" -H $2 using '(bin($1,bw)):(1.0)' smooth frequency w boxes ::: <(cat $1 | \
	perl -pe '/^>/?s/^>/\n>/:s/\s*$// if$.>1' | \
		perl -lne 'print $_=y///c if !/^>/')
Output: ./length_distribution_FASTQ.sh forward.fastq 5

average_GC_hist_FAST[Q/A].sh: Average GC distribution for FASTQ/FASTA file

Usage: ./average_GC_hist_FASTQ.sh histogram_bin_size
#!/bin/bash
gplot -x "Average GC Distribution" -H $2 using '(bin($1,bw)):(1.0)' smooth frequency w boxes ::: <(cat $1 | \
	perl -lne 'push @a, $_; @a = @a[@a-4..$#a]; if ($. % 4 == 0){
		chomp($a[1]);
		print (($a[1]=~tr/GC//)/($a[1]=~y===c)*100)}')
Usage: ./average_GC_hist_FASTA.sh histogram_bin_size
#!/bin/bash
gplot -x "Average GC Distribution" -H $2 using '(bin($1,bw)):(1.0)' smooth frequency w boxes ::: <(cat $1 | \
	perl -pe '/^>/?s/^>/\n>/:s/\s*$// if$.>1' | \
		perl -lne 'push @a, $_; @a = @a[@a-2..$#a]; if ($. % 2 == 0){
			chomp($a[1]);
			print (($a[1]=~tr/GC//)/($a[1]=~y===c)*100)}')
Output: ./average_GC_hist_FASTQ.sh forward.fastq 0.5

perbase_seqcontent_FAST[Q/A].sh: Per-base sequence content for FASTQ/FASTA file

Usage: ./perbase_seqcontent_FASTQ.sh file.fastq
#!/bin/bash
cat $1 | perl -lne 'push @a, $_; @a = @a[@a-4..$#a]; if ($. % 4 == 0){
	chomp($a[1]);
	$max_j=0;
	$j=0;++$i;
	map{$r{$i.":".++$j}=$_}split("",$a[1]);
	if($j>$max_j){
		$max_j=$j;}}
	}{ 
	for($jj=1;$jj<=$max_j;$jj++){
		@m=();
		for($ii=1;$ii<=$i;$ii++){
			unless(not defined $r{$ii.":".$jj}){
				push @m,$r{$ii.":".$jj}}}
		$s=join("",@m);
		$l=($s=~y===c); 
		print ($jj."\t".((($s=~tr/A//)/$l)*100)."\t".((($s=~tr/C//)/$l)*100)."\t".((($s=~tr/G//)/$l)*100)."\t".((($s=~tr/T//)/$l)*100))}' > per_base_SQ.dat; \
gplot -e -x "Sequence content across all bases" u 1:2 t \"%A\" w l, \"\" u 1:3 t \"%C\" w l, \"\" u 1:4 t \"%G\" w l, \"\" u 1:5 t \"%T\" w l ::: per_base_SQ.dat; \
rm per_base_SQ.dat
Usage: ./perbase_seqcontent_FASTA.sh file.fasta
#!/bin/bash
cat $1 | perl -pe '/^>/?s/^>/\n>/:s/\s*$// if$.>1' | \
	perl -lne 'push @a, $_; @a = @a[@a-2..$#a]; if ($. % 2 == 0){
		chomp($a[1]);
		$max_j=0;
		$j=0;++$i;
		map{$r{$i.":".++$j}=$_}split("",$a[1]);
		if($j>$max_j){
			$max_j=$j;}}
		}{ 
		for($jj=1;$jj<=$max_j;$jj++){
			@m=();
			for($ii=1;$ii<=$i;$ii++){
				unless(not defined $r{$ii.":".$jj}){
					push @m,$r{$ii.":".$jj}}}
			$s=join("",@m);
			$l=($s=~y===c); 
			print ($jj."\t".((($s=~tr/A//)/$l)*100)."\t".((($s=~tr/C//)/$l)*100)."\t".((($s=~tr/G//)/$l)*100)."\t".((($s=~tr/T//)/$l)*100))}' > per_base_SQ.dat; \
gplot -e -x "Sequence content across all bases" u 1:2 t \"%A\" w l, \"\" u 1:3 t \"%C\" w l, \"\" u 1:4 t \"%G\" w l, \"\" u 1:5 t \"%T\" w l ::: per_base_SQ.dat; \
rm per_base_SQ.dat
Output: ./perbase_seqcontent_FASTQ.sh forward.fastq

perbase_GCcontent_FAST[Q/A].sh: Per-base GC content for FASTQ/FASTA file

Usage: ./perbase_GCcontent_FASTQ.sh file.fastq
#!/bin/bash
gplot -e -x "GC content across all bases" u 1:2 t \"%GC\" w l ::: <(cat $1 | \
	perl -lne 'push @a, $_; @a = @a[@a-4..$#a]; if ($. % 4 == 0){
		chomp($a[1]);
		$max_j=0;
		$j=0;++$i;
		map{$r{$i.":".++$j}=$_}split("",$a[1]);
		if($j>$max_j){
			$max_j=$j;}}
		}{ 
		for($jj=1;$jj<=$max_j;$jj++){
			@m=();
			for($ii=1;$ii<=$i;$ii++){
				unless(not defined $r{$ii.":".$jj}){
					push @m,$r{$ii.":".$jj}}}
			$s=join("",@m);
			$l=($s=~y===c); 
			print ($jj."\t".((($s=~tr/GC//)/$l)*100))}')
Usage: ./perbase_GCcontent_FASTA.sh file.fasta
#!/bin/bash
gplot -e -x "GC content across all bases" u 1:2 t \"%GC\" w l ::: <(cat $1 | \
	perl -pe '/^>/?s/^>/\n>/:s/\s*$// if$.>1' | \
		perl -lne 'push @a, $_; @a = @a[@a-2..$#a]; if ($. % 2 == 0){
			chomp($a[1]);
			$max_j=0;
			$j=0;++$i;
			map{$r{$i.":".++$j}=$_}split("",$a[1]);
			if($j>$max_j){
				$max_j=$j;}}
			}{ 
			for($jj=1;$jj<=$max_j;$jj++){
				@m=();
				for($ii=1;$ii<=$i;$ii++){
					unless(not defined $r{$ii.":".$jj}){
						push @m,$r{$ii.":".$jj}}}
				$s=join("",@m);
				$l=($s=~y===c); 
				print ($jj."\t".((($s=~tr/GC//)/$l)*100))}')
Output: ./perbase_GCcontent_FASTQ.sh forward.fastq

perbase_ATcontent_FAST[Q/A].sh: Per-base AT content for FASTQ/FASTA file

Usage: ./perbase_ATcontent_FASTQ.sh file.fastq
#!/bin/bash
gplot -e -x "AT content across all bases" u 1:2 t \"%AT\" w l ::: <(cat $1 | \
	perl -lne 'push @a, $_; @a = @a[@a-4..$#a]; if ($. % 4 == 0){
		chomp($a[1]);
		$max_j=0;
		$j=0;++$i;
		map{$r{$i.":".++$j}=$_}split("",$a[1]);
		if($j>$max_j){
			$max_j=$j;}}
		}{ 
		for($jj=1;$jj<=$max_j;$jj++){
			@m=();
			for($ii=1;$ii<=$i;$ii++){
				unless(not defined $r{$ii.":".$jj}){
					push @m,$r{$ii.":".$jj}}}
			$s=join("",@m);
			$l=($s=~y===c); 
			print ($jj."\t".((($s=~tr/AT//)/$l)*100))}')
Usage: ./perbase_ATcontent_FASTA.sh file.fasta
#!/bin/bash
gplot -e -x "AT content across all bases" u 1:2 t \"%AT\" w l ::: <(cat $1 | \
	perl -pe '/^>/?s/^>/\n>/:s/\s*$// if$.>1' | \
		perl -lne 'push @a, $_; @a = @a[@a-2..$#a]; if ($. % 2 == 0){
			chomp($a[1]);
			$max_j=0;
			$j=0;++$i;
			map{$r{$i.":".++$j}=$_}split("",$a[1]);
			if($j>$max_j){
				$max_j=$j;}}
			}{ 
			for($jj=1;$jj<=$max_j;$jj++){
				@m=();
				for($ii=1;$ii<=$i;$ii++){
					unless(not defined $r{$ii.":".$jj}){
						push @m,$r{$ii.":".$jj}}}
				$s=join("",@m);
				$l=($s=~y===c); 
				print ($jj."\t".((($s=~tr/AT//)/$l)*100))}')
Output: ./perbase_ATcontent_FASTQ.sh forward.fastq

perbase_Ncontent_FAST[Q/A].sh: Per-base N content for FASTQ/FASTA file

Usage: ./perbase_Ncontent_FASTQ.sh file.fastq
#!/bin/bash
gplot -e -x "N content across all bases" u 1:2 t \"%N\" w l ::: <(cat $1 | \
	perl -lne 'push @a, $_; @a = @a[@a-4..$#a]; if ($. % 4 == 0){
		chomp($a[1]);
		$max_j=0;
		$j=0;
		++$i;
		map{$r{$i.":".++$j}=$_}split("",$a[1]);
		if($j>$max_j){
			$max_j=$j;}}
		}{ 
		for($jj=1;$jj<=$max_j;$jj++){
			@m=();
			for($ii=1;$ii<=$i;$ii++){
				unless(not defined $r{$ii.":".$jj}){
					push @m,$r{$ii.":".$jj}}}
			$s=join("",@m);
			$l=($s=~y===c); 
			print ($jj."\t".((($s=~tr/N//)/$l)*100))}')
Usage: ./perbase_Ncontent_FASTA.sh file.fasta
#!/bin/bash
gplot -e -x "N content across all bases" u 1:2 t \"%N\" w l ::: <(cat $1 | \
	perl -pe '/^>/?s/^>/\n>/:s/\s*$// if$.>1' | \
		perl -lne 'push @a, $_; @a = @a[@a-2..$#a]; if ($. % 2 == 0){
			chomp($a[1]);
			$max_j=0;
			$j=0;++$i;
			map{$r{$i.":".++$j}=$_}split("",$a[1]);
			if($j>$max_j){
				$max_j=$j;}}
			}{ 
			for($jj=1;$jj<=$max_j;$jj++){
				@m=();
				for($ii=1;$ii<=$i;$ii++){
					unless(not defined $r{$ii.":".$jj}){
						push @m,$r{$ii.":".$jj}}}
				$s=join("",@m);
				$l=($s=~y===c); 
				print ($jj."\t".((($s=~tr/N//)/$l)*100))}')
Output: ./perbase_Ncontent_FASTQ.sh forward.fastq

top_kmer_FAST[Q/A].sh: Top N Kmers for FASTQ/FASTA file

Usage: ./top_kmer_FASTQ.sh file.fastq Kmer_size top_n
#!/bin/bash
gplot -H 1 -e using '2:xtic(1)' with histogram ::: <(wordcount $1 -wordsize=$2 -stdout -auto | head -$3)
Usage: ./top_kmer_FASTA.sh file.fasta Kmer_size top_n
#!/bin/bash
gplot -H 1 -e using '2:xtic(1)' with histogram ::: <(wordcount $1 -wordsize=$2 -stdout -auto | head -$3)
Output: ./top_kmer_FASTQ.sh forward.fastq 5 15


Last Updated by Dr Umer Zeeshan Ijaz on 23/04/2014.