As I routinely waste time on benchmarks... The take-home message is C/C++ still takes the crown of speed, four times as fast as Perl/Python. Among the following programs, only seqtk and brentp's work with multi-line fastq. The rest assume 4-line fastq. [PS: Python-2.7.2, Ruby-1.9.2p290, LuaJIT-beta8, Java-1.6.0_25, Perl-5.8.8 and gcc-4.1.2. i.e. latest python/ruby/luajit/java but years old C/Perl]
Program Lang User Sys Mem/MB Comments
(Rows marked [name-lost] lost their program name when this page was extracted; values with a missing column are kept in their original order.)
cat C 0.4 10.1 0.4 Unix cat
wc -l C 4.3 10.4 0.4 Unix wc -l
fqextract.c C 17.3 10.8 2.0 Hardcoded maximum line length: 4096
[name-lost] C++ 19.5 12.2 3.8 Use tr1::unordered_set
seqtk C 19.9 8.9 4.1 Multi-line fastq; subregion; gzip input
[name-lost] C++ 22.8 12.6 3.9 Replace cin with reading /dev/stdin
wc C 32.0 10.6 0.4 Unix wc (no command-line options)
[name-lost] Python 54.0 7.9 8.0 Same algorithm as fqextract.c
[name-lost] Java 55.8 8.9 562.7 Same algorithm as fqextract.c
[name-lost] Perl 71.5 10.5 5.5 Same algorithm as fqextract.c
[name-lost] Python 78.0 13.3 8.0 Infinite loop given improper input
fqextract.lua LuaJIT 109.6 8.6 6.1 Same algorithm as fqextract.c
[name-lost] Python 122.9 11.3 8.1 Multi-line fastq
bp-fqiter Python 124.5 9.9 15.2 Multi-line fastq; biopython
drio.rb Ruby 130.1 6.9 10.0 Same algorithm as fqextract.c
readfq.lua Lua 189.5 11.5 6.0 Multi-line fastq
[name-lost] Perl 203.1 69.6 5.7 Multi-line fastq
[name-lost] C++ 323.6 13.3 3.9 Same algorithm as [name-lost]
[name-lost] Python 928.8 34.2 15.6 Multi-line fastq
[name-lost] Perl >2400.0 >15.0 Multi-line fastq; killed
grep -f -A C >2400.0 >414.0 Killed; possible false positives
- Readfq is my implementation of a unified multi-line fasta/fastq parser in Perl, Python and Lua.
- It is a little surprising that the C/C++ implementations greatly outperform the Perl and Python implementations. I would have expected the difference to be less than 2-fold.
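- The Python program marked "Infinite loop given improper input" in the table, and the C++ program that uses the same algorithm, may go into an infinite loop when the input list contains names absent from the fastq file. The strategy used in fqextract is recommended. It is also faster, at least for Python.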
- BioPerl's fastq parser is impractically slow. The Perl version of readfq also parses multi-line fastq, but is far faster. BioPython's fastq parser is more efficient than BioPerl's, but not efficient enough. The Python version of readfq is 7 times faster.
- Unix cat is insanely fast. Unix "wc -l" (line counting only) is 8X faster than "wc" without options (which also counts words and bytes). As I remember, Unix fgrep (or grep -f) builds a finite-state automaton for the query strings. The algorithm is close to optimal for general use, but because fgrep ignores the format of fastq, it is far slower than a fastq-aware program.
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "khash.h" // khash.h from samtools/bwa/seqtk/klib
#define BUF_SIZE 4096
int main(int argc, char *argv[])
char buf[BUF_SIZE];
FILE *fp;
int lineno = 0, flag = 0, ret;
khash_t(s) *h;
if (argc == 1) {
fprintf(stderr, "Usage: cat in.fq | fqextract <name.lst>\n");
return 1;
h = kh_init(s);
fp = fopen(argv[1], "rb"); // FIXME: check fp
while (fgets(buf, BUF_SIZE, fp))
kh_put(s, h, strdup(buf), &ret); // FIXME: check ret
while (fgets(buf, BUF_SIZE, stdin)) {
if (++lineno%4 == 1)
flag = (kh_get(s, h, buf + 1) != kh_end(h));
if (flag) fputs(buf, stdout);
kh_destroy(s, h); // FIXME: free keys before destroy
return 0;
#include <iostream>
#include <string>
#include <fstream>
#include <cstdlib>
#include <tr1/unordered_set> // use <unordered_set> for more recent g++
using namespace std;
// Extracts 4-line FASTQ records from standard input using a set of names
// associated with the file given as argv[1].
// NOTE(review): this excerpt is incomplete. The braces of main(), the loop
// that fills `names`, the loop enclosing the reads below (the `break` and
// `continue` statements need one) and the assignment to `r` are not present.
int main(int argc,char **argv)
// `line` is declared but not used by any statement visible here.
string line;
// Names still to be found; nothing visible here inserts into it.
tr1::unordered_set<string> names;
// Open the file named by the first command-line argument.
ifstream in(argv[1],ios::in);
// Give up with a failure status when that file cannot be opened.
if(!in.is_open()) return EXIT_FAILURE;
// The four lines of one FASTQ record.
string name, seq, name2, qual;
// Iterator compared against names.end() below; it is never assigned in the
// visible code (presumably the result of a names.find(...) — TODO confirm).
tr1::unordered_set<string>::iterator r;
// Read the FASTQ stream through /dev/stdin rather than through cin.
ifstream in2("/dev/stdin", ios::in);
// Read one record, line by line; stop as soon as any read fails.
if(!getline(in2,name,'\n')) break;
if(!getline(in2,seq,'\n')) break;
if(!getline(in2,name2,'\n')) break;
if(!getline(in2,qual,'\n')) break;
// Skip the record when `r` does not point at an element of `names`.
if(r==names.end()) continue;
names.erase(r);//names are unique we don't need this one anymore
// Emit the record; endl also flushes cout after every record.
cout << name << "\n" << seq << "\n" << name2 << "\n" << qual << endl;
return 0;
import java.util.HashSet;
/**
 * Copies to standard output every 4-line FASTQ record read from standard
 * input whose header line equals '@' followed by a name listed in the file
 * given as the first command-line argument (one name per line).
 *
 * Records are assumed to span exactly four lines.
 */
class fqextract {
    /**
     * @param args args[0] is the path of the name list
     * @throws java.io.IOException if the name list or standard input cannot be read
     */
    public static void main(String[] args) throws java.io.IOException {
        if (args.length == 0) {
            System.err.println("Usage: cat in.fq | java fqextract <name.lst>");
            System.exit(1);
        }

        // Header lines start with '@', so store the names in that form.
        HashSet<String> h = new HashSet<String>();
        java.io.BufferedReader fp = new java.io.BufferedReader(new java.io.FileReader(args[0]));
        try {
            String name;
            while ((name = fp.readLine()) != null) h.add('@' + name);
        } finally {
            fp.close();
        }

        int lineno = 0;
        boolean flag = false;
        String l;
        fp = new java.io.BufferedReader(new java.io.InputStreamReader(System.in));
        while ((l = fp.readLine()) != null) {
            // The first line of each group of four is the record header; the
            // decision taken there applies to the remaining three lines.
            if (++lineno % 4 == 1) flag = h.contains(l);
            if (flag) System.out.println(l);
        }
    }
}
# Print every 4-line FASTQ record read from STDIN (or from the remaining
# arguments) whose header line is '@' followed by a name listed in the file
# given as the first argument (one name per line).
die("Usage: cat in.fq | $0 <name.lst>\n") if (@ARGV == 0);
my $fn = shift(@ARGV);
my %hash = ();
open(my $fh, '<', $fn) || die("Cannot open $fn: $!\n");
while (my $name = <$fh>) {
	# Store each key as the full header line, newline included, so that it
	# can be compared with $_ directly; chomp first so that a final line
	# without a newline still matches.
	chomp($name);
	$hash{"\@$name\n"} = 1;
}
close($fh);
my $flag = 0;
while (<>) {
	# The first line of each group of four is the header; its verdict also
	# applies to the following three lines.
	$flag = exists($hash{$_}) ? 1 : 0 if ($. % 4 == 1);
	print if ($flag);
}
import sys
# Print every 4-line FASTQ record read from stdin whose header line is '@'
# followed by a name listed in the file sys.argv[1] (one name per line).
with open(sys.argv[1]) as name_file:
    # Keys are full header lines, newline included, so they compare directly
    # against the lines read from stdin. Normalising the line ending lets a
    # final list line without a newline match as well.
    ids = set('@' + x.rstrip('\n') + '\n' for x in name_file)
lineno, flag = 0, False
for l in sys.stdin:
    lineno += 1
    # The first line of each group of four is the header; its verdict also
    # applies to the following three lines.
    if lineno % 4 == 1:
        flag = (l in ids)
    if flag:
        # Each line already ends with its newline; write() emits it unchanged
        # on both Python 2 and Python 3.
        sys.stdout.write(l)
from Bio import SeqIO
import sys
# Print, in FASTQ format, every record parsed from stdin whose identifier is
# listed in the file sys.argv[1] (one name per line).
with open(sys.argv[1]) as name_file:
    # Strip only the line terminator: slicing off the last character would
    # truncate the final name when the file does not end with a newline.
    ids = set(x.rstrip("\n") for x in name_file)
for seq in SeqIO.parse(sys.stdin, "fastq"):
    # NOTE(review): the left operand was missing in the original text; the
    # record identifier (seq.id) is assumed — confirm against the author's copy.
    if seq.id in ids:
        # format("fastq") already ends with a newline, so write it as is.
        sys.stdout.write(seq.format("fastq"))
from Bio.SeqIO.QualityIO import FastqGeneralIterator
import sys
# Print every record parsed from stdin by FastqGeneralIterator whose name is
# listed in the file sys.argv[1] (one name per line). Records are written
# back as four lines with a bare '+' separator.
with open(sys.argv[1]) as name_file:
    # Strip only the line terminator: slicing off the last character would
    # truncate the final name when the file does not end with a newline.
    ids = set(x.rstrip("\n") for x in name_file)
for name, seq, qual in FastqGeneralIterator(sys.stdin):
    if name in ids:
        print("@%s\n%s\n+\n%s" % (name, seq, qual))
EDIT1 by lh3: Optimized kseq.h for faster FASTQ parsing, though the parser is a little less robust than the previous version (fine for nearly all files). Added cat/wc/grep. Added more discussions.
EDIT2 by lh3: Added Ruby-1.9.2p290
EDIT3 by lh3: Added LuaJIT-beta8. Lua-5.1.4 is 20% slower. Added the two Lua programs listed in the table (fqextract.lua and readfq.lua).
EDIT4 by lh3: Added biopython and readfq parsers.
EDIT5 by lh3: Added BioPython.FastqGeneralIterator (see also here). This parser is as fast as my Python readfq parser.
