How To Efficiently Parse A Huge Fastq File?

As I routinely waste time on benchmarks... The take-home message is C/C++ still takes the crown of speed, four times as fast as Perl/Python. Among the following programs, only seqtk and brentp's bioperl.pl work with multi-line fastq. The rest assume 4-line fastq. [PS: Python-2.7.2, Ruby-1.9.2p290, LuaJIT-beta8, Java-1.6.0_25, Perl-5.8.8 and gcc-4.1.2. i.e. latest python/ruby/luajit/java but years old C/Perl]

=========================================================================================
 Program        Lang       User  Sys   Mem/MB    Comments
-----------------------------------------------------------------------------------------
 cat            C           0.4  10.1     0.4    Unix cat
 wc -l          C           4.3  10.4     0.4    Unix wc -l
 fqextract.c    C          17.3  10.8     2.0    Hardcoded maximum line length: 4096
 Pierre-r2.cc   C++        19.5  12.2     3.8    Use tr1::unordered_set
 seqtk          C          19.9   8.9     4.1    Multi-line fastq; subregion; gzip input
 Pierre-r1.cc   C++        22.8  12.6     3.9    Replace cin with reading /dev/stdin
 wc             C          32.0  10.6     0.4    Unix wc (no command-line options)
 fqextract.py   Python     54.0   7.9     8.0    Same algorithm as fqextract.c
 fqextract.java Java       55.8   8.9   562.7    Same algorithm as fqextract.c
 fqextract.pl   Perl       71.5  10.5     5.5    Same algorithm as fqextract.c
 Leszek.py      Python     78.0  13.3     8.0    Infinite loop given improper input
 fqextract.lua  LuaJIT    109.6   8.6     6.1    Same algorithm as fqextract.c
 readfq.py      Python    122.9  11.3     8.1    Multi-line fastq
 bp-fqiter      Python    124.5   9.9    15.2    Multi-line fastq; biopython
 drio.rb        Ruby      130.1   6.9    10.0    Same algorithm as fqextract.c
 readfq.lua     Lua       189.5  11.5     6.0    Multi-line fastq
 readfq.pl      Perl      203.1  69.6     5.7    Multi-line fastq
 Pierre.cc      C++       323.6  13.3     3.9    Same algorithm as Leszek.py
 biopython.py   Python    928.8  34.2    15.6    Multi-line fastq
 bioperl.pl     Perl    >2400.0         >15.0    Multi-line fastq; killed
 grep -f -A     C       >2400.0        >414.0    Killed; possible false positives
=========================================================================================

#Discussions#

Readfq is my implementation of a unified multi-line fasta/fastq parser in Perl, Python and Lua.
It is a little surprising that C/C++ implementations greatly outperform Perl and Python implementations. I would expect the difference to be less than 2 folds.
Pierre.cc and Leszek.py may go into an infinite loop when the input list contains names absent from the fastq file. The strategy used in fqextract is recommended. It is also faster, at least for python.
BioPerl's fastq parser is impractically slow. Readfq.pl also parsers multi-line fastq, but is by far faster. BioPython's fastq parser is more efficient than BioPerl's, but not efficient enough. Readfq.py is 7 times faster.
Unix cat is insanely fast. Unix "wc -l" (line counting only) is 8X faster than "wc" without options (also counting words and bytes). As I remember, Unix fgrep (or grep -f) builds a finite-state automaton for query strings. The algorithm is close to the optimal for general uses, but as fgrep ignores the format of fastq, it is far slower than a fastq-aware program.

#Programs#

fqextract.c

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "khash.h" // khash.h from samtools/bwa/seqtk/klib
KHASH_SET_INIT_STR(s)

#define BUF_SIZE 4096

int main(int argc, char *argv[])
{
    char buf[BUF_SIZE];
    FILE *fp;
    int lineno = 0, flag = 0, ret;
    khash_t(s) *h;
    if (argc == 1) {
        fprintf(stderr, "Usage: cat in.fq | fqextract <name.lst>\n");
        return 1;
    }
    h = kh_init(s);
    fp = fopen(argv[1], "rb"); // FIXME: check fp
    while (fgets(buf, BUF_SIZE, fp))
        kh_put(s, h, strdup(buf), &ret); // FIXME: check ret
    fclose(fp);
    while (fgets(buf, BUF_SIZE, stdin)) {
        if (++lineno%4 == 1)
            flag = (kh_get(s, h, buf + 1) != kh_end(h));
        if (flag) fputs(buf, stdout);
    }
    kh_destroy(s, h); // FIXME: free keys before destroy
    return 0;
}

Pierre-r2.cc

#include <iostream>
#include <string>
#include <fstream>
#include <cstdlib>
#include <tr1/unordered_set> // use <unordered_set> for more recent g++
using namespace std;

int main(int argc,char **argv)
{
    string line;
    tr1::unordered_set<string> names;
    ifstream in(argv[1],ios::in);
    if(!in.is_open()) return EXIT_FAILURE;
    while(getline(in,line,'\n'))
        names.insert(line);
    in.close();
    string name, seq, name2, qual;
    tr1::unordered_set<string>::iterator r;
    ifstream in2("/dev/stdin", ios::in);
    while(!names.empty())
    {
        if(!getline(in2,name,'\n')) break;
        if(!getline(in2,seq,'\n')) break;
        if(!getline(in2,name2,'\n')) break;
        if(!getline(in2,qual,'\n')) break;
        r=names.find(name);
        if(r==names.end()) continue;
        names.erase(r);//names are unique we don't need this one anymore
        cout << name << "\n" << seq << "\n" << name2 << "\n" << qual << endl;
    }
    in2.close();
    return 0;
}

fqextract.java

import java.util.HashSet;
import java.io.*;

class fqextract {
    public static void main(String[] args) throws IOException {
        BufferedReader fp = new BufferedReader(new FileReader(args[0]));
        HashSet h = new HashSet();
        String l;
        while ((l = fp.readLine()) != null) h.add('@' + l);
        fp.close();
        int lineno = 0;
        boolean flag = false;
        fp = new BufferedReader(new InputStreamReader(System.in));
        while ((l = fp.readLine()) != null) {
            if (++lineno % 4 == 1) flag = h.contains(l);
            if (flag) System.out.println(l);
        }
    }
}

seqtk

fqextract.pl

die("Usage: cat in.fq | fqextract.pl <name.lst>\n") if (@ARGV == 0);
my $fn = shift(@ARGV);
my %hash = ();
open(FILE, $fn) || die;
$hash{'@'.$_} = 1 while (<FILE>);
close(FILE);
my $flag = 0;
while (<>) {
    $flag = defined($hash{$_})? 1 : 0 if ($.%4 == 1);
    print if ($flag);
}

fqextract.py

import sys 
ids, lineno, flag = set('@' + x for x in open(sys.argv[1])), 0, False
for l in sys.stdin:
    lineno += 1
    if lineno%4 == 1: flag = (l in ids)
    if flag: print l,

biopython.py

from Bio import SeqIO
import sys
ids = set(x[:-1] for x in open(sys.argv[1]))
for seq in SeqIO.parse(sys.stdin, "fastq"):
    if (seq.id in ids): print seq.format("fastq"),

bp-fqiter.py

from Bio.SeqIO.QualityIO import FastqGeneralIterator
import sys 
ids = set(x[:-1] for x in open(sys.argv[1]))
for name, seq, qual in FastqGeneralIterator(sys.stdin):
    if (name in ids): print("@%s\n%s\n+\n%s" % (name, seq, qual))

EDIT1 by lh3: Optimized kseq.h for faster FASTQ parsing, though the parser is a little less robust than the previous version (fine for nearly all files). Added cat/wc/grep. Added more discussions.

EDIT2 by lh3: Added Ruby-1.9.2p290

EDIT3 by lh3: Added LuaJIT-beta8. Lua-5.1.4 is 20% slower. Added fqextract.py and fqextract.java

EDIT4 by lh3: Added biopython and readfq parsers.

EDIT5 by lh3: Added BioPython.FastqGeneralIterator (see also here). This parser is as fast as my readfq.py.

	from Bio import SeqIO
	import sys
	ids = set(x[:-1] for x in open(sys.argv[1]))
	wanted = (rec for rec in SeqIO.parse(sys.stdin, "fastq") if rec.id in ids)
	SeqIO.write(wanted, sys.stdout, "fastq")

	from Bio.SeqIO.QualityIO import FastqGeneralIterator
	import sys
	ids = set(x[:-1] for x in open(sys.argv[1]))
	for title, seq, quals in FastqGeneralIterator(sys.stdin):
	if title.split(None,1)[0] in ids:
	print "@%s\n%s\n+\n%s\n" % (title, seq, quals)