"""
split a single fastq file into random, non-overlapping subsets

arguments:

 + fastq file
 + number of splits
 + number of reps

e.g.:

    python fq.split.py input.fastq 3 4

will create 12 new files in 4 sets of 3. Each
set of 3 will contain all of the original records.
"""
|
|
|
import gzip
import random
import sys
from itertools import islice

try:
    from itertools import izip
except ImportError:  # Python 3: the built-in zip is already lazy
    izip = zip
|
|
|
def xopen(fq):
    """Open a FASTQ file for reading as text, transparently handling gzip.

    Paths ending in ``.gz`` are opened through ``gzip`` in text mode
    (``"rt"``); anything else goes through the built-in ``open``.  Text mode
    matters on Python 3, where ``gzip.open`` defaults to binary and would
    return ``bytes`` lines while plain files return ``str``.

    The caller is responsible for closing the returned file object.
    """
    if fq.endswith(".gz"):
        return gzip.open(fq, "rt")
    return open(fq)


def fqiter(fq, n=4):
    """Yield the records of a FASTQ file as lists of ``n`` lines.

    Whitespace-only lines are skipped, and carriage-return / newline
    characters are stripped from each remaining line before grouping.

    Args:
        fq: path to the FASTQ file; opened with ``xopen``.
        n: number of lines that make up one record (4 for standard FASTQ).

    Yields:
        A list of ``n`` strings for each record, in file order.

    Raises:
        ValueError: if the file ends in the middle of a record.
    """
    fh = xopen(fq)
    # try/finally rather than ``with``: GzipFile only became a context
    # manager in Python 2.7, so ``with`` fails for .gz input on older
    # interpreters.  This form closes the file everywhere, including when
    # the generator is abandoned early.
    try:
        fqclean = (x.strip("\r\n") for x in fh if x.strip())
        while True:
            rec = list(islice(fqclean, n))
            if not rec:
                # A plain return ends the generator; ``raise StopIteration``
                # here is turned into RuntimeError under PEP 479.
                return
            if len(rec) != n:
                raise ValueError(
                    "truncated record in %s: expected %d lines, got %d"
                    % (fq, n, len(rec))
                )
            yield rec
    finally:
        fh.close()
|
|
|
def fqsplit(fq, nchunks, nreps, prefix=None):
    """Split a FASTQ file into random, non-overlapping chunks, ``nreps`` times.

    For each replicate every record of ``fq`` is written to exactly one of
    ``nchunks`` output files, so the files of one replicate together contain
    all of the original records.  Output files are named
    ``<prefix>chunk-<c>.rep-<r>.fq`` with ``c`` and ``r`` counted from 1.
    The chunk size is reported on stderr.

    Args:
        fq: path to the input FASTQ file, read through ``fqiter``.
        nchunks: number of output files per replicate.
        nreps: number of independent random splits to produce.
        prefix: output name prefix; defaults to ``fq + ".split"``.

    Raises:
        ValueError: if ``nchunks`` is smaller than 1.
    """
    if nchunks < 1:
        raise ValueError("nchunks must be at least 1, got %r" % (nchunks,))

    if prefix is None:
        prefix = fq + ".split"
    prefix += "chunk-%i.rep-%i.fq"

    # Count records with the same reader that does the splitting, so blank
    # lines are ignored consistently in both passes.
    fq_size = sum(1 for _ in fqiter(fq))

    # Ceiling division: every shuffled index in [0, fq_size) maps to a chunk
    # below nchunks, and an evenly divisible input yields equal-sized chunks.
    chunk_size = max(1, -(-fq_size // nchunks))
    sys.stderr.write("chunk_size: %d\n" % chunk_size)

    for rep in range(1, nreps + 1):
        # order[k] is the random slot assigned to the k-th record of the file.
        order = list(range(fq_size))
        random.shuffle(order)

        files = []
        try:
            for c in range(1, nchunks + 1):
                files.append(open(prefix % (c, rep), "w"))
            for k, fqr in enumerate(fqiter(fq)):
                files[order[k] // chunk_size].write("\n".join(fqr) + "\n")
        finally:
            # Close whatever was opened, even if reading or writing failed.
            for out in files:
                out.close()
|
|
|
if __name__ == "__main__":
    # Command-line entry point: <fastq> <nchunks> <nreps>
    import sys

    # Exit with a usage line instead of an IndexError traceback when
    # arguments are missing.
    if len(sys.argv) < 4:
        sys.exit("usage: python %s <fastq> <nchunks> <nreps>" % sys.argv[0])

    fq = sys.argv[1]
    nchunks = int(sys.argv[2])
    nreps = int(sys.argv[3])
    fqsplit(fq, nchunks, nreps)
Thanks for the script. It seems to work, though I am getting an error after a few minutes.
As the fastq file is gzipped, this is the command I'm using:
After a few minutes I get the chunk size message.
But then the script stops without any errors, only with the traceback message:
Is it a memory problem? I hope you can help.
Thanks, Assa
I updated the script just now (to use izip in place of zip). Give it another try.
No, it is still not working. I can run it with the unzipped files, but not with the gzipped ones. I can't understand why.