Question

Error requiring a string or a buffer

0

Entering edit mode

22 months ago

zainabi8077 ▴ 20

Hello, I'm trying to run this Python script to eliminate over-representative sequences from my fastq files, but I keep getting an error. I'm new to bioinfomatics and have been using a series of pipelines for sequence assembling. With this script, I hoped to eliminate over-representative moments. The code is as follows:

import sys
import gzip
from os.path import basename
import argparse
import re
from itertools import izip,izip_longest

def seqsmatch(overreplist,read):
    flag=False
    if overreplist!=[]:
        for seq in overreplist:
            if seq in read:
                flag=True
                break
    return flag

def get_input_streams(r1file,r2file):
    if  r1file[-2:]=='gz':
        r1handle=gzip.open(r1file,'rb')
        r2handle=gzip.open(r2file,'rb')
    else:
        r1handle=open(r1file,'r')
        r2handle=open(r2file,'r')

    return r1handle,r2handle

def FastqItrate(iterable,fillvalue=None):
    "Grab one 4-line fastq read at a time"
    args = [iter(iterable)] * 4
    return izip_longest(fillvalue=fillvalue, *args) 

def ParseFastqcLog(fastqclog):    
    with open(fastqclog) as fp:
        for result in re.findall('Overrepresented sequences(.*?)END_MODULE', fp.read(), re.S):
            seqs=([i.split('\t')[0] for i in result.split('\n')[2:-1]])
    return seqs     

if __name__=="__main__": 
    parser = argparse.ArgumentParser(description="options for removing reads with over-represented sequences")
    parser.add_argument('-11','--left_reads',dest='leftreads',type=str,help='R1 fastq file')
    parser.add_argument('-12','--right_reads',dest='rightreads',type=str,help='R2 fastq file')
    parser.add_argument('-fql','--fastqc_left',dest='l_fastqc',type=str,help='fastqc text file for R1')
    parser.add_argument('-fqr','--fastqc_right',dest='r_fastqc',type=str,help='fastqc text file for R2')
    opts = parser.parse_args()

    leftseqs=ParseFastqcLog(opts.l_fastqc)
    rightseqs=ParseFastqcLog(opts.r_fastqc)

    r1_out=open('rmoverrep_'+basename(opts.leftreads).replace('.gz',''),'w')
    r2_out=open('rmoverrep_'+basename(opts.rightreads).replace('.gz',''),'w')

    r1_stream,r2_stream=get_input_streams(opts.leftreads,opts.rightreads)

    counter=0
    failcounter=0

    with r1_stream as f1, r2_stream as f2:
        R1=FastqIterate(f1)
        R2=FastqIterate(f2)
        for entry in R1:
            counter+=1
            if counter%100000==0:
                print "%s reads processed" % counter

            head1,seq1,placeholder1,qual1=[i.strip() for i in entry]
            head2,seq2,placeholder2,qual2=[j.strip() for j in R2.next()]

            flagleft,flagright=seqsmatch(leftseqs,seq1),seqsmatch(rightseqs,seq2)

            if True not in (flagleft,flagright):
                r1_out.write('%s\n' % '\n'.join([head1,seq1,'+',qual1]))
                r2_out.write('%s\n' % '\n'.join([head2,seq2,'+',qual2]))
            else:
                failcounter+=1


        print 'total # of reads evaluated = %s' % counter
        print 'number of reads retained = %s' % (counter-failcounter)
        print 'number of PE reads filtered = %s' % failcounter


r1_out.close()
r2_out.close()

Error:

Line 46 of the file "TranscriptomeAssemblyTools/RemoveFastqcOverrepSequenceReads.py," in leftseqs=ParseFastqcLog(opts.l fastqc). Line 33 of the file "TranscriptomeAssemblyTools/RemoveFastqcOverrepSequenceReads.py" in ParseFastqcLog with open(fastqclog) as fp: TypeError: Unicode coercion: string or buffer required, NoneType found**

python • 539 views

ADD COMMENT • link updated 22 months ago by Ram 44k • written 22 months ago by zainabi8077 ▴ 20

0

Entering edit mode

What are you typing into the command line to run the script? It doesn't appear to be finding a value for the command line argument -fql aka --fastqc_left.

ADD REPLY • link 22 months ago by rpolicastro 13k