So I have two sam files and I want to delete from each one any reads that are not in the other one.
So I wrote some code using pysam to figure out which reads are in both and write those back out to new sam files.
However my new sam files don't have any headers. So the next time I try to open the files with pysam it aborts saying it can't find headers.
Any idea how to do this? I actually don't understand what the headers are for? Is there a way to get the correct headers into the files I'm creating?
Here's my code if it helps:
def open_as_pysam(self, filepath):
try:
return pysam.Samfile(filepath, 'r')
except:
return pysam.Samfile(filepath + '.gz','r')
def remove_non_matching_reads(self, sim_samfilepath, sec_samfilepath, delete_original_files=False):
"""Only keep reads that are in both files (by qname)."""
sim_removed_count, sec_removed_count = 0,0
sim_reads, sec_reads = set(), set()
sim_samfile_in = self.open_as_pysam(sim_samfilepath)
sec_samfile_in = self.open_as_pysam(sec_samfilepath)
for read in sim_samfile_in.fetch():
sim_reads.add(read.qname)
for read in sec_samfile_in.fetch():
sec_reads.add(read.qname)
both = sim_reads.intersection(sec_reads)
print "Found %s reads in sim file" % len(sim_reads)
print "Found %s reads in sec file" % len(sec_reads)
print "Found %s reads common to both files" % len(both)
#write out only those in both
out_sim_samfilepath = sim_samfilepath + '.noncommon.reads.removed.sam'
out_sec_samfilepath = sec_samfilepath + '.noncommon.reads.removed.sam'
sim_outfile = pysam.Samfile(out_sim_samfilepath, 'w', template=sim_samfile_in)
sec_outfile = pysam.Samfile(out_sec_samfilepath, 'w', template=sec_samfile_in)
#Close and re-open input files because I don't seem to have access to the seek
#method in the pysam files to reset the fetch.
sim_samfile_in.close()
sec_samfile_in.close()
sim_samfile_in = self.open_as_pysam(sim_samfilepath)
sec_samfile_in = self.open_as_pysam(sec_samfilepath)
for read in sim_samfile_in.fetch():
if read.qname in both:
sim_outfile.write(read)
else:
sim_removed_count +=1
for read in sec_samfile_in.fetch():
if read.qname in both:
sec_outfile.write(read)
else:
sec_removed_count +=1
print "Removed %s simulans reads and %s seculans reads" % (sim_removed_count, sec_removed_count)
sim_outfile.close()
sec_outfile.close()
if delete_original_files:
os.remove(sim_samfilepath)
os.remove(sec_samfilepath)
return out_sim_samfilepath, out_sec_samfilepath
You can also use picard CreateSequenceDictionary on the genome fasta and add that to the top of the sam file.