Hi all,
I have a BAM file with paired-end reads and relatively uneven coverage, and I want to produce a wiggle track of insert sizes averaged over sliding windows. Is there a tool for this?
generate a BED of insert sizes from your BAM (-f 66 keeps properly-paired, first-in-pair reads so each fragment is counted once; the awk prints the chromosome, the leftmost mate position minus 1, the rightmost mate position, and the absolute insert size):
samtools view -f 66 in.bam | awk -F '\t' '{printf("%s\t%d\t%s\t%s\n",$3,(int($4)<int($8)?int($4):int($8))-1,int($4)<int($8)?$8:$4,int($9)<0?-$9:$9);}' | LC_ALL=C sort -t$'\t' -k1,1 -k2,2n > f1.bed
create sliding windows from your reference (here 10 Mb windows stepping every 5 Mb; adjust -w and -s as needed):
bedtools makewindows -g ref.fasta.fai -w 10000000 -s 5000000 | LC_ALL=C sort -t$'\t' -k1,1 -k2,2n > f2.bed
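The ref.fasta.fai index here doubles as the bedtools genome file; if you don't already have it, it can be generated with samtools faidx (a minimal sketch, assuming your reference is named ref.fasta):
samtools faidx ref.fasta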
compute the intersection between the two BED files, cut the window chrom/start/end plus the fragment length, and group by window to take the mean fragment length:
bedtools intersect -wa -wb -a f2.bed -b f1.bed | cut -f 1,2,3,7 | bedtools groupby -i - -g 1,2,3 -c 4 -o mean > f3.bed
f3.bed is a bedGraph that you can convert to a wiggle/bigWig track using the UCSC tools.
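For example, with the UCSC bedGraphToBigWig utility (a sketch; chrom.sizes is derived from the same .fai index used above). Note that bedGraphToBigWig requires sorted, non-overlapping intervals, so if you go this route use a step size equal to the window size above, or trim the overlapping windows first:
cut -f1,2 ref.fasta.fai > chrom.sizes
bedGraphToBigWig f3.bed chrom.sizes f3.bw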
Pierre's answer may end up being easier, but since I was a bit curious how easy/hard this would be to do with the deepTools API, here is what I came up with (it was very non-trivial to write, but it should run quicker than the samtools approach):
#!/usr/bin/env python
import deeptools.utilities
from deeptools import bamHandler
from deeptools import mapReduce
from deeptools import writeBedGraph
from deeptools import utilities
from deeptools.utilities import getCommonChrNames, toBytes
import sys
import shutil
import os
import numpy as np

BAM = "something.bam"             # This is a BAM file name
binLength = 1000                  # Window size
stepSize = 50                     # The step size by which each window is shifted
numberOfProcessors = 20           # Number of threads
outFileName = "something.bigWig"  # Output file name
outFileFormat = "bigwig"          # or "bedgraph"


def writeBedGraph_wrapper(args):
    """
    Passes the arguments to writeBedGraph_worker.
    This is a step required given the constraints
    from the multiprocessing module.
    The args var contains as its first element the 'self' value
    from the WriteBedGraph object.
    """
    return foo.writeBedGraph_worker(*args)


class foo(writeBedGraph.WriteBedGraph):
    def calculate_mean_isize(self, bamHandle, chrom, regions):
        # Mean absolute template length (insert size) of the mapped reads
        # overlapping each region
        rv = np.zeros(len(regions), dtype='float64')
        for idx, reg in enumerate(regions):
            vals = [r.template_length for r in bamHandle.fetch(chrom, reg[0], reg[1]) if r.flag & 4 == 0]
            if len(vals) > 0:
                rv[idx] = np.mean(np.abs(vals))
        return rv

    def count_reads_in_region(self, chrom, start, end):
        bam_handlers = []
        for fname in self.bamFilesList:
            bam_handlers.append(bamHandler.openBam(fname))

        # A list of lists of tuples: one window of size binLength every stepSize
        transcriptsToConsider = []
        for i in range(start, end, self.stepSize):
            if i + self.binLength > end:
                break
            transcriptsToConsider.append([(i, i + self.binLength)])

        subnum_reads_per_bin = []
        for bam in bam_handlers:
            for trans in transcriptsToConsider:
                tcov = self.calculate_mean_isize(bam, chrom, trans)
                subnum_reads_per_bin.extend(tcov)
        subnum_reads_per_bin = np.concatenate([subnum_reads_per_bin]).reshape(-1, len(self.bamFilesList), order='F')
        return subnum_reads_per_bin, ''

    def writeBedGraph_worker(self, chrom, start, end, func_to_call, func_args):
        # Write one temporary bedGraph chunk covering this region
        coverage, _ = self.count_reads_in_region(chrom, start, end)
        _file = open(utilities.getTempFileName(suffix='.bg'), 'w')
        line_string = "{}\t{}\t{}\t{:g}\n"
        for tileIndex in range(coverage.shape[0]):
            tileCoverage = coverage[tileIndex, :]
            value = func_to_call(tileCoverage, func_args)
            writeStart = start + tileIndex * self.stepSize
            writeEnd = min(writeStart + self.stepSize, end)
            _file.write(line_string.format(chrom, writeStart, writeEnd, value))
        tempfilename = _file.name
        _file.close()
        return tempfilename

    def run(self, func_to_call, func_args, out_file_name, format="bedgraph"):
        bam_handlers = [bamHandler.openBam(x) for x in self.bamFilesList]
        genome_chunk_length = writeBedGraph.getGenomeChunkLength(bam_handlers, self.binLength)
        # check if both bam files correspond to the same species
        # by comparing the chromosome names:
        chrom_names_and_size, non_common = getCommonChrNames(bam_handlers, verbose=False)

        if self.region:
            # in case a region is used, append the tilesize
            self.region += ":{}".format(self.binLength)

        for x in list(self.__dict__.keys()):
            sys.stderr.write("{}: {}\n".format(x, self.__getattribute__(x)))

        res = mapReduce.mapReduce([func_to_call, func_args],
                                  writeBedGraph_wrapper,
                                  chrom_names_and_size,
                                  self_=self,
                                  genomeChunkLength=genome_chunk_length,
                                  region=self.region,
                                  numberOfProcessors=self.numberOfProcessors)

        # concatenate intermediary bedgraph files
        out_file = open(out_file_name + ".bg", 'wb')
        for tempfilename in res:
            if tempfilename:
                # concatenate all intermediate tempfiles into one bedgraph file
                _foo = open(tempfilename, 'rb')
                shutil.copyfileobj(_foo, out_file)
                _foo.close()
                os.remove(tempfilename)

        bedgraph_file = out_file.name
        out_file.close()
        if format == 'bedgraph':
            # the original looked up the sort binary in deepTools' config;
            # the plain system sort is used here instead
            sort_cmd = "sort"
            os.system("LC_ALL=C {} -k1,1 -k2,2n {} > {}".format(sort_cmd, bedgraph_file, out_file_name))
            os.remove(bedgraph_file)
            if self.verbose:
                print("output file: {}".format(out_file_name))
        else:
            writeBedGraph.bedGraphToBigWig(
                chrom_names_and_size, bedgraph_file, out_file_name, True)
            if self.verbose:
                print("output file: {}".format(out_file_name))
            os.remove(bedgraph_file)


# region="19" restricts the run to chromosome 19; drop it to process the whole genome
wr = foo([BAM], binLength=binLength, stepSize=stepSize, numberOfProcessors=numberOfProcessors, region="19")
func_args = {'scaleFactor': 1.0}
wr.run(writeBedGraph.scaleCoverage, func_args, outFileName, format=outFileFormat)
Change the input and output names, or add argparse, if you really want to reuse it. The code contains a lot of copy/paste from other deepTools functions, which is why it's a bit of a mess.
In the somewhat likely event that there's nothing prewritten to do this, I expect something can be thrown together with the deepTools python API (I can try to throw something together if you'd like).
That would be great!