Hi all,
I have a BAM file with paired-end reads and relatively uneven coverage, and I want to produce a wiggle track of insert sizes averaged over sliding windows. Is there a tool for this?
generate a BED of insert sizes from your BAM (-f 66 keeps properly-paired, first-in-pair reads so each fragment is counted once; the awk prints the chromosome, the leftmost mate position minus 1, the rightmost mate position, and the absolute insert size):
samtools view -f 66 in.bam | awk -F '\t' '{printf("%s\t%d\t%s\t%s\n",$3,(int($4)<int($8)?int($4):int($8))-1,int($4)<int($8)?$8:$4,int($9)<0?-$9:$9);}' | LC_ALL=C sort -t$'\t' -k1,1 -k2,2n > f1.bed
create sliding windows from your reference (here 10 Mb windows stepping every 5 Mb; adjust -w and -s as needed):
bedtools makewindows -g ref.fasta.fai -w 10000000 -s 5000000 | LC_ALL=C sort -t$'\t' -k1,1 -k2,2n > f2.bed
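The ref.fasta.fai index here doubles as the bedtools genome file; if you don't already have it, it can be generated with samtools faidx (a minimal sketch, assuming your reference is named ref.fasta):
samtools faidx ref.fasta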
compute the intersection between the two BED files, cut the window chrom/start/end plus the fragment length, and group by window to take the mean fragment length:
bedtools intersect -wa -wb -a f2.bed -b f1.bed | cut -f 1,2,3,7 | bedtools groupby -i - -g 1,2,3 -c 4 -o mean > f3.bed
f3.bed is a bedGraph that you can convert to a wiggle/bigWig track using the UCSC tools.
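For example, with the UCSC bedGraphToBigWig utility (a sketch; chrom.sizes is derived from the same .fai index used above). Note that bedGraphToBigWig requires sorted, non-overlapping intervals, so if you go this route use a step size equal to the window size above, or trim the overlapping windows first:
cut -f1,2 ref.fasta.fai > chrom.sizes
bedGraphToBigWig f3.bed chrom.sizes f3.bw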
Pierre's answer may end up being easier, but since I was a bit curious how easy/hard this would be to do with the deepTools API, here is what I came up with (it was very non-trivial to write, but it should run quicker than the samtools approach):
#!/usr/bin/env python
import deeptools.utilities
from deeptools import bamHandler
from deeptools import mapReduce
from deeptools import writeBedGraph
from deeptools import utilities
from deeptools.utilities import getCommonChrNames, toBytes
import sys
import shutil
import os
import numpy as np

BAM = "something.bam"             # This is a BAM file name
binLength = 1000                  # Window size
stepSize = 50                     # The step size by which each window is shifted
numberOfProcessors = 20           # Number of threads
outFileName = "something.bigWig"  # Output file name
outFileFormat = "bigwig"          # or "bedgraph"


def writeBedGraph_wrapper(args):
    """
    Passes the arguments to writeBedGraph_worker.
    This is a step required given the constraints
    from the multiprocessing module.
    The args var contains as its first element the 'self' value
    from the WriteBedGraph object.
    """
    return foo.writeBedGraph_worker(*args)


class foo(writeBedGraph.WriteBedGraph):
    def calculate_mean_isize(self, bamHandle, chrom, regions):
        # Mean absolute template length (insert size) of the mapped reads
        # overlapping each region
        rv = np.zeros(len(regions), dtype='float64')
        for idx, reg in enumerate(regions):
            vals = [r.template_length for r in bamHandle.fetch(chrom, reg[0], reg[1]) if r.flag & 4 == 0]
            if len(vals) > 0:
                rv[idx] = np.mean(np.abs(vals))
        return rv

    def count_reads_in_region(self, chrom, start, end):
        bam_handlers = []
        for fname in self.bamFilesList:
            bam_handlers.append(bamHandler.openBam(fname))

        # A list of lists of tuples: one window of size binLength every stepSize
        transcriptsToConsider = []
        for i in range(start, end, self.stepSize):
            if i + self.binLength > end:
                break
            transcriptsToConsider.append([(i, i + self.binLength)])

        subnum_reads_per_bin = []
        for bam in bam_handlers:
            for trans in transcriptsToConsider:
                tcov = self.calculate_mean_isize(bam, chrom, trans)
                subnum_reads_per_bin.extend(tcov)
        subnum_reads_per_bin = np.concatenate([subnum_reads_per_bin]).reshape(-1, len(self.bamFilesList), order='F')
        return subnum_reads_per_bin, ''

    def writeBedGraph_worker(self, chrom, start, end, func_to_call, func_args):
        # Write one temporary bedGraph chunk covering this region
        coverage, _ = self.count_reads_in_region(chrom, start, end)
        _file = open(utilities.getTempFileName(suffix='.bg'), 'w')
        line_string = "{}\t{}\t{}\t{:g}\n"
        for tileIndex in range(coverage.shape[0]):
            tileCoverage = coverage[tileIndex, :]
            value = func_to_call(tileCoverage, func_args)
            writeStart = start + tileIndex * self.stepSize
            writeEnd = min(writeStart + self.stepSize, end)
            _file.write(line_string.format(chrom, writeStart, writeEnd, value))
        tempfilename = _file.name
        _file.close()
        return tempfilename

    def run(self, func_to_call, func_args, out_file_name, format="bedgraph"):
        bam_handlers = [bamHandler.openBam(x) for x in self.bamFilesList]
        genome_chunk_length = writeBedGraph.getGenomeChunkLength(bam_handlers, self.binLength)
        # check if both bam files correspond to the same species
        # by comparing the chromosome names:
        chrom_names_and_size, non_common = getCommonChrNames(bam_handlers, verbose=False)

        if self.region:
            # in case a region is used, append the tilesize
            self.region += ":{}".format(self.binLength)

        for x in list(self.__dict__.keys()):
            sys.stderr.write("{}: {}\n".format(x, self.__getattribute__(x)))

        res = mapReduce.mapReduce([func_to_call, func_args],
                                  writeBedGraph_wrapper,
                                  chrom_names_and_size,
                                  self_=self,
                                  genomeChunkLength=genome_chunk_length,
                                  region=self.region,
                                  numberOfProcessors=self.numberOfProcessors)

        # concatenate intermediary bedgraph files
        out_file = open(out_file_name + ".bg", 'wb')
        for tempfilename in res:
            if tempfilename:
                # concatenate all intermediate tempfiles into one bedgraph file
                _foo = open(tempfilename, 'rb')
                shutil.copyfileobj(_foo, out_file)
                _foo.close()
                os.remove(tempfilename)

        bedgraph_file = out_file.name
        out_file.close()
        if format == 'bedgraph':
            # the original looked up the sort binary in deepTools' config;
            # the plain system sort is used here instead
            sort_cmd = "sort"
            os.system("LC_ALL=C {} -k1,1 -k2,2n {} > {}".format(sort_cmd, bedgraph_file, out_file_name))
            os.remove(bedgraph_file)
            if self.verbose:
                print("output file: {}".format(out_file_name))
        else:
            writeBedGraph.bedGraphToBigWig(
                chrom_names_and_size, bedgraph_file, out_file_name, True)
            if self.verbose:
                print("output file: {}".format(out_file_name))
            os.remove(bedgraph_file)


# region="19" restricts the run to chromosome 19; drop it to process the whole genome
wr = foo([BAM], binLength=binLength, stepSize=stepSize, numberOfProcessors=numberOfProcessors, region="19")
func_args = {'scaleFactor': 1.0}
wr.run(writeBedGraph.scaleCoverage, func_args, outFileName, format=outFileFormat)
Change the input and output names, or add argparse, if you really want to reuse it. The code contains a lot of copy/paste from other deepTools functions, which is why it's a bit of a mess.
In the somewhat likely event that there's nothing prewritten to do this, I expect something can be thrown together with the deepTools python API (I can try to throw something together if you'd like).
That would be great!