|
# From http://www.biostars.org/p/83800/: |
|
|
|
# "What I want to do is to plot reads of my histone marks (in bam file) |
|
# around TSS with CpG and TSS without CpG (Essentially a coverage profile)." |
|
# |
|
# |
|
# To install metaseq and dependencies, see: |
|
# |
|
# |
|
# https://pythonhosted.org/metaseq/install.html |
|
# |
|
# |
|
# To download the example data used here, make sure you're in the directory |
|
# this script is saved in, and then use: |
|
# |
|
# git clone https://gist.github.com/a2e63a2fb93d05341de5.git demo_data |
|
# |
|
# (Or see https://gist.github.com/daler/a2e63a2fb93d05341de5 and download the |
|
# files individually) |
|
# |
|
|
|
|
|
import metaseq |
|
import pybedtools |
|
import numpy as np |
|
from matplotlib import pyplot as plt |
|
|
|
bam = metaseq.genomic_signal('demo_data/h3k4me3-chr21.bam', 'bam') |
|
cpg = pybedtools.BedTool('demo_data/cpg-chr21.bed.gz') |
|
tss = pybedtools.BedTool('demo_data/tss-chr21.bed.gz') |
|
|
|
# extend by 5 kb up/downstream |
|
tss = tss.slop(b=5000, genome='hg19') |
|
|
|
tss_with_cpg = tss.intersect(cpg, u=True) |
|
tss_without_cpg = tss.intersect(cpg, v=True) |
|
|
|
# change this to as many CPUs as you have in order to run in parallel |
|
processes = 1 |
|
|
|
# each read will be extended 3' to a total size of this many bp |
|
fragment_size = 200 |
|
|
|
# the region +/-5kb around each TSS will be split into a total of 100 bins, |
|
# change as needed |
|
bins = 100 |
|
|
|
x = np.linspace(-5000, 5000, bins) |
|
|
|
# most of the work happens here |
|
y1 = bam.array(tss_with_cpg, bins=bins, processes=processes, fragment_size=fragment_size) |
|
y2 = bam.array(tss_without_cpg, bins=bins, processes=processes, fragment_size=fragment_size) |
|
|
|
plt.rcParams['font.size'] = 11 |
|
plt.rcParams['font.family'] = 'Arial' |
|
|
|
plt.plot(x, y1.mean(axis=0), label='with cpg', color='k') |
|
plt.plot(x, y2.mean(axis=0), label='without cpg', color='r', linestyle='--') |
|
plt.legend(loc='best') |
|
plt.xlabel('Distance from TSS (bp)') |
|
plt.ylabel('Mean H3K4me3 read density') |
|
plt.show() |
I would not recommend to plot raw read counts directly from BAM as this may give you false impressions due to biases. For example, promoters by default have more reads do to GC and open chromatin bias. Thus, it is better to use log2 ratios or the difference between treatment and input.