This code example generates 10,000 intervals then queries them for overlapping regions. Requires only the presence of Python.
The code below requires the either the installation of the bx python package or alternatively you may just download the quicksect.py module and place it next to the script itself:
from random import randint, seed
# if you can install bx python then uncomment the line below
#
# from bx.intervals.operations.quicksect import IntervalNode
# otherwise just download the quickset module as shown above
# and place it in next to your program
#
from quicksect import IntervalNode
# the span of the generated intervals
SPAN = 10
# the size of the genome
SIZE = 5*10**4
# the number of intervals
N = 10**4
def generate(x):
"Generates random interval over a size and span"
lo = randint(10000, SIZE)
hi = lo + randint(1, SPAN)
return (lo, hi)
def find(start, end, tree):
"Returns a list with the overlapping intervals"
out = []
tree.intersect( start, end, lambda x: out.append(x) )
return [ (x.start, x.end) for x in out ]
# use this to force both examples to generate the same data
seed(10)
# generate 10 thousand random intervals
data = map(generate, xrange(N))
# generate the intervals to query over
query = map(generate, xrange(10))
# start the root at the first element
start, end = data[0]
tree = IntervalNode( start, end )
# build an interval tree from the rest of the data
for start, end in data[1:]:
tree = tree.insert( start, end )
for start, end in query:
overlap = find(start, end , tree)
print '(%s, %s) -> %s' % (start, end, overlap)
Produces the output:
(41901, 41903) -> [(41894, 41902)]
(36981, 36987) -> [(36981, 36984), (36973, 36982), (36978, 36987)]
(36338, 36339) -> [(36337, 36347)]
(32741, 32748) -> [(32738, 32742)]
(49864, 49872) -> [(49859, 49865)]
(21475, 21477) -> []
(29425, 29428) -> [(29418, 29426), (29419, 29426)]
(29590, 29599) -> [(29586, 29595), (29596, 29598)]
(12804, 12811) -> [(12806, 12811), (12799, 12806), (12809, 12819)]
(30339, 30343) -> [(30336, 30346), (30335, 30345), (30340, 30341)]
anyone benchmarked a nested containment list against an interval tree? other than pygr are there any NCL's in the wild?