Here, is my first naive attempt in R using a loop over a GRanges object. It works but is painfully slow as expected. Takes about 110 seconds for 1000 features on my computer. Afaik, one cannot use apply with GRanges. The way it is implemented now works almost without re-drawing, except for too small contigs. It might be faster to draw and assign everything in one go though using apply and then just re-draw features overlapping exclude. Any comment is welcome.
require(GenomicRanges)
shuffle <- function(genome, excludes=NULL, landmark.prob=TRUE,
keep.gene.models=TRUE, max.retries=10000) {
### make a lookup vector from the Parent character list
if (! is.null(genome$Parent))
lookup.parent <- unlist(sapply(genome$Parent, function(x)if (length(x) == 0) 'NA' else x ))
rdata <- as(excludes, "RangedData")
### Foreach feature that doesn't have a parent
### calculate gaps for placing objects:
gaplist <- gaps(ranges(rdata), start=1, end = seqlengths(genome))
### END preapations
### iterate only over the top-level
myset <- if ( keep.gene.models && ! is.null(genome$Parent))
which(lookup.parent == "NA")
else
1:length(genome)
for (i in myset) {
f <- genome[i]
### select a contig or chromosome:
landmark <- NULL
for (c in 1:max.retries) {
prob <- if (landmark.prob)
(seqlengths(genome))/max((seqlengths(genome)))
else NULL
landmark <- sample(seqlevels(genome), 1, prob=prob )
### redraw if landmark doesn't have enough free space (gene length > max gap length)
if (max(width(gaplist[[landmark]])) >= width(f)) break;
}
if (is.null(landmark)) {
warning ("couldn't place feature");
next;
}
### end choose chromosome
### choose a random start position from landmark (at least gene-length away from any exclude region)
### flank position intervals on start position with - feature width will ensure that.
### we have already chosen landmark such that there is enough space.
new.start <- as.numeric(sample(as.integer(flank(gaplist[[landmark]], -1*width(f))), 1))
#new.start <- (sample(as.integer(gaplist[[landmark]]), 1))
offset <- new.start-start(f)
### change feature coordinates
seqnames(genome[i]) <- factor(landmark, levels=seqlevels(f))
genome[i] <- shift(genome[i], offset)
### collect all dependent features and adjust their location to the new origin of the parent
if (keep.gene.models & ! is.null(genome$Parent)) {
children <- get.children.i(i, genome, lookup.parent)
if (!is.null(children)) {
seqnames(genome[children]) <- factor(landmark, levels=seqlevels(genome))
genome[children] <- shift(genome[children], offset)
}
## NEXT feature
}
}
return (genome)
}
## look for all children recursively
## this is slightly faster than using the objects
get.children.i <- function(i, allfeatures, lookup.parent) {
m <- match(allfeatures[i]$ID, lookup.parent)
if is.na(m))
return(c())
else {
ch <- get.children.i(m, allfeatures, lookup.parent)
return (c(m, ch))
}
}
Thanks for pointing this out - it is a feature that we should certainly provide.