Here is a functional approach to the problem, it was designed to scan long sequences measuring the entropy with a n length sliding window.:
import sys
import numpy as np
import inspect
import itertools
from pylab import *
from matplotlib import pyplot as plt
EPSILON = np.float64(1.0*10**-50)
DNA = ['A','C','G','T']
GAP = '-'
def apply(f, x):
return f(x)
def mapply(f, x):
return map(f, x)
def rapply(f, x, red):
return reduce(red, map(f,x))
def np_apply(f, x, type):
return np.array(map(f, x), dtype = type)
def gen_apply(val):
return lambda f: apply(f, val)
def gen_mapply(vals):
return lambda f: mapply(f, vals)
def gen_rapply(vals, red):
return lambda f: rapply(f, vals, red)
def comparator(val, increment):
return lambda x: increment if x == val else np.float64(0)
def gen_comparators(alpha, increment):
return map(lambda x: comparator(x, increment), alpha)
def prior_norm_count(seqs):
return np.float64(1.0 / (len(seqs) * len(seqs[0])))
def post_norm_count(seqs):
return np.float64(1.0 / len(seqs))
def gen_position(seqs, pos, wlen):
return map(lambda x: x[pos:pos+wlen], seqs)
def gen_positions(seqs, alpha):
return map(lambda x: gen_position(seqs, x, len(alpha[0])),
xrange(len(seqs[0])))
def pos_gapped(pos):
if True in map(lambda x: True if GAP in x else False, pos):
return 1
else:
return 0
def get_gaps(seqs, wlen):
return np_apply(lambda x: x != None, np_apply(lambda x: x if pos_gapped(gen_position(seqs, x, wlen)) is 1 else -1, xrange(len(seqs[0])), np.int64), np.int64)
def count_one_pos(comps, pos):
return np_apply(lambda x: rapply(x, pos, lambda x,y:x+y), comps,
np.float64)
def count_all_pos(comps, seqs, alpha):
return np_apply(lambda x: count_one_pos(comps, x),
gen_positions(seqs, alpha),
np.float64)
def count_total(counts):
return np_apply(lambda x: counts[:,x].sum(), xrange(len(counts[0])),
np.float64)
def add_epsilon(mat): return mat[:] + EPSILON
def plogp(mat): return mat[:] * np.log2(mat[:])
def a_priori(seqs, alpha):
return plogp(add_epsilon(count_total(count_all_pos(gen_comparators(alpha, prior_norm_count(seqs)), seqs, alpha)))).sum() * -1.0
def positional_entropy(seqs, alpha):
return np_apply(lambda x: plogp(x).sum() * -1.0, add_epsilon(count_all_pos(gen_comparators(alpha, post_norm_count(seqs)), seqs, alpha)), np.float64)
def positional_ic(seqs, alpha, prior):
return prior - positional_entropy(seqs, alpha)[:]
def fasta(indir):
return [reduce(lambda x,y: x+y, x.split('\n')) for x in open(indir, 'rb').read().split('>')[1:]]
def gen_nmer(alpha, size):
if size == 1: return alpha
else: return [reduce(lambda x, y: x+y, x) for x in list(itertools.product(alpha, repeat=size))]
def __main__():
alpha = gen_nmer(DNA, int(sys.argv[2]))
seqs = fasta(sys.argv[1])
count = prior_norm_count(seqs)
prior = a_priori(seqs, alpha)
post = positional_ic(seqs, alpha, prior)
gpos = filter(lambda x: x != -1, get_gaps(seqs, int(sys.argv[2])))
x = np.array(xrange(len(seqs[0])))
npos = filter(lambda x: x not in gpos, x)
avg = np.empty(len(seqs[0]), dtype = np.float64)
avg.fill(np.mean(post))
std = np.empty(len(seqs[0]), dtype = np.float64)
std.fill(np.std(post))
plt.figure(1)
plt.plot(x[gpos], post[gpos], 'r', label = 'Gapped Positions')
plt.plot(x[npos], post[npos], 'g', label = 'Ungapped Positions')
plt.plot(x, avg, 'b', label = 'Average IC')
plt.plot(x, avg + std, 'y', label = 'Mean+1 std')
plt.plot(x, avg - std, 'y', label = 'Mean-1 std')
plt.savefig(sys.argv[3], dpi=1024, format='pdf')
__main__()
Karl does make a point, not in the best way, but if you're looking for a homework problem you should really try this on your own. Calculating sequence entropy is a very basic task but it can be a really fun coding problem. Because the calculations are so flexible, you can really explore programming solutions. Especially with python where you have lambdas, list comprehensions and some other fun tools. This is fun in Haskell as well.
There's no question here. If you are trying to outsource work, please forward half your monthly paycheck to me and I will email you the solution.