I'm trying to implement a javascript-based search of PWM in a DNA sequence. I've been inspired by the following post: http://davetang.org/muse/2013/10/01/position-weight-matrix/ and by the R/Biostrings-code https://github.com/genome-vendor/r-bioc-biostrings/blob/master/src/match_PWM.c. See my code below.
I'm starting from a PFM, and I convert it to PWM using the algorithm described in http://davetang.org/muse/2013/10/01/position-weight-matrix/. I get the same results:
PWM input
A 0 4 4 0 3 7 4 3 5 4 2 0 0 4
C 3 0 4 8 0 0 0 3 0 0 0 0 2 4
G 2 3 0 0 0 0 0 0 1 0 6 8 5 0
T 3 1 0 0 5 1 4 2 2 4 0 0 1 0
PWM output
A -1.936751795439824 0.7980887856393778 0.7980887856393778 -1.936751795439824 0.4535418763586175 1.5094375840527072 0.7980887856393778 0.4535418763586175 1.0760077609593484 0.7980887856393778 0 -1.936751795439824 -1.936751795439824 0.7980887856393778
C 0.4535418763586175 -1.936751795439824 0.7980887856393778 1.6854416207624505 -1.936751795439824 -1.936751795439824 -1.936751795439824 0.4535418763586175 -1.936751795439824 -1.936751795439824 -1.936751795439824 -1.936751795439824 0 0.7980887856393778
G 0 0.4535418763586175 -1.936751795439824 -1.936751795439824 -1.936751795439824 -1.936751795439824 -1.936751795439824 -1.936751795439824 -0.665198492276212 -1.936751795439824 1.308938775371164 1.6854416207624505 1.0760077609593484 -1.936751795439824
T 0.4535418763586175 -0.665198492276212 -1.936751795439824 -1.936751795439824 1.0760077609593484 -0.665198492276212 0.7980887856393778 0 0 0.7980887856393778 -1.936751795439824 -1.936751795439824 -0.665198492276212 -1.936751795439824
Searching the motif in the DNA sequence : I'm not confident about my code.
Score("GAAATTACAAGGG"): null
Score("ttacataagtagtc"): null
Score("ttacataACCagtc"): null
Score("CAACTAAAAAGGGC"): 2.692960768092946
My main problem is about the min_score
https://github.com/genome-vendor/r-bioc-biostrings/blob/master/src/match_PWM.c. As far as I understand this score should is a fraction 0<=min_score<1 . But (am I wrong ?), there is no normalization here on the X-axis. So is my code correct and if no, how should I fix it?
Thank you,
Pierre
<html> | |
<head> | |
<script type="text/javascript"> | |
//<![CDATA[ | |
/* from R code https://github.com/genome-vendor/r-bioc-biostrings/blob/master/src/match_PWM.c */ | |
function compute_pwm_score(pwm, S,offset) | |
{ | |
var score=0.0; | |
var pwm_ncol = pwm[0].length; | |
score = 0.00; | |
for (var x = 0; x < pwm_ncol; x++) | |
{ | |
var y = -1; | |
switch(S.charAt(offset+x)) | |
{ | |
case 'a':case 'A': y=0; break; | |
case 'c':case 'C': y=1; break; | |
case 't':case 'T': y=2; break; | |
case 'g':case 'G': y=3; break; | |
} | |
if (y == -1) continue; | |
score += pwm[y][x]; | |
} | |
return score; | |
} | |
/* from R code */ | |
function match_PWM_XString(pwm,S,minscore) | |
{ | |
var ncols= pwm[0].length; | |
for (var n1 = 0; n1+ncols<=S.length;++n1) | |
{ | |
var score = compute_pwm_score(pwm,S,n1); | |
if (score >= minscore) | |
{ | |
return score; | |
} | |
} | |
return null; | |
} | |
/* search pwm in dna */ | |
function scanPwm(matrix,dna,score) | |
{ | |
var score=match_PWM_XString(matrix,dna,score); | |
var E=document.getElementById("rez3"); | |
E.appendChild(document.createTextNode("Score(\""+dna+"\"): "+score+"\n")); | |
} | |
/** convert a matrix to string */ | |
function matrixToString(matrix) | |
{ | |
var s=""; | |
var bases=["A","C","G","T"]; | |
var i=0,j; | |
for(i=0;i< 4;++i) | |
{ | |
var total=0; | |
s+=bases[i]; | |
for(j=0;j< matrix[i].length;++j) | |
{ | |
s+=" "+matrix[i][j]; | |
total+=matrix[i][j]; | |
} | |
//s+=" ("+total+")"; | |
s+="\n"; | |
} | |
return s; | |
} | |
function run() | |
{ | |
/* example from http://davetang.org/muse/2013/10/01/position-weight-matrix/ */ | |
var pfm= [ | |
[0, 4, 4, 0, 3, 7, 4, 3, 5, 4, 2, 0, 0, 4],/* A */ | |
[3, 0, 4, 8, 0, 0, 0, 3, 0, 0, 0, 0, 2, 4],/* C */ | |
[2, 3, 0, 0, 0, 0, 0, 0, 1, 0, 6, 8, 5, 0],/* G */ | |
[3, 1, 0, 0, 5, 1, 4, 2, 2, 4, 0, 0, 1, 0] /* T */ | |
]; | |
var ncols= pfm[0].length; | |
var pwm= [ | |
new Array(ncols), | |
new Array(ncols), | |
new Array(ncols), | |
new Array(ncols) | |
]; | |
/* convert pfm to pwm */ | |
for(var x=0;x<ncols;++x) | |
{ | |
var total=0; | |
for(var y=0;y<4;++y) total+=pfm[y][x]; | |
for(var y=0;y<4;++y) | |
{ | |
var prob_base=0.25; | |
var freq=pfm[y][x]; | |
var w= Math.log2 ( ( freq + Math.sqrt(total) * prob_base ) / ( total + Math.sqrt(total) ) / prob_base ); | |
pwm[y][x]=w; | |
} | |
} | |
var E=document.getElementById("rez1"); | |
E.appendChild(document.createTextNode(matrixToString(pfm))); | |
E=document.getElementById("rez2"); | |
E.appendChild(document.createTextNode(matrixToString(pwm))); | |
/* search this pattern */ | |
var min_score=0.8; | |
scanPwm(pwm,"GAAATTACAAGGG",min_score); | |
scanPwm(pwm,"ttacataagtagtc",min_score); | |
scanPwm(pwm,"ttacataACCagtc",min_score); | |
scanPwm(pwm,"CAACTAAAAAGGGC",min_score); | |
} | |
//]]> | |
</script> | |
</head> | |
<body onload="run()"> | |
<h3>PFM</h3> | |
<pre id="rez1"></pre> | |
<h3>PWM</h3> | |
<pre id="rez2"></pre> | |
<h3>Search</h3> | |
<pre id="rez3"></pre> | |
</body> | |
</html> |
The script that is mentioned appears to be available here