how to align lists of words using biopython pairwise2
3
1
Entering edit mode
8.1 years ago
euacar ▴ 10

When I run the script below, output is getting split into single chars. Any idea why? It looks like the second argument gets split into single chars. I am trying to align the word sequences. I will have many words hence cannot map them to letters only.

Thx Ercan

from Bio.Seq import Seq

from Bio.pairwise2 import format_alignment

fruits = ["orange","pear", "apple","pear","orange"]

fruits1 = ["pear","apple"]


from Bio import pairwise2

alignments = pairwise2.align.localms(fruits,fruits1,2,-1,-0.5,-0.1, gap_char=["-"])

for a in alignments: 

    print(format_alignment(*a))

Output:

 ['orange', 'r', 'a', 'e', 'p', 'e', 'l', 'p', 'p', 'a', 'pear', 'orange']
 |||||||||
['-', 'r', 'a', 'e', 'p', 'e', 'l', 'p', 'p', 'a', '-', '-']
  Score=4
alignment sequence • 8.3k views
ADD COMMENT
1
Entering edit mode

Please reformat your code using the 101010 button or by putting 4 spaces before each line of code. This is particularly important for python as you'll need to correctly indent the code so that we can be sure it's written correctly.

ADD REPLY
0
Entering edit mode

Thx For the feedback

ADD REPLY
0
Entering edit mode

As said by jrj.healey, code formatting is very important. I now added code markup to your post for increased readability. You can do this by selecting the text and clicking the 101010 button. When you compose or edit a post that button is in your toolbar, see image below:

101010 Button

ADD REPLY
0
Entering edit mode

Thx For the feedback

ADD REPLY
6
Entering edit mode
8.1 years ago

BioPython allows for performing an alignment on sequences' symbols (not words). I can't think of any program that would do the words' alignment. I suggest you search the Web for Python modules/functions that perform string character alignments and then modify the existing code to handle words.

For example, I tweaked some code from GitHub to do local (water) and global (needle) alignments of words.

Filename: alignment.py:

#This software is a free software. Thus, it is licensed under GNU General Public License.
#Python implementation to Smith-Waterman Algorithm for Homework 1 of Bioinformatics class.
#Forrest Bao, Sept. 26 <http://fsbao.net> <forrest.bao aT gmail.com>
# zeros() was origianlly from NumPy.
# This version is implemented by alevchuk 2011-04-10
def zeros(shape):
retval = []
for x in range(shape[0]):
retval.append([])
for y in range(shape[1]):
retval[-1].append(0)
return retval
match_award = 20
mismatch_penalty = -5
gap_penalty = -5 # both for opening and extanding
def match_score(alpha, beta):
if alpha == beta:
return match_award
elif alpha == '-' or beta == '-':
return gap_penalty
else:
return mismatch_penalty
def finalize(align1, align2):
align1.reverse() #reverse sequence 1
align2.reverse() #reverse sequence 2
i,j = 0,0
#calcuate identity, score and aligned sequeces
found = 0
score = 0
identity = 0
for i in range(0,len(align1)):
# if two AAs are the same, then output the letter
if align1[i] == align2[i]:
identity = identity + 1
score += match_score(align1[i], align2[i])
# if they are not identical and none of them is gap
elif align1[i] != align2[i] and align1[i] != '-' and align2[i] != '-':
score += match_score(align1[i], align2[i])
found = 0
#if one of them is a gap, output a space
elif align1[i] == '-' or align2[i] == '-':
score += gap_penalty
identity = float(identity) / len(align1)
return identity, score, align1, align2
def needle(seq1, seq2):
m, n = len(seq1), len(seq2) # length of two sequences
# Generate DP table and traceback path pointer matrix
score = zeros((m+1, n+1)) # the DP table
# Calculate DP table
for i in range(0, m + 1):
score[i][0] = gap_penalty * i
for j in range(0, n + 1):
score[0][j] = gap_penalty * j
for i in range(1, m + 1):
for j in range(1, n + 1):
match = score[i - 1][j - 1] + match_score(seq1[i-1], seq2[j-1])
delete = score[i - 1][j] + gap_penalty
insert = score[i][j - 1] + gap_penalty
score[i][j] = max(match, delete, insert)
# Traceback and compute the alignment
align1, align2 = [], []
i,j = m,n # start from the bottom right cell
while i > 0 and j > 0: # end toching the top or the left edge
score_current = score[i][j]
score_diagonal = score[i-1][j-1]
score_up = score[i][j-1]
score_left = score[i-1][j]
if score_current == score_diagonal + match_score(seq1[i-1], seq2[j-1]):
align1.append(seq1[i-1])
align2.append(seq2[j-1])
i -= 1
j -= 1
elif score_current == score_left + gap_penalty:
align1.append(seq1[i-1])
align2.append('-')
i -= 1
elif score_current == score_up + gap_penalty:
align1.append('-')
align2.append(seq2[j-1])
j -= 1
# Finish tracing up to the top left cell
while i > 0:
align1.append(seq1[i-1])
align2.append('-')
i -= 1
while j > 0:
align1.append('-')
align2.append(seq2[j-1])
j -= 1
return finalize(align1, align2)
def water(seq1, seq2):
m, n = len(seq1), len(seq2) # length of two sequences
# Generate DP table and traceback path pointer matrix
score = zeros((m+1, n+1)) # the DP table
pointer = zeros((m+1, n+1)) # to store the traceback path
max_score = 0 # initial maximum score in DP table
# Calculate DP table and mark pointers
for i in range(1, m + 1):
for j in range(1, n + 1):
score_diagonal = score[i-1][j-1] + match_score(seq1[i-1], seq2[j-1])
score_up = score[i][j-1] + gap_penalty
score_left = score[i-1][j] + gap_penalty
score[i][j] = max(0,score_left, score_up, score_diagonal)
if score[i][j] == 0:
pointer[i][j] = 0 # 0 means end of the path
if score[i][j] == score_left:
pointer[i][j] = 1 # 1 means trace up
if score[i][j] == score_up:
pointer[i][j] = 2 # 2 means trace left
if score[i][j] == score_diagonal:
pointer[i][j] = 3 # 3 means trace diagonal
if score[i][j] >= max_score:
max_i = i
max_j = j
max_score = score[i][j];
align1, align2 = [], [] # initial sequences
i,j = max_i,max_j # indices of path starting point
#traceback, follow pointers
while pointer[i][j] != 0:
if pointer[i][j] == 3:
align1.append(seq1[i-1])
align2.append(seq2[j-1])
i -= 1
j -= 1
elif pointer[i][j] == 2:
align1.append('-')
align2.append(seq2[j-1])
j -= 1
elif pointer[i][j] == 1:
align1.append(seq1[i-1])
align2.append('-')
i -= 1
return finalize(align1, align2)
if __name__ == "__main__":
fruits1 = ["orange","pear", "apple","pear","orange"]
fruits2 = ["pear","apple"]
print(needle(fruits1, fruits2))
print(water(fruits1, fruits2))
view raw alignment.py hosted with ❤ by GitHub

Usage:

import alignment
fruits1 = ["orange", "pear", "apple", "pear", "orange"]
fruits2 = ["pear", "apple"]
aln = alignment.needle(fruits1, fruits2)
identity = aln[0]
score = aln[1]
print(identity, score)
print(aln[2])
print(aln[3])

Output:

(0.4, 25)
['-', 'pear', 'apple', '-', '-']
['orange', 'pear', 'apple', 'pear', 'orange']
ADD COMMENT
0
Entering edit mode

Thank you for the code..

ADD REPLY
1
Entering edit mode
7.8 years ago
Markus ▴ 320

You can do this with Biopython's pairwise2 (again). There was a bug in pairwise2 in Biopython releases 1.68/1.69 which prevented the proper handling of lists as input. If you run the same code in Biopython 1.70 you get the following (and expected) result:

['orange', 'pear', 'apple', 'pear', 'orange']
 ||
['-', 'pear', 'apple', '-', '-']
  Score=4
ADD COMMENT
0
Entering edit mode

Thanks for posting this info. Good to know that.

ADD REPLY
0
Entering edit mode
5.3 years ago
yannis1962 • 0

Comparing two lists of words didn't worked for me either, so what I did was to convert words into Chinese characters (there are more than 20,000 of them in Unicode), aligning the sequences as character strings, and then back to Latin-alphabet words again. Works like a charm:

from bio import pairwise2
from bio.pairwise2 import format_alignment

LISTA=["alors","en","fait","depuis","novembre","2017","du","coup",",","j'","ai","fait",",","plusieurs","ce","que","j'","appelle","des","crises",",","en","fait","c'","est",",","pendant",",","une","semaine",",","entre","une","semaine","et","10","jours",",","je","me","sens","un","peu","comme",",","déconnectée","de","la","réalité","un","peu",",","j'","arrive","pas","à",",","j'","arrivais","plus","à","faire","la","différence","entre",",","entre","si","j'","étais","dans","un","ou","si","j'","étais","dans","la","réalité","."]
LISTB=["alors","en","fait","depuis","euh","novembre","deux","mille","dix-sept","du","coup","j'","ai","fait","euh","hum","hum","plusieurs","euh","enfin","ce","que","j'","appelle","des","crises","en","fait","c'","est","euh","pendant","euh","une","semaine","entre","une","semaine","et","dix","jours","euh","je","me","sens","un","peu","comme","euh","déconnectée","de","la","réalité","un","peu","j'","arrive","pas","à","j'","arrivais","plus","à","faire","la","différence","entre","entre","si","j'","étais","dans","un","rêve","ou","si","j'","étais","dans","la","réalité"]

charcode=ord(u"一")-1
LATtoHAN={}
HANtoLAT={}
LISTA_=[]
LISTB_=[]
for x in LISTA:
    if x in LATtoHAN.keys():
        LISTA_.append(LATtoHAN[x])
    else:
        charcode+=1
        LATtoHAN[x]=chr(charcode)
        HANtoLAT[chr(charcode)]=x
        LISTA_.append(LATtoHAN[x])
for x in LISTB:
    if x in LATtoHAN.keys():
        LISTB_.append(LATtoHAN[x])
    else:
        charcode+=1
        LATtoHAN[x]=chr(charcode)
        HANtoLAT[chr(charcode)]=x
        LISTB_.append(LATtoHAN[x])

LISTA__="".join(LISTA_)
LISTB__="".join(LISTB_)

alignments=pairwise2.align.globalxx(LISTA__,LISTB__)

RESA=[]
RESB=[]
for w in alignments[0][0]:
    if (w == "-"):
        RESA.append("-")
    else:
        RESA.append(HANtoLAT[w])
for w in alignments[0][1]:
    if (w == "-"):
        RESB.append("-")
    else:
        RESB.append(HANtoLAT[w])

print(RESA,RESB)
ADD COMMENT

Login before adding your answer.

Traffic: 1399 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6