how to align lists of words using biopython pairwise2

Entering edit mode

8.2 years ago

euacar ▴ 10

When I run the script below, output is getting split into single chars. Any idea why? It looks like the second argument gets split into single chars. I am trying to align the word sequences. I will have many words hence cannot map them to letters only.

Thx Ercan

from Bio.Seq import Seq

from Bio.pairwise2 import format_alignment

fruits = ["orange","pear", "apple","pear","orange"]

fruits1 = ["pear","apple"]


from Bio import pairwise2

alignments = pairwise2.align.localms(fruits,fruits1,2,-1,-0.5,-0.1, gap_char=["-"])

for a in alignments: 

    print(format_alignment(*a))

Output:

 ['orange', 'r', 'a', 'e', 'p', 'e', 'l', 'p', 'p', 'a', 'pear', 'orange']
 |||||||||
['-', 'r', 'a', 'e', 'p', 'e', 'l', 'p', 'p', 'a', '-', '-']
  Score=4

alignment sequence • 8.5k views

ADD COMMENT • link updated 5.3 years ago by yannis1962 • 0 • written 8.2 years ago by euacar ▴ 10

Entering edit mode

Please reformat your code using the 101010 button or by putting 4 spaces before each line of code. This is particularly important for python as you'll need to correctly indent the code so that we can be sure it's written correctly.

ADD REPLY • link 8.2 years ago by Joe 22k

Entering edit mode

Thx For the feedback

ADD REPLY • link 8.2 years ago by euacar ▴ 10

Entering edit mode

As said by jrj.healey, code formatting is very important. I now added code markup to your post for increased readability. You can do this by selecting the text and clicking the 101010 button. When you compose or edit a post that button is in your toolbar, see image below:

101010 Button

ADD REPLY • link 8.2 years ago by WouterDeCoster 48k

Entering edit mode

Thx For the feedback

ADD REPLY • link 8.2 years ago by euacar ▴ 10

Entering edit mode

8.2 years ago

Andrzej Zielezinski 11k

BioPython allows for performing an alignment on sequences' symbols (not words). I can't think of any program that would do the words' alignment. I suggest you search the Web for Python modules/functions that perform string character alignments and then modify the existing code to handle words.

For example, I tweaked some code from GitHub to do local (water) and global (needle) alignments of words.

Filename: alignment.py:

	#This software is a free software. Thus, it is licensed under GNU General Public License.
	#Python implementation to Smith-Waterman Algorithm for Homework 1 of Bioinformatics class.
	#Forrest Bao, Sept. 26 <http://fsbao.net> <forrest.bao aT gmail.com>

	# zeros() was origianlly from NumPy.
	# This version is implemented by alevchuk 2011-04-10
	def zeros(shape):
	retval = []
	for x in range(shape[0]):
	retval.append([])
	for y in range(shape[1]):
	retval[-1].append(0)
	return retval

	match_award = 20
	mismatch_penalty = -5
	gap_penalty = -5 # both for opening and extanding

	def match_score(alpha, beta):
	if alpha == beta:
	return match_award
	elif alpha == '-' or beta == '-':
	return gap_penalty
	else:
	return mismatch_penalty

	def finalize(align1, align2):
	align1.reverse() #reverse sequence 1
	align2.reverse() #reverse sequence 2

	i,j = 0,0

	#calcuate identity, score and aligned sequeces

	found = 0
	score = 0
	identity = 0
	for i in range(0,len(align1)):
	# if two AAs are the same, then output the letter
	if align1[i] == align2[i]:

	identity = identity + 1
	score += match_score(align1[i], align2[i])

	# if they are not identical and none of them is gap
	elif align1[i] != align2[i] and align1[i] != '-' and align2[i] != '-':
	score += match_score(align1[i], align2[i])
	found = 0

	#if one of them is a gap, output a space
	elif align1[i] == '-' or align2[i] == '-':
	score += gap_penalty

	identity = float(identity) / len(align1)

	return identity, score, align1, align2


	def needle(seq1, seq2):
	m, n = len(seq1), len(seq2) # length of two sequences

	# Generate DP table and traceback path pointer matrix
	score = zeros((m+1, n+1)) # the DP table

	# Calculate DP table
	for i in range(0, m + 1):
	score[i][0] = gap_penalty * i
	for j in range(0, n + 1):
	score[0][j] = gap_penalty * j
	for i in range(1, m + 1):
	for j in range(1, n + 1):
	match = score[i - 1][j - 1] + match_score(seq1[i-1], seq2[j-1])
	delete = score[i - 1][j] + gap_penalty
	insert = score[i][j - 1] + gap_penalty
	score[i][j] = max(match, delete, insert)

	# Traceback and compute the alignment
	align1, align2 = [], []
	i,j = m,n # start from the bottom right cell
	while i > 0 and j > 0: # end toching the top or the left edge
	score_current = score[i][j]
	score_diagonal = score[i-1][j-1]
	score_up = score[i][j-1]
	score_left = score[i-1][j]

	if score_current == score_diagonal + match_score(seq1[i-1], seq2[j-1]):
	align1.append(seq1[i-1])
	align2.append(seq2[j-1])
	i -= 1
	j -= 1
	elif score_current == score_left + gap_penalty:
	align1.append(seq1[i-1])
	align2.append('-')
	i -= 1
	elif score_current == score_up + gap_penalty:
	align1.append('-')
	align2.append(seq2[j-1])
	j -= 1

	# Finish tracing up to the top left cell
	while i > 0:
	align1.append(seq1[i-1])
	align2.append('-')
	i -= 1
	while j > 0:
	align1.append('-')
	align2.append(seq2[j-1])
	j -= 1

	return finalize(align1, align2)


	def water(seq1, seq2):
	m, n = len(seq1), len(seq2) # length of two sequences

	# Generate DP table and traceback path pointer matrix
	score = zeros((m+1, n+1)) # the DP table
	pointer = zeros((m+1, n+1)) # to store the traceback path

	max_score = 0 # initial maximum score in DP table
	# Calculate DP table and mark pointers
	for i in range(1, m + 1):
	for j in range(1, n + 1):
	score_diagonal = score[i-1][j-1] + match_score(seq1[i-1], seq2[j-1])
	score_up = score[i][j-1] + gap_penalty
	score_left = score[i-1][j] + gap_penalty
	score[i][j] = max(0,score_left, score_up, score_diagonal)
	if score[i][j] == 0:
	pointer[i][j] = 0 # 0 means end of the path
	if score[i][j] == score_left:
	pointer[i][j] = 1 # 1 means trace up
	if score[i][j] == score_up:
	pointer[i][j] = 2 # 2 means trace left
	if score[i][j] == score_diagonal:
	pointer[i][j] = 3 # 3 means trace diagonal
	if score[i][j] >= max_score:
	max_i = i
	max_j = j
	max_score = score[i][j];

	align1, align2 = [], [] # initial sequences

	i,j = max_i,max_j # indices of path starting point

	#traceback, follow pointers
	while pointer[i][j] != 0:
	if pointer[i][j] == 3:
	align1.append(seq1[i-1])
	align2.append(seq2[j-1])
	i -= 1
	j -= 1
	elif pointer[i][j] == 2:
	align1.append('-')
	align2.append(seq2[j-1])
	j -= 1
	elif pointer[i][j] == 1:
	align1.append(seq1[i-1])
	align2.append('-')
	i -= 1

	return finalize(align1, align2)


	if __name__ == "__main__":
	fruits1 = ["orange","pear", "apple","pear","orange"]
	fruits2 = ["pear","apple"]
	print(needle(fruits1, fruits2))
	print(water(fruits1, fruits2))

view raw alignment.py hosted with ❤ by GitHub

Usage:

import alignment
fruits1 = ["orange", "pear", "apple", "pear", "orange"]
fruits2 = ["pear", "apple"]
aln = alignment.needle(fruits1, fruits2)
identity = aln[0]
score = aln[1]
print(identity, score)
print(aln[2])
print(aln[3])

Output:

(0.4, 25)
['-', 'pear', 'apple', '-', '-']
['orange', 'pear', 'apple', 'pear', 'orange']

ADD COMMENT • link 8.2 years ago by Andrzej Zielezinski 11k

Entering edit mode

Thank you for the code..

ADD REPLY • link 8.2 years ago by euacar ▴ 10

Entering edit mode

7.8 years ago

Markus ▴ 320

You can do this with Biopython's pairwise2 (again). There was a bug in pairwise2 in Biopython releases 1.68/1.69 which prevented the proper handling of lists as input. If you run the same code in Biopython 1.70 you get the following (and expected) result:

['orange', 'pear', 'apple', 'pear', 'orange']
 ||
['-', 'pear', 'apple', '-', '-']
  Score=4

ADD COMMENT • link 7.8 years ago by Markus ▴ 320

Entering edit mode

Thanks for posting this info. Good to know that.

ADD REPLY • link 7.8 years ago by Andrzej Zielezinski 11k

Entering edit mode

5.3 years ago

yannis1962 • 0

Comparing two lists of words didn't worked for me either, so what I did was to convert words into Chinese characters (there are more than 20,000 of them in Unicode), aligning the sequences as character strings, and then back to Latin-alphabet words again. Works like a charm:

from bio import pairwise2
from bio.pairwise2 import format_alignment

LISTA=["alors","en","fait","depuis","novembre","2017","du","coup",",","j'","ai","fait",",","plusieurs","ce","que","j'","appelle","des","crises",",","en","fait","c'","est",",","pendant",",","une","semaine",",","entre","une","semaine","et","10","jours",",","je","me","sens","un","peu","comme",",","déconnectée","de","la","réalité","un","peu",",","j'","arrive","pas","à",",","j'","arrivais","plus","à","faire","la","différence","entre",",","entre","si","j'","étais","dans","un","ou","si","j'","étais","dans","la","réalité","."]
LISTB=["alors","en","fait","depuis","euh","novembre","deux","mille","dix-sept","du","coup","j'","ai","fait","euh","hum","hum","plusieurs","euh","enfin","ce","que","j'","appelle","des","crises","en","fait","c'","est","euh","pendant","euh","une","semaine","entre","une","semaine","et","dix","jours","euh","je","me","sens","un","peu","comme","euh","déconnectée","de","la","réalité","un","peu","j'","arrive","pas","à","j'","arrivais","plus","à","faire","la","différence","entre","entre","si","j'","étais","dans","un","rêve","ou","si","j'","étais","dans","la","réalité"]

charcode=ord(u"一")-1
LATtoHAN={}
HANtoLAT={}
LISTA_=[]
LISTB_=[]
for x in LISTA:
    if x in LATtoHAN.keys():
        LISTA_.append(LATtoHAN[x])
    else:
        charcode+=1
        LATtoHAN[x]=chr(charcode)
        HANtoLAT[chr(charcode)]=x
        LISTA_.append(LATtoHAN[x])
for x in LISTB:
    if x in LATtoHAN.keys():
        LISTB_.append(LATtoHAN[x])
    else:
        charcode+=1
        LATtoHAN[x]=chr(charcode)
        HANtoLAT[chr(charcode)]=x
        LISTB_.append(LATtoHAN[x])

LISTA__="".join(LISTA_)
LISTB__="".join(LISTB_)

alignments=pairwise2.align.globalxx(LISTA__,LISTB__)

RESA=[]
RESB=[]
for w in alignments[0][0]:
    if (w == "-"):
        RESA.append("-")
    else:
        RESA.append(HANtoLAT[w])
for w in alignments[0][1]:
    if (w == "-"):
        RESB.append("-")
    else:
        RESB.append(HANtoLAT[w])

print(RESA,RESB)

ADD COMMENT • link 5.3 years ago by yannis1962 • 0