Question

Sequence Pair Alignment in Python. Local and Global Sequence Alignment.

1

Entering edit mode

4.6 years ago

moreirasarasousa ▴ 10

Hi !

I'm trying to create a code, capable of sequencing 2 sequences, globaly and localy. This is what i have so far:

from Bio import SeqIO, Seq
from Bio.SeqIO import read
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna, generic_protein
from Bio.Align import substitution_matrices
# https://www.tutorialspoint.com/biopython/index.htm <- bons tutoriais!
# http://biopython.org/DIST/docs/api/Bio.SeqIO-module.html <- parse das sequencias

# http://biopython.org/DIST/docs/api/Bio.Seq.Seq-class.html
# http://biopython.org/DIST/docs/tutorial/Tutorial.html

class Alignment():
    def __init__(self, submat, gap=-1):
        self.sm = submat
        self.g = gap
        self.seq1 = None
        self.seq2 = None
        self.S = None
        self.T = None


    def score_pos (self, c1, c2):
        if c1 == "-" or c2=="-":
            return self.g
        else:
            return self.sm[c1,c2]


    def needleman_Wunsch(self, seq1, seq2):
        if (seq1.alphabet != seq2.alphabet): return None 
        # ^ "AttributeError: 'Seq' object has no attribute 'seq_type'
        # Solution: substituí seq_type por alphabet (atributo da classe Seq)
        self.S = [[0]]
        self.T = [[0]]
        self.seq1 = seq1
        self.seq2 = seq2
        for j in range(1, len(seq2) + 1):
            self.S[0].append(self.g * j)
            self.T[0].append(3)
        for i in range(1, len(seq1) + 1):
            self.S.append([self.g * i])
            self.T.append([2])
        for i in range(0, len(seq1)):
            for j in range(len(seq2)):
                s1 = self.S[i][j] + self.score_pos(seq1[i], seq2[j])
                s2 = self.S[i][j + 1] + self.g
                s3 = self.S[i + 1][j] + self.g
                self.S[i + 1].append(max(s1, s2, s3))
                self.T[i + 1].append(max3t(s1, s2, s3))
        return self.S[len(seq1)][len(seq2)]

class Protein(Alignment):
    def __init__(self, submat, gap):
        self.prot1 = None
        self.prot2 = None
        super().__init__(submat, gap) 


# metodos não pertencentes à classe       
def max3t (v1, v2, v3):
    if v1 > v2:
        if v1 > v3: return 1
        else: return 3
    else:
        if v2 > v3: return 2
        else: return 3

def read_submat_file(filename):
    sm = {}
    f = open(filename, "r")
    line = f.readline()
    tokens = line.split("\t")
    ns = len(tokens)
    alphabet = []
    for i in range(0, ns):
        alphabet.append(tokens[i][0])
    for i in range(0, ns):
        line = f.readline()
        tokens = line.split("\t")
        for j in range(0, len(tokens)):
            k = alphabet[i] + alphabet[j]
            sm[k] = int(tokens[j])
    return sm

def printMat (mat):
    for i in range(0, len(mat)):
        print(mat[i])
# 

# config data
subMat = substitution_matrices.load("BLOSUM62") # ler matriz de subs.
#subMat = read_submat_file("blosum62.mat")
gap = -3

# sequencias a alinhar
file1 = SeqIO.read("sequence1.fasta", "fasta")
file2 = SeqIO.read("sequence2.fasta", "fasta")
seq1 = file1.seq
seq2 = file2.seq

dna_obj = Alignment(subMat, gap)

res = dna_obj.needleman_Wunsch(seq1, seq2) # res dá um numero

# o resto dá erro:
S = res[0] # IndexError: invalid index to scalar variable.
T = res[1]
print("Score of optimal alignment:", S[len(seq1)][len(seq2)])
dna_obj.print_mat(S)
dna_obj.print_mat(T)

align_obj = dna_obj.recover_align(T, seq1, seq2)
print(align_obj[0])
print(align_obj[1])

Recently i understood that, i need to add this part : return self.S[len(seq1)][len(seq2)] -> ref self.S, self.T, in order to correct and error in the substitution_matrices.load function.

But i dont know what to do next. Can anybody help me please ?!?!

Python Alignments Local Global Fasta • 2.7k views

ADD COMMENT • link 4.6 years ago by moreirasarasousa ▴ 10

0

Entering edit mode

Is this for an assignment or something? I assume you know Biopython already has a very capable pairwise sequence alignment module (since you're already importing matrices)?

Any reason you're reinventing the wheel without saving yourself a dependency?

ADD REPLY • link 4.6 years ago by Joe 21k

0

Entering edit mode

Yes, it is for an assignment.

ADD REPLY • link 4.6 years ago by moreirasarasousa ▴ 10

0

Entering edit mode

All right you created the scoring matrix with your needleman_Wunsch function and you get the optimal score. But following lines are not needed res variable is not matrix just the score

# o resto dá erro:
S = res[0] # IndexError: invalid index to scalar variable.
T = res[1]

If you want to print the matrixes it should be something like following or printMat function needs be inside Alignment class

printMat(dna_obj.S)
printMat(dna_obj.T)

And in the following lines are for the aligned strings but recover_align function is missing in the Alignment class you need to write that function

align_obj = dna_obj.recover_align(T, seq1, seq2)
print(align_obj[0])
print(align_obj[1])

ADD REPLY • link 4.6 years ago by tamerg ▴ 100

0

Entering edit mode

Thank you, i will check it out.

ADD REPLY • link 4.6 years ago by moreirasarasousa ▴ 10

0

Entering edit mode

No problem. I updated my answer with some additional detail of your code.

ADD REPLY • link 4.6 years ago by tamerg ▴ 100