Python script for sequence alignment not running
1
0
Entering edit mode
4.6 years ago
def read_Fasta(filename):
    from re import sub, search
    res = []
    sequence = None
    info = None
    fh = open(filename)
    for line in fh:
        if search(">.*", line):
            if sequence is not None and info is not None and sequence != "":
                res.append(sequence)
            info = line
            sequence = ""
        else:
            if sequence is None:
                return None
            else:
                sequence += sub("\s", "", line)
    if sequence is not None and info is not None and sequence != "":
        res.append(sequence)
    fh.close()
    return res

def read_submat_file(filename):
    sm = {}
    f = open(filename, "r")
    line = f.readline()
    tokens = line.split("\t")
    ns = len(tokens)
    alphabet = []
    for i in range(0, ns):
        alphabet.append(tokens[i][0])
    for i in range(0, ns):
        line = f.readline()
        tokens = line.split("\t")
        for j in range(0, len(tokens)):
            k = alphabet[i] + alphabet[j]
            sm[k] = int(tokens[j])
    return sm

def needleman_wunsch (seq1, seq2, sm, g):
    """Global Alignment"""
    sm = read_submat_file(f)
    seq1 = read_Fasta("A.fasta")
    seq2 = read_Fasta("B.fasta")
    S = [[0]]
    T = [[0]]
    # initialize gaps in rows
    for j in range(1, len(seq2)+1):
        S[0].append(g * j)
        T[0].append(3)
    # initialize gaps in cols
    for i in range(1, len(seq1)+1):
        S.append([g * i])
        T.append([2])
    # apply the recurrence to fill the matrices
    for i in range(0, len(seq1)):
        for j in range(len(seq2)):
            s1 = S[i][j] + score_pos(seq1[i], seq2[j], sm, g)
            s2 = S[i][j+1] + g
            s3 = S[i+1][j] + g
            S[i+1].append(max(s1, s2, s3))
            T[i+1].append(max3t(s1, s2, s3))
    return (S, T)

def test_DNA_GlobalAlign():
    sm = read_submat_file("blosum62.mat")
    seq1 = read_Fasta("A.fasta")
    seq2 = read_Fasta("B.fasta")
    p = -3
    res = needleman_wunsch(seq1, seq2, sm, p)
    S = res[0]
    T = res[1]
    print("Score of optimal alignment:", S[len(seq1)][len(seq2)])
    print_mat(S)
    print_mat(T)
    alig = recover_align(T, seq1, seq2)
    print(alig[0])
    print(alig[1])

test_DNA_GlobalAlign()

I have been trying to run the python script to align 2 fasta files, it reads the files but I can't seem to use the previous functions on the subsequent functions. I get these errors.

>>> test_DNA_GlobalAlign()
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "<stdin>", line 6, in test_DNA_GlobalAlign
  File "<stdin>", line 20, in needleman_wunsch
  File "<stdin>", line 6, in score_pos
python alignment • 1.0k views
ADD COMMENT
1
Entering edit mode
4.6 years ago
Mensur Dlakic ★ 28k

You probably need to delete these lines from needleman_wunsch:

sm = read_submat_file(f)
seq1 = read_Fasta("A.fasta")
seq2 = read_Fasta("B.fasta")

By the way, you can achieve this task with a dozen or so lines of code in Biopython.

ADD COMMENT

Login before adding your answer.

Traffic: 2494 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6