Saturday, April 4, 2015

by23. Needleman–Wunsch algorithm in Python

The Needleman–Wunsch algorithm is used for global alignment of two sequences. The best global alignment can be scored, and we may find the number of identities..


In the pt dictionary, the three scores for match, mismatch and gap are given.


The first two steps in the algorithm are initialization and fill, which will create the score matrix.


The last column on the last row shows the final score, and we start traceback to first column in first row, and in so doing find the best scoring alignment.

# bpy23.py
from __future__ import print_function, division
import numpy as np

pt ={'match': 1, 'mismatch': -1, 'gap': -1}

def mch(alpha, beta):
    if alpha == beta:
        return pt['match']
    elif alpha == '-' or beta == '-':
        return pt['gap']
    else:
        return pt['mismatch']

def needle(s1, s2):
    m, n = len(s1), len(s2)
    score = np.zeros((m+1, n+1))
    
    #Initialization
    for i in range(m+1):
        score[i][0] = pt['gap'] * i
    for j in range(n+1):
        score[0][j] = pt['gap'] * j
    
    #Fill
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            diag = score[i-1][j-1] + mch(s1[i-1], s2[j-1])
            delete = score[i-1][j] + pt['gap']
            insert = score[i][j-1] + pt['gap']
            score[i][j] = max(diag, delete, insert)

    print('score matrix = \n%s\n' % score)
    align1, align2 = '', ''
    i,j = m,n
    
    #Traceback
    while i > 0 and j > 0:
        score_current = score[i][j]
        score_diag = score[i-1][j-1]
        score_left = score[i][j-1]
        score_up = score[i-1][j]
        
        print('score_current: ',score_current)
        print('score_diag: ',score_diag)
        print('score_left: ',score_left)
        print('score_up: ',score_up)

        if score_current == score_diag + mch(s1[i-1], s2[j-1]):
            print('diag')
            a1,a2 = s1[i-1],s2[j-1]
            i,j = i-1,j-1
        elif score_current == score_up + pt['gap']:
            print('up')
            a1,a2 = s1[i-1],'-'
            i -= 1
        elif score_current == score_left + pt['gap']:
            print('left')
            a1,a2 = '-',s2[j-1]
            j -= 1
        print('%s ---> a1 = %s\t a2 = %s\n' % ('Add',a1,a2))
        align1 += a1
        align2 += a2
            

    while i > 0:
        a1,a2 = s1[i-1],'-'
        print('%s ---> a1 = %s\t a2 = %s\n' % ('Add',a1,a2))
        align1 += a1
        align2 += a2
        i -= 1
        
    while j > 0:
        a1,a2 = '-',s2[j-1]
        print('%s --> a1 = %s\t a2 = %s\n' % ('Add',a1,a2))
        align1 += a1
        align2 += a2
        j -= 1
    
    align1 = align1[::-1]
    align2 = align2[::-1]
    seqN = len(align1)
    sym = ''
    seq_score = 0
    ident = 0
    for i in range(seqN):
        a1 = align1[i]
        a2 = align2[i]
        if a1 == a2:
            sym += a1
            ident += 1
            seq_score += mch(a1, a2)
    
        else: 
            seq_score += mch(a1, a2)
            sym += ' '
        
    ident = ident/seqN * 100
    
    print('Identity = %2.1f percent' % ident)
    print('Score = %d\n'% seq_score)
    print(align1)
    print(sym)
    print(align2)

if __name__ == '__main__':
    needle("TCGA","ACCGT")

#score matrix = 
#[[ 0. -1. -2. -3. -4. -5.]
# [-1. -1. -2. -3. -4. -3.]
# [-2. -2.  0. -1. -2. -3.]
# [-3. -3. -1. -1.  0. -1.]
# [-4. -2. -2. -2. -1. -1.]]
#
#score_current:  -1.0
#score_diag:  0.0
#score_left:  -1.0
#score_up:  -1.0
#diag
#Add ---> a1 = A  a2 = T
#
#score_current:  0.0
#score_diag:  -1.0
#score_left:  -1.0
#score_up:  -2.0
#diag
#Add ---> a1 = G  a2 = G
#
#score_current:  -1.0
#score_diag:  -2.0
#score_left:  0.0
#score_up:  -3.0
#diag
#Add ---> a1 = C  a2 = C
#
#score_current:  -2.0
#score_diag:  -1.0
#score_left:  -1.0
#score_up:  -2.0
#diag
#Add ---> a1 = T  a2 = C
#
#Add --> a1 = -   a2 = A
#
#Identity = 40.0 percent
#Score = -1
#
#-TCGA
#  CG 
#ACCGT

No comments:

Post a Comment