Sunday, April 5, 2015

bpy24. The Smith-Waterman algorithm in Python

The Smith–Waterman algorithm is used for local alignment of two sequences. This algorithm is described at Wikipedia. We use the two sequences from that link.


The last example, bpy23.py, is modified. There is no initialization as the fill for first row and first column is zero, and np.zeros is used to construct the matrices. For the first step, we find the matrices H and T. In the Wikipedia page, you can see the arrows, for the pointer matrix T, which makes the traceback path clear. In the program the various directions are labelled as integers, such as 3 for diagonal.


Here the high score, 12, happens to be the last column of last row. In general, this will not be true and the highest scoring entry could be located anywhere in the matrix. From the highest entry, we proceed to 0, using the pointer information in T. At each step to 0, we record the letter of sequence 1 and sequence2. There might be cases it is null, where a gap is recorded for the particular entry. This is shown in the printouts, where sometimes the string '-' is added.

# bpy24.py

from __future__ import division, print_function
import numpy as np

pt ={'match': 2, 'mismatch': -1, 'gap': -1}

def mch(alpha, beta):
    if alpha == beta:
        return pt['match']
    elif alpha == '-' or beta == '-':
        return pt['gap']
    else:
        return pt['mismatch']

def water(s1, s2):
    m, n = len(s1), len(s2)
    H = np.zeros((m+1, n+1))    
    T = np.zeros((m+1, n+1))
    max_score = 0
    # Score, Pointer Matrix
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            sc_diag = H[i-1][j-1] + mch(s1[i-1], s2[j-1])
            sc_up = H[i][j-1] + pt['gap']
            sc_left = H[i-1][j] + pt['gap']
            H[i][j] = max(0,sc_left, sc_up, sc_diag)
            if H[i][j] == 0: T[i][j] = 0
            if H[i][j] == sc_left: T[i][j] = 1
            if H[i][j] == sc_up: T[i][j] = 2
            if H[i][j] == sc_diag: T[i][j] = 3
            if H[i][j] >= max_score:
                max_i = i
                max_j = j
                max_score = H[i][j];
    
    print('H=\n',H,'\n')
    print('T=\n',T,'\n')
    align1, align2 = '', ''
    i,j = max_i,max_j
    
    #Traceback
    while T[i][j] != 0:
        if T[i][j] == 3:
            a1 = s1[i-1]
            a2 = s2[j-1]
            i -= 1
            j -= 1
        elif T[i][j] == 2:
            a1 = '-'
            a2 = s2[j-1]
            j -= 1
        elif T[i][j] == 1:
            a1 = s1[i-1]
            a2 = '-'
            i -= 1
        print('%s ---> a1 = %s\t a2 = %s\n' % ('Add',a1,a2))
        align1 += a1
        align2 += a2

    align1 = align1[::-1]
    align2 = align2[::-1]
    sym = ''
    iden = 0
    for i in range(len(align1)):
        a1 = align1[i]
        a2 = align2[i]
        if a1 == a2:                
            sym += a1
            iden += 1
        elif a1 != a2 and a1 != '-' and a2 != '-': 
            sym += ' '
        elif a1 == '-' or a2 == '-':          
            sym += ' '
    
    identity = iden / len(align1) * 100
    print('Identity = %f percent' % identity)
    print('Score =', max_score)
    print(align1)
    print(sym)
    print(align2)

if __name__ == '__main__':
    water('AGCACACA','ACACACTA')

#H=
# [[  0.   0.   0.   0.   0.   0.   0.   0.   0.]
# [  0.   2.   1.   2.   1.   2.   1.   0.   2.]
# [  0.   1.   1.   1.   1.   1.   1.   0.   1.]
# [  0.   0.   3.   2.   3.   2.   3.   2.   1.]
# [  0.   2.   2.   5.   4.   5.   4.   3.   4.]
# [  0.   1.   4.   4.   7.   6.   7.   6.   5.]
# [  0.   2.   3.   6.   6.   9.   8.   7.   8.]
# [  0.   1.   4.   5.   8.   8.  11.  10.   9.]
# [  0.   2.   3.   6.   7.  10.  10.  10.  12.]] 
#
#T=
# [[ 0.  0.  0.  0.  0.  0.  0.  0.  0.]
# [ 0.  3.  2.  3.  2.  3.  2.  2.  3.]
# [ 0.  1.  3.  1.  3.  1.  3.  3.  1.]
# [ 0.  1.  3.  2.  3.  2.  3.  2.  2.]
# [ 0.  3.  1.  3.  2.  3.  2.  2.  3.]
# [ 0.  1.  3.  1.  3.  2.  3.  2.  2.]
# [ 0.  3.  1.  3.  1.  3.  2.  2.  3.]
# [ 0.  1.  3.  1.  3.  1.  3.  2.  2.]
# [ 0.  3.  1.  3.  1.  3.  1.  3.  3.]] 
#
#Add ---> a1 = A  a2 = A
#
#Add ---> a1 = -  a2 = T
#
#Add ---> a1 = C  a2 = C
#
#Add ---> a1 = A  a2 = A
#
#Add ---> a1 = C  a2 = C
#
#Add ---> a1 = A  a2 = A
#
#Add ---> a1 = C  a2 = C
#
#Add ---> a1 = G  a2 = -
#
#Add ---> a1 = A  a2 = A
#
#Identity = 77.777778 percent
#Score = 12.0
#AGCACAC-A
#A CACAC A
#A-CACACTA

No comments:

Post a Comment