The Needleman–Wunsch algorithm is used for global alignment of two sequences. The best global alignment can be scored, and we may find the number of identities..
In the pt dictionary, the three scores for match, mismatch and gap are given.
The first two steps in the algorithm are initialization and fill, which will create the score matrix.
The last column on the last row shows the final score, and we start traceback to first column in first row, and in so doing find the best scoring alignment.
# bpy23.py
from __future__ import print_function, division
import numpy as np
pt ={'match': 1, 'mismatch': -1, 'gap': -1}
def mch(alpha, beta):
if alpha == beta:
return pt['match']
elif alpha == '-' or beta == '-':
return pt['gap']
else:
return pt['mismatch']
def needle(s1, s2):
m, n = len(s1), len(s2)
score = np.zeros((m+1, n+1))
#Initialization
for i in range(m+1):
score[i][0] = pt['gap'] * i
for j in range(n+1):
score[0][j] = pt['gap'] * j
#Fill
for i in range(1, m + 1):
for j in range(1, n + 1):
diag = score[i-1][j-1] + mch(s1[i-1], s2[j-1])
delete = score[i-1][j] + pt['gap']
insert = score[i][j-1] + pt['gap']
score[i][j] = max(diag, delete, insert)
print('score matrix = \n%s\n' % score)
align1, align2 = '', ''
i,j = m,n
#Traceback
while i > 0 and j > 0:
score_current = score[i][j]
score_diag = score[i-1][j-1]
score_left = score[i][j-1]
score_up = score[i-1][j]
print('score_current: ',score_current)
print('score_diag: ',score_diag)
print('score_left: ',score_left)
print('score_up: ',score_up)
if score_current == score_diag + mch(s1[i-1], s2[j-1]):
print('diag')
a1,a2 = s1[i-1],s2[j-1]
i,j = i-1,j-1
elif score_current == score_up + pt['gap']:
print('up')
a1,a2 = s1[i-1],'-'
i -= 1
elif score_current == score_left + pt['gap']:
print('left')
a1,a2 = '-',s2[j-1]
j -= 1
print('%s ---> a1 = %s\t a2 = %s\n' % ('Add',a1,a2))
align1 += a1
align2 += a2
while i > 0:
a1,a2 = s1[i-1],'-'
print('%s ---> a1 = %s\t a2 = %s\n' % ('Add',a1,a2))
align1 += a1
align2 += a2
i -= 1
while j > 0:
a1,a2 = '-',s2[j-1]
print('%s --> a1 = %s\t a2 = %s\n' % ('Add',a1,a2))
align1 += a1
align2 += a2
j -= 1
align1 = align1[::-1]
align2 = align2[::-1]
seqN = len(align1)
sym = ''
seq_score = 0
ident = 0
for i in range(seqN):
a1 = align1[i]
a2 = align2[i]
if a1 == a2:
sym += a1
ident += 1
seq_score += mch(a1, a2)
else:
seq_score += mch(a1, a2)
sym += ' '
ident = ident/seqN * 100
print('Identity = %2.1f percent' % ident)
print('Score = %d\n'% seq_score)
print(align1)
print(sym)
print(align2)
if __name__ == '__main__':
needle("TCGA","ACCGT")
#score matrix =
#[[ 0. -1. -2. -3. -4. -5.]
# [-1. -1. -2. -3. -4. -3.]
# [-2. -2. 0. -1. -2. -3.]
# [-3. -3. -1. -1. 0. -1.]
# [-4. -2. -2. -2. -1. -1.]]
#
#score_current: -1.0
#score_diag: 0.0
#score_left: -1.0
#score_up: -1.0
#diag
#Add ---> a1 = A a2 = T
#
#score_current: 0.0
#score_diag: -1.0
#score_left: -1.0
#score_up: -2.0
#diag
#Add ---> a1 = G a2 = G
#
#score_current: -1.0
#score_diag: -2.0
#score_left: 0.0
#score_up: -3.0
#diag
#Add ---> a1 = C a2 = C
#
#score_current: -2.0
#score_diag: -1.0
#score_left: -1.0
#score_up: -2.0
#diag
#Add ---> a1 = T a2 = C
#
#Add --> a1 = - a2 = A
#
#Identity = 40.0 percent
#Score = -1
#
#-TCGA
# CG
#ACCGT
No comments:
Post a Comment