Here we resuse the code from previous chapters. We have hidden this code to make you focus on the salient parts of the reasoning.
Source
import numpy as np
# Print 2 sequences on top of each other
def print_alignment(seqA,seqB):
print(seqA)
print(seqB)
# Print a dynamic programming score matrix
# together with its sequences
def print_dynamic(seqA,seqB,dpm):
seqA,seqB = "-" + seqA, "-" + seqB
m,n = len(seqA),len(seqB)
print('{:^5}'.format(" "), end=""),
for j in range(n):
print('{:^5}'.format(seqB[j]), end="")
print()
for i in range(m):
print ('{:^5}'.format(seqA[i]), end="")
for j in range(n):
print ('{:5.1f}'.format(dpm[i,j]), end="")
print()
print()
# Format an alignment by inserting gaps in sequences
def format_alignment(seqA,seqB,S,trace):
print("Best score: " + str(S[-1,-1]))
outA,outB = "",""
i,j = len(seqA),len(seqB)
while i>0 or j>0:
di,dj = trace[i,j]
i += int(di)
j += int(dj)
if di == 0:
outA = "-" + outA
else:
outA = seqA[i] + outA
if dj == 0:
outB = "-" + outB
else:
outB = seqB[j] + outB
return outA,outB
# Initiating dynamic programming matrices, S and trace
def initiate_global_dp(m,n):
S = np.zeros((m,n)) # An m*n matrix, initiated with 0's
trace = np.zeros((m,n,2)) # An m*n matrix, initiated with (0,0)'s
S[0,0] = 0.
trace[0,0,:] = (0.,0.)
for i in range(1,m):
S[i,0] = i * gap_penalty()
trace[i,0,:] =(-1,0)
for j in range(1,n):
S[0,j] = j * gap_penalty()
trace[0,j,:] =(0,-1)
return S,trace
def global_align(seqA,seqB,print_dynamic_matrix = False):
# Initiating variables
m, n = len(seqA)+1, len(seqB)+1
S,trace = initiate_global_dp(m,n)
# Fill in the rest of the dynamic programming matrix
for i in range(1,m):
for j in range(1,n):
# Note the subtraction of 1 for the sequence position
# In python sequences are indexed from 0 to len-1
match = S[i-1,j-1] + match_score(seqA[i-1],seqB[j-1])
delete = S[i-1,j] + match_score(seqA[i-1],'-')
insert = S[i,j-1] + match_score('-',seqB[j-1])
S[i,j] = max(match,delete,insert)
if match >= max(insert,delete):
trace[i,j,:] = (-1,-1)
elif delete >= insert:
trace[i,j,:] = (-1,0)
else:
trace[i,j,:] = (0,-1)
if print_dynamic_matrix:
print_dynamic(seqA,seqB,S)
return format_alignment(seqA,seqB,S,trace)
def format_local_alignment(seqA,seqB,S,trace):
outA,outB = "",""
i,j = np.unravel_index(S.argmax(),S.shape)
print("Best score: " + str(S[i,j]))
while min(trace[i,j])<0:
di,dj = trace[i,j]
i += int(di)
j += int(dj)
if di == 0:
outA = "-" + outA
else:
outA = seqA[i] + outA
if dj == 0:
outB = "-" + outB
else:
outB = seqB[j] + outB
return outA,outB
# Initiating dynamic programming matrices, S and trace
def initiate_local_dp(m,n):
S = np.zeros((m,n))
trace = np.zeros((m,n,2))
S[0,0] = 0.
trace[0,0,:] = (0.,0.)
for i in range(1,m):
S[i,0] = 0
trace[i,0,:] =(0,0)
for j in range(1,n):
S[0,j] = 0
trace[0,j,:] =(0,0)
return S,trace
def local_align(seqA,seqB,print_dynamic_matrix = False):
# Initiating variables
m, n = len(seqA)+1, len(seqB)+1
S,trace = initiate_local_dp(m,n)
# Fill in the rest of the dynamic programming matrix
for i in range(1,m):
for j in range(1,n):
match = S[i-1][j-1] + match_score(seqA[i-1],seqB[j-1])
delete = S[i-1,j] + match_score(seqA[i-1],'-')
insert = S[i,j-1] + match_score('-',seqB[j-1])
S[i,j] = max(match, delete, insert, 0.)
if match >= max(delete,insert,0.):
trace[i,j,:] = (-1,-1.)
elif delete >= max(insert,0.):
trace[i,j,:] = (-1.,0.)
elif insert >= 0.:
trace[i,j,:] = (0.,-1.)
else:
trace[i,j,:] = (0.,0.)
if print_dynamic_matrix:
print_dynamic(seqA,seqB,S)
return format_local_alignment(seqA,seqB,S,trace)
def format_semiglobal_alignment(seqA,seqB,S,trace):
outA,outB = "",""
m,n = S.shape[0]-1, S.shape[1]-1
i,j = S[:,n].argmax(), S[m,:].argmax()
if S[i,n] > S[m,j]:
print("Best score: " + str(S[i,n]))
j = n
# point the alignmnént from (m,n) to here
for ix in range(i+1,m+1):
print(ix,n)
trace[ix,n,:] = (-1,0)
else:
print("Best score: " + str(S[m,j]))
i = m
# point the alignmnént from (m,n) to here
for ix in range(j+1,n+1):
print(m,ix)
trace[m,ix,:] = (0,-1)
i,j = m,n
while min(trace[i,j])<0:
di,dj = trace[i,j]
i += int(di)
j += int(dj)
if di == 0:
outA = "-" + outA
else:
outA = seqA[i] + outA
if dj == 0:
outB = "-" + outB
else:
outB = seqB[j] + outB
return outA,outB
# Initiating dynamic programming matrices, S and trace
def initiate_semiglobal_dp(m,n):
S = np.zeros((m,n))
trace = np.zeros((m,n,2))
S[0,0] = 0.
trace[0,0,:] = (0.,0.)
for i in range(1,m):
S[i,0] = 0
trace[i,0,:] =(-1,0)
for j in range(1,n):
S[0,j] = 0
trace[0,j,:] =(0,-1)
return S,trace
def semiglobal_align(seqA,seqB,print_dynamic_matrix = False):
# Initiating variables
m, n = len(seqA)+1, len(seqB)+1
S,trace = initiate_semiglobal_dp(m,n)
# Fill in the rest of the dynamic programming matrix
for i in range(1,m):
for j in range(1,n):
match = S[i-1][j-1] + match_score(seqA[i-1],seqB[j-1])
delete = S[i-1,j] + match_score(seqA[i-1],'-')
insert = S[i,j-1] + match_score('-',seqB[j-1])
S[i,j] = max(match, delete, insert)
if match >= max(delete,insert):
trace[i,j,:] = (-1,-1.)
elif delete >= insert:
trace[i,j,:] = (-1.,0.)
else:
trace[i,j,:] = (0.,-1.)
if print_dynamic_matrix:
print_dynamic(seqA,seqB,S)
return format_semiglobal_alignment(seqA,seqB,S,trace)Protein sequences¶
We can use the exact same alignment methods for protein sequences, as long as we define an appropriate scoring function. Here we will use a PAM250 matrix for the matches, and a gap penalty of 10.
PAM250 = {
'A': {'A': 2, 'C': -2, 'D': 0, 'E': 0, 'F': -3, 'G': 1, 'H': -1, 'I': -1, 'K': -1, 'L': -2, 'M': -1, 'N': 0, 'P': 1, 'Q': 0, 'R': -2, 'S': 1, 'T': 1, 'V': 0, 'W': -6, 'Y': -3},
'C': {'A': -2, 'C': 12, 'D': -5, 'E':-5, 'F': -4, 'G': -3, 'H': -3, 'I': -2, 'K': -5, 'L': -6, 'M': -5, 'N': -4, 'P': -3, 'Q': -5, 'R': -4, 'S': 0, 'T': -2, 'V': -2, 'W': -8, 'Y': 0},
'D': {'A': 0, 'C': -5, 'D': 4, 'E': 3, 'F': -6, 'G': 1, 'H': 1, 'I': -2, 'K': 0, 'L': -4, 'M': -3, 'N': 2, 'P': -1, 'Q': 2, 'R': -1, 'S': 0, 'T': 0, 'V': -2, 'W': -7, 'Y': -4},
'E': {'A': 0, 'C': -5, 'D': 3, 'E': 4, 'F': -5, 'G': 0, 'H': 1, 'I': -2, 'K': 0, 'L': -3, 'M': -2, 'N': 1, 'P': -1, 'Q': 2, 'R': -1, 'S': 0, 'T': 0, 'V': -2, 'W': -7, 'Y': -4},
'F': {'A': -3, 'C': -4, 'D': -6, 'E':-5, 'F': 9, 'G': -5, 'H': -2, 'I': 1, 'K': -5, 'L': 2, 'M': 0, 'N': -3, 'P': -5, 'Q': -5, 'R': -4, 'S': -3, 'T': -3, 'V': -1, 'W': 0, 'Y': 7},
'G': {'A': 1, 'C': -3, 'D': 1, 'E': 0, 'F': -5, 'G': 5, 'H': -2, 'I': -3, 'K': -2, 'L': -4, 'M': -3, 'N': 0, 'P': 0, 'Q': -1, 'R': -3, 'S': 1, 'T': 0, 'V': -1, 'W': -7, 'Y': -5},
'H': {'A': -1, 'C': -3, 'D': 1, 'E': 1, 'F': -2, 'G': -2, 'H': 6, 'I': -2, 'K': 0, 'L': -2, 'M': -2, 'N': 2, 'P': 0, 'Q': 3, 'R': 2, 'S': -1, 'T': -1, 'V': -2, 'W': -3, 'Y': 0},
'I': {'A': -1, 'C': -2, 'D': -2, 'E':-2, 'F': 1, 'G': -3, 'H': -2, 'I': 5, 'K': -2, 'L': 2, 'M': 2, 'N': -2, 'P': -2, 'Q': -2, 'R': -2, 'S': -1, 'T': 0, 'V': 4, 'W': -5, 'Y': -1},
'K': {'A': -1, 'C': -5, 'D': 0, 'E': 0, 'F': -5, 'G': -2, 'H': 0, 'I': -2, 'K': 5, 'L': -3, 'M': 0, 'N': 1, 'P': -1, 'Q': 1, 'R': 3, 'S': 0, 'T': 0, 'V': -2, 'W': -3, 'Y': -4},
'L': {'A': -2, 'C': -6, 'D': -4, 'E':-3, 'F': 2, 'G': -4, 'H': -2, 'I': 2, 'K': -3, 'L': 6, 'M': 4, 'N': -3, 'P': -3, 'Q': -2, 'R': -3, 'S': -3, 'T': -2, 'V': 2, 'W': -2, 'Y': -1},
'M': {'A': -1, 'C': -5, 'D': -3, 'E':-2, 'F': 0, 'G': -3, 'H': -2, 'I': 2, 'K': 0, 'L': 4, 'M': 6, 'N': -2, 'P': -2, 'Q': -1, 'R': 0, 'S': -2, 'T': -1, 'V': 2, 'W': -4, 'Y': -2},
'N': {'A': 0, 'C': -4, 'D': 2, 'E': 1, 'F': -3, 'G': 0, 'H': 2, 'I': -2, 'K': 1, 'L': -3, 'M': -2, 'N': 2, 'P': 0, 'Q': 1, 'R': 0, 'S': 1, 'T': 0, 'V': -2, 'W': -4, 'Y': -2},
'P': {'A': 1, 'C': -3, 'D': -1, 'E':-1, 'F': -5, 'G': 0, 'H': 0, 'I': -2, 'K': -1, 'L': -3, 'M': -2, 'N': 0, 'P': 6, 'Q': 0, 'R': 0, 'S': 1, 'T': 0, 'V': -1, 'W': -6, 'Y': -5},
'Q': {'A': 0, 'C': -5, 'D': 2, 'E': 2, 'F': -5, 'G': -1, 'H': 3, 'I': -2, 'K': 1, 'L': -2, 'M': -1, 'N': 1, 'P': 0, 'Q': 4, 'R': 1, 'S': -1, 'T': -1, 'V': -2, 'W': -5, 'Y': -4},
'R': {'A': -2, 'C': -4, 'D': -1, 'E':-1, 'F': -4, 'G': -3, 'H': 2, 'I': -2, 'K': 3, 'L': -3, 'M': 0, 'N': 0, 'P': 0, 'Q': 1, 'R': 6, 'S': 0, 'T': -1, 'V': -2, 'W': 2, 'Y': -4},
'S': {'A': 1, 'C': 0, 'D': 0, 'E': 0, 'F': -3, 'G': 1, 'H': -1, 'I': -1, 'K': 0, 'L': -3, 'M': -2, 'N': 1, 'P': 1, 'Q': -1, 'R': 0, 'S': 2, 'T': 1, 'V': -1, 'W': -2, 'Y': -3},
'T': {'A': 1, 'C': -2, 'D': 0, 'E': 0, 'F': -3, 'G': 0, 'H': -1, 'I': 0, 'K': 0, 'L': -2, 'M': -1, 'N': 0, 'P': 0, 'Q': -1, 'R': -1, 'S': 1, 'T': 3, 'V': 0, 'W': -5, 'Y': -3},
'V': {'A': 0, 'C': -2, 'D': -2, 'E':-2, 'F': -1, 'G': -1, 'H': -2, 'I': 4, 'K': -2, 'L': 2, 'M': 2, 'N': -2, 'P': -1, 'Q': -2, 'R': -2, 'S': -1, 'T': 0, 'V': 4, 'W': -6, 'Y': -2},
'W': {'A': -6, 'C': -8, 'D': -7, 'E':-7, 'F': 0, 'G': -7, 'H': -3, 'I': -5, 'K': -3, 'L': -2, 'M': -4, 'N': -4, 'P': -6, 'Q': -5, 'R': 2, 'S': -2, 'T': -5, 'V': -6, 'W': 17, 'Y': 0},
'Y': {'A': -3, 'C': 0, 'D': -4, 'E':-4, 'F': 7, 'G': -5, 'H': 0, 'I': -1, 'K': -4, 'L': -1, 'M': -2, 'N': -2, 'P': -5, 'Q': -4, 'R': -4, 'S': -3, 'T': -3, 'V': -2, 'W': 0, 'Y': 10}}
def gap_penalty():
return -10.0
def match_score(letterA,letterB):
if letterA == '-' or letterB == '-':
return gap_penalty()
else:
return PAM250[letterA][letterB]
seqA,seqB = global_align("IAMAPEPTIDE","IAMPEPPED",True)
print_alignment(seqA,seqB) - I A M P E P P E D
- 0.0-10.0-20.0-30.0-40.0-50.0-60.0-70.0-80.0-90.0
I -10.0 5.0 -5.0-15.0-25.0-35.0-45.0-55.0-65.0-75.0
A -20.0 -5.0 7.0 -3.0-13.0-23.0-33.0-43.0-53.0-63.0
M -30.0-15.0 -3.0 13.0 3.0 -7.0-17.0-27.0-37.0-47.0
A -40.0-25.0-13.0 3.0 14.0 4.0 -6.0-16.0-26.0-36.0
P -50.0-35.0-23.0 -7.0 9.0 13.0 10.0 0.0-10.0-20.0
E -60.0-45.0-33.0-17.0 -1.0 13.0 12.0 9.0 4.0 -6.0
P -70.0-55.0-43.0-27.0-11.0 3.0 19.0 18.0 8.0 3.0
T -80.0-65.0-53.0-37.0-21.0 -7.0 9.0 19.0 18.0 8.0
I -90.0-75.0-63.0-47.0-31.0-17.0 -1.0 9.0 17.0 16.0
D -100.0-85.0-73.0-57.0-41.0-27.0-11.0 -1.0 12.0 21.0
E -110.0-95.0-83.0-67.0-51.0-37.0-21.0-11.0 3.0 15.0
Best score: 15.0
IAMAPEPTIDE
IAM-PEPP-ED
seqA,seqB = global_align("MYPERFECTDAY","THERESAMAY",False)
print_alignment(seqA,seqB)Best score: 3.0
MYPERFECTDAY
T-HER-ESAMAY
seqA,seqB = local_align("MYPERFECTDAY","THERESAMAY",False)
print_alignment(seqA,seqB)Best score: 14.0
PERFECTDAY
HER-ESAMAY