Chapter 4 Alignment Algorithms

4.1 Longest Common Subsequence

backtrack_lcs <- function(b,x,m,n){
  if (m==0 | n==0) return(NULL)
  if (b[m+1,n+1] == '\\'){
    return(c(x[m],backtrack_lcs(b,x,m-1,n-1)))
  }else if(b[m+1,n+1] == '|'){
    backtrack_lcs(b,x,m-1,n)
  }else{
    backtrack_lcs(b,x,m,n-1)
  }
}

library(compiler)
find_lcs <- cmpfun(function(x,y){
  
  options(expressions=10000)
  
  x <- unlist(strsplit(x,''))
  y <- unlist(strsplit(y,''))
  
  m <- length(x)
  n <- length(y)
  
  backtrack_key <- c('|','--','\\')
  s <- matrix(0,length(x)+1,length(y)+1,dimnames=list(c('',x),c('',y)))
  b <- matrix('',length(x)+1,length(y)+1,dimnames=list(c('',x),c('',y)))
  
  for (i in seq(2,m+1)){
    for (j in seq(2,n+1)){
      
      s_up <- s[i-1,j]
      s_left <- s[i,j-1]
      s_diag <- s[i-1,j-1] + 1
      
      if (x[i-1]==y[j-1]) scores <- c(s_up,s_left,s_diag) else scores <- c(s_up,s_left)
      
      backtrack_update <- which.max(scores)
      score_update <- max(scores)
      
      s[i,j] <- score_update
      b[i,j] <- backtrack_key[backtrack_update]
      
    }
  }
  
  lcs <- backtrack_lcs(b,x,m,n)
  
  return(list(lcs=paste0(rev(lcs),collapse=''),
              length=s[length(s)],
              score=s,
              backtrack=b))
  
})

4.2 Global Alignment (R)

backtrack_global <- function(x,y,b,score_matrix,penalty){
  
  m <- length(x)
  n <- length(y)
  
  score <- 0
  align_x <- NULL
  align_y <- NULL
  
  while (m > 0 | n > 0){
    if (b[m+1,n+1] == '|'){
      align_x <- c(align_x,x[m])
      align_y <- c(align_y,'-')
      score <- score + penalty
      m <- m-1
    }else if(b[m+1,n+1] == '--'){
      align_x <- c(align_x,'-')
      align_y <- c(align_y,y[n])
      score <- score + penalty
      n <- n-1
    }else{
      align_x <- c(align_x,x[m]) 
      align_y <- c(align_y,y[n])
      score <- score + score_matrix[x[m],y[n]]
      n <- n-1
      m <- m-1
    }
  }
  
  alignment <- c(paste0(rev(align_x),collapse=''),paste0(rev(align_y),collapse=''))
  
  return(list(score=score,alignment=alignment))
  
}

library(compiler)
global_alignment <- cmpfun(function(x,y,score_matrix,penalty){
  
  x <- unlist(strsplit(x,''))
  y <- unlist(strsplit(y,''))
  
  m <- length(x)
  n <- length(y)
  
  backtrack_key <- c('|','--','\\')
  
  s <- matrix(0,length(x)+1,length(y)+1,dimnames=list(c('',x),c('',y)))
  s[1,] <- cumsum(c(0,rep(penalty,ncol(s)-1)))
  s[,1] <- cumsum(c(0,rep(penalty,nrow(s)-1)))
  
  b <- matrix('',length(x)+1,length(y)+1,dimnames=list(c('',x),c('',y)))
  b[1,] <- '--'
  b[,1] <- '|'
  b[1,1] <- '\\'
  
  for (i in seq(2,m+1)){
    for (j in seq(2,n+1)){
      
      s_up <- s[i-1,j] + penalty
      s_left <- s[i,j-1] + penalty
      s_diag <- s[i-1,j-1] + score_matrix[x[i-1],y[j-1]]
      
      scores <- c(s_up,s_left,s_diag) 
      
      backtrack_update <- which.max(scores)
      score_matrix_update <- max(scores)
      
      s[i,j] <- score_matrix_update
      b[i,j] <- backtrack_key[backtrack_update]
      
    }
  }
  
  return(backtrack_global(x,y,b,score_matrix,penalty))
  
})
blosum62 <- as.matrix(read.table('https://gist.githubusercontent.com/sw1/8870a124624f31585d5c15be72fcfc21/raw/34c4a34ce49d58b595c3fb2dc77b89a5b6a9b0af/blosum62.dat'))
x <- 'ISTHISALL'
y <- 'ALIGNED'
penalty <- -5
global_alignment(x,y,blosum62,-5)
## $score
## [1] -15
## 
## $alignment
## [1] "ISTHISALL" "-AL-IGNED"

4.3 Global Alignment (Python)

import urllib.request

def scoring_matrix(filename):
    scoring = urllib.request.urlopen(filename).readlines()
    scoring = [i.decode("utf-8").strip('\n')  for i in scoring[1:]]
    keys = [i[0] for i in scoring]
    scoring = [i.split()[1:] for i in scoring]
    scoring_dict = {}
    for ii,i in enumerate(keys):
        scoring_dict[i] = {}
        for ji,j in enumerate(keys):
            scoring_dict[i][j] = int(scoring[ii][ji])
    return scoring_dict
    
def global_alignment(v,w,penalty,matrix_file):
    score_matrix = scoring_matrix(matrix_file)
    s = [[0]*(len(w)+1) for i in range(len(v)+1)]
    s[0] = list(range(0,penalty*len(s[0]),penalty))
    for i in range(1,len(s)):
        s[i][0] = penalty + s[i-1][0]
    path = [['||'] + ['--']*(len(w)) for i in range(len(v)+1)]
    path[0][0] = '\\'
    for i in range(1,len(v)+1):
        for j in range(1,len(w)+1):
            score = score_matrix[v[i-1]][w[j-1]]
            s[i][j] = max(s[i-1][j-1] + score, s[i][j-1] + penalty,s[i-1][j] + penalty)
            if s[i][j] == s[i-1][j] + penalty:
                path[i][j] = '||'
            if s[i][j] == s[i][j-1] + penalty:
                path[i][j] = "--"
            if s[i][j] == s[i-1][j-1] + score:
                path[i][j] = "\\"
    score = 0
    align1 = ''
    align2 = ''
    while i >= 1 or j >= 1:
        if path[i][j] == "||":
            align1 += v[i-1]
            align2 += '-'
            score += penalty
            i -= 1
        elif path[i][j] == "--":
            align1 += '-'
            align2 += w[j-1]
            score += penalty
            j -= 1
        else:
            align1 += v[i-1]
            align2 += w[j-1]
            score += score_matrix[w[j-1]][v[i-1]]
            i -= 1
            j -= 1
    align1 = align1[::-1]
    align2 = align2[::-1]
    print('\n'.join([str(score),align1,align2]))
    return [score, align1,align2]
    
blosum62 = 'https://gist.githubusercontent.com/sw1/8870a124624f31585d5c15be72fcfc21/raw/34c4a34ce49d58b595c3fb2dc77b89a5b6a9b0af/blosum62.dat'
x = 'ISTHISALL'
y = 'ALIGNED'
penalty = -5
global_alignment(x,y,penalty,blosum62)