R::Script to find hidden path in HMM, viterbi algorithm

From Biocourse

Jump to: navigation, search

#################################################
# Viterbi
#
# Description : find hidden path, CpG island example
#  (0:non-CpG island, 1: CpG island)
#
# author : arhan@ucla.edu
#################################################

#emission probability of a dna from non-CpG state
e0 <- function(dna) {
 value = 0.25;
 value
}

#emission probability of a dna from CpG state
e1 <- function(dna) {
 value = -1
 if(dna == "A") value = 0.1;
 if(dna == "C") value = 0.4;
 if(dna == "G") value = 0.4;
 if(dna == "T") value = 0.1;
        value
}

#transition probability
a <- function(previous, current){
 value = -1
 if(previous == 0 && current == 0) value = 0.99;
 if(previous == 0 && current == 1) value = 0.01;
 if(previous == 1 && current == 0) value = 0.02;
 if(previous == 1 && current == 1) value = 0.98;
 value
}

maxPath <- function(x_0, x_1){
        maxpath = ""
 if(x_0 >  x_1) maxpath = "0"
 if(x_0 <= x_1) maxpath = "1"
        maxpath
}

getElement <- function(data, index){
 substr(data, index, index)
}

isCG <- function(x,y){
 value = 0
 if(x == "C" && y == "G") value = 1  
        value
}

# open an output file connection
#datafile = file("c:/Dataset1.txt", "r") 
#dataline = readLines(datafile)
#inputSequence = dataline[2];
#No_dna = 1000;

#input sequence
inputSequence = "CGCGCGCGTTTTTTAAAAAATATATATACCGGCGCGCGCG";
No_dna = 40;

aB0 = 0.5;
aB1 = 0.5;
a0E = 1;
a1E = 1;

#initialization
V0 = array();
V1 = array();

V0[1] = log(aB0) + log(e0(getElement(inputSequence ,1)));
V1[1] = log(aB1) + log(e1(getElement(inputSequence ,1)));

#recursive
for(i in 2:No_dna){
 V0[i] = max(V0[i-1]+log(a(0,0)), V1[i-1]+log(a(1,0)))+log(e0(getElement(inputSequence,i)));
 V1[i] = max(V0[i-1]+log(a(0,1)), V1[i-1]+log(a(1,1)))+log(e1(getElement(inputSequence,i)));
}

#finish when index
VE_3 = max(V0[No_dna]+log(a0E), V1[No_dna]+log(a1E))

#traceback
path = array();
path[No_dna] = maxPath(V0[No_dna], V1[No_dna]);

for(i in (No_dna-1):2){
 if(path[i+1]==0){ path[i] = maxPath(V0[i-1]+log(a(0,0)), V1[i-1]+log(a(1,0)));}
 if(path[i+1]==1){path[i] = maxPath(V0[i-1]+log(a(0,1)), V1[i-1]+log(a(1,1)));}
}

#CpG dinucletide
answer = array();
for(t in 1:No_dna-1){
 answer[t] = isCG(getElement(inputSequence,t), getElement(inputSequence,t+1));
}

#plot hidden path
plot(path, type = "l")

#plot CG dinucleotide
plot(answer, type = "l")