library(HMM)
library(Biostrings)
library(tidyverse)
library(magrittr)

Question 1 [10 points]
Using the S. cerevisiae (Baker’s yeast) data that we imported into R in Lectures 11 and 12, show R code of how you would estimate the frequency of A, C, G, T nucleotides in coding regions only. Use only chromosome 1.

( scerevisiae <- readRDS("../data/Scerevisiae_vers_1.0.rds") ) 
## An object of class "Genome"
## <S4 Type Object>
## attr(,"organism_name")
## [1] "Saccharomyces cerevisiae"
## attr(,"chromosomes")
## DNAStringSet object of length 17:
##        width seq                                            names               
##  [1]  230218 CCACACCACACCCACACACCCA...TGTGGGTGTGGTGTGTGTGGG 1
##  [2]  813184 AAATAGCCCTCATGTACGTCTC...GGGTGTGGTGTGTGGGTGTGT 2
##  [3]  316620 CCCACACACCACACCCACACCA...GTGGTGGGTGTGGTGTGTGTG 3
##  [4] 1531933 ACACCACACCCACACCACACCC...AAGGTAGTAAGTAGCTTTTGG 4
##  [5]  576874 CGTCTCCTCCAAGCCCTGTTGT...TTCATTTTCATTTTTTTTTTT 5
##  ...     ... ...
## [13]  924431 CCACACACACACCACACCCACA...GGGTGTGGTGTGTGTGTGGGG 13
## [14]  784333 CCGGCTTTCTGACCGAAATTAA...GTGTGTGGGTGTGGTGTGGGT 14
## [15] 1091291 ACACCACACCCACACCACACCC...GAGTGTGTGGGTGTGGTGTGT 15
## [16]  948066 AAATAGCCCTCATGTACGTCTC...TTTTTTAATTTCGGTCAGAAA 16
## [17]   85779 TTCATAATTAATTTTTTATATA...AATTATAATATAATATCCATA m
## attr(,"annotations")
## # A tibble: 23,058 x 9
##    seqid source type       start    end score strand phase attributes           
##    <fct> <fct>  <fct>      <int>  <int> <dbl> <fct>  <fct> <chr>                
##  1 chrI  SGD    chromosome     1 230218    NA <NA>   <NA>  ID=chrI;dbxref=NCBI:…
##  2 chrI  SGD    telomere       1    801    NA -      <NA>  ID=TEL01L;Name=TEL01…
##  3 chrI  SGD    X_element    337    801    NA -      <NA>  ID=TEL01L_X_element;…
##  4 chrI  SGD    X_element…    63    336    NA -      <NA>  ID=TEL01L_X_element_…
##  5 chrI  SGD    telomeric…     1     62    NA -      <NA>  ID=TEL01L_telomeric_…
##  6 chrI  SGD    gene         335    649    NA +      <NA>  ID=YAL069W;Name=YAL0…
##  7 chrI  SGD    CDS          335    649    NA +      0     Parent=YAL069W_mRNA;…
##  8 chrI  SGD    mRNA         335    649    NA +      <NA>  ID=YAL069W_mRNA;Name…
##  9 chrI  SGD    gene         538    792    NA +      <NA>  ID=YAL068W-A;Name=YA…
## 10 chrI  SGD    CDS          538    792    NA +      0     Parent=YAL068W-A_mRN…
## # … with 23,048 more rows
## attr(,"chromosome_names")
## # A tibble: 17 x 2
##    reference      chromosome
##    <chr>          <fct>     
##  1 ref|NC_001133| 1         
##  2 ref|NC_001134| 2         
##  3 ref|NC_001135| 3         
##  4 ref|NC_001136| 4         
##  5 ref|NC_001137| 6         
##  6 ref|NC_001138| 7         
##  7 ref|NC_001139| 8         
##  8 ref|NC_001140| 9         
##  9 ref|NC_001141| 5         
## 10 ref|NC_001142| 10        
## 11 ref|NC_001143| 11        
## 12 ref|NC_001144| 12        
## 13 ref|NC_001145| 13        
## 14 ref|NC_001146| 14        
## 15 ref|NC_001147| 15        
## 16 ref|NC_001148| 16        
## 17 ref|NC_001224| m         
## attr(,"GO")
## # A tibble: 42,885 x 7
##    X1    X2    X3        X4    X5                              X6       X7      
##    <chr> <chr> <chr>     <chr> <chr>                           <chr>    <chr>   
##  1 HRA1  HRA1  S0001193… C     cellular_component              GO:0005… ncRNA_g…
##  2 HRA1  HRA1  S0001193… F     molecular_function              GO:0003… ncRNA_g…
##  3 HRA1  HRA1  S0001193… P     rRNA processing                 GO:0006… ncRNA_g…
##  4 HRA1  HRA1  S0001193… P     ribosomal small subunit biogen… GO:0042… ncRNA_g…
##  5 ICR1  ICR1  S0001326… C     nucleus                         GO:0005… ncRNA_g…
##  6 ICR1  ICR1  S0001326… F     molecular_function              GO:0003… ncRNA_g…
##  7 ICR1  ICR1  S0001326… P     transcription from RNA polymer… GO:0006… ncRNA_g…
##  8 LSR1  LSR1  S0000064… C     nucleus                         GO:0005… snRNA_g…
##  9 LSR1  LSR1  S0000064… F     RNA binding                     GO:0003… snRNA_g…
## 10 LSR1  LSR1  S0000064… P     RNA splicing                    GO:0008… snRNA_g…
## # … with 42,875 more rows

First thing we need to do is to get the data into a manageable format.

get_chromosome <- function(genome_obj, chrom) {
  
  # Searches through Genome object attribute "chromosomes" and returns indexed DNAStringSet
  # Can take numeric or character type
  
  return(genome_obj@chromosomes[chrom])
}


get_annot <- function(genome_obj, annot) {
  
  # Searches through Genome object attribute "annotations" and returns filtered 
  # annotation tbl with selected annotation. Annotations are converted to roman
  # numerals before searching. If annot is not a numeric, returns Chrmt by default
  
  if (is.numeric(annot)) {
    annot <- paste0("chr", as.roman(annot))
    return(genome_obj@annotations[genome_obj@annotations["seqid"] == annot, ])
  } else {
    # I know, I know... HARDCODING?!
    return(genome_obj@annotations[genome_obj@annotations["seqid"] == "chrmt", ])
  }
}


choose_chrom <- function(genome_obj, chrom) {
  
  # Parses Genome object for selected chromosome sequence and attributes 
  # and returns it as a list. 
  
  return(list("chromosomes" = get_chromosome(genome_obj, chrom),
              "annotations" = get_annot(genome_obj, chrom)))
}
( chrom.1 <- choose_chrom(scerevisiae, 1) )
## Warning: The `i` argument of ``[`()` can't be a matrix as of tibble 3.0.0.
## Convert to a vector.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## $chromosomes
## DNAStringSet object of length 1:
##      width seq                                              names               
## [1] 230218 CCACACCACACCCACACACCCAC...GTGTGGGTGTGGTGTGTGTGGG 1
## 
## $annotations
## # A tibble: 424 x 9
##    seqid source type       start    end score strand phase attributes           
##    <fct> <fct>  <fct>      <int>  <int> <dbl> <fct>  <fct> <chr>                
##  1 chrI  SGD    chromosome     1 230218    NA <NA>   <NA>  ID=chrI;dbxref=NCBI:…
##  2 chrI  SGD    telomere       1    801    NA -      <NA>  ID=TEL01L;Name=TEL01…
##  3 chrI  SGD    X_element    337    801    NA -      <NA>  ID=TEL01L_X_element;…
##  4 chrI  SGD    X_element…    63    336    NA -      <NA>  ID=TEL01L_X_element_…
##  5 chrI  SGD    telomeric…     1     62    NA -      <NA>  ID=TEL01L_telomeric_…
##  6 chrI  SGD    gene         335    649    NA +      <NA>  ID=YAL069W;Name=YAL0…
##  7 chrI  SGD    CDS          335    649    NA +      0     Parent=YAL069W_mRNA;…
##  8 chrI  SGD    mRNA         335    649    NA +      <NA>  ID=YAL069W_mRNA;Name…
##  9 chrI  SGD    gene         538    792    NA +      <NA>  ID=YAL068W-A;Name=YA…
## 10 chrI  SGD    CDS          538    792    NA +      0     Parent=YAL068W-A_mRN…
## # … with 414 more rows

Great, now its organized. Next step would be to actually answer the question, which is to estimate the frequency of A, C, G, T nucleotides in coding regions only. We know that in this data set, the coding regions have an annotation of type “CDS”, meaning that whatever isn’t in that range can be considered non-coding. In reference to the actual HMM and using correct terminology, I will be considering the states as coding/non-coding and emissions as the nucleotides themselves. Using the start and end coordinates, we can assign areas of the chromosomes by state.

To do this, I will create a vector that matches the length of the chromosome and keeps track of which state each nucleotide is in. Since we only have two states, I will represent the states as logicals, where F == non-coding and T == coding.

set_state_vector <- function(data_chrom) {
  
  # Takes list generated in the format from choose_chrom() and appends a state
  # vector to that list
  
  # Get width
  chrom_width <- data_chrom$chromosomes@ranges@width
  
  # Append to list 
  chrom_width <- c(data_chrom, states = list(logical(chrom_width)))
}


modify_state <- function(data_chrom) {
  
  # Function iterates through rows in annotation and changes F -> T in the states vector
  # based on the start and end values of type CDS
  
  start <- data_chrom$annotations$start
  end <- data_chrom$annotations$end
  
  for (i in 1:nrow(data_chrom$annotations)) {
    data_chrom$states[start[i]:end[i]] <- TRUE
  }
  
  return(data_chrom)
}
# Isolate CDS
chrom.1$annotations <- chrom.1$annotations %>% 
  filter(type == "CDS")

chrom.1 <- chrom.1 %>%
  # Initialize state vector
  set_state_vector %>% 
  # Set states based on annotation labels
  modify_state

I think we’re set to actually tackle the problem. The generic formula for calculating frequency is the number of occurrences divided by the total possible number of occurrences

Let’s wrap this to handle our data

get_nucleotide_freq_both_states <- function(data) {
  
  # Takes our modified 3 element list and calculates frequencies for each emission (nucleotides)
  # in either state. Returns as a tbl
  
  # Get vector index in state
  state_index_t <- which(data$states == T)
  state_index_f <- which(data$states == F)
  
  # Subset nucleotides by state
  state_t <- data$chromosomes[[1]][state_index_t]
  state_f <- data$chromosomes[[1]][state_index_f]
  
  out <- tibble("Nucleotide" = c("A", "C", "G", "T"), 
                "Coding" = alphabetFrequency(state_t, baseOnly = T, as.prob = T)[1:4],
                "Non-Coding" = alphabetFrequency(state_f, baseOnly = T, as.prob = T)[1:4])
  return(out)

}
emission.frequencies.chrom1 <- get_nucleotide_freq_both_states(chrom.1)  
emission.frequencies.chrom1$Coding
##         A         C         G         T 
## 0.2919849 0.2051492 0.2096557 0.2932103

Question 2 [points 10]
Using the S. cerevisiae (Baker’s yeast) data that we imported into R in Lectures 11 and 12, show R code of how you would estimate the frequency of A, C, G, T nucleotides in non-coding regions only. Use only chromosome 1. Estimate the self-transition probabilities (coding to coding, non-coding to non-coding) to transition probabilities to and from coding and non-coding.

Due to some FANTASTIC design, we already have the answer to the first part of this question

emission.frequencies.chrom1$`Non-Coding`
##         A         C         G         T 
## 0.3233797 0.1740879 0.1796447 0.3228877

As for the transition probabilities, we can calculate them using our state vector

get_transition_frequency_matrix <- function(data) {
  
  states <- data$states
  n_transitions <- length(states) - 1
  
  # Create tally matrix
  tally <- as.data.frame(matrix(0, ncol = 2, nrow = 2))
  rownames(tally) <- c("T", "F")
  names(tally) <- c("T", "F")
  
  for ( i in 1: n_transitions ) {
    
    # Get position 1 & 2
    pos.one.and.two <- states[i:(i+1)]
    
    # Add 1 to tally matrix based on transition
    if (pos.one.and.two[1] == T & pos.one.and.two[2] == T) tally["T", "T"] <- tally["T","T"] + 1
    if (pos.one.and.two[1] == T & pos.one.and.two[2] == F) tally["T", "F"] <- tally["T","F"] + 1
    if (pos.one.and.two[1] == F & pos.one.and.two[2] == T) tally["F", "T"] <- tally["F","T"] + 1
    if (pos.one.and.two[1] == F & pos.one.and.two[2] == F) tally["F", "F"] <- tally["F","F"] + 1
  }
  
  # Count per variable
  n <- tally %>%
    sapply(sum) %>%
    unname()
  
  # Keep tally and add probability matrix
  tally <- list(frequencies = tally,
                probabilities = tally/n)
  
  return(tally)
}

transitions.chrom.1 <- get_transition_frequency_matrix(chrom.1)
transitions.chrom.1$probabilities
##             T            F
## T 0.999292026 0.0007079742
## F 0.001248215 0.9987517853

Question 3 [points 20]
Using the \({\tt HMM}\) package in R (available in \({\tt A04-Assignment/src/A04-Assginment.Rmd}\)), implement your model. The documentation for this package is here. Note that you might want to look at the \({\tt dishonestCasino()}\) function as an example.
Perhaps follow the \({\tt viterbi}\) function and the example there. Show your code. Apply it back to chromosome 1. Apply it chromosome 2 too.

Questiom 4 [points 10]
Compute the specificity, sensitivity and accuracy on both chromosomes individually. Comment on your findings.

These questions lead into each other, so I will be tackling them together

First thing we do is initialize our HMM using the initHMM() function. We can get the necessary information from Q1 & Q2 and reformat it for the function

# Required as a matrix
transition.prob.chrom1 <- as.matrix(transitions.chrom.1$probabilities)
emission.prob.chrom1 <- t(data.matrix(emission.frequencies.chrom1))[-1,]
colnames(emission.prob.chrom1) <- c("A", "C", "G", "T")

# Initialize our HMM model
( chrom1.hmm <- initHMM(States = c("Coding", "Non-coding"), 
               Symbols = c("A", "C", "G", "T"),
               transProbs = transition.prob.chrom1,
               emissionProbs = emission.prob.chrom1) )
## $States
## [1] "Coding"     "Non-coding"
## 
## $Symbols
## [1] "A" "C" "G" "T"
## 
## $startProbs
##     Coding Non-coding 
##        0.5        0.5 
## 
## $transProbs
##             to
## from              Coding   Non-coding
##   Coding     0.999292026 0.0007079742
##   Non-coding 0.001248215 0.9987517853
## 
## $emissionProbs
##             symbols
## states               A         C         G         T
##   Coding     0.2919849 0.2051492 0.2096557 0.2932103
##   Non-coding 0.3233797 0.1740879 0.1796447 0.3228877

Looking at the documentation, we can use the viterbi() function, which applies the Viterbi algorithm to the chromosome emission vector and attempts to compute the most probable path of states. We can then compare the Viterbi output to the true states and calculate the sensitivity, specificity, and accuracy of the algorithm.

# DNAStringSet to character vector
vectorized.chrom1.seq <- unlist(str_extract_all(chrom.1$chromosomes, boundary("character")))

# Use our HMM to predict state of emissions in chromosome 1 using Viterbi algorithm
vit <-  viterbi(hmm = chrom1.hmm, vectorized.chrom1.seq)
measure_performance <- function(chromosome, vit) {
  
  # Function takes in chromosome and vit, changes the state from logical to either
  # "Coding" or "Non-coding", combines the two into a tibble, evaluates if it is 
  # a TP, FP, TN, FN, and returns measures of performance as a named vector.
  
  # Rename state vector
  state <- ifelse(chromosome$states, "Coding", "Non-coding")
  
  # Create comparison tibble
  performance <- tibble(true.states = state, predicted.states = vit)

  # Add new column evaluating matches 
  performance <- performance %>% 
    rowwise() %>%
    mutate(classifier.status = case_when(true.states == "Coding" & predicted.states == "Coding" ~ "TP",
                                         true.states == "Coding" & predicted.states == "Non-coding" ~ "FN",
                                         true.states == "Non-coding" & predicted.states == "Coding" ~ "FP",
                                         true.states == "Non-coding" & predicted.states == "Non-coding" ~ "TN")) %>% 
  # Tidying into counts
    group_by(classifier.status) %>%
    summarise(n = n(), .groups = 'drop')
  
  # Add missing matches set to count = 0
  for (i in c("TP", "FN", "FP", "TN")) {
    if (!(i %in% performance$classifier.status)) {
      performance <- performance %>%
        add_row(classifier.status = i, n = 0)
    }
  }
  
  # Turn to named vector
  performance <- deframe(performance)

  out <- c("Sensitivity" = (performance[["TP"]] / (performance[["TP"]] + performance[["FN"]])),
           "Specificity" = (performance[["TN"]] / (performance[["TN"]] + performance[["FP"]])), 
           "Accuracy" = (performance[["TP"]] + performance[["TN"]]) / 
             (performance[["TP"]] + performance[["FP"]] + performance[["TN"]] + performance[["FN"]])
           )
  return(list(performance.counts = performance,
              performance.measures = out))
}

Let’s see how we did

(chrom1.performance <- measure_performance(chrom.1, vit))
## $performance.counts
##    FN    FP    TN    TP 
## 60935 27428 55892 85963 
## 
## $performance.measures
## Sensitivity Specificity    Accuracy 
##   0.5851884   0.6708113   0.6161768

Because we spent so much effort on data managing and writing functions, we can get chromosome 2 set up in a few lines

chrom.2 <- choose_chrom(scerevisiae, 2)

# Isolate CDS
chrom.2$annotations <- chrom.2$annotations %>% 
  filter(type == "CDS")

chrom.2 <- chrom.2 %>%
  # Initialize state vector
  set_state_vector %>% 
  # Set states based on annotation labels
  modify_state

Evaluate performance of our HMM using chromosome 2

# DNAStringSet to character vector
vectorized.chrom2.seq <- unlist(str_extract_all(chrom.2$chromosomes, boundary("character")))

# Use our HMM to predict state of emissions in chromosome 2 using Viterbi algorithm
vit <-  viterbi(hmm = chrom1.hmm, vectorized.chrom2.seq)

(chrom2.performance <- measure_performance(chrom.2, vit))
## $performance.counts
##     FN     FP     TN     TP 
## 318488  82639 121058 290999 
## 
## $performance.measures
## Sensitivity Specificity    Accuracy 
##   0.4774491   0.5943043   0.5067205

Comparing the two, our model performs better across the board when predicting states in chromosome 1 compared to chromosome 2. This makes sense because the prediction model was trained on chromosome 1, which has different transition and emission probabilities than chromosome 2. To help increase these scores, we could feed in sequences from multiple chromosomes to help the model generalize the difference between coding and non-coding states.