library(HMM)
library(Biostrings)
library(tidyverse)
library(magrittr)
Question 1 [10 points]
Using the S. cerevisiae (Baker’s yeast) data that we imported into R in Lectures 11 and 12, show R code of how you would estimate the frequency of A, C, G, T nucleotides in coding regions only. Use only chromosome 1.
( scerevisiae <- readRDS("../data/Scerevisiae_vers_1.0.rds") )
## An object of class "Genome"
## <S4 Type Object>
## attr(,"organism_name")
## [1] "Saccharomyces cerevisiae"
## attr(,"chromosomes")
## DNAStringSet object of length 17:
## width seq names
## [1] 230218 CCACACCACACCCACACACCCA...TGTGGGTGTGGTGTGTGTGGG 1
## [2] 813184 AAATAGCCCTCATGTACGTCTC...GGGTGTGGTGTGTGGGTGTGT 2
## [3] 316620 CCCACACACCACACCCACACCA...GTGGTGGGTGTGGTGTGTGTG 3
## [4] 1531933 ACACCACACCCACACCACACCC...AAGGTAGTAAGTAGCTTTTGG 4
## [5] 576874 CGTCTCCTCCAAGCCCTGTTGT...TTCATTTTCATTTTTTTTTTT 5
## ... ... ...
## [13] 924431 CCACACACACACCACACCCACA...GGGTGTGGTGTGTGTGTGGGG 13
## [14] 784333 CCGGCTTTCTGACCGAAATTAA...GTGTGTGGGTGTGGTGTGGGT 14
## [15] 1091291 ACACCACACCCACACCACACCC...GAGTGTGTGGGTGTGGTGTGT 15
## [16] 948066 AAATAGCCCTCATGTACGTCTC...TTTTTTAATTTCGGTCAGAAA 16
## [17] 85779 TTCATAATTAATTTTTTATATA...AATTATAATATAATATCCATA m
## attr(,"annotations")
## # A tibble: 23,058 x 9
## seqid source type start end score strand phase attributes
## <fct> <fct> <fct> <int> <int> <dbl> <fct> <fct> <chr>
## 1 chrI SGD chromosome 1 230218 NA <NA> <NA> ID=chrI;dbxref=NCBI:…
## 2 chrI SGD telomere 1 801 NA - <NA> ID=TEL01L;Name=TEL01…
## 3 chrI SGD X_element 337 801 NA - <NA> ID=TEL01L_X_element;…
## 4 chrI SGD X_element… 63 336 NA - <NA> ID=TEL01L_X_element_…
## 5 chrI SGD telomeric… 1 62 NA - <NA> ID=TEL01L_telomeric_…
## 6 chrI SGD gene 335 649 NA + <NA> ID=YAL069W;Name=YAL0…
## 7 chrI SGD CDS 335 649 NA + 0 Parent=YAL069W_mRNA;…
## 8 chrI SGD mRNA 335 649 NA + <NA> ID=YAL069W_mRNA;Name…
## 9 chrI SGD gene 538 792 NA + <NA> ID=YAL068W-A;Name=YA…
## 10 chrI SGD CDS 538 792 NA + 0 Parent=YAL068W-A_mRN…
## # … with 23,048 more rows
## attr(,"chromosome_names")
## # A tibble: 17 x 2
## reference chromosome
## <chr> <fct>
## 1 ref|NC_001133| 1
## 2 ref|NC_001134| 2
## 3 ref|NC_001135| 3
## 4 ref|NC_001136| 4
## 5 ref|NC_001137| 6
## 6 ref|NC_001138| 7
## 7 ref|NC_001139| 8
## 8 ref|NC_001140| 9
## 9 ref|NC_001141| 5
## 10 ref|NC_001142| 10
## 11 ref|NC_001143| 11
## 12 ref|NC_001144| 12
## 13 ref|NC_001145| 13
## 14 ref|NC_001146| 14
## 15 ref|NC_001147| 15
## 16 ref|NC_001148| 16
## 17 ref|NC_001224| m
## attr(,"GO")
## # A tibble: 42,885 x 7
## X1 X2 X3 X4 X5 X6 X7
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 HRA1 HRA1 S0001193… C cellular_component GO:0005… ncRNA_g…
## 2 HRA1 HRA1 S0001193… F molecular_function GO:0003… ncRNA_g…
## 3 HRA1 HRA1 S0001193… P rRNA processing GO:0006… ncRNA_g…
## 4 HRA1 HRA1 S0001193… P ribosomal small subunit biogen… GO:0042… ncRNA_g…
## 5 ICR1 ICR1 S0001326… C nucleus GO:0005… ncRNA_g…
## 6 ICR1 ICR1 S0001326… F molecular_function GO:0003… ncRNA_g…
## 7 ICR1 ICR1 S0001326… P transcription from RNA polymer… GO:0006… ncRNA_g…
## 8 LSR1 LSR1 S0000064… C nucleus GO:0005… snRNA_g…
## 9 LSR1 LSR1 S0000064… F RNA binding GO:0003… snRNA_g…
## 10 LSR1 LSR1 S0000064… P RNA splicing GO:0008… snRNA_g…
## # … with 42,875 more rows
First thing we need to do is to get the data into a manageable format.
get_chromosome <- function(genome_obj, chrom) {
# Searches through Genome object attribute "chromosomes" and returns indexed DNAStringSet
# Can take numeric or character type
return(genome_obj@chromosomes[chrom])
}
get_annot <- function(genome_obj, annot) {
# Searches through Genome object attribute "annotations" and returns filtered
# annotation tbl with selected annotation. Annotations are converted to roman
# numerals before searching. If annot is not a numeric, returns Chrmt by default
if (is.numeric(annot)) {
annot <- paste0("chr", as.roman(annot))
return(genome_obj@annotations[genome_obj@annotations["seqid"] == annot, ])
} else {
# I know, I know... HARDCODING?!
return(genome_obj@annotations[genome_obj@annotations["seqid"] == "chrmt", ])
}
}
choose_chrom <- function(genome_obj, chrom) {
# Parses Genome object for selected chromosome sequence and attributes
# and returns it as a list.
return(list("chromosomes" = get_chromosome(genome_obj, chrom),
"annotations" = get_annot(genome_obj, chrom)))
}
( chrom.1 <- choose_chrom(scerevisiae, 1) )
## Warning: The `i` argument of ``[`()` can't be a matrix as of tibble 3.0.0.
## Convert to a vector.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## $chromosomes
## DNAStringSet object of length 1:
## width seq names
## [1] 230218 CCACACCACACCCACACACCCAC...GTGTGGGTGTGGTGTGTGTGGG 1
##
## $annotations
## # A tibble: 424 x 9
## seqid source type start end score strand phase attributes
## <fct> <fct> <fct> <int> <int> <dbl> <fct> <fct> <chr>
## 1 chrI SGD chromosome 1 230218 NA <NA> <NA> ID=chrI;dbxref=NCBI:…
## 2 chrI SGD telomere 1 801 NA - <NA> ID=TEL01L;Name=TEL01…
## 3 chrI SGD X_element 337 801 NA - <NA> ID=TEL01L_X_element;…
## 4 chrI SGD X_element… 63 336 NA - <NA> ID=TEL01L_X_element_…
## 5 chrI SGD telomeric… 1 62 NA - <NA> ID=TEL01L_telomeric_…
## 6 chrI SGD gene 335 649 NA + <NA> ID=YAL069W;Name=YAL0…
## 7 chrI SGD CDS 335 649 NA + 0 Parent=YAL069W_mRNA;…
## 8 chrI SGD mRNA 335 649 NA + <NA> ID=YAL069W_mRNA;Name…
## 9 chrI SGD gene 538 792 NA + <NA> ID=YAL068W-A;Name=YA…
## 10 chrI SGD CDS 538 792 NA + 0 Parent=YAL068W-A_mRN…
## # … with 414 more rows
Great, now its organized. Next step would be to actually answer the question, which is to estimate the frequency of A, C, G, T nucleotides in coding regions only. We know that in this data set, the coding regions have an annotation of type “CDS”, meaning that whatever isn’t in that range can be considered non-coding. In reference to the actual HMM and using correct terminology, I will be considering the states as coding/non-coding and emissions as the nucleotides themselves. Using the start and end coordinates, we can assign areas of the chromosomes by state.
To do this, I will create a vector that matches the length of the chromosome and keeps track of which state each nucleotide is in. Since we only have two states, I will represent the states as logicals, where F == non-coding and T == coding.
set_state_vector <- function(data_chrom) {
# Takes list generated in the format from choose_chrom() and appends a state
# vector to that list
# Get width
chrom_width <- data_chrom$chromosomes@ranges@width
# Append to list
chrom_width <- c(data_chrom, states = list(logical(chrom_width)))
}
modify_state <- function(data_chrom) {
# Function iterates through rows in annotation and changes F -> T in the states vector
# based on the start and end values of type CDS
start <- data_chrom$annotations$start
end <- data_chrom$annotations$end
for (i in 1:nrow(data_chrom$annotations)) {
data_chrom$states[start[i]:end[i]] <- TRUE
}
return(data_chrom)
}
# Isolate CDS
chrom.1$annotations <- chrom.1$annotations %>%
filter(type == "CDS")
chrom.1 <- chrom.1 %>%
# Initialize state vector
set_state_vector %>%
# Set states based on annotation labels
modify_state
I think we’re set to actually tackle the problem. The generic formula for calculating frequency is the number of occurrences divided by the total possible number of occurrences
Let’s wrap this to handle our data
get_nucleotide_freq_both_states <- function(data) {
# Takes our modified 3 element list and calculates frequencies for each emission (nucleotides)
# in either state. Returns as a tbl
# Get vector index in state
state_index_t <- which(data$states == T)
state_index_f <- which(data$states == F)
# Subset nucleotides by state
state_t <- data$chromosomes[[1]][state_index_t]
state_f <- data$chromosomes[[1]][state_index_f]
out <- tibble("Nucleotide" = c("A", "C", "G", "T"),
"Coding" = alphabetFrequency(state_t, baseOnly = T, as.prob = T)[1:4],
"Non-Coding" = alphabetFrequency(state_f, baseOnly = T, as.prob = T)[1:4])
return(out)
}
emission.frequencies.chrom1 <- get_nucleotide_freq_both_states(chrom.1)
emission.frequencies.chrom1$Coding
## A C G T
## 0.2919849 0.2051492 0.2096557 0.2932103
Question 2 [points 10]
Using the S. cerevisiae (Baker’s yeast) data that we imported into R in Lectures 11 and 12, show R code of how you would estimate the frequency of A, C, G, T nucleotides in non-coding regions only. Use only chromosome 1. Estimate the self-transition probabilities (coding to coding, non-coding to non-coding) to transition probabilities to and from coding and non-coding.
Due to some FANTASTIC design, we already have the answer to the first part of this question
emission.frequencies.chrom1$`Non-Coding`
## A C G T
## 0.3233797 0.1740879 0.1796447 0.3228877
As for the transition probabilities, we can calculate them using our state vector
get_transition_frequency_matrix <- function(data) {
states <- data$states
n_transitions <- length(states) - 1
# Create tally matrix
tally <- as.data.frame(matrix(0, ncol = 2, nrow = 2))
rownames(tally) <- c("T", "F")
names(tally) <- c("T", "F")
for ( i in 1: n_transitions ) {
# Get position 1 & 2
pos.one.and.two <- states[i:(i+1)]
# Add 1 to tally matrix based on transition
if (pos.one.and.two[1] == T & pos.one.and.two[2] == T) tally["T", "T"] <- tally["T","T"] + 1
if (pos.one.and.two[1] == T & pos.one.and.two[2] == F) tally["T", "F"] <- tally["T","F"] + 1
if (pos.one.and.two[1] == F & pos.one.and.two[2] == T) tally["F", "T"] <- tally["F","T"] + 1
if (pos.one.and.two[1] == F & pos.one.and.two[2] == F) tally["F", "F"] <- tally["F","F"] + 1
}
# Count per variable
n <- tally %>%
sapply(sum) %>%
unname()
# Keep tally and add probability matrix
tally <- list(frequencies = tally,
probabilities = tally/n)
return(tally)
}
transitions.chrom.1 <- get_transition_frequency_matrix(chrom.1)
transitions.chrom.1$probabilities
## T F
## T 0.999292026 0.0007079742
## F 0.001248215 0.9987517853
Question 3 [points 20]
Using the \({\tt HMM}\) package in R (available in \({\tt A04-Assignment/src/A04-Assginment.Rmd}\)), implement your model. The documentation for this package is here. Note that you might want to look at the \({\tt dishonestCasino()}\) function as an example.
Perhaps follow the \({\tt viterbi}\) function and the example there. Show your code. Apply it back to chromosome 1. Apply it chromosome 2 too.
Questiom 4 [points 10]
Compute the specificity, sensitivity and accuracy on both chromosomes individually. Comment on your findings.
These questions lead into each other, so I will be tackling them together
First thing we do is initialize our HMM using the initHMM() function. We can get the necessary information from Q1 & Q2 and reformat it for the function
# Required as a matrix
transition.prob.chrom1 <- as.matrix(transitions.chrom.1$probabilities)
emission.prob.chrom1 <- t(data.matrix(emission.frequencies.chrom1))[-1,]
colnames(emission.prob.chrom1) <- c("A", "C", "G", "T")
# Initialize our HMM model
( chrom1.hmm <- initHMM(States = c("Coding", "Non-coding"),
Symbols = c("A", "C", "G", "T"),
transProbs = transition.prob.chrom1,
emissionProbs = emission.prob.chrom1) )
## $States
## [1] "Coding" "Non-coding"
##
## $Symbols
## [1] "A" "C" "G" "T"
##
## $startProbs
## Coding Non-coding
## 0.5 0.5
##
## $transProbs
## to
## from Coding Non-coding
## Coding 0.999292026 0.0007079742
## Non-coding 0.001248215 0.9987517853
##
## $emissionProbs
## symbols
## states A C G T
## Coding 0.2919849 0.2051492 0.2096557 0.2932103
## Non-coding 0.3233797 0.1740879 0.1796447 0.3228877
Looking at the documentation, we can use the viterbi() function, which applies the Viterbi algorithm to the chromosome emission vector and attempts to compute the most probable path of states. We can then compare the Viterbi output to the true states and calculate the sensitivity, specificity, and accuracy of the algorithm.
# DNAStringSet to character vector
vectorized.chrom1.seq <- unlist(str_extract_all(chrom.1$chromosomes, boundary("character")))
# Use our HMM to predict state of emissions in chromosome 1 using Viterbi algorithm
vit <- viterbi(hmm = chrom1.hmm, vectorized.chrom1.seq)
measure_performance <- function(chromosome, vit) {
# Function takes in chromosome and vit, changes the state from logical to either
# "Coding" or "Non-coding", combines the two into a tibble, evaluates if it is
# a TP, FP, TN, FN, and returns measures of performance as a named vector.
# Rename state vector
state <- ifelse(chromosome$states, "Coding", "Non-coding")
# Create comparison tibble
performance <- tibble(true.states = state, predicted.states = vit)
# Add new column evaluating matches
performance <- performance %>%
rowwise() %>%
mutate(classifier.status = case_when(true.states == "Coding" & predicted.states == "Coding" ~ "TP",
true.states == "Coding" & predicted.states == "Non-coding" ~ "FN",
true.states == "Non-coding" & predicted.states == "Coding" ~ "FP",
true.states == "Non-coding" & predicted.states == "Non-coding" ~ "TN")) %>%
# Tidying into counts
group_by(classifier.status) %>%
summarise(n = n(), .groups = 'drop')
# Add missing matches set to count = 0
for (i in c("TP", "FN", "FP", "TN")) {
if (!(i %in% performance$classifier.status)) {
performance <- performance %>%
add_row(classifier.status = i, n = 0)
}
}
# Turn to named vector
performance <- deframe(performance)
out <- c("Sensitivity" = (performance[["TP"]] / (performance[["TP"]] + performance[["FN"]])),
"Specificity" = (performance[["TN"]] / (performance[["TN"]] + performance[["FP"]])),
"Accuracy" = (performance[["TP"]] + performance[["TN"]]) /
(performance[["TP"]] + performance[["FP"]] + performance[["TN"]] + performance[["FN"]])
)
return(list(performance.counts = performance,
performance.measures = out))
}
Let’s see how we did
(chrom1.performance <- measure_performance(chrom.1, vit))
## $performance.counts
## FN FP TN TP
## 60935 27428 55892 85963
##
## $performance.measures
## Sensitivity Specificity Accuracy
## 0.5851884 0.6708113 0.6161768
Because we spent so much effort on data managing and writing functions, we can get chromosome 2 set up in a few lines
chrom.2 <- choose_chrom(scerevisiae, 2)
# Isolate CDS
chrom.2$annotations <- chrom.2$annotations %>%
filter(type == "CDS")
chrom.2 <- chrom.2 %>%
# Initialize state vector
set_state_vector %>%
# Set states based on annotation labels
modify_state
Evaluate performance of our HMM using chromosome 2
# DNAStringSet to character vector
vectorized.chrom2.seq <- unlist(str_extract_all(chrom.2$chromosomes, boundary("character")))
# Use our HMM to predict state of emissions in chromosome 2 using Viterbi algorithm
vit <- viterbi(hmm = chrom1.hmm, vectorized.chrom2.seq)
(chrom2.performance <- measure_performance(chrom.2, vit))
## $performance.counts
## FN FP TN TP
## 318488 82639 121058 290999
##
## $performance.measures
## Sensitivity Specificity Accuracy
## 0.4774491 0.5943043 0.5067205
Comparing the two, our model performs better across the board when predicting states in chromosome 1 compared to chromosome 2. This makes sense because the prediction model was trained on chromosome 1, which has different transition and emission probabilities than chromosome 2. To help increase these scores, we could feed in sequences from multiple chromosomes to help the model generalize the difference between coding and non-coding states.