Part 1 [2 marks]
Download \({\tt taxdmp}\) directly to RStudio Cloud and uncompress it if necessary. Follow good practices and store this information in a folder \({\tt raw}\) as done in the lecture.

# Create raw directory
dir.create("/cloud/project/raw")

# Download file
download.file(url = "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip", 
              destfile = "/cloud/project/raw/taxdmp.zip",
              method = "wget")

# Unzip
unzip(destfile, exdir = "/cloud/project/raw/taxdmp")

Part 2. [2 marks]
The \({\tt readme.txt}\) file describes each file in the download of \({\tt taxdmp}\). We do not need to consider some of these files. We will just focus on \({\tt nodes, names, division}\). Show how to read each of these into their own tibble.

# File names + dir
files <- c("division" = "/cloud/project/raw/taxdmp/division.dmp", 
           "nodes" = "/cloud/project/raw/taxdmp/nodes.dmp", 
           "names" = "/cloud/project/raw/taxdmp/names.dmp")

# Fields from readme
fields <- list("division" = c("division id", "division cde", "division name", "comments"),
               
               "nodes" = c("tax_id", "parent tax_id", "rank", "embl code", "division id",
                           "inherited div flag", "genetic code id", "inherited GC flag", 
                           "mitochondrial genetic code id", "inherited MGC flag",
                           "GenBank hidden flag", "hidden subtree root flag", "comments"),
               
               "names" = c("tax_id", "name_txt", "unique name", "name class")
               )

Make functions to process these files.

read_dmp <- function(dmp_file, col_names) {
  
  # Takes the dmp file and outputs a dataframe
  # Renames column to the matching fields from readme
  #
  # Readme states:
  # Field terminator is "\t|\t"
  # Row terminator is "\t|\n"
  #
  # Using readLines() removes the new line escape sequence (\n)
  # gsub() compensates for that by removing the resulting "\t|"
  
  out <- readLines(dmp_file) %>%
    gsub(pattern = "\t\\|$", replacement = "") %>% 
    str_split(., pattern = "\t\\|\t", simplify = TRUE) %>%
    as.data.frame() %>%
    setNames(nm = col_names) %>%
    as_tibble()
  
  return(out)
}


safe_read_dmp <- function(dmp_file, col_names, .f = read_dmp) {
  
  # Helper function that performs the functions of read_dmp(), but matches the names
  # of the dmp_file and the col_name list. Both must have matching names. 
  # dmp_file is a named vector
  # col_names is a named list
  
  # Reorders the dmp_file vector to match the col_names list
  dmp_file <- dmp_file[order(match(names(dmp_file), names(col_names)))]
  map2(.x = dmp_file, .y = col_names, .f = .f)
}

Create the output and save

refined <- safe_read_dmp(files, fields)

## Create refined directory
dir.create("/cloud/project/refined")

# Save
saveRDS(refined, "/cloud/project/refined/q2.rds")

Part 3. [2 marks]
Some variables of these three tibbles are no longer necessary because we don’t consider here \({\tt gencode, merged, delnodes, citations}\). Remove these columns from the three tibbles. In addition remove any column with the term \({\tt flag}\) in it and \({\tt mitochondrial~genetic~code~id}\). We don’t need these.

refined <- readRDS("/cloud/project/refined/q2.rds")

# To remove

# nodes 
#   - comments
#   - mitochondrial genetic code id
#   - inherited div flag
#   - genetic code id            
#   - inherited GC flag
#   - inherited MGC flag  
#   - GenBank hidden flag
#   - hidden subtree root flag     

# divisions 
#   - comments

# names
#   - None

# Vector of terms used to filter
filter_terms <- c("comments", "mitochondrial genetic code id", "inherited div flag",
                  "genetic code id", "inherited GC flag", "inherited MGC flag", 
                  "GenBank hidden flag", "hidden subtree root flag")

# Filter
refined <- map(
  .x = refined,
  .f = ~ select(.x, -any_of(filter_terms))
  )

Part 4. [3 marks]
For each tibble, is it tidy? Why or why not?

head(refined)
## $division
## # A tibble: 12 x 3
##    `division id` `division cde` `division name`       
##    <chr>         <chr>          <chr>                 
##  1 0             BCT            Bacteria              
##  2 1             INV            Invertebrates         
##  3 2             MAM            Mammals               
##  4 3             PHG            Phages                
##  5 4             PLN            Plants and Fungi      
##  6 5             PRI            Primates              
##  7 6             ROD            Rodents               
##  8 7             SYN            Synthetic and Chimeric
##  9 8             UNA            Unassigned            
## 10 9             VRL            Viruses               
## 11 10            VRT            Vertebrates           
## 12 11            ENV            Environmental samples 
## 
## $nodes
## # A tibble: 2,287,530 x 5
##    tax_id `parent tax_id` rank         `embl code` `division id`
##    <chr>  <chr>           <chr>        <chr>       <chr>        
##  1 1      1               no rank      ""          8            
##  2 2      131567          superkingdom ""          0            
##  3 6      335928          genus        ""          0            
##  4 7      6               species      "AC"        0            
##  5 9      32199           species      "BA"        0            
##  6 10     1706371         genus        ""          0            
##  7 11     1707            species      "CG"        0            
##  8 13     203488          genus        ""          0            
##  9 14     13              species      "DT"        0            
## 10 16     32011           genus        ""          0            
## # … with 2,287,520 more rows
## 
## $names
## # A tibble: 3,314,189 x 4
##    tax_id name_txt    `unique name`            `name class`       
##    <chr>  <chr>       <chr>                    <chr>              
##  1 1      all         ""                       synonym            
##  2 1      root        ""                       scientific name    
##  3 2      Bacteria    "Bacteria <bacteria>"    scientific name    
##  4 2      bacteria    ""                       blast name         
##  5 2      eubacteria  ""                       genbank common name
##  6 2      Monera      "Monera <bacteria>"      in-part            
##  7 2      Procaryotae "Procaryotae <bacteria>" in-part            
##  8 2      Prokaryotae "Prokaryotae <bacteria>" in-part            
##  9 2      Prokaryota  "Prokaryota <bacteria>"  in-part            
## 10 2      prokaryote  "prokaryote <bacteria>"  in-part            
## # … with 3,314,179 more rows

The divisions and node tables are tidy and the names table is not because the names table takes multiple rows for the same observation. What they could have done is expanded the “name class” column into different variables for the same tax_id, which would reduce the number of observations and turn the tax_id column into a primary key.

# Shows which "name class" values are possible 
table(refined$names["name class"]) 
## 
##             acronym           authority          blast name         common name 
##                1205              534572                 229               14525 
##     equivalent name     genbank acronym genbank common name     genbank synonym 
##               50032                 484               29748                1107 
##             in-part            includes     scientific name             synonym 
##                 540               55969             2287530              180695 
##       type material 
##              157553

Part 5. [3 marks]
For each tibble, are the classes of the individual columns appropriate? Why or why not? If they are not, show how to correct this.

# Show structure of object
str(refined)
## List of 3
##  $ division: tibble [12 × 3] (S3: tbl_df/tbl/data.frame)
##   ..$ division id  : chr [1:12] "0" "1" "2" "3" ...
##   ..$ division cde : chr [1:12] "BCT" "INV" "MAM" "PHG" ...
##   ..$ division name: chr [1:12] "Bacteria" "Invertebrates" "Mammals" "Phages" ...
##  $ nodes   : tibble [2,287,530 × 5] (S3: tbl_df/tbl/data.frame)
##   ..$ tax_id       : chr [1:2287530] "1" "2" "6" "7" ...
##   ..$ parent tax_id: chr [1:2287530] "1" "131567" "335928" "6" ...
##   ..$ rank         : chr [1:2287530] "no rank" "superkingdom" "genus" "species" ...
##   ..$ embl code    : chr [1:2287530] "" "" "" "AC" ...
##   ..$ division id  : chr [1:2287530] "8" "0" "0" "0" ...
##  $ names   : tibble [3,314,189 × 4] (S3: tbl_df/tbl/data.frame)
##   ..$ tax_id     : chr [1:3314189] "1" "1" "2" "2" ...
##   ..$ name_txt   : chr [1:3314189] "all" "root" "Bacteria" "bacteria" ...
##   ..$ unique name: chr [1:3314189] "" "" "Bacteria <bacteria>" "" ...
##   ..$ name class : chr [1:3314189] "synonym" "scientific name" "scientific name" "blast name" ...

Here we see that many of the variables in our tables are characters, when they should be numeric.

# Division
refined$division[["division id"]] <- as.numeric(refined$division[["division id"]])

# Nodes
refined$nodes <-  refined$nodes %>%
  mutate_at(vars("tax_id", "parent tax_id", "division id"), ~ as.numeric(.))

# Names
refined$names[["tax_id"]] <- as.numeric(refined$names[["tax_id"]])

# Print
str(refined)
## List of 3
##  $ division: tibble [12 × 3] (S3: tbl_df/tbl/data.frame)
##   ..$ division id  : num [1:12] 0 1 2 3 4 5 6 7 8 9 ...
##   ..$ division cde : chr [1:12] "BCT" "INV" "MAM" "PHG" ...
##   ..$ division name: chr [1:12] "Bacteria" "Invertebrates" "Mammals" "Phages" ...
##  $ nodes   : tibble [2,287,530 × 5] (S3: tbl_df/tbl/data.frame)
##   ..$ tax_id       : num [1:2287530] 1 2 6 7 9 10 11 13 14 16 ...
##   ..$ parent tax_id: num [1:2287530] 1 131567 335928 6 32199 ...
##   ..$ rank         : chr [1:2287530] "no rank" "superkingdom" "genus" "species" ...
##   ..$ embl code    : chr [1:2287530] "" "" "" "AC" ...
##   ..$ division id  : num [1:2287530] 8 0 0 0 0 0 0 0 0 0 ...
##  $ names   : tibble [3,314,189 × 4] (S3: tbl_df/tbl/data.frame)
##   ..$ tax_id     : num [1:3314189] 1 1 2 2 2 2 2 2 2 2 ...
##   ..$ name_txt   : chr [1:3314189] "all" "root" "Bacteria" "bacteria" ...
##   ..$ unique name: chr [1:3314189] "" "" "Bacteria <bacteria>" "" ...
##   ..$ name class : chr [1:3314189] "synonym" "scientific name" "scientific name" "blast name" ...

Part 6. [4 marks]
Show how to join the \({\tt nodes}\) and \({\tt names}\) tibbles. What is your primary and foreign keys? Which join function did you use and why?

# Primary key is refined$nodes["tax_id"] because it uniquely identifies each tax_id 
# in the nodes table
refined$nodes %>%
  count(tax_id) %>% 
  filter(n > 1) %>% 
  nrow
## [1] 0
# Foreign key is refined$names["tax_id"] because it appears in the nodes table 
# where it links each refined$names["tax_id"] observation to a unique node
refined$names %>%
  count(tax_id) %>% 
  filter(n > 1) %>%
  nrow
## [1] 553799
# Check if there are any empty entries in either key
table(is.na(refined$nodes["tax_id"])); table(is.na(refined$names["tax_id"])) 
## 
##   FALSE 
## 2287530
## 
##   FALSE 
## 3314189
# I used the left_join() function because I wanted to keep all the observations in 
# refined$names. These entries match refined$nodes, therefore we don't lose any
# data. It keeps the one-to-many relationship intact
tbl <- refined$names %>%
  left_join(refined$nodes, by = "tax_id")

tbl
## # A tibble: 3,314,189 x 8
##    tax_id name_txt `unique name` `name class` `parent tax_id` rank  `embl code`
##     <dbl> <chr>    <chr>         <chr>                  <dbl> <chr> <chr>      
##  1      1 all      ""            synonym                    1 no r… ""         
##  2      1 root     ""            scientific …               1 no r… ""         
##  3      2 Bacteria "Bacteria <b… scientific …          131567 supe… ""         
##  4      2 bacteria ""            blast name            131567 supe… ""         
##  5      2 eubacte… ""            genbank com…          131567 supe… ""         
##  6      2 Monera   "Monera <bac… in-part               131567 supe… ""         
##  7      2 Procary… "Procaryotae… in-part               131567 supe… ""         
##  8      2 Prokary… "Prokaryotae… in-part               131567 supe… ""         
##  9      2 Prokary… "Prokaryota … in-part               131567 supe… ""         
## 10      2 prokary… "prokaryote … in-part               131567 supe… ""         
## # … with 3,314,179 more rows, and 1 more variable: `division id` <dbl>

Part 7. [6 marks]
Show how to join the tibbles for Part 6 with the \({\tt division}\) tibble. What is your primary and foreign keys? Which join function did you use and why? Why or why isn’t joining the \({\tt division}\) tibble with the tibble from Part 6 a good idea?

# Primary key is refined$division[`division id`] because it uniquely identifies each
# division id in the division table. Technically, the other variables can be considered
# primary keys as well, because they are all unique, but in relation to what we are
# trying to do, we will only use this one.
refined$division %>%
  count(`division id`) %>% 
  filter(n > 1) %>%
  nrow
## [1] 0
# Foreign key is `division id` in our new tbl from Part 6 because it appears in the
# refined$division table, where it matches each division to a unique id
tbl %>%
  count(`division id`) %>%
  filter(n > 1) %>%
  nrow
## [1] 12
# Check if there are any empty entries in either key
table(is.na(refined$division["division id"])); table(is.na(tbl$`division id`)) 
## 
## FALSE 
##    12
## 
##   FALSE 
## 3314189
# I used the left_join() function because I wanted to keep all the observations in 
# the tbl object and add the additional information found in refined$division.

# I believe it would be okay to merge the two sets. My main concern was that tbl$`division id`
# would have values that are not in the tbl, but that is not the case. However, merging 
# the two does not provide any new information because the variables are just different
# representations of the division id, and instead makes my tbl larger. I choose to not 
# merge them. If I ever need the information in the division table, I can access it and
# change whatever needs changing, but we won't need it for this assignment.

# I don't run this because I do not want this in the rest of the script
# tbl <- tbl %>%
#    left_join(refined$division, by = "division id")

Part 9. [6 marks]
Write a function that accepts as arguments the tibble from question 6 and a \({\tt tax\_id}\) for a target taxon. The function returns a tibble consisting of all the direct children of target.

get_taxon <- function(tbl, target_tax_id) {
  
  # Takes tbl and indexes into tax_id to return the requested tax_id
  # and all direct children of target.
  # Returns a tbl
  
  return(dplyr::filter(tbl, `parent tax_id` == target_tax_id))
}
# 131567 == root
get_taxon(tbl, 131567)
## # A tibble: 30 x 8
##    tax_id name_txt `unique name` `name class` `parent tax_id` rank  `embl code`
##     <dbl> <chr>    <chr>         <chr>                  <dbl> <chr> <chr>      
##  1      2 Bacteria "Bacteria <b… scientific …          131567 supe… ""         
##  2      2 bacteria ""            blast name            131567 supe… ""         
##  3      2 eubacte… ""            genbank com…          131567 supe… ""         
##  4      2 Monera   "Monera <bac… in-part               131567 supe… ""         
##  5      2 Procary… "Procaryotae… in-part               131567 supe… ""         
##  6      2 Prokary… "Prokaryotae… in-part               131567 supe… ""         
##  7      2 Prokary… "Prokaryota … in-part               131567 supe… ""         
##  8      2 prokary… "prokaryote … in-part               131567 supe… ""         
##  9      2 prokary… "prokaryotes… in-part               131567 supe… ""         
## 10   2157 archaea  ""            blast name            131567 supe… ""         
## # … with 20 more rows, and 1 more variable: `division id` <dbl>

To be accurate, we will have to filter the name class to “scientific name”

get_scientific_taxon <- function(tbl, target_tax_id) {
  
  # Wrapper function of get_taxon()
  # Specifically returns output where "name class" variable == "scientific name"
  
  tmp <- get_taxon(tbl, target_tax_id) %>%
    filter(`name class` == "scientific name")
  
  return(tmp)
}

get_taxon(tbl, 131567)
## # A tibble: 30 x 8
##    tax_id name_txt `unique name` `name class` `parent tax_id` rank  `embl code`
##     <dbl> <chr>    <chr>         <chr>                  <dbl> <chr> <chr>      
##  1      2 Bacteria "Bacteria <b… scientific …          131567 supe… ""         
##  2      2 bacteria ""            blast name            131567 supe… ""         
##  3      2 eubacte… ""            genbank com…          131567 supe… ""         
##  4      2 Monera   "Monera <bac… in-part               131567 supe… ""         
##  5      2 Procary… "Procaryotae… in-part               131567 supe… ""         
##  6      2 Prokary… "Prokaryotae… in-part               131567 supe… ""         
##  7      2 Prokary… "Prokaryota … in-part               131567 supe… ""         
##  8      2 prokary… "prokaryote … in-part               131567 supe… ""         
##  9      2 prokary… "prokaryotes… in-part               131567 supe… ""         
## 10   2157 archaea  ""            blast name            131567 supe… ""         
## # … with 20 more rows, and 1 more variable: `division id` <dbl>

Part 10. [12 marks]
Which domain (Eukaryota, Bacteria, Archae) has the most species, and how many of them are there for each? What about genera?

Let’s change our tibble to show observations of name class “scientific name”

tbl <- tbl %>%
  filter(`name class` == "scientific name")

And create a function that will count species

rank_counter <- function(data, start_rank_id, tally_rank) {
  
  # Function takes in our tbl (modified to only have name class of scientific name),
  # a start_rank_id: the tax_id of our subsetted root node,
  #   Note: Not the actual root node of the main tree
  # and a tally_rank: a string that defines which rank you assign the leaves to be.
  # Returns a tally numeric 
  
  tally <- 0
  tally_data <- filter(data, `parent tax_id` == start_rank_id)
  
  # Loop till end
  while(nrow(tally_data != 0)) {
    
    # Check if any children are of desired rank and add to tally
    tally <- tally_data %>% filter(rank == tally_rank) %>% nrow() %>% sum(., tally)
    
    # Get IDs of parents in a vector  
    parent_ids <- tally_data[["tax_id"]]
    
    # Set new tbl with children nodes
    tally_data <- filter(data, `parent tax_id` %in% parent_ids)
  }
  return(tally)
}

Get tax_id’s of the three domains and run the function

ids <- get_scientific_taxon(tbl, 131567)[["tax_id"]]
names(ids) <- get_scientific_taxon(tbl, 131567)[["name_txt"]]

map(.x = ids, 
    .f = ~ rank_counter(data = tbl,
                     start_rank_id = .x,
                     tally_rank = "species")
    )
## $Bacteria
## [1] 459311
## 
## $Archaea
## [1] 12464
## 
## $Eukaryota
## [1] 1337941

And for genera

map(.x = ids, 
    .f = ~ rank_counter(data = tbl,
                     start_rank_id = .x,
                     tally_rank = "genus")
    )
## $Bacteria
## [1] 4041
## 
## $Archaea
## [1] 202
## 
## $Eukaryota
## [1] 90229

Part 11. [10 marks + 4 bonus marks]
Write a function called \({\tt path\_to\_root}\) that accepts as arguments the tibble from question 6 and a \({\tt tax\_id}\) for a target taxon. The function returns a string vector whose order is the path of that taxon to the root of the tree of life. For example,

# print( path_to_root( tax_id=632, taxonomy=part6_tibble) )
#    root.cellular organisms, Bacteria, Proteobacteria, Gammaproteobacteria, Enterobacterales, Yersiniaceae, Yersinia, NA, Yersinia pestis

Note that some nodes along the path from the taxon to the root may not have a name. Your function should print out \({\tt NA}\)

path_to_root <- function(tax_id, taxonomy) {
  
  # Function does what the question asks :) 
  
  # Using the environment because of undefined behavior when trying to filter with
  # tax_id. The names conflict.
  functione_env <- environment()
  path <- c()
  
  # Obvious, but still need to instantiate. 
  # Hard coding it could be a problem if they ever change the root tax_id, 
  # but I doubt that will ever happen.
  root_tax_id <- 1
  
  # Functions as off switch in while loop. Allows for one iteration using root
  count_switch <- 2
  
  while (count_switch != 0) {
    
    # Get name of node and add to path vector
    node_name <- taxonomy %>%
      filter(`tax_id` == get("tax_id", functione_env)) %>%
      pull("name_txt") %>%  # Pull name
      replace_na("NA") # Fixes NAs
    
    # Add to vector
    path <- c(path, node_name) 
    
    # Switch to new node
    tax_id <- taxonomy %>%
      filter(`tax_id` == get("tax_id", functione_env)) %>%
      pull("parent tax_id")
    
    # Off switch
    if (tax_id == root_tax_id) {
      count_switch <- count_switch - 1
    }
    
  }
  return(cat(rev(path), sep = ", "))
}
path_to_root(tax_id = 632, taxonomy = tbl)
## root, cellular organisms, Bacteria, Proteobacteria, Gammaproteobacteria, Enterobacterales, Yersiniaceae, Yersinia, Yersinia pseudotuberculosis complex, Yersinia pestis

My original tbl did not have any NAs for the example. Here I test to see if it will output NAs into the desired format

# Which is Yersinia pseudotuberculosis complex
tbl %>%
  filter(`name_txt` == "Yersinia pseudotuberculosis complex")
## # A tibble: 1 x 8
##   tax_id name_txt `unique name` `name class` `parent tax_id` rank  `embl code`
##    <dbl> <chr>    <chr>         <chr>                  <dbl> <chr> <chr>      
## 1 1.65e6 Yersini… ""            scientific …             629 spec… ""         
## # … with 1 more variable: `division id` <dbl>

# Get row
which(tbl$name_txt == "Yersinia pseudotuberculosis complex")
## [1] 1336483

# Change to NA
na_tbl <- tbl

# Check
na_tbl[1336483, 2] <- NA
na_tbl[1336483, ] # Good 
## # A tibble: 1 x 8
##   tax_id name_txt `unique name` `name class` `parent tax_id` rank  `embl code`
##    <dbl> <chr>    <chr>         <chr>                  <dbl> <chr> <chr>      
## 1 1.65e6 <NA>     ""            scientific …             629 spec… ""         
## # … with 1 more variable: `division id` <dbl>

# Works
path_to_root(tax_id = 632, taxonomy = na_tbl)
## root, cellular organisms, Bacteria, Proteobacteria, Gammaproteobacteria, Enterobacterales, Yersiniaceae, Yersinia, NA, Yersinia pestis