fig_genomes.Rmd

---
title: "Data munging and Figure 4 preparation"
author: "Shyam Saladi (saladi@caltech.edu)"
date: "09/20/2016"
output: html_document
---

## Load libraries
Load all libraries necessary for subsequent data munging and plotting.
Some defaults are set here to make things easier throughout.
```{r libraries}
library(testthat)

library(datamart) # `uconv` for unit conversion

# library(plyr) # for `dlply`
library(magrittr)
library(tidyverse)
library(multidplyr)
library(dplyrExtras)

library(RColorBrewer)
library(ggbeeswarm)
library(cowplot)

library(doMC)
# registerDoMC(cores = 20L)

# create_cluster(20L) %>%
#     set_default_cluster()

# devtools::install("myutils")
library(myutils)
```

### Load data from training process
```{r}
# Keep the objects saved by the training step in their own environment so
# they cannot overwrite anything defined in this document.
training_env <- new.env()
load(file = "training.RData", envir = training_env)
```

### Load model data for prediction function
```{r}
# Load the saved model workspace into a dedicated environment and expose the
# prediction function under a short global name.
model_env <- new.env()
load(file = "model.RData", envir = model_env)
svmpredict <- model_env[["svmpredict"]]
```

### Load NYCOMPS data for prediction function
```{r}
# Objects from the large-scale (NYCOMPS) analysis live in their own
# environment and are accessed below as `nycomps_env$...`.
nycomps_env <- new.env()
load(file = "large-scale.RData", envir = nycomps_env)
```

# p4 - forward predictions
```{r forward_data_prep, cache=TRUE}
# Column specification shared by every `*.allstats.csv.gz` feature table:
# every column is parsed as double except the character `title` column.
feat_col_spec <- cols(.default = col_double(), title = col_character())

# Eukaryotic genomes: one feature table per genome, annotated by hand with
# its group and species before the files are read and stacked.
forward_preds <-
    tibble(fn = c("celegans_mp_wormbase",
                  "hs_rnd1",
                  "mm_rnd1",
                  "sgd_orf_coding.mps",
                  "Picpa1_GeneCatalog_CDS_20130227.mps"),
           group = c(rep("Metazoa", 3), rep("Eukaryota", 2)),
           species = c("Caenorhabditis elegans",
                       "Homo sapiens",
                       "Mus musculus",
                       "Saccharomyces cerevisiae",
                       "Pichia pastoris")) %>%
    mutate(fn = paste0("forward-genomes/", fn, ".fna.allstats.csv.gz")) %>%
    group_by(fn, group, species) %>%
    do(read_csv(.$fn, col_types = feat_col_spec)) %>%
    ungroup() %>%
    select(-fn)

# Microbial genomes: the annotation table and the feature table are separate
# files that are combined column-wise.
# NOTE(review): `bind_cols` pairs rows purely by position, so both files are
# assumed to list the sequences in the same order -- confirm.
forward_preds <- "forward-genomes/microbial_query.tsv.gz" %>%
    read_delim(delim = "\t") %>%
    bind_cols(read_csv("forward-genomes/microbial_query.fna.allstats.csv.gz",
                       col_types = feat_col_spec), .) %>%
    rename(group = grouping) %>%
    # Remove duplicate E. coli genome
    filter(species != "Escherichia coli (strain ATCC 33849 / DSM 4235 / NCIB 12045 / K12 / DH1)") %>%
    # keep only the columns that the eukaryotic table also has, then stack
    select(one_of(colnames(forward_preds))) %>%
    bind_rows(forward_preds) %>%
    filter(numTMs > 0) %>%
    # Reduce the scientific name to "Genus species". Strain annotations such
    # as "(strain ...)" are intentionally discarded, so ask for that
    # explicitly instead of relying on the default `extra = "warn"`.
    separate(species, c("genus", "species"), extra = "drop") %>%
    mutate(species = paste(genus, species)) %>%
    select(-genus)

# Score every remaining sequence with the trained model.
forward_preds$score <- svmpredict(forward_preds)
```

# prepare data for plotting
```{r}
# Summarise a numeric vector as the five Tukey boxplot statistics plus the
# number of observations.
#
# Args:
#   x: numeric vector.
#
# Returns:
#   A two-column tibble (`name`, `value`) with the rows whisker_low,
#   quartile1, median, quartile3, whisker_high and count.
my_boxplot <- function(x) {
    stat_names <-
        c("whisker_low", "quartile1", "median", "quartile3", "whisker_high")
    # coef = 1.5 is the whisker criterion of a Tukey boxplot
    five_num <- boxplot.stats(x, coef = 1.5, do.conf = TRUE, do.out = TRUE)$stats
    enframe(c(setNames(five_num, stat_names), count = length(x)))
}

# Per-species summary of the scores: one row per (group, species) and one
# column per statistic returned by `my_boxplot`, plus a display label of the
# form "Genus species (count)".
forward_tufte_boxplots <- forward_preds %>%
    group_by(group, species) %>%
    do(my_boxplot(.$score)) %>%
    spread(key = name, value = value) %>%
    mutate(species_count = paste0(species, " (", count, ")")) %>%
    arrange(species == "Escherichia coli", group == "Metazoa", median) %>%
    ungroup()

# Named numeric vector holding the E. coli summary statistics; its `median`
# entry is used as the reference line in the violin plots.
forward_ec_thresholds <- forward_tufte_boxplots %>%
    filter(species == "Escherichia coli") %>%
    select(-group, -species, -count, -species_count) %>%
    unlist()

# Order in which the species are placed along the discrete axis of
# `p4_all_violin` (passed as `limits` to `scale_x_discrete`). The blank lines
# separate visual groups of genomes.
forward_preds_order <- c(
    "Vibrio cholerae",
    "Helicobacter pylori",
    "Campylobacter jejuni",
    "Mycobacterium tuberculosis",
    "Staphylococcus aureus",
    "Plasmodium falciparum",

    "Aquifex aeolicus",
    "Thermus thermophilus",
    "Spirochaeta thermophila",

    "Gloeobacter violaceus",
    "Rhizobium loti",
    "Methanococcus maripaludis",
    "Haloarcula marismortui",
    "Bacillus halodurans",

    "Lactobacillus casei",
    "Bacillus cereus",
    "Bacillus subtilis",
    "Escherichia coli",

    "Pichia pastoris",
    "Saccharomyces cerevisiae",

    "Caenorhabditis elegans",
    "Mus musculus",
    "Homo sapiens")

# Species treated as thermophiles: their labels are highlighted in the violin
# plots and they form the "thermo" group in the feature-overlap analysis.
thermophile_species <- c("Aquifex aeolicus",
                         "Spirochaeta thermophila",
                         "Thermus thermophilus")

# Score distribution of every genome as a horizontal violin, overlaid with a
# Tufte-style boxplot (median point plus whisker segments) taken from
# `forward_tufte_boxplots`. Layer order is the drawing order.
p4_all_violin <- ggplot(forward_preds) +
    # reference line at the E. coli median
    geom_hline(yintercept = forward_ec_thresholds["median"],
               color = "darkgrey") +
    geom_point(aes(species, median),
               data = forward_tufte_boxplots, color = "darkgrey") +
    # species label with gene count; thermophiles get the highlight colour
    geom_text(aes(species, -6, label = species_count,
                  color = species %in% thermophile_species),
              data = forward_tufte_boxplots,
              size = 7 * 5 / 14, fontface = "italic", vjust = 2) +
    # lower and upper whiskers (the interquartile range is left blank)
    geom_linerange(aes(species, ymin = whisker_low, ymax = quartile1),
                   data = forward_tufte_boxplots, color = "darkgrey") +
    geom_linerange(aes(species, ymin = quartile3, ymax = whisker_high),
                   data = forward_tufte_boxplots, color = "darkgrey") +
    geom_violin2(aes(x = species, y = score),
                 alpha = 0, adjust = 0.3, color = "black") +
    # scales
    ggplot2::scale_x_discrete(limits = forward_preds_order) +
    scale_y_continuous(breaks = nycomps_env$nycomps_breaks) +
    scale_color_manual(values = c("FALSE" = "black",
                                  "TRUE" = "#ff6600")) +
    # species on the vertical axis, score range shared with the NYCOMPS plots
    coord_flip(ylim = nycomps_env$nycomps_xscale) +
    ylab(expression("IMProve score")) +
    # the species axis is replaced by the text labels drawn above
    theme(axis.line.y = element_blank(),
          axis.text.y = element_blank(),
          axis.title.y = element_blank(),
          axis.ticks.y = element_blank())

p4_all_violin
```

```{r}
# Genomes shown in the main-text violin panel, in axis order.
selected_violin_genomes <- c("Plasmodium falciparum",
                             "Aquifex aeolicus",
                             "Thermus thermophilus",
                             "Spirochaeta thermophila",
                             "Bacillus subtilis",
                             "Escherichia coli",
                             "Saccharomyces cerevisiae",
                             "Homo sapiens")

# Boxplot summaries restricted to the selected genomes; computed once and
# shared by all summary layers below.
selected_tufte_boxplots <- forward_tufte_boxplots %>%
    filter(species %in% selected_violin_genomes)

# Same construction as the all-genome violin plot, limited to the selected
# genomes: violins of the raw scores overlaid with a Tufte-style boxplot.
p4_selected_violin <- forward_preds %>%
    filter(species %in% selected_violin_genomes) %>%
    ggplot() +
    # reference line at the E. coli median
    geom_hline(yintercept = forward_ec_thresholds['median'],
               linetype = "dashed", color = "darkgrey") +
    geom_point(aes(x = species, y = median), color = "darkgrey",
               data = selected_tufte_boxplots) +
    # species label with gene count; thermophiles get the highlight colour
    geom_text(aes(x = species, y = -6, label = species_count,
                  color = species %in% thermophile_species),
              vjust = 2, fontface = "italic", size = 7*5/14,
              data = selected_tufte_boxplots) +
    # lower and upper whiskers
    geom_linerange(aes(x = species, ymin = whisker_low, ymax = quartile1),
                   color = "darkgrey", data = selected_tufte_boxplots) +
    geom_linerange(aes(x = species, ymin = quartile3, ymax = whisker_high),
                   color = "darkgrey", data = selected_tufte_boxplots) +
    geom_violin2(aes(y = score, x = species),
                 color = "black", adjust = 0.3, alpha = 0) +
    coord_flip(ylim = nycomps_env$nycomps_xscale) +
    # NOTE(review): the other score axes in this document are labelled
    # "IMProve score"; confirm whether this label should match them.
    ylab(expression("SVM" ^ "rank"*~"score")) +
    scale_color_manual(values = c("TRUE" = "#ff6600",
                                  "FALSE" = "black")) +
    ggplot2::scale_x_discrete(limits = selected_violin_genomes,
                              breaks = selected_violin_genomes) +
    scale_y_continuous(breaks = nycomps_env$nycomps_breaks) +
    # the species axis is replaced by the text labels drawn above
    theme(axis.ticks.y = element_blank(),
          axis.title.y = element_blank(),
          axis.text.y = element_blank(),
          axis.line.y = element_blank())

p4_selected_violin
```

```{r fig4_nycomps_genes_ppv}
# Positive predictive value as a function of the score threshold for the
# NYCOMPS gene-level outcomes, reshaped for drawing as filled polygons.
p4_nvector_ppv_prep <- nycomps_env$nycomps_gene_outcomes %>%
   # filter(plasmid_name == "N") %>%
    mutate(outcome = gene_outcome == "pos") %>%
    calc_auc_roc(method = "roc") %>%
    prep_for_polygon() %>%
    as_tibble()

# NOTE(review): the annotation below names the (N) His-FLAG-TEV- vector, but
# the filter on `plasmid_name == "N"` above is commented out -- confirm which
# of the two reflects the intended data.
p4_nvector_ppv <- ggplot(p4_nvector_ppv_prep) +
    # filled area, coloured by whether the curve lies above the minimum PPV
    geom_polygon(aes(x = thresholds, y = ppv, group = id,
                     fill = y_greater_min, color = y_greater_min),
                 size = 0.1) +
    # the PPV curve itself, without the helper rows added for the polygons
    geom_path(aes(x = thresholds, y = ppv), size = 0.5,
              data = p4_nvector_ppv_prep %>%
                  filter(type != "min_y") %>%
                  distinct(thresholds, ppv)) +
    geom_hline(aes(yintercept = min(min_ppv))) +
    annotate("text", label = "Expression in (N) His-FLAG-TEV-",
             x = -1.7, y = 55, fontface = "bold", size = 7 * 5 / 14) +
   # coord_cartesian(xlim = nycomps_env$nycomps_xscale) +
    xlab(expression("IMProve score")) +
    ylab("Positive Predictive Value") +
    scale_x_continuous(breaks = nycomps_env$nycomps_breaks) +
    # the axis shows percentages: tick labels are the fractions times 100
    scale_y_continuous(breaks = seq(0, 1, .1), limits = c(0, .601),
                       labels = function(x) x * 100) +
    scale_color_manual(values = c("#A4A5A5", "#A10F1C")) +
    scale_fill_manual(values = c("#A4A5A5", "#A10F1C"))

p4_nvector_ppv
```

## deal with feature distributions
```{r feature_distributions, cache = TRUE}
# Long-format table (title, group, species, set, feature, value) stacking the
# features of three data sets: the forward-prediction genomes, the training
# set and NYCOMPS.
all_features <- bind_rows(
    forward_preds %>%
        mutate(set = "forward") %>%
        select(-score) %>%
        gather(feature, value, -title, -group, -species, -set),
    training_env$daley_allstats %>%
        rename(title = id) %>%
        mutate(group = "Training",
               species = "Ec-Daley Training",
               set = "training") %>%
        gather(feature, value, -title, -group, -species, -set),
    nycomps_env$nycomps_features %>%
        mutate(title = as.character(id),
               group = "NYCOMPS",
               species = "nycomps all",
               set = "nycomps") %>%
        select(-score, -score_nornass, -cterm, -name) %>%
        gather(feature, value, -title, -group, -species, -set)
)

# Keep only features present in both the forward and the training sets, drop
# missing values and group by feature for the per-feature calculations below.
all_features <- all_features %>%
    semi_join(filter(all_features, set == "forward"), by = "feature") %>%
    semi_join(filter(all_features, set == "training"), by = "feature") %>%
    filter(!is.na(value)) %>%
    group_by(feature)

# total count should be 89 - 5 (no RNAss from NUPACK)
test_that("correct number of distinct features", {
    n_training_features <- all_features %>%
        filter(set == "training") %>%
        distinct(feature) %>%
        nrow()
    expect_equal(n_training_features, 84)
})

# Estimate the overlap between two subsets of the `value` column.
#
# Args:
#   .data: data frame with a `value` column (here: all rows of one feature).
#   x: row selector (e.g. a logical vector) for the subset of interest.
#   labelx: label stored alongside the result to identify `x`.
#   ref: row selector for the reference subset.
#   labelref: label stored alongside the result to identify `ref`.
#
# Returns:
#   A one-row tibble with the columns `labelx`, `labelref` and `overlap`,
#   where `overlap` is whatever `estimate_overlap` returns for the two
#   value vectors.
overlap_wrap <- function(.data, x, labelx, ref, labelref) {
    values <- .data[["value"]]
    # `tibble()` replaces the deprecated `data_frame()` alias
    tibble(
        labelx = labelx,
        labelref = labelref,
        overlap = estimate_overlap(values[x], values[ref])
    )
}

# The overlap estimates below are slow, so spread the table over the workers
# of the default cluster, partitioned by feature.
all_features <- all_features %>%
    partition(feature)

# load libraries on each core
# Pipe-friendly wrapper around `clusterEvalQ`.
#
# Args:
#   cl: cluster object.
#   ...: forwarded unchanged to `clusterEvalQ` (the expression to evaluate
#        on each worker).
#
# Returns:
#   `cl` itself; the result of `clusterEvalQ` is discarded so that several
#   calls can be chained with `%>%`.
myclusterEvalQ <- function(cl, ...) {
    clusterEvalQ(cl, ...)
    cl
}

# Prepare every worker of the default cluster: copy over the helper function
# and the thermophile list used inside the `do()` calls below, then attach
# magrittr, tidyverse and myutils there.
get_default_cluster() %>%
    cluster_assign_value("overlap_wrap", overlap_wrap) %>%
    cluster_assign_value("thermophile_species", thermophile_species) %>%
    myclusterEvalQ(library(magrittr)) %>%
    myclusterEvalQ(library(tidyverse)) %>%
    myclusterEvalQ(library(myutils))

# For every feature, estimate the overlap between the training set and each
# comparison subset. Each `do()` runs on the partitioned table and is
# collected before the results are stacked; the comparison labels are then
# spread into one column per subset.
overlap_df <- bind_rows(
    # all NYCOMPS sequences
    all_features %>%
        do(overlap_wrap(., .$set == "nycomps", "nycomps",
                        .$set == "training", "training")) %>%
        collect(),
    # all forward-prediction genomes
    all_features %>%
        do(overlap_wrap(., .$set == "forward", "forward",
                        .$set == "training", "training")) %>%
        collect(),
    # thermophiles only
    all_features %>%
        do(overlap_wrap(., .$species %in% thermophile_species, "thermo",
                        .$set == "training", "training")) %>%
        collect(),
    # P. falciparum only
    all_features %>%
        do(overlap_wrap(., .$species == "Plasmodium falciparum", "pf",
                        .$set == "training", "training")) %>%
        collect(),
    # the E. coli genome
    all_features %>%
        do(overlap_wrap(., .$species == "Escherichia coli", "ec_whole",
                        .$set == "training", "training")) %>%
        collect()
) %>%
    spread(key = labelx, value = overlap) %>%
    ungroup()

# bring the partitioned feature table back into this session
all_features <- all_features %>% collect()

# Mean absolute difference between two numeric vectors, times 100.
#
# Args:
#   x, y: numeric vectors compared element-wise.
#
# Returns:
#   A single number: 100 * mean(|x - y|), ignoring missing differences.
mad_calc <- function(x, y) {
    100 * mean(abs(x - y), na.rm = TRUE)
}

# Mean absolute deviation (x100) of each subset's per-feature overlap from
# the NYCOMPS overlap, as a named numeric vector (forward, pf, thermo).
mad_vs_nycomps <- unlist(summarize(
    overlap_df,
    forward = mad_calc(nycomps, forward),
    pf = mad_calc(nycomps, pf),
    thermo = mad_calc(nycomps, thermo)
))
```


```{r}
# One-dimensional scatter of the per-feature overlap between the E. coli
# genome and the training set; points are jittered quasi-randomly around a
# constant y so they do not overplot.
p4_ecoli_dist <- ggplot(overlap_df) +
    geom_vline(xintercept = 0.75,
               color = "#00b1b9", linetype = "dotted", size = 0.7) +
    geom_point(aes(x = ec_whole, y = 1),
               position = position_quasirandom(), size = 0.25) +
    xlab("E. coli vs. E. coli (Training Set)") +
    # the axis shows percentages: tick labels are the fractions times 100
    scale_x_continuous(limits = c(0, 1), expand = c(0, 0),
                       labels = function(x) x * 100) +
    # the y position carries no information, so hide that axis entirely
    theme(axis.line.y = element_blank(),
          axis.ticks.y = element_blank(),
          axis.text.y = element_blank(),
          axis.title.y = element_blank())

# Convert to a grob and turn clipping off for the panel so that points at
# the panel edge are drawn in full.
p4_ecoli_dist <- ggplotGrob(p4_ecoli_dist)

p4_ecoli_dist$layout <-
    mutate_rows(p4_ecoli_dist$layout, name == "panel", clip = "off")

ggdraw(p4_ecoli_dist)
```

```{r}
# Per-feature overlap with the training set: thermophiles (y) against
# NYCOMPS (x). Dotted lines mark 0.75 on both axes, the dashed grey line is
# the identity.
pext2_thermo <-
    ggplot(overlap_df, aes(nycomps, thermo, label = feature)) +
    geom_hline(yintercept = 0.75,
               color = "#00b1b9", size = 0.7, linetype = "dotted") +
    geom_vline(xintercept = 0.75,
               color = "#00b1b9", size = 0.7, linetype = "dotted") +
    geom_abline(linetype = "dashed", color = "grey") +
    geom_point(size = 0.25) +
    annotate("text", x = 0.25, y = 0.9, size = 6 * 5 / 14,
             label = paste0("MAD=",
                            specify_decimal(mad_vs_nycomps["thermo"]))) +
    labs(x = "NYCOMPS vs. Training",
         y = "Thermophiles vs. Training") +
    # both axes show percentages: tick labels are the fractions times 100
    scale_x_continuous(expand = c(0, 0), limits = c(0, 1),
                       labels = function(x) x * 100) +
    scale_y_continuous(expand = c(0, 0), limits = c(0, 1),
                       labels = function(x) x * 100) +
    theme(aspect.ratio = 1,
          legend.title = element_blank(),
          legend.position = "none")
pext2_thermo
```

```{r}
# Per-feature overlap with the training set: P. falciparum (y) against
# NYCOMPS (x). Dotted lines mark 0.75 on both axes, the dashed grey line is
# the identity.
pext2_pf <- ggplot(overlap_df, aes(nycomps, pf, label = feature)) +
    geom_hline(yintercept = 0.75,
               color = "#00b1b9", size = 0.7, linetype = "dotted") +
    geom_vline(xintercept = 0.75,
               color = "#00b1b9", size = 0.7, linetype = "dotted") +
    geom_abline(linetype = "dashed", color = "grey") +
    geom_point(size = 0.25) +
    annotate("text", x = 0.25, y = 0.9, size = 6 * 5 / 14,
             label = paste0("MAD=", specify_decimal(mad_vs_nycomps["pf"]))) +
    labs(x = "NYCOMPS vs. Training",
         y = "P. falciparum vs. Training") +
    # NOTE(review): no layer maps a colour aesthetic, so this scale looks
    # unused -- confirm before removing.
    scale_color_manual(values = brewer.pal(3, "Dark2")[2:1]) +
    # both axes show percentages: tick labels are the fractions times 100
    scale_x_continuous(expand = c(0, 0), limits = c(0, 1),
                       labels = function(x) x * 100) +
    scale_y_continuous(expand = c(0, 0), limits = c(0, 1),
                       labels = function(x) x * 100) +
    theme(aspect.ratio = 1,
          legend.title = element_blank(),
          legend.position = "none")
pext2_pf
```

```{r}
# Per-feature overlap with the training set: all forward-prediction genomes
# (y) against NYCOMPS (x). Dotted lines mark 0.75 on both axes, the dashed
# grey line is the identity.
pext2_nycomps_forward <-
    ggplot(overlap_df, aes(nycomps, forward, label = feature)) +
    geom_hline(yintercept = 0.75,
               color = "#00b1b9", size = 0.7, linetype = "dotted") +
    geom_vline(xintercept = 0.75,
               color = "#00b1b9", size = 0.7, linetype = "dotted") +
    geom_abline(linetype = "dashed", color = "grey") +
    geom_point(size = 0.25) +
    annotate("text", x = 0.25, y = 0.9, size = 6 * 5 / 14,
             label = paste0("MAD=",
                            specify_decimal(mad_vs_nycomps["forward"]))) +
    labs(x = "NYCOMPS vs. Training",
         y = "Forward vs. Training") +
    # both axes show percentages: tick labels are the fractions times 100
    scale_x_continuous(expand = c(0, 0), limits = c(0, 1),
                       labels = function(x) x * 100) +
    scale_y_continuous(expand = c(0, 0), limits = c(0, 1),
                       labels = function(x) x * 100) +
    theme(aspect.ratio = 1)
pext2_nycomps_forward
```

Each feature is plotted separately because several of them need a log-scaled x axis.
```{r}
# Features shown as individual histograms: names are the feature column
# names, values are the axis titles. The order here is the plot order.
selected <-
    c(seqLen = "Sequence Length",
      pI = "Isoelectric Point",
      GC = "GC Content",
      membrCont = "Membrane Residue Count",
      tAI = "tRNA Adaptation Index",
      numPosNormCyt = "Positive Cytoplasmic Residues (normalized)",
      CPS = "Codon Pair Score",
      delGallTMs = "ΔG of Insertion (Hessa, et al.)",
      relareaSD = "Shine-Dalgarno Sites (normalized)",
      len1_2loop = "Length of TM1-TM2 Loop",
      zeroto38avgRNAss = "RNA Secondary Structure (1st 40 codons)",
      avgRONN = "Mean Disorder Prediction (RONN)")

# Features whose histogram is drawn on a log10 x axis.
log_scale <- c("seqLen", "membrCont", "len1_2loop")

# Draw the step histogram of a single feature, one line per group, and
# annotate it with the thermophile and P. falciparum overlap values.
#
# Args:
#   df: long-format rows of exactly one feature, with the columns `feature`,
#       `value` and `group` (colours are defined for the groups "training",
#       "thermo" and "pf").
#
# Returns:
#   The object produced by `ggdraw()` with the two overlap labels added.
#
# Reads the globals `selected` (axis titles), `log_scale` (features drawn on
# a log10 axis) and `overlap_df` (per-feature overlap values).
make_indiv_plot <- function(df) {
    # attached here so the function also works on a fresh worker
    library(tidyverse)
    library(cowplot)
    library(myutils)

    # `feature` may arrive as a factor; use its label, not its integer code,
    # for every lookup below
    feat <- as.character(unique(df$feature))
    use_log <- feat %in% log_scale

    # choose the x scale up front so only one x scale is ever added
    x_scale <- if (use_log) {
        scale_x_log10(
            expand = c(0, 0),
            labels = scales::trans_format(
                "log10", scales::math_format(10 ^ .x)))
    } else {
        scale_x_continuous(expand = c(0, 0))
    }

    p_hist <- ggplot(df) +
        geom_step_hist(aes(x = value, color = group),
                       alpha = 0.7,
                       position = "identity") +
        xlab(selected[[feat]]) +
        scale_color_manual(
            values = c("training" = brewer.pal(3L, "Dark2")[[1L]],
                       "thermo" = "#ff6600",
                       "pf" = brewer.pal(3L, "Dark2")[[3L]])) +
        scale_y_continuous(expand = c(0, 0)) +
        x_scale +
        theme(axis.title.y = element_blank(),
              # "none" is the documented value for hiding the legend
              legend.position = "none",
              plot.margin = unit(c(5, 5, 0, 0), "pt"))

    # adjustments if log scale is desired
    if (use_log) {
        # negative lengths draw the log ticks outside the panel
        p_hist <- p_hist +
            annotation_logticks2(sides = "b",
                                 short = unit(-0.4, "mm"),
                                 mid = unit(-0.4, "mm"),
                                 long = unit(-0.6, "mm"),
                                 size = 0.2)

        # ... which requires turning off clipping for the panel
        p_hist <- ggplotGrob(p_hist)
        p_hist$layout$clip[p_hist$layout$name == "panel"] <- "off"
    }

    # overlap values of this feature
    overlap_row <- overlap_df[overlap_df$feature == feat, ]

    # add labels in ggdraw coordinates
    ggdraw(p_hist) +
        draw_label(specify_decimal(overlap_row$thermo, 2L),
                   x = 0.85, y = 0.9,
                   size = 5, colour = "#ff6600") +
        draw_label(specify_decimal(overlap_row$pf, 2L),
                   x = 0.85, y = 0.8,
                   size = 5, colour = brewer.pal(3L, "Dark2")[[3L]])
}

# Sanity check: selected features that do not occur in the data. Compare
# against the values of the `feature` column (not the column names of the
# long-format table); the expected output is `character(0)`.
setdiff(names(selected), unique(all_features$feature))

# Grid of per-feature histograms comparing the training set, the
# thermophiles and P. falciparum for the features named in `selected`.
p4_feat_hist <- all_features %>%
    # necessary to filter for certain features of interest
    ungroup %>%
    filter(set == "training" |
               species %in% thermophile_species |
               species == "Plasmodium falciparum",
           feature %in% names(selected)) %>%
    # relabel `group` so it identifies the three histogram series
    mutate_rows(set == "training", group = "training") %>%
    mutate_rows(species %in% thermophile_species, group = "thermo") %>%
    mutate_rows(species == "Plasmodium falciparum", group = "pf") %>%
    # to control plot order
    mutate(feature = factor(feature, names(selected))) %>%
    # plyr is not attached in this document, so its quoting helper `.` must
    # be namespace-qualified as well.
    # NOTE(review): `.parallel = TRUE` needs a registered foreach backend;
    # the `registerDoMC()` call in the libraries chunk is commented out --
    # confirm a backend is registered, otherwise this runs sequentially.
    plyr::dlply(plyr::.(feature), make_indiv_plot, .parallel = TRUE) %>%
    # render final plot
    plot_grid(plotlist = ., align = "hv", ncol = 2L)

p4_feat_hist
```

```{r}
# Extended-data figure: all-genome violins (panel a, left) next to the
# per-feature histograms (panel b, right). Positions are in ggdraw's
# relative 0-1 coordinates.
pext2 <- ggdraw() +
    # panel a
    draw_plot(p4_all_violin, width = 0.65) +
    draw_label("a", x = 0.01, y = 0.99, size = 8, fontface = "bold") +
    # panel b
    draw_plot(p4_feat_hist, x = 0.67, width = 0.33, height = 0.97) +
    draw_label("b", x = 0.66, y = 0.99, size = 8, fontface = "bold") +
    # hand-placed colour key for the three histogram series
    draw_label("E. coli", x = 0.73, y = 0.98, size = 7,
               fontface = "bold.italic",
               colour = brewer.pal(3L, "Dark2")[[1L]]) +
    draw_label("Thermophiles", x = 0.83, y = 0.98, size = 7,
               fontface = "bold",
               colour = "#ff6600") +
    draw_label("P. falciparum", x = 0.93, y = 0.98, size = 7,
               fontface = "bold.italic",
               colour = brewer.pal(3L, "Dark2")[[3L]]) +
    # shared y-axis title for the histogram grid
    draw_label("Counts", x = 0.655, y = 0.5, size = 7,
               fontface = "bold", angle = 90)

# RGB by default

# 183 mm wide, converted to inches
save_plot("plots/extfig2_names.pdf", pext2,
          base_height = 7,
          base_width = uconv(183, "mm", "in", "Length"),
          dpi = 300)

pext2
```

## Figure 4 Compilation
```{r}
# Figure 4: left column holds the selected-genome violins (a) above the PPV
# curve (b); right column stacks the four overlap plots (c-f).
p4_main <- ggdraw() +
    # left column
    draw_plot(plot_grid(p4_selected_violin, p4_nvector_ppv,
                        labels = letters[1:2], label_size = 8,
                        nrow = 2L, rel_heights = c(3, 1),
                        align = "hv"),
              x = -0.01, y = 0, width = 0.72, height = 1) +
    # right column
    draw_plot(plot_grid(p4_ecoli_dist, pext2_nycomps_forward,
                        pext2_thermo, pext2_pf,
                        labels = letters[3:6], label_size = 8,
                        ncol = 1L, rel_heights = c(2, 3, 3, 3),
                        align = "hv"),
              x = 0.72, y = 0, width = 0.27, height = 1)

# RGB by default
# 136 mm wide, converted to inches
save_plot("plots/fig4_fluman_thermo.pdf", p4_main,
          base_height = 5.5,
          base_width = uconv(136, "mm", "in", "Length"))

p4_main
```

```{r session}
# Persist the scored forward predictions, then record the package versions
# used for this run.
save(list = "forward_preds", file = "genomes.RData")
sessionInfo()
```