fig_training.Rmd

---
title: "Data munging and Figure 1 preparation"
author: "Shyam Saladi (saladi@caltech.edu)"
date: "09/20/2016"
output: html_document
---

## Load libraries
Load all libraries necessary for subsequent data munging and plotting.
Some defaults are set here to make things easier throughout.
```{r libraries}
library(tidyverse)
library(dplyrExtras)
library(magrittr)

# for CI for spearman correlation coefficients
library(boot)
library(pROC)

library(gridExtra)
library(RColorBrewer)
# `uconv` for unit conversion
library(datamart)
library(cowplot)

# number of cores handed to `boot_perm` (its `cores` argument) further below
cores <- 4L

# devtools::install("myutils")
library(myutils)

# if TRUE, the compiled figure is written to file in the final plotting chunk
write_out <- TRUE
# number of replicates handed to `boot_perm` (its `count` argument) further below
bs_count <- 5050L
```

### Load data from training process
```{r}
# Keep everything saved by the training process in its own environment
# rather than in the global workspace.
training_env <- new.env()
load(file = "training.RData", envir = training_env)
```

### Load model data for prediction function
```{r}
# Load the saved model objects into a separate environment and pull out the
# prediction function under a short name.
model_env <- new.env()
load(file = "model.RData", envir = model_env)
svmpredict <- model_env[["svmpredict"]]
```

## E. coli Training - Data Preparation
This block prepares the data necessary for plotting the panels of Figure 1.
```{r}
# One row per gene (`id`) present in both data sets, holding the min/avg/max
# (and max - min spread) of the normalized Daley, et al. outcomes with
# `cterm == "in"` next to the same summaries of the Fluman, et al. outcomes.
ecoli_daley_fluman <- local({
    # Fluman, et al. rows, tagged so they can be told apart after stacking
    fluman_rows <- training_env$fluman_folded %>%
        mutate(cterm = "fluman") %>%
        select(id, cterm, measurement, outcome)

    # summary statistics of `outcome` for every (cterm, id) combination
    per_gene <- training_env$daley_outcomes %>%
        filter(normalized) %>%
        bind_rows(fluman_rows) %>%
        group_by(cterm, id) %>%
        summarize(outcome_max = max(outcome, na.rm = TRUE),
                  outcome_avg = mean(outcome, na.rm = TRUE),
                  outcome_min = min(outcome, na.rm = TRUE),
                  outcome_var = diff(range(outcome, na.rm = TRUE))) %>%
        ungroup()

    # place the "in" and "fluman" summaries of each gene side by side
    inner_join(filter(per_gene, cterm == "in"),
               filter(per_gene, cterm == "fluman"),
               by = "id", suffix = c("_in", "_fluman")) %>%
        select(-cterm_fluman, -cterm_in) %>%
        rename(daley_min = outcome_min_in,
               daley_avg = outcome_avg_in,
               daley_max = outcome_max_in,
               fluman_min = outcome_min_fluman,
               fluman_avg = outcome_avg_fluman,
               fluman_max = outcome_max_fluman)
})
```

## Bootstrap estimates of Spearman correlation coefficient confidence intervals and significance
The estimates are only calculated if they have not already been saved.
To avoid making unnecessary assumptions about the spread of the data, use
bootstrap resampling to calculate confidence intervals and shuffling to
calculate p-values. With $10^6$ bootstraps/shuffles, this takes
a couple of hours on a 32-core machine (the number of replicates used here is
set by `bs_count` above). To avoid running this every time, the output
is saved.
```{r}
# Run `boot_perm` on the per-gene averages (x: Fluman, et al., y: Daley,
# et al.); `fn_prefix` is the file prefix handed to `boot_perm`.
daley_fluman_cor <- boot_perm(ecoli_daley_fluman,
                              x_col = "fluman_avg", y_col = "daley_avg",
                              fn_prefix = "training/daley_fluman",
                              count = bs_count, cores = cores)
```

# Daley, et al. vs Fluman, et al. outcomes
Plotting the agreement between whole cell fluorescence (Daley, et al.) and
folded protein via in-gel fluorescence (Fluman, et al.).
```{r daley_vs_fluman}
# Plotmath label: the estimate `t0` (starred when `perm_p` is below 0.01)
# followed by the `bca_low`-`bca_high` interval, all with two decimals.
ecoli_daley_fluman_cor_label <- daley_fluman_cor %>%
    mutate(t0 = specify_decimal(t0, k = 2L),
           bca_low = specify_decimal(bca_low, k = 2L),
           bca_high = specify_decimal(bca_high, k = 2L)) %>%
    mutate_rows(perm_p < 0.01, t0 = paste0(t0, "**")) %>%
    mutate(label = paste0("'", t0, "'~('", bca_low, "'-'", bca_high, "')")) %>%
    select(label) %>%
    unlist()

# Scatter of the per-gene averages; the thin segments span each gene's
# min-max range along either axis.
p1_daley_fluman <- ecoli_daley_fluman %>%
    ggplot() +
    # horizontal range: Fluman, et al. min to max at the Daley, et al. average
    geom_segment(aes(x = fluman_min, xend = fluman_max,
                     y = daley_avg, yend = daley_avg),
                 size = 0.3, alpha = 0.35) +
    # vertical range: Daley, et al. min to max at the Fluman, et al. average
    geom_segment(aes(x = fluman_avg, xend = fluman_avg,
                     y = daley_min, yend = daley_max),
                 size = 0.3, alpha = 0.35) +
    geom_point(aes(x = fluman_avg, y = daley_avg), size = 0.25) +
    annotate("text", x = 1L, y = 7.45,
             label = ecoli_daley_fluman_cor_label,
             size = 5*5/14, parse = TRUE) +
    scale_y_continuous(expand = c(0, 0)) +
    scale_x_continuous(expand = c(.01, 0)) +
    labs(x = "Normalized folded protein",
         y = "Normalized GFP activity") +
    # axis titles take the third and first "Dark2" colors
    theme(axis.title.x = element_text(size = 7L,
                                      color = brewer.pal(3, "Dark2")[3]),
          axis.title.y = element_text(size = 7L,
                                      color = brewer.pal(3, "Dark2")[1]),
          plot.margin = unit(c(0L, 0L, 0L, .5), "mm"))

p1_daley_fluman
```

## Training metrics
The following table displays metrics relating to the input data and training
procedure.
```{r training_metrics}
# Per-`cterm` counts of distinct plates (`grouping`), distinct genes (`id`)
# and total observations in the training data.
ecoli_stats <- training_env$ecoli_training %>%
    group_by(cterm) %>%
    summarize(plates = n_distinct(grouping),
              genes = n_distinct(id),
              observations = n())

# Attach the `total` and `kendall` columns of `best_performance`, relabel
# for display, and transpose so that each `cterm` becomes a column.
ecoli_stats <- ecoli_stats %>%
    inner_join(select(training_env$best_performance, cterm, total, kendall),
               by = "cterm") %>%
    mutate(kendall = specify_decimal(kendall, 3L)) %>%
    arrange(desc(plates)) %>%
    rename(`Plates` = plates,
           `Genes` = genes,
           `Observations` = observations,
           `Total Pairs` = total,
           `Kendall's~tau` = kendall) %>%
    mutate_rows(cterm == "in",
                cterm = "C-terminal Cytoplasmic\n(GFP)") %>%
    mutate_rows(cterm == "out",
                cterm = "C-terminal Periplasmic\n(PhoA)") %>%
    mutate_rows(cterm == "fluman",
                cterm = "C-terminal Cytoplasmic\n(GFP, folded protein)") %>%
    as.data.frame() %>%
    column_to_rownames("cterm") %>%
    t() %>%
    as.data.frame()

# Shared table theme: 6 pt text everywhere; column and row headers are passed
# with `parse = TRUE` (plotmath), cell padding is 2 x 3 mm.
table_theme <-  ttheme_default(
    core = list(fg_params = list(fontsize = 6L)),
    colhead = list(fg_params = list(fontsize = 6L, parse = TRUE)),
    rowhead = list(fg_params = list(fontsize = 6L, parse = TRUE)),
    padding = unit(c(2L, 3L), "mm"))

# main table with the statistics
tab <- tableGrob(ecoli_stats, theme = table_theme)
# second table built from a 1 x 2 slice of the data, used only for its column
# headers, which carry the publication names
header <- tableGrob(ecoli_stats[1L, 1:2],
                    theme = table_theme,
                    cols = c("Daley, Rapp, et al., 2005",
                             "Fluman, et al., 2014"))

# put the header row of `header` together with the main table
p1_table <- gridExtra::combine(header[1,], tab, along = 2L)
# overwrite the left/right column extents of the first six layout entries:
# `l` is filled from c(2, 4) and `r` from c(3, 4)
# NOTE(review): presumably this makes the first publication header span
# columns 2-3 and the second sit over column 4 -- confirm against the
# layout order of the combined gtable
p1_table$layout[1:6, c("l", "r")] <- list(c(2L, 4L), c(3L, 4L))

# set colors
# column-header text in columns 2, 3 and 4 of row 2 gets the first, second
# and third "Dark2" color, respectively; `find_cell` returns the grob index
ind <- find_cell(p1_table, 2L, 2L, "colhead-fg")
p1_table$grobs[ind][[1L]][["gp"]]$col <- brewer.pal(3L, "Dark2")[[1L]]
ind <- find_cell(p1_table, 2L, 3L, "colhead-fg")
p1_table$grobs[ind][[1L]][["gp"]]$col <- brewer.pal(3L, "Dark2")[[2L]]
ind <- find_cell(p1_table, 2L, 4L, "colhead-fg")
p1_table$grobs[ind][[1L]][["gp"]]$col <- brewer.pal(3L, "Dark2")[[3L]]

# remove italics
# (font 1 is the plain face) for the row headers in rows 6 and 7
ind <- find_cell(p1_table, 6L, 1L, "rowhead-fg")
p1_table$grobs[ind][[1L]][["gp"]]$font <- 1L
ind <- find_cell(p1_table, 7L, 1L, "rowhead-fg")
p1_table$grobs[ind][[1L]][["gp"]]$font <- 1L

# render the assembled table
ggdraw(p1_table)
```


## Bootstrap estimates of Spearman correlation coefficient confidence intervals and significance
The estimates are only calculated if they have not already been saved.
To avoid making unnecessary assumptions about the spread of the data, use
bootstrap resampling to calculate confidence intervals and shuffling to
calculate p-values. With $10^6$ bootstraps/shuffles, this takes
a couple of hours on a 32-core machine (the number of replicates used here is
set by `bs_count` above). To avoid running this every time, the output
is saved.
```{r}
# Run `boot_perm` once per `cterm` group on the normalized, scored training
# rows; every group gets its own file prefix and saved results are reused
# (`force = FALSE`).
ecoli_xy_cor <- training_env$ecoli_training %>%
    filter(!is.na(score), normalized) %>%
    mutate(fn_prefix = paste0("training/ecoli_xy_boot_", cterm)) %>%
    group_by(cterm) %>%
    do(boot_perm(., fn_prefix = unique(.$fn_prefix),
                 count = bs_count, cores = cores, force = FALSE))

# ROC curves and AUCs are computed on the same rows, separately within each
# `cterm` group (e.g. "in" and "out"), via `calc_auc_roc`.
ecoli_roc <- training_env$ecoli_training %>%
    filter(!is.na(score), normalized) %>%
    group_by(cterm) %>%
    do(calc_auc_roc(., method = "roc"))

ecoli_auc <- training_env$ecoli_training %>%
    filter(!is.na(score), normalized) %>%
    group_by(cterm) %>%
    do(calc_auc_roc(., method = "auc"))
```


## Normalized outcomes (from all plates) vs. the model's score
This block plots the normalized experimental outcomes against the model's score.
Additional text annotations shown on the published figure were added using
Adobe Illustrator (after the fact).
```{r normalization_correlation}
# Per-`cterm` annotation data for the outcome-vs-score panels: the number of
# observations, label coordinates, and a plotmath label built from the
# `boot_perm` results (`t0` starred when `perm_p` is below 0.01).
ecoli_xy_cor_label <- training_env$ecoli_training %>%
    filter(!is.na(score), normalized) %>%
    group_by(cterm) %>%
    summarize(count = n(),
              # label positions
              x = min(score) + diff(range(score))*(6/20),
              y1 = max(outcome)*(1 - 1/50) - max(outcome)*(2/20),
              y2 = y1 - max(outcome)*(2/20)) %>%
    inner_join(ecoli_xy_cor, by = "cterm") %>%
    mutate(t0 = specify_decimal(t0, k = 2L),
           bca_low = specify_decimal(bca_low, k = 2L),
           bca_high = specify_decimal(bca_high, k = 2L)) %>%
    mutate_rows(perm_p < 0.01, t0 = paste0(t0, "**")) %>%
    mutate(label = paste0("'", t0, "'~('", bca_low, "'-'", bca_high, "')"),
           cterm = factor(cterm, levels = c("in", "out", "fluman"),
                          ordered = TRUE))

# Outcome vs. model score for the normalized, scored training rows, one
# facet per `cterm` (ordered in, out, fluman) stacked in a single column.
p1_ecoli_xy <- training_env$ecoli_training %>%
    filter(normalized, !is.na(score)) %>%
    mutate(cterm = factor(cterm, c("in", "out", "fluman"),
                  ordered = TRUE)) %>%
    ggplot(aes(x = score, y = outcome, color = cterm)) +
    # black line from the minimum to the maximum outcome at each score
    stat_summary(fun.ymin = min, fun.ymax = max, geom = "linerange",
                 size = 0.3, alpha = 0.35, color = "black") +
    # point at the mean outcome at each score
    stat_summary(fun.y = mean, geom = "point", size = .25, alpha = 0.7) +
    # observation count, drawn at a fixed x of -2 (the `x` column of the
    # label data is not used)
    geom_text(aes(x = -2, y = y1, label = count), color = "black",
              size = 5*5/14, data = ecoli_xy_cor_label) +
    # plotmath label from `ecoli_xy_cor_label`, colored by `cterm`
    geom_text(aes(x = -2, y = y2, color = cterm, label = label),
              size = 5*5/14, parse = TRUE, data = ecoli_xy_cor_label) +
    scale_y_continuous(expand = c(.01, 0), breaks = c(0:3, 4L, 6L, 8L)) +
    # make sure the y axis includes 0
    expand_limits(ymin = 0) +
    scale_x_continuous(expand = c(.01, 0.01)) +
    scale_color_manual(values = brewer.pal(3, "Dark2")) +
    ylab("Mean Normalized Activity") +
    xlab(expression("IMProve score")) +
    theme(legend.position = "None",
          strip.text = element_blank(),
          plot.margin = unit(c(0L, 0L, 0L, 0L), "mm")) +
    facet_wrap(~cterm, ncol = 1, scales = "free_y")

# build the plot so that the computed panel ranges can be edited directly
p1_ecoli_xy %<>% ggplot_build

# in the first panel only, blank the 2nd and 4th y-axis labels and drop the
# matching major positions
# NOTE(review): `layout$panel_ranges` is an internal of the built plot and
# may not exist under other ggplot2 versions -- confirm before upgrading
for (idx in c(2, 4)) {
    p1_ecoli_xy$layout$panel_ranges[[1]]$y.labels[[idx]] <- ""
    p1_ecoli_xy$layout$panel_ranges[[1]]$y.major[[idx]] <- NA
}

# convert the edited build into a gtable for drawing
p1_ecoli_xy %<>% ggplot_gtable

ggdraw(p1_ecoli_xy)
```

## Training data: Receiver Operating Characteristics
While in the preparation stage ROCs were calculated for all percentiles (as
integers), for the figure we just plot two of these (at the $25^{th}$ and
$75^{th}$ percentile in experimental outcomes). By adjusting the percentiles
passed to `find_closest` (and the matching panel labels at the end of the
chunk), any percentile can be plotted. The $50^{th}$ and $97^{th}$ seem to show
an interesting behavior on the all-AUC plot (the following figure), but there
potentially are many others that could be interesting!
```{r training_roc}
# AUC rows selected by `find_closest` for the 25 and 75 targets within each
# `cterm` group, with the AUC and its 95% bounds scaled by 100 and combined
# into a plotmath label (`ci`).
ecoli_selected_auc <- ecoli_auc %>%
    ungroup() %>%
    mutate(cterm = factor(cterm, levels = c("in", "out", "fluman"),
                          ordered = TRUE)) %>%
    group_by(cterm) %>%
    filter(find_closest(outcome_percentile, c(25L, 75L))) %>%
    mutate(outcome_percentile = as.integer(round(outcome_percentile)),
           auc = specify_decimal(auc*100),
           lower95 = specify_decimal(lower95*100),
           upper95 = specify_decimal(upper95*100),
           ci = paste0("'", auc, "'~('", lower95, "'-'", upper95, "')")) %>%
    ungroup() %>%
    arrange(outcome_percentile, cterm)

# For each selected percentile, the `pos_count` values of all groups joined
# into one comma-separated string.
ecoli_selected_counts <- ecoli_selected_auc %>%
    group_by(outcome_percentile) %>%
    summarize(count = paste0(pos_count, collapse = ","))

# ROC curves at the selected outcome percentiles, one facet per percentile
# and one curve per `cterm` group.
p1_ecoli_roc <- ecoli_roc %>%
    ungroup %>%
    mutate(cterm = factor(cterm, levels = c("in", "out", "fluman"),
                          ordered = TRUE)) %>%
    group_by(cterm) %>%
    # same selection as used for `ecoli_selected_auc`
    filter(find_closest(outcome_percentile, c(25L, 75L))) %>%
    mutate(outcome_percentile =
               outcome_percentile %>% round %>% as.integer) %>%
    ggplot() +
    # diagonal reference line
    geom_abline(slope = 1L, intercept = 0L,
                linetype = "dashed", color = "darkgrey") +
    geom_path(aes(x = 1 - specificities, y = sensitivities, color = cterm)) +
    # AUC labels, stacked vertically by the integer code of `cterm`
    geom_text(aes(x = .75, y = .8*(.4 - as.numeric(cterm)/10) - .03,
                  color = cterm, label = ci),
              size = 5*5/14, parse = TRUE, data = ecoli_selected_auc) +
    # comma-separated `pos_count` values, drawn as plain text
    geom_text(aes(x = .75, y = .29, label = count),
              size = 5*5/14, parse = FALSE, data = ecoli_selected_counts) +
    # both axes are labelled as percentages
    scale_y_continuous(expand = c(0L, 0L), labels = function(x) x*100) +
    scale_x_continuous(expand = c(0L, 0L), labels = function(x) x*100) +
    xlab("False Positive Rate") +
    ylab("True Positive Rate") +
    scale_color_manual(values = brewer.pal(3L, "Dark2")) +
    facet_wrap(~outcome_percentile, nrow = 1L, scales = "free") +
    theme(legend.position = "None",
          aspect.ratio = 1L,
          plot.margin = unit(c(0L, 0L, 0L, 0L), "mm"),
          strip.background = element_blank())

# convert to a gtable and overwrite two text labels with plotmath expressions
# NOTE(review): grobs 12 and 13 are presumably the two facet strips; these
# indices are hard-coded and depend on the gtable layout -- confirm if the
# plot or the ggplot2 version changes
p1_ecoli_roc <- ggplotGrob(p1_ecoli_roc)
p1_ecoli_roc$grobs[[12L]]$grobs[[1L]]$children[[2L]]$children[[1L]]$label <-
    expression(25L ^ "th"*~"percentile")
p1_ecoli_roc$grobs[[13L]]$grobs[[1L]]$children[[2L]]$children[[1L]]$label <-
    expression(75L ^ "th"*~"percentile")

ggdraw(p1_ecoli_roc)
```

# AUC across Expression Cutoffs
Since the experimental outcomes are continuous (not binary), expression can be
defined via many different thresholds. To assess the degree of predictive
performance, the AUC of the ROCs for each one of these cutoffs is plotted.
```{r training_auc}
# all possible cutoffs with AUCs
# One line per `cterm` group showing `auc` against `outcome_percentile`; the
# dashed grey line marks an AUC of 0.5.
p1_ecoli_auc <- ecoli_auc %>%
    ungroup %>%
    mutate(cterm = factor(cterm, levels = c("in", "out", "fluman"),
                          ordered = TRUE)) %>%
    ggplot(aes(x = outcome_percentile, color = cterm)) +
    geom_hline(aes(yintercept = .5), linetype = "dashed", color = "darkgrey") +
    geom_line(aes(y = auc)) +
    # AUC is labelled as a percentage
    scale_y_continuous(expand = c(0L, 0L), breaks = seq(0, 1, .1),
                       labels = function(x) x*100) +
    # `limits` is spelled out in full instead of relying on partial argument
    # matching of `limit`
    scale_x_continuous(expand = c(0L, 0L), breaks = seq(0L, 100L, 25L),
                       limits = c(0, 100)) +
    ylab("AUC") +
    xlab("Activity Percentile Rank") +
    scale_color_manual(values = brewer.pal(3L, "Dark2")) +
    theme(legend.position = "None")

p1_ecoli_auc
```

## Compile the plots for the training data figure
This block compiles the figures generated above into the preliminary version of
Figure 1. Some minor adjustments were made after the fact using Adobe
Illustrator, e.g. removing clipping masks, fixing font faces.
```{r training_plot_compilation}
# Lay out the five panels on one canvas: scatter (A) and table (B) on the
# top left, outcome-vs-score facets (C) down the right, ROC (D) and AUC (E)
# side by side along the bottom.
p1_main <- ggdraw() +
    draw_plot(p1_daley_fluman,
              x = 0, y = .5, width = .23, height = 0.48) +
    draw_plot(p1_table,
              x = .21, y = 0.51, width = 1 - .47, height = 0.48) +
    draw_plot(p1_ecoli_xy,
              x = .75, y = 0, width = .24, height = .98) +
    draw_plot(plot_grid(p1_ecoli_roc, p1_ecoli_auc,
                        nrow = 1L, rel_widths = c(1L, 1L)),
              x = 0, y = 0, width = .75, height = .5) +
    draw_plot_label(label = LETTERS[1:5],
                    x = c(0, 0.26, .75, 0, 0.38),
                    y = c(1, 1, 1, 0.52, 0.52),
                    size = 8L)

# Write the figure at 183 mm wide (converted to inches) by 3 in high.
if (write_out) {
    save_plot(filename = "plots/fig1_w_panels.pdf",
              plot = p1_main,
              base_width = uconv(183L, "mm", "in", "Length"),
              base_height = 3L)
}

ggdraw(p1_main)
```

## Record session information for future reference
No variables need to be kept, since bootstraps are recorded to file at the time
they are calculated.
```{r session}
# print the R version and the attached/loaded packages for this run
sessionInfo()
```


## look at variability due to normalization
```{r var_x_y}
# Approximate variance of the ratio x / y from the moments of x and y, using
# the first-order Taylor expansion (delta method):
#
#   Var(x / y) ~ (mean_x / mean_y)^2 *
#       (var_x / mean_x^2 + var_y / mean_y^2 - 2 * cov_x_y / (mean_x * mean_y))
#
# Arguments (all numeric, vectorized element-wise):
#   mean_x, mean_y  means of the numerator and the denominator
#   var_x, var_y    variances (as returned by `var()`, not standard deviations)
#   cov_x_y         covariance of x and y (as returned by `cov()`)
# Returns the approximate variance of x / y.
#
# The variance terms are divided by the squared means; squaring the whole
# quotient `(var_x / mean_x)` would square the variance itself.
var_x_y <- function(mean_x, mean_y, var_x, var_y, cov_x_y) {
    (mean_x / mean_y) ^ 2 * (
        var_x / mean_x ^ 2 + var_y / mean_y ^ 2 -
        2 * cov_x_y / (mean_x * mean_y)
    )
}

# Per-gene comparison of the variance of the un-normalized outcome, the
# variance of the control, and the approximate variance of their ratio
# (from `var_x_y`).
daley_variation <- training_env$daley_outcomes %>%
    filter(cterm != "fluman" & grepl("Val", measurement)) %>%
    separate(grouping, into = "Day") %>%
    left_join(training_env$control_outcomes) %>%
    # undo the normalization to recover the raw outcome
    mutate(outcome_raw = outcome*(control - background)) %>%
    group_by(id) %>%
    summarize(mean_id = mean(outcome_raw),
              mean_ctrl = mean(control),
              var_id = var(outcome_raw),
              var_ctrl = var(control),
              cov_id_ctrl = cov(outcome_raw, control)) %>%
    ungroup() %>%
    mutate(var_id_ctrl = var_x_y(mean_x = mean_id, mean_y = mean_ctrl,
                                 var_x = var_id, var_y = var_ctrl,
                                 cov_x_y = cov_id_ctrl),
           # flags: is either raw variance below that of the ratio?
           var_id_less = var_id < var_id_ctrl,
           var_ctrl_less = var_ctrl < var_id_ctrl)

# Per-gene outcome spread (max - min) in the Daley, et al. data against the
# spread in the Fluman, et al. data, with the identity line for reference.
ecoli_daley_fluman %>%
    ggplot() +
    geom_point(aes(x = outcome_var_in, y = outcome_var_fluman)) +
    geom_abline(slope = 1) +
    theme(aspect.ratio = 1)

# Count the genes whose Daley, et al. spread exceeds the Fluman, et al. one.
with(ecoli_daley_fluman, table(outcome_var_in > outcome_var_fluman))
# FALSE  TRUE
#    22    51
#
# 22/51
# [1] 0.4313725
```

# Train a linear model for fun (might take this out for the publication)
```
# NOTE(review): this block sits in a plain code fence, so it is not evaluated
# when the document is knit; `ecoli_training_norm`,
# `daley_allstats_transformed` and `train` are not defined or loaded in this
# file (presumably `train` is caret's -- confirm before enabling).

# training set: outcomes of all non-"fluman" rows joined to the per-gene
# features by `id`, with the `id` column dropped afterwards
lm_data <- ecoli_training_norm %>%
    filter(cterm != "fluman") %>%
    select(id, outcome) %>%
    inner_join(daley_allstats_transformed, by = "id") %>%
    select(-id)
# missing values are replaced by 0 before fitting
lm_data[is.na(lm_data)] <- 0
# linear model of outcome on all remaining columns
lm_fit <- train(outcome ~ ., data = lm_data,
                 method = "lm")
# correlation between observed and fitted outcomes on the training rows
lm_data$scores <- predict(lm_fit, lm_data)
cor(lm_data$outcome, lm_data$scores)

# same preparation for the "fluman" rows, which were not used for fitting
fluman_data <- ecoli_training_norm %>%
    filter(cterm == "fluman") %>%
    select(id, outcome) %>%
    inner_join(daley_allstats_transformed, by = "id") %>%
    select(-id)
fluman_data[is.na(fluman_data)] <- 0
# apply the fitted model and correlate predictions with observed outcomes
fluman_data$scores <- predict(lm_fit, fluman_data)
cor(fluman_data$outcome, fluman_data$scores)
```