model-training.Rmd

---
title: "Model Training"
author: "Shyam Saladi (saladi@caltech.edu)"
date: "September 25, 2016"
output: html_document
---

Here I will train a single model based on ml20 (also referred to as ml21) with
a recalculated set of windowed tAI features using a linear kernel
preference-ranking SVM. The model will be trained on only the raw plate
measurement data from the Daley, Rapp, et al., 2005 dataset for proteins with
a positively determined C-terminal localization. The hyperparameter `C` will
be tuned over the grid $2^{-5}, 2^{-4}, \ldots, 2^{5}$. The final tuning
parameter will be selected by performance on the folded protein measurement
from Fluman, et al., 2012.

# load libraries
```{r}
library(magrittr)
library(tidyverse)
library(dplyrExtras)

library(caret)

library(foreach)
library(doMC)
registerDoMC(cores = 4L)

library(multidplyr)

# devtools::install("myutils")
library(myutils)

library(multidplyr)
# Start a four-worker multidplyr cluster, load the packages the workers need,
# and register it as the default cluster.
set_default_cluster(
    cluster_library(
        cluster_library(create_cluster(4L), "tidyverse"),
        "myutils"))

# When TRUE, the session chunk at the end of the document saves its results
write_out <- TRUE
```

### Read in Daley and Fluman outcomes
This block loads the control-well measurements, for plates where those data
exist, so that the measured expression levels can be normalized.
```{r control_outcomes}
# Pull the control wells out of the assay table and reshape them into long
# format: one row per Day / Id / replicate measurement.
control_outcomes <-
    read_csv("training/all-assays-analysis.csv", na = c("", "NA", "#N/A")) %>%
    filter(Id %in% c("invLep", "LepB", "CC118 / BL21", "blank")) %>%
    # drop the plate carrying the experimental LepB construct
    filter(Plate != "Wed D11") %>%
    select(Day, Id, GFP1:GFP4, PhoA1:PhoA4) %>%
    gather(measurement, control, GFP1:PhoA4) %>%
    # GFP readings are labelled "in", everything else (PhoA) "out"
    mutate(cterm = ifelse(grepl("GFP", measurement), "in", "out")) %>%
    # relabel GFPn / PhoAn as Valn
    mutate(measurement = paste0("Val",
                                gsub("PhoA", "", gsub("GFP", "", measurement))))

# Pair each positive control (invLep for "in", LepB for "out") with the
# non-induced and blank wells measured on the same day, and combine the latter
# two into a single background value.
control_outcomes <- control_outcomes %>%
    filter((Id == "invLep" & cterm == "in") |
               (Id == "LepB" & cterm == "out"),
           !is.na(control)) %>%
    inner_join(control_outcomes %>%
                   filter(Id == "CC118 / BL21") %>%
                   select(Day, cterm, measurement, non_induced = control),
               by = c("Day", "cterm", "measurement")) %>%
    inner_join(control_outcomes %>%
                   filter(Id == "blank") %>%
                   select(Day, cterm, measurement, blank = control),
               by = c("Day", "cterm", "measurement")) %>%
    # a missing blank reading is assumed to be zero
    mutate_rows(is.na(blank), blank = 0L) %>%
    transmute(Day, cterm, measurement, control,
              background = non_induced + blank)
```

```{r}
# Read the plate measurements, reshape to long format, and normalize each
# value against the control wells wherever a matching control exists.
daley_outcomes <-
    read_csv("training/AllPlates_161104_with_raw.csv") %>%
    gather(measurement, outcome, Val1:OD600_Norm4) %>%
    mutate(id = tolower(ID),
           cterm = Cterm_new,
           grouping = paste(groupid, measurement, sep = "|"),
           outcome = as.numeric(outcome)) %>%
    filter(`Keep?` == 1L) %>%
    filter(!is.na(outcome)) %>%
    # only the raw readings are kept: many of the pre-normalized values in
    # the file are not correct
    filter(measurement %in% c("OD600_1", "OD600_2", "OD600_3", "OD600_4",
                              "Val1", "Val2", "Val3", "Val4")) %>%
    select(Day, id, cterm, grouping, measurement, outcome) %>%
    # rows that find a control well are flagged as normalized
    left_join(mutate(control_outcomes, normalized = TRUE),
              by = c("Day", "cterm", "measurement")) %>%
    # rows without a control well pass through unchanged: (x - 0) / (1 - 0)
    mutate_rows(is.na(control),
                background = 0L, normalized = FALSE, control = 1L) %>%
    # express each value relative to its control well
    mutate(outcome = (outcome - background) / (control - background)) %>%
    select(id, cterm, grouping, measurement, outcome, normalized)

# Expression measurements used for training. The original model was trained
# on these three measurement categories only ("Val3" is omitted).
daley_outcomes_exp <- daley_outcomes %>%
    filter(is.element(measurement, c("Val1", "Val2", "Val4"))) %>%
    group_by(grouping) %>%
    # keep only groups with at least two measurements
    filter(n() >= 2L) %>%
    # the original model did not additionally require more than one distinct
    # outcome per group:
    # filter(unique(outcome) %>% length > 1) %>%
    ungroup()

# Folded-protein expression measurements from the Fluman dataset, reshaped to
# long format and labelled so they can be stacked with the Daley outcomes.
fluman_folded <-
    read_csv("training/fluman-compiled-emrD-normalized.csv",
             na = c("", "NA", "#VALUE!"),
             col_types = cols(name = col_character(),
                              `clone #` = col_integer(),
                              gi = col_integer(),
                              .default = col_double())) %>%
    gather(measurement, outcome,
           `whole cell fluorescence`:`OD600 at harvest`) %>%
    filter(measurement %in% c("folded exp", "folded exp_1", "folded exp_2")) %>%
    filter(!is.na(outcome)) %>%
    # each measurement column forms its own ranking group
    mutate(grouping = measurement) %>%
    mutate(id = tolower(name),
           normalized = TRUE,
           cterm = "fluman")

# Stack the Daley and Fluman outcomes into one table for model evaluation
ecoli_training <- list(daley_outcomes_exp, fluman_folded) %>%
    bind_rows() %>%
    select(id, cterm, measurement, outcome, grouping, normalized) %>%
    mutate(grouping = factor(as.character(grouping)))
```

### Process sequence features for GFP and PhoA datasets
```{r}
# Every feature column is numeric; only `title` is text
feat_col_spec <- cols(.default = col_double(), title = col_character())

# Read the PhoA and GFP feature files, stack them, and derive a lower-case
# `id` from the first piece of `title` (the remaining pieces are discarded)
daley_allstats <-
    c("training/Daley_phoa.fna.allstats.csv",
      "training/Daley_gfp.fna.allstats.csv") %>%
    lapply(read_csv, col_types = feat_col_spec) %>%
    bind_rows() %>%
    separate(title, c("id", "title"), extra = "drop") %>%
    select(-title) %>%
    mutate(id = tolower(id))
    # Original model didn't cull these
    # filter(numTMs > 0)

# Identify, then drop, sequence features with near zero variance
near_zero_var_features <- nearZeroVar(daley_allstats, names = TRUE)
daley_allstats <-
    daley_allstats[setdiff(colnames(daley_allstats), near_zero_var_features)]

# Identify, then drop, features flagged at a pairwise correlation cutoff of
# 0.95 (computed on the numeric columns only, i.e. without `id`)
highly_correlating_features <- findCorrelation(
    cor(select(daley_allstats, -id), use = "pairwise.complete.obs"),
    cutoff = 0.95, names = TRUE)
daley_allstats <-
    daley_allstats[setdiff(colnames(daley_allstats),
                           highly_correlating_features)]

# Learn the centering and scaling parameters from the training features
# (`id` is excluded because it is not a feature)
daley_preprocess_params <- preProcess(select(daley_allstats, -id),
                                      method = c("center", "scale"))

# Apply the learned transformation to the feature table
daley_allstats_transformed <- daley_allstats %>%
    predict(daley_preprocess_params, .)
```

### Join outcomes with features and write datasets in $SVM^{light}$ format
```{r}
# Attach the transformed features to every training measurement, grouped by
# ranking group.
daley_to_train <- daley_outcomes_exp %>%
    select(cterm, id, outcome, grouping) %>%
    group_by(grouping) %>%
    # A left join keeps genes that have no features (i.e. whose sequence is
    # not part of the feature files), so they will still be trained on
    left_join(daley_allstats_transformed, by = "id")

# Write the training set (without the bookkeeping columns); the value
# returned by write_dataset() is used below as the dataset file name
daley_svmlight_fn <- write_dataset(select(daley_to_train, -cterm, -id),
                                   "training/daley_gfp_phoa")
```

### Train linear SVM models against the raw Daley, Rapp, et al. data over a set of C values
```{r train_models}
# Values of the SVM hyperparameter `C` to train with.
# NOTE(review): only a single value is trained here; the full grid described
# in the introduction is the commented-out expression -- confirm which one is
# intended.
tunegrid <- 4 # 2 ^ seq(-5L, 5L)

# Site-specific path to the svm_rank_learn executable
binary <- "/ul/saladi/apps/svm_rank/1.00/svm_rank_learn"

# Command-line template: options, input dataset, output model file.
# %PARAM% is replaced by each value of `C`. The extension is matched
# literally (fixed = TRUE) so that "." is not treated as a regex wildcard.
args <- paste("-c %PARAM% -t 0 -m 4000",
              daley_svmlight_fn,
              gsub(".svmlight", ".train.%PARAM%.model", daley_svmlight_fn,
                   fixed = TRUE))

# Train one model per value of `C`, in parallel. `junk` collects the exit
# statuses; a non-zero status stops the run here instead of surfacing later
# as a missing model file.
junk <- foreach(thisparam = tunegrid) %dopar% {
    thisargs <- gsub("%PARAM%", thisparam, args, fixed = TRUE)
    status <- system2(binary, thisargs, stdout = FALSE)
    if (status != 0L) {
        stop("svm_rank_learn failed for C = ", thisparam,
             " (exit status ", status, ")", call. = FALSE)
    }
    status
}
```

### Test each model against the training (Daley, Rapp, et al.) as well as Fluman, et al. datasets
```{r test_models}
# Feature names are every column of the training table other than the
# bookkeeping columns
feat_names <- daley_to_train %>%
    ungroup() %>%
    select(-cterm, -grouping, -id, -outcome) %>%
    colnames()

# Model file name template; %PARAM% stands for the value of `C`. The
# extension is matched literally (fixed = TRUE) so that "." is not treated as
# a regex wildcard.
model_template <- gsub(".svmlight", ".train.%PARAM%.model", daley_svmlight_fn,
                       fixed = TRUE)

# Score `data` with `model` and summarize rank agreement for each `cterm`.
#
# model:  model weights, passed to prediction_fn() and appended to every
#         output row (the caller obtains them from read_svmlight_model())
# data:   data frame with at least `cterm`, `grouping` and `outcome`, plus
#         whatever feature columns prediction_fn() needs
# xtrans: optional preprocessing object forwarded to prediction_fn()
#
# Returns one row per `cterm` with `total` (sum of `valid_count`), `kendall`
# (sum of `con - dis` divided by `total`), and one column per model weight.
test_model <- function(model, data, xtrans = NULL) {
    data$score <- prediction_fn(data, xtrans = xtrans, model)

    # Concordant/discordant counts are computed per (cterm, grouping) on the
    # cluster, then pooled within each cterm. ConDis_fast() is expected to
    # return the columns `valid_count`, `con` and `dis` used below.
    performance <- data %>%
        group_by(cterm, grouping) %>%
        partition(cterm, grouping) %>%
        do(ConDis_fast(.$score, .$outcome)) %>%
        collect() %>%
        group_by(cterm) %>%
        summarize(total = sum(valid_count),
                  kendall = sum(con - dis) / total) %>%
        ungroup()

    # One-row table holding the model weights, one column per weight
    weights_row <- model %>% t() %>% as_data_frame()

    # Repeat the weights once per summary row, however many `cterm` levels
    # the data contain, so that the column bind always lines up
    bind_cols(performance,
              bind_rows(replicate(nrow(performance), weights_row,
                                  simplify = FALSE)))
}

# Evaluate the model trained for each value of `C` and stack the per-cterm
# summaries, tagging each with the parameter that produced it
training_performance <-
    foreach(thisparam = tunegrid,
            .combine = bind_rows, .multicombine = TRUE) %do% {

    # Load the weights of the model trained with this value of `C`
    filename <- gsub("%PARAM%", thisparam, model_template)
    model <- read_svmlight_model(filename, feat_names)

    # The untransformed features are joined on here; the preprocessing
    # parameters are handed to test_model() alongside them
    mutate(
        test_model(model,
                   left_join(ecoli_training, daley_allstats, by = "id"),
                   daley_preprocess_params),
        param = thisparam)
}
```

# Identify final model based on Fluman, et al., performance
```{r choose_model}
# Row with the highest Kendall statistic on the Fluman measurements
best_performance <- head(
    arrange(filter(training_performance, cterm == "fluman"), desc(kendall)),
    1)

# Everything except the summary columns is a model weight
svm_model <- unlist(
    select(best_performance, -cterm, -total, -kendall, -param))

# Performance of the chosen parameter on every dataset
best_performance <- filter(training_performance,
                           param == best_performance$param)

select(best_performance, cterm, total, kendall, param)
```

## predict on training dataset
```{r}
# Prediction function with the chosen weights and the preprocessing
# parameters filled in up front (.lazy = FALSE)
svmpredict <- partial(
    prediction_fn,
    xtrans = daley_preprocess_params,
    weights = svm_model,
    .lazy = FALSE
)

# svm_model <- model_env$svm_model
# svm_model["40deltaG"] <- svm_model["X40deltaG"]
# svm_model["40freqens"] <- svm_model["X40freqens"]
# svm_model <- svm_model[!(names(svm_model) %in% c("X40freqens", "X40deltaG"))]

# Score every sequence in the feature table and attach the scores to the
# outcome table by `id`
ecoli_training <- left_join(
    ecoli_training,
    transmute(daley_allstats, id, score = svmpredict(daley_allstats)),
    by = "id")
```

## Remove temporary files and save session
```{r session}
# Delete the training dataset file
file.remove(daley_svmlight_fn)

# Delete every model file matching the template; the result is TRUE only if
# all removals succeeded
all(file.remove(Sys.glob(gsub("%PARAM%", "\\*", model_template))))

if (write_out) {
    # Data and intermediate results
    save(list = c("control_outcomes", "daley_outcomes", "fluman_folded",
                  "daley_allstats", "ecoli_training", "daley_to_train",
                  "training_performance", "best_performance"),
         file = "training.RData", compression_level = 9L)
    # The final model and what is needed to apply it
    save(list = c("svmpredict", "daley_preprocess_params", "svm_model"),
         file = "model.RData", compression_level = 9L)
}

sessionInfo()
```