roughly added the design (#6)

* roughly added the design * update design doc * [skip style] [skip vbump] Restyle files --------- Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com>
openpharma · Apr 1, 2024 · 498ee82 · 498ee82
1 parent c6f11fc
commit 498ee82
Show file tree

Hide file tree

Showing 4 changed files with 245 additions and 35 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -6,7 +6,7 @@ Date: 2023-06-20
 Authors@R:
     person("openpharma", , , "[email protected]", role = c("aut", "cre"))
 Description: R package template with GitHub Actions workflows included.
-License: Apache License 2.0 | file LICENSE
+License: Apache License 2.0
 URL: https://github.com/openpharma/RobinCar2/
 BugReports: https://github.com/openpharma/RobinCar2/issues
 Depends:

diff --git a/LICENSE b/LICENSE
diff --git a/covadj.Rproj b/covadj.Rproj
diff --git a/design/package_structure/design.Rmd b/design/package_structure/design.Rmd
@@ -0,0 +1,244 @@
+---
+title: "Design Doc of RobinCar2"
+author: Liming Li
+---
+
+In this doc we will summarize the general design.
+
+For robust inference, there are several key components: unbiased prediction, robust covariance, and delta method.
+
+So the general workflow will be
+
+1. Construct the model.
+1. Obtain unbiased prediction, depending on the data
+1. Obtain the robust covariance (also with unbiased prediction as input)
+1. use delta method to obtain treatment effect and its inference
+
+## Create dummy data
+
+Before we start our model, first we need to prepare our data.
+
+In the following part we create a simple simulation data.
+z1 comes from a bernouli distribution of p = 0.5.
+z2 comes from a multinomial distribution of p = (0.4, 0.3, 0.3)
+cov1 comes from a normal distribution.
+trt is based on stratified block randomization, block size is 6, stratified by z1 and z2.
+y is the response, the formula is
+
+exp(y) = 0.2 + 0.5 * (trt = "trt1") + 0.7 * (trt = "trt2") + 1.5 * (z1 = "b") - 1 * (z2 = y) - 0.5 * (z2 = z) + 0.2 * cov
+
+```{r}
+library(stats)
+n <- 200
+block_rand <- function(block = c(0, 0, 1, 1)) {
+  function(n) {
+    r <- lapply(seq_len(ceiling(n / length(block))), function(i) {
+      sample(block)
+    })
+    unlist(r)[1:n]
+  }
+}
+by_strata <- function(f, strata) {
+  ret <- rep(NA, length(strata))
+  for (x in split(seq_len(length(strata)), strata)) {
+    ret[x] <- f(length(x))
+  }
+  return(ret)
+}
+sim_data <- function(n) {
+  cov1 <- rnorm(n)
+  z1 <- sample(size = n, factor(c("a", "b")), c(0.5, 0.5), replace = TRUE)
+  z2 <- sample(size = n, factor(c("x", "y", "z")), c(0.4, 0.3, 0.3), replace = TRUE)
+  permute_block <- c(0, 0, 1, 1)
+  trt <- by_strata(block_rand(c("trt1", "trt1", "trt2", "trt2", "pbo", "pbo")), interaction(z1, z2))
+  trt <- factor(trt, levels = c("pbo", "trt1", "trt2"))
+  df <- data.frame(trt, z1, z2, cov1)
+  x_mat <- model.matrix(~ trt + z1 + z2 + cov1, data = df)
+  coef <- c(0.2, 0.5, 0.7, 1.5, -1, -0.5, 0.2)
+  theta <- x_mat %*% coef
+  y <- plogis(theta)
+  y_bin <- as.integer(runif(n) < y)
+
+  df$y <- y_bin
+  df
+}
+
+d <- sim_data(500)
+```
+
+## Construct the model
+
+In this part, we will create the model based on the data provided.
+Let's begin with binary outcome:
+
+```{r}
+fit <- glm(y ~ trt:z1 + trt, family = binomial(), data = d)
+```
+
+## Obtain the predictions
+
+```{r}
+predict_cf <- function(fit, trt, data, ...) {
+  UseMethod("predict_cf")
+}
+
+predict_cf.glm <- function(fit, trt, data, ...) {
+  lvls <- fit$xlevels[[trt]]
+  # data has to be a data.frame
+  df <- rbind(fit$model, fit$model, fit$model)
+  df[[trt]] <- factor(rep(lvls, each = nrow(fit$model)), levels = lvls)
+  preds <- predict(fit, type = "response", newdata = df)
+  matrix(preds, ncol = length(lvls), dimnames = list(row.names(fit$model), lvls))
+}
+```
+
+with this `predict_cf` function we already obtain the counterfactual response
+
+```{r}
+pred <- predict_cf(fit, "trt")
+pred
+```
+
+## Obtain the biasness
+
+```{r}
+bias <- function(fit, trt, strat, data) {
+  UseMethod("bias")
+}
+bias.glm <- function(fit, trt, strat, data) {
+  trt_var <- fit$model[[trt]]
+  if (length(strat) != 0) {
+    strat_var <- fit$data[, strat]
+  } else {
+    strat_var <- rep(0L, length(fit$y))
+  }
+  residuals <- fit$y - fit$fitted.values
+  d <- matrix(NA_real_, nrow = length(fit$y), ncol = length(fit$xlevels[[trt]]))
+  id_strat <- split(seq_len(length(residuals)), strat_var)
+  for (i in id_strat) {
+    df <- vapply(split(residuals[i], trt_var[i]), function(xx) mean(xx), FUN.VALUE = 0)
+    d[i, ] <- matrix(df, nrow = length(i), ncol = length(df), byrow = TRUE)
+  }
+  d
+}
+```
+
+## Ensure the unbiasness
+
+to ensure the unbiasness, the predict_cf need additional argument
+
+```{r}
+predict_cf.glm <- function(fit, trt, data, ensure_unbias = TRUE, strata = NULL, ...) {
+  lvls <- fit$xlevels[[trt]]
+  # data has to be a data.frame
+  df <- rbind(fit$model, fit$model, fit$model)
+  df[[trt]] <- factor(rep(lvls, each = nrow(fit$model)), levels = lvls)
+  preds <- predict(fit, type = "response", newdata = df)
+  ret <- matrix(preds, ncol = length(lvls), dimnames = list(row.names(fit$model), lvls))
+  if (ensure_unbias) {
+    ret <- ret - bias(fit, trt, strata)
+  }
+  ret
+}
+```
+
+```{r}
+pred <- predict_cf(fit, "trt")
+```
+
+
+## Robust variance
+
+```{r}
+adjust_pi <- function(pi_t) {
+  diag(pi_t) - pi_t %*% t(pi_t)
+}
+
+get_erb <- function(resi, strata, trt, pit, randomization) {
+  if (length(strata) == 0) {
+    return(0)
+  }
+  if (randomization %in% c("simple", "pocock-simon")) {
+    return(0)
+  }
+  # Calculate Omega Z under simple
+  omegaz_sr <- adjust_pi(pit)
+  idx <- split(seq_len(length(resi)), cbind(trt, strata))
+  resi_per_strata <- vapply(idx, function(ii) mean(resi[ii]), FUN.VALUE = 0)
+  # Calculate strata levels and proportions for
+  # the outer expectation
+
+  strata_props <- vapply(idx, length, FUN.VALUE = 0L)
+  strata_props <- strata_props / sum(strata_props)
+  # Estimate R(B) by first getting the conditional expectation
+  # vector for a particular strata (vector contains
+  # all treatment groups), then dividing by the pi_t
+
+  rb_z <- resi_per_strata / as.numeric(pit)
+  # Compute the R(B)[Omega_{SR} - Omega_{Z_i}]R(B) | Z_i
+  # for each Z_i
+  strata_levels <- length(resi_per_strata)
+  n_trt <- length(pit)
+  ind <- matrix(seq_len(strata_levels), byrow = TRUE, ncol = n_trt)
+  rb_z_sum <- lapply(
+    seq_len(nrow(ind)),
+    function(x) rb_z[ind[x, ]] %*% t(rb_z[ind[x, ]]) * sum(strata_props[ind[x, ]])
+  )
+  rb_z_sum <- Reduce(`+`, rb_z_sum)
+  rb_z_sum * omegaz_sr
+}
+
+vcov_robin <- function(fit, ...) {
+  UseMethod("vcov_robin")
+}
+
+are <- function(objs, cls) {
+  all(vapply(objs, is, class2 = cls, FUN.VALUE = TRUE))
+}
+
+vcov_robin.glm <- function(fit, trt, strat = NULL, preds = predict_cf(fit, trt, strat), sr_decompose = TRUE, randomization = "simple", ...) {
+  raw_data <- fit$data
+  resi <- residuals(fit, type = "response")
+  est <- colMeans(preds)
+  var_preds <- var(preds)
+  pit <- as.numeric(table(raw_data[[trt]]) / nrow(raw_data))
+  trt_lvls <- fit$xlevels[[trt]]
+
+  idx <- split(seq_len(nrow(raw_data)), raw_data[[trt]])
+  cov_ymu <- vapply(idx, function(is) stats::cov(fit$y[is], preds[is, ]), FUN.VALUE = rep(0, ncol(preds)))
+
+  vcov_sr <- vapply(idx, function(is) stats::var(fit$y[is]), FUN.VALUE = 0) / pit
+  if (sr_decompose) {
+    vcov_sr <- vcov_sr + (diag(var_preds) - 2 * diag(cov_ymu)) / pit
+  }
+  v <- diag(vcov_sr) + cov_ymu + t(cov_ymu) - var_preds
+  if (!randomization %in% c("simple", "pocock-simon") && length(strat) > 0) {
+    v <- v - get_erb(resi, raw_data[strat], raw_data[[trt]], pit, randomization)
+  }
+  ret <- v / length(resi)
+  dimnames(ret) <- list(trt_lvls, trt_lvls)
+  return(ret)
+}
+vcov_robin(fit, "trt", "z1")
+```
+
+
+## robincar_glm wrapper
+
+```{r}
+robincar_glm2 <- function(data, formula, trt, strata, car_scheme, ...) {
+  fit <- glm(formula = formula, data = data, ...)
+  pred <- predict_cf(fit, trt, ensure_unbias = TRUE)
+  var <- vcov_robin(fit, trt = trt, strat = strata, randomization = car_scheme, preds = pred)
+  list(
+    pred = colMeans(pred),
+    var = var
+  )
+}
+robincar_glm(d, formula = y ~ z1:trt + trt, "trt", "z1", "coin")
+```
+
+potentially, there can be other arguments just like `RobinCar::robincar_glm`.
+The formula can be inferred from the choices of "ANOVA", "ANCOVA", "ANHECOVA", etc. if not provided.
+
+All these functions can be exported for direct usage, and also the wrapper is provided.