diff --git a/DESCRIPTION b/DESCRIPTION index 7e88625..63327a5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -6,7 +6,7 @@ Date: 2023-06-20 Authors@R: person("openpharma", , , "openpharma@example.com", role = c("aut", "cre")) Description: R package template with GitHub Actions workflows included. -License: Apache License 2.0 | file LICENSE +License: Apache License 2.0 URL: https://github.com/openpharma/RobinCar2/ BugReports: https://github.com/openpharma/RobinCar2/issues Depends: diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 63aaf2d..0000000 --- a/LICENSE +++ /dev/null @@ -1,13 +0,0 @@ -Copyright 2022 F. Hoffmann-La Roche AG - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. diff --git a/covadj.Rproj b/covadj.Rproj deleted file mode 100644 index cba1b6b..0000000 --- a/covadj.Rproj +++ /dev/null @@ -1,21 +0,0 @@ -Version: 1.0 - -RestoreWorkspace: No -SaveWorkspace: No -AlwaysSaveHistory: Default - -EnableCodeIndexing: Yes -UseSpacesForTab: Yes -NumSpacesForTab: 2 -Encoding: UTF-8 - -RnwWeave: Sweave -LaTeX: pdfLaTeX - -AutoAppendNewline: Yes -StripTrailingWhitespace: Yes - -BuildType: Package -PackageUseDevtools: Yes -PackageInstallArgs: --no-multiarch --with-keep.source -PackageRoxygenize: rd,collate,namespace diff --git a/design/package_structure/design.Rmd b/design/package_structure/design.Rmd new file mode 100644 index 0000000..948463b --- /dev/null +++ b/design/package_structure/design.Rmd @@ -0,0 +1,244 @@ +--- +title: "Design Doc of RobinCar2" +author: Liming Li +--- + +In this doc we will summarize the general design. + +For robust inference, there are several key components: unbiased prediction, robust covariance, and delta method. + +So the general workflow will be + +1. Construct the model. +1. Obtain unbiased prediction, depending on the data +1. Obtain the robust covariance (also with unbiased prediction as input) +1. use delta method to obtain treatment effect and its inference + +## Create dummy data + +Before we start our model, first we need to prepare our data. + +In the following part we create a simple simulation data. +z1 comes from a bernouli distribution of p = 0.5. +z2 comes from a multinomial distribution of p = (0.4, 0.3, 0.3) +cov1 comes from a normal distribution. +trt is based on stratified block randomization, block size is 6, stratified by z1 and z2. +y is the response, the formula is + +exp(y) = 0.2 + 0.5 * (trt = "trt1") + 0.7 * (trt = "trt2") + 1.5 * (z1 = "b") - 1 * (z2 = y) - 0.5 * (z2 = z) + 0.2 * cov + +```{r} +library(stats) +n <- 200 +block_rand <- function(block = c(0, 0, 1, 1)) { + function(n) { + r <- lapply(seq_len(ceiling(n / length(block))), function(i) { + sample(block) + }) + unlist(r)[1:n] + } +} +by_strata <- function(f, strata) { + ret <- rep(NA, length(strata)) + for (x in split(seq_len(length(strata)), strata)) { + ret[x] <- f(length(x)) + } + return(ret) +} +sim_data <- function(n) { + cov1 <- rnorm(n) + z1 <- sample(size = n, factor(c("a", "b")), c(0.5, 0.5), replace = TRUE) + z2 <- sample(size = n, factor(c("x", "y", "z")), c(0.4, 0.3, 0.3), replace = TRUE) + permute_block <- c(0, 0, 1, 1) + trt <- by_strata(block_rand(c("trt1", "trt1", "trt2", "trt2", "pbo", "pbo")), interaction(z1, z2)) + trt <- factor(trt, levels = c("pbo", "trt1", "trt2")) + df <- data.frame(trt, z1, z2, cov1) + x_mat <- model.matrix(~ trt + z1 + z2 + cov1, data = df) + coef <- c(0.2, 0.5, 0.7, 1.5, -1, -0.5, 0.2) + theta <- x_mat %*% coef + y <- plogis(theta) + y_bin <- as.integer(runif(n) < y) + + df$y <- y_bin + df +} + +d <- sim_data(500) +``` + +## Construct the model + +In this part, we will create the model based on the data provided. +Let's begin with binary outcome: + +```{r} +fit <- glm(y ~ trt:z1 + trt, family = binomial(), data = d) +``` + +## Obtain the predictions + +```{r} +predict_cf <- function(fit, trt, data, ...) { + UseMethod("predict_cf") +} + +predict_cf.glm <- function(fit, trt, data, ...) { + lvls <- fit$xlevels[[trt]] + # data has to be a data.frame + df <- rbind(fit$model, fit$model, fit$model) + df[[trt]] <- factor(rep(lvls, each = nrow(fit$model)), levels = lvls) + preds <- predict(fit, type = "response", newdata = df) + matrix(preds, ncol = length(lvls), dimnames = list(row.names(fit$model), lvls)) +} +``` + +with this `predict_cf` function we already obtain the counterfactual response + +```{r} +pred <- predict_cf(fit, "trt") +pred +``` + +## Obtain the biasness + +```{r} +bias <- function(fit, trt, strat, data) { + UseMethod("bias") +} +bias.glm <- function(fit, trt, strat, data) { + trt_var <- fit$model[[trt]] + if (length(strat) != 0) { + strat_var <- fit$data[, strat] + } else { + strat_var <- rep(0L, length(fit$y)) + } + residuals <- fit$y - fit$fitted.values + d <- matrix(NA_real_, nrow = length(fit$y), ncol = length(fit$xlevels[[trt]])) + id_strat <- split(seq_len(length(residuals)), strat_var) + for (i in id_strat) { + df <- vapply(split(residuals[i], trt_var[i]), function(xx) mean(xx), FUN.VALUE = 0) + d[i, ] <- matrix(df, nrow = length(i), ncol = length(df), byrow = TRUE) + } + d +} +``` + +## Ensure the unbiasness + +to ensure the unbiasness, the predict_cf need additional argument + +```{r} +predict_cf.glm <- function(fit, trt, data, ensure_unbias = TRUE, strata = NULL, ...) { + lvls <- fit$xlevels[[trt]] + # data has to be a data.frame + df <- rbind(fit$model, fit$model, fit$model) + df[[trt]] <- factor(rep(lvls, each = nrow(fit$model)), levels = lvls) + preds <- predict(fit, type = "response", newdata = df) + ret <- matrix(preds, ncol = length(lvls), dimnames = list(row.names(fit$model), lvls)) + if (ensure_unbias) { + ret <- ret - bias(fit, trt, strata) + } + ret +} +``` + +```{r} +pred <- predict_cf(fit, "trt") +``` + + +## Robust variance + +```{r} +adjust_pi <- function(pi_t) { + diag(pi_t) - pi_t %*% t(pi_t) +} + +get_erb <- function(resi, strata, trt, pit, randomization) { + if (length(strata) == 0) { + return(0) + } + if (randomization %in% c("simple", "pocock-simon")) { + return(0) + } + # Calculate Omega Z under simple + omegaz_sr <- adjust_pi(pit) + idx <- split(seq_len(length(resi)), cbind(trt, strata)) + resi_per_strata <- vapply(idx, function(ii) mean(resi[ii]), FUN.VALUE = 0) + # Calculate strata levels and proportions for + # the outer expectation + + strata_props <- vapply(idx, length, FUN.VALUE = 0L) + strata_props <- strata_props / sum(strata_props) + # Estimate R(B) by first getting the conditional expectation + # vector for a particular strata (vector contains + # all treatment groups), then dividing by the pi_t + + rb_z <- resi_per_strata / as.numeric(pit) + # Compute the R(B)[Omega_{SR} - Omega_{Z_i}]R(B) | Z_i + # for each Z_i + strata_levels <- length(resi_per_strata) + n_trt <- length(pit) + ind <- matrix(seq_len(strata_levels), byrow = TRUE, ncol = n_trt) + rb_z_sum <- lapply( + seq_len(nrow(ind)), + function(x) rb_z[ind[x, ]] %*% t(rb_z[ind[x, ]]) * sum(strata_props[ind[x, ]]) + ) + rb_z_sum <- Reduce(`+`, rb_z_sum) + rb_z_sum * omegaz_sr +} + +vcov_robin <- function(fit, ...) { + UseMethod("vcov_robin") +} + +are <- function(objs, cls) { + all(vapply(objs, is, class2 = cls, FUN.VALUE = TRUE)) +} + +vcov_robin.glm <- function(fit, trt, strat = NULL, preds = predict_cf(fit, trt, strat), sr_decompose = TRUE, randomization = "simple", ...) { + raw_data <- fit$data + resi <- residuals(fit, type = "response") + est <- colMeans(preds) + var_preds <- var(preds) + pit <- as.numeric(table(raw_data[[trt]]) / nrow(raw_data)) + trt_lvls <- fit$xlevels[[trt]] + + idx <- split(seq_len(nrow(raw_data)), raw_data[[trt]]) + cov_ymu <- vapply(idx, function(is) stats::cov(fit$y[is], preds[is, ]), FUN.VALUE = rep(0, ncol(preds))) + + vcov_sr <- vapply(idx, function(is) stats::var(fit$y[is]), FUN.VALUE = 0) / pit + if (sr_decompose) { + vcov_sr <- vcov_sr + (diag(var_preds) - 2 * diag(cov_ymu)) / pit + } + v <- diag(vcov_sr) + cov_ymu + t(cov_ymu) - var_preds + if (!randomization %in% c("simple", "pocock-simon") && length(strat) > 0) { + v <- v - get_erb(resi, raw_data[strat], raw_data[[trt]], pit, randomization) + } + ret <- v / length(resi) + dimnames(ret) <- list(trt_lvls, trt_lvls) + return(ret) +} +vcov_robin(fit, "trt", "z1") +``` + + +## robincar_glm wrapper + +```{r} +robincar_glm2 <- function(data, formula, trt, strata, car_scheme, ...) { + fit <- glm(formula = formula, data = data, ...) + pred <- predict_cf(fit, trt, ensure_unbias = TRUE) + var <- vcov_robin(fit, trt = trt, strat = strata, randomization = car_scheme, preds = pred) + list( + pred = colMeans(pred), + var = var + ) +} +robincar_glm(d, formula = y ~ z1:trt + trt, "trt", "z1", "coin") +``` + +potentially, there can be other arguments just like `RobinCar::robincar_glm`. +The formula can be inferred from the choices of "ANOVA", "ANCOVA", "ANHECOVA", etc. if not provided. + +All these functions can be exported for direct usage, and also the wrapper is provided.