From 2d41a6d9b6a1d7cedc4de9530c3dbbcd5ca04c86 Mon Sep 17 00:00:00 2001
From: Julian Stamp <julian.d.stamp@gmail.com>
Date: Mon, 26 Feb 2024 13:31:11 -0500
Subject: [PATCH] MM-134 working examples

---
 vignettes/functional_mvmapit.Rmd | 311 +++++++++++++++++++++++++++++++
 1 file changed, 311 insertions(+)
 create mode 100644 vignettes/functional_mvmapit.Rmd

diff --git a/vignettes/functional_mvmapit.Rmd b/vignettes/functional_mvmapit.Rmd
new file mode 100644
index 0000000..cb855c9
--- /dev/null
+++ b/vignettes/functional_mvmapit.Rmd
@@ -0,0 +1,311 @@
+---
+title: "functional_mvmapit"
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{functional_mvmapit}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+```{r, include = FALSE}
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>"
+)
+```
+
+```{r setup}
+library(mvMAPIT)
+library(tidyr)
+library(dplyr)
+```
+
+
+
+```{r}
+# seed <- sample(1:1e6, 1)#67132
+
+n_samples <- c(1000, 2000)#2938
+n_snp <- 5747
+d <- 1
+PVE <- 0.6
+rho <- c(0.5, 0.8)
+n_causal <- 1000
+n_trait_specific <- 0
+n_pleiotropic <- 10
+group_ratio_pleiotropic <- 1
+epistatic_correlation <- 0.9
+maf <- 0.05
+n_rand_snps <- c(10,100) * (n_pleiotropic)
+num_experiments <- 10
+parameters <- crossing(n_samples, rho, n_rand_snps)
+```
+
+```{r simulations}
+# set.seed(1234)
+
+RESULT <- NULL
+P <- NULL
+for (k in 1:nrow(parameters)) {
+  n_samples <- parameters[[k, 1]]
+  rho <- parameters[[k, 2]]
+  n_rand_snps <- parameters[[k, 3]]
+  maf <- 0.05 + 0.45 * runif(n_snp)
+  random_minor_allele_counts   <-
+    (runif(n_samples * n_snp) < maf) + (runif(n_samples * n_snp) < maf)
+  genotype_data <- matrix(
+    random_minor_allele_counts,
+    nrow = n_samples,
+    ncol = n_snp,
+    byrow = TRUE,
+  )
+  
+  sample_names <- seq_len(n_samples) %>% sprintf(fmt = "id%04d")
+  snp_names <- seq_len(n_snp) %>% sprintf(fmt = "snp%04d")
+  
+  colnames(genotype_data) <- snp_names
+  rownames(genotype_data) <- sample_names
+  
+  #
+  for (l in seq_len(num_experiments)) {
+    seed <- sample(1:1e6, 1)
+    simulated_data <- simulate_traits(
+      genotype_data,
+      n_causal = n_causal,
+      n_trait_specific = n_trait_specific,
+      n_pleiotropic = n_pleiotropic,
+      d = d,
+      H2 = PVE,
+      rho = rho,
+      epistatic_correlation = epistatic_correlation,
+      group_ratio_pleiotropic = group_ratio_pleiotropic,
+      maf_threshold = 0.05,
+      seed = seed
+    )
+    epi <- which(grepl("epi", colnames(simulated_data$genotype)))
+    random_snps <- sample(1:n_snp, n_rand_snps)
+    mask_snps <- unique(c(epi, random_snps))
+    mvmapit_normal_full <- mvmapit(
+      t(simulated_data$genotype),
+      t(simulated_data$trait),
+      test = "normal",
+      variantIndex = epi
+    )
+    pv_full <- mvmapit_normal_full$pvalues %>%
+      drop_na() %>%
+      select(-trait) %>%
+      mutate(covar = "full",
+             n = n_samples,
+             rho = rho,
+             n_rand_snps = n_snp - length(epi) - 1,
+             trial = l)
+    W <- matrix(0, nrow = n_snp, ncol = n_snp)
+    for (i in mask_snps) {
+      for (j in mask_snps) {
+        W[i, j] = 1
+      }
+    }
+    mvmapit_normal_masked <- mvmapit(
+      t(simulated_data$genotype),
+      t(simulated_data$trait),
+      W = W,
+      test = "normal",
+      variantIndex = epi
+    )
+    pv_masked <- mvmapit_normal_masked$pvalues %>%
+      drop_na() %>%
+      select(-trait) %>%
+      mutate(covar = "masked",
+             n = n_samples,
+             rho = rho,
+             n_rand_snps = n_rand_snps,
+             trial = l)
+    p <- pv_masked %>%
+      rbind(pv_full)
+    P <- rbind(P, p)
+  }
+  
+}
+```
+```{r}
+RESULT %>% 
+  group_by(covar, alpha, n, rho, n_rand_snps) %>%
+  summarise(se = sd(power) / sqrt(num_experiments),
+            power = mean(power))
+```
+```{r}
+    RES05 <- P %>%
+      group_by(covar, n, rho, n_rand_snps, trial) %>%
+      summarise(
+        power = sum(p < 0.05) / n()
+      ) %>%
+  mutate(alpha = 0.05)
+    RES01 <- P %>%
+      group_by(covar, n, rho, n_rand_snps, trial) %>%
+      summarise(
+        power = sum(p < 0.01) / n()
+      ) %>%
+  mutate(alpha = 0.01)
+    RES001 <- P %>%
+      group_by(covar, n, rho, n_rand_snps, trial) %>%
+      summarise(
+        power = sum(p < 0.001) / n()
+      ) %>%
+  mutate(alpha = 0.001)
+RES <- RES05 %>% rbind(RES01) %>% rbind(RES001)
+PLOT <- RES  %>% 
+  group_by(alpha, rho,  n, n_rand_snps,covar) %>%
+  summarise(se = sd(power) / sqrt(num_experiments),
+            power = mean(power))
+```
+
+```{r}
+library(ggplot2)
+gg <- PLOT %>%
+  filter(n == 2000) %>%
+  mutate(n_rand_snps = case_when(
+    n_rand_snps == "NA" ~ "5737",
+                   TRUE ~ n_rand_snps)) %>%
+  ggplot(aes(x=n_rand_snps, y=power, fill=covar)) +
+  geom_bar(stat="identity") +
+  geom_errorbar(aes(ymin=power-se, ymax=power+se), width=.2,
+                 position=position_dodge(.9)) +
+  facet_grid(rho ~ alpha, labeller = label_both) +
+  theme_minimal() +
+  scale_fill_manual(values = c("Indianred", "SteelBlue")) +
+  labs(
+    x = "# Non-epistatic SNPs in covar",
+    y = "Power") +
+  theme(legend.position = "none")
+  
+show(gg)
+ggsave(paste0("proof-of-concept", ".png"), plot = gg, width = 6, height = 4, dpi = 300)
+```
+
+```{r bhadur}
+library(tidyr)
+library(dplyr)
+n_samples <- c(100, 1000, 2000) #2938
+N <- max(n_samples)
+n_snp <- 6000
+d <- 1
+PVE <- 0.6
+rho <- 0.8
+n_causal <- 50
+n_trait_specific <- 0
+n_pleiotropic <- 10
+group_ratio_pleiotropic <- 1
+epistatic_correlation <- 0.9
+maf <- 0.05
+n_rand_snps <- c(100, 1000, 5000)
+num_experiments <- 5
+parameters <- crossing(n_samples, n_rand_snps)
+  maf <- 0.05 + 0.45 * runif(n_snp)
+  random_minor_allele_counts   <-
+    (runif(N * n_snp) < maf) + (runif(N * n_snp) < maf)
+  genotype_data <- matrix(
+    random_minor_allele_counts,
+    nrow = N,
+    ncol = n_snp,
+    byrow = TRUE,
+  )
+  
+sample_names <- seq_len(N) %>% sprintf(fmt = "id%04d")
+snp_names <- seq_len(n_snp) %>% sprintf(fmt = "snp%04d")
+colnames(genotype_data) <- snp_names
+rownames(genotype_data) <- sample_names
+seed <- sample(1:1e6, 1)
+simulated_data <- simulate_traits(
+  genotype_data,
+  n_causal = n_causal,
+  n_trait_specific = n_trait_specific,
+  n_pleiotropic = n_pleiotropic,
+  d = d,
+  H2 = PVE,
+  rho = rho,
+  epistatic_correlation = epistatic_correlation,
+  group_ratio_pleiotropic = group_ratio_pleiotropic,
+  maf_threshold = 0.05,
+  seed = seed
+  )
+  
+```
+
+```{r}
+RESULT <- NULL
+P2 <- NULL
+for (k in 1:nrow(parameters)) {
+  n_samples <- parameters[[k, 1]]
+  n_rand_snps <- parameters[[k, 2]]
+ind <- sample(1:N, n_samples, replace = FALSE)
+  
+  #
+print(n_samples)
+print(n_rand_snps)
+  for (l in seq_len(num_experiments)) {
+
+    epi <- which(grepl("epi", colnames(simulated_data$genotype)))
+    random_snps <- sample(1:n_snp, n_rand_snps)
+    mask_snps <- unique(c(epi, random_snps))
+    W <- matrix(0, nrow = n_snp, ncol = n_snp)
+    for (i in mask_snps) {
+      for (j in mask_snps) {
+        W[i, j] = 1
+      }
+    }
+    mvmapit_normal_masked <- mvmapit(
+      t(simulated_data$genotype[ind,]),
+      t(simulated_data$trait[ind,]),
+      W = W,
+      test = "normal",
+      variantIndex = epi
+    )
+    p <- mvmapit_normal_masked$pvalues %>%
+      drop_na() %>%
+      select(-trait) %>%
+      mutate(n = n_samples,
+             rho = rho,
+             n_rand_snps = n_rand_snps,
+             trial = l)
+    P2 <- rbind(P2, p)
+  }
+}
+```
+```{r}
+library(ggplot2)
+P2 %>%
+  select(n, p, n_rand_snps) %>%
+  mutate(rand = as.factor(n_rand_snps),
+         pplot = -log10(p)) %>%
+  group_by(n_rand_snps) %>%
+  ggplot(aes(x = n, y = p, color = rand) ) +
+     facet_wrap(rho) +
+     geom_point() +
+     geom_smooth(method = "lm", se = FALSE)
+```
+```{r}
+bhadur <- P2 %>%
+  group_by(id, n, rho, n_rand_snps) %>%
+  summarise(p = mean(p)) %>%
+  ungroup() %>%
+  nest(-id, -rho, -n_rand_snps) %>%
+  mutate(slope = map_dbl(data, ~coef(lm(p ~ n, data = .x))[["n"]])) %>%
+  unnest(data) %>%
+  group_by(rho, n_rand_snps) %>%
+  mutate(m = mean(slope),
+         signal_to_noise = as.factor(10/n_rand_snps)) %>%
+  ggplot(aes(x = n, y = p, color = signal_to_noise) ) +
+     # geom_point() +
+     geom_abline(aes(slope=m, intercept = 1, color = signal_to_noise), size=2) +
+    xlim(0, 10000) +
+  theme_minimal() +
+  scale_fill_manual(values = c("Indianred", "SteelBlue", "Aquamarine")) +
+  labs(
+    x = "Sample Size",
+    y = "Empirical p-value decrease")
+  
+show(bhadur)
+ggsave(paste0("proof-of-concept-bhadur", ".png"), plot = bhadur, width = 6, height = 4, dpi = 300)
+
+```
+