diff --git a/vignettes/functional_mvmapit.Rmd b/vignettes/functional_mvmapit.Rmd new file mode 100644 index 0000000..cb855c9 --- /dev/null +++ b/vignettes/functional_mvmapit.Rmd @@ -0,0 +1,311 @@ +--- +title: "functional_mvmapit" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{functional_mvmapit} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +```{r setup} +library(mvMAPIT) +library(tidyr) +library(dplyr) +``` + + + +```{r} +# seed <- sample(1:1e6, 1)#67132 + +n_samples <- c(1000, 2000)#2938 +n_snp <- 5747 +d <- 1 +PVE <- 0.6 +rho <- c(0.5, 0.8) +n_causal <- 1000 +n_trait_specific <- 0 +n_pleiotropic <- 10 +group_ratio_pleiotropic <- 1 +epistatic_correlation <- 0.9 +maf <- 0.05 +n_rand_snps <- c(10,100) * (n_pleiotropic) +num_experiments <- 10 +parameters <- crossing(n_samples, rho, n_rand_snps) +``` + +```{r simulations} +# set.seed(1234) + +RESULT <- NULL +P <- NULL +for (k in 1:nrow(parameters)) { + n_samples <- parameters[[k, 1]] + rho <- parameters[[k, 2]] + n_rand_snps <- parameters[[k, 3]] + maf <- 0.05 + 0.45 * runif(n_snp) + random_minor_allele_counts <- + (runif(n_samples * n_snp) < maf) + (runif(n_samples * n_snp) < maf) + genotype_data <- matrix( + random_minor_allele_counts, + nrow = n_samples, + ncol = n_snp, + byrow = TRUE, + ) + + sample_names <- seq_len(n_samples) %>% sprintf(fmt = "id%04d") + snp_names <- seq_len(n_snp) %>% sprintf(fmt = "snp%04d") + + colnames(genotype_data) <- snp_names + rownames(genotype_data) <- sample_names + + # + for (l in seq_len(num_experiments)) { + seed <- sample(1:1e6, 1) + simulated_data <- simulate_traits( + genotype_data, + n_causal = n_causal, + n_trait_specific = n_trait_specific, + n_pleiotropic = n_pleiotropic, + d = d, + H2 = PVE, + rho = rho, + epistatic_correlation = epistatic_correlation, + group_ratio_pleiotropic = group_ratio_pleiotropic, + maf_threshold = 0.05, + seed = seed + ) + epi <- which(grepl("epi", colnames(simulated_data$genotype))) + random_snps <- sample(1:n_snp, n_rand_snps) + mask_snps <- unique(c(epi, random_snps)) + mvmapit_normal_full <- mvmapit( + t(simulated_data$genotype), + t(simulated_data$trait), + test = "normal", + variantIndex = epi + ) + pv_full <- mvmapit_normal_full$pvalues %>% + drop_na() %>% + select(-trait) %>% + mutate(covar = "full", + n = n_samples, + rho = rho, + n_rand_snps = n_snp - length(epi) - 1, + trial = l) + W <- matrix(0, nrow = n_snp, ncol = n_snp) + for (i in mask_snps) { + for (j in mask_snps) { + W[i, j] = 1 + } + } + mvmapit_normal_masked <- mvmapit( + t(simulated_data$genotype), + t(simulated_data$trait), + W = W, + test = "normal", + variantIndex = epi + ) + pv_masked <- mvmapit_normal_masked$pvalues %>% + drop_na() %>% + select(-trait) %>% + mutate(covar = "masked", + n = n_samples, + rho = rho, + n_rand_snps = n_rand_snps, + trial = l) + p <- pv_masked %>% + rbind(pv_full) + P <- rbind(P, p) + } + +} +``` +```{r} +RESULT %>% + group_by(covar, alpha, n, rho, n_rand_snps) %>% + summarise(se = sd(power) / sqrt(num_experiments), + power = mean(power)) +``` +```{r} + RES05 <- P %>% + group_by(covar, n, rho, n_rand_snps, trial) %>% + summarise( + power = sum(p < 0.05) / n() + ) %>% + mutate(alpha = 0.05) + RES01 <- P %>% + group_by(covar, n, rho, n_rand_snps, trial) %>% + summarise( + power = sum(p < 0.01) / n() + ) %>% + mutate(alpha = 0.01) + RES001 <- P %>% + group_by(covar, n, rho, n_rand_snps, trial) %>% + summarise( + power = sum(p < 0.001) / n() + ) %>% + mutate(alpha = 0.001) +RES <- RES05 %>% rbind(RES01) %>% rbind(RES001) +PLOT <- RES %>% + group_by(alpha, rho, n, n_rand_snps,covar) %>% + summarise(se = sd(power) / sqrt(num_experiments), + power = mean(power)) +``` + +```{r} +library(ggplot2) +gg <- PLOT %>% + filter(n == 2000) %>% + mutate(n_rand_snps = case_when( + n_rand_snps == "NA" ~ "5737", + TRUE ~ n_rand_snps)) %>% + ggplot(aes(x=n_rand_snps, y=power, fill=covar)) + + geom_bar(stat="identity") + + geom_errorbar(aes(ymin=power-se, ymax=power+se), width=.2, + position=position_dodge(.9)) + + facet_grid(rho ~ alpha, labeller = label_both) + + theme_minimal() + + scale_fill_manual(values = c("Indianred", "SteelBlue")) + + labs( + x = "# Non-epistatic SNPs in covar", + y = "Power") + + theme(legend.position = "none") + +show(gg) +ggsave(paste0("proof-of-concept", ".png"), plot = gg, width = 6, height = 4, dpi = 300) +``` + +```{r bhadur} +library(tidyr) +library(dplyr) +n_samples <- c(100, 1000, 2000) #2938 +N <- max(n_samples) +n_snp <- 6000 +d <- 1 +PVE <- 0.6 +rho <- 0.8 +n_causal <- 50 +n_trait_specific <- 0 +n_pleiotropic <- 10 +group_ratio_pleiotropic <- 1 +epistatic_correlation <- 0.9 +maf <- 0.05 +n_rand_snps <- c(100, 1000, 5000) +num_experiments <- 5 +parameters <- crossing(n_samples, n_rand_snps) + maf <- 0.05 + 0.45 * runif(n_snp) + random_minor_allele_counts <- + (runif(N * n_snp) < maf) + (runif(N * n_snp) < maf) + genotype_data <- matrix( + random_minor_allele_counts, + nrow = N, + ncol = n_snp, + byrow = TRUE, + ) + +sample_names <- seq_len(N) %>% sprintf(fmt = "id%04d") +snp_names <- seq_len(n_snp) %>% sprintf(fmt = "snp%04d") +colnames(genotype_data) <- snp_names +rownames(genotype_data) <- sample_names +seed <- sample(1:1e6, 1) +simulated_data <- simulate_traits( + genotype_data, + n_causal = n_causal, + n_trait_specific = n_trait_specific, + n_pleiotropic = n_pleiotropic, + d = d, + H2 = PVE, + rho = rho, + epistatic_correlation = epistatic_correlation, + group_ratio_pleiotropic = group_ratio_pleiotropic, + maf_threshold = 0.05, + seed = seed + ) + +``` + +```{r} +RESULT <- NULL +P2 <- NULL +for (k in 1:nrow(parameters)) { + n_samples <- parameters[[k, 1]] + n_rand_snps <- parameters[[k, 2]] +ind <- sample(1:N, n_samples, replace = FALSE) + + # +print(n_samples) +print(n_rand_snps) + for (l in seq_len(num_experiments)) { + + epi <- which(grepl("epi", colnames(simulated_data$genotype))) + random_snps <- sample(1:n_snp, n_rand_snps) + mask_snps <- unique(c(epi, random_snps)) + W <- matrix(0, nrow = n_snp, ncol = n_snp) + for (i in mask_snps) { + for (j in mask_snps) { + W[i, j] = 1 + } + } + mvmapit_normal_masked <- mvmapit( + t(simulated_data$genotype[ind,]), + t(simulated_data$trait[ind,]), + W = W, + test = "normal", + variantIndex = epi + ) + p <- mvmapit_normal_masked$pvalues %>% + drop_na() %>% + select(-trait) %>% + mutate(n = n_samples, + rho = rho, + n_rand_snps = n_rand_snps, + trial = l) + P2 <- rbind(P2, p) + } +} +``` +```{r} +library(ggplot2) +P2 %>% + select(n, p, n_rand_snps) %>% + mutate(rand = as.factor(n_rand_snps), + pplot = -log10(p)) %>% + group_by(n_rand_snps) %>% + ggplot(aes(x = n, y = p, color = rand) ) + + facet_wrap(rho) + + geom_point() + + geom_smooth(method = "lm", se = FALSE) +``` +```{r} +bhadur <- P2 %>% + group_by(id, n, rho, n_rand_snps) %>% + summarise(p = mean(p)) %>% + ungroup() %>% + nest(-id, -rho, -n_rand_snps) %>% + mutate(slope = map_dbl(data, ~coef(lm(p ~ n, data = .x))[["n"]])) %>% + unnest(data) %>% + group_by(rho, n_rand_snps) %>% + mutate(m = mean(slope), + signal_to_noise = as.factor(10/n_rand_snps)) %>% + ggplot(aes(x = n, y = p, color = signal_to_noise) ) + + # geom_point() + + geom_abline(aes(slope=m, intercept = 1, color = signal_to_noise), size=2) + + xlim(0, 10000) + + theme_minimal() + + scale_fill_manual(values = c("Indianred", "SteelBlue", "Aquamarine")) + + labs( + x = "Sample Size", + y = "Empirical p-value decrease") + +show(bhadur) +ggsave(paste0("proof-of-concept-bhadur", ".png"), plot = bhadur, width = 6, height = 4, dpi = 300) + +``` +