-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlimma_module.R
140 lines (121 loc) · 5.79 KB
/
limma_module.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#! R
#' Limma module taking metadata_csv and raw count data frame to run DE analysis
#'
#' @param count_data object in matrix format (integers) with rownames as genes and colnames as sample IDs
#' @param anno_tb data frame of ensembl_gene_id - external_gene_id mappings for annotation
#' @param tpm_tb data frame of tpms to append to results for output
#' @param tag string used to prefix output
#' @param metadata_csv path to file holding metadata
#' @param metadata_design string of design using colnames of metadata_csv BN that final entry is the
#' @param control_reference string indicating the 'control' intgroup for last element of design
#' @param output_dir path to where output goes
#' @param delim_samples character to delimit sample IDs (default: ".")
#' @return nz_fc_co raw count object with no lines summing to zeros
#' @importFrom magrittr '%>%'
#' @export
limma_module <- function(count_data, anno_tb = NULL, tpm_tb = NULL, tag = NULL, metadata_csv = NULL, metadata_design = NULL, control_reference = NULL, output_dir = NULL, delim_samples = "\\.", run_voom = FALSE){
print("Running: limma_module()")
##warnigns for undefined param
if(is.null(tag)){
stop("Please provide a tag to name your outputs")
}
if(is.null(metadata_csv)){
stop("Please specify a metadata_csv file in CSV format")
}
if(is.null(metadata_design)){
print("Please specify a metadata_design for DESeq2")
print("NB this should be format: first + second + ... + last")
print("NBB that 'last' in design is condition of interest in DE analysis")
stop("NBBB that all elements of the design must be names of columns in 'metadata' file")
}
if(is.null(output_dir)){
print("No directory specified for output, the current dir will be used")
output_dir <- "./limma"
} else {
output_dir <- paste0(output_dir, "/limma")
dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)
}
##read in condition data
cond <- readr::read_csv(metadata_csv)
cond_df <- as.data.frame(cond)
rownames(cond_df) <- cond_df[,1]
##overlap w/count_data and sort
cond_df <- cond_df[rownames(cond_df) %in% colnames(count_data),]
cond_df <- cond_df[sort(rownames(cond_df)),]
##break design into components
design_vec <- unlist(lapply(metadata_design, function(f){gsub(" ", "", strsplit(f, "\\+")[[1]])}))
##take only cols of interest and define major condition
cond_df <- cond_df[,colnames(cond_df) %in% c("sample", design_vec)]
CONDITION <- rev(design_vec)[1]
##create factors
for(x in 1:length(colnames(cond_df))){
cond_df[,x] <- factor(cond_df[,x])
}
##change reference level if specified
if(!is.null(control_reference)){
cond_df[,CONDITION] <- relevel(cond_df[,CONDITION], ref = control_reference)
}
##ExpressionSet object
eset <- Biobase::ExpressionSet(assayData = as.matrix(count_data),
phenoData = Biobase::AnnotatedDataFrame(cond_df))
##design
limma_design <- model.matrix(formula(paste0("~ 0 + ", CONDITION)), data = cond_df)
if(run_voom == TRUE){
##VOOM - Removing heteroscedascity from count data
pdf(paste0(output_dir, "/", tag, ".limma_voom.pdf"))
limma_voom <- limma::voom(counts = eset, design = limma_design, plot = TRUE)
dev.off()
limma_fit <- limma::lmFit(limma_voom, limma_design)
}
##make all contrasts of CONDITION, then set into named list
contrasts <- apply(t(combn(levels(cond_df[,CONDITION]),2)), 1, function(f){
paste(paste0(CONDITION, f), collapse=" - ")
})
limma_contrasts <- limma::makeContrasts(contrasts = contrasts,
levels = paste0(CONDITION, levels(cond_df[,CONDITION])))
if(run_voom == FALSE){
limma_dgelist <- edgeR::DGEList(counts = count_data,
samples = cond_df,
group = cond_df[,CONDITION])
keep <- edgeR::filterByExpr(limma_dgelist)
limma_dgelist <- limma_dgelist[keep,,keep.lib.sizes=FALSE]
limma_dgelist <- edgeR::calcNormFactors(limma_dgelist)
limma_logCPM <- edgeR::cpm(limma_dgelist, log=TRUE, prior.count=3)
limma_fit <- limma::lmFit(limma_logCPM, limma_design)
}
limma_fit$coefficients <- limma_fit$coefficients[,colnames(limma_fit$coefficients) %in% rownames(limma_contrasts)]
limma_fitc <- limma::contrasts.fit(fit = limma_fit, contrasts = limma_contrasts)
limma_fite <- limma::eBayes(limma_fitc, trend=TRUE)
limma_res_list <- lapply(seq_along(contrasts), function(f){
ress_tb <- tibble::as_tibble(limma::topTable(fit = limma_fite,
coef = f,
number = Inf,
adjust.method = "fdr"), rownames = "ensembl_gene_id")
if(!is.null(anno_tb)){
if(class(anno_tb)[1] != "tbl_df"){
anno_tb <- tibble::as_tibble(anno_tb)
}
if(any(colnames(anno_tb) %in% "target_id")==TRUE){
anno_tb <- anno_tb %>% dplyr::select(-target_id)
}
ress_tb <- dplyr::left_join(anno_tb, ress_tb) %>%
na.omit() %>%
dplyr::arrange(adj.P.Val) %>%
tibble::as_tibble() %>%
dplyr::distinct()
}
if(!is.null(tpm_tb)){
num_cols <- unlist(lapply(tpm_tb[1,], is.numeric))
colnames(tpm_tb)[num_cols] <- paste0(colnames(tpm_tb)[num_cols], "_tpm")
ress_tb <- dplyr::left_join(ress_tb, tpm_tb) %>%
na.omit() %>%
tibble::as_tibble() %>%
dplyr::distinct() %>%
dplyr::arrange(adj.P.Val)
}
ress_tb <- ress_tb %>% dplyr::rename("padj" = adj.P.Val)
readr::write_tsv(ress_tb, paste0(output_dir, "/", tag, ".res.", gsub(" ", "", contrasts[f]), ".limma.tsv"))
})
names(limma_res_list) <- gsub(" ", "", contrasts)
saveRDS(limma_res_list, file = paste0(output_dir, "/", tag, ".limma_res_list.rds"))
}