-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathedgeR_module.R
123 lines (106 loc) · 4.88 KB
/
edgeR_module.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#! R
#' edgeR module taking metadata_csv and raw count data frame to run DE analysis
#'
#' @param count_data object in matrix format (integers) with rownames as genes and colnames as sample IDs
#' @param anno_tb data frame of ensembl_gene_id - external_gene_id mappings for annotation
#' @param tpm_tb data frame of tpms to append to results for output
#' @param tag string used to prefix output
#' @param metadata_csv path to file holding metadata
#' @param metadata_design string of design using colnames of metadata_csv
#' @param control_reference string indicating the 'control' intgroup for last element of design
#' @param output_dir path to where output goes
#' @param ens_version ENSEMBL version to use
#' @param org_prefix organism prefix from ENSEMBL
#' @param delim_samples character to delimit sample IDs (default: ".")
#' @return nz_fc_co raw count object with no lines summing to zeros
#' @importFrom magrittr '%>%'
#' @export
edgeR_module <- function(count_data, anno_tb = NULL, tpm_tb = NULL, tag = NULL, metadata_csv = NULL, metadata_design = NULL, control_reference = NULL, output_dir = NULL, delim_samples = "\\."){
print("Running: edgeR_module()")
##warnigns for undefined param
if(is.null(tag)){
stop("Please provide a tag to name your outputs")
}
if(is.null(metadata_csv)){
stop("Please specify a metadata_csv file in CSV format")
}
if(is.null(metadata_design)){
print("Please specify a metadata_design for DESeq2")
print("NB this should be format: first + second + ... + last")
print("NBB that 'last' in design is condition of interest in DE analysis")
stop("NBBB that all elements of the design must be names of columns in 'metadata' file")
}
if(is.null(output_dir)){
print("No directory specified for output, the current dir will be used")
output_dir <- "./edgeR"
} else {
output_dir <- paste0(output_dir, "/edgeR")
dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)
}
##read in condition data
cond <- readr::read_csv(metadata_csv)
cond_df <- as.data.frame(cond)
rownames(cond_df) <- cond_df[,1]
##overlap w/count_data and sort
cond_df <- cond_df[rownames(cond_df) %in% colnames(count_data),]
cond_df <- cond_df[sort(rownames(cond_df)),]
##break design into components
design_vec <- unlist(lapply(metadata_design, function(f){gsub(" ", "", strsplit(f, "\\+")[[1]])}))
##take only cols of interest and define major condition
cond_df <- cond_df[,colnames(cond_df) %in% c("sample", design_vec)]
CONDITION <- rev(design_vec)[1]
##create factors
for(x in 1:length(colnames(cond_df))){
cond_df[,x] <- factor(cond_df[,x])
}
##change reference level if specified
if(!is.null(control_reference)){
cond_df[,CONDITION] <- relevel(cond_df[,CONDITION], ref = control_reference)
}
y <- edgeR::DGEList(counts = count_data,
samples = cond_df,
group = cond_df[,CONDITION])
keep <- edgeR::filterByExpr(y)
y <- y[keep,,keep.lib.sizes=FALSE]
y <- edgeR::calcNormFactors(y)
edgeR_design <- model.matrix(formula(paste0("~ 0 + ", metadata_design)), data = cond_df)
y <- edgeR::estimateDisp(y, edgeR_design)
##make all contrasts of CONDITION, then set into named list
contrasts <- apply(t(combn(levels(cond_df[,CONDITION]),2)), 1, function(f){
paste(paste0(CONDITION, f), collapse="-")
})
edgeR_res_list <- lapply(seq_along(contrasts), function(f){
# tt_fit <- edgeR::glmQLFTest(fit, coef = f)
exac_test <- edgeR::exactTest(y, pair = gsub(CONDITION, "", rev(strsplit(contrasts[f], "-")[[1]])))
ress_tb <- tibble::as_tibble(as.data.frame(edgeR::topTags(exac_test,
n = Inf,
adjust.method = "BH")),
rownames = "ensembl_gene_id")
if(!is.null(anno_tb)){
if(class(anno_tb)[1] != "tbl_df"){
anno_tb <- tibble::as_tibble(anno_tb)
}
if(any(colnames(anno_tb) %in% "target_id")==TRUE){
anno_tb <- anno_tb %>% dplyr::select(-target_id)
}
ress_tb <- dplyr::left_join(anno_tb, ress_tb) %>%
na.omit() %>%
dplyr::arrange(FDR) %>%
tibble::as_tibble() %>%
dplyr::distinct()
}
if(!is.null(tpm_tb)){
num_cols <- unlist(lapply(tpm_tb[1,], is.numeric))
colnames(tpm_tb)[num_cols] <- paste0(colnames(tpm_tb)[num_cols], "_tpm")
ress_tb <- dplyr::left_join(ress_tb, tpm_tb) %>%
na.omit() %>%
dplyr::arrange(FDR) %>%
tibble::as_tibble() %>%
dplyr::distinct()
}
ress_tb <- ress_tb %>% dplyr::rename("padj" = FDR)
readr::write_tsv(ress_tb, paste0(output_dir, "/", tag, ".res.", contrasts[f], ".edgeR.tsv"))
})
names(edgeR_res_list) <- contrasts
saveRDS(edgeR_res_list, file = paste0(output_dir, "/", tag, ".edgeR_res_list.rds"))
}