-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpost_module_joins.R
291 lines (247 loc) · 10.2 KB
/
post_module_joins.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
#! R
#' Master list, join to give found-in-2 and all-3 tables
#'
#' @param input_dir path where DESeq2, limma, edgeR RDS results files are held
#' @param tpm_tb tpm tibble for adding to master_ist_sig ([[x]][[2]])
#' @return list object of 'found-in-2' and 'all-3' results, including unique cols from each module
#' @importFrom magrittr '%>%'
#' @export
master_parse_join <- function(input_dir, tpm_tb = NULL){
print("Running: master_parse_join()")
##read RDS results
rdss <- dir(input_dir, pattern = "_res_list.rds", recursive = TRUE, full.names = TRUE)
master_names <- unlist(lapply(rdss, function(f){
filn <- rev(strsplit(f, "/")[[1]])[1]
gsub("_res_list", "", strsplit(filn, "\\.")[[1]][2])
}))
master_list <- lapply(rdss, function(f){
readRDS(f)
})
names(master_list) <- master_names
##check if multiple annotations arise and fix by aggregating if so
##esp for homology tables (i.e. where !genome_prefix == hsapiens)
master_list_a <- lapply(master_list, function(ff){
mlist <- lapply(ff, function(f){
extg <- f$external_gene_name
ensg <- f$ensembl_gene_id
if(length(table(extg[table(extg)>1])) > 1 | length(table(ensg[table(ensg)>1])) > 1){
print("Found multiple annotations for identifiers, aggregating results")
colns <- grep("_gene", colnames(f), value = TRUE)
if(length(colns) == 4){
##have 4 cols, first 2 to group_by, second 2 to aggregate
f <- group_agg_multi(f, colns)
}
##then aggregate ENS IDs so only single external_gene_name is found
group_agg_two(f, colns)
} else {
return(f)
}
})
return(mlist)
})
##list within names of significant results (p < 0.01)
master_list_sig <- lapply(master_list_a, function(f){
ff <- dplyr:::filter(.data = f[[1]], padj < 0.01)
##add TPM to genes
if(!is.null(tpm_tb)){
ff <- dplyr::left_join(ff, tpm_tb) %>%
dplyr::arrange(external_gene_name)
}
return(ff)
})
names(master_list_sig) <- names(master_list_a)
save(master_list_sig, file = paste0(input_dir, "/RData/", tag, ".master_list_sig.RData"))
return(master_list_a)
}
#' Aggregate columns 3, 4 based on grouping on columns 1, 2
#' NB that table can have multiple annotations in RNAseqon based on contrasts
#' this function removes this 'per-caller'
#'
#' @param f table object with colnames specified in colns
#' @param colns colnames of f on which to aggregate
#' @param pattern string to grep for colnames on which to operate
#' @return agg_f aggregated gene ID, names table
#' @importFrom magrittr '%>%'
#' @export
group_agg_multi <- function(f, colns = NULL, pattern = NULL){
if(is.null(colns) && is.null(pattern)){
stop("Please specify one of 'colns' or 'patterns'")
}
if(is.null(colns)){
colns <- grep(pattern, colnames(f), value = TRUE)
}
agg_f <- f %>% dplyr:::rowwise() %>%
dplyr::distinct(dplyr::across(-!!as.symbol(colns[3])), .keep_all = TRUE) %>%
dplyr::ungroup()
return(agg_f)
}
#' Aggregate column 1 based on grouping by column 2
#'
#' @param f table object with colnames specified in colns
#' @param colns colnames of f on which to aggregate, group in that order!
#' @param pattern string to grep for colnames on which to operate
#' @return agg_f aggregated gene ID, names table
#' @importFrom magrittr '%>%'
#' @export
group_agg_two <- function(f, colns = NULL, pattern = NULL){
if(is.null(colns) && is.null(pattern)){
stop("Please specify one of 'colns' or 'patterns'")
}
if(is.null(colns)){
colns <- grep(pattern, colnames(f), value = TRUE)
}
agg_f <- f %>% dplyr:::rowwise() %>%
dplyr::distinct(dplyr::across(-!!as.symbol(colns[1])), .keep_all = TRUE) %>%
dplyr::ungroup()
return(agg_f)
}
#' Found-in-2
#'
#' @param master_list list of 3 DE results
#' @param padj significance threshold below which to determine significance (adjusted by FDR)
#' @return fitwo_list list of tibble of results including unique cols from each module
#' @importFrom magrittr '%>%'
#' @export
found_in_two <- function(master_list, padj = 0.01){
##pairwise joins
##operate over multiple contrasts
combns <- apply(t(combn(m = 2, names(master_list))), 1, function(f){
paste(f, collapse = "-")
})
fitwo_list <- apply(t(combn(m = 2, names(master_list))), 1, function(f){
per_contrast_list <- lapply(names(master_list[[1]]), function(nm){
print(paste0("Working on found-in-two, contrast: ", nm))
ml1 <- master_list
##rename colnames with method ID
cn1 <- colnames(ml1[[f[1]]][[nm]])
cn2 <- colnames(ml1[[f[2]]][[nm]])
cn1i <- grep("_gene", grep("tpm", cn1, invert = TRUE, value = TRUE), invert = TRUE)
cn2i <- grep("_gene", grep("tpm", cn2, invert = TRUE, value = TRUE), invert = TRUE)
colnames(ml1[[f[1]]][[nm]])[cn1i] <- paste0(f[1], "_", cn1[cn1i])
colnames(ml1[[f[2]]][[nm]])[cn2i] <- paste0(f[2], "_", cn2[cn2i])
f1 <- paste0(f[1], "_padj")
f2 <- paste0(f[2], "_padj")
base::suppressMessages(dplyr::left_join(ml1[[f[1]]][[nm]], ml1[[f[2]]][[nm]])) %>%
dplyr::filter(!!as.symbol(f1) < !!padj & !!as.symbol(f2) < !!padj)
})
names(per_contrast_list) <- names(master_list[[1]])
return(per_contrast_list)
})
names(fitwo_list) <- combns
return(fitwo_list)
}
#' Found-in-3
#'
#' @param master_list list of 3 DE results
#' @param padj significance threshold below which to determine significance (adjusted by FDR)
#' @return per_contrast_list list of tibble of results including unique cols from each module
#' @importFrom magrittr '%>%'
#' @export
found_in_three <- function(master_list, padj = 0.01){
##as per find-in-two
per_contrast_list <- lapply(names(master_list[[1]]), function(nm){
ml1 <- master_list
f <- names(ml1)
print(paste0("Working on found-in-three, contrast: ", nm))
##rename colnames with method ID
cn1 <- colnames(ml1[[f[1]]][[nm]])
cn2 <- colnames(ml1[[f[2]]][[nm]])
cn3 <- colnames(ml1[[f[3]]][[nm]])
cn1i <- grep("_gene", grep("tpm", cn1, invert = TRUE, value = TRUE), invert = TRUE)
cn2i <- grep("_gene", grep("tpm", cn2, invert = TRUE, value = TRUE), invert = TRUE)
cn3i <- grep("_gene", grep("tpm", cn3, invert = TRUE, value = TRUE), invert = TRUE)
colnames(ml1[[f[1]]][[nm]])[cn1i] <- paste0(f[1], "_", cn1[cn1i])
colnames(ml1[[f[2]]][[nm]])[cn2i] <- paste0(f[2], "_", cn2[cn2i])
colnames(ml1[[f[3]]][[nm]])[cn3i] <- paste0(f[3], "_", cn3[cn3i])
f1 <- paste0(f[1], "_padj")
f2 <- paste0(f[2], "_padj")
f3 <- paste0(f[3], "_padj")
mlj <- base::suppressMessages(dplyr::left_join(ml1[[f[1]]][[nm]], ml1[[f[2]]][[nm]]))
base::suppressMessages(dplyr::left_join(mlj, ml1[[f[3]]][[nm]])) %>%
dplyr::filter(!!as.symbol(f1) < !!padj & !!as.symbol(f2) < !!padj & !!as.symbol(f3) < !!padj)
})
names(per_contrast_list) <- names(master_list[[1]])
return(per_contrast_list)
}
#' Make venn diagram
#'
#' @param master_list list of 3 DE results
#' @param padj significance threshold below which to determine significance (adjusted by FDR)
#' @param output_dir path to where output goes
#' @param tag string used to prefix output
#' @return none, venn diagram of 3 DE callers printed
#' @importFrom magrittr '%>%'
#' @export
venn_3 <- function(master_list, tag, output_dir, padj = 0.01){
print("Running: venn_3()")
##run over master_list to get number of overlapped genes
print("Working on found-in-two")
fitwo <- suppressMessages(RNAseqon::found_in_two(master_list))
print("Working on found-in-three")
fithree <- suppressMessages(RNAseqon::found_in_three(master_list))
##each master_list for contrast
unc <- unique(t(combn(x = c(0,0,0,1,1,1,0,0,0,1,1,1), m = 3)))
unc <- unc[order(rowSums(unc)),]
unct <- unc == 1
##create list of ensembl_gene_ids per set defined by unc
lapply(names(master_list[[1]]), function(contrast){
print(paste0("Working on contrast: ", contrast))
compf_list <- lapply(seq(1:dim(unct)[1]), function(f){
compf <- names(master_list)[unct[f,]]
print(compf)
if(length(compf) == 0){
return()
} else if(length(compf) == 1){
dat <- master_list[[compf]][[contrast]]
if(dim(dat)[1] == 0){
return()
} else {
dat1 <- dplyr::filter(.data = dat, padj < !!padj)
dat2 <- dplyr::select(.data = dat1, ensembl_gene_id)
as.vector(unlist(dat2))
}
} else if(length(compf) == 2){
##aready padjusted
dat <- fitwo[[paste(compf, collapse = "-")]][[contrast]]
if(dim(dat)[1] == 0){
return()
} else {
dat1 <- dplyr::select(.data = dat, ensembl_gene_id)
as.vector(unlist(dat1))
}
} else {
dat <- fithree[[contrast]]
if(dim(dat)[1] == 0){
return()
} else {
dat1 <- dplyr::select(.data = dat, ensembl_gene_id)
as.vector(unlist(dat1))
}
}
})
##compare those lists
vdf <- cbind(unc, unlist(lapply(compf_list, length)))
colnames(vdf) <- c(names(master_list), "Counts")
dir.create(paste0(output_dir, "/venn_3"), recursive = TRUE, showWarnings = FALSE)
readr::write_csv(as.data.frame(vdf), file = paste0(output_dir, "/venn_3/", tag, ".", contrast, ".venn_3.csv"))
##write venn
trip_venn <- VennDiagram::draw.triple.venn(area1 = vdf[4,4],
area2 = vdf[3,4],
area3 = vdf[2,4],
n12 = vdf[6,4],
n13 = vdf[7,4],
n23 = vdf[5,4],
n123 = vdf[8,4],
category = colnames(vdf)[1:3],
fill = c("dodgerblue", "orange", "forestgreen"),
cat.cex = 2, cex = 2)
pdf(paste0(output_dir, "/venn_3/", tag, ".", contrast, ".venn_3.pdf"))
title <- grid::textGrob(contrast,
x = 0.2, y = 0.01,
vjust = 0,
gp = grid::gpar(fontsize = 12))
gt <- grid::gTree(children = grid::gList(title, trip_venn))
grid::grid.draw(gt)
dev.off()
})
}