-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathssgsea.R
137 lines (120 loc) · 6.31 KB
/
ssgsea.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#' Function to run ssGSEA using the GSVA package and return PCA of results
#'
#' @param pways list of pathway genesets to use in ssGSEA
#' @param log2tpm_mat matrix of log2tpm values per sample across genes in gene lists
#' @param msigdb_cat one of 'c("H", paste0("C", c(1:7)))', see: gsea-msigdb.org/gsea/msigdb/collections.jsp; if H, loads Process_Category data from Liberzon 2015 paper to colour PCA
#' @param output_dir path to where output goes
#' @param contrast string used to prefix output naming the contrast being investigated
#' @param metadata tibble of sample, \<group\> where group colours the PCA
#' @return ssgsea_ggp_list results table of each geneset and sample
#' @export
ssgsea_pca <- function(pways, log2tpm_mat, msigdb_cat = "H", output_dir, contrast, metadata) {
print(paste0("Working on: ", contrast))
##output dir
out_dir <- paste0(output_dir, "/ssgsea")
dir.create(paste0(out_dir, "/plots"), recursive = TRUE, showWarnings = FALSE)
dir.create(paste0(out_dir, "/data"), recursive = TRUE, showWarnings = FALSE)
##use res as significant set also
if(msigdb_cat == "H"){
#hallmarks <- system.file("inst","extdata","Liberzon_2015_Table1_Hallmark_categories.csv",
# package = "RNAseqR")
hallmarks <- readr::read_csv("https://raw.githubusercontent.com/brucemoran/RNAseqR/master/inst/extdata/Liberzon_2015_Table1_Hallmark_categories.csv")
rns <- "Hallmark_Name"
} else {
print(paste0("No colouring will be added to PCA baecause you are using: ", msigdb_cat))
rns <- msigdb_cat
}
##GSVA ssGSEA
if(length(pways) != 0) {
ssgsea_res <- GSVA::gsva(log2tpm_mat,
pways,
method='ssgsea',
min.sz=0,
max.sz=1000,
ssgsea.norm=T)
ggp <- ssgsea_rotationPCA(ssgsea = ssgsea_res,
hallmark_tb = hallmarks,
contrast = contrast,
metadata = metadata,
returnData = FALSE)
readr::write_tsv(tibble::as_tibble(ssgsea_res, rownames = rns), file = paste0(out_dir, "/data/", contrast , ".ssgsea_res.tsv"))
ggplot2::ggsave(ggp[[1]], file = paste0(out_dir, "/plots/", contrast , ".ssgsea_rotationPCA.pdf"))
ssgsea_ggp_list <- list(ssgsea_res = ssgsea_res, ggplot_pca = ggp)
return(ssgsea_ggp_list)
} else {
print("No genesets in pathways, skipping...")
}
}
#' Plotting PCA function
#' @param ssgsea output table
#' @param hallmark_tb hallmark data with Process_Category column to colour by process
#' @param metadata tibble of sample, \<group\> where group colours the PCA
#' @param contrast string to title plot
#' @param returnData flag to allow data to be returned
#' @param intgroup colname to use as group to colour in if not using Process Category
#' @return ggp_pca_list ggplot2 object for printing, pca for safe keeping
#' @export
ssgsea_rotationPCA <- function(ssgsea, metadata, hallmark_tb = NULL, contrast, returnData = FALSE, group = NULL) {
if("sampleID" %in% colnames(metadata)){
colnames(metadata)[match("sampleID" , colnames(metadata))] <- "sample"
}
pca <- prcomp(t(ssgsea))
percentVar <- pca$sdev^2/sum(pca$sdev^2)
##make rotation data for hallmarks
d <- data.frame(PC1 = pca$rotation[, 1], PC2 = pca$rotation[, 2], Hallmark_Name = rownames(pca$rotation))
##make point data for samples
p <- tibble::as_tibble(pca$x, rownames = "sample") %>%
dplyr::left_join(metadata, .)
p$group <- as.factor(p$group)
if(!is.null(hallmark_tb)){
hallmark_tb$Hallmark_Name <- paste0("HALLMARK_", hallmark_tb$Hallmark_Name)
d <- dplyr::left_join(d, hallmark_tb, by = "Hallmark_Name")
d$Process_Category[is.na(d$Process_Category)] <- "other"
}
if (returnData) {
attr(d, "percentVar") <- percentVar[1:2]
return(d)
}
if("Process_Category" %in% colnames(d)){
colsh_p <- colnames(p)[2]
ggp <- ggplot2::ggplot() + ggplot2::geom_segment(data = d,
ggplot2::aes(x = 0, y = 0, xend = (PC1*2),
yend = (PC2*2),
group = Process_Category,
color = Process_Category),
arrow = ggplot2::arrow(length = ggplot2::unit(1/2, "picas"),
ends="last",
type = "closed"),
size = 1.2,
linetype = 1,
inherit.aes = FALSE) +
ggplot2::geom_point(data = p,
ggplot2::aes_string(x = "PC1", y = "PC2",
shape = colsh_p),
size = 2.5,
show.legend = TRUE,
inherit.aes = FALSE) +
ggrepel::geom_label_repel(data = d,
ggplot2::aes(x = (PC1*2), y = (PC2*2),
label = gsub("HALLMARK_", "", Hallmark_Name),
group = Process_Category,
colour = Process_Category),
size = 2.5,
fontface = "bold",
show.legend = FALSE,
inherit.aes = FALSE) +
ggplot2::labs(title = contrast,
subtitle = "PCA plot using MsigDB Hallmark with Process_Category",
x = paste0("PC1: ", round(percentVar[1] * 100), "% variance"),y = paste0("PC2: ", round(percentVar[2] * 100), "% variance"))
} else {
stop("Unfinished, suggest using Hallmarks...")
ggp <- ggplot2::ggplot(data = p,
ggplot2::aes(x = PC1, y = PC2, group = group)) +
ggplot2::geom_point(ggplot2::aes_string(shape = "group",
colour = "group",
fill = "group"),
size = 3)
}
ggp_pca_list <- list(ggplot = ggp, pca = pca)
return(ggp_pca_list)
}