-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathenrichment_vs_n.R
96 lines (75 loc) · 4.5 KB
/
enrichment_vs_n.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
library(ggplot2)
library(reshape2)
pdb_lig_csv <- "/Users/evan/scratch_enf/b2ar_analysis/tICA_t10_n_components5_switches_npxx_tm6_bp/docking_n_clusters1000_n_samples10_dist_SP/3p0g_lig/docking_means_pnas_means.csv"
pdb_lig_pnas_active <- data.frame(read.csv(pdb_lig_csv))
pdb_lig_sorted <- pdb_lig_pnas_active[order(-pdb_lig_pnas_active[,2]),]
all_pnas_active_csv <- "/Users/evan/scratch_enf/b2ar_analysis/features_pnas/all_pnas_distances.csv"
all_pnas_active <- data.frame(read.csv(all_pnas_active_csv))
aggregate_docking_pnas_csv <- "/Users/evan/scratch_enf/b2ar_analysis/tICA_t10_n_components5_switches_npxx_tm6_bp/analysis_n_clusters1000/aggregate_docking_pnas_joined.csv"
aggregate_docking_pnas <- data.frame(read.csv(aggregate_docking_pnas_csv))
aggregate_docking_sorted <- aggregate_docking_pnas[order(-aggregate_docking_pnas[,2]),]
make_plot <- function(data, title) {
data <- data.frame(data)[1:1000,]
final.data.melted <- melt(as.data.frame(data), id.vars = colnames(data)[1])
p <- ggplot(final.data.melted, aes(x = Index, y = value, colour= variable)) + geom_line() #+ geom_hline(yintercept = 1.0, color = "blue", linetype = "longdash") #+ geom_vline(xintercept=50, color = "blue", linetype = "longdash")
p <- p + ylab("Accuracy") + xlab("Top 'n' clusters scored by docking") + ggtitle(title)
#p <- p + geom_line(data=final.data.melted,thickness=3) #+ ylab(colnames(data)[2]) + xlab(colnames(data)[1])
#p <- p + theme(axis.title = element_text(size = 12,face="bold"))
#p <- p + theme(plot.title = element_text(size=12,face="bold"))
#p <- p + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
# panel.background = element_blank(), axis.line = element_line(colour = "black"))
#p <- p + scale_x_continuous(expand=c(.002,0))
#p <- p + theme(aspect.ratio=.5)
p
#pdf(paste(title,".pdf"),width=12,height=6)
#plot(p)
#dev.off()
}
total_active <- function(df, cutoff, col) {
num_active = length(df[df[,col] < cutoff, col])
print(num_active)
print(length(df[,1]))
percent_active = num_active / length(df[,1])
return num_active
}
active_under_index <- function(index, df, cutoff, col) {
under_index = df[1:index,]
num_active = length(under_index[under_index[,col] < cutoff, col])
print(num_active)
percent_active = num_active / length(under_index[,1])
return(num_active)
}
enrichment <- function(percent_active, percent_all) {
return(percent_active/percent_all)
}
accuracy <- function(percent_active)
cutoff <- 0.15
total_percent_active_1000_clusters <- total_active(pdb_lig_pnas_active, cutoff, 3)
#multiple scoring functions
active_vs_index <- cbind(1:1000, rep(0,1000))
active_vs_cutoff <- cbind(seq(0.1, 0.5, .05), rep(0,9))
active_vs_index[,2] <- sapply(enrichment_vs_index[,1], active_under_index, df = aggregate_docking_sorted, cutoff = cutoff, col = 3)
enrichment_vs_index <- active_vs_index
enrichment_vs_index[,2] <- (enrichment_vs_index[,2] / enrichment_vs_index[,1]) / (total_percent_active_1000_clusters)
accuracy_vs_index <- active_vs_index
accuracy_vs_index[,2] <- active_vs_index[,2]/active_vs_index[,1]
plot(accuracy_vs_index)
#single scoring function w/ 3p0g ligand
active_vs_index_single <- cbind(1:1000, rep(0,1000))
active_vs_cutoff_single <- cbind(seq(0.1, 0.5, .05), rep(0,9))
colnames(active_vs_index_single) <- c("Index", "Enrichment")
active_vs_index_single[,2] <- sapply(active_vs_index_single[,1], active_under_index, df = pdb_lig_sorted, cutoff = cutoff, col = 3)
enrichment_vs_index_single <- active_vs_index_single
enrichment_vs_index_single[,2] <- (enrichment_vs_index_single[,2] / enrichment_vs_index_single[,1]) / (total_percent_active_1000_clusters)
accuracy_vs_index_single <- active_vs_index_single
colnames(accuracy_vs_index_single)[2] <- "Accuracy"
accuracy_vs_index_single[,2] <- active_vs_index_single[,2]/active_vs_index_single[,1]
plot(accuracy_vs_index_single)
## nicer plots:
accuracy_single_vs_multiple <- cbind(accuracy_vs_index[,c(1,2)], accuracy_vs_index_single[,2])
colnames(accuracy_single_vs_multiple) <- c("Index", "Multiple Scoring Functions", "Single Scoring Function")
make_plot(accuracy_single_vs_multiple, "Accuracy of Top 'n' Clusters Scored by Docking")
enrichment_single_vs_multiple <- accuracy_single_vs_multiple
enrichment_single_vs_multiple[,3] <- enrichment_single_vs_multiple[,3] / total_percent_active_1000_clusters
enrichment_single_vs_multiple[,2] <- enrichment_single_vs_multiple[,2] / total_percent_active_1000_clusters
make_plot(enrichment_single_vs_multiple, "Enrichment of Top 'n' Clusters Scored by Docking")