-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpeak_switching.R
230 lines (141 loc) · 10.2 KB
/
peak_switching.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
# Gets the poly A tails from a bam file and returns them as a list
poly_A_peak_puller<- function(peak,bam_file, gff){
library(Rsamtools)
#Outpus the peaks matching a certain gene
gff_peaks <-gff
numbers <- seq(1:length (gff_peaks [ ,1]))
# Number the gff peaks
numbered_gff <- cbind(gff_peaks, numbers)
# Search through for gene of interest.
found_gene <- subset(numbered_gff, numbered_gff[,10] == peak)
output <-as.data.frame(found_gene)
# Run peak finder or name finder depending on input
goi<- output
colnames(goi) <- c('chr', 'program', 'type', 'peak start', 'peak end','DNS','orientation', 'DNS2','information','numbers')
# Split the gene of interest by orientation
split_goi <- split(goi, goi[,'orientation'],drop = TRUE)
# Pull the poly A reads for strands in each oreintation from the gff file
plus_reads <- split_goi[['+']]
minus_reads <- split_goi[['-']]
list_of_peaks <- list()
# For plus and minus reads, assign reads to peaks
all_poly_a_tails_minus<- numeric()
last_line <- as.data.frame(data.frame(matrix(1, nrow=1,ncol=10)))
if (length(minus_reads)>=1){
for (line in 1:nrow(minus_reads)){
peak <- minus_reads [line,]
# If the start of a line in less than the end of a previous read, move the start to the end of the previous read.
if(peak[,4] <= last_line[,5]){
peak[,4] <- last_line[,5]
}
param <- ScanBamParam(what=c('qname','pos','qwidth','strand'),tag=c('AN'), which=GRanges(peak [,'chr'],IRanges(
peak[,'peak start'], peak[,'peak end'] +5 )))
result1 <- scanBam ( bam_file , param=param, isMinusStrand = TRUE)
no_of_as1 <- result1[[1]][[5]][[1]]
reads_in_peak_neg <- length (no_of_as1)
list_of_peaks <- c(list_of_peaks, length(no_of_as1))
# Add succesive peaks together
all_poly_a_tails_minus<-c(all_poly_a_tails_minus,no_of_as1)
# Print to output file for later identification of 3' UTR switching
p <- c(line,reads_in_peak_neg)
#write.table(p, file ,append=TRUE, row.names = FALSE, col.names = FALSE, eol = "\t", quote = FALSE)
}
}
# The poly_A flag values
all_poly_a_tails_plus <- numeric()
# Give last line a default value
last_line <- as.data.frame(data.frame(matrix(1, nrow=1,ncol=10)))
# Last line is set to the last peak. If it overlaps another peak it will make its end the new peak's start.
if (length(plus_reads)>=1){
for (line in 1:nrow(plus_reads)){
peak <- plus_reads [line,]
# If the start of a line in less than the end of a previous read, move the start to the end of the previous read.
if(peak[,4] <= last_line[,5]){
peak[,4] <- last_line[,5]
}
param <- ScanBamParam(what=c('qname','pos','qwidth','strand'),tag=c('AN'), which=GRanges(peak [,'chr'],IRanges(
peak[,'peak start'] -305, peak[,'peak end']+305 )))
result2 <- scanBam(bam_file , param=param, isMinusStrand = FALSE)
result2[[1]][[3]]<-result2[[1]][[3]]+result2[[1]][[4]]-1
df <- as.data.frame(result2)
# Second set of precessig to account for 5'read ends at the ead of a peak
if (nrow(df)>0){
my.data.frame <- subset(df, df[,3] >= (peak[,'peak start'] -5) & df[,3] <= (peak[,'peak end']+5))
no_of_as2 <- my.data.frame[,5]
list_of_peaks <- c(list_of_peaks, length(no_of_as2))
# Add succesive peaks together
all_poly_a_tails_plus <-c(all_poly_a_tails_plus, no_of_as2)
reads_in_peak_pos <- length (no_of_as2)
# Print to output file for later identification of 3' UTR switching
p <- c(line,reads_in_peak_pos)
#write.table(p, file ,append=TRUE, row.names = FALSE, col.names = FALSE, eol = "\t", quote = FALSE)
last_line <- peak
}}
}
# Combine the pulled reads back together
all_poly_a_tails<- sort(c(all_poly_a_tails_plus, all_poly_a_tails_minus))
return (all_poly_a_tails)
}
peak_switching<- function(peak_list,bam_list, gff, number_of_replicates = 3, number_of_conditions =2){
#This function currently only works for two conditions
# Read in the gff file
gff_file <- read.table (gff, sep = "\t", header = FALSE)
# Set an empty read couts list for later use
read_counts<- list()
# Get differnt peaks for each BAM file and store them in read counts
for (bam_file in bam_list){
read_count_condition1 <- lapply (peak_list[[1]], poly_A_peak_puller, bam_file,gff_file)
read_counts <- c(read_counts, read_count_condition1)
}
for (bam_file in bam_list){
read_count_condition2 <- lapply (peak_list[[2]], poly_A_peak_puller, bam_file,gff_file)
read_counts <- c(read_counts, read_count_condition2)
}
# Get the lengths of the poly A tail lists stored in read counts to get read counts
length_list <- lapply (read_counts, length)
# Put these values into a data frame
length_frame <- do.call('rbind', length_list)
# Add the list of file names at their corresponding counts
length_frame <- cbind(paste(bam_list), as.numeric(length_frame))
# Order the data frame by name so that like conditions match up (may mave condition out of input order)
length_frame <- length_frame[order(length_frame[,1], decreasing = FALSE),]
# Get totals of peaks per conition
totals <- colSums(matrix(as.numeric(length_frame[,2]), nrow=length(peak_list)))
# Put totals in cols next to the vales that were added to reach them
totals <- rep(totals, each = length(peak_list))
# Calculate percentage peaks used
length_frame <- cbind(length_frame, totals)
length_frame[,2] <- as.numeric(length_frame[,2]) / (as.numeric(length_frame[,3]))*100
# Split up your data frames into peak groups
length_frame1 <- length_frame[,-3] [seq(1, nrow(length_frame), 2),]
length_frame2 <- length_frame[,-3] [seq(2, nrow(length_frame), 2),]
stdev_lists1 <- split(matrix(as.numeric(length_frame1[,2])), ceiling(seq_along(length_frame1[,2])/number_of_replicates))
print(stdev_lists1)
stdev_lists1 <-lapply(stdev_lists1,sd)
stdev_lists2 <- split(matrix(as.numeric(length_frame2[,2])), ceiling(seq_along(length_frame2[,2])/number_of_replicates))
stdev_lists2 <-lapply(stdev_lists2,sd)
print(stdev_lists2)
sdevs<- as.numeric(c(stdev_lists1, stdev_lists2))
sdevs <-cbind(paste(c(rep(bam_list[1],rep=2),rep(bam_list[(length(bam_list)/number_of_conditions)+1],rep=2))),sdevs)
print(sdevs)
sdevs <-sdevs [order(sdevs[,1], decreasing = TRUE),]
print(sdevs)
# Get the avearge proportion of these groups
length_frame1 <- colMeans(matrix(as.numeric(length_frame1[,2]), nrow=length(bam_list)/number_of_conditions))
length_frame2 <- colMeans(matrix(as.numeric(length_frame2[,2]), nrow=length(bam_list)/number_of_conditions))
length_frame1 <- cbind(length_frame1)
length_frame2 <- cbind(length_frame2)
# Put them back together
length_frame<- rbind(length_frame1,length_frame2)
# Add names back in
length_frame<-cbind(paste(c(rep(bam_list[1],rep=2),rep(bam_list[(length(bam_list)/number_of_conditions)+1],rep=2))),length_frame)
# Order by sample
length_frame <- length_frame[order(length_frame[,1], decreasing = TRUE),]
avgs <- as.numeric(length_frame[,2])
points<- c(1.5,4.5,7.5,10.5)
barplot(main= 'peak switching', beside=TRUE, xlim=c(0,15), ylim = c(0,100), as.numeric(length_frame[,2]), xaxt= 'n' ,col=c('green','purple'),space =c(0,0), ylab = 'percentage peak used', axes=FALSE, xlab= '', width = 3:3, names= length_frame[,1])
legend("topright", legend = paste('peak',peak_list), fill = c("green","purple"),text.width=5, bty = 'n')
axis(2, pos=0, at= c(0,25,50,75,100), tick = 25)
axis(1,hadj= 0.85,cex.axis=0.2,las=3, at=c(1.5,4.5,7.5,10.5), labels = length_frame[,1] )
arrows(x0 = points, y0 = avgs, x1= points, y1 = avgs + as.numeric(sdevs[,2]), length=0.05, angle=90, code=2)
}