-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdna.py
262 lines (210 loc) · 7.85 KB
/
dna.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
from __future__ import division
import numpy as np
import random as rnd
import matplotlib.pyplot as plt
from math import sqrt
from collections import defaultdict
from operator import itemgetter
class DNA:
def __init__(self, infile):
self.seq_list = self.read_file(infile)
self.N = len(self.seq_list) # number of dna sequances in input file
self.dist_matrix = np.zeros((self.N, self.N)) # NxN similarity matrix
# here I'm assuming that N is a perfect square. If not, then the
# number of cells in the grid will be smaller than N. Also, I'm
# strictly assuming a SQUARE GRID for the whole class.
self.grid_size = int(sqrt(self.N))
self.grid = self.generate_grid()
def hamming_dist(self, a, b):
'''
INPUT string, string
OUTUT int
Calculates penalty between sequnces a and b which is simply the
hamming distance between two strings
'''
return sum(ch_a != ch_b for ch_a, ch_b in zip(a, b))
def read_file(self, infile):
'''
INPUT string
OUTPUT none
Reads file and stored sequences in a list
'''
with open(infile) as f:
lseq = f.read().splitlines()
return lseq
def hamming_dist_all(self):
'''
INPUT none
OUTPUT none
Calculate hamming distances between all sequences and store these
in a square similarity matrix. This matrix will be used as lookup
table for any other function. I believe this will reduce the time in
the long run, but will use more space.
'''
for i in xrange(self.N):
for j in xrange(i+1, self.N):
d = self.hamming_dist(self.seq_list[i], self.seq_list[j])
self.dist_matrix[i][j] = d
self.dist_matrix[j][i] = d
def local_avg_dist(self, i, j):
'''
INPUT int, int
OUTPUT float
i and j give the coordinates of a cell in 2d grid. This function
calculates average distance between nearest neighbours (adjacent cells)
of cell (i,j)
'''
total_dist = 0
seq = self.grid[i, j] # get the sequence id in the current cell
# get ids of neighbouring sequences taking care of edges in the grid
neighbours = []
if i != 0:
neighbours.append(self.grid[i - 1, j])
if i != self.grid_size-1:
neighbours.append(self.grid[i + 1, j])
if j != 0:
neighbours.append(self.grid[i, j - 1])
if j != self.grid_size-1:
neighbours.append(self.grid[i, j + 1])
for nb in neighbours:
total_dist += self.dist_matrix[seq, nb]
return total_dist/len(neighbours)
def min_max_dist(self):
'''
INPUT none
OUTPUT none
Fine minimum and maximum distance in the distace matrix
'''
# since it is a symmetric matrix, we just need to look at upper or
# lower triangle part of the matrix
upper = self.dist_matrix[np.triu_indices(self.N, 1)]
return np.amin(upper), np.amax(upper)
def hamming_hist(self):
'''
INPUT none
OUTPUT none
plots histogram of hamming distances between all dna sequences (but not
from the grid)
'''
# get values of upper triangle of distance matrix ignoring diagonal
upper = self.dist_matrix[np.triu_indices(self.N, 1)]
plt.hist(upper)
def plot_heat_map(self, cmap=plt.cm.Reds):
'''
INPUT cmap
OUTPUT none
Plots 2d array as heat map
'''
hmap = self.heat_map()
plt.figure(figsize=(7, 7))
img = plt.imshow(hmap, interpolation='nearest', cmap=cmap)
min_d, max_d = self.min_max_dist()
img.set_clim(min_d-1, max_d+1)
plt.colorbar()
plt.tight_layout()
def generate_grid(self):
'''
INPUT none
OUTPUT numpy array
Generates a square matrix and assigns values randomly to elements.
Here I'm assuming that the sequences from the input txt file are
numberd sequentially from 0 to N-1. If N is not a square number,
then some sequences will not be included int the grid
'''
rnd.seed()
l = range(self.grid_size*self.grid_size)
rnd.shuffle(l)
square_grid = np.array(l).reshape(self.grid_size, self.grid_size)
return square_grid
def heat_map(self):
'''
INPUT none
OUTPUT none
This function creates heat map.
'''
n = self.grid_size
hmap = np.zeros((n, n))
for i in xrange(n):
for j in xrange(n):
hmap[i, j] = self.local_avg_dist(i, j)
return hmap
def randomize(self):
self.grid = self.generate_grid()
def clusterify(self, trials):
'''
INPUT int
OUTPUT none
This is a monte carlo method that tries to arrange sequences on a grid
in such a way that the local average distance becomes smaller.
It works following way
- pick two cells on the grid at random
- calculate the average local distance of both cells.
- the swap the contents of the cell
- calculate the average local distance of after swap
- if the average local distance becomes smaller for both cell after
swap, then accept new arrangement, else, discard the change.
All this algorithm is doing is to cool down the heatmap by lowering the
"temperature" at every cell in the grid!
'''
for t in range(trials):
i1 = rnd.randint(0, self.grid_size-1)
j1 = rnd.randint(0, self.grid_size-1)
i2 = rnd.randint(0, self.grid_size-1)
j2 = rnd.randint(0, self.grid_size-1)
seq1 = self.grid[i1, j1]
seq2 = self.grid[i2, j2]
d1_before = self.local_avg_dist(i1, j1)
d2_before = self.local_avg_dist(i2, j2)
self.grid[i1, j1] = seq2
self.grid[i2, j2] = seq1
d1_after = self.local_avg_dist(i1, j1)
d2_after = self.local_avg_dist(i2, j2)
if d1_after > d1_before or d2_after > d2_before:
self.grid[i1, j1] = seq1
self.grid[i2, j2] = seq2
def local_avg_hist(self):
'''
INPUT none
OUTPUT none
plots a histogram of local average distances in the heat map
'''
hmap = self.heat_map()
lst = hmap.tolist()
lst = sum(lst, []) # this flattens the list
plt.hist(lst)
def print_neigbours(self, i, j):
'''
INPUT int, int
OUTPUT none
This method just prints out the sequences in the adjacent cells to
the cell i,j, and also prints the distances.
'''
seq = self.grid[i, j]
print self.seq_list[seq]
neighbours = []
if i != 0:
neighbours.append(self.grid[i - 1, j])
if i != self.grid_size-1:
neighbours.append(self.grid[i + 1, j])
if j != 0:
neighbours.append(self.grid[i, j - 1])
if j != self.grid_size-1:
neighbours.append(self.grid[i, j + 1])
for nb in neighbours:
print self.seq_list[nb], self.dist_matrix[seq, nb]
def write(self, outfile):
'''
INPUT string
OUTPUT none
writes the coordinates of the sequences in the square grid
'''
d = defaultdict(lambda: (0, 0))
n = self.grid_size
for i in xrange(n):
for j in xrange(n):
d[self.grid[i][j]] = i, j
# sort dictionary w.r.t keys because the keys are seq ids
sorted_d = sorted(d.items(), key=itemgetter(0))
with open(outfile, 'w') as f:
for item in sorted_d:
f.write(str(item[1][0]) + ' ' + str(item[1][1]) + '\n')