-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_motif_report.py
447 lines (373 loc) · 15.8 KB
/
create_motif_report.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
import os
import argparse
import re
import textwrap
from itertools import chain
import numpy as np
import plotly.graph_objects as go
from plotly.io import to_json
from bs4 import BeautifulSoup
from natsort import natsorted
motif_summary = """
<h2 id="summary">Summary table</h2>
<table class="tg" id="tg">
<thead>
<tr>
<th class="tg-s6z2" rowspan="2">Sample</th>
<th class="tg-s6z2" colspan="2">Allele 1</th>
<th class="tg-s6z2" colspan="2">Allele 2</th>
<th class="tg-s6z2" rowspan="2">Overall<br>confidence</th>
<th class="tg-s6z2" colspan="2">Reads</th>
</tr>
<tr>
<td class="tg-s6z2">prediction</td>
<td class="tg-s6z2">confidence</td>
<td class="tg-s6z2">prediction</td>
<td class="tg-s6z2">confidence</td>
<td class="tg-s6z2">full</td>
<td class="tg-s6z2">partial</td>
</tr>
</thead>
<tbody>
{table}
</tbody>
</table>
<script>
$(document).ready( function () {{
$('#tg').DataTable( {{
'order': [],
dom: 'Bfrtip',
buttons: [
'copy', 'csv', 'excel', 'pdf', 'print'
]
}} );
}} );
</script>
"""
row_string = """ <tr>
<td class="tg-s6z2">{name}</td>
<td class="tg-s6z2">{allele1}</td>
<td class="tg-s6z2">{allele1_conf}%</td>
<td class="tg-s6z2">{allele2}</td>
<td class="tg-s6z2">{allele2_conf}%</td>
<td class="tg-s6z2">{motif_conf}%</td>
<td class="tg-s6z2">{reads_blue}</td>
<td class="tg-s6z2">{reads_grey}</td>
</tr>
"""
summary_plot_string = """
<h2 id="map">Allele heatmap</h2>
<p><b>Number of background results: {background}</b></p>
<div id="motif-heatmap"></div>
<script>
Plotly.newPlot('motif-heatmap', {heatmap}, {{}});
</script>
<h2 id="hist">Allele histogram</h2>
<div id="motif-hist"></div>
<script>
Plotly.newPlot('motif-hist', {histogram}, {{}});
</script>
"""
motif_plot_string = """<h3>{sample_name}</h3>
<table>
<tr>
<td colspan="2">
<div class="hist pic100" id="hist-{sample_id}"></div>
<script>
Plotly.react('hist-{sample_id}', {hist_plot}, {{}});
</script>
</td>
<td colspan="1">
<div class="pcol pic100" id="pcol-{sample_id}"></div>
<script>
Plotly.react('pcol-{sample_id}', {pcol_plot}, {{}});
</script>
</td>
</tr>
</table>
"""
align_string = """
<details>
<summary>{display_text}</summary>
<div id="align-{sample_id}" class="align">press "Run with JS"</div>
<script>
var fasta = {fasta};
var seqs = msa.io.fasta.parse(fasta);
var opts = {{
el: document.getElementById("align-{sample_id}"),
vis: {{
conserv: false,
metaIdentity: true,
overviewbox: true,
seqlogo: true
}},
seqs: seqs,
colorscheme: {{"scheme": "nucleotide"}},
// smaller menu for JSBin
menu: "small",
bootstrapMenu: true
}};
var m = new msa.msa(opts);
m.render()
</script>
</details>
"""
def load_arguments():
"""
Loads and parses arguments.
:return: input_dir - path to directory with Dante reports
output_dir - path to output dir
args - parsed arguments
"""
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
description=textwrap.dedent("""Python program to collect Dante report files and
create 'reversed' report for each motif, to make the comparison of multiple samples easier"""))
# add arguments
parser.add_argument('input_dir', help='Path to directory with Dante reports')
parser.add_argument('output_dir', help='Path to directory where the output will be stored', nargs='?',
default='example/motif_report')
parser.add_argument('--report_every', type=int, help='Specify how often a progress message should be printed (default=5)',
default=5)
parser.add_argument('-q', '--quiet', help='Don\'t print any progress messages', action='store_true')
args = parser.parse_args()
input_dir = args.input_dir
output_dir = args.output_dir
return input_dir, output_dir, args
def custom_format(template, **kwargs):
"""
Custom format of strings for only those that we provide
:param template: str - string to format
:param kwargs: dict - dictionary of strings to replace
:return: str - formatted string
"""
for k, v in kwargs.items():
template = template.replace('{%s}' % k, v)
return template
def generate_row(motif_name, a1, c1, a2, c2, c, reads_blue, reads_grey):
return row_string.format(name=motif_name, allele1=a1, allele1_conf=c1, allele2=a2, allele2_conf=c2,
motif_conf=c, reads_blue=reads_blue, reads_grey=reads_grey)
# Convert extracted numbers from table to ints and set background and expanded alleles to 0
def parse_alleles(num):
if num == 'B' or num == 'E':
return num
elif num == '---':
return -1
else:
return int(num)
def parse_label(num):
if num == 0:
return ''
else:
return str(int(num))
def generate_motif_report(path, key, samples, plots, fig_heatmap, fig_hist, bgs, alignments=None):
"""
Generate report file for one motif
:param path: str - path to output dir
:param key: str - motif name
:param samples: list - list of samples of selected motif
:param plots: list - list of plots of selected motif
:param alignments: list - list of alignments of selected motif
:param fig_heatmap: str - heatmap object
:param fig_hist: str - histogram object
:param bgs: int - number of background results
"""
script_dir = os.path.dirname(os.path.abspath(__file__))
template = open('%s/report/motif_report.html' % script_dir, 'r').read()
rows = []
key = key.replace('/', '-')
for sample in samples:
rows.append(generate_row(sample[0], sample[1], sample[2], sample[3],
sample[4], sample[5], sample[6], sample[7]))
if alignments is not None:
motif_plots = list(chain.from_iterable(zip(plots, alignments)))
else:
motif_plots = plots
with open('%s/report_%s.html' % (path, key), 'w') as f:
table = motif_summary.format(table='\n'.join(rows))
summary_plots = summary_plot_string.format(background=bgs, heatmap=to_json(fig_heatmap), histogram=to_json(fig_hist))
f.write(custom_format(template, motif=key.split('_')[0], seq=key.split('_')[1],
motifs_content=table, summary_plots=summary_plots, motif_plots='\n'.join(motif_plots)))
def create_reports(input_dir, output_dir, arg_list):
"""
Traverse input dir, collect all tables and plots from reports and generate motif reports
:param input_dir: str - path to input dir
:param output_dir: str - path to output dir
:param arg_list: [any] - list of arguments
"""
os.makedirs(output_dir, exist_ok=True)
paths = []
# find all report files in root directory
for root, dirs, files in os.walk(input_dir):
for file in files:
if file == 'report.html':
paths.append(os.path.join(root, file))
motifs = {}
plots = {}
alignments = {}
current_alignment = []
prev_name = ''
if len(paths) == 0:
print('ERROR\tInput directory is empty')
return
cnt = 1
print('INFO\tParsing reports')
for path in natsorted(paths):
if not arg_list.quiet and cnt % arg_list.report_every == 0:
print('INFO\tParsing file\t%d/%d' % (cnt, len(paths)))
cnt += 1
file = BeautifulSoup(open(path, 'r'), 'html.parser')
sample = file.find(id='sample_name').text.strip()
# find table of class 'tg' and extract all rows from it
for cl in file.find_all(class_='tg'):
for row in cl.find_all('tr'):
columns = row.find_all('td')
# remove head rows
if columns == [] or columns[0].text.strip() == 'prediction':
continue
if len(columns) >= 9:
name = re.sub(r'[^\w_]', '', columns[0].text.strip()) + '_' + columns[1].text.strip()
if ',' in name:
break
doc = [sample, columns[2].text.strip(), columns[3].text.strip().replace('%', ''),
columns[4].text.strip(), columns[5].text.strip().replace('%', ''),
columns[6].text.strip().replace('%', ''), columns[7].text.strip(), columns[8].text.strip()]
if name not in motifs:
motifs[name] = [doc]
else:
motifs[name].append(doc)
for cl in file.find_all(class_='plots'):
name = cl.find(class_='hist')['class'][-1]
hist = cl.find(class_='hist').find_next('script').text
pcol = cl.find(class_='pcol')
if pcol:
pcol = pcol.find_next('script').text
prev = cl.find_previous('p').find_all('u')
if len(prev) > 1:
continue
hist_data_re = re.match(r"[\w\W]+let hist_data = ({.+});[\w\W]+", hist)
if hist_data_re is None:
continue
hist_data = hist_data_re.group(1)
pcol_data_re = re.match(r"[\w\W]+let pcol_data = ({.+});[\w\W]+", pcol)
if pcol_data_re is None:
continue
pcol_data = pcol_data_re.group(1)
name += '_' + prev[0].text
temp = motif_plot_string.format(sample_name=sample, sample_id=name + '_' + sample,
hist_plot=hist_data, pcol_plot=pcol_data)
if name not in plots:
plots[name] = [temp]
else:
plots[name].append(temp)
for cl in file.find_all(class_='align'):
name = cl['id'].split('_')[0][1:]
msa = cl.find_next('script').text
prev = cl.find_previous('p').find_all('u')
disp = cl.find_previous('summary').text
if len(prev) > 1:
continue
msa_data = re.match(r"[\w\W]+var fasta = (`[\w\W]*`);[\w\W]+", msa).group(1)
name += '_' + prev[0].text
if prev_name == '':
prev_name = name
temp = align_string.format(sample_name=sample, sample_id=cl['id'] + sample,
fasta=msa_data, display_text=disp)
if prev_name == name:
current_alignment.append(temp)
else:
if prev_name not in alignments:
alignments[prev_name] = ['\n'.join(current_alignment)]
else:
alignments[prev_name].append('\n'.join(current_alignment))
current_alignment = [temp]
prev_name = name
if len(current_alignment) != 0:
if prev_name not in alignments:
alignments[prev_name] = ['\n'.join(current_alignment)]
else:
alignments[prev_name].append('\n'.join(current_alignment))
current_alignment = []
prev_name = ''
print('INFO\tGenerating motif reports')
# create histogram of read counts
for _key in motifs.keys():
a1 = [parse_alleles(row[1]) for row in motifs[_key] if parse_alleles(row[1]) != -1]
a2 = [parse_alleles(row[3]) for row in motifs[_key] if parse_alleles(row[3]) != -1]
if len(a1) == 0 or len(a2) == 0:
continue
try:
a1_max = max(x for x in a1 if isinstance(x, int))
except ValueError:
a1_max = 0
try:
a2_max = max(x for x in a2 if isinstance(x, int))
except ValueError:
a2_max = 0
arr = np.zeros((a1_max + 2, a2_max + 2))
max_count = max(a1_max, a2_max)
hist_arr = [0 for _ in range(max_count + 2)]
hist_color = ['#636EFA' for _ in range(max_count + 1)] + ['#EF553B']
for i in range(len(a1)):
if a2[i] == '---':
pass
else:
al1 = 0 if a1[i] == 'B' else -1 if a1[i] == 'E' else int(a1[i])
al2 = 0 if a2[i] == 'B' else -1 if a2[i] == 'E' else int(a2[i])
if al1 == -1 and al2 == -1: # special case when (E, E)
al1 = 0
arr[al1, al2] += 1
hist_arr[al1] += 1
hist_arr[al2] += 1
fig_histogram = go.Figure(data=[
go.Bar(y=hist_arr, text=[parse_label(num) for num in hist_arr], name='Count histogram', marker_color=hist_color),
])
# fig_histogram.add_vline(x=len(hist_arr) - 1.5, line_width=5, line_color='black', opacity=1)
fig_histogram.update_xaxes(title_text='Prediction', tickmode='array',
tickvals=np.concatenate([[0, 1], np.array(range(5, max_count + 1, 5)), [max_count + 1]]),
ticktext=['B', 1] + list(range(5, max_count + 1, 5)) + ['E(>%d)' % (max_count + 1)])
fig_histogram.update_yaxes(title_text='Count')
fig_histogram.update_traces(hovertemplate='<b>Prediction:\t%{x}</b><br />Count:\t%{y}<br />', textfont_size=7)
fig_histogram.update_layout(width=1000, height=500, template='simple_white',
barmode='stack', yaxis_fixedrange=True, hovermode='x')
row_max, column_max = arr.shape
row_sum = np.sum(arr, axis=1)
row_count, column_count = 0, 0
for element in row_sum:
if element == 0:
row_count += 1
else:
break
column_sum = np.sum(arr, axis=0)
for element in column_sum:
if element == 0:
column_count += 1
else:
break
text = [[parse_label(arr[i, j]) for j in range(arr.shape[1])] for i in range(arr.shape[0])]
bgs = sum(arr[0, :])
# arr = arr / np.max(arr)
# create heatmap of alleles
fig_heatmap = go.Figure(data=[
go.Heatmap(z=list(arr), text=text, textfont={'size': 10}, colorscale='Hot_r',
hovertemplate='<b>Allele 1:\t%{y}<br />Allele 2:\t%{x}</b><br />Count:\t%{text}',
texttemplate='%{text}', name='Prediction heatmap')
])
fig_heatmap.add_vline(x=a2_max + 0.5, line_width=5, line_color='black', opacity=1)
fig_heatmap.add_hline(y=0.5, line_width=5, line_color='black', opacity=1)
fig_heatmap.update_layout(width=750, height=750, template='simple_white',
yaxis=dict(range=[row_count - 1.5, row_max - 0.5]),
xaxis=dict(range=[column_count - 1.5, column_max - 0.5]))
fig_heatmap.update_yaxes(title_text='Allele 1', tickmode='array',
tickvals=np.concatenate([[0, 1], np.array(range(5, row_max, 5))]),
ticktext=['B', 1] + list(range(5, row_max, 5)))
fig_heatmap.update_xaxes(title_text='Allele 2', tickmode='array',
tickvals=np.concatenate([np.array(range(0, column_max - 1, 5)), [column_max - 1]]),
ticktext=list(range(0, column_max - 1, 5)) + ['E(>%d)' % (column_max - 1)])
# generate motif
if _key in alignments:
generate_motif_report(output_dir, _key, motifs[_key], plots[_key], fig_heatmap, fig_histogram, bgs, alignments[_key])
else:
generate_motif_report(output_dir, _key, motifs[_key], plots[_key], fig_heatmap, fig_histogram, bgs)
if __name__ == '__main__':
input_d, output_d, args = load_arguments()
create_reports(input_d, output_d, args)