-
Notifications
You must be signed in to change notification settings - Fork 36
/
Copy pathbenchmark.py
executable file
·109 lines (91 loc) · 3.35 KB
/
benchmark.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: Apache 2.0 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import textwrap
from functools import partial
import html5_parser
import html5lib
from bs4 import BeautifulSoup
try:
from time import monotonic
except ImportError:
from time import time as monotonic
# Path of the locally cached HTML document that every parser is benchmarked on.
TF = 'test/large.html'
try:
    # Reuse the cached copy when it is readable so repeated runs need no network.
    with open(TF, 'rb') as f:
        raw = f.read()
except (IOError, OSError):
    # No readable cached copy: download the document and cache it for next time.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen
    print('Downloading large HTML file...')
    response = urlopen('https://www.w3.org/TR/html5/single-page.html')
    try:
        raw = response.read()
    finally:
        # Closed explicitly (not via ``with``) so this also works if the
        # fallback ``urllib.urlopen`` object is not a context manager.
        response.close()
    with open(TF, 'wb') as f:
        f.write(raw)
print('Testing with HTML file of', '{:,}'.format(len(raw)), 'bytes')
def timeit(func, number=1):
    """Return the mean wall-clock time, in seconds, of one call to ``func``.

    ``func`` is called with no arguments ``number`` times and each call is
    timed individually with ``monotonic()``.

    Raises:
        ValueError: if ``number`` is less than 1, since a mean over zero
            runs is undefined.
    """
    if number < 1:
        raise ValueError('number must be at least 1, got {!r}'.format(number))
    total = 0
    for _ in range(number):
        st = monotonic()
        r = func()
        total += monotonic() - st
        # Release the result after the clock has been read, so freeing it is
        # not part of any timed interval and only one result is alive at a time.
        del r
    return total / number
def doit(name, func, num=20):
    """Benchmark ``func`` and report the outcome on stdout.

    Runs ``func`` ``num`` times through ``timeit``, prints the mean time per
    run together with the throughput computed from ``len(raw)``, and returns
    that mean time in seconds. ``name`` is only used to label the output.
    """
    print('Parsing', num, 'times with', name)
    avg_seconds = timeit(func, num)
    # Throughput is the size of the input document divided by the mean time.
    bytes_per_second = len(raw) / avg_seconds
    summary = f'took an average of: {avg_seconds:,.3f} seconds to parse it @ {bytes_per_second/1000:.1f} KB/s'
    print(name, summary)
    return avg_seconds
def _positive_int(value):
    """argparse ``type`` callable that accepts only integers >= 1.

    The repetition count ends up as a divisor when the mean run time is
    computed, so zero or negative values are rejected here with a normal
    usage error instead of failing later with a ZeroDivisionError.
    """
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError('invalid int value: {!r}'.format(value))
    if number < 1:
        raise argparse.ArgumentTypeError('must be at least 1, got {}'.format(number))
    return number


# Command line: an optional tree builder name plus the repetition count.
p = argparse.ArgumentParser(description='Benchmark html5-parser')
p.add_argument('treebuilder', nargs='?', default='lxml', choices='lxml soup dom etree'.split())
p.add_argument(
    '--num',
    '-n',
    default=10,
    type=_positive_int,
    help='Number of repetitions for html5lib (html5-parser will use 10x as many reps)')
args = p.parse_args()
# html5-parser with the tree builder chosen on the command line. It runs ten
# times as many repetitions as the other parsers, as the --num help states.
base_time = doit(
    'html5-parser',
    partial(
        html5_parser.parse,
        raw,
        transport_encoding="utf-8",
        namespace_elements=True,
        treebuilder=args.treebuilder),
    num=args.num * 10)

# html5-parser building a BeautifulSoup tree; baseline for the soup comparisons.
soup_time = doit(
    'html5-parser-to-soup',
    partial(html5_parser.parse, raw, transport_encoding="utf-8", treebuilder='soup'),
    num=args.num)

if args.treebuilder == 'soup':
    # html5lib itself has no 'soup' tree builder and rejects that name, so
    # for this choice go through BeautifulSoup's html5lib-backed builder
    # rather than crashing part-way through the benchmark.
    html5lib_parse = partial(BeautifulSoup, raw, 'html5lib')
else:
    html5lib_parse = partial(
        html5lib.parse, raw, transport_encoding="utf-8", treebuilder=args.treebuilder)
h5time = doit('html5lib', html5lib_parse, num=args.num)

# BeautifulSoup driven by html5lib and by lxml respectively.
soup5_time = doit(
    'BeautifulSoup-with-html5lib', partial(BeautifulSoup, raw, 'html5lib'), num=args.num)
soup4_time = doit('BeautifulSoup-with-lxml', partial(BeautifulSoup, raw, 'lxml'), num=args.num)
def row(*args):
    """Print one table row on stdout.

    Every value is converted with ``str()``, left-aligned in a cell at least
    18 characters wide and followed by ``|``; the row ends with a newline.
    """
    cells = ['{:18s}'.format(str(value)) + '|' for value in args]
    print(''.join(cells))
# Blank line, then an explanatory paragraph wrapped by textwrap.fill.
print()
print(textwrap.fill(
    'Results are below. They show how much faster html5-parser is than'
    ' each specified parser. Note that there are two additional considerations:'
    ' what the final tree is and whether the parsing supports the HTML 5'
    ' parsing algorithm. The most apples-to-apples comparison is when the'
    ' final tree is lxml and HTML 5 parsing is supported by the parser being compared to.'
    ' Note that in this case, we have the largest speedup. In all other cases,'
    ' speedup is less because of the overhead of building the final tree'
    ' in python instead of C or because the compared parser does not use'
    ' the HTML 5 parsing algorithm or both.'))
print()
row('Parser', 'Tree', 'Supports HTML 5', 'Speedup (factor)')
print('=' * 79)
# Both timings in this row were taken with the tree builder selected on the
# command line, so report that name instead of a fixed 'lxml' label.
row('html5lib', args.treebuilder, 'yes', round(h5time / base_time))
# The BeautifulSoup rows are measured against html5-parser building a soup tree.
row('soup+html5lib', 'BeautifulSoup', 'yes', round(soup5_time / soup_time))
row('soup+lxml.html', 'BeautifulSoup', 'no', round(soup4_time / soup_time))