-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathe06asyncextract.py
executable file
·65 lines (51 loc) · 1.58 KB
/
e06asyncextract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/env python3
"""
./e06asyncextract.py http://camlistore.org
http://camlistore.org/ is 3681 bytes, 11 urls:
...
Also try the error case and check out the legible traceback:
./e06asyncextract.py http://camlistore.bad
...
"""
import logging
# Lower the root logger threshold to INFO so the 'Fetching ...' message
# emitted by fetch_async() is not filtered out.
logging.getLogger().setLevel(logging.INFO)
import re
from sys import argv
from urllib.parse import urljoin
from urllib.parse import urlparse
from urllib.parse import urlunparse
import aiohttp
import asyncio
from e01extract import canonicalize, same_domain, URL_EXPR
# NOTE(review): ``asyncio.coroutine`` is the legacy generator-based API; it
# was deprecated in Python 3.8 and removed in 3.11, so this module needs an
# older interpreter (or a port to ``async def``) to import.
@asyncio.coroutine
def fetch_async(url):
    """Fetch ``url`` with an HTTP GET and return the body as text.

    This is a generator-based coroutine: drive it with ``yield from`` from
    another coroutine, or hand it to the event loop.

    Args:
        url: Address to request.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        AssertionError: If the response status is not 200 or the body is
            empty.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    logging.info('Fetching %s', url)
    response = yield from aiohttp.request('get', url, timeout=5)
    try:
        # Raise explicitly instead of using ``assert`` statements: asserts
        # are stripped under ``python -O``, which would let bad responses
        # through silently.  AssertionError is kept as the exception type so
        # any existing handler keeps working.
        if response.status != 200:
            raise AssertionError(
                'Unexpected HTTP status %r for %s' % (response.status, url))
        data = yield from response.read()
        if not data:
            raise AssertionError('Empty response body for %s' % url)
        return data.decode('utf-8')
    finally:
        # Always release the response, whether or not the checks passed.
        response.close()
@asyncio.coroutine
def extract_async(url):
    """Download ``url`` and collect the links on it that share its domain.

    Generator-based coroutine; drive it with ``yield from`` or the event
    loop.

    Args:
        url: Page to download and scan.

    Returns:
        A ``(url, data, found_urls)`` tuple: the URL that was passed in, the
        page text returned by ``fetch_async``, and a sorted list of the
        distinct links accepted by ``same_domain``, each resolved against
        ``url`` with ``urljoin``.
    """
    page = yield from fetch_async(url)
    # Lazy pipeline: each match is canonicalized, filtered and joined in
    # turn, and the set drops duplicates.
    candidates = (
        canonicalize(hit.group('url')) for hit in URL_EXPR.finditer(page)
    )
    links = {
        urljoin(url, link) for link in candidates if same_domain(url, link)
    }
    return url, page, sorted(links)
def main():
    """Command-line entry point: crawl one URL and print what was found.

    Reads the URL from the first command-line argument, runs
    ``extract_async`` to completion on the event loop, and prints the page
    size together with the links that were extracted.

    Raises:
        SystemExit: If no URL argument was supplied.
        Exception: Any error raised by ``extract_async`` propagates to the
            caller after the event loop has been closed.
    """
    if len(argv) < 2:
        # Give a usage hint instead of a bare IndexError from argv[1].
        raise SystemExit('usage: e06asyncextract.py URL')
    url = canonicalize(argv[1])

    # Bridge the gap between sync and async
    loop = asyncio.get_event_loop()
    try:
        # run_until_complete() wraps the coroutine in a task and either
        # returns its result or re-raises its exception.
        _, data, found_urls = loop.run_until_complete(extract_async(url))
    finally:
        # Close the loop even when the crawl fails.
        loop.close()

    print('%s is %d bytes, %d urls:\n%s' %
          (url, len(data), len(found_urls), '\n'.join(found_urls)))


# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()