-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfetch.py
65 lines (47 loc) · 1.73 KB
/
fetch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import os
from urllib.request import urlretrieve
import tarfile
from lxml import etree
def get_text_from_element_tree(root) -> str:
    """
    Performs a depth-first walk through the element tree starting from root,
    and concatenates the text from each bodyText and listItem tag.

    Every piece of collected text is followed by a single space. The children
    of a listItem are not visited; the children of every other element
    (including bodyText) are walked recursively. A bodyText or listItem
    element that carries no text at all (its ``text`` is None) contributes
    nothing instead of raising a TypeError.

    Args:
        root - the root node for the element tree.
    Returns:
        The text from each bodyText and listItem tag in the element tree concatenated together.
    """
    pieces = []
    for child in root:
        is_list_item = child.tag == "listItem"
        if is_list_item or child.tag == "bodyText":
            # An empty element such as <bodyText/> has text None; skip it
            # rather than failing on None + " ".
            if child.text is not None:
                pieces.append(child.text + " ")
        if not is_list_item:
            # Only listItem stops the descent; bodyText is still walked so
            # that nested bodyText/listItem elements are picked up.
            pieces.append(get_text_from_element_tree(child))
    return "".join(pieces)
def fetch(url, filename) -> dict:
    """
    Retrieves the tarfile at the given url, and then extracts and parses each file within it.
    After each file in the tarfile has been processed, the tarfile is deleted.

    The archive is opened as a gzip-compressed tar. Each member that can be
    read as a file is parsed as XML with a recovering parser and its name is
    printed; members that have no file content (for example directories) are
    skipped. The archive is closed and the downloaded file is removed even if
    parsing fails part-way through.

    Args:
        url - the url at which the tarfile can be found.
        filename - the filename under which to temporarily download the tarfile.
    Returns:
        A dictionary which maps each filename to the text parsed from said file.
    """
    name_to_text = {}
    urlretrieve(url, filename)
    try:
        # One recovering parser is enough for every member of the archive.
        parser = etree.XMLParser(recover=True)
        with tarfile.open(name=filename, mode="r:gz") as archive:
            for member in archive.getmembers():
                extracted_file = archive.extractfile(member)
                if extracted_file is None:
                    # extractfile() returns None for members that are not
                    # regular files or links (e.g. directories).
                    continue
                name = member.name
                print(name)
                with extracted_file:
                    xml = extracted_file.read()
                root = etree.fromstring(xml, parser)
                # NOTE(review): defensive guard - a recovering parser may
                # yield no root for unrecoverable input; confirm against lxml.
                if root is None:
                    name_to_text[name] = ""
                else:
                    name_to_text[name] = get_text_from_element_tree(root)
    finally:
        # The archive is already closed here, so the temporary download can
        # be removed on every platform, including after an error.
        os.remove(filename)
    return name_to_text