-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathgen.py
110 lines (87 loc) · 4.1 KB
/
gen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
from lxml import etree as ET
import os
import re
import csv
import json
from functools import partial
import sys
# Languages for which a separate set of CSV/JSON output files is generated.
languages = ['en', 'fr']
# Namespace-qualified name under which lxml exposes the ``xml:lang`` attribute.
xml_lang = '{http://www.w3.org/XML/1998/namespace}lang'
# Root directory below which the json/, csv/ and codelists index files are written.
OUTPUTDIR = os.path.join('out', 'clv2')
def normalize_whitespace(x):
    """Return *x* trimmed, with every run of whitespace collapsed to one space.

    ``None`` is passed through unchanged so that elements without text can be
    handled by the caller without a separate check.
    """
    if x is not None:
        # Trim first so leading/trailing whitespace does not become a space.
        x = re.sub(r'\s+', ' ', x.strip())
    return x
def codelist_item_todict(codelist_item, default_lang='', lang='en'):
    """Flatten one ``<codelist-item>`` element into a plain dict.

    Every child element contributes ``tag -> whitespace-normalized text``.
    ``name`` and ``description`` children are language dependent: they are
    only kept when their ``xml:lang`` attribute equals *lang*, or when they
    carry no ``xml:lang`` at all and *lang* is the codelist's *default_lang*.

    Args:
        codelist_item: The element to convert; iterated for its children and
            queried for its ``public-database`` and ``status`` attributes.
        default_lang: Language assumed for children without ``xml:lang``.
        lang: Language whose ``name``/``description`` should be selected.

    Returns:
        A dict of the selected child values plus ``status`` (defaulting to
        ``'active'``) and, when the attribute is present, a boolean
        ``public-database``.
    """
    out = {}
    for child in codelist_item:
        # Iterating an lxml element also yields comments and processing
        # instructions; their ``tag`` is a factory function rather than a
        # string. Skip them so they cannot end up as a dict key, which would
        # break the CSV and JSON writers that consume this dict.
        if callable(child.tag):
            continue
        if child.tag in ('name', 'description'):
            child_lang = child.attrib.get(xml_lang)
            matches_lang = child_lang == lang
            is_default = child_lang is None and lang == default_lang
            if not (matches_lang or is_default):
                continue
        out[child.tag] = normalize_whitespace(child.text)

    if 'public-database' in codelist_item.attrib:
        out['public-database'] = (
            codelist_item.attrib['public-database'] in ('1', 'true')
        )
    out['status'] = codelist_item.get('status', 'active')
    return out
def utf8_encode_dict(d):
    """Return a copy of *d* with its keys and values UTF-8 encoded.

    Keys and values whose type is exactly ``str`` are replaced by the result
    of ``.encode('utf8')``; anything else (``None``, booleans, ...) becomes
    ``None``.

    NOTE(review): the exact-type check means ``str`` subclasses, and on
    Python 2 ``unicode`` objects, are mapped to ``None`` as well - confirm
    that this is intended before relying on it for non-ASCII text.
    """
    def to_utf8(item):
        return item.encode('utf8') if type(item) == str else None

    return {to_utf8(key): to_utf8(value) for key, value in d.items()}
# The csv module on Python 2 expects byte strings, so rows are UTF-8 encoded
# by hand there. On Python 3 the file object does the encoding instead, and
# the csv documentation requires newline='' so that the writer's own '\r\n'
# line endings are not translated a second time (e.g. to '\r\r\n' on Windows).
is_python2 = sys.version_info.major == 2
csv_open_kwargs = {} if is_python2 else {'newline': '', 'encoding': 'utf-8'}

# Directory holding the source codelist XML files, one file per codelist.
xml_dir = os.path.join('out', 'clv2', 'xml')

for language in languages:
    # Index of all codelists, built up in parallel as XML and as a plain list.
    codelists = ET.Element('codelists')
    codelists_list = []

    # Create each output directory on its own: with both calls in a single
    # try block, an already existing json directory would skip the creation
    # of the csv directory.
    for output_format in ('json', 'csv'):
        language_dir = os.path.join(OUTPUTDIR, output_format, language)
        try:
            os.makedirs(language_dir)
        except OSError:
            # Only "already exists" is harmless; anything else is a real error.
            if not os.path.isdir(language_dir):
                raise

    for fname in os.listdir(xml_dir):
        codelist = ET.parse(os.path.join(xml_dir, fname))
        root = codelist.getroot()
        attrib = root.attrib
        # The output file names are derived from the name attribute, so it
        # has to agree with the name of the source file.
        assert attrib['name'] == fname.replace('.xml', '')
        default_lang = attrib.get(xml_lang)
        codelist_dicts = [
            codelist_item_todict(item, default_lang=default_lang, lang=language)
            for item in root.find('codelist-items').findall('codelist-item')
        ]

        fieldnames = [
            'code',
            'name',
            'description',
            'category',
            'url',
            'status',
        ]
        # Only this codelist gets the extra public-database column.
        if fname == 'OrganisationRegistrationAgency.xml':
            fieldnames.append('public-database')

        # CSV
        csv_path = os.path.join(
            OUTPUTDIR, 'csv', language, attrib['name'] + '.csv')
        with open(csv_path, 'w', **csv_open_kwargs) as csv_file:
            dw = csv.DictWriter(csv_file, fieldnames)
            dw.writeheader()
            for row in codelist_dicts:
                if is_python2:
                    row = utf8_encode_dict(row)
                dw.writerow(row)

        # Metadata elements without xml:lang count as being written in the
        # codelist's default language, so they are only accepted when that
        # is the language currently being generated.
        lang_predicate = '{}@xml:lang="{}"'.format(
            'not(@xml:lang) or ' if language == default_lang else '',
            language)
        metadata = {}
        for tag in ('name', 'description', 'category'):
            matches = root.xpath(
                '/codelist/metadata/{}[{}]'.format(tag, lang_predicate))
            metadata[tag] = matches[0].text if matches else ''
        # The URL is not language dependent.
        url_elements = root.xpath('/codelist/metadata/url')
        metadata['url'] = url_elements[0].text if url_elements else ''

        # JSON
        json_path = os.path.join(
            OUTPUTDIR, 'json', language, attrib['name'] + '.json')
        with open(json_path, 'w') as json_file:
            json.dump(
                {
                    'attributes': {
                        'name': attrib['name'],
                        'complete': attrib.get('complete'),
                        'embedded': attrib.get('embedded'),
                        'category-codelist': attrib.get('category-codelist'),
                    },
                    'metadata': metadata,
                    'data': codelist_dicts,
                },
                json_file
            )

        codelists_list.append(attrib['name'])
        ET.SubElement(codelists, 'codelist').attrib['ref'] = attrib['name']

    # The index does not depend on the language, so each pass simply
    # overwrites the files written by the previous one.
    tree = ET.ElementTree(codelists)
    tree.write(os.path.join(OUTPUTDIR, 'codelists.xml'), pretty_print=True)
    with open(os.path.join(OUTPUTDIR, 'codelists.json'), 'w') as index_file:
        json.dump(codelists_list, index_file)