-
Notifications
You must be signed in to change notification settings - Fork 1
/
convert_csv_to_json.py
77 lines (64 loc) · 2.32 KB
/
convert_csv_to_json.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import pandas as pd
import json
import collections
# Raw GitHub URL of the Titanic passenger CSV that this script converts.
data_location = (
    'https://raw.githubusercontent.com/Geoyi/Cleaning-Titanic-Data/'
    'master/titanic_clean.csv'
)
def _null_if_missing(value):
    """
    Maps a missing cell value to None so it is serialized as JSON null.
    :param value:
        scalar cell value taken from a passenger row
    :return:
        None if the value is the string "nan" or a pandas missing value
        (NaN / None / NA), otherwise the value unchanged
    """
    if isinstance(value, str):
        # nulls may arrive as the literal string "nan"
        return None if value == 'nan' else value
    # ...or as a real missing value, which json.dumps would otherwise
    # emit as the non-standard token NaN
    return None if pd.isna(value) else value


def transform_to_json(row: pd.Series) -> str:
    """
    Transforms a row of passenger data into a JSON-formatted string.

    The flat row is regrouped into nested objects, in this key order:
    name, demographics, family, geography, ticket, survival.
    Missing 'cabin', 'boat' and 'body' values are written as JSON null;
    every other field is passed through unchanged.
    :param row:
        pd.Series containing all data for a passenger
    :return:
        JSON-formatted string of the data row
    :raises KeyError:
        if the row lacks one of the expected columns
    """
    passenger_dict = collections.OrderedDict()
    passenger_dict['name'] = row['name']

    passenger_dict['demographics'] = collections.OrderedDict([
        ('sex', row['sex']),
        ('age', row['age']),
    ])

    passenger_dict['family'] = collections.OrderedDict([
        ('sibsp', row['sibsp']),
        ('parch', row['parch']),
    ])

    passenger_dict['geography'] = collections.OrderedDict([
        ('embarked', row['embarked']),
        ('home.dest', row['home.dest']),
    ])

    passenger_dict['ticket'] = collections.OrderedDict([
        ('ticket', row['ticket']),
        ('pclass', row['pclass']),
        ('has_cabin_number', row['has_cabin_number']),
        ('cabin', _null_if_missing(row['cabin'])),
        ('fare', row['fare']),
    ])

    passenger_dict['survival'] = collections.OrderedDict([
        ('survived', row['survived']),
        ('boat', _null_if_missing(row['boat'])),
        ('body', _null_if_missing(row['body'])),
    ])

    return json.dumps(passenger_dict)
# skip the last row of data in the file, which doesn't relate to a passenger.
# also read a few problematic fields as strings
titanic_passengers = pd.read_csv(
    data_location,
    skipfooter=1,
    # skipfooter is not supported by the default C parser; naming the
    # python engine explicitly avoids the fallback ParserWarning
    engine='python',
    dtype={
        "cabin": str,
        "boat": str,
        "body": str
    }
)

# build one JSON document per passenger row
titanic_passengers['json'] = titanic_passengers.apply(transform_to_json, axis=1)

# write the documents one per line, with a trailing newline after the last;
# the context manager closes the file even if a write fails
with open('titanic.json', 'w') as json_file:
    json_file.write(titanic_passengers['json'].str.cat(sep='\n'))
    json_file.write('\n')