-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess_bloodtests.py
32 lines (22 loc) · 1 KB
/
preprocess_bloodtests.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import os
import numpy as np
import pandas as pd
from config import rider_mapping, DATA_PATH
# Convert the 2018-2019 HbA1c spreadsheet into a CSV indexed by the integer
# rider id obtained from `rider_mapping` (read from / written to `root`).
root = DATA_PATH+'bloodtests/'

# Keys of the two name columns in the three-level column header.
first_name_col = ('TNN HbA1c', 'first name', '')
last_name_col = ('TNN HbA1c', 'last name', '')

# Skip the first sheet row; the next three rows form a three-level header.
df = pd.read_excel(root+'2018-2019 TNN Riders - HbA1c.xlsx', skiprows=1, header=(0, 1, 2))

# Blank the 'Unnamed: ...' third-level labels so the name columns can be
# addressed with '' as their last header level.
df = df.rename(columns={'Unnamed: 0_level_2': '', 'Unnamed: 1_level_2': ''})

# Rows without a first name are dropped; the first name itself is not kept.
df = df.dropna(subset=[first_name_col])
df = df.drop([first_name_col], axis=1)

# clean up name columns: lower-case and keep only the part before the first
# space. The .str accessor propagates NaN, so a missing last name no longer
# raises AttributeError (as the previous `apply(lambda x: x.split(...))` did);
# such rows are removed by the dropna below instead.
last_names = df[last_name_col].str.lower().str.split(' ').str[0]

# anonymize: names absent from rider_mapping become NaN and are dropped
df[last_name_col] = last_names.map(rider_mapping)
df = df.dropna(subset=[last_name_col])

# clean index
df = df.set_index([last_name_col])
df.index.name = 'RIDER'
df.index = df.index.astype(int)
df = df.sort_index()

# 'NV' entries are treated as missing values (presumably "no value" -- confirm
# against the spreadsheet).
df = df.replace({'NV': np.nan})

df.to_csv(root+'HbA1c.csv')