-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscale_data.py
40 lines (27 loc) · 1.15 KB
/
scale_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import numpy as np
import pandas as pd
import os.path as osp
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from utils import set_seeds
# Base directory of the project; './' is resolved against the current working
# directory at the time a path built from it is used.
PATH_BASE = './'
# Directory from which main() reads its input CSV file.
PATH_DATA = osp.join(PATH_BASE, 'data')
def main(file_name):
    """Load a CSV from ``PATH_DATA``, clean it, and print a summary.

    Steps, in order:

    1. Read ``PATH_DATA/file_name`` with ``pandas.read_csv``.
    2. If ``file_name`` contains ``'train'``, discard rows whose
       ``PR_Allred_score`` is greater than 8 or whose ``ER``,
       ``T_category`` or ``HER2`` value is missing.
    3. Drop the columns ``DCIS_or_LCIS_type``, ``HER2_SISH``,
       ``HER2_SISH_ratio`` and ``BRCA_mutation`` and renumber the index.
    4. Truncate every non-missing ``KI-67_LI_percent`` value to an integer.
    5. Replace all remaining missing values with ``-1``.
    6. Keep only the columns ``ID``, ``img_path``, ``mask_path``, ``나이``
       and ``수술연월일`` and print ``DataFrame.info()`` for the result.

    Args:
        file_name: Name of the CSV file, relative to ``PATH_DATA``. The
            substring ``'train'`` in the name enables the row filtering
            of step 2.

    Returns:
        None. The summary is written to standard output.

    Raises:
        FileNotFoundError: If the CSV file does not exist.
        KeyError: If one of the columns accessed above is absent.
    """
    df = pd.read_csv(osp.join(PATH_DATA, file_name))

    if 'train' in file_name:
        # One boolean mask instead of collecting index labels column by
        # column. ``NaN > 8`` is False, so rows with a missing
        # PR_Allred_score are kept, exactly as before.
        # NOTE(review): scores above 8 are presumably treated as invalid
        # entries (the Allred scale tops out at 8) - confirm with the
        # data owner.
        invalid_rows = (
            (df['PR_Allred_score'] > 8)
            | df['ER'].isna()
            | df['T_category'].isna()
            | df['HER2'].isna()
        )
        df = df.loc[~invalid_rows]

    drop_col = ['DCIS_or_LCIS_type', 'HER2_SISH', 'HER2_SISH_ratio', 'BRCA_mutation']
    df = df.drop(columns=drop_col).reset_index(drop=True)

    # int() truncates toward zero; NaN is passed through untouched because
    # int(NaN) would raise.
    df['KI-67_LI_percent'] = df['KI-67_LI_percent'].apply(
        lambda x: x if np.isnan(x) else int(x)
    )
    # -1 is the sentinel for "missing" in every remaining column.
    df = df.fillna(-1)

    # '나이' is Korean for "age"; '수술연월일' for "date of surgery".
    df = df[['ID', 'img_path', 'mask_path', '나이', '수술연월일']]

    # DataFrame.info() prints the summary itself and returns None, so it is
    # not wrapped in print() (that would emit an extra "None" line).
    df.info()
if __name__ == '__main__':
    # Script entry point: runs only when the file is executed directly.
    # set_seeds comes from the project's utils module; presumably it seeds
    # the random number generators for reproducibility - verify there.
    set_seeds(42)
    main('train_heuristic_5fold.csv')