-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathwrap_up.py
95 lines (69 loc) · 3.04 KB
/
wrap_up.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# coding:utf-8
from __future__ import division
import numpy as np
import pandas as pd
from scipy import stats
from datetime import datetime
def fill_fre_top_5(x):
    """Return the first five entries of ``x`` as a length-5 float array.

    Inputs shorter than five entries are right-padded with NaN so that every
    column contributes a row of identical width to the frequency tables.
    Inputs longer than five entries are truncated to their first five
    (previously such inputs produced no usable array).

    Args:
        x: one-dimensional sequence of values that can be stored as floats.

    Returns:
        numpy.ndarray of shape ``(5,)`` and float dtype.
    """
    head = np.asarray(x)[:5]
    padded = np.full(5, np.nan)
    padded[:len(head)] = head
    return padded
# Number of most-frequent values reported per column.
_EDA_TOP_K = 5
# Percentile points reported per column and the matching output labels.
_EDA_QUANTILE_POINTS = (1, 5, 25, 50, 75, 95, 99)
_EDA_QUANTILE_LABELS = ('quan01', 'quan05', 'quan25', 'quan50',
                        'quan75', 'quan95', 'quan99')
_EDA_VALUE_LABELS = ('value1', 'value2', 'value3', 'value4', 'value5')
_EDA_FREQ_LABELS = ('freq1', 'freq2', 'freq3', 'freq4', 'freq5')
# Column order of the summary frame returned by eda_analysis.
_EDA_SUMMARY_COLUMNS = (
    ['count', 'count_zero', 'mean', 'median', 'mode',
     'mode_count', 'mode_perct', 'min', 'max']
    + list(_EDA_VALUE_LABELS) + list(_EDA_FREQ_LABELS)
    + ['freq_miss'] + list(_EDA_QUANTILE_LABELS)
)


def _summarize_column(col, miss_values, n_rows):
    """Compute every summary statistic for a single column.

    Args:
        col: pandas Series holding one column of the analysed frame.
        miss_values: list of sentinel values that are treated as missing.
        n_rows: number of rows of the frame; denominator of ``mode_perct``.

    Returns:
        dict mapping each statistic name to its value for this column.
    """
    # Series.isin matches NaN against NaN. np.isin compares with ``==`` and
    # therefore never recognises NaN as one of the missing sentinels.
    is_missing = col.isin(miss_values)
    valid = col[~is_missing]
    values = valid.values

    row = {
        # (1) Count distinct (missing sentinels count as values here)
        'count': len(col.unique()),
        # (2) Zero values
        'count_zero': int((col == 0).sum()),
        # (3) Mean, (7) Min, (8) Max -- the pandas reductions return NaN for
        # an empty selection instead of raising.
        'mean': valid.mean(),
        'min': valid.min(),
        'max': valid.max(),
        # (11) Missing values
        'freq_miss': int(is_missing.sum()),
    }

    if values.size:
        # (4) Median
        row['median'] = np.median(values)
        # (5) Mode and (6) its count in a single pass. np.unique sorts
        # ascending and argmax returns the first maximum, so ties resolve to
        # the smallest value.
        distinct, occurrences = np.unique(values, return_counts=True)
        winner = occurrences.argmax()
        row['mode'] = distinct[winner]
        row['mode_count'] = int(occurrences[winner])
        # (9) Quantile values
        quantiles = np.percentile(values, _EDA_QUANTILE_POINTS)
    else:
        # Column consists solely of missing values: nothing to aggregate.
        row['median'] = np.nan
        row['mode'] = np.nan
        row['mode_count'] = 0
        quantiles = np.full(len(_EDA_QUANTILE_POINTS), np.nan)

    # (6) Mode percentage relative to all rows, missing ones included.
    row['mode_perct'] = row['mode_count'] / float(n_rows) if n_rows else np.nan
    row.update(zip(_EDA_QUANTILE_LABELS, quantiles))

    # (10) Frequent values: top-k values and their counts, NaN-padded so every
    # column yields the same number of fields.
    top = valid.value_counts().iloc[:_EDA_TOP_K]
    top_values = np.full(_EDA_TOP_K, np.nan)
    top_freqs = np.full(_EDA_TOP_K, np.nan)
    top_values[:len(top)] = top.index.values
    top_freqs[:len(top)] = top.values
    row.update(zip(_EDA_VALUE_LABELS, top_values))
    row.update(zip(_EDA_FREQ_LABELS, top_freqs))
    return row


def eda_analysis(missSet=None, df=None):
    """Build a per-column exploratory summary of a numeric DataFrame.

    Values listed in ``missSet`` (NaN included) are treated as missing: they
    are excluded from mean, median, mode, min, max, quantiles and the
    frequent-value table, and counted in ``freq_miss``. ``count`` (distinct
    values) and ``count_zero`` are computed on the raw column.

    Args:
        missSet: list-like (or single scalar) of sentinel values that mark a
            missing entry. Defaults to ``[np.nan, 9999999999, -999999]``.
        df: pandas DataFrame to summarise; columns are expected to be numeric.

    Returns:
        pandas DataFrame indexed by the columns of ``df`` with the columns
        count, count_zero, mean, median, mode, mode_count, mode_perct, min,
        max, value1..value5, freq1..freq5, freq_miss and
        quan01/05/25/50/75/95/99.

    Raises:
        TypeError: if ``df`` is not supplied.
    """
    if df is None:
        raise TypeError("eda_analysis() requires a pandas DataFrame for 'df'")

    # Resolve the default here rather than in the signature so no mutable
    # object is shared between calls.
    if missSet is None:
        miss_values = [np.nan, 9999999999, -999999]
    elif np.isscalar(missSet):
        miss_values = [missSet]
    else:
        miss_values = list(missSet)

    n_rows = df.shape[0]
    rows = [_summarize_column(col, miss_values, n_rows)
            for _, col in df.items()]

    # (12) Combine all information, one row per input column.
    return pd.DataFrame(rows, index=df.columns, columns=_EDA_SUMMARY_COLUMNS)