Skip to content

Commit

Permalink
Add ·auto_discard_unique· in ModelConfig to determine whether to disc…
Browse files Browse the repository at this point in the history
…ard a variable with only a single value. #37
  • Loading branch information
jackguagua committed Sep 24, 2020
1 parent f69a57a commit ecdd4fe
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 1 deletion.
26 changes: 26 additions & 0 deletions deeptables/eda/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,3 +126,29 @@ def split_seq(iterable, size):
yield item
item = list(itertools.islice(it, size))

def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to narrower dtypes.

    Every column whose dtype is one of int16/int32/int64/float16/float32/
    float64 is inspected; its minimum and maximum decide the narrowest
    candidate dtype whose representable range contains both (limits
    included). Other columns are left untouched.

    NOTE: float downcasting only checks the value *range*. Converting to
    float16/float32 reduces precision, so values may be rounded.

    Args:
        df: pandas DataFrame to shrink. It is modified in place.
        verbose: when true, print the resulting memory usage and the
            percentage saved.

    Returns:
        The same ``df`` object, so the call can be used in an assignment
        or chained.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidates are ordered narrowest first; the first one that fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type).startswith('int'):
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column whose extreme equals the dtype
                # limit (e.g. 127 for int8) still fits that dtype.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # If nothing matched (e.g. min/max are NaN for an empty column)
            # the column keeps its current dtype.
        else:
            # float64 is the fallback; it is also what NaN/inf extremes get,
            # because comparisons against them never satisfy a finite range.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero starting size cannot yield NaN/inf output.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df
3 changes: 3 additions & 0 deletions deeptables/models/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ class ModelConfig(collections.namedtuple('ModelConfig',
'auto_encode_label',
'auto_imputation',
'auto_discrete',
'auto_discard_unique',
'apply_gbm_features',
'gbm_params',
'gbm_feature_type',
Expand Down Expand Up @@ -66,6 +67,7 @@ def __new__(cls,
auto_encode_label=True,
auto_imputation=True,
auto_discrete=False,
auto_discard_unique = True,
apply_gbm_features=False,
gbm_params={},
gbm_feature_type=consts.GBM_FEATURE_TYPE_EMB, # embedding/dense
Expand Down Expand Up @@ -144,6 +146,7 @@ def __new__(cls,
auto_encode_label,
auto_imputation,
auto_discrete,
auto_discard_unique,
apply_gbm_features,
gbm_params,
gbm_feature_type,
Expand Down
3 changes: 2 additions & 1 deletion deeptables/models/preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ def signature(self):
{self.config.categorical_columns}|
{self.config.auto_categorize}|
{self.config.cat_remain_numeric}|
{self.config.auto_discard_unique}|
{self.config.gbm_params}|
{self.config.gbm_feature_type}|
{self.config.fixed_embedding_dim}|
Expand Down Expand Up @@ -293,7 +294,7 @@ def __prepare_features(self, X):
nunique = X[c].nunique()
dtype = str(X[c].dtype)

if nunique <= 1:
if nunique <= 1 and self.config.auto_discard_unique:
continue

if c in self.config.exclude_columns:
Expand Down

0 comments on commit ecdd4fe

Please sign in to comment.