Add `auto_discard_unique` in ModelConfig to determine whether to discard a variable with only a single value. #37
DataCanvasIO · Sep 24, 2020 · ecdd4fe · ecdd4fe
1 parent f69a57a
commit ecdd4fe
Show file tree

Hide file tree

Showing 3 changed files with 31 additions and 1 deletion.
diff --git a/deeptables/eda/utils.py b/deeptables/eda/utils.py
@@ -126,3 +126,29 @@ def split_seq(iterable, size):
         yield item
         item = list(itertools.islice(it, size))
 
+def reduce_mem_usage(df, verbose=True):
+    """Downcast numeric columns of ``df`` in place to the smallest dtype that holds their values.
+
+    Integer columns try int8/int16/int32/int64, float columns float16/float32/float64; the first
+    type whose inclusive [min, max] range contains the column's min and max is used. A column whose
+    min/max fit no candidate (e.g. empty or all-NaN) is left as is. Narrower floats lose precision.
+    :param df: pandas DataFrame; modified in place, nothing is returned.
+    :param verbose: when true, print the final memory usage in Mb and the percentage saved.
+    """
+    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
+    start_mem = df.memory_usage().sum() / 1024**2
+    for col in df.columns:
+        col_type = df[col].dtypes
+        if col_type not in numerics:
+            continue
+        is_int = str(col_type)[:3] == 'int'
+        info = np.iinfo if is_int else np.finfo
+        candidates = (np.int8, np.int16, np.int32, np.int64) if is_int else (np.float16, np.float32, np.float64)
+        c_min, c_max = df[col].min(), df[col].max()
+        # Inclusive bounds: a value equal to a type's limit still fits; NaN min/max match nothing.
+        fitting = [t for t in candidates if info(t).min <= c_min and c_max <= info(t).max]
+        if fitting:
+            df[col] = df[col].astype(fitting[0])
+    end_mem = df.memory_usage().sum() / 1024**2
+    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0  # no ZeroDivisionError on a 0-byte frame
+    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
diff --git a/deeptables/models/config.py b/deeptables/models/config.py
@@ -20,6 +20,7 @@ class ModelConfig(collections.namedtuple('ModelConfig',
                                           'auto_encode_label',
                                           'auto_imputation',
                                           'auto_discrete',
+                                          'auto_discard_unique',
                                           'apply_gbm_features',
                                           'gbm_params',
                                           'gbm_feature_type',
@@ -66,6 +67,7 @@ def __new__(cls,
                 auto_encode_label=True,
                 auto_imputation=True,
                 auto_discrete=False,
+                auto_discard_unique = True,
                 apply_gbm_features=False,
                 gbm_params={},
                 gbm_feature_type=consts.GBM_FEATURE_TYPE_EMB,  # embedding/dense
@@ -144,6 +146,7 @@ def __new__(cls,
                                                auto_encode_label,
                                                auto_imputation,
                                                auto_discrete,
+                                               auto_discard_unique,
                                                apply_gbm_features,
                                                gbm_params,
                                                gbm_feature_type,

diff --git a/deeptables/models/preprocessor.py b/deeptables/models/preprocessor.py
@@ -59,6 +59,7 @@ def signature(self):
 {self.config.categorical_columns}|
 {self.config.auto_categorize}|
 {self.config.cat_remain_numeric}|
+{self.config.auto_discard_unique}|
 {self.config.gbm_params}|
 {self.config.gbm_feature_type}|
 {self.config.fixed_embedding_dim}|
@@ -293,7 +294,7 @@ def __prepare_features(self, X):
             nunique = X[c].nunique()
             dtype = str(X[c].dtype)
 
-            if nunique <= 1:
+            if nunique <= 1 and self.config.auto_discard_unique:
                 continue
 
             if c in self.config.exclude_columns: