Merge pull request #89 from shakedzy/support_more_correlations

support different correlation methods
shakedzy · Jul 14, 2021 · ae8fbba · ae8fbba
2 parents 51a34b6 + 0be340a
commit ae8fbba
Show file tree

Hide file tree

Showing 7 changed files with 202 additions and 111 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,11 @@
 # Change Log
 
+## 0.6.7
+* `associations` and `compute_associations` now support several numerical-numerical association measures
+  (issue [#84](https://github.com/shakedzy/dython/issues/84))
+* Added a `numerical_columns` option to `associations` and `compute_associations`
+* `roc_graph` is officially removed (replaced with `metric_graph`)
+
 ## 0.6.6
 * Fixed issue where `nan_strategy` affected input data (issue [#82](https://github.com/shakedzy/dython/issues/82))
 * Added `datetime` support to `nominal.associations` (issue [#76](https://github.com/shakedzy/dython/issues/76))

diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-0.6.6
+0.6.7
diff --git a/docs/modules/model_utils.md b/docs/modules/model_utils.md
@@ -277,20 +277,6 @@ true class is 1, with predicted probability of 0.8.
 
 __________________
 
-#### `roc_graph`
-
-`roc_graph(y_true, y_pred, *args, **kwargs)`
-
-Plot a ROC graph of predictor's results (including AUC scores), where each
-row of y_true and y_pred represent a single example.
-
-!!! warning "Note:" 
-
-	The `roc_graph` method is deprecated and will be removed in future versions. 
-    Please use `metric_graph(y_true, y_pred, metric='roc',...)` instead.
-
-__________________
-
 
 #### `random_forest_feature_importance`
 

diff --git a/docs/modules/nominal.md b/docs/modules/nominal.md
@@ -6,7 +6,7 @@ title: nominal
 
 #### `associations`
 
-`associations(dataset, nominal_columns='auto', mark_columns=False, theil_u=False, plot=True, clustering=False, bias_correction=True, nan_strategy=_REPLACE, nan_replace_value=_DEFAULT_REPLACE_VALUE, ax=None, figsize=None, annot=True, fmt='.2f', cmap=None, sv_color='silver', cbar=True, vmax=1.0, vmin=None, title=None, filename=None)`
+`associations(dataset, nominal_columns='auto', numerical_columns=None, mark_columns=False, nom_nom_assoc='cramer', num_num_assoc='pearson', bias_correction=True, nan_strategy=_REPLACE, nan_replace_value=_DEFAULT_REPLACE_VALUE, ax=None, figsize=None, annot=True, fmt='.2f', cmap=None, sv_color='silver', cbar=True, vmax=1.0, vmin=None, plot=True, clustering=False, title=None, filename=None)`
 
 Calculate the correlation/strength-of-association of features in data-set with both categorical and
 continuous features using:
@@ -20,9 +20,21 @@ continuous features using:
 
 - **`nominal_columns`** : `string / list / NumPy ndarray`
 
+    _Default = 'auto'_
+
     Names of columns of the data-set which hold categorical values. Can also be the string 'all' to state that all
     columns are categorical, 'auto' (default) to identify nominal columns automatically, or None to state none are 
-    categorical
+    categorical. Only used if `numerical_columns` is `None`.
+
+- **`numerical_columns`** : `string / list / NumPy ndarray`
+
+    _Default = None_
+
+    To be used instead of `nominal_columns`. Names of columns of the data-set 
+    which hold numerical values. Can also be the string 'all' to state that 
+    all columns are numerical (equivalent to `nominal_columns=None`) or
+    'auto' to try to identify numerical columns (equivalent to 
+    `nominal_columns='auto'`). If `None`, `nominal_columns` is used.
 
 - **`mark_columns`** : `Boolean` 
 
@@ -31,31 +43,31 @@ continuous features using:
     if True, output's columns' names will have a suffix of '(nom)' or '(con)' based on their type (nominal or 
     continuous), as provided by nominal_columns
 
-- **`theil_u`** : `Boolean` 
-
-    _Default: False_
-
-    In the case of categorical-categorical feaures, use Theil's U instead of Cramer's V. If selected, 
-    heat-map rows are the provided information: $U = U(row|col)$. 
-
-- **`plot`** : `Boolean` 
-
-    _Default: True_
+- **`nom_nom_assoc`** : `string`
+
+    _Default = 'cramer'_
 
-    Plot a heat-map of the correlation matrix. If False, heat-map will still be
-    drawn, but not shown. The heat-map's `ax` is part of this function's output. 
-
-    !!! tip "Tip"
+    !!! info "Method signature change"
+        This replaces the `theil_u` flag which was used till version 0.6.6.
+
+    Name of nominal-nominal (categorical-categorical) association to use:
+
+    * `cramer`: Cramer's V
+
+    * `theil`: Theil's U. When selected, heat-map rows are the provided information (meaning: $U = U(row|col)$)
+
+- **`num_num_assoc`** : `string`
 
-        If you have no use of the plot at all, and only require the correlations
-        DataFrame, consider using [`compute_associations`](#compute_associations) instead.
-
-- **`clustering`** : `Boolean` 
-
-    _Default: False_
-
-    If True, the computed associations will be sorted into groups by similar correlations
-
+    _Default = 'pearson'_
+
+    Name of numerical-numerical association to use: 
+
+    * `pearson`: Pearson's R
+
+    * `spearman`: Spearman's R
+
+    * `kendall`: Kendall's Tau
+
 - **`bias_correction`** : `Boolean`
 
      _Default = True_
@@ -129,7 +141,25 @@ continuous features using:
 
     Set heat-map `vmin` option. If set to `None`, `vmin` will be chosen automatically 
     between 0 and -1.0, depending on the types of associations used (-1.0 if Pearson's R 
-    is used, 0 otherwise)  
+    is used, 0 otherwise)
+
+- **`plot`** : `Boolean` 
+
+    _Default: True_
+
+    Plot a heat-map of the correlation matrix. If False, heat-map will still be
+    drawn, but not shown. The heat-map's `ax` is part of this function's output. 
+
+    !!! tip "Tip"
+
+        If you have no use of the plot at all, and only require the correlations
+        DataFrame, consider using [`compute_associations`](#compute_associations) instead.
+
+- **`clustering`** : `Boolean` 
+
+    _Default: False_
+
+    If True, the computed associations will be sorted into groups by similar correlations
 
 - **`title`**: `string` or `None`
 
@@ -185,7 +215,7 @@ __________________
 
 #### `compute_associations`
 
-`compute_associations(dataset, nominal_columns='auto', mark_columns=False, theil_u=False, clustering=False, bias_correction=True, nan_strategy=_REPLACE, nan_replace_value=_DEFAULT_REPLACE_VALUE)`
+`compute_associations(dataset, nominal_columns='auto', numerical_columns=None, mark_columns=False, nom_nom_assoc='cramer', num_num_assoc='pearson', bias_correction=True, nan_strategy=_REPLACE, nan_replace_value=_DEFAULT_REPLACE_VALUE, clustering=False)`
 
 Calculate the correlation/strength-of-association of features in data-set with both categorical and
 continuous features using:
@@ -205,27 +235,53 @@ continuous features using:
 
 - **`nominal_columns`** : `string / list / NumPy ndarray`
 
+    _Default = 'auto'_
+
     Names of columns of the data-set which hold categorical values. Can also be the string 'all' to state that all
-    columns are categorical, 'auto' (default) to identify nominal columns automatically, or None to state none are categorical
+    columns are categorical, 'auto' (default) to identify nominal columns automatically, or None to state none are 
+    categorical. Only used if `numerical_columns` is `None`.
+
+- **`numerical_columns`** : `string / list / NumPy ndarray`
+
+    _Default = None_
 
+    To be used instead of `nominal_columns`. Names of columns of the data-set 
+    which hold numerical values. Can also be the string 'all' to state that 
+    all columns are numerical (equivalent to `nominal_columns=None`) or
+    'auto' to try to identify numerical columns (equivalent to 
+    `nominal_columns='auto'`). If `None`, `nominal_columns` is used.
+
 - **`mark_columns`** : `Boolean` 
 
     _Default: False_
 
     if True, output's columns' names will have a suffix of '(nom)' or '(con)' based on their type (nominal or continuous), as provided by nominal_columns
 
-- **`theil_u`** : `Boolean` 
-
-    _Default: False_
-
-    In the case of categorical-categorical feaures, use Theil's U instead of Cramer's V
-
-- **`clustering`** : `Boolean` 
-
-    _Default: False_
-
-    If True, the computed associations will be sorted into groups by similar correlations
+- **`nom_nom_assoc`** : `string`
+
+    _Default = 'cramer'_
 
+    !!! info "Method signature change"
+        This replaces the `theil_u` flag which was used till version 0.6.6.
+
+    Name of nominal-nominal (categorical-categorical) association to use:
+
+    * `cramer`: Cramer's V
+
+    * `theil`: Theil's U. When selected, the rows of the returned DataFrame are the provided information (meaning: $U = U(row|col)$)
+
+- **`num_num_assoc`** : `string`
+
+    _Default = 'pearson'_
+
+    Name of numerical-numerical association to use: 
+
+    * `pearson`: Pearson's R
+
+    * `spearman`: Spearman's R
+
+    * `kendall`: Kendall's Tau
+
 - **`bias_correction`** : `Boolean`
 
       _Default = True_
@@ -245,6 +301,13 @@ continuous features using:
 
     The value used to replace missing values with. Only applicable when nan_strategy is set to 'replace'
 
+- **`clustering`** : `Boolean` 
+
+    _Default: False_
+
+    If True, the computed associations will be sorted into groups by similar correlations
+
+
 **Returns:** A DataFrame of the correlation/strength-of-association between all features
 
 __________________

diff --git a/dython/examples.py b/dython/examples.py
@@ -87,7 +87,7 @@ def associations_iris_example():
     df = pd.concat([X, y], axis=1)
 
     # Plot features associations
-    return associations(df)
+    return associations(df, num_num_assoc='spearman')
 
 
 def associations_mushrooms_example():
@@ -104,7 +104,7 @@ def associations_mushrooms_example():
                   'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat']
 
     # Plot features associations
-    return associations(df, theil_u=True, figsize=(15, 15))
+    return associations(df, nom_nom_assoc='theil', figsize=(15, 15))
 
 
 def split_hist_example():

diff --git a/dython/model_utils.py b/dython/model_utils.py
@@ -1,4 +1,3 @@
-import warnings
 import numpy as np
 import matplotlib.pyplot as plt
 from sklearn.metrics import roc_curve, precision_recall_curve, auc
@@ -8,7 +7,6 @@
 __all__ = [
     'random_forest_feature_importance',
     'metric_graph',
-    'roc_graph',
     'ks_abc'
 ]
 
@@ -339,16 +337,6 @@ def random_forest_feature_importance(forest, features, precision=4):
                   reverse=True)
 
 
-def roc_graph(y_true, y_pred, *args, **kwargs):
-    """
-    This method is deprecated. Please use `metric_graph(metric='roc',...)`
-    """
-    warnings.warn("The 'roc_graph' method is deprecated and will be removed in future versions. " +
-                  "Please use 'metric_graph(y_true, y_pred, metric='roc',...)' instead.",
-                  DeprecationWarning)
-    return metric_graph(y_true, y_pred, 'roc', *args, **kwargs)
-
-
 def ks_abc(y_true, y_pred, ax=None, figsize=None, colors=('darkorange', 'b'), title=None, xlim=(0.,1.), ylim=(0.,1.),
            fmt='.2f', lw=2, legend='best', plot=True, filename=None):
     """