From 4680031e7e9752def4454ffd1fa4e25c4ea821ed Mon Sep 17 00:00:00 2001
From: qubixes <44498096+qubixes@users.noreply.github.com>
Date: Wed, 29 Nov 2023 09:48:07 +0100
Subject: [PATCH] Change from AIC to BIC (#212)

* Change from AIC to BIC

* Fix pylint issue

---------

Co-authored-by: Raoul Schram <r.d.schram@uu.nl>
Co-authored-by: Erik-Jan van Kesteren <erikjanvankesteren@pm.me>
---
 examples/example_gmf_titanic.json   | 156 ++--------------------------
 metasyn/distribution/base.py        |   2 +-
 metasyn/distribution/categorical.py |   2 +-
 metasyn/distribution/continuous.py  |   7 +-
 metasyn/distribution/discrete.py    |  15 +--
 tests/test_continuous.py            |  15 ++-
 tests/test_discrete.py              |  16 +--
 7 files changed, 40 insertions(+), 173 deletions(-)

diff --git a/examples/example_gmf_titanic.json b/examples/example_gmf_titanic.json
index dd57069a..a9842a04 100644
--- a/examples/example_gmf_titanic.json
+++ b/examples/example_gmf_titanic.json
@@ -4,9 +4,9 @@
     "provenance": {
         "created by": {
             "name": "metasyn",
-            "version": "0.6.1.dev32+gd454b49.d20231121"
+            "version": "0.6.1.dev32+g871b8ec"
         },
-        "creation time": "2023-11-21T12:34:31.732581"
+        "creation time": "2023-11-21T13:22:03.439633"
     },
     "vars": [
         {
@@ -70,157 +70,13 @@
             "dtype": "Int64",
             "prop_missing": 0.19865319865319866,
             "distribution": {
-                "implements": "core.multinoulli",
+                "implements": "core.discrete_uniform",
                 "version": "1.0",
                 "provenance": "builtin",
-                "class_name": "MultinoulliDistribution",
+                "class_name": "DiscreteUniformDistribution",
                 "parameters": {
-                    "labels": [
-                        0,
-                        1,
-                        2,
-                        3,
-                        4,
-                        5,
-                        6,
-                        7,
-                        8,
-                        9,
-                        10,
-                        11,
-                        12,
-                        13,
-                        14,
-                        15,
-                        16,
-                        17,
-                        18,
-                        19,
-                        20,
-                        21,
-                        22,
-                        23,
-                        24,
-                        25,
-                        26,
-                        27,
-                        28,
-                        29,
-                        30,
-                        31,
-                        32,
-                        33,
-                        34,
-                        35,
-                        36,
-                        37,
-                        38,
-                        39,
-                        40,
-                        41,
-                        42,
-                        43,
-                        44,
-                        45,
-                        46,
-                        47,
-                        48,
-                        49,
-                        50,
-                        51,
-                        52,
-                        53,
-                        54,
-                        55,
-                        56,
-                        57,
-                        58,
-                        59,
-                        60,
-                        61,
-                        62,
-                        63,
-                        64,
-                        65,
-                        66,
-                        70,
-                        71,
-                        74,
-                        80
-                    ],
-                    "probs": [
-                        0.0014005602240896359,
-                        0.018207282913165267,
-                        0.014005602240896359,
-                        0.008403361344537815,
-                        0.014005602240896359,
-                        0.0056022408963585435,
-                        0.004201680672268907,
-                        0.004201680672268907,
-                        0.0056022408963585435,
-                        0.011204481792717087,
-                        0.0028011204481792717,
-                        0.0056022408963585435,
-                        0.0014005602240896359,
-                        0.0028011204481792717,
-                        0.00980392156862745,
-                        0.0070028011204481795,
-                        0.023809523809523808,
-                        0.018207282913165267,
-                        0.036414565826330535,
-                        0.0350140056022409,
-                        0.022408963585434174,
-                        0.03361344537815126,
-                        0.037815126050420166,
-                        0.02100840336134454,
-                        0.04481792717086835,
-                        0.03221288515406162,
-                        0.025210084033613446,
-                        0.025210084033613446,
-                        0.037815126050420166,
-                        0.028011204481792718,
-                        0.037815126050420166,
-                        0.023809523809523808,
-                        0.028011204481792718,
-                        0.02100840336134454,
-                        0.022408963585434174,
-                        0.025210084033613446,
-                        0.03221288515406162,
-                        0.008403361344537815,
-                        0.015406162464985995,
-                        0.0196078431372549,
-                        0.02100840336134454,
-                        0.008403361344537815,
-                        0.018207282913165267,
-                        0.0070028011204481795,
-                        0.012605042016806723,
-                        0.01680672268907563,
-                        0.0070028011204481795,
-                        0.012605042016806723,
-                        0.012605042016806723,
-                        0.008403361344537815,
-                        0.014005602240896359,
-                        0.00980392156862745,
-                        0.008403361344537815,
-                        0.0014005602240896359,
-                        0.011204481792717087,
-                        0.0028011204481792717,
-                        0.0070028011204481795,
-                        0.0028011204481792717,
-                        0.0070028011204481795,
-                        0.0028011204481792717,
-                        0.0056022408963585435,
-                        0.004201680672268907,
-                        0.0056022408963585435,
-                        0.0028011204481792717,
-                        0.0028011204481792717,
-                        0.004201680672268907,
-                        0.0014005602240896359,
-                        0.004201680672268907,
-                        0.0028011204481792717,
-                        0.0014005602240896359,
-                        0.0014005602240896359
-                    ]
+                    "low": 0,
+                    "high": 81
                 }
             }
         },
diff --git a/metasyn/distribution/base.py b/metasyn/distribution/base.py
index d2a261fe..5df23dcc 100644
--- a/metasyn/distribution/base.py
+++ b/metasyn/distribution/base.py
@@ -265,7 +265,7 @@ def information_criterion(self, values):
         return self._information_criterion(vals)
 
     def _information_criterion(self, values):
-        return 2 * self.n_par - 2 * np.sum(self.dist.logpdf(values))
+        return np.log(len(values)) * self.n_par - 2 * np.sum(self.dist.logpdf(values))
 
 
 @metadist(is_unique=True)
diff --git a/metasyn/distribution/categorical.py b/metasyn/distribution/categorical.py
index 4ad4f1bd..3f75181d 100644
--- a/metasyn/distribution/categorical.py
+++ b/metasyn/distribution/categorical.py
@@ -77,7 +77,7 @@ def information_criterion(self,
                 log_lik += count * np.log(pdict.get(lab, 1))
             n_parameters = len(self.probs)-1
 
-        return 2*n_parameters - 2*log_lik
+        return np.log(len(series))*n_parameters - 2*log_lik
 
     def _log_like_int(
             self,
diff --git a/metasyn/distribution/continuous.py b/metasyn/distribution/continuous.py
index e42228b7..6ee1949f 100644
--- a/metasyn/distribution/continuous.py
+++ b/metasyn/distribution/continuous.py
@@ -35,10 +35,11 @@ def _fit(cls, values):
 
     def _information_criterion(self, values):
         if np.any(np.array(values) < self.min_val) or np.any(np.array(values) > self.max_val):
-            return 2*self.n_par + 100*len(values)
+            return np.log(len(values))*self.n_par + 100*len(values)
         if np.fabs(self.max_val-self.min_val) < 1e-8:
-            return 2*self.n_par - 100*len(values)
-        return 2*self.n_par - 2*len(values)*np.log((self.max_val-self.min_val)**-1)
+            return np.log(len(values))*self.n_par - 100*len(values)
+        return (np.log(len(values))*self.n_par
+                - 2*len(values)*np.log((self.max_val-self.min_val)**-1))
 
     @classmethod
     def default_distribution(cls):
diff --git a/metasyn/distribution/discrete.py b/metasyn/distribution/discrete.py
index 86bc94c9..2b847d14 100644
--- a/metasyn/distribution/discrete.py
+++ b/metasyn/distribution/discrete.py
@@ -30,7 +30,7 @@ def __init__(self, low: int, high: int):
         self.dist = self.dist_class(low=low, high=high)
 
     def _information_criterion(self, values):
-        return 2*self.n_par - 2*np.sum(self.dist.logpmf(values))
+        return np.log(len(values))*self.n_par - 2*np.sum(self.dist.logpmf(values))
 
     @classmethod
     def _fit(cls, values):
@@ -60,7 +60,7 @@ def __init__(self, mu: float):
         self.dist = self.dist_class(mu=mu)
 
     def _information_criterion(self, values):
-        return 2*self.n_par - 2*np.sum(self.dist.logpmf(values))
+        return np.log(len(values))*self.n_par - 2*np.sum(self.dist.logpmf(values))
 
     @classmethod
     def _fit(cls, values):
@@ -121,11 +121,11 @@ def draw(self):
 
     def _information_criterion(self, values):
         if values.min() < self.low:
-            return 3+999*len(values)
+            return 2*np.log(len(values))+999*len(values)
 
         # If the values are not unique the fit is extremely bad.
         if len(set(values)) != len(values):
-            return 3+999*len(values)
+            return 2*np.log(len(values))+999*len(values)
 
         low = values.min()
         high = values.max()+1
@@ -133,13 +133,14 @@ def _information_criterion(self, values):
         if self.consecutive == 1:
             # Check if the values are truly consecutive
             if len(values) == high-low and np.all(values.to_numpy() == np.arange(low, high)):
-                return 3
-            return 3+999*len(values)
+                return 2*np.log(len(values))
+            return 2*np.log(len(values))+999*len(values)
 
         n_choice = high - low
 
         # Probabilities go up like 1/n, 1/(n-1), 1/(n-2), ..., 1/2, 1
-        return 5 - 2*np.sum(np.log(1/np.arange(n_choice, n_choice-len(values), -1)))
+        return (3*np.log(len(values))
+                - 2*np.sum(np.log(1/np.arange(n_choice, n_choice-len(values), -1))))
 
     @classmethod
     def default_distribution(cls):
diff --git a/tests/test_continuous.py b/tests/test_continuous.py
index 3e236b54..09b5a541 100644
--- a/tests/test_continuous.py
+++ b/tests/test_continuous.py
@@ -1,9 +1,14 @@
 import numpy as np
-from scipy import stats
-from metasyn.distribution.continuous import UniformDistribution,\
-    NormalDistribution, LogNormalDistribution, TruncatedNormalDistribution,\
-    ExponentialDistribution
 from pytest import mark
+from scipy import stats
+
+from metasyn.distribution.continuous import (
+    ExponentialDistribution,
+    LogNormalDistribution,
+    NormalDistribution,
+    TruncatedNormalDistribution,
+    UniformDistribution,
+)
 
 
 @mark.parametrize(
@@ -20,7 +25,7 @@ def test_uniform(lower_bound, upper_bound):
     dist = UniformDistribution.fit(values)
     assert dist.min_val <= values.min()
     assert dist.max_val >= values.max()
-    assert dist.information_criterion(values) < 4 - 200*np.log((upper_bound-lower_bound)**-1)
+    assert dist.information_criterion(values) < 2*np.log(len(values)) - 200*np.log((upper_bound-lower_bound)**-1)
     assert isinstance(dist.draw(), float)
 
 
diff --git a/tests/test_discrete.py b/tests/test_discrete.py
index 29e64360..65af8ead 100644
--- a/tests/test_discrete.py
+++ b/tests/test_discrete.py
@@ -1,12 +1,16 @@
+from math import fabs
+
+import numpy as np
 import pandas as pd
 import polars as pl
-import numpy as np
+from pytest import mark
 from scipy.stats import poisson
 
-from metasyn.distribution.discrete import UniqueKeyDistribution, DiscreteUniformDistribution,\
-    PoissonDistribution
-from pytest import mark
-from math import fabs
+from metasyn.distribution.discrete import (
+    DiscreteUniformDistribution,
+    PoissonDistribution,
+    UniqueKeyDistribution,
+)
 
 
 @mark.parametrize(
@@ -29,7 +33,7 @@ def test_uniform(data, series_type):
         assert len(drawn_values) == len(series)
     drawn_values = np.array(list(drawn_values))
     assert np.isclose(dist.information_criterion(drawn_values),
-                      4+2*len(drawn_values)*(np.log(dist.high-dist.low)))
+                      np.log(len(drawn_values))*2+2*len(drawn_values)*(np.log(dist.high-dist.low)))
 
 
 @mark.parametrize(