From 4680031e7e9752def4454ffd1fa4e25c4ea821ed Mon Sep 17 00:00:00 2001 From: qubixes <44498096+qubixes@users.noreply.github.com> Date: Wed, 29 Nov 2023 09:48:07 +0100 Subject: [PATCH] Change from AIC to BIC (#212) * Change from AIC to BIC * Fix pylint issue --------- Co-authored-by: Raoul Schram Co-authored-by: Erik-Jan van Kesteren --- examples/example_gmf_titanic.json | 156 ++-------------------------- metasyn/distribution/base.py | 2 +- metasyn/distribution/categorical.py | 2 +- metasyn/distribution/continuous.py | 7 +- metasyn/distribution/discrete.py | 15 +-- tests/test_continuous.py | 15 ++- tests/test_discrete.py | 16 +-- 7 files changed, 40 insertions(+), 173 deletions(-) diff --git a/examples/example_gmf_titanic.json b/examples/example_gmf_titanic.json index dd57069a..a9842a04 100644 --- a/examples/example_gmf_titanic.json +++ b/examples/example_gmf_titanic.json @@ -4,9 +4,9 @@ "provenance": { "created by": { "name": "metasyn", - "version": "0.6.1.dev32+gd454b49.d20231121" + "version": "0.6.1.dev32+g871b8ec" }, - "creation time": "2023-11-21T12:34:31.732581" + "creation time": "2023-11-21T13:22:03.439633" }, "vars": [ { @@ -70,157 +70,13 @@ "dtype": "Int64", "prop_missing": 0.19865319865319866, "distribution": { - "implements": "core.multinoulli", + "implements": "core.discrete_uniform", "version": "1.0", "provenance": "builtin", - "class_name": "MultinoulliDistribution", + "class_name": "DiscreteUniformDistribution", "parameters": { - "labels": [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35, - 36, - 37, - 38, - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 51, - 52, - 53, - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63, - 64, - 65, - 66, - 70, - 71, - 74, - 80 - ], - "probs": [ - 0.0014005602240896359, - 0.018207282913165267, - 0.014005602240896359, - 0.008403361344537815, - 0.014005602240896359, - 0.0056022408963585435, - 0.004201680672268907, - 0.004201680672268907, - 0.0056022408963585435, - 0.011204481792717087, - 0.0028011204481792717, - 0.0056022408963585435, - 0.0014005602240896359, - 0.0028011204481792717, - 0.00980392156862745, - 0.0070028011204481795, - 0.023809523809523808, - 0.018207282913165267, - 0.036414565826330535, - 0.0350140056022409, - 0.022408963585434174, - 0.03361344537815126, - 0.037815126050420166, - 0.02100840336134454, - 0.04481792717086835, - 0.03221288515406162, - 0.025210084033613446, - 0.025210084033613446, - 0.037815126050420166, - 0.028011204481792718, - 0.037815126050420166, - 0.023809523809523808, - 0.028011204481792718, - 0.02100840336134454, - 0.022408963585434174, - 0.025210084033613446, - 0.03221288515406162, - 0.008403361344537815, - 0.015406162464985995, - 0.0196078431372549, - 0.02100840336134454, - 0.008403361344537815, - 0.018207282913165267, - 0.0070028011204481795, - 0.012605042016806723, - 0.01680672268907563, - 0.0070028011204481795, - 0.012605042016806723, - 0.012605042016806723, - 0.008403361344537815, - 0.014005602240896359, - 0.00980392156862745, - 0.008403361344537815, - 0.0014005602240896359, - 0.011204481792717087, - 0.0028011204481792717, - 0.0070028011204481795, - 0.0028011204481792717, - 0.0070028011204481795, - 0.0028011204481792717, - 0.0056022408963585435, - 0.004201680672268907, - 0.0056022408963585435, - 0.0028011204481792717, - 0.0028011204481792717, - 0.004201680672268907, - 0.0014005602240896359, - 0.004201680672268907, - 0.0028011204481792717, - 0.0014005602240896359, - 0.0014005602240896359 - ] + "low": 0, + "high": 81 } } }, diff --git a/metasyn/distribution/base.py b/metasyn/distribution/base.py index d2a261fe..5df23dcc 100644 --- a/metasyn/distribution/base.py +++ b/metasyn/distribution/base.py @@ -265,7 +265,7 @@ def information_criterion(self, values): return self._information_criterion(vals) def _information_criterion(self, values): - return 2 * self.n_par - 2 * np.sum(self.dist.logpdf(values)) + return np.log(len(values)) * self.n_par - 2 * np.sum(self.dist.logpdf(values)) @metadist(is_unique=True) diff --git a/metasyn/distribution/categorical.py b/metasyn/distribution/categorical.py index 4ad4f1bd..3f75181d 100644 --- a/metasyn/distribution/categorical.py +++ b/metasyn/distribution/categorical.py @@ -77,7 +77,7 @@ def information_criterion(self, log_lik += count * np.log(pdict.get(lab, 1)) n_parameters = len(self.probs)-1 - return 2*n_parameters - 2*log_lik + return np.log(len(series))*n_parameters - 2*log_lik def _log_like_int( self, diff --git a/metasyn/distribution/continuous.py b/metasyn/distribution/continuous.py index e42228b7..6ee1949f 100644 --- a/metasyn/distribution/continuous.py +++ b/metasyn/distribution/continuous.py @@ -35,10 +35,11 @@ def _fit(cls, values): def _information_criterion(self, values): if np.any(np.array(values) < self.min_val) or np.any(np.array(values) > self.max_val): - return 2*self.n_par + 100*len(values) + return np.log(len(values))*self.n_par + 100*len(values) if np.fabs(self.max_val-self.min_val) < 1e-8: - return 2*self.n_par - 100*len(values) - return 2*self.n_par - 2*len(values)*np.log((self.max_val-self.min_val)**-1) + return np.log(len(values))*self.n_par - 100*len(values) + return (np.log(len(values))*self.n_par + - 2*len(values)*np.log((self.max_val-self.min_val)**-1)) @classmethod def default_distribution(cls): diff --git a/metasyn/distribution/discrete.py b/metasyn/distribution/discrete.py index 86bc94c9..2b847d14 100644 --- a/metasyn/distribution/discrete.py +++ b/metasyn/distribution/discrete.py @@ -30,7 +30,7 @@ def __init__(self, low: int, high: int): self.dist = self.dist_class(low=low, high=high) def _information_criterion(self, values): - return 2*self.n_par - 2*np.sum(self.dist.logpmf(values)) + return np.log(len(values))*self.n_par - 2*np.sum(self.dist.logpmf(values)) @classmethod def _fit(cls, values): @@ -60,7 +60,7 @@ def __init__(self, mu: float): self.dist = self.dist_class(mu=mu) def _information_criterion(self, values): - return 2*self.n_par - 2*np.sum(self.dist.logpmf(values)) + return np.log(len(values))*self.n_par - 2*np.sum(self.dist.logpmf(values)) @classmethod def _fit(cls, values): @@ -121,11 +121,11 @@ def draw(self): def _information_criterion(self, values): if values.min() < self.low: - return 3+999*len(values) + return 2*np.log(len(values))+999*len(values) # If the values are not unique the fit is extremely bad. if len(set(values)) != len(values): - return 3+999*len(values) + return 2*np.log(len(values))+999*len(values) low = values.min() high = values.max()+1 @@ -133,13 +133,14 @@ def _information_criterion(self, values): if self.consecutive == 1: # Check if the values are truly consecutive if len(values) == high-low and np.all(values.to_numpy() == np.arange(low, high)): - return 3 - return 3+999*len(values) + return 2*np.log(len(values)) + return 2*np.log(len(values))+999*len(values) n_choice = high - low # Probabilities go up like 1/n, 1/(n-1), 1/(n-2), ..., 1/2, 1 - return 5 - 2*np.sum(np.log(1/np.arange(n_choice, n_choice-len(values), -1))) + return (3*np.log(len(values)) + - 2*np.sum(np.log(1/np.arange(n_choice, n_choice-len(values), -1)))) @classmethod def default_distribution(cls): diff --git a/tests/test_continuous.py b/tests/test_continuous.py index 3e236b54..09b5a541 100644 --- a/tests/test_continuous.py +++ b/tests/test_continuous.py @@ -1,9 +1,14 @@ import numpy as np -from scipy import stats -from metasyn.distribution.continuous import UniformDistribution,\ - NormalDistribution, LogNormalDistribution, TruncatedNormalDistribution,\ - ExponentialDistribution from pytest import mark +from scipy import stats + +from metasyn.distribution.continuous import ( + ExponentialDistribution, + LogNormalDistribution, + NormalDistribution, + TruncatedNormalDistribution, + UniformDistribution, +) @mark.parametrize( @@ -20,7 +25,7 @@ def test_uniform(lower_bound, upper_bound): dist = UniformDistribution.fit(values) assert dist.min_val <= values.min() assert dist.max_val >= values.max() - assert dist.information_criterion(values) < 4 - 200*np.log((upper_bound-lower_bound)**-1) + assert dist.information_criterion(values) < 2*np.log(len(values)) - 200*np.log((upper_bound-lower_bound)**-1) assert isinstance(dist.draw(), float) diff --git a/tests/test_discrete.py b/tests/test_discrete.py index 29e64360..65af8ead 100644 --- a/tests/test_discrete.py +++ b/tests/test_discrete.py @@ -1,12 +1,16 @@ +from math import fabs + +import numpy as np import pandas as pd import polars as pl -import numpy as np +from pytest import mark from scipy.stats import poisson -from metasyn.distribution.discrete import UniqueKeyDistribution, DiscreteUniformDistribution,\ - PoissonDistribution -from pytest import mark -from math import fabs +from metasyn.distribution.discrete import ( + DiscreteUniformDistribution, + PoissonDistribution, + UniqueKeyDistribution, +) @mark.parametrize( @@ -29,7 +33,7 @@ def test_uniform(data, series_type): assert len(drawn_values) == len(series) drawn_values = np.array(list(drawn_values)) assert np.isclose(dist.information_criterion(drawn_values), - 4+2*len(drawn_values)*(np.log(dist.high-dist.low))) + np.log(len(drawn_values))*2+2*len(drawn_values)*(np.log(dist.high-dist.low))) @mark.parametrize(