Skip to content

Commit

Permalink
Change from AIC to BIC (#212)
Browse files Browse the repository at this point in the history
* Change from AIC to BIC

* Fix pylint issue

---------

Co-authored-by: Raoul Schram <[email protected]>
Co-authored-by: Erik-Jan van Kesteren <[email protected]>
  • Loading branch information
3 people authored Nov 29, 2023
1 parent 25d4753 commit 4680031
Show file tree
Hide file tree
Showing 7 changed files with 40 additions and 173 deletions.
156 changes: 6 additions & 150 deletions examples/example_gmf_titanic.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
"provenance": {
"created by": {
"name": "metasyn",
"version": "0.6.1.dev32+gd454b49.d20231121"
"version": "0.6.1.dev32+g871b8ec"
},
"creation time": "2023-11-21T12:34:31.732581"
"creation time": "2023-11-21T13:22:03.439633"
},
"vars": [
{
Expand Down Expand Up @@ -70,157 +70,13 @@
"dtype": "Int64",
"prop_missing": 0.19865319865319866,
"distribution": {
"implements": "core.multinoulli",
"implements": "core.discrete_uniform",
"version": "1.0",
"provenance": "builtin",
"class_name": "MultinoulliDistribution",
"class_name": "DiscreteUniformDistribution",
"parameters": {
"labels": [
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22,
23,
24,
25,
26,
27,
28,
29,
30,
31,
32,
33,
34,
35,
36,
37,
38,
39,
40,
41,
42,
43,
44,
45,
46,
47,
48,
49,
50,
51,
52,
53,
54,
55,
56,
57,
58,
59,
60,
61,
62,
63,
64,
65,
66,
70,
71,
74,
80
],
"probs": [
0.0014005602240896359,
0.018207282913165267,
0.014005602240896359,
0.008403361344537815,
0.014005602240896359,
0.0056022408963585435,
0.004201680672268907,
0.004201680672268907,
0.0056022408963585435,
0.011204481792717087,
0.0028011204481792717,
0.0056022408963585435,
0.0014005602240896359,
0.0028011204481792717,
0.00980392156862745,
0.0070028011204481795,
0.023809523809523808,
0.018207282913165267,
0.036414565826330535,
0.0350140056022409,
0.022408963585434174,
0.03361344537815126,
0.037815126050420166,
0.02100840336134454,
0.04481792717086835,
0.03221288515406162,
0.025210084033613446,
0.025210084033613446,
0.037815126050420166,
0.028011204481792718,
0.037815126050420166,
0.023809523809523808,
0.028011204481792718,
0.02100840336134454,
0.022408963585434174,
0.025210084033613446,
0.03221288515406162,
0.008403361344537815,
0.015406162464985995,
0.0196078431372549,
0.02100840336134454,
0.008403361344537815,
0.018207282913165267,
0.0070028011204481795,
0.012605042016806723,
0.01680672268907563,
0.0070028011204481795,
0.012605042016806723,
0.012605042016806723,
0.008403361344537815,
0.014005602240896359,
0.00980392156862745,
0.008403361344537815,
0.0014005602240896359,
0.011204481792717087,
0.0028011204481792717,
0.0070028011204481795,
0.0028011204481792717,
0.0070028011204481795,
0.0028011204481792717,
0.0056022408963585435,
0.004201680672268907,
0.0056022408963585435,
0.0028011204481792717,
0.0028011204481792717,
0.004201680672268907,
0.0014005602240896359,
0.004201680672268907,
0.0028011204481792717,
0.0014005602240896359,
0.0014005602240896359
]
"low": 0,
"high": 81
}
}
},
Expand Down
2 changes: 1 addition & 1 deletion metasyn/distribution/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ def information_criterion(self, values):
return self._information_criterion(vals)

def _information_criterion(self, values):
return 2 * self.n_par - 2 * np.sum(self.dist.logpdf(values))
return np.log(len(values)) * self.n_par - 2 * np.sum(self.dist.logpdf(values))


@metadist(is_unique=True)
Expand Down
2 changes: 1 addition & 1 deletion metasyn/distribution/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def information_criterion(self,
log_lik += count * np.log(pdict.get(lab, 1))
n_parameters = len(self.probs)-1

return 2*n_parameters - 2*log_lik
return np.log(len(series))*n_parameters - 2*log_lik

def _log_like_int(
self,
Expand Down
7 changes: 4 additions & 3 deletions metasyn/distribution/continuous.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,11 @@ def _fit(cls, values):

def _information_criterion(self, values):
if np.any(np.array(values) < self.min_val) or np.any(np.array(values) > self.max_val):
return 2*self.n_par + 100*len(values)
return np.log(len(values))*self.n_par + 100*len(values)
if np.fabs(self.max_val-self.min_val) < 1e-8:
return 2*self.n_par - 100*len(values)
return 2*self.n_par - 2*len(values)*np.log((self.max_val-self.min_val)**-1)
return np.log(len(values))*self.n_par - 100*len(values)
return (np.log(len(values))*self.n_par
- 2*len(values)*np.log((self.max_val-self.min_val)**-1))

@classmethod
def default_distribution(cls):
Expand Down
15 changes: 8 additions & 7 deletions metasyn/distribution/discrete.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def __init__(self, low: int, high: int):
self.dist = self.dist_class(low=low, high=high)

def _information_criterion(self, values):
return 2*self.n_par - 2*np.sum(self.dist.logpmf(values))
return np.log(len(values))*self.n_par - 2*np.sum(self.dist.logpmf(values))

@classmethod
def _fit(cls, values):
Expand Down Expand Up @@ -60,7 +60,7 @@ def __init__(self, mu: float):
self.dist = self.dist_class(mu=mu)

def _information_criterion(self, values):
return 2*self.n_par - 2*np.sum(self.dist.logpmf(values))
return np.log(len(values))*self.n_par - 2*np.sum(self.dist.logpmf(values))

@classmethod
def _fit(cls, values):
Expand Down Expand Up @@ -121,25 +121,26 @@ def draw(self):

def _information_criterion(self, values):
if values.min() < self.low:
return 3+999*len(values)
return 2*np.log(len(values))+999*len(values)

# If the values are not unique the fit is extremely bad.
if len(set(values)) != len(values):
return 3+999*len(values)
return 2*np.log(len(values))+999*len(values)

low = values.min()
high = values.max()+1

if self.consecutive == 1:
# Check if the values are truly consecutive
if len(values) == high-low and np.all(values.to_numpy() == np.arange(low, high)):
return 3
return 3+999*len(values)
return 2*np.log(len(values))
return 2*np.log(len(values))+999*len(values)

n_choice = high - low

# Probabilities go up like 1/n, 1/(n-1), 1/(n-2), ..., 1/2, 1
return 5 - 2*np.sum(np.log(1/np.arange(n_choice, n_choice-len(values), -1)))
return (3*np.log(len(values))
- 2*np.sum(np.log(1/np.arange(n_choice, n_choice-len(values), -1))))

@classmethod
def default_distribution(cls):
Expand Down
15 changes: 10 additions & 5 deletions tests/test_continuous.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
import numpy as np
from scipy import stats
from metasyn.distribution.continuous import UniformDistribution,\
NormalDistribution, LogNormalDistribution, TruncatedNormalDistribution,\
ExponentialDistribution
from pytest import mark
from scipy import stats

from metasyn.distribution.continuous import (
ExponentialDistribution,
LogNormalDistribution,
NormalDistribution,
TruncatedNormalDistribution,
UniformDistribution,
)


@mark.parametrize(
Expand All @@ -20,7 +25,7 @@ def test_uniform(lower_bound, upper_bound):
dist = UniformDistribution.fit(values)
assert dist.min_val <= values.min()
assert dist.max_val >= values.max()
assert dist.information_criterion(values) < 4 - 200*np.log((upper_bound-lower_bound)**-1)
assert dist.information_criterion(values) < 2*np.log(len(values)) - 200*np.log((upper_bound-lower_bound)**-1)
assert isinstance(dist.draw(), float)


Expand Down
16 changes: 10 additions & 6 deletions tests/test_discrete.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
from math import fabs

import numpy as np
import pandas as pd
import polars as pl
import numpy as np
from pytest import mark
from scipy.stats import poisson

from metasyn.distribution.discrete import UniqueKeyDistribution, DiscreteUniformDistribution,\
PoissonDistribution
from pytest import mark
from math import fabs
from metasyn.distribution.discrete import (
DiscreteUniformDistribution,
PoissonDistribution,
UniqueKeyDistribution,
)


@mark.parametrize(
Expand All @@ -29,7 +33,7 @@ def test_uniform(data, series_type):
assert len(drawn_values) == len(series)
drawn_values = np.array(list(drawn_values))
assert np.isclose(dist.information_criterion(drawn_values),
4+2*len(drawn_values)*(np.log(dist.high-dist.low)))
np.log(len(drawn_values))*2+2*len(drawn_values)*(np.log(dist.high-dist.low)))


@mark.parametrize(
Expand Down

0 comments on commit 4680031

Please sign in to comment.