Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change from AIC to BIC #212

Merged
merged 3 commits on
Nov 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
156 changes: 6 additions & 150 deletions examples/example_gmf_titanic.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
"provenance": {
"created by": {
"name": "metasyn",
"version": "0.6.1.dev32+gd454b49.d20231121"
"version": "0.6.1.dev32+g871b8ec"
},
"creation time": "2023-11-21T12:34:31.732581"
"creation time": "2023-11-21T13:22:03.439633"
},
"vars": [
{
Expand Down Expand Up @@ -70,157 +70,13 @@
"dtype": "Int64",
"prop_missing": 0.19865319865319866,
"distribution": {
"implements": "core.multinoulli",
"implements": "core.discrete_uniform",
"version": "1.0",
"provenance": "builtin",
"class_name": "MultinoulliDistribution",
"class_name": "DiscreteUniformDistribution",
"parameters": {
"labels": [
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22,
23,
24,
25,
26,
27,
28,
29,
30,
31,
32,
33,
34,
35,
36,
37,
38,
39,
40,
41,
42,
43,
44,
45,
46,
47,
48,
49,
50,
51,
52,
53,
54,
55,
56,
57,
58,
59,
60,
61,
62,
63,
64,
65,
66,
70,
71,
74,
80
],
"probs": [
0.0014005602240896359,
0.018207282913165267,
0.014005602240896359,
0.008403361344537815,
0.014005602240896359,
0.0056022408963585435,
0.004201680672268907,
0.004201680672268907,
0.0056022408963585435,
0.011204481792717087,
0.0028011204481792717,
0.0056022408963585435,
0.0014005602240896359,
0.0028011204481792717,
0.00980392156862745,
0.0070028011204481795,
0.023809523809523808,
0.018207282913165267,
0.036414565826330535,
0.0350140056022409,
0.022408963585434174,
0.03361344537815126,
0.037815126050420166,
0.02100840336134454,
0.04481792717086835,
0.03221288515406162,
0.025210084033613446,
0.025210084033613446,
0.037815126050420166,
0.028011204481792718,
0.037815126050420166,
0.023809523809523808,
0.028011204481792718,
0.02100840336134454,
0.022408963585434174,
0.025210084033613446,
0.03221288515406162,
0.008403361344537815,
0.015406162464985995,
0.0196078431372549,
0.02100840336134454,
0.008403361344537815,
0.018207282913165267,
0.0070028011204481795,
0.012605042016806723,
0.01680672268907563,
0.0070028011204481795,
0.012605042016806723,
0.012605042016806723,
0.008403361344537815,
0.014005602240896359,
0.00980392156862745,
0.008403361344537815,
0.0014005602240896359,
0.011204481792717087,
0.0028011204481792717,
0.0070028011204481795,
0.0028011204481792717,
0.0070028011204481795,
0.0028011204481792717,
0.0056022408963585435,
0.004201680672268907,
0.0056022408963585435,
0.0028011204481792717,
0.0028011204481792717,
0.004201680672268907,
0.0014005602240896359,
0.004201680672268907,
0.0028011204481792717,
0.0014005602240896359,
0.0014005602240896359
]
"low": 0,
"high": 81
}
}
},
Expand Down
2 changes: 1 addition & 1 deletion metasyn/distribution/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ def information_criterion(self, values):
return self._information_criterion(vals)

def _information_criterion(self, values):
return 2 * self.n_par - 2 * np.sum(self.dist.logpdf(values))
return np.log(len(values)) * self.n_par - 2 * np.sum(self.dist.logpdf(values))


@metadist(is_unique=True)
Expand Down
2 changes: 1 addition & 1 deletion metasyn/distribution/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def information_criterion(self,
log_lik += count * np.log(pdict.get(lab, 1))
n_parameters = len(self.probs)-1

return 2*n_parameters - 2*log_lik
return np.log(len(series))*n_parameters - 2*log_lik

def _log_like_int(
self,
Expand Down
7 changes: 4 additions & 3 deletions metasyn/distribution/continuous.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,11 @@ def _fit(cls, values):

def _information_criterion(self, values):
if np.any(np.array(values) < self.min_val) or np.any(np.array(values) > self.max_val):
return 2*self.n_par + 100*len(values)
return np.log(len(values))*self.n_par + 100*len(values)
if np.fabs(self.max_val-self.min_val) < 1e-8:
return 2*self.n_par - 100*len(values)
return 2*self.n_par - 2*len(values)*np.log((self.max_val-self.min_val)**-1)
return np.log(len(values))*self.n_par - 100*len(values)
return (np.log(len(values))*self.n_par
- 2*len(values)*np.log((self.max_val-self.min_val)**-1))

@classmethod
def default_distribution(cls):
Expand Down
15 changes: 8 additions & 7 deletions metasyn/distribution/discrete.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def __init__(self, low: int, high: int):
self.dist = self.dist_class(low=low, high=high)

def _information_criterion(self, values):
return 2*self.n_par - 2*np.sum(self.dist.logpmf(values))
return np.log(len(values))*self.n_par - 2*np.sum(self.dist.logpmf(values))

@classmethod
def _fit(cls, values):
Expand Down Expand Up @@ -60,7 +60,7 @@ def __init__(self, mu: float):
self.dist = self.dist_class(mu=mu)

def _information_criterion(self, values):
return 2*self.n_par - 2*np.sum(self.dist.logpmf(values))
return np.log(len(values))*self.n_par - 2*np.sum(self.dist.logpmf(values))

@classmethod
def _fit(cls, values):
Expand Down Expand Up @@ -121,25 +121,26 @@ def draw(self):

def _information_criterion(self, values):
if values.min() < self.low:
return 3+999*len(values)
return 2*np.log(len(values))+999*len(values)

# If the values are not unique the fit is extremely bad.
if len(set(values)) != len(values):
return 3+999*len(values)
return 2*np.log(len(values))+999*len(values)

low = values.min()
high = values.max()+1

if self.consecutive == 1:
# Check if the values are truly consecutive
if len(values) == high-low and np.all(values.to_numpy() == np.arange(low, high)):
return 3
return 3+999*len(values)
return 2*np.log(len(values))
return 2*np.log(len(values))+999*len(values)

n_choice = high - low

# Probabilities go up like 1/n, 1/(n-1), 1/(n-2), ..., 1/2, 1
return 5 - 2*np.sum(np.log(1/np.arange(n_choice, n_choice-len(values), -1)))
return (3*np.log(len(values))
- 2*np.sum(np.log(1/np.arange(n_choice, n_choice-len(values), -1))))

@classmethod
def default_distribution(cls):
Expand Down
15 changes: 10 additions & 5 deletions tests/test_continuous.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
import numpy as np
from scipy import stats
from metasyn.distribution.continuous import UniformDistribution,\
NormalDistribution, LogNormalDistribution, TruncatedNormalDistribution,\
ExponentialDistribution
from pytest import mark
from scipy import stats

from metasyn.distribution.continuous import (
ExponentialDistribution,
LogNormalDistribution,
NormalDistribution,
TruncatedNormalDistribution,
UniformDistribution,
)


@mark.parametrize(
Expand All @@ -20,7 +25,7 @@ def test_uniform(lower_bound, upper_bound):
dist = UniformDistribution.fit(values)
assert dist.min_val <= values.min()
assert dist.max_val >= values.max()
assert dist.information_criterion(values) < 4 - 200*np.log((upper_bound-lower_bound)**-1)
assert dist.information_criterion(values) < 2*np.log(len(values)) - 200*np.log((upper_bound-lower_bound)**-1)
assert isinstance(dist.draw(), float)


Expand Down
16 changes: 10 additions & 6 deletions tests/test_discrete.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
from math import fabs

import numpy as np
import pandas as pd
import polars as pl
import numpy as np
from pytest import mark
from scipy.stats import poisson

from metasyn.distribution.discrete import UniqueKeyDistribution, DiscreteUniformDistribution,\
PoissonDistribution
from pytest import mark
from math import fabs
from metasyn.distribution.discrete import (
DiscreteUniformDistribution,
PoissonDistribution,
UniqueKeyDistribution,
)


@mark.parametrize(
Expand All @@ -29,7 +33,7 @@ def test_uniform(data, series_type):
assert len(drawn_values) == len(series)
drawn_values = np.array(list(drawn_values))
assert np.isclose(dist.information_criterion(drawn_values),
4+2*len(drawn_values)*(np.log(dist.high-dist.low)))
np.log(len(drawn_values))*2+2*len(drawn_values)*(np.log(dist.high-dist.low)))


@mark.parametrize(
Expand Down