diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 8788c39..d86b807 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -19,7 +19,7 @@ jobs: metasyn-version: [ "git+https://github.com/sodascience/metasyn.git@main"] include: - python-version: "3.11" - metasyn-version: "metasyn==0.8.0" + metasyn-version: "metasyn==1.0.0" steps: - uses: actions/checkout@v4 diff --git a/metasyncontrib/disclosure/base.py b/metasyncontrib/disclosure/base.py index 7cecaf6..84fc762 100644 --- a/metasyncontrib/disclosure/base.py +++ b/metasyncontrib/disclosure/base.py @@ -1,5 +1,8 @@ """Base class for all disclosure control distributions.""" +import polars as pl +from metasyn.distribution.base import BaseDistribution + def metadist_disclosure(): """Decorate class to create a distribution with disclosure control. @@ -17,3 +20,25 @@ def _wrap(cls): return cls return _wrap + + +class DisclosureConstantMixin(BaseDistribution): + """Mixin class to overload fit method for constant distributions.""" + + @classmethod + def fit(cls, series, *args, n_avg: int = 11, **kwargs) -> BaseDistribution: + """Fit constant distributions with disclosure control rules in place.""" + pl_series: pl.Series = cls._to_series(series) + + # if unique, just get that value if it occurs at least n_avg times + if pl_series.n_unique() == 1 and pl_series.len() >= n_avg: + return cls._fit(pl_series, *args, **kwargs) + + if pl_series.n_unique() > 1: + # if not unique, ensure most common value occurs at least n_avg times + _value, count = pl_series.value_counts(sort=True).row(0) + if count >= n_avg: + return cls._fit(pl_series, *args, **kwargs) + + return cls.default_distribution() + diff --git a/metasyncontrib/disclosure/constant.py b/metasyncontrib/disclosure/constant.py deleted file mode 100644 index 40c3a81..0000000 --- a/metasyncontrib/disclosure/constant.py +++ /dev/null @@ -1,66 +0,0 @@ -"""Module for disclosure controlled constant distributions.""" -from __future__ import annotations - -import polars as pl -from metasyn.distribution.base import BaseDistribution -from metasyn.distribution.constant import ( - ConstantDistribution, - DateConstantDistribution, - DateTimeConstantDistribution, - DiscreteConstantDistribution, - StringConstantDistribution, - TimeConstantDistribution, -) - -from metasyncontrib.disclosure.base import metadist_disclosure - - -class DisclosureConstantMixin(BaseDistribution): - """Mixin class to overload fit method for constant distributions.""" - - @classmethod - def fit(cls, series, *args, n_avg: int = 11, **kwargs) -> BaseDistribution: - """Fit constant distributions with disclosure control rules in place.""" - pl_series: pl.Series = cls._to_series(series) - - # if unique, just get that value if it occurs at least n_avg times - if pl_series.n_unique() == 1 and pl_series.len() >= n_avg: - return cls._fit(pl_series, *args, **kwargs) - - if pl_series.n_unique() > 1: - # if not unique, ensure most common value occurs at least n_avg times - _value, count = pl_series.value_counts(sort=True).row(0) - if count >= n_avg: - return cls._fit(pl_series, *args, **kwargs) - - return cls.default_distribution() - - -@metadist_disclosure() -class DisclosureConstant(DisclosureConstantMixin, ConstantDistribution): - """Disclosure controlled ConstantDistribution.""" - - -@metadist_disclosure() -class DisclosureDiscreteConstant(DisclosureConstantMixin, DiscreteConstantDistribution): - """Disclosure controlled DiscreteConstantDistribution.""" - - -@metadist_disclosure() -class DisclosureStringConstant(DisclosureConstantMixin, StringConstantDistribution): - """Disclosure controlled StringConstantDistribution.""" - - -@metadist_disclosure() -class DisclosureDateTimeConstant(DisclosureConstantMixin, DateTimeConstantDistribution): - """Disclosure controlled DateTimeConstantDistribution.""" - - -@metadist_disclosure() -class DisclosureTimeConstant(DisclosureConstantMixin, TimeConstantDistribution): - """Disclosure controlled TimeConstantDistribution.""" - - -@metadist_disclosure() -class DisclosureDateConstant(DisclosureConstantMixin, DateConstantDistribution): - """Disclosure controlled DateConstantDistribution.""" diff --git a/metasyncontrib/disclosure/continuous.py b/metasyncontrib/disclosure/continuous.py index 4ee1902..fa543d3 100644 --- a/metasyncontrib/disclosure/continuous.py +++ b/metasyncontrib/disclosure/continuous.py @@ -1,6 +1,7 @@ """Disclosure control implementations for continuous distributions.""" from metasyn.distribution.continuous import ( + ConstantDistribution, ExponentialDistribution, LogNormalDistribution, NormalDistribution, @@ -8,7 +9,7 @@ UniformDistribution, ) -from metasyncontrib.disclosure.base import metadist_disclosure +from metasyncontrib.disclosure.base import DisclosureConstantMixin, metadist_disclosure from metasyncontrib.disclosure.numerical import DisclosureNumericalMixin @@ -35,3 +36,7 @@ class DisclosureTruncatedNormal(DisclosureNumericalMixin, TruncatedNormalDistrib @metadist_disclosure() class DisclosureExponential(DisclosureNumericalMixin, ExponentialDistribution): """Disclosure exponential distribution.""" + +@metadist_disclosure() +class DisclosureConstant(DisclosureConstantMixin, ConstantDistribution): + """Disclosure controlled ConstantDistribution.""" diff --git a/metasyncontrib/disclosure/datetime.py b/metasyncontrib/disclosure/datetime.py index 651d2ff..b99dcdb 100644 --- a/metasyncontrib/disclosure/datetime.py +++ b/metasyncontrib/disclosure/datetime.py @@ -6,12 +6,15 @@ import polars as pl from metasyn.distribution.datetime import ( + DateConstantDistribution, + DateTimeConstantDistribution, DateTimeUniformDistribution, DateUniformDistribution, + TimeConstantDistribution, TimeUniformDistribution, ) -from metasyncontrib.disclosure.base import metadist_disclosure +from metasyncontrib.disclosure.base import DisclosureConstantMixin, metadist_disclosure # from metasyncontrib.disclosure.base import BaseDisclosureDistribution from metasyncontrib.disclosure.utils import micro_aggregate @@ -56,3 +59,17 @@ def _fit(cls, values: pl.Series, n_avg: int = 11) -> DisclosureDate: # Convert back into dates sub_series = pl.Series([dt_val.date() for dt_val in dt_sub_series]) return cls(sub_series.min(), sub_series.max()) + +@metadist_disclosure() +class DisclosureDateTimeConstant(DisclosureConstantMixin, DateTimeConstantDistribution): + """Disclosure controlled DateTimeConstantDistribution.""" + + +@metadist_disclosure() +class DisclosureTimeConstant(DisclosureConstantMixin, TimeConstantDistribution): + """Disclosure controlled TimeConstantDistribution.""" + + +@metadist_disclosure() +class DisclosureDateConstant(DisclosureConstantMixin, DateConstantDistribution): + """Disclosure controlled DateConstantDistribution.""" diff --git a/metasyncontrib/disclosure/discrete.py b/metasyncontrib/disclosure/discrete.py index 20d7a97..5c198fe 100644 --- a/metasyncontrib/disclosure/discrete.py +++ b/metasyncontrib/disclosure/discrete.py @@ -4,6 +4,7 @@ import polars as pl from metasyn.distribution.discrete import ( + DiscreteConstantDistribution, DiscreteNormalDistribution, DiscreteTruncatedNormalDistribution, DiscreteUniformDistribution, @@ -11,7 +12,7 @@ UniqueKeyDistribution, ) -from metasyncontrib.disclosure.base import metadist_disclosure +from metasyncontrib.disclosure.base import DisclosureConstantMixin, metadist_disclosure from metasyncontrib.disclosure.numerical import DisclosureNumericalMixin from metasyncontrib.disclosure.utils import micro_aggregate @@ -49,3 +50,7 @@ def _fit(cls, values: pl.Series, n_avg: int = 11): return cls(0, True) sub_values = micro_aggregate(values, n_avg) return super()._fit(sub_values) + +@metadist_disclosure() +class DisclosureDiscreteConstant(DisclosureConstantMixin, DiscreteConstantDistribution): + """Disclosure controlled DiscreteConstantDistribution.""" diff --git a/metasyncontrib/disclosure/provider.py b/metasyncontrib/disclosure/provider.py index f7ae7b1..54be86a 100644 --- a/metasyncontrib/disclosure/provider.py +++ b/metasyncontrib/disclosure/provider.py @@ -5,35 +5,37 @@ from metasyn.provider import BaseDistributionProvider from metasyncontrib.disclosure.categorical import DisclosureMultinoulli -from metasyncontrib.disclosure.constant import ( - DisclosureConstant, - DisclosureDateConstant, - DisclosureDateTimeConstant, - DisclosureDiscreteConstant, - DisclosureStringConstant, - DisclosureTimeConstant, -) from metasyncontrib.disclosure.continuous import ( + DisclosureConstant, DisclosureExponential, DisclosureLogNormal, DisclosureNormal, DisclosureTruncatedNormal, DisclosureUniform, ) -from metasyncontrib.disclosure.datetime import DisclosureDate, DisclosureDateTime, DisclosureTime +from metasyncontrib.disclosure.datetime import ( + DisclosureDate, + DisclosureDateConstant, + DisclosureDateTime, + DisclosureDateTimeConstant, + DisclosureTime, + DisclosureTimeConstant, +) from metasyncontrib.disclosure.discrete import ( + DisclosureDiscreteConstant, DisclosureDiscreteNormal, DisclosureDiscreteTruncatedNormal, DisclosureDiscreteUniform, DisclosurePoisson, DisclosureUniqueKey, ) -from metasyncontrib.disclosure.faker import ( +from metasyncontrib.disclosure.na import DisclosureNA +from metasyncontrib.disclosure.string import ( DisclosureFaker, DisclosureFreetext, + DisclosureStringConstant, DisclosureUniqueFaker, ) -from metasyncontrib.disclosure.na import DisclosureNA class DisclosureProvider(BaseDistributionProvider): diff --git a/metasyncontrib/disclosure/faker.py b/metasyncontrib/disclosure/string.py similarity index 78% rename from metasyncontrib/disclosure/faker.py rename to metasyncontrib/disclosure/string.py index f6ca192..5309b34 100644 --- a/metasyncontrib/disclosure/faker.py +++ b/metasyncontrib/disclosure/string.py @@ -1,12 +1,13 @@ """Module for disclosure control for string distributions.""" -from metasyn.distribution.faker import ( +from metasyn.distribution.string import ( FakerDistribution, FreeTextDistribution, + StringConstantDistribution, UniqueFakerDistribution, ) -from metasyncontrib.disclosure.base import metadist_disclosure +from metasyncontrib.disclosure.base import DisclosureConstantMixin, metadist_disclosure @metadist_disclosure() @@ -34,3 +35,7 @@ class DisclosureFreetext(FreeTextDistribution): @classmethod def _fit(cls, values, max_values: int = 50, n_avg: int = 11): # pylint: disable=unused-argument return super()._fit(values, max_values=max_values) + +@metadist_disclosure() +class DisclosureStringConstant(DisclosureConstantMixin, StringConstantDistribution): + """Disclosure controlled StringConstantDistribution.""" diff --git a/tests/test_constant.py b/tests/test_constant.py index a4826eb..b10b5ad 100644 --- a/tests/test_constant.py +++ b/tests/test_constant.py @@ -1,21 +1,21 @@ -from metasyn.distribution.constant import ( - ConstantDistribution, +from metasyn.distribution.continuous import ConstantDistribution +from metasyn.distribution.datetime import ( DateConstantDistribution, DateTimeConstantDistribution, - DiscreteConstantDistribution, - StringConstantDistribution, TimeConstantDistribution, ) +from metasyn.distribution.discrete import DiscreteConstantDistribution +from metasyn.distribution.string import StringConstantDistribution from pytest import mark -from metasyncontrib.disclosure.constant import ( - DisclosureConstant, +from metasyncontrib.disclosure.continuous import DisclosureConstant +from metasyncontrib.disclosure.datetime import ( DisclosureDateConstant, DisclosureDateTimeConstant, - DisclosureDiscreteConstant, - DisclosureStringConstant, DisclosureTimeConstant, ) +from metasyncontrib.disclosure.discrete import DisclosureDiscreteConstant +from metasyncontrib.disclosure.string import DisclosureStringConstant @mark.parametrize( diff --git a/tests/test_other_dist.py b/tests/test_other_dist.py index 2efd121..ad4219b 100644 --- a/tests/test_other_dist.py +++ b/tests/test_other_dist.py @@ -12,7 +12,7 @@ from metasyncontrib.disclosure.categorical import DisclosureMultinoulli from metasyncontrib.disclosure.datetime import DisclosureDate, DisclosureDateTime, DisclosureTime -from metasyncontrib.disclosure.faker import DisclosureFaker +from metasyncontrib.disclosure.string import DisclosureFaker @mark.parametrize(