Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Coverage #354

Merged
merged 9 commits into from
Jan 8, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .github/workflows/core-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,13 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install ".[test,extra]"
python -m pip install -e ".[test,extra]"
- name: Test with pytest
if: ${{ matrix.python-version != '3.12' }} # Python 3.12 runs the coverage step below instead; every other version runs plain pytest
run: pytest tests
- name: Test coverage >= 95%
if: ${{ matrix.python-version == '3.12' }}
run: pytest --cov=metasyn tests/ --cov-report=term-missing --cov-fail-under=95
- name: Check notebook output
if: ${{ matrix.os != 'macos-latest' }}
run: pytest --nbval-lax examples
Expand Down
7 changes: 4 additions & 3 deletions metasyn/distribution/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,9 +332,10 @@ class UniqueDistributionMixin(BaseDistribution):
variations, such as `UniqueFakerDistribution` and `UniqueRegexDistribution`.
"""

def __init__(self, *args, **kwargs):
    """Initialize the parent class and start with an empty ``key_set``.

    All positional and keyword arguments are passed on unchanged to the
    parent initializer.
    """
    super().__init__(*args, **kwargs)
    # draw_reset() assigns a fresh empty set to this same attribute.
    self.key_set: set = set()
def __new__(cls, *args, **kwargs):  # noqa
    """Allocate the instance and attach an empty ``key_set`` to it.

    The positional and keyword arguments are accepted but are not passed
    on to the parent ``__new__``.
    """
    obj = super().__new__(cls)
    obj.key_set: set = set()
    return obj

def draw_reset(self):
self.key_set = set()
Expand Down
9 changes: 6 additions & 3 deletions metasyn/provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,11 +252,14 @@ def create(self, var_spec: Union[VarSpec, VarSpecAccess]) -> BaseDistribution:
return dist_class(**dist_spec.parameters)
except TypeError as err:
dist_param = set(signature(dist_class.__init__).parameters) - {"self"} # type: ignore
if "args" in dist_param or "kwargs" in dist_param:
raise err
unknown_param = set(dist_spec.parameters) - dist_param # type: ignore
missing_param = dist_param - set(dist_spec.parameters) # type: ignore
if len(unknown_param) > 0:
raise TypeError(f"Unknown parameters {unknown_param} for variable {var_spec.name}.")
raise TypeError(f"Unknown parameters {unknown_param} for variable {var_spec.name}."
f"Available parameters: {dist_param}")
if len(missing_param) > 0:
raise ValueError(f"Missing parameters for variable {var_spec.name}:"
f" {missing_param}.")
raise err

def _find_best_fit(self, series: pl.Series, var_type: str,
Expand Down
4 changes: 4 additions & 0 deletions metasyn/testutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,10 @@ def check_distribution(distribution: type[BaseDistribution], privacy: BasePrivac
assert isinstance(new_dist, distribution)
assert set(list(new_dist.to_dict())) >= set(
("implements", "provenance", "class_name", "parameters"))
empty_series = pl.Series([], dtype=series.dtype)
new_dist = distribution.fit(empty_series, **privacy.fit_kwargs)
assert isinstance(new_dist, distribution)




Expand Down
11 changes: 3 additions & 8 deletions metasyn/var.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,8 @@ def __init__( # noqa: PLR0913
):
self.name = name
if var_type is None:
if not isinstance(distribution.var_type, str):
raise ValueError("Failed to infer variable type for variable '{name}'"
" supply var_type or a different distribution.")
var_type = distribution.var_type
var_type = MetaVar.get_var_type(pl.Series([distribution.draw()]))
distribution.draw_reset()
self.var_type = var_type
self.distribution = distribution
self.dtype = dtype
Expand All @@ -90,9 +88,6 @@ def __init__( # noqa: PLR0913
def get_var_type(series: pl.Series) -> str:
"""Convert polars dtype to metasyn variable type.

This method uses internal polars methods, so this might break at some
point.

Parameters
----------
series:
Expand Down Expand Up @@ -130,7 +125,7 @@ def get_var_type(series: pl.Series) -> str:
try:
return convert_dict[polars_dtype]
except KeyError as exc:
raise ValueError(f"Unsupported polars type '{polars_dtype}'") from exc
raise TypeError(f"Unsupported polars type '{polars_dtype}'") from exc

def to_dict(self) -> Dict[str, Any]:
"""Create a dictionary from the variable."""
Expand Down
7 changes: 5 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ documentation = "https://metasyn.readthedocs.io/en/latest/index.html"
[project.optional-dependencies]
extra = ["xlsxwriter", "pandas", "tomlkit"]
check = ["ruff", "mypy", "types-tqdm", "types-regex"]
test = ["pytest", "nbval"]
test = ["pytest", "nbval", "pytest-cov"]
docs = [
"sphinx<9.0.0", "sphinx-rtd-theme", "sphinxcontrib-napoleon",
"sphinx-autodoc-typehints", "sphinx_inline_tabs", "sphinx_copybutton",
Expand Down Expand Up @@ -104,4 +104,7 @@ max-args=10
max-locals=35

[tool.ruff.lint.pydocstyle]
convention="numpy"
convention="numpy"

[tool.coverage.run]
omit = ["metasyn/_version.py"]
3 changes: 2 additions & 1 deletion tests/data/example_config.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Example toml file as input for metasyn
dist_providers = ["builtin"]

config_version = "1.0"
privacy = "none"

[[var]]
name = "PassengerId"
Expand Down
4 changes: 4 additions & 0 deletions tests/data/incompatible_config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
privacy = "disclosure"

[defaults]
privacy = "builtin"
1 change: 1 addition & 0 deletions tests/data/unsupported_config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
config_version = "2.0"
2 changes: 2 additions & 0 deletions tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ def test_meta_config_datafree():

def test_meta_config():
"""Test the creation of a MetaConfig class that is not data free."""
with pytest.raises(ValueError):
MetaConfig.from_toml(Path("tests", "data", "titanic.csv"))
meta_config = MetaConfig.from_toml(Path("tests", "data", "example_config.toml"))
assert len(meta_config.var_specs) == 5
var_spec = meta_config.get("Cabin")
Expand Down
23 changes: 23 additions & 0 deletions tests/test_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
UniqueRegexDistribution,
)
from metasyn.provider import BuiltinDistributionProvider, DistributionProviderList
from metasyn.varspec import DistributionSpec, VarSpec


@mark.parametrize("input", ["builtin", "fake-name", BuiltinDistributionProvider,
Expand Down Expand Up @@ -112,3 +113,25 @@ def test_find_distribution(dist_str, var_type, is_unique, dist):
new_class = provider_list.find_distribution(dist_class.__name__, var_type=var_type,
unique=is_unique)
assert new_class == dist_class

def test_create_distribution():
    """Check that ``DistributionProviderList.create`` builds and validates a distribution.

    The cases share and mutate a single ``var_spec``, so their order matters:
    first a valid spec, then one with a parameter removed, then one with an
    extra parameter added, and finally one with ``implements`` set to None.
    """
    dist_spec = DistributionSpec("uniform", False, parameters={"lower": 10, "upper": 20})
    var_spec = VarSpec("test", dist_spec, var_type="continuous")
    provider_list = DistributionProviderList("builtin")

    assert isinstance(provider_list.create(var_spec), UniformDistribution)

    # Error with missing parameters
    var_spec.dist_spec.parameters.pop("lower")
    with pytest.raises(ValueError):
        provider_list.create(var_spec)

    # Error with unknown parameters. The mutation is done outside the
    # ``raises`` block so that only ``create`` can satisfy the expectation.
    var_spec.dist_spec.parameters["unknown"] = 1
    with pytest.raises(TypeError):
        provider_list.create(var_spec)

    # Error when implements is not given. Again, keep the setup outside the
    # guarded block so an error in the assignment is not mistaken for a pass.
    var_spec.dist_spec.implements = None
    with pytest.raises(ValueError):
        provider_list.create(var_spec)
26 changes: 25 additions & 1 deletion tests/test_string.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
"""Test the string type distribution inference."""
import pandas as pd
import polars as pl
import pytest
from pytest import mark

from metasyn.distribution.string import FakerDistribution, FreeTextDistribution
from metasyn.distribution.string import (
FakerDistribution,
FreeTextDistribution,
UniqueFakerDistribution,
UniqueRegexDistribution,
)
from metasyn.var import MetaVar


Expand Down Expand Up @@ -44,3 +50,21 @@ def test_free_text(series, lang, avg_sentences, avg_words):
series_1 = var.draw_series(100, seed=1234)
series_2 = var.draw_series(100, seed=1234)
assert all(series_1 == series_2)


def test_unique_regex():
    """Ten unique draws from a single-digit pattern succeed; one more draw raises."""
    n_draws = 10
    unique_dist = UniqueRegexDistribution(r"[0-9]")
    meta_var = MetaVar("some_var", "string", unique_dist, prop_missing=0.0)

    drawn = meta_var.draw_series(n_draws, None)
    assert len(drawn.unique()) == n_draws

    # A further draw after the ten unique ones is expected to fail.
    with pytest.raises(ValueError):
        meta_var.draw()

def test_unique_faker():
    """A series of 1000 draws from the unique "city" faker contains no duplicates."""
    n_draws = 1000
    unique_dist = UniqueFakerDistribution("city")
    meta_var = MetaVar("some_var", "string", unique_dist, prop_missing=0.0)

    drawn = meta_var.draw_series(n_draws, None)
    assert len(drawn.unique()) == n_draws
6 changes: 6 additions & 0 deletions tests/test_toml.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,3 +79,9 @@ def test_toml_err():

with pytest.raises(tomllib.TOMLDecodeError):
MetaFrame.from_config(Path("tests", "data", "bad_config.toml"))

with pytest.raises(ValueError):
MetaFrame.from_config(Path("tests", "data", "unsupported_config.toml"))

with pytest.raises(ValueError):
MetaFrame.from_config(Path("tests", "data", "incompatible_config.toml"))
8 changes: 7 additions & 1 deletion tests/test_var.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,11 +241,11 @@ def test_invalid_prop(prop_missing):
MetaVar("test", "discrete", DiscreteUniformDistribution.default_distribution(),
prop_missing=prop_missing)


@mark.parametrize(
"series,var_type",
[
(pl.Series([1, 2, 3]), "discrete"),
(pd.Series([1, 2, 3]), "discrete"),
(pl.Series([1.0, 2.0, 3.0]), "continuous"),
(pl.Series(["1", "2", "3"]), "string"),
(pl.Series(["1", "2", "3"], dtype=pl.Categorical), "categorical"),
Expand All @@ -263,6 +263,12 @@ def test_get_var_type(series, var_type):
assert MetaVar.get_var_type(series) == var_type


def test_unsupported_type():
    """``MetaVar.get_var_type`` raises ``TypeError`` for a series holding a class object."""
    unsupported = pl.Series([MetaVar])
    with pytest.raises(TypeError):
        MetaVar.get_var_type(unsupported)


@mark.parametrize(
"series",
[pd.Series([np.random.rand() for _ in range(5000)]),
Expand Down
Loading