
Commit

Fix defaults so plugins don't need manual fitkwarg (#296)
* Fix compatibility with polars 0.20.31
* Fix compatibility with numpy 2.0.0
qubixes authored Jun 19, 2024
1 parent 0553fd8 commit d4d26df
Showing 14 changed files with 22 additions and 21 deletions.
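
Note: most of the changes below are the same mechanical rename, repeated across the README, docs, demo loader, and tests: polars 0.20.31 deprecates the dtypes argument of read_csv in favour of schema_overrides, and NumPy 2.0 drops the capitalized Inf and float_ aliases. A minimal sketch of the polars side of the rename (not part of the commit; "fruit.csv" is a stand-in path, the README itself locates the demo data via demo_file("fruit")):

import polars as pl

# polars >= 0.20.31: pass dtype overrides via schema_overrides, not dtypes.
df = pl.read_csv(
    "fruit.csv",
    schema_overrides={"fruits": pl.Categorical, "cars": pl.Categorical},
)
print(df.head())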
1 change: 1 addition & 0 deletions .github/workflows/python-package.yml
@@ -65,4 +65,5 @@ jobs:
 - name: Test configuration example
 run: |
 pip install git+https://github.com/sodascience/metasyn-disclosure-control
+pip install .
 metasyn create-meta metasyn/demo/demo_titanic.csv --config examples/example_config.toml
2 changes: 1 addition & 1 deletion README.md
@@ -56,7 +56,7 @@ csv_path = demo_file("fruit")
 # Create a polars dataframe from the csv file.
 # It is important to ensure the data types are correct
 # when creating your dataframe, especially categorical data!
-df = pl.read_csv(csv_path, dtypes={
+df = pl.read_csv(csv_path, schema_overrides={
 "fruits": pl.Categorical,
 "cars": pl.Categorical
 })
8 changes: 4 additions & 4 deletions docs/source/usage/config_files.rst
@@ -51,7 +51,7 @@ For example:
 description = "Cabin number of the passenger."
 distribution = {implements = "core.regex", parameters = {regex_data = "[A-F][0-9]{2,3}"}}
 prop_missing = 0.1
-privacy = {name = "disclosure", parameters = {n_avg = 21}}
+privacy = {name = "disclosure", parameters = {partition_size = 21}}
 Distribution providers

@@ -76,7 +76,7 @@ For example:
 [privacy]
 name = "disclosure"
-parameters = {n_avg = 11}
+parameters = {partition_size = 11}
 Example Configuration File

@@ -93,7 +93,7 @@ The following is an example which specifies the distribution providers, privacy
 [privacy]
 name = "disclosure"
-parameters = {n_avg = 11}
+parameters = {partition_size = 11}
 [[var]]

@@ -117,7 +117,7 @@ The following is an example which specifies the distribution providers, privacy
 [[var]]
 name = "Cabin"
 distribution = {implements = "core.regex", parameters = {regex_data = "[A-F][0-9]{2,3}"}}
-privacy = {name = "disclosure", parameters = {n_avg = 21}}
+privacy = {name = "disclosure", parameters = {partition_size = 21}}
 Synthetic data without input file
2 changes: 1 addition & 1 deletion docs/source/usage/generating_metaframes.rst
@@ -24,7 +24,7 @@ For example, if we want to load a dataset named 'dataset.csv' into a Polars Data
 # Create a Polars DataFrame
 df = pl.read_csv(
 source="dataset.csv",
-dtypes={"Color": pl.Categorical, "Fruit": pl.Categorical},
+schema_overrides={"Color": pl.Categorical, "Fruit": pl.Categorical},
 try_parse_dates=True,
 )
2 changes: 1 addition & 1 deletion docs/source/usage/quick_start.rst
@@ -40,7 +40,7 @@ To finish loading the dataset, we simply use the :meth:`polars.read_csv` functio

 .. code-block:: python
-df = pl.read_csv(dataset_csv, dtypes=data_types)
+df = pl.read_csv(dataset_csv, schema_overrides=data_types)
 This converts the CSV file into a DataFrame named ``df``.
4 changes: 2 additions & 2 deletions examples/example_config.toml
@@ -4,7 +4,7 @@ dist_providers = ["builtin", "metasyn-disclosure"]

 [privacy]
 name = "disclosure"
-parameters = {n_avg = 11}
+parameters = {partition_size = 11}


 [[var]]

@@ -28,4 +28,4 @@ distribution = {implements = "core.uniform", parameters = {lower = 20, upper = 4
 [[var]]
 name = "Cabin"
 distribution = {implements = "core.regex", parameters = {regex_data = "[A-F][0-9]{2,3}"}}
-privacy = {name = "disclosure", parameters = {n_avg = 21}}
+privacy = {name = "disclosure", parameters = {partition_size = 21}}
2 changes: 1 addition & 1 deletion examples/getting_started.ipynb
@@ -137,7 +137,7 @@
 "data_types = {\"Sex\": pl.Categorical, \"Embarked\": pl.Categorical}\n",
 "\n",
 "# read the data from the csv path\n",
-"df = pl.read_csv(csv_path, dtypes=data_types, try_parse_dates=True)\n",
+"df = pl.read_csv(csv_path, schema_overrides=data_types, try_parse_dates=True)\n",
 "\n",
 "# check out the data\n",
 "df.head()"
6 changes: 3 additions & 3 deletions metasyn/demo/dataset.py
@@ -126,15 +126,15 @@ def demo_dataframe(name: str = "titanic") -> pl.DataFrame:
 "Destination": pl.Categorical,
 "Transported": pl.Categorical,
 }
-return pl.read_csv(file_path, dtypes=data_types, try_parse_dates=True)
+return pl.read_csv(file_path, schema_overrides=data_types, try_parse_dates=True)
 if name == "titanic":
 # our edited titanic data
 data_types = {"Sex": pl.Categorical, "Embarked": pl.Categorical}
-return pl.read_csv(file_path, dtypes=data_types, try_parse_dates=True)
+return pl.read_csv(file_path, schema_overrides=data_types, try_parse_dates=True)
 if name == "fruit":
 # basic fruit data from polars example
 data_types = {"fruits": pl.Categorical, "cars": pl.Categorical}
-return pl.read_csv(file_path, dtypes=data_types)
+return pl.read_csv(file_path, schema_overrides=data_types)

 raise ValueError(
 f"No demonstration dataset with name '{name}'. Options: titanic, spaceship, fruit."
4 changes: 2 additions & 2 deletions metasyn/distribution/base.py
@@ -23,7 +23,7 @@

 import numpy as np
 import polars as pl
-from numpy import Inf
+from numpy import inf
 from numpy import typing as npt


@@ -388,4 +388,4 @@ def draw(self):

 def information_criterion(self, values):
 vals = self._to_series(values)
-return -Inf if vals.n_unique() < 2 else Inf
+return -inf if vals.n_unique() < 2 else inf
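
Note: NumPy 2.0 removes the capitalized np.Inf alias, so the import switches to the lowercase inf. A standalone sketch of the updated return logic (the function name is made up, and it takes a polars Series directly instead of converting via _to_series):

import polars as pl
from numpy import inf  # np.Inf no longer exists on NumPy >= 2.0

def information_criterion_sketch(values: pl.Series) -> float:
    # Same rule as the updated line above: -inf when the column has fewer than
    # two unique values, +inf otherwise, presumably so this distribution is
    # only selected for (near-)constant columns.
    return -inf if values.n_unique() < 2 else inf

print(information_criterion_sketch(pl.Series([1, 1, 1])))  # -inf
print(information_criterion_sketch(pl.Series([1, 2, 3])))  # inf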
2 changes: 1 addition & 1 deletion metasyn/distribution/categorical.py
@@ -39,7 +39,7 @@ class MultinoulliDistribution(BaseDistribution):
 def __init__(
 self,
 labels: Union[npt.NDArray[Union[np.str_, np.int_]], list[Union[str, int]]],
-probs: Union[npt.NDArray[np.float_], list[float]]
+probs: Union[npt.NDArray[np.double], list[float]]
 ):
 self.labels = np.array(labels)
 self.probs = np.array(probs)
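
Note: np.float_ is likewise gone in NumPy 2.0; np.double (an alias of float64) is the drop-in replacement used in the probs annotation above. A minimal illustration (variable name made up):

import numpy as np
import numpy.typing as npt

# np.float_ raises AttributeError on NumPy >= 2.0; np.double still means float64.
probs: npt.NDArray[np.double] = np.array([0.2, 0.3, 0.5], dtype=np.double)
print(probs.dtype)  # float64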
4 changes: 2 additions & 2 deletions metasyn/privacy.py
@@ -49,7 +49,7 @@ def fit_kwargs(self):
 For example epsilon in the case of differential privacy.
 """
-return {}
+return self.to_dict()["parameters"]


 class BasicPrivacy(BasePrivacy):

@@ -74,7 +74,7 @@ def get_privacy(name: str, parameters: Optional[dict] = None) -> BasePrivacy:
 Name of the privacy type, use "none" for no specific type of privacy.
 parameters, optional
 The parameters for the privacy type. This could be the epsilon for differential
-privacy or n_avg for disclosure control, by default None.
+privacy or partition_size for disclosure control, by default None.
 Returns
 -------
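
Note: this hunk is the fix the commit title refers to. fit_kwargs now defaults to the parameters from to_dict() instead of an empty dict, so a privacy plugin that already serializes its parameters (for example partition_size for disclosure control) no longer needs to override fit_kwargs by hand. A standalone sketch of the pattern, not metasyn's actual class hierarchy:

class PrivacySketch:
    """Minimal stand-in for a privacy base class."""

    name = "none"

    def to_dict(self) -> dict:
        return {"name": self.name, "parameters": {}}

    @property
    def fit_kwargs(self) -> dict:
        # New default: reuse the serialized parameters rather than returning {}.
        return self.to_dict()["parameters"]


class DisclosureSketch(PrivacySketch):
    """Hypothetical plugin: only to_dict() needs to know about partition_size."""

    name = "disclosure"

    def __init__(self, partition_size: int = 11):
        self.partition_size = partition_size

    def to_dict(self) -> dict:
        return {"name": self.name, "parameters": {"partition_size": self.partition_size}}


print(DisclosureSketch(partition_size=21).fit_kwargs)  # {'partition_size': 21}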
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -25,7 +25,7 @@ classifiers = [
 ]

 dependencies = [
-"polars>=0.20.13",
+"polars>=0.20.31",
 "tqdm",
 "numpy>=1.20",
 "pyarrow", # Dependency of polars since we're converting from pandas.
2 changes: 1 addition & 1 deletion tests/test_cli.py
@@ -35,7 +35,7 @@ def tmp_dir(tmp_path_factory) -> Path:
 "Age": float,
 "Fare": float
 }
-data_frame = pl.read_csv(csv_fp, dtypes=csv_dt)[:100]
+data_frame = pl.read_csv(csv_fp, schema_overrides=csv_dt)[:100]
 meta_frame = MetaFrame.fit_dataframe(data_frame, var_specs=[{"name": "PassengerId", "distribution": {"unique": True}}])
 meta_frame.to_json(json_path)
 config_fp = TMP_DIR_PATH / "config.ini"
2 changes: 1 addition & 1 deletion tests/test_dataset.py
@@ -31,7 +31,7 @@ def _read_csv(fp, dataframe_lib):
 df = pd.read_csv(fp, dtype=dtypes)
 return df.iloc[:100]
 else:
-df = pl.read_csv(fp, dtypes={x: pl.Categorical for x, x_type in dtypes.items() if x_type == "category"})
+df = pl.read_csv(fp, schema_overrides={x: pl.Categorical for x, x_type in dtypes.items() if x_type == "category"})
 return df[:100]


