
Commit

Fix defaults so plugins don't need manual fitkwarg (#296)
* Fix compatibility with polars 0.20.31
* Fix compatibility with numpy 2.0.0
qubixes authored Jun 19, 2024
1 parent 0553fd8 commit d4d26df
Showing 14 changed files with 22 additions and 21 deletions.
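
Note: most of the changes below are the same mechanical rename, repeated across the README, docs, demo loader, and tests: polars 0.20.31 deprecates the dtypes argument of read_csv in favour of schema_overrides, and NumPy 2.0 drops the capitalized Inf and float_ aliases. A minimal sketch of the polars side of the rename (not part of the commit; "fruit.csv" is a stand-in path, the README itself locates the demo data via demo_file("fruit")):

import polars as pl

# polars >= 0.20.31: pass dtype overrides via schema_overrides, not dtypes.
df = pl.read_csv(
    "fruit.csv",
    schema_overrides={"fruits": pl.Categorical, "cars": pl.Categorical},
)
print(df.head())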
1 change: 1 addition & 0 deletions .github/workflows/python-package.yml
@@ -65,4 +65,5 @@ jobs:
 - name: Test configuration example
 run: |
 pip install git+https://github.com/sodascience/metasyn-disclosure-control
+pip install .
 metasyn create-meta metasyn/demo/demo_titanic.csv --config examples/example_config.toml
2 changes: 1 addition & 1 deletion README.md
@@ -56,7 +56,7 @@ csv_path = demo_file("fruit")
 # Create a polars dataframe from the csv file.
 # It is important to ensure the data types are correct
 # when creating your dataframe, especially categorical data!
-df = pl.read_csv(csv_path, dtypes={
+df = pl.read_csv(csv_path, schema_overrides={
 "fruits": pl.Categorical,
 "cars": pl.Categorical
 })
8 changes: 4 additions & 4 deletions docs/source/usage/config_files.rst
@@ -51,7 +51,7 @@ For example:
 description = "Cabin number of the passenger."
 distribution = {implements = "core.regex", parameters = {regex_data = "[A-F][0-9]{2,3}"}}
 prop_missing = 0.1
-privacy = {name = "disclosure", parameters = {n_avg = 21}}
+privacy = {name = "disclosure", parameters = {partition_size = 21}}
 Distribution providers

@@ -76,7 +76,7 @@ For example:
 [privacy]
 name = "disclosure"
-parameters = {n_avg = 11}
+parameters = {partition_size = 11}
 Example Configuration File

@@ -93,7 +93,7 @@ The following is an example which specifies the distribution providers, privacy
 [privacy]
 name = "disclosure"
-parameters = {n_avg = 11}
+parameters = {partition_size = 11}
 [[var]]

@@ -117,7 +117,7 @@ The following is an example which specifies the distribution providers, privacy
 [[var]]
 name = "Cabin"
 distribution = {implements = "core.regex", parameters = {regex_data = "[A-F][0-9]{2,3}"}}
-privacy = {name = "disclosure", parameters = {n_avg = 21}}
+privacy = {name = "disclosure", parameters = {partition_size = 21}}
 Synthetic data without input file
2 changes: 1 addition & 1 deletion docs/source/usage/generating_metaframes.rst
@@ -24,7 +24,7 @@ For example, if we want to load a dataset named 'dataset.csv' into a Polars Data
 # Create a Polars DataFrame
 df = pl.read_csv(
 source="dataset.csv",
-dtypes={"Color": pl.Categorical, "Fruit": pl.Categorical},
+schema_overrides={"Color": pl.Categorical, "Fruit": pl.Categorical},
 try_parse_dates=True,
 )
2 changes: 1 addition & 1 deletion docs/source/usage/quick_start.rst
@@ -40,7 +40,7 @@ To finish loading the dataset, we simply use the :meth:`polars.read_csv` functio

 .. code-block:: python
-df = pl.read_csv(dataset_csv, dtypes=data_types)
+df = pl.read_csv(dataset_csv, schema_overrides=data_types)
 This converts the CSV file into a DataFrame named ``df``.
4 changes: 2 additions & 2 deletions examples/example_config.toml
@@ -4,7 +4,7 @@ dist_providers = ["builtin", "metasyn-disclosure"]

 [privacy]
 name = "disclosure"
-parameters = {n_avg = 11}
+parameters = {partition_size = 11}


 [[var]]

@@ -28,4 +28,4 @@ distribution = {implements = "core.uniform", parameters = {lower = 20, upper = 4
 [[var]]
 name = "Cabin"
 distribution = {implements = "core.regex", parameters = {regex_data = "[A-F][0-9]{2,3}"}}
-privacy = {name = "disclosure", parameters = {n_avg = 21}}
+privacy = {name = "disclosure", parameters = {partition_size = 21}}
2 changes: 1 addition & 1 deletion examples/getting_started.ipynb
@@ -137,7 +137,7 @@
 "data_types = {\"Sex\": pl.Categorical, \"Embarked\": pl.Categorical}\n",
 "\n",
 "# read the data from the csv path\n",
-"df = pl.read_csv(csv_path, dtypes=data_types, try_parse_dates=True)\n",
+"df = pl.read_csv(csv_path, schema_overrides=data_types, try_parse_dates=True)\n",
 "\n",
 "# check out the data\n",
 "df.head()"
6 changes: 3 additions & 3 deletions metasyn/demo/dataset.py
@@ -126,15 +126,15 @@ def demo_dataframe(name: str = "titanic") -> pl.DataFrame:
 "Destination": pl.Categorical,
 "Transported": pl.Categorical,
 }
-return pl.read_csv(file_path, dtypes=data_types, try_parse_dates=True)
+return pl.read_csv(file_path, schema_overrides=data_types, try_parse_dates=True)
 if name == "titanic":
 # our edited titanic data
 data_types = {"Sex": pl.Categorical, "Embarked": pl.Categorical}
-return pl.read_csv(file_path, dtypes=data_types, try_parse_dates=True)
+return pl.read_csv(file_path, schema_overrides=data_types, try_parse_dates=True)
 if name == "fruit":
 # basic fruit data from polars example
 data_types = {"fruits": pl.Categorical, "cars": pl.Categorical}
-return pl.read_csv(file_path, dtypes=data_types)
+return pl.read_csv(file_path, schema_overrides=data_types)

 raise ValueError(
 f"No demonstration dataset with name '{name}'. Options: titanic, spaceship, fruit."
4 changes: 2 additions & 2 deletions metasyn/distribution/base.py
@@ -23,7 +23,7 @@

 import numpy as np
 import polars as pl
-from numpy import Inf
+from numpy import inf
 from numpy import typing as npt


@@ -388,4 +388,4 @@ def draw(self):

 def information_criterion(self, values):
 vals = self._to_series(values)
-return -Inf if vals.n_unique() < 2 else Inf
+return -inf if vals.n_unique() < 2 else inf
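
Note: NumPy 2.0 removes the capitalized np.Inf alias, so the import switches to the lowercase inf. A standalone sketch of the updated return logic (the function name is made up, and it takes a polars Series directly instead of converting via _to_series):

import polars as pl
from numpy import inf  # np.Inf no longer exists on NumPy >= 2.0

def information_criterion_sketch(values: pl.Series) -> float:
    # Same rule as the updated line above: -inf when the column has fewer than
    # two unique values, +inf otherwise, presumably so this distribution is
    # only selected for (near-)constant columns.
    return -inf if values.n_unique() < 2 else inf

print(information_criterion_sketch(pl.Series([1, 1, 1])))  # -inf
print(information_criterion_sketch(pl.Series([1, 2, 3])))  # inf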
2 changes: 1 addition & 1 deletion metasyn/distribution/categorical.py
@@ -39,7 +39,7 @@ class MultinoulliDistribution(BaseDistribution):
 def __init__(
 self,
 labels: Union[npt.NDArray[Union[np.str_, np.int_]], list[Union[str, int]]],
-probs: Union[npt.NDArray[np.float_], list[float]]
+probs: Union[npt.NDArray[np.double], list[float]]
 ):
 self.labels = np.array(labels)
 self.probs = np.array(probs)
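
Note: np.float_ is likewise gone in NumPy 2.0; np.double (an alias of float64) is the drop-in replacement used in the probs annotation above. A minimal illustration (variable name made up):

import numpy as np
import numpy.typing as npt

# np.float_ raises AttributeError on NumPy >= 2.0; np.double still means float64.
probs: npt.NDArray[np.double] = np.array([0.2, 0.3, 0.5], dtype=np.double)
print(probs.dtype)  # float64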
4 changes: 2 additions & 2 deletions metasyn/privacy.py
@@ -49,7 +49,7 @@ def fit_kwargs(self):
 For example epsilon in the case of differential privacy.
 """
-return {}
+return self.to_dict()["parameters"]


 class BasicPrivacy(BasePrivacy):

@@ -74,7 +74,7 @@ def get_privacy(name: str, parameters: Optional[dict] = None) -> BasePrivacy:
 Name of the privacy type, use "none" for no specific type of privacy.
 parameters, optional
 The parameters for the privacy type. This could be the epsilon for differential
-privacy or n_avg for disclosure control, by default None.
+privacy or partition_size for disclosure control, by default None.
 Returns
 -------
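
Note: this hunk is the fix the commit title refers to. fit_kwargs now defaults to the parameters from to_dict() instead of an empty dict, so a privacy plugin that already serializes its parameters (for example partition_size for disclosure control) no longer needs to override fit_kwargs by hand. A standalone sketch of the pattern, not metasyn's actual class hierarchy:

class PrivacySketch:
    """Minimal stand-in for a privacy base class."""

    name = "none"

    def to_dict(self) -> dict:
        return {"name": self.name, "parameters": {}}

    @property
    def fit_kwargs(self) -> dict:
        # New default: reuse the serialized parameters rather than returning {}.
        return self.to_dict()["parameters"]


class DisclosureSketch(PrivacySketch):
    """Hypothetical plugin: only to_dict() needs to know about partition_size."""

    name = "disclosure"

    def __init__(self, partition_size: int = 11):
        self.partition_size = partition_size

    def to_dict(self) -> dict:
        return {"name": self.name, "parameters": {"partition_size": self.partition_size}}


print(DisclosureSketch(partition_size=21).fit_kwargs)  # {'partition_size': 21}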
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -25,7 +25,7 @@ classifiers = [
 ]

 dependencies = [
-"polars>=0.20.13",
+"polars>=0.20.31",
 "tqdm",
 "numpy>=1.20",
 "pyarrow", # Dependency of polars since we're converting from pandas.
2 changes: 1 addition & 1 deletion tests/test_cli.py
@@ -35,7 +35,7 @@ def tmp_dir(tmp_path_factory) -> Path:
 "Age": float,
 "Fare": float
 }
-data_frame = pl.read_csv(csv_fp, dtypes=csv_dt)[:100]
+data_frame = pl.read_csv(csv_fp, schema_overrides=csv_dt)[:100]
 meta_frame = MetaFrame.fit_dataframe(data_frame, var_specs=[{"name": "PassengerId", "distribution": {"unique": True}}])
 meta_frame.to_json(json_path)
 config_fp = TMP_DIR_PATH / "config.ini"
2 changes: 1 addition & 1 deletion tests/test_dataset.py
@@ -31,7 +31,7 @@ def _read_csv(fp, dataframe_lib):
 df = pd.read_csv(fp, dtype=dtypes)
 return df.iloc[:100]
 else:
-df = pl.read_csv(fp, dtypes={x: pl.Categorical for x, x_type in dtypes.items() if x_type == "category"})
+df = pl.read_csv(fp, schema_overrides={x: pl.Categorical for x, x_type in dtypes.items() if x_type == "category"})
 return df[:100]


