From daea99c57b1a7641380957336f1aed133b2279a4 Mon Sep 17 00:00:00 2001 From: Rajan Mudambi <11376379+rmudambi@users.noreply.github.com> Date: Thu, 6 Apr 2023 22:10:45 -0700 Subject: [PATCH 1/2] support reading parquet files --- setup.py | 1 + src/pseudopeople/entity_types.py | 7 ++++++- src/pseudopeople/interface.py | 21 ++++++++++++++++++--- tests/unit/test_noise_form.py | 2 +- 4 files changed, 26 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index b0e8c366..522b87f3 100644 --- a/setup.py +++ b/setup.py @@ -34,6 +34,7 @@ "pandas", "pyyaml>=5.1", "vivarium", + "pyarrow", ] interactive_requirements = [ diff --git a/src/pseudopeople/entity_types.py b/src/pseudopeople/entity_types.py index 7749ce7b..0f0c19ff 100644 --- a/src/pseudopeople/entity_types.py +++ b/src/pseudopeople/entity_types.py @@ -58,7 +58,12 @@ def __call__( randomness_stream: RandomnessStream, additional_key: Any, ) -> pd.Series: - column = column.copy() + # TODO: this is a temporary hack to account for all string columns having been made categorical + # We should record expected output dtype in the columns data structure + if column.dtype.name == "category": + column = column.astype(str) + else: + column = column.copy() noise_level = configuration.row_noise_level to_noise_idx = get_index_to_noise( column, noise_level, randomness_stream, f"{self.name}_{additional_key}" diff --git a/src/pseudopeople/interface.py b/src/pseudopeople/interface.py index 7a543478..202eb754 100644 --- a/src/pseudopeople/interface.py +++ b/src/pseudopeople/interface.py @@ -30,12 +30,27 @@ def _generate_form( Noised form data """ configuration_tree = get_configuration(configuration) + if isinstance(source, str): + source = Path(source) if isinstance(source, pd.DataFrame): data = source + elif isinstance(source, Path): + if source.suffix == ".hdf": + data = pd.read_hdf(source) + if not isinstance(data, pd.DataFrame): + raise TypeError(f"File located at {source} must contain a pandas DataFrame.") + elif source.suffix == ".parquet": + data = pd.read_parquet(source) + else: + raise ValueError( + "Source path must either be a .hdf or a .parquet file. Provided " + f"{source.suffix}" + ) else: - data = pd.read_hdf(source) - if not isinstance(data, pd.DataFrame): - raise TypeError(f"File located at {source} must contain a pandas DataFrame.") + raise TypeError( + f"Source {source} must be either a pandas DataFrame or a path to a " + "file containing a pandas DataFrame." + ) return noise_form(form, data, configuration_tree, seed) diff --git a/tests/unit/test_noise_form.py b/tests/unit/test_noise_form.py index f6cbe048..f89a2981 100644 --- a/tests/unit/test_noise_form.py +++ b/tests/unit/test_noise_form.py @@ -170,7 +170,7 @@ def test_correct_forms_are_used(func, form, mocker): pytest.skip(reason=f"TODO: implement function for {form.value} form") mock = mocker.patch("pseudopeople.interface.noise_form") mocker.patch("pseudopeople.interface.pd.read_hdf", return_value=pd.DataFrame()) - _ = func("dummy/path") + _ = func("dummy/path.hdf") assert mock.call_args[0][0] == form From 8671224f9319f405f94fa66c986624a4dc9f29e2 Mon Sep 17 00:00:00 2001 From: Rajan Mudambi <11376379+rmudambi@users.noreply.github.com> Date: Fri, 7 Apr 2023 12:11:29 -0700 Subject: [PATCH 2/2] move is dataframe validation --- src/pseudopeople/interface.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pseudopeople/interface.py b/src/pseudopeople/interface.py index 202eb754..a72aa7b1 100644 --- a/src/pseudopeople/interface.py +++ b/src/pseudopeople/interface.py @@ -37,8 +37,6 @@ def _generate_form( elif isinstance(source, Path): if source.suffix == ".hdf": data = pd.read_hdf(source) - if not isinstance(data, pd.DataFrame): - raise TypeError(f"File located at {source} must contain a pandas DataFrame.") elif source.suffix == ".parquet": data = pd.read_parquet(source) else: @@ -46,6 +44,8 @@ def _generate_form( "Source path must either be a .hdf or a .parquet file. Provided " f"{source.suffix}" ) + if not isinstance(data, pd.DataFrame): + raise TypeError(f"File located at {source} must contain a pandas DataFrame.") else: raise TypeError( f"Source {source} must be either a pandas DataFrame or a path to a "