From a8459ae4fac4e671cf5753c454545470e2053b4b Mon Sep 17 00:00:00 2001
From: Brock
Date: Sat, 28 Nov 2020 08:08:00 -0800
Subject: [PATCH 1/8] REF: de-duplicate Block.__init__

---
 pandas/core/internals/blocks.py | 31 ++++---------------------------
 1 file changed, 4 insertions(+), 27 deletions(-)

diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index 74b5a184df95d..b1dd6b2ad6bce 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -144,6 +144,10 @@ def __init__(self, values, placement, ndim: int):
                 f"placement implies {len(self.mgr_locs)}"
             )

+        if self.is_extension and self.ndim == 2 and len(self.mgr_locs) != 1:
+            # TODO(EA2D): check unnecessary with 2D EAs
+            raise AssertionError("block.size != values.size")
+
     def _maybe_coerce_values(self, values):
         """
         Ensure we have correctly-typed values.
@@ -1667,33 +1671,6 @@ class ExtensionBlock(Block):

     values: ExtensionArray

-    def __init__(self, values, placement, ndim: int):
-        """
-        Initialize a non-consolidatable block.
-
-        'ndim' may be inferred from 'placement'.
-
-        This will call continue to call __init__ for the other base
-        classes mixed in with this Mixin.
-        """
-
-        # Placement must be converted to BlockPlacement so that we can check
-        # its length
-        if not isinstance(placement, libinternals.BlockPlacement):
-            placement = libinternals.BlockPlacement(placement)
-
-        # Maybe infer ndim from placement
-        if ndim is None:
-            if len(placement) != 1:
-                ndim = 1
-            else:
-                ndim = 2
-        super().__init__(values, placement, ndim=ndim)
-
-        if self.ndim == 2 and len(self.mgr_locs) != 1:
-            # TODO(EA2D): check unnecessary with 2D EAs
-            raise AssertionError("block.size != values.size")
-
     @property
     def shape(self):
         # TODO(EA2D): override unnecessary with 2D EAs

From e105769478a4ff915e66974b889847c0b19c9d35 Mon Sep 17 00:00:00 2001
From: Brock
Date: Sat, 28 Nov 2020 12:26:11 -0800
Subject: [PATCH 2/8] deprecate allowing ndim=None

---
 pandas/core/internals/blocks.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index b1dd6b2ad6bce..abf3218165d1a 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -134,9 +134,9 @@ def __init__(self, values, placement, ndim: int):
             1 for SingleBlockManager/Series, 2 for BlockManager/DataFrame
         """
         # TODO(EA2D): ndim will be unnecessary with 2D EAs
-        self.ndim = self._check_ndim(values, ndim)
         self.mgr_locs = placement
         self.values = self._maybe_coerce_values(values)
+        self.ndim = self._check_ndim(values, ndim)

         if self._validate_ndim and self.ndim and len(self.mgr_locs) != len(self.values):
             raise ValueError(
@@ -184,7 +184,19 @@ def _check_ndim(self, values, ndim):
         ValueError : the number of dimensions do not match
         """
         if ndim is None:
-            ndim = values.ndim
+            warnings.warn(
+                "Accepting ndim=None in the Block constructor is deprecated, "
+                "this will raise in a future version.",
+                FutureWarning,
+                stacklevel=3,
+            )
+            if self.is_extension:
+                if len(self.mgr_locs) != 1:
+                    ndim = 1
+                else:
+                    ndim = 2
+            else:
+                ndim = values.ndim

         if self._validate_ndim and values.ndim != ndim:
             raise ValueError(

From 766e4814859a82671862c873d98379a3e50d4458 Mon Sep 17 00:00:00 2001
From: Brock
Date: Sat, 28 Nov 2020 13:33:57 -0800
Subject: [PATCH 3/8] catch warnings in parquet docs

---
 doc/source/user_guide/io.rst    | 4 ++++
 doc/source/user_guide/scale.rst | 6 ++++++
 2 files changed, 10 insertions(+)

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 1bd35131622ab..e7e653aed18a7 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -4735,6 +4735,7 @@ Write to a feather file.
 Read from a feather file.

 .. ipython:: python
+   :okwarning:

    result = pd.read_feather("example.feather")
    result
@@ -4818,6 +4819,7 @@ Write to a parquet file.
 Read from a parquet file.

 .. ipython:: python
+   :okwarning:

    result = pd.read_parquet("example_fp.parquet", engine="fastparquet")
    result = pd.read_parquet("example_pa.parquet", engine="pyarrow")
    result.dtypes
@@ -4827,6 +4829,7 @@ Read from a parquet file.
 Read only certain columns of a parquet file.

 .. ipython:: python
+   :okwarning:

    result = pd.read_parquet(
        "example_fp.parquet",
@@ -4895,6 +4898,7 @@ Partitioning Parquet files
 Parquet supports partitioning of data based on the values of one or more columns.

 .. ipython:: python
+   :okwarning:

    df = pd.DataFrame({"a": [0, 0, 1, 1], "b": [0, 1, 0, 1]})
    df.to_parquet(path="test", engine="pyarrow", partition_cols=["a"], compression=None)
diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst
index 7f2419bc7f19d..ef50ed406bbf1 100644
--- a/doc/source/user_guide/scale.rst
+++ b/doc/source/user_guide/scale.rst
@@ -71,6 +71,7 @@ To load the columns we want, we have two options.
 Option 1 loads in all the data and then filters to what we need.

 .. ipython:: python
+   :okwarning:

    columns = ["id_0", "name_0", "x_0", "y_0"]

@@ -98,6 +99,7 @@ referred to as "low-cardinality" data). By using more efficient data types, you
 can store larger datasets in memory.

 .. ipython:: python
+   :okwarning:

    ts = pd.read_parquet("timeseries.parquet")
    ts
@@ -206,6 +208,7 @@ counts up to this point. As long as each individual file fits in memory, this wi
 work for arbitrary-sized datasets.

 .. ipython:: python
+   :okwarning:

    %%time
    files = pathlib.Path("data/timeseries/").glob("ts*.parquet")
@@ -289,6 +292,7 @@ returns a Dask Series with the same dtype and the same name.
 To get the actual result you can call ``.compute()``.

 .. ipython:: python
+   :okwarning:

    %time ddf["name"].value_counts().compute()

@@ -322,6 +326,7 @@ Dask implements the most used parts of the pandas API. For example, we can do a
 familiar groupby aggregation.

 .. ipython:: python
+   :okwarning:

    %time ddf.groupby("name")[["x", "y"]].mean().compute().head()

@@ -345,6 +350,7 @@ we need to supply the divisions manually.
 Now we can do things like fast random access with ``.loc``.

 .. ipython:: python
+   :okwarning:

    ddf.loc["2002-01-01 12:01":"2002-01-01 12:05"].compute()

From db17e6da11e188f93daafd8a490d7ffe8addfb53 Mon Sep 17 00:00:00 2001
From: Brock
Date: Sat, 28 Nov 2020 15:08:21 -0800
Subject: [PATCH 4/8] okwarning

---
 doc/source/user_guide/scale.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst
index ef50ed406bbf1..33be15b951aaa 100644
--- a/doc/source/user_guide/scale.rst
+++ b/doc/source/user_guide/scale.rst
@@ -80,6 +80,7 @@ Option 1 loads in all the data and then filters to what we need.
 Option 2 only loads the columns we request.

 .. ipython:: python
+   :okwarning:

    pd.read_parquet("timeseries_wide.parquet", columns=columns)

From 5b70d6e199178e97cb9974c1770ad148a7b1eeab Mon Sep 17 00:00:00 2001
From: Brock
Date: Sun, 29 Nov 2020 12:15:01 -0800
Subject: [PATCH 5/8] FutureWarning -> DeprecationWarning

---
 pandas/core/internals/blocks.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index abf3218165d1a..93c6b7f41a555 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -184,10 +184,11 @@ def _check_ndim(self, values, ndim):
         ValueError : the number of dimensions do not match
         """
         if ndim is None:
+            # GH#38134
             warnings.warn(
                 "Accepting ndim=None in the Block constructor is deprecated, "
                 "this will raise in a future version.",
-                FutureWarning,
+                DeprecationWarning,
                 stacklevel=3,
             )
             if self.is_extension:

From 724fc562fca653c5b61acf27283d4b494ff64df6 Mon Sep 17 00:00:00 2001
From: Brock
Date: Wed, 9 Dec 2020 12:25:53 -0800
Subject: [PATCH 6/8] catch warnings from pyarrow

---
 pandas/tests/io/test_feather.py | 4 +++-
 pandas/tests/io/test_parquet.py | 8 ++++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py
index 58ae5196151c1..4d7a12de2cc9c 100644
--- a/pandas/tests/io/test_feather.py
+++ b/pandas/tests/io/test_feather.py
@@ -88,7 +88,9 @@ def test_basic(self):
         # df["intervals"] = pd.interval_range(0, 3, 3)

         assert df.dttz.dtype.tz.zone == "US/Eastern"
-        self.check_round_trip(df)
+        with tm.assert_produces_warning(FutureWarning):
+            # GH#38134 until pyarrow updates to pass ndim to Block constructor
+            self.check_round_trip(df)

     def test_duplicate_columns(self):

diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index fe3ca0d0937b3..7f8cb2d8ca331 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -561,7 +561,9 @@ def test_basic(self, pa, df_full):
         df["datetime_tz"] = dti
         df["bool_with_none"] = [True, None, True]

-        check_round_trip(df, pa)
+        with tm.assert_produces_warning(FutureWarning):
+            # GH#38134 until pyarrow updates to pass ndim to Block constructor
+            check_round_trip(df, pa)

     def test_basic_subset_columns(self, pa, df_full):
         # GH18628
@@ -883,7 +885,9 @@ def test_timezone_aware_index(self, pa, timezone_aware_date_list):
         # they both implement datetime.tzinfo
         # they both wrap datetime.timedelta()
         # this use-case sets the resolution to 1 minute
-        check_round_trip(df, pa, check_dtype=False)
+        with tm.assert_produces_warning(FutureWarning):
+            # GH#38134 until pyarrow updates to pass ndim to Block constructor
+            check_round_trip(df, pa, check_dtype=False)

     @td.skip_if_no("pyarrow", min_version="0.17")
     def test_filter_row_groups(self, pa):

From c6ba82614a7bad6695b0bfc06592de4c731faa91 Mon Sep 17 00:00:00 2001
From: Brock
Date: Wed, 9 Dec 2020 15:06:16 -0800
Subject: [PATCH 7/8] post-merge fixup

---
 pandas/core/internals/blocks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index b6225308ab9ad..7766ce109ed15 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -1559,7 +1559,7 @@ def _unstack(self, unstacker, fill_value, new_placement):
         new_values = new_values.T[mask]
         new_placement = new_placement[mask]

-        blocks = [make_block(new_values, placement=new_placement)]
+        blocks = [make_block(new_values, placement=new_placement, ndim=2)]
         return blocks, mask

     def quantile(self, qs, interpolation="linear", axis: int = 0):

From 8dc0f94f5ee0338d2c428660eaeb273977fef005 Mon Sep 17 00:00:00 2001
From: Brock
Date: Thu, 17 Dec 2020 15:08:19 -0800
Subject: [PATCH 8/8] update stacklevel

---
 pandas/core/internals/blocks.py | 3 ++-
 pandas/tests/io/test_feather.py | 2 +-
 pandas/tests/io/test_parquet.py | 6 ++----
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index 8dd4d8dcaa185..a694c45fe726c 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -184,7 +184,7 @@ def _check_ndim(self, values, ndim):
                 "Accepting ndim=None in the Block constructor is deprecated, "
                 "this will raise in a future version.",
                 DeprecationWarning,
-                stacklevel=3,
+                stacklevel=10,
             )
             if self.is_extension:
                 if len(self.mgr_locs) != 1:
@@ -199,6 +199,7 @@ def _check_ndim(self, values, ndim):
                 "Wrong number of dimensions. "
                 f"values.ndim != ndim [{values.ndim} != {ndim}]"
             )
+
         return ndim

     @property
diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py
index 4d7a12de2cc9c..29dd72a1de379 100644
--- a/pandas/tests/io/test_feather.py
+++ b/pandas/tests/io/test_feather.py
@@ -88,7 +88,7 @@ def test_basic(self):
         # df["intervals"] = pd.interval_range(0, 3, 3)

         assert df.dttz.dtype.tz.zone == "US/Eastern"
-        with tm.assert_produces_warning(FutureWarning):
+        with tm.assert_produces_warning(DeprecationWarning):
             # GH#38134 until pyarrow updates to pass ndim to Block constructor
             self.check_round_trip(df)

diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 85e91ffad2d53..7ba959b64ebf6 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -561,9 +561,7 @@ def test_basic(self, pa, df_full):
         df["datetime_tz"] = dti
         df["bool_with_none"] = [True, None, True]

-        with tm.assert_produces_warning(FutureWarning):
-            # GH#38134 until pyarrow updates to pass ndim to Block constructor
-            check_round_trip(df, pa)
+        check_round_trip(df, pa)

     def test_basic_subset_columns(self, pa, df_full):
         # GH18628
@@ -880,7 +878,7 @@ def test_timezone_aware_index(self, pa, timezone_aware_date_list):
         # they both implement datetime.tzinfo
         # they both wrap datetime.timedelta()
         # this use-case sets the resolution to 1 minute
-        with tm.assert_produces_warning(FutureWarning):
+        with tm.assert_produces_warning(DeprecationWarning):
             # GH#38134 until pyarrow updates to pass ndim to Block constructor
             check_round_trip(df, pa, check_dtype=False)
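
Note (not part of any patch above): a minimal sketch of how the GH#38134 deprecation
surfaces to code that constructs blocks directly without passing ndim, which is the
code path the feather/parquet test changes work around until pyarrow passes ndim
itself. It assumes the patched _check_ndim behavior and uses the make_block helper
already touched in patch 7/8; the example values and placement are made up purely
for illustration.

    import warnings

    import numpy as np

    from pandas.core.internals import make_block

    # Values for a single column (placement [0]) of a six-row frame.
    values = np.arange(6).reshape(1, 6)

    # Forward-compatible spelling: pass ndim explicitly, as patch 7/8 does for
    # the internal _unstack call.
    blk = make_block(values, placement=[0], ndim=2)

    # Omitting ndim still works, but _check_ndim now infers it and warns
    # (FutureWarning as of patch 2/8, DeprecationWarning from patch 5/8 on).
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        make_block(values, placement=[0])

    assert any(issubclass(w.category, DeprecationWarning) for w in caught)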