From 91e6f5cb9f9a7cb7d1a00f72f2d97f943d28c214 Mon Sep 17 00:00:00 2001
From: Sebastian Liebscher <112352529+sebastianliebscher@users.noreply.github.com>
Date: Thu, 20 Jul 2023 20:04:17 +0200
Subject: [PATCH] chore(deps): bump pandas >=2.0 (#24705)

Co-authored-by: EugeneTorap
---
 requirements/base.txt                              |  4 ++-
 requirements/testing.txt                           |  2 --
 setup.py                                           |  2 +-
 superset/common/query_context_processor.py         |  6 ++--
 superset/config.py                                 |  2 +-
 superset/reports/notifications/slack.py            | 11 +++++---
 superset/views/database/views.py                   |  2 --
 superset/viz.py                                    |  8 +++---
 tests/integration_tests/viz_tests.py               |  2 +-
 .../pandas_postprocessing/test_rolling.py          | 28 +++++++++----------
 10 files changed, 33 insertions(+), 34 deletions(-)

diff --git a/requirements/base.txt b/requirements/base.txt
index dc042a7747210..dd6e65791d08b 100644
--- a/requirements/base.txt
+++ b/requirements/base.txt
@@ -190,7 +190,7 @@ packaging==23.1
     # deprecation
     # limits
     # marshmallow
-pandas==1.5.3
+pandas==2.0.3
     # via apache-superset
 paramiko==2.11.0
     # via sshtunnel
@@ -288,6 +288,8 @@ typing-extensions==4.4.0
     # apache-superset
     # flask-limiter
     # limits
+tzdata==2023.3
+    # via pandas
 urllib3==1.26.6
     # via selenium
 vine==5.0.0
diff --git a/requirements/testing.txt b/requirements/testing.txt
index 5605167228c7f..283b4c9fcd8cc 100644
--- a/requirements/testing.txt
+++ b/requirements/testing.txt
@@ -152,8 +152,6 @@ tqdm==4.65.0
     # prophet
 trino==0.324.0
     # via apache-superset
-tzdata==2023.3
-    # via pytz-deprecation-shim
 tzlocal==4.3
     # via trino
 websocket-client==1.5.1
diff --git a/setup.py b/setup.py
index 814214f5a5c91..ff69b73eb5e64 100644
--- a/setup.py
+++ b/setup.py
@@ -102,7 +102,7 @@ def get_git_sha() -> str:
         "nh3>=0.2.11, <0.3",
         "numpy==1.23.5",
         "packaging",
-        "pandas>=1.5.3, <1.6",
+        "pandas>=2.0.3, <2.1",
         "parsedatetime",
         "pgsanity",
         "polyline>=2.0.0, <3.0",
diff --git a/superset/common/query_context_processor.py b/superset/common/query_context_processor.py
index f6152b232a938..72021a4626e49 100644
--- a/superset/common/query_context_processor.py
+++ b/superset/common/query_context_processor.py
@@ -138,7 +138,7 @@ def get_df_payload(
 
         if query_obj and cache_key and not cache.is_loaded:
             try:
-                invalid_columns = [
+                if invalid_columns := [
                     col
                     for col in get_column_names_from_columns(query_obj.columns)
                     + get_column_names_from_metrics(query_obj.metrics or [])
@@ -146,9 +146,7 @@ def get_df_payload(
                         col not in self._qc_datasource.column_names
                         and col != DTTM_ALIAS
                     )
-                ]
-
-                if invalid_columns:
+                ]:
                     raise QueryObjectValidationError(
                         _(
                             "Columns missing in dataset: %(invalid_columns)s",
diff --git a/superset/config.py b/superset/config.py
index 4a7434093c9b1..7fbd5e52ceade 100644
--- a/superset/config.py
+++ b/superset/config.py
@@ -761,7 +761,7 @@ class D3Format(TypedDict, total=False):
 # Excel Options: key/value pairs that will be passed as argument to DataFrame.to_excel
 # method.
 # note: index option should not be overridden
-EXCEL_EXPORT = {"encoding": "utf-8"}
+EXCEL_EXPORT: dict[str, Any] = {}
 
 # ---------------------------------------------------
 # Time grain configurations
diff --git a/superset/reports/notifications/slack.py b/superset/reports/notifications/slack.py
index 4c3f2ee419a5c..a769622b57640 100644
--- a/superset/reports/notifications/slack.py
+++ b/superset/reports/notifications/slack.py
@@ -21,6 +21,7 @@
 from typing import Union
 
 import backoff
+import pandas as pd
 from flask_babel import gettext as __
 from slack_sdk import WebClient
 from slack_sdk.errors import (
@@ -121,8 +122,9 @@ def _get_body(self) -> str:
             # need to truncate the data
             for i in range(len(df) - 1):
                 truncated_df = df[: i + 1].fillna("")
-                truncated_df = truncated_df.append(
-                    {k: "..." for k in df.columns}, ignore_index=True
+                truncated_row = pd.Series({k: "..." for k in df.columns})
+                truncated_df = pd.concat(
+                    [truncated_df, truncated_row.to_frame().T], ignore_index=True
                 )
                 tabulated = df.to_markdown()
                 table = f"```\n{tabulated}\n```\n\n(table was truncated)"
@@ -130,8 +132,9 @@ def _get_body(self) -> str:
                 if len(message) > MAXIMUM_MESSAGE_SIZE:
                     # Decrement i and build a message that is under the limit
                     truncated_df = df[:i].fillna("")
-                    truncated_df = truncated_df.append(
-                        {k: "..." for k in df.columns}, ignore_index=True
+                    truncated_row = pd.Series({k: "..." for k in df.columns})
+                    truncated_df = pd.concat(
+                        [truncated_df, truncated_row.to_frame().T], ignore_index=True
                     )
                     tabulated = df.to_markdown()
                     table = (
diff --git a/superset/views/database/views.py b/superset/views/database/views.py
index 1c7ff25942706..1862ac2088f54 100644
--- a/superset/views/database/views.py
+++ b/superset/views/database/views.py
@@ -201,7 +201,6 @@ def form_post(self, form: CsvToDatabaseForm) -> Response:
                 infer_datetime_format=form.infer_datetime_format.data,
                 iterator=True,
                 keep_default_na=not form.null_values.data,
-                mangle_dupe_cols=form.overwrite_duplicate.data,
                 usecols=form.use_cols.data if form.use_cols.data else None,
                 na_values=form.null_values.data if form.null_values.data else None,
                 nrows=form.nrows.data,
@@ -344,7 +343,6 @@ def form_post(self, form: ExcelToDatabaseForm) -> Response:
                 index_col=form.index_col.data,
                 io=form.excel_file.data,
                 keep_default_na=not form.null_values.data,
-                mangle_dupe_cols=form.mangle_dupe_cols.data,
                 na_values=form.null_values.data if form.null_values.data else None,
                 parse_dates=form.parse_dates.data,
                 skiprows=form.skiprows.data,
diff --git a/superset/viz.py b/superset/viz.py
index 3051f104e20af..0c4a1ccaa8c19 100644
--- a/superset/viz.py
+++ b/superset/viz.py
@@ -2636,7 +2636,7 @@ def levels_for(
         for i in range(0, len(groups) + 1):
             agg_df = df.groupby(groups[:i]) if i else df
             levels[i] = (
-                agg_df.mean()
+                agg_df.mean(numeric_only=True)
                 if time_op == "agg_mean"
                 else agg_df.sum(numeric_only=True)
             )
@@ -2661,7 +2661,7 @@ def levels_for_diff(
                 lambda a, b, fill_value: a / float(b) - 1,
             ],
         }[time_op]
-        agg_df = df.groupby(DTTM_ALIAS).sum()
+        agg_df = df.groupby(DTTM_ALIAS).sum(numeric_only=True)
         levels = {
             0: pd.Series(
                 {
@@ -2671,7 +2671,7 @@ def levels_for_diff(
             )
         }
         for i in range(1, len(groups) + 1):
-            agg_df = df.groupby([DTTM_ALIAS] + groups[:i]).sum()
+            agg_df = df.groupby([DTTM_ALIAS] + groups[:i]).sum(numeric_only=True)
             levels[i] = pd.DataFrame(
                 {
                     m: func[0](agg_df[m][until], agg_df[m][since], fill_value=0)
@@ -2687,7 +2687,7 @@ def levels_for_time(
         procs = {}
         for i in range(0, len(groups) + 1):
             self.form_data["groupby"] = groups[:i]
-            df_drop = df.drop(groups[i:], 1)
+            df_drop = df.drop(groups[i:], axis=1)
             procs[i] = self.process_data(df_drop, aggregate=True)
         self.form_data["groupby"] = groups
         return procs
diff --git a/tests/integration_tests/viz_tests.py b/tests/integration_tests/viz_tests.py
index f1665e96888d0..c4c11df9d8120 100644
--- a/tests/integration_tests/viz_tests.py
+++ b/tests/integration_tests/viz_tests.py
@@ -627,7 +627,7 @@ def test_nest_procs_returns_hierarchy(self):
         metrics = ["metric1", "metric2", "metric3"]
         procs = {}
         for i in range(0, 4):
-            df_drop = df.drop(groups[i:], 1)
+            df_drop = df.drop(groups[i:], axis=1)
             pivot = df_drop.pivot_table(
                 index=DTTM_ALIAS, columns=groups[:i], values=metrics
             )
diff --git a/tests/unit_tests/pandas_postprocessing/test_rolling.py b/tests/unit_tests/pandas_postprocessing/test_rolling.py
index b72a8bee44827..1859b0c2c7a1f 100644
--- a/tests/unit_tests/pandas_postprocessing/test_rolling.py
+++ b/tests/unit_tests/pandas_postprocessing/test_rolling.py
@@ -149,21 +149,21 @@ def test_rolling_after_pivot_with_single_metric():
                sum_metric
     country            UK     US
     dttm
-    2019-01-01        5.0    6.0
-    2019-01-02       12.0   14.0
+    2019-01-01          5      6
+    2019-01-02         12     14
     """
     flat_df = pp.flatten(rolling_df)
     """
             dttm  sum_metric, UK  sum_metric, US
-    0 2019-01-01             5.0             6.0
-    1 2019-01-02            12.0            14.0
+    0 2019-01-01               5               6
+    1 2019-01-02              12              14
     """
     assert flat_df.equals(
         pd.DataFrame(
             data={
                 "dttm": pd.to_datetime(["2019-01-01", "2019-01-02"]),
-                FLAT_COLUMN_SEPARATOR.join(["sum_metric", "UK"]): [5.0, 12.0],
-                FLAT_COLUMN_SEPARATOR.join(["sum_metric", "US"]): [6.0, 14.0],
+                FLAT_COLUMN_SEPARATOR.join(["sum_metric", "UK"]): [5, 12],
+                FLAT_COLUMN_SEPARATOR.join(["sum_metric", "US"]): [6, 14],
             }
         )
     )
@@ -200,23 +200,23 @@ def test_rolling_after_pivot_with_multiple_metrics():
               count_metric       sum_metric
     country             UK   US          UK   US
     dttm
-    2019-01-01         1.0  2.0         5.0  6.0
-    2019-01-02         4.0  6.0        12.0 14.0
+    2019-01-01           1    2           5    6
+    2019-01-02           4    6          12   14
     """
     flat_df = pp.flatten(rolling_df)
     """
             dttm  count_metric, UK  count_metric, US  sum_metric, UK  sum_metric, US
-    0 2019-01-01               1.0               2.0             5.0             6.0
-    1 2019-01-02               4.0               6.0            12.0            14.0
+    0 2019-01-01                 1                 2               5               6
+    1 2019-01-02                 4                 6              12              14
     """
     assert flat_df.equals(
         pd.DataFrame(
             data={
                 "dttm": pd.to_datetime(["2019-01-01", "2019-01-02"]),
-                FLAT_COLUMN_SEPARATOR.join(["count_metric", "UK"]): [1.0, 4.0],
-                FLAT_COLUMN_SEPARATOR.join(["count_metric", "US"]): [2.0, 6.0],
-                FLAT_COLUMN_SEPARATOR.join(["sum_metric", "UK"]): [5.0, 12.0],
-                FLAT_COLUMN_SEPARATOR.join(["sum_metric", "US"]): [6.0, 14.0],
+                FLAT_COLUMN_SEPARATOR.join(["count_metric", "UK"]): [1, 4],
+                FLAT_COLUMN_SEPARATOR.join(["count_metric", "US"]): [2, 6],
+                FLAT_COLUMN_SEPARATOR.join(["sum_metric", "UK"]): [5, 12],
+                FLAT_COLUMN_SEPARATOR.join(["sum_metric", "US"]): [6, 14],
             }
         )
    )
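
Reviewer note, not part of the applyable patch: the code changes above all stem from
a handful of pandas 2.x API changes -- DataFrame.append() was removed, positional
arguments after labels were removed from DataFrame.drop(), the numeric_only default
for aggregations flipped to False, mangle_dupe_cols was removed from
read_csv/read_excel, and the encoding option was removed from DataFrame.to_excel
(hence the emptied EXCEL_EXPORT default). A minimal sketch of the three code-level
migration patterns, using a toy DataFrame with made-up column names rather than
anything from the Superset codebase:

    import pandas as pd

    df = pd.DataFrame(
        {"group": ["a", "a", "b"], "value": [1, 2, 3], "label": ["x", "y", "z"]}
    )

    # 1. DataFrame.append() is gone in pandas 2.0: build a one-row frame from a
    #    Series and use pd.concat() instead, as slack.py now does.
    ellipsis_row = pd.Series({col: "..." for col in df.columns})
    truncated = pd.concat([df, ellipsis_row.to_frame().T], ignore_index=True)

    # 2. The numeric_only default for groupby aggregations changed from True to
    #    False, so mean() now raises on string columns instead of silently
    #    dropping them; numeric_only=True restores the old result, as viz.py does.
    means = df.groupby("group").mean(numeric_only=True)

    # 3. DataFrame.drop() no longer accepts the axis positionally; it has to be
    #    spelled out as a keyword, as viz.py and the tests now do.
    trimmed = df.drop(["label"], axis=1)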