Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

chore(ci): fix numpy type errors and revert #22610 #22782

Merged
merged 7 commits into the base branch from the source branch
Jan 19, 2023
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@
[metadata]
name = Superset
summary = a data exploration platform
description-file = README.md
description_file = README.md
author = Apache Superset Dev
author-email = [email protected]
author_email = [email protected]
Comment on lines -20 to +22
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bycatch; noticed this was being emitted by pylint:

/Users/ville/apple/apache-superset/.tox/pylint/lib/python3.8/site-packages/setuptools/dist.py:770: UserWarning: Usage of dash-separated 'description-file' will not be supported in future versions. Please use the underscore name 'description_file' instead
warnings.warn(
/Users/ville/apple/apache-superset/.tox/pylint/lib/python3.8/site-packages/setuptools/dist.py:770: UserWarning: Usage of dash-separated 'author-email' will not be supported in future versions. Please use the underscore name 'author_email' instead

license = Apache License, Version 2.0

[files]
Expand Down
2 changes: 1 addition & 1 deletion superset/db_engine_specs/hive.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ def df_to_sql(
with cls.get_engine(database) as engine:
engine.execute(f"DROP TABLE IF EXISTS {str(table)}")

def _get_hive_type(dtype: np.dtype) -> str:
def _get_hive_type(dtype: np.dtype[Any]) -> str:
hive_type_by_dtype = {
np.dtype("bool"): "BOOLEAN",
np.dtype("float64"): "DOUBLE",
Expand Down
9 changes: 5 additions & 4 deletions superset/reports/commands/alert.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,13 @@
import logging
from operator import eq, ge, gt, le, lt, ne
from timeit import default_timer
from typing import Optional
from typing import Any, Optional

import numpy as np
import pandas as pd
from celery.exceptions import SoftTimeLimitExceeded
from flask_babel import lazy_gettext as _
from numpy.typing import NDArray

from superset import app, jinja_context, security_manager
from superset.commands.base import BaseCommand
Expand Down Expand Up @@ -84,12 +85,12 @@ def run(self) -> bool:
except (KeyError, json.JSONDecodeError) as ex:
raise AlertValidatorConfigError() from ex

def _validate_not_null(self, rows: np.recarray) -> None:
def _validate_not_null(self, rows: NDArray[Any]) -> None:
villebro marked this conversation as resolved.
Show resolved Hide resolved
self._validate_result(rows)
self._result = rows[0][1]

@staticmethod
def _validate_result(rows: np.recarray) -> None:
def _validate_result(rows: NDArray[Any]) -> None:
# check if query return more than one row
if len(rows) > 1:
raise AlertQueryMultipleRowsError(
Expand All @@ -108,7 +109,7 @@ def _validate_result(rows: np.recarray) -> None:
)
)

def _validate_operator(self, rows: np.recarray) -> None:
def _validate_operator(self, rows: NDArray[Any]) -> None:
self._validate_result(rows)
if rows[0][1] in (0, None, np.nan):
self._result = 0.0
Expand Down
15 changes: 8 additions & 7 deletions superset/result_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import numpy as np
import pandas as pd
import pyarrow as pa
from numpy.typing import NDArray

from superset.db_engine_specs import BaseEngineSpec
from superset.superset_typing import DbapiDescription, DbapiResult, ResultSetColumnType
Expand Down Expand Up @@ -62,16 +63,16 @@ def stringify(obj: Any) -> str:
return json.dumps(obj, default=utils.json_iso_dttm_ser)


def stringify_values(array: np.ndarray) -> np.ndarray:
def stringify_values(array: NDArray[Any]) -> NDArray[Any]:
result = np.copy(array)

with np.nditer(result, flags=["refs_ok"], op_flags=["readwrite"]) as it:
with np.nditer(result, flags=["refs_ok"], op_flags=[["readwrite"]]) as it:
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wow, this actually caught an incorrect type!

for obj in it:
if pd.isna(obj):
if na_obj := pd.isna(obj):
# pandas <NA> type cannot be converted to string
obj[pd.isna(obj)] = None
obj[na_obj] = None # type: ignore
else:
obj[...] = stringify(obj)
obj[...] = stringify(obj) # type: ignore

return result

Expand Down Expand Up @@ -106,7 +107,7 @@ def __init__( # pylint: disable=too-many-locals
pa_data: List[pa.Array] = []
deduped_cursor_desc: List[Tuple[Any, ...]] = []
numpy_dtype: List[Tuple[str, ...]] = []
stringified_arr: np.ndarray
stringified_arr: NDArray[Any]

if cursor_description:
# get deduped list of column names
Expand Down Expand Up @@ -208,7 +209,7 @@ def convert_table_to_df(table: pa.Table) -> pd.DataFrame:
return table.to_pandas(integer_object_nulls=True, timestamp_as_object=True)

@staticmethod
def first_nonempty(items: List[Any]) -> Any:
def first_nonempty(items: NDArray[Any]) -> Any:
return next((i for i in items if i), None)

def is_temporal(self, db_type_str: Optional[str]) -> bool:
Expand Down
10 changes: 5 additions & 5 deletions superset/utils/pandas_postprocessing/boxplot.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,10 @@ def boxplot(
"""

def quartile1(series: Series) -> float:
return np.nanpercentile(series, 25, interpolation="midpoint")
return np.nanpercentile(series, 25, interpolation="midpoint") # type: ignore

def quartile3(series: Series) -> float:
return np.nanpercentile(series, 75, interpolation="midpoint")
return np.nanpercentile(series, 75, interpolation="midpoint") # type: ignore

if whisker_type == PostProcessingBoxplotWhiskerType.TUKEY:

Expand Down Expand Up @@ -99,8 +99,8 @@ def whisker_low(series: Series) -> float:
return np.nanpercentile(series, low)

else:
whisker_high = np.max
whisker_low = np.min
whisker_high = np.max # type: ignore
whisker_low = np.min # type: ignore

def outliers(series: Series) -> Set[float]:
above = series[series > whisker_high(series)]
Expand All @@ -126,7 +126,7 @@ def outliers(series: Series) -> Set[float]:
# nanpercentile needs numeric values, otherwise the isnan function
# that's used in the underlying function will fail
for column in metrics:
if df.dtypes[column] == np.object:
if df.dtypes[column] == np.object_:
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nowadays np.object is the same as object:

>>> import numpy as np
>>> np.object
<stdin>:1: DeprecationWarning: `np.object` is a deprecated alias for the builtin `object`. To silence this warning, use `object` by itself. Doing this will not modify any behavior and is safe.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
<class 'object'>

I'm opting for np.object_ here as it seems cleaner and it also worked (there's a unit test for this, so we'll catch this if it changes in the future). Check https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations 👍
image

df[column] = to_numeric(df[column], errors="coerce")

return aggregate(df, groupby=groupby, aggregates=aggregates)
2 changes: 1 addition & 1 deletion superset/utils/pandas_postprocessing/flatten.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def flatten(
_columns = []
for series in df.columns.to_flat_index():
_cells = []
for cell in series if is_sequence(series) else [series]:
for cell in series if is_sequence(series) else [series]: # type: ignore
if pd.notnull(cell):
# every cell should be converted to string and escape comma
_cells.append(escape_separator(str(cell)))
Expand Down
2 changes: 1 addition & 1 deletion superset/utils/pandas_postprocessing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@

from superset.exceptions import InvalidPostProcessingError

NUMPY_FUNCTIONS = {
NUMPY_FUNCTIONS: Dict[str, Callable[..., Any]] = {
"average": np.average,
"argmin": np.argmin,
"argmax": np.argmax,
Expand Down