Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(python)!: Remove pyxlsb engine from read_excel #16784

Merged
merged 1 commit into from
Jun 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions docs/user-guide/io/excel.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ Polars does not have a native Excel reader. Instead, it uses external libraries

- xlsx2csv: This is the current default.
- openpyxl: Typically slower than xlsx2csv, but can provide more flexibility for files that are difficult to parse.
- pyxlsb: For reading binary Excel files (xlsb).
- fastexcel: This reader is based on [calamine](https://github.com/tafia/calamine) and is typically the fastest reader, but has fewer features than xlsx2csv.

Although fastexcel is not the default at this point, we recommend trying fastexcel first and using xlsx2csv or openpyxl if you encounter issues.
Expand All @@ -19,7 +18,7 @@ To use one of these engines, the appropriate Python package must be installed as
=== ":fontawesome-brands-python: Python"

```shell
$ pip install xlsx2csv openpyxl pyxlsb fastexcel
$ pip install xlsx2csv openpyxl fastexcel
```

The default Excel reader is xlsx2csv.
Expand Down
84 changes: 0 additions & 84 deletions py-polars/polars/io/spreadsheet/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,8 +152,6 @@ def read_excel(

.. versionadded:: 0.20.6
Added "calamine" fastexcel engine for Excel Workbooks (.xlsx, .xlsb, .xls).
.. versionadded:: 0.19.4
Added "pyxlsb" engine for Excel Binary Workbooks (.xlsb).
.. versionadded:: 0.19.3
Added "openpyxl" engine, and added `schema_overrides` parameter.

Expand Down Expand Up @@ -184,18 +182,13 @@ def read_excel(
additional automatic type inference; potentially useful if you are otherwise
unable to parse your sheet with the (default) `xlsx2csv` engine in
conjunction with the `schema_overrides` parameter.
* "pyxlsb": this engine can be used for Excel Binary Workbooks (`.xlsb` files).
Note that you have to use `schema_overrides` to correctly load date/datetime
columns (or these will be read as floats representing offset Julian values).
You should now prefer the "calamine" engine for this Workbook type.
engine_options
Additional options passed to the underlying engine's primary parsing
constructor (given below), if supported:

* "xlsx2csv": `Xlsx2csv`
* "calamine": n/a (can only provide `read_options`)
* "openpyxl": `load_workbook`
* "pyxlsb": `open_workbook`
read_options
Options passed to the underlying engine method that reads the sheet data.
Where supported, this allows for additional control over parsing. The
Expand All @@ -204,7 +197,6 @@ def read_excel(
* "xlsx2csv": `pl.read_csv`
* "calamine": `ExcelReader.load_sheet_by_name`
* "openpyxl": n/a (can only provide `engine_options`)
* "pyxlsb": n/a (can only provide `engine_options`)
schema_overrides
Support type specification or override of one or more columns.
infer_schema_length
Expand Down Expand Up @@ -658,24 +650,6 @@ def _initialise_spreadsheet_parser(
]
return _read_spreadsheet_calamine, parser, sheets

elif engine == "pyxlsb":
issue_deprecation_warning(
"the 'pyxlsb' engine is deprecated and should be replaced with 'calamine'",
version="0.20.22",
)
pyxlsb = import_optional("pyxlsb")
try:
parser = pyxlsb.open_workbook(source, **engine_options)
except KeyError as err:
if "no item named 'xl/_rels/workbook.bin.rels'" in str(err):
msg = f"invalid Excel Binary Workbook: {source!r}"
raise TypeError(msg) from None
raise
sheets = [
{"index": i + 1, "name": name} for i, name in enumerate(parser.sheets)
]
return _read_spreadsheet_pyxlsb, parser, sheets

msg = f"unrecognized engine: {engine!r}"
raise NotImplementedError(msg)

Expand Down Expand Up @@ -890,64 +864,6 @@ def _read_spreadsheet_calamine(
return df


def _read_spreadsheet_pyxlsb(
    parser: Any,
    sheet_name: str | None,
    read_options: dict[str, Any],
    schema_overrides: SchemaDict | None,
    *,
    raise_if_empty: bool,
) -> pl.DataFrame:
    """
    Load a single worksheet from a `pyxlsb` workbook into a DataFrame.

    Parameters
    ----------
    parser
        Workbook object; only its `get_sheet` method is used here.
    sheet_name
        Identifier forwarded unchanged to `parser.get_sheet`.
    read_options
        Only "infer_schema_length" is consumed; note that it is popped from
        (and so removed from) the dictionary that the caller passed in.
    schema_overrides
        Optional mapping of column name to dtype. `String` columns are
        stringified before the Series is built, and `Datetime`/`Date` columns
        are run through `pyxlsb.convert_date` after loading.
    raise_if_empty
        Forwarded unchanged to `_drop_null_data`.

    Returns
    -------
    DataFrame
        The frame assembled from the sheet columns, after being passed
        through `_drop_null_data`.
    """
    from pyxlsb import convert_date

    infer_schema_length = read_options.pop("infer_schema_length", None)
    worksheet = parser.get_sheet(sheet_name)
    try:
        # the first row holding at least one non-null cell is taken as the
        # header; the row iterator is left positioned just after that row
        remaining_rows = worksheet.rows()
        column_names: list[str | None] = []
        for row in remaining_rows:
            cell_values = [cell.v for cell in row]
            if not all(v is None for v in cell_values):
                column_names.extend(cell_values)
                break

        # transpose whatever rows are left and build one series per column
        columns = []
        for col_name, col_cells in zip(column_names, zip(*remaining_rows)):
            if not col_name:
                # columns with an empty/missing header cell are not loaded
                continue

            raw = [cell.v for cell in col_cells]
            if (dtype := (schema_overrides or {}).get(col_name)) == String:
                # a series initialised from mixed-type data (eg: str/int) would
                # turn the non-strings into nulls, so cast up-front; floats
                # holding whole numbers are rendered without the trailing ".0"
                stringified = []
                for v in raw:
                    if v is None:
                        stringified.append(v)
                    elif isinstance(v, float) and v.is_integer():
                        stringified.append(str(int(v)))
                    else:
                        stringified.append(str(v))
                raw = stringified
            elif dtype in (Datetime, Date):
                # temporal columns are loaded with an inferred dtype here and
                # converted once the worksheet has been closed (see below)
                dtype = None

            columns.append(pl.Series(col_name, raw, dtype=dtype))
    finally:
        worksheet.close()

    if schema_overrides:
        columns = [
            (
                col.map_elements(convert_date, return_dtype=Datetime)
                if schema_overrides.get(col.name) in (Datetime, Date)
                else col
            )
            for col in columns
        ]

    frame = pl.DataFrame(
        {col.name: col for col in columns},
        schema_overrides=schema_overrides,
        infer_schema_length=infer_schema_length,
        strict=False,
    )
    return _drop_null_data(frame, raise_if_empty=raise_if_empty)


def _read_spreadsheet_xlsx2csv(
parser: Any,
sheet_name: str | None,
Expand Down
2 changes: 0 additions & 2 deletions py-polars/polars/meta/versions.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ def show_versions() -> None:
pyarrow: 16.0.0
pydantic: 2.7.1
pyiceberg: 0.6.0
pyxlsb: 1.0.10
sqlalchemy: 2.0.29
torch: 2.2.2
xlsx2csv: 0.8.2
Expand Down Expand Up @@ -79,7 +78,6 @@ def _get_dependency_info() -> dict[str, str]:
"pyarrow",
"pydantic",
"pyiceberg",
"pyxlsb",
"sqlalchemy",
"torch",
"xlsx2csv",
Expand Down
4 changes: 1 addition & 3 deletions py-polars/polars/type_aliases.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,9 +215,7 @@
BufferInfo: TypeAlias = Tuple[int, int, int]

# type alias for supported spreadsheet engines
ExcelSpreadsheetEngine: TypeAlias = Literal[
"xlsx2csv", "openpyxl", "calamine", "pyxlsb"
]
ExcelSpreadsheetEngine: TypeAlias = Literal["xlsx2csv", "openpyxl", "calamine"]


class SeriesBuffers(TypedDict):
Expand Down
3 changes: 0 additions & 3 deletions py-polars/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ pandas = ["pyarrow >= 7.0.0", "pandas"]
plot = ["hvplot >= 0.9.1"]
pyarrow = ["pyarrow >= 7.0.0"]
pydantic = ["pydantic"]
pyxlsb = ["pyxlsb >= 1.0"]
sqlalchemy = ["sqlalchemy", "pandas"]
timezone = ["backports.zoneinfo; python_version < '3.9'", "tzdata; platform_system == 'Windows'"]
xlsx2csv = ["xlsx2csv >= 0.8.0"]
Expand Down Expand Up @@ -101,7 +100,6 @@ module = [
"pyarrow.*",
"pydantic",
"pyiceberg.*",
"pyxlsb",
"sqlalchemy.*",
"torch.*",
"xlsx2csv",
Expand Down Expand Up @@ -234,7 +232,6 @@ filterwarnings = [
# TODO: Excel tests lead to unclosed file warnings
# https://github.com/pola-rs/polars/issues/14466
"ignore:unclosed file.*:ResourceWarning",
"ignore:the 'pyxlsb' engine is deprecated.*:DeprecationWarning",
# TODO: Remove when behavior is updated
# https://github.com/pola-rs/polars/issues/13441
"ignore:.*default coalesce behavior of left join.*:DeprecationWarning",
Expand Down
1 change: 0 additions & 1 deletion py-polars/requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ s3fs[boto3]
lxml
fastexcel>=0.9
openpyxl
pyxlsb
xlsx2csv
XlsxWriter
deltalake>=0.15.0
Expand Down
10 changes: 1 addition & 9 deletions py-polars/tests/unit/io/test_spreadsheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,6 @@ def path_ods_mixed(io_files_path: Path) -> Path:
(pl.read_excel, "path_xlsx", {"engine": "calamine"}),
# xlsb file (binary)
(pl.read_excel, "path_xlsb", {"engine": "calamine"}),
(pl.read_excel, "path_xlsb", {"engine": "pyxlsb"}),
# open document
(pl.read_ods, "path_ods", {}),
],
Expand Down Expand Up @@ -126,7 +125,6 @@ def test_read_spreadsheet(
(pl.read_excel, "path_xlsx", {"engine": "calamine"}),
# xlsb file (binary)
(pl.read_excel, "path_xlsb", {"engine": "calamine"}),
(pl.read_excel, "path_xlsb", {"engine": "pyxlsb"}),
# open document
(pl.read_ods, "path_ods", {}),
],
Expand Down Expand Up @@ -171,7 +169,6 @@ def test_read_excel_multi_sheets(
(pl.read_excel, "path_xlsx", {"engine": "calamine"}),
# xlsb file (binary)
(pl.read_excel, "path_xlsb", {"engine": "calamine"}),
(pl.read_excel, "path_xlsb", {"engine": "pyxlsb"}),
# open document
(pl.read_ods, "path_ods", {}),
],
Expand Down Expand Up @@ -278,7 +275,6 @@ def test_read_excel_basic_datatypes(
(pl.read_excel, "path_xlsx", {"engine": "calamine"}),
# xlsb file (binary)
(pl.read_excel, "path_xlsb", {"engine": "calamine"}),
(pl.read_excel, "path_xlsb", {"engine": "pyxlsb"}),
# open document
(pl.read_ods, "path_ods", {}),
],
Expand Down Expand Up @@ -308,7 +304,6 @@ def test_read_invalid_worksheet(
("read_spreadsheet", "source", "additional_params"),
[
(pl.read_excel, "path_xlsx_mixed", {"engine": "openpyxl"}),
(pl.read_excel, "path_xlsb_mixed", {"engine": "pyxlsb"}),
(pl.read_ods, "path_ods_mixed", {}),
],
)
Expand Down Expand Up @@ -488,10 +483,7 @@ def test_unsupported_engine() -> None:
pl.read_excel(None, engine="foo") # type: ignore[call-overload]


def test_unsupported_binary_workbook(path_xlsx: Path, path_xlsb: Path) -> None:
with pytest.raises(Exception, match="invalid Excel Binary Workbook"):
pl.read_excel(path_xlsx, engine="pyxlsb")

def test_unsupported_binary_workbook(path_xlsb: Path) -> None:
with pytest.raises(Exception, match="does not support binary format"):
pl.read_excel(path_xlsb, engine="openpyxl")

Expand Down
Loading