feat(python)!: Remove pyxlsb engine from read_excel (#16784)

pola-rs · Jun 6, 2024 · 3a36067 · 3a36067
1 parent b80972b
commit 3a36067
Show file tree

Hide file tree

Showing 7 changed files with 3 additions and 104 deletions.
diff --git a/docs/user-guide/io/excel.md b/docs/user-guide/io/excel.md
@@ -9,7 +9,6 @@ Polars does not have a native Excel reader. Instead, it uses external libraries
 
 - xlsx2csv: This is the current default.
 - openpyxl: Typically slower than xlsx2csv, but can provide more flexibility for files that are difficult to parse.
-- pyxlsb: For reading binary Excel files (xlsb).
 - fastexcel: This reader is based on [calamine](https://github.com/tafia/calamine) and is typically the fastest reader but has fewer features than xlsx2csv.
 
 Although fastexcel is not the default at this point, we recommend trying fastexcel first and using xlsx2csv or openpyxl if you encounter issues.
@@ -19,7 +18,7 @@ To use one of these engines, the appropriate Python package must be installed as
 === ":fontawesome-brands-python: Python"
 
     ```shell
-    $ pip install xlsx2csv openpyxl pyxlsb fastexcel
+    $ pip install xlsx2csv openpyxl fastexcel
     ```
 
 The default Excel reader is xlsx2csv.

diff --git a/py-polars/polars/io/spreadsheet/functions.py b/py-polars/polars/io/spreadsheet/functions.py
@@ -152,8 +152,6 @@ def read_excel(
 
     .. versionadded:: 0.20.6
         Added "calamine" fastexcel engine for Excel Workbooks (.xlsx, .xlsb, .xls).
-    .. versionadded:: 0.19.4
-        Added "pyxlsb" engine for Excel Binary Workbooks (.xlsb).
     .. versionadded:: 0.19.3
         Added "openpyxl" engine, and added `schema_overrides` parameter.
 
@@ -184,18 +182,13 @@ def read_excel(
           additional automatic type inference; potentially useful if you are otherwise
           unable to parse your sheet with the (default) `xlsx2csv` engine in
           conjunction with the `schema_overrides` parameter.
-        * "pyxlsb": this engine can be used for Excel Binary Workbooks (`.xlsb` files).
-          Note that you have to use `schema_overrides` to correctly load date/datetime
-          columns (or these will be read as floats representing offset Julian values).
-          You should now prefer the "calamine" engine for this Workbook type.
     engine_options
         Additional options passed to the underlying engine's primary parsing
         constructor (given below), if supported:
 
         * "xlsx2csv": `Xlsx2csv`
         * "calamine": n/a (can only provide `read_options`)
         * "openpyxl": `load_workbook`
-        * "pyxlsb": `open_workbook`
     read_options
         Options passed to the underlying engine method that reads the sheet data.
         Where supported, this allows for additional control over parsing. The
@@ -204,7 +197,6 @@ def read_excel(
         * "xlsx2csv": `pl.read_csv`
         * "calamine": `ExcelReader.load_sheet_by_name`
         * "openpyxl": n/a (can only provide `engine_options`)
-        * "pyxlsb":  n/a (can only provide `engine_options`)
     schema_overrides
         Support type specification or override of one or more columns.
     infer_schema_length
@@ -658,24 +650,6 @@ def _initialise_spreadsheet_parser(
         ]
         return _read_spreadsheet_calamine, parser, sheets
 
-    elif engine == "pyxlsb":
-        issue_deprecation_warning(
-            "the 'pyxlsb' engine is deprecated and should be replaced with 'calamine'",
-            version="0.20.22",
-        )
-        pyxlsb = import_optional("pyxlsb")
-        try:
-            parser = pyxlsb.open_workbook(source, **engine_options)
-        except KeyError as err:
-            if "no item named 'xl/_rels/workbook.bin.rels'" in str(err):
-                msg = f"invalid Excel Binary Workbook: {source!r}"
-                raise TypeError(msg) from None
-            raise
-        sheets = [
-            {"index": i + 1, "name": name} for i, name in enumerate(parser.sheets)
-        ]
-        return _read_spreadsheet_pyxlsb, parser, sheets
-
     msg = f"unrecognized engine: {engine!r}"
     raise NotImplementedError(msg)
 
@@ -890,64 +864,6 @@ def _read_spreadsheet_calamine(
     return df
 
 
-def _read_spreadsheet_pyxlsb(
-    parser: Any,
-    sheet_name: str | None,
-    read_options: dict[str, Any],
-    schema_overrides: SchemaDict | None,
-    *,
-    raise_if_empty: bool,
-) -> pl.DataFrame:
-    from pyxlsb import convert_date
-
-    infer_schema_length = read_options.pop("infer_schema_length", None)
-    ws = parser.get_sheet(sheet_name)
-    try:
-        # establish header/data rows
-        header: list[str | None] = []
-        rows_iter = ws.rows()
-        for row in rows_iter:
-            row_values = [cell.v for cell in row]
-            if any(v is not None for v in row_values):
-                header.extend(row_values)
-                break
-
-        # load data rows as series
-        series_data = []
-        for name, column_data in zip(header, zip(*rows_iter)):
-            if name:
-                values = [cell.v for cell in column_data]
-                if (dtype := (schema_overrides or {}).get(name)) == String:
-                    # note: if we init series with mixed-type data (eg: str/int)
-                    # the non-strings will become null, so we handle the cast here
-                    values = [
-                        str(int(v) if isinstance(v, float) and v.is_integer() else v)
-                        if (v is not None)
-                        else v
-                        for v in values
-                    ]
-                elif dtype in (Datetime, Date):
-                    dtype = None
-
-                s = pl.Series(name, values, dtype=dtype)
-                series_data.append(s)
-    finally:
-        ws.close()
-
-    if schema_overrides:
-        for idx, s in enumerate(series_data):
-            if schema_overrides.get(s.name) in (Datetime, Date):
-                series_data[idx] = s.map_elements(convert_date, return_dtype=Datetime)
-
-    df = pl.DataFrame(
-        {s.name: s for s in series_data},
-        schema_overrides=schema_overrides,
-        infer_schema_length=infer_schema_length,
-        strict=False,
-    )
-    return _drop_null_data(df, raise_if_empty=raise_if_empty)
-
-
 def _read_spreadsheet_xlsx2csv(
     parser: Any,
     sheet_name: str | None,

diff --git a/py-polars/polars/meta/versions.py b/py-polars/polars/meta/versions.py
@@ -35,7 +35,6 @@ def show_versions() -> None:
     pyarrow:              16.0.0
     pydantic:             2.7.1
     pyiceberg:            0.6.0
-    pyxlsb:               1.0.10
     sqlalchemy:           2.0.29
     torch:                2.2.2
     xlsx2csv:             0.8.2
@@ -79,7 +78,6 @@ def _get_dependency_info() -> dict[str, str]:
         "pyarrow",
         "pydantic",
         "pyiceberg",
-        "pyxlsb",
         "sqlalchemy",
         "torch",
         "xlsx2csv",

diff --git a/py-polars/polars/type_aliases.py b/py-polars/polars/type_aliases.py
@@ -215,9 +215,7 @@
 BufferInfo: TypeAlias = Tuple[int, int, int]
 
 # type alias for supported spreadsheet engines
-ExcelSpreadsheetEngine: TypeAlias = Literal[
-    "xlsx2csv", "openpyxl", "calamine", "pyxlsb"
-]
+ExcelSpreadsheetEngine: TypeAlias = Literal["xlsx2csv", "openpyxl", "calamine"]
 
 
 class SeriesBuffers(TypedDict):

diff --git a/py-polars/pyproject.toml b/py-polars/pyproject.toml
@@ -55,7 +55,6 @@ pandas = ["pyarrow >= 7.0.0", "pandas"]
 plot = ["hvplot >= 0.9.1"]
 pyarrow = ["pyarrow >= 7.0.0"]
 pydantic = ["pydantic"]
-pyxlsb = ["pyxlsb >= 1.0"]
 sqlalchemy = ["sqlalchemy", "pandas"]
 timezone = ["backports.zoneinfo; python_version < '3.9'", "tzdata; platform_system == 'Windows'"]
 xlsx2csv = ["xlsx2csv >= 0.8.0"]
@@ -101,7 +100,6 @@ module = [
   "pyarrow.*",
   "pydantic",
   "pyiceberg.*",
-  "pyxlsb",
   "sqlalchemy.*",
   "torch.*",
   "xlsx2csv",
@@ -234,7 +232,6 @@ filterwarnings = [
   # TODO: Excel tests lead to unclosed file warnings
   # https://github.com/pola-rs/polars/issues/14466
   "ignore:unclosed file.*:ResourceWarning",
-  "ignore:the 'pyxlsb' engine is deprecated.*:DeprecationWarning",
   # TODO: Remove when behavior is updated
   # https://github.com/pola-rs/polars/issues/13441
   "ignore:.*default coalesce behavior of left join.*:DeprecationWarning",

diff --git a/py-polars/requirements-dev.txt b/py-polars/requirements-dev.txt
@@ -39,7 +39,6 @@ s3fs[boto3]
 lxml
 fastexcel>=0.9
 openpyxl
-pyxlsb
 xlsx2csv
 XlsxWriter
 deltalake>=0.15.0

diff --git a/py-polars/tests/unit/io/test_spreadsheet.py b/py-polars/tests/unit/io/test_spreadsheet.py
@@ -88,7 +88,6 @@ def path_ods_mixed(io_files_path: Path) -> Path:
         (pl.read_excel, "path_xlsx", {"engine": "calamine"}),
         # xlsb file (binary)
         (pl.read_excel, "path_xlsb", {"engine": "calamine"}),
-        (pl.read_excel, "path_xlsb", {"engine": "pyxlsb"}),
         # open document
         (pl.read_ods, "path_ods", {}),
     ],
@@ -126,7 +125,6 @@ def test_read_spreadsheet(
         (pl.read_excel, "path_xlsx", {"engine": "calamine"}),
         # xlsb file (binary)
         (pl.read_excel, "path_xlsb", {"engine": "calamine"}),
-        (pl.read_excel, "path_xlsb", {"engine": "pyxlsb"}),
         # open document
         (pl.read_ods, "path_ods", {}),
     ],
@@ -171,7 +169,6 @@ def test_read_excel_multi_sheets(
         (pl.read_excel, "path_xlsx", {"engine": "calamine"}),
         # xlsb file (binary)
         (pl.read_excel, "path_xlsb", {"engine": "calamine"}),
-        (pl.read_excel, "path_xlsb", {"engine": "pyxlsb"}),
         # open document
         (pl.read_ods, "path_ods", {}),
     ],
@@ -278,7 +275,6 @@ def test_read_excel_basic_datatypes(
         (pl.read_excel, "path_xlsx", {"engine": "calamine"}),
         # xlsb file (binary)
         (pl.read_excel, "path_xlsb", {"engine": "calamine"}),
-        (pl.read_excel, "path_xlsb", {"engine": "pyxlsb"}),
         # open document
         (pl.read_ods, "path_ods", {}),
     ],
@@ -308,7 +304,6 @@ def test_read_invalid_worksheet(
     ("read_spreadsheet", "source", "additional_params"),
     [
         (pl.read_excel, "path_xlsx_mixed", {"engine": "openpyxl"}),
-        (pl.read_excel, "path_xlsb_mixed", {"engine": "pyxlsb"}),
         (pl.read_ods, "path_ods_mixed", {}),
     ],
 )
@@ -488,10 +483,7 @@ def test_unsupported_engine() -> None:
         pl.read_excel(None, engine="foo")  # type: ignore[call-overload]
 
 
-def test_unsupported_binary_workbook(path_xlsx: Path, path_xlsb: Path) -> None:
-    with pytest.raises(Exception, match="invalid Excel Binary Workbook"):
-        pl.read_excel(path_xlsx, engine="pyxlsb")
-
+def test_unsupported_binary_workbook(path_xlsb: Path) -> None:
     with pytest.raises(Exception, match="does not support binary format"):
         pl.read_excel(path_xlsb, engine="openpyxl")