pola-rs · alexander-beedie · Nov 10, 2024 · Nov 9, 2024 · Nov 9, 2024
@@ -495,48 +495,6 @@ def read_ods(
     )
 
 
-def _identify_from_magic_bytes(data: IO[bytes] | bytes) -> str | None:
-    if isinstance(data, bytes):
-        data = BytesIO(data)
-
-    xls_bytes = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"  # excel 97-2004
-    xlsx_bytes = b"PK\x03\x04"  # xlsx/openoffice (zipped xml)
-
-    initial_position = data.tell()
-    try:
-        magic_bytes = data.read(8)
-        if magic_bytes == xls_bytes:
-            return "xls"
-        elif magic_bytes[:4] == xlsx_bytes:
-            return "xlsx"
-    except UnicodeDecodeError:
-        pass
-    finally:
-        data.seek(initial_position)
-    return None
-
-
-def _identify_workbook(wb: str | Path | IO[bytes] | bytes) -> str | None:
-    """Use file extension (and magic bytes) to identify Workbook type."""
-    if not isinstance(wb, (str, Path)):
-        # raw binary data (bytesio, etc)
-        return _identify_from_magic_bytes(wb)
-    else:
-        p = Path(wb)
-        ext = p.suffix[1:].lower()
-
-        # unambiguous file extensions
-        if ext in ("xlsx", "xlsm", "xlsb"):
-            return ext
-        elif ext[:2] == "od":
-            return "ods"
-
-        # check magic bytes to resolve ambiguity (eg: xls/xlsx, or no extension)
-        with p.open("rb") as f:
-            magic_bytes = BytesIO(f.read(8))
-            return _identify_from_magic_bytes(magic_bytes)
-
-
 def _read_spreadsheet(
     sheet_id: int | Sequence[int] | None,
     sheet_name: str | list[str] | tuple[str] | None,

@@ -4,20 +4,19 @@
 from collections import OrderedDict
 from datetime import date, datetime
 from io import BytesIO
-from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable
 
 import pytest
 
 import polars as pl
 import polars.selectors as cs
 from polars.exceptions import NoDataError, ParameterCollisionError
-from polars.io.spreadsheet.functions import _identify_workbook
 from polars.testing import assert_frame_equal, assert_series_equal
 from tests.unit.conftest import FLOAT_DTYPES, NUMERIC_DTYPES
 
 if TYPE_CHECKING:
     from collections.abc import Sequence
+    from pathlib import Path
 
     from polars._typing import ExcelSpreadsheetEngine, SelectorType
 
@@ -1028,44 +1027,6 @@ def test_excel_type_inference_with_nulls(engine: ExcelSpreadsheetEngine) -> None
         assert_frame_equal(df.select(reversed_cols), read_df)
 
 
-@pytest.mark.parametrize(
-    ("path", "file_type"),
-    [
-        ("path_xls", "xls"),
-        ("path_xlsx", "xlsx"),
-        ("path_xlsb", "xlsb"),
-    ],
-)
-def test_identify_workbook(
-    path: str, file_type: str, request: pytest.FixtureRequest
-) -> None:
-    # identify from file path
-    spreadsheet_path = request.getfixturevalue(path)
-    assert _identify_workbook(spreadsheet_path) == file_type
-
-    # note that we can't distinguish between xlsx and xlsb
-    # from the magic bytes block alone (so we default to xlsx)
-    if file_type == "xlsb":
-        file_type = "xlsx"
-
-    # identify from IO[bytes]
-    with Path.open(spreadsheet_path, "rb") as f:
-        assert _identify_workbook(f) == file_type
-        assert isinstance(pl.read_excel(f, engine="calamine"), pl.DataFrame)
-
-    # identify from bytes
-    with Path.open(spreadsheet_path, "rb") as f:
-        raw_data = f.read()
-        assert _identify_workbook(raw_data) == file_type
-        assert isinstance(pl.read_excel(raw_data, engine="calamine"), pl.DataFrame)
-
-    # identify from BytesIO
-    with Path.open(spreadsheet_path, "rb") as f:
-        bytesio_data = BytesIO(f.read())
-        assert _identify_workbook(bytesio_data) == file_type
-        assert isinstance(pl.read_excel(bytesio_data, engine="calamine"), pl.DataFrame)
-
-
 def test_drop_empty_rows(path_empty_rows_excel: Path) -> None:
     df1 = pl.read_excel(source=path_empty_rows_excel, engine="xlsx2csv")
     assert df1.shape == (8, 4)