Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor(python): Remove unused Excel code #19710

Merged
merged 2 commits into from
Nov 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 0 additions & 42 deletions py-polars/polars/io/spreadsheet/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -495,48 +495,6 @@ def read_ods(
)


def _identify_from_magic_bytes(data: IO[bytes] | bytes) -> str | None:
if isinstance(data, bytes):
data = BytesIO(data)

xls_bytes = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" # excel 97-2004
xlsx_bytes = b"PK\x03\x04" # xlsx/openoffice (zipped xml)

initial_position = data.tell()
try:
magic_bytes = data.read(8)
if magic_bytes == xls_bytes:
return "xls"
elif magic_bytes[:4] == xlsx_bytes:
return "xlsx"
except UnicodeDecodeError:
pass
finally:
data.seek(initial_position)
return None


def _identify_workbook(wb: str | Path | IO[bytes] | bytes) -> str | None:
    """
    Work out the Workbook type from the file extension and/or magic bytes.

    Parameters
    ----------
    wb
        A file path (`str` or `Path`), or binary data (bytes, BytesIO, or
        another binary stream).

    Returns
    -------
    str | None
        The extension itself for "xlsx", "xlsm" and "xlsb" paths, "ods" for
        any extension starting with "od", otherwise whatever the magic
        bytes check reports (which may be None).
    """
    # anything that is not a path is raw binary data (bytesio, etc)
    if not isinstance(wb, (str, Path)):
        return _identify_from_magic_bytes(wb)

    path = Path(wb)
    suffix = path.suffix[1:].lower()

    # extensions that settle the question on their own
    if suffix in ("xlsx", "xlsm", "xlsb"):
        return suffix
    if suffix.startswith("od"):
        return "ods"

    # fall back to the file header to resolve ambiguity
    # (eg: xls/xlsx, or no extension)
    with path.open("rb") as fh:
        header = fh.read(8)
    return _identify_from_magic_bytes(BytesIO(header))


def _read_spreadsheet(
sheet_id: int | Sequence[int] | None,
sheet_name: str | list[str] | tuple[str] | None,
Expand Down
41 changes: 1 addition & 40 deletions py-polars/tests/unit/io/test_spreadsheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,19 @@
from collections import OrderedDict
from datetime import date, datetime
from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable

import pytest

import polars as pl
import polars.selectors as cs
from polars.exceptions import NoDataError, ParameterCollisionError
from polars.io.spreadsheet.functions import _identify_workbook
from polars.testing import assert_frame_equal, assert_series_equal
from tests.unit.conftest import FLOAT_DTYPES, NUMERIC_DTYPES

if TYPE_CHECKING:
from collections.abc import Sequence
from pathlib import Path

from polars._typing import ExcelSpreadsheetEngine, SelectorType

Expand Down Expand Up @@ -1028,44 +1027,6 @@ def test_excel_type_inference_with_nulls(engine: ExcelSpreadsheetEngine) -> None
assert_frame_equal(df.select(reversed_cols), read_df)


@pytest.mark.parametrize(
    ("path", "file_type"),
    [
        ("path_xls", "xls"),
        ("path_xlsx", "xlsx"),
        ("path_xlsb", "xlsb"),
    ],
)
def test_identify_workbook(
    path: str, file_type: str, request: pytest.FixtureRequest
) -> None:
    """Check workbook identification from a path, a file handle, bytes and BytesIO."""
    spreadsheet_path = request.getfixturevalue(path)

    # identification from the file path itself
    assert _identify_workbook(spreadsheet_path) == file_type

    # the magic bytes block alone can't tell xlsx and xlsb apart,
    # so binary sources are expected to come back as xlsx
    expected = "xlsx" if file_type == "xlsb" else file_type

    # identification from IO[bytes]; the handle must still be readable
    # by `read_excel` after it has been inspected
    with Path.open(spreadsheet_path, "rb") as handle:
        assert _identify_workbook(handle) == expected
        assert isinstance(pl.read_excel(handle, engine="calamine"), pl.DataFrame)

    # identification from bytes
    with Path.open(spreadsheet_path, "rb") as handle:
        payload = handle.read()
    assert _identify_workbook(payload) == expected
    assert isinstance(pl.read_excel(payload, engine="calamine"), pl.DataFrame)

    # identification from BytesIO
    with Path.open(spreadsheet_path, "rb") as handle:
        buffered = BytesIO(handle.read())
    assert _identify_workbook(buffered) == expected
    assert isinstance(pl.read_excel(buffered, engine="calamine"), pl.DataFrame)


def test_drop_empty_rows(path_empty_rows_excel: Path) -> None:
df1 = pl.read_excel(source=path_empty_rows_excel, engine="xlsx2csv")
assert df1.shape == (8, 4)
Expand Down
Loading