Skip to content

Commit

Permalink
feat(python)!: Implement binary serialization of LazyFrame/DataFrame/…
Browse files Browse the repository at this point in the history
…Expr and set it as the default format (#17223)
  • Loading branch information
stinodego authored Jun 28, 2024
1 parent e50a2c4 commit 855b3fc
Show file tree
Hide file tree
Showing 20 changed files with 698 additions and 358 deletions.
2 changes: 1 addition & 1 deletion crates/polars-core/src/datatypes/_serde.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ impl<'de> serde::Deserialize<'de> for Wrap<Utf8ViewArray> {
{
let mut utf8array = MutablePlString::with_capacity(seq.size_hint().unwrap_or(10));
while let Some(key) = seq.next_element()? {
let key: Option<&str> = key;
let key: Option<String> = key;
utf8array.push(key)
}
Ok(Wrap(utf8array.into()))
Expand Down
63 changes: 63 additions & 0 deletions py-polars/polars/_utils/serde.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
"""Utility for serializing Polars objects."""

from __future__ import annotations

from io import BytesIO, StringIO
from pathlib import Path
from typing import TYPE_CHECKING, Callable, Literal, overload

from polars._utils.various import normalize_filepath

if TYPE_CHECKING:
from io import IOBase

from polars.type_aliases import SerializationFormat


@overload
def serialize_polars_object(
serializer: Callable[[IOBase | str], None], file: None, format: Literal["binary"]
) -> bytes: ...
@overload
def serialize_polars_object(
serializer: Callable[[IOBase | str], None], file: None, format: Literal["json"]
) -> str: ...
@overload
def serialize_polars_object(
serializer: Callable[[IOBase | str], None],
file: IOBase | str | Path,
format: SerializationFormat,
) -> None: ...


def serialize_polars_object(
serializer: Callable[[IOBase | str], None],
file: IOBase | str | Path | None,
format: SerializationFormat,
) -> bytes | str | None:
"""Serialize a Polars object (DataFrame/LazyFrame/Expr)."""

def serialize_to_bytes() -> bytes:
with BytesIO() as buf:
serializer(buf)
serialized = buf.getvalue()
return serialized

if file is None:
serialized = serialize_to_bytes()
return serialized.decode() if format == "json" else serialized
elif isinstance(file, StringIO):
serialized_str = serialize_to_bytes().decode()
file.write(serialized_str)
return None
elif isinstance(file, BytesIO):
serialized = serialize_to_bytes()
file.write(serialized)
return None
elif isinstance(file, (str, Path)):
file = normalize_filepath(file)
serializer(file)
return None
else:
serializer(file)
return None
117 changes: 83 additions & 34 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
)
from polars._utils.getitem import get_df_item_by_key
from polars._utils.parse import parse_into_expression
from polars._utils.serde import serialize_polars_object
from polars._utils.unstable import issue_unstable_warning, unstable
from polars._utils.various import (
is_bool_sequence,
Expand Down Expand Up @@ -100,7 +101,11 @@
from polars.functions import col, lit
from polars.schema import Schema
from polars.selectors import _expand_selector_dicts, _expand_selectors
from polars.type_aliases import DbWriteMode, JaxExportType, TorchExportType
from polars.type_aliases import (
DbWriteMode,
JaxExportType,
TorchExportType,
)

with contextlib.suppress(ImportError): # Module not available when building docs
from polars.polars import PyDataFrame
Expand Down Expand Up @@ -159,6 +164,7 @@
SchemaDefinition,
SchemaDict,
SelectorType,
SerializationFormat,
SingleColSelector,
SingleIndexSelector,
SizeUnit,
Expand Down Expand Up @@ -416,29 +422,39 @@ def __init__(
raise TypeError(msg)

@classmethod
def deserialize(cls, source: str | Path | IOBase) -> DataFrame:
def deserialize(
cls, source: str | Path | IOBase, *, format: SerializationFormat = "binary"
) -> DataFrame:
"""
Read a serialized DataFrame from a file.
.. versionadded:: 0.20.31
Parameters
----------
source
Path to a file or a file-like object (by file-like object, we refer to
objects that have a `read()` method, such as a file handler (e.g.
via builtin `open` function) or `BytesIO`).
format
The format with which the DataFrame was serialized. Options:
- `"binary"`: Deserialize from binary format (bytes). This is the default.
- `"json"`: Deserialize from JSON format (string).
See Also
--------
DataFrame.serialize
Notes
-----
Serialization is not stable across Polars versions: a LazyFrame serialized
in one Polars version may not be deserializable in another Polars version.
Examples
--------
>>> import io
>>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
>>> json = df.serialize()
>>> pl.DataFrame.deserialize(io.StringIO(json))
>>> bytes = df.serialize()
>>> pl.DataFrame.deserialize(io.BytesIO(bytes))
shape: (3, 2)
┌─────┬─────┐
│ a ┆ b │
Expand All @@ -455,7 +471,15 @@ def deserialize(cls, source: str | Path | IOBase) -> DataFrame:
elif isinstance(source, (str, Path)):
source = normalize_filepath(source)

return cls._from_pydf(PyDataFrame.deserialize(source))
if format == "binary":
deserializer = PyDataFrame.deserialize_binary
elif format == "json":
deserializer = PyDataFrame.deserialize_json
else:
msg = f"`format` must be one of {{'binary', 'json'}}, got {format!r}"
raise ValueError(msg)

return cls._from_pydf(deserializer(source))

@classmethod
def _from_pydf(cls, py_df: PyDataFrame) -> DataFrame:
Expand Down Expand Up @@ -2351,54 +2375,79 @@ def to_init_repr(self, n: int = 1000) -> str:
return output.getvalue()

@overload
def serialize(self, file: None = ...) -> str: ...

def serialize(
self, file: None = ..., *, format: Literal["binary"] = ...
) -> bytes: ...
@overload
def serialize(self, file: None = ..., *, format: Literal["json"]) -> str: ...
@overload
def serialize(self, file: IOBase | str | Path) -> None: ...
def serialize(
self, file: IOBase | str | Path, *, format: SerializationFormat = ...
) -> None: ...

def serialize(self, file: IOBase | str | Path | None = None) -> str | None:
"""
def serialize(
self,
file: IOBase | str | Path | None = None,
*,
format: SerializationFormat = "binary",
) -> bytes | str | None:
r"""
Serialize this DataFrame to a file or string in JSON format.
.. versionadded:: 0.20.31
Parameters
----------
file
File path or writable file-like object to which the result will be written.
If set to `None` (default), the output is returned as a string instead.
format
The format in which to serialize. Options:
- `"binary"`: Serialize to binary format (bytes). This is the default.
- `"json"`: Serialize to JSON format (string).
Notes
-----
Serialization is not stable across Polars versions: a LazyFrame serialized
in one Polars version may not be deserializable in another Polars version.
Examples
--------
Serialize the DataFrame into a binary representation.
>>> df = pl.DataFrame(
... {
... "foo": [1, 2, 3],
... "bar": [6, 7, 8],
... }
... )
>>> df.serialize()
'{"columns":[{"name":"foo","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"bar","datatype":"Int64","bit_settings":"","values":[6,7,8]}]}'
"""
>>> bytes = df.serialize()
>>> bytes # doctest: +ELLIPSIS
b'\xa1gcolumns\x82\xa4dnamecfoohdatatypeeInt64lbit_settings\x00fvalues\x83...'
def serialize_to_string() -> str:
with BytesIO() as buf:
self._df.serialize(buf)
json_bytes = buf.getvalue()
return json_bytes.decode("utf8")
The bytes can later be deserialized back into a DataFrame.
if file is None:
return serialize_to_string()
elif isinstance(file, StringIO):
json_str = serialize_to_string()
file.write(json_str)
return None
elif isinstance(file, (str, Path)):
file = normalize_filepath(file)
self._df.serialize(file)
return None
>>> import io
>>> pl.DataFrame.deserialize(io.BytesIO(bytes))
shape: (3, 2)
┌─────┬─────┐
│ foo ┆ bar │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═════╡
│ 1 ┆ 6 │
│ 2 ┆ 7 │
│ 3 ┆ 8 │
└─────┴─────┘
"""
if format == "binary":
serializer = self._df.serialize_binary
elif format == "json":
serializer = self._df.serialize_json
else:
self._df.serialize(file)
return None
msg = f"`format` must be one of {{'binary', 'json'}}, got {format!r}"
raise ValueError(msg)

return serialize_polars_object(serializer, file, format)

@overload
def write_json(self, file: None = ...) -> str: ...
Expand Down
37 changes: 28 additions & 9 deletions py-polars/polars/expr/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@
RankMethod,
RollingInterpolationMethod,
SearchSortedSide,
SerializationFormat,
TemporalLiteral,
WindowMappingStrategy,
)
Expand Down Expand Up @@ -328,7 +329,9 @@ def function(s: Series) -> Series: # pragma: no cover
return root_expr.map_batches(function, is_elementwise=True).meta.undo_aliases()

@classmethod
def deserialize(cls, source: str | Path | IOBase) -> Expr:
def deserialize(
cls, source: str | Path | IOBase, *, format: SerializationFormat = "binary"
) -> Expr:
"""
Read a serialized expression from a file.
Expand All @@ -338,33 +341,49 @@ def deserialize(cls, source: str | Path | IOBase) -> Expr:
Path to a file or a file-like object (by file-like object, we refer to
objects that have a `read()` method, such as a file handler (e.g.
via builtin `open` function) or `BytesIO`).
format
The format with which the Expr was serialized. Options:
- `"binary"`: Deserialize from binary format (bytes). This is the default.
- `"json"`: Deserialize from JSON format (string).
Warnings
--------
This function uses :mod:`pickle` when the logical plan contains Python UDFs,
This function uses :mod:`pickle` if the logical plan contains Python UDFs,
and as such inherits the security implications. Deserializing can execute
arbitrary code, so it should only be attempted on trusted data.
See Also
--------
Expr.meta.serialize
Notes
-----
Serialization is not stable across Polars versions: a LazyFrame serialized
in one Polars version may not be deserializable in another Polars version.
Examples
--------
>>> from io import StringIO
>>> import io
>>> expr = pl.col("foo").sum().over("bar")
>>> json = expr.meta.serialize()
>>> pl.Expr.deserialize(StringIO(json)) # doctest: +ELLIPSIS
>>> bytes = expr.meta.serialize()
>>> pl.Expr.deserialize(io.BytesIO(bytes)) # doctest: +ELLIPSIS
<Expr ['col("foo").sum().over([col("ba…'] at ...>
"""
if isinstance(source, StringIO):
source = BytesIO(source.getvalue().encode())
elif isinstance(source, (str, Path)):
source = normalize_filepath(source)

expr = cls.__new__(cls)
expr._pyexpr = PyExpr.deserialize(source)
return expr
if format == "binary":
deserializer = PyExpr.deserialize_binary
elif format == "json":
deserializer = PyExpr.deserialize_json
else:
msg = f"`format` must be one of {{'binary', 'json'}}, got {format!r}"
raise ValueError(msg)

return cls._from_pyexpr(deserializer(source))

def to_physical(self) -> Expr:
"""
Expand Down Expand Up @@ -10479,7 +10498,7 @@ def from_json(cls, value: str) -> Expr:
" Enclose your input in `io.StringIO` to keep the same behavior.",
version="0.20.11",
)
return cls.deserialize(StringIO(value))
return cls.deserialize(StringIO(value), format="json")

@property
def bin(self) -> ExprBinaryNameSpace:
Expand Down
Loading

0 comments on commit 855b3fc

Please sign in to comment.