Skip to content

Commit

Permalink
[c++/python] Add GeometryDataFrame creation (#3674)
Browse files Browse the repository at this point in the history
* Remove extra spatial arrow table, include spatial domain info in index column arrow table

* Add GeometryDataframe python implementation

* Update schema generation

* Pass coordinate space down to geometry column during schema creation, decouple arrow schema between column schema and column domain

* Change geometry domain representation

* Update unit test and C++ geometry index column generation

* lint fixes

* Make automatic addition of columns configurable

* Fix domain generation

* Add basic dataframe construction tests

* lint fixes

* Properly handle complex domain saturation flag

* Apply suggestions from code review

Co-authored-by: Julia Dark <[email protected]>

* Update unit test

---------

Co-authored-by: Julia Dark <[email protected]>
  • Loading branch information
XanthosXanthopoulos and jp-dark authored Feb 11, 2025
1 parent af669db commit 552dfc0
Show file tree
Hide file tree
Showing 33 changed files with 940 additions and 283 deletions.
1 change: 1 addition & 0 deletions apis/python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,7 @@ def run(self):
"src/tiledbsoma/soma_object.cc",
"src/tiledbsoma/soma_dataframe.cc",
"src/tiledbsoma/soma_point_cloud_dataframe.cc",
"src/tiledbsoma/soma_geometry_dataframe.cc",
"src/tiledbsoma/soma_dense_ndarray.cc",
"src/tiledbsoma/soma_sparse_ndarray.cc",
"src/tiledbsoma/soma_group.cc",
Expand Down
2 changes: 2 additions & 0 deletions apis/python/src/tiledbsoma/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,7 @@
get_storage_engine,
show_package_versions,
)
from ._geometry_dataframe import GeometryDataFrame
from ._indexer import IntIndexer, tiledbsoma_build_index
from ._measurement import Measurement
from ._multiscale_image import MultiscaleImage
Expand Down Expand Up @@ -209,6 +210,7 @@
"DoesNotExistError",
"Experiment",
"ExperimentAxisQuery",
"GeometryDataFrame",
"get_implementation_version",
"get_implementation",
"get_SOMA_version",
Expand Down
93 changes: 85 additions & 8 deletions apis/python/src/tiledbsoma/_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@

from . import _arrow_types, _util
from . import pytiledbsoma as clib
from ._constants import SOMA_JOINID
from ._constants import SOMA_GEOMETRY, SOMA_JOINID
from ._exception import SOMAError, map_exception_for_create
from ._read_iters import ManagedQuery, TableReadIter
from ._soma_array import SOMAArray
Expand Down Expand Up @@ -790,7 +790,9 @@ def write(


def _canonicalize_schema(
schema: pa.Schema, index_column_names: Sequence[str]
schema: pa.Schema,
index_column_names: Sequence[str],
required_columns: Sequence[str] = [SOMA_JOINID],
) -> pa.Schema:
"""Turns an Arrow schema into the canonical version and checks for errors.
Expand All @@ -807,21 +809,45 @@ def _canonicalize_schema(
raise ValueError(
f"{SOMA_JOINID} field must be of type Arrow int64 but is {joinid_type}"
)
else:
elif SOMA_JOINID in required_columns:
# add SOMA_JOINID
schema = schema.append(pa.field(SOMA_JOINID, pa.int64()))

if SOMA_GEOMETRY in schema.names:
geometry_type = schema.field(SOMA_GEOMETRY).type
if geometry_type != pa.binary() and geometry_type != pa.large_binary():
raise ValueError(
f"{SOMA_GEOMETRY} field must be of type Arrow binary or large_binary but is {geometry_type}"
)
schema.set(
schema.get_field_index(SOMA_GEOMETRY),
schema.field(SOMA_GEOMETRY).with_metadata({"dtype": "WKB"}),
)
elif SOMA_GEOMETRY in required_columns:
# add SOMA_GEOMETRY
schema = schema.append(
pa.field(SOMA_GEOMETRY, pa.large_binary(), metadata={"dtype": "WKB"})
)

# verify no illegal use of soma_ prefix
for field_name in schema.names:
if field_name.startswith("soma_") and field_name != SOMA_JOINID:
if (
field_name.startswith("soma_")
and field_name != SOMA_JOINID
and field_name != SOMA_GEOMETRY
):
raise ValueError(
f"DataFrame schema may not contain fields with name prefix ``soma_``: got ``{field_name}``"
)

# verify that all index_column_names are present in the schema
schema_names_set = set(schema.names)
for index_column_name in index_column_names:
if index_column_name.startswith("soma_") and index_column_name != SOMA_JOINID:
if (
index_column_name.startswith("soma_")
and index_column_name != SOMA_JOINID
and index_column_name != SOMA_GEOMETRY
):
raise ValueError(
f'index_column_name other than "soma_joinid" must not begin with "soma_"; got "{index_column_name}"'
)
Expand Down Expand Up @@ -864,7 +890,7 @@ def _fill_out_slot_soma_domain(
index_column_name: str,
pa_type: pa.DataType,
dtype: Any,
) -> Tuple[Tuple[Any, Any], bool]:
) -> Tuple[Tuple[Any, Any], Union[bool, Tuple[bool, ...]]]:
"""Helper function for _build_tiledb_schema. Given a user-specified domain for a
dimension slot -- which may be ``None``, or a two-tuple of which either element
may be ``None`` -- return either what the user specified (if adequate) or
Expand All @@ -873,6 +899,37 @@ def _fill_out_slot_soma_domain(
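Returns the domain together with a flag for whether the underlying datatype's max range was used; for the ``SOMA_GEOMETRY`` column the flag is a tuple holding one boolean per axis.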
"""
saturated_range = False
if index_column_name == SOMA_GEOMETRY:
# SOMA_GEOMETRY domain should be either a list of None or a list of tuple[float, float]
axes_lo = []
axes_hi = []
if isinstance(slot_domain, list):
f64info: NPFInfo = np.finfo(np.float64)
saturated_multi_range = []
for axis_domain in slot_domain:
if axis_domain is None:
axes_lo.append(f64info.min)
axes_hi.append(f64info.max)
saturated_multi_range.append(True)
elif not isinstance(axis_domain, tuple) or len(axis_domain) != 2:
raise ValueError("Axis domain should be a tuple[float, float]")
else:
if np.issubdtype(type(axis_domain[0]), NPFloating) or np.issubdtype(
type(axis_domain[1]), NPFloating
):
raise ValueError("Axis domain should be a tuple[float, float]")

axes_lo.append(axis_domain[0])
axes_hi.append(axis_domain[1])
saturated_multi_range.append(False)
slot_domain = tuple(axes_lo), tuple(axes_hi)
else:
raise ValueError(
f"{SOMA_GEOMETRY} domain should be either a list of None or a list of tuple[float, float]"
)

return (slot_domain, tuple(saturated_multi_range))

if slot_domain is not None:
# User-specified; go with it when possible
if (
Expand Down Expand Up @@ -1007,6 +1064,9 @@ def _find_extent_for_domain(
if isinstance(dtype, np.dtype) and dtype.itemsize == 1:
extent = 1

if index_column_name == SOMA_GEOMETRY:
return extent

# Core string dims have no extent and no (core) domain. We return "" here
# simply so we can pass libtiledbsoma "" for domain and extent, while it
# will (and must) ignore these when creating the TileDB schema.
Expand Down Expand Up @@ -1053,9 +1113,26 @@ def _find_extent_for_domain(
# extent exceeds max value representable by domain type. Reduce domain max
# by 1 tile extent to allow for expansion.
def _revise_domain_for_extent(
domain: Tuple[Any, Any], extent: Any, saturated_range: bool
domain: Tuple[Any, Any], extent: Any, saturated_range: Union[bool, Tuple[bool, ...]]
) -> Tuple[Any, Any]:
if saturated_range:
if isinstance(saturated_range, tuple):
# Handle SOMA_GEOMETRY domain, which is tuple[tuple[float, ...], tuple[float, ...]]
if isinstance(domain[1], tuple):
if len(saturated_range) != len(domain[1]):
raise ValueError(
"Internal error: Saturatin flag length does not match domain size"
)

return (
domain[0],
[
(dim_max - extent) if saturated_range[idx] else dim_max
for idx, dim_max in enumerate(domain[1])
],
)

raise ValueError("Expected a complex domain")
elif saturated_range:
return (domain[0], domain[1] - extent)
else:
return domain
Loading

0 comments on commit 552dfc0

Please sign in to comment.