Revert "[Data] Change offsets to int64 and change to LargeList for ArrowTensorArray" #46511

Merged
8 changes: 0 additions & 8 deletions python/ray/air/tests/test_tensor_extension.py
@@ -676,14 +676,6 @@ def test_variable_shaped_tensor_array_uniform_dim():
np.testing.assert_array_equal(a, expected)


-def test_large_arrow_tensor_array():
-    test_arr = np.ones((1000, 550), dtype=np.uint8)
-    ta = ArrowTensorArray.from_numpy([test_arr] * 4000)
-    assert len(ta) == 4000
-    for arr in ta:
-        assert np.asarray(arr).shape == (1000, 550)
-
-
if __name__ == "__main__":
import sys

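Note (not part of the diff): the deleted test covered arrays whose flattened values exceed what 32-bit list offsets can address. 4000 tensors of shape (1000, 550) uint8 hold 4000 * 1000 * 550 = 2.2e9 elements, which is past the int32 limit of 2**31 - 1, so the test only passes with large_list / int64 offsets. A minimal illustration of the offset-width difference, assuming only that pyarrow is installed (variable names here are ours, not Ray's):

import pyarrow as pa

# list_ stores child offsets as int32; large_list stores them as int64.
small = pa.array([[1, 2], [3]], type=pa.list_(pa.int64()))
large = pa.array([[1, 2], [3]], type=pa.large_list(pa.int64()))
assert small.offsets.type == pa.int32()
assert large.offsets.type == pa.int64()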
14 changes: 6 additions & 8 deletions python/ray/air/util/tensor_extensions/arrow.py
@@ -105,7 +105,7 @@ def __init__(self, shape: Tuple[int, ...], dtype: pa.DataType):
dtype: pyarrow dtype of tensor elements.
"""
self._shape = shape
-        super().__init__(pa.large_list(dtype), "ray.data.arrow_tensor")
+        super().__init__(pa.list_(dtype), "ray.data.arrow_tensor")

@property
def shape(self):
@@ -316,7 +316,7 @@ class ArrowTensorArray(_ArrowTensorScalarIndexingMixin, pa.ExtensionArray):
https://arrow.apache.org/docs/python/extending_types.html#custom-extension-array-class
"""

-    OFFSET_DTYPE = np.int64
+    OFFSET_DTYPE = np.int32

@classmethod
def from_numpy(
@@ -414,7 +414,7 @@ def _from_numpy(
)

storage = pa.Array.from_buffers(
-            pa.large_list(pa_dtype),
+            pa.list_(pa_dtype),
outer_len,
[None, offset_buffer],
children=[data_array],
@@ -612,9 +612,7 @@ def __init__(self, dtype: pa.DataType, ndim: int):
"""
self._ndim = ndim
super().__init__(
-            pa.struct(
-                [("data", pa.large_list(dtype)), ("shape", pa.list_(pa.int64()))]
-            ),
+            pa.struct([("data", pa.list_(dtype)), ("shape", pa.list_(pa.int64()))]),
"ray.data.arrow_variable_shaped_tensor",
)

@@ -721,7 +719,7 @@ class ArrowVariableShapedTensorArray(
https://arrow.apache.org/docs/python/extending_types.html#custom-extension-array-class
"""

-    OFFSET_DTYPE = np.int64
+    OFFSET_DTYPE = np.int32

@classmethod
def from_numpy(
@@ -811,7 +809,7 @@ def from_numpy(
# corresponds to a tensor element.
size_offsets = np.insert(size_offsets, 0, 0)
offset_array = pa.array(size_offsets)
-        data_array = pa.LargeListArray.from_arrays(offset_array, value_array)
+        data_array = pa.ListArray.from_arrays(offset_array, value_array)
# We store the tensor element shapes so we can reconstruct each tensor when
# converting back to NumPy ndarrays.
shape_array = pa.array(shapes)
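For orientation (not part of the diff): the reverted fixed-shape storage layout is a plain pa.list_ column whose int32 offsets advance by the per-tensor element count, with the flattened values as the child array. A simplified sketch under that assumption, not the actual _from_numpy code path; the names below are illustrative:

import numpy as np
import pyarrow as pa

# Four fixed-shape (3, 5) tensors packed into one list_<int64> storage array.
batch = np.ones((4, 3, 5), dtype=np.int64)
values = pa.array(batch.reshape(-1))                  # flattened child values
step = batch.shape[1] * batch.shape[2]                # 15 elements per tensor
offsets = pa.array(np.arange(0, (len(batch) + 1) * step, step, dtype=np.int32))
storage = pa.ListArray.from_arrays(offsets, values)
assert len(storage) == 4
assert storage.offsets.type == pa.int32()             # int32, not int64, offsets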
4 changes: 1 addition & 3 deletions python/ray/data/tests/test_tensor.py
@@ -42,9 +42,7 @@ def test_tensors_basic(ray_start_regular_shared):
"Dataset(num_rows=6, schema={data: numpy.ndarray(shape=(3, 5), dtype=int64)})"
)
# The actual size is slightly larger due to metadata.
-    # We add 6 (one per tensor) offset values of 8 bytes each to account for the
-    # in-memory representation of the PyArrow LargeList type
-    assert math.isclose(ds.size_bytes(), 5 * 3 * 6 * 8 + 6 * 8, rel_tol=0.1)
+    assert math.isclose(ds.size_bytes(), 5 * 3 * 6 * 8, rel_tol=0.1)

# Test row iterator yields tensors.
for tensor in ds.iter_rows():
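A back-of-the-envelope check for the assertion above (ours, not part of the test): each of the 6 rows is a (3, 5) int64 tensor, so the tensor data alone is 6 * 3 * 5 * 8 = 720 bytes. The reverted comment budgeted an extra 6 * 8 = 48 bytes for int64 LargeList offsets; with int32 offsets the offset buffer is roughly half that and falls inside the rel_tol=0.1 slack together with the other metadata.

data_bytes = 6 * 3 * 5 * 8      # 720 bytes of int64 tensor elements
int64_offsets = 6 * 8           # 48 bytes budgeted by the reverted assertion
int32_offsets = 6 * 4           # about 24 bytes after the revert
assert int32_offsets / data_bytes < 0.1   # within the rel_tol=0.1 slack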