Skip to content

Commit

Permalink
core(fix): offsets for arrow string array not respected in conversion
Browse files Browse the repository at this point in the history
  • Loading branch information
maartenbreddels committed Jul 17, 2020
1 parent ce4889a commit 2a85f7c
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 2 deletions.
3 changes: 2 additions & 1 deletion packages/vaex-core/vaex/arrow/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,8 @@ def column_from_arrow_array(arrow_array):
string_bytes = np.array([], dtype='S1')
else:
string_bytes = np.frombuffer(string_bytes, 'S1', len(string_bytes))
column = vaex.column.ColumnStringArrow(offsets, string_bytes, len(arrow_array), null_bitmap=null_bitmap)
offset = arrow_array.offset
column = vaex.column.ColumnStringArrow(offsets[offset:], string_bytes, len(arrow_array), null_bitmap=null_bitmap)
return column
else:
raise TypeError('type unsupported: %r' % arrow_type)
Expand Down
2 changes: 1 addition & 1 deletion packages/vaex-core/vaex/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -501,7 +501,7 @@ def map(thread_index, i1, i2, ar):
if not transient:
assert ar is previous_ar.string_sequence
# TODO: what about masked values?
inverse[i1:i2:] = ordered_set.map_ordinal(ar)
inverse[i1:i2] = ordered_set.map_ordinal(ar)
def reduce(a, b):
pass
self.map_reduce(map, reduce, [expression], delay=delay, name='unique_return_inverse', info=True, to_numpy=False, selection=selection)
Expand Down
9 changes: 9 additions & 0 deletions tests/arrow/conversion_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pyarrow as pa
import pytest
from vaex import array_types
import vaex.arrow.convert


bools = [False, True, True]
Expand Down Expand Up @@ -48,6 +49,14 @@ def test_float_sliced_masked():
assert x.tolist() == x_original[2:].tolist()


def test_string_sliced():
    """Check that arrow string arrays convert to the expected values, sliced or not.

    Converts the full array, an open-ended slice and a bounded slice through
    ``vaex.arrow.convert.column_from_arrow_array`` and compares each result's
    ``tolist()`` with the equivalent slice of the plain Python list.
    """
    values = ["a", "bb", "ccc", "dddd"]
    ar = pa.array(values)
    to_column = vaex.arrow.convert.column_from_arrow_array

    # Unsliced array converts to the full list.
    whole = to_column(ar)
    assert whole.tolist() == values

    # Slice that drops only the leading element.
    tail = to_column(ar[1:])
    assert tail.tolist() == values[1:]

    # Slice bounded on both sides; it must also differ from the open-ended
    # slice, so a conversion that ignored the end bound would be caught.
    expected_middle = values[1:3]
    middle = to_column(ar[1:3])
    assert middle.tolist() == expected_middle
    assert expected_middle != values[1:]


def test_keep_masked_data_values():
x_original = np.arange(5, dtype='f8')
mask = x_original > 2
Expand Down

0 comments on commit 2a85f7c

Please sign in to comment.