From e24fc95d46d1f4228df259f0885cf83680c51871 Mon Sep 17 00:00:00 2001 From: "Maarten A. Breddels" Date: Mon, 13 Jul 2020 12:59:47 +0200 Subject: [PATCH] core(fix): offsets for arrow string array not respected in conversion --- packages/vaex-core/vaex/arrow/convert.py | 3 ++- packages/vaex-core/vaex/dataframe.py | 2 +- tests/arrow/conversion_test.py | 9 +++++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/packages/vaex-core/vaex/arrow/convert.py b/packages/vaex-core/vaex/arrow/convert.py index 34ddb0e59d..cbae35651c 100644 --- a/packages/vaex-core/vaex/arrow/convert.py +++ b/packages/vaex-core/vaex/arrow/convert.py @@ -93,7 +93,8 @@ def column_from_arrow_array(arrow_array): string_bytes = np.array([], dtype='S1') else: string_bytes = np.frombuffer(string_bytes, 'S1', len(string_bytes)) - column = vaex.column.ColumnStringArrow(offsets, string_bytes, len(arrow_array), null_bitmap=null_bitmap) + offset = arrow_array.offset + column = vaex.column.ColumnStringArrow(offsets[offset:], string_bytes, len(arrow_array), null_bitmap=null_bitmap) return column else: raise TypeError('type unsupported: %r' % arrow_type) diff --git a/packages/vaex-core/vaex/dataframe.py b/packages/vaex-core/vaex/dataframe.py index 433b43aac6..fbfcb2e5d2 100644 --- a/packages/vaex-core/vaex/dataframe.py +++ b/packages/vaex-core/vaex/dataframe.py @@ -501,7 +501,7 @@ def map(thread_index, i1, i2, ar): if not transient: assert ar is previous_ar.string_sequence # TODO: what about masked values? - inverse[i1:i2:] = ordered_set.map_ordinal(ar) + inverse[i1:i2] = ordered_set.map_ordinal(ar) def reduce(a, b): pass self.map_reduce(map, reduce, [expression], delay=delay, name='unique_return_inverse', info=True, to_numpy=False, selection=selection) diff --git a/tests/arrow/conversion_test.py b/tests/arrow/conversion_test.py index 99ee1ade04..08d7acfb14 100644 --- a/tests/arrow/conversion_test.py +++ b/tests/arrow/conversion_test.py @@ -2,6 +2,7 @@ import pyarrow as pa import pytest from vaex import array_types +import vaex.arrow.convert bools = [False, True, True] @@ -48,6 +49,14 @@ def test_float_sliced_masked(): assert x.tolist() == x_original[2:].tolist() +def test_string_sliced(): + values = ["a", "bb", "ccc", "dddd"] + ar = pa.array(values) + assert vaex.arrow.convert.column_from_arrow_array(ar).tolist() == values + assert vaex.arrow.convert.column_from_arrow_array(ar[1:]).tolist() == values[1:] + assert vaex.arrow.convert.column_from_arrow_array(ar[1:3]).tolist() == values[1:3] != values[1:] + + def test_keep_masked_data_values(): x_original = np.arange(5, dtype='f8') mask = x_original > 2