From a4edc77136c7be014825389734ab95a6cc7959e2 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 16 Feb 2024 11:55:31 -0600 Subject: [PATCH 1/5] stripped-down version of src/awkward/_connect/pyarrow.py in studies --- studies/cudf-to-awkward.py | 322 +++++++++++++++++++++++++++++++++++++ 1 file changed, 322 insertions(+) create mode 100644 studies/cudf-to-awkward.py diff --git a/studies/cudf-to-awkward.py b/studies/cudf-to-awkward.py new file mode 100644 index 0000000000..8e271c85ef --- /dev/null +++ b/studies/cudf-to-awkward.py @@ -0,0 +1,322 @@ +import cudf +import cupy as cp +import pyarrow as pa +import awkward as ak + + +######################### stripped-down copy of src/awkward/_connect/pyarrow.py + + +def revertable(modified, original): + modified.__pyarrow_original = original + return modified + + +def remove_optiontype(akarray): + return akarray.__pyarrow_original + + +def popbuffers_finalize(out, array, validbits, generate_bitmasks, fix_offsets=True): + # Every buffer from Arrow must be offsets-corrected. + if fix_offsets and (array.offset != 0 or len(array) != len(out)): + out = out[array.offset : array.offset + len(array)] + + # Everything must leave popbuffers as option-type; the mask_node will be + # removed by the next level up in popbuffers recursion if appropriate. + + if validbits is None and generate_bitmasks: + # ceildiv(len(out), 8) = -(len(out) // -8) + validbits = numpy.full(-(len(out) // -8), np.uint8(0xFF), dtype=np.uint8) + + if validbits is None: + return revertable(ak.contents.UnmaskedArray.simplified(out), out) + else: + return revertable( + ak.contents.BitMaskedArray.simplified( + ak.index.IndexU8(numpy.frombuffer(validbits, dtype=np.uint8)), + out, + valid_when=True, + length=len(out), + lsb_order=True, + ), + out, + ) + + +def popbuffers(paarray, storage_type, buffers, generate_bitmasks): + ### Beginning of the big if-elif-elif chain! 
+ + if isinstance(storage_type, pyarrow.lib.DictionaryType): + masked_index = popbuffers( + paarray.indices, + storage_type.index_type, + buffers, + generate_bitmasks, + ) + index = masked_index.content.data + + if not isinstance(masked_index, ak.contents.UnmaskedArray): + mask = masked_index.mask_as_bool(valid_when=False) + if mask.any(): + index = numpy.asarray(index, copy=True) + index[mask] = -1 + + content = handle_arrow(paarray.dictionary, generate_bitmasks) + + parameters = {"__array__": "categorical"} + + return revertable( + ak.contents.IndexedOptionArray.simplified( + ak.index.Index(index), + content, + parameters=parameters, + ), + ak.contents.IndexedArray( + ak.index.Index(index), + remove_optiontype(content) if content.is_option else content, + parameters=parameters, + ), + ) + + elif isinstance(storage_type, pyarrow.lib.FixedSizeListType): + assert storage_type.num_buffers == 1 + validbits = buffers.pop(0) + + akcontent = popbuffers( + paarray.values, storage_type.value_type, buffers, generate_bitmasks + ) + + if not storage_type.value_field.nullable: + # strip the dummy option-type node + akcontent = remove_optiontype(akcontent) + + out = ak.contents.RegularArray( + akcontent, + storage_type.list_size, + parameters=None, + ) + return popbuffers_finalize(out, paarray, validbits, generate_bitmasks) + + elif isinstance(storage_type, (pyarrow.lib.LargeListType, pyarrow.lib.ListType)): + assert storage_type.num_buffers == 2 + validbits = buffers.pop(0) + paoffsets = buffers.pop(0) + + if isinstance(storage_type, pyarrow.lib.LargeListType): + akoffsets = ak.index.Index64(numpy.frombuffer(paoffsets, dtype=np.int64)) + else: + akoffsets = ak.index.Index32(numpy.frombuffer(paoffsets, dtype=np.int32)) + + akcontent = popbuffers( + paarray.values, storage_type.value_type, buffers, generate_bitmasks + ) + + if not storage_type.value_field.nullable: + # strip the dummy option-type node + akcontent = remove_optiontype(akcontent) + + out = 
ak.contents.ListOffsetArray(akoffsets, akcontent, parameters=None) + return popbuffers_finalize(out, paarray, validbits, generate_bitmasks) + + elif isinstance(storage_type, pyarrow.lib.MapType): + # FIXME: make a ListOffsetArray of 2-tuples with __array__ == "sorted_map". + # (Make sure the keys are sorted). + raise NotImplementedError + + elif isinstance( + storage_type, (pyarrow.lib.Decimal128Type, pyarrow.lib.Decimal256Type) + ): + # Note: Decimal128Type and Decimal256Type are subtypes of FixedSizeBinaryType. + # NumPy doesn't support decimal: https://github.com/numpy/numpy/issues/9789 + raise ValueError( + "Arrow arrays containing pyarrow.decimal128 or pyarrow.decimal256 types can't be converted into Awkward Arrays" + ) + + elif isinstance(storage_type, pyarrow.lib.FixedSizeBinaryType): + assert storage_type.num_buffers == 2 + validbits = buffers.pop(0) + pacontent = buffers.pop(0) + + parameters = {"__array__": "bytestring"} + sub_parameters = {"__array__": "byte"} + + out = ak.contents.RegularArray( + ak.contents.NumpyArray( + numpy.frombuffer(pacontent, dtype=np.uint8), + parameters=sub_parameters, + backend=NumpyBackend.instance(), + ), + storage_type.byte_width, + parameters=parameters, + ) + return popbuffers_finalize(out, paarray, validbits, generate_bitmasks) + + elif storage_type in _string_like: + assert storage_type.num_buffers == 3 + validbits = buffers.pop(0) + paoffsets = buffers.pop(0) + pacontent = buffers.pop(0) + + if storage_type in _string_like[::2]: + akoffsets = ak.index.Index32(numpy.frombuffer(paoffsets, dtype=np.int32)) + else: + akoffsets = ak.index.Index64(numpy.frombuffer(paoffsets, dtype=np.int64)) + + if storage_type in _string_like[:2]: + parameters = {"__array__": "string"} + sub_parameters = {"__array__": "char"} + else: + parameters = {"__array__": "bytestring"} + sub_parameters = {"__array__": "byte"} + + out = ak.contents.ListOffsetArray( + akoffsets, + ak.contents.NumpyArray( + numpy.frombuffer(pacontent, dtype=np.uint8), + 
parameters=sub_parameters, + backend=NumpyBackend.instance(), + ), + parameters=parameters, + ) + return popbuffers_finalize(out, paarray, validbits, generate_bitmasks) + + elif isinstance(storage_type, pyarrow.lib.StructType): + assert storage_type.num_buffers == 1 + validbits = buffers.pop(0) + + keys = [] + contents = [] + for i in range(storage_type.num_fields): + field = storage_type[i] + field_name = field.name + keys.append(field_name) + + akcontent = popbuffers( + paarray.field(field_name), field.type, buffers, generate_bitmasks + ) + if not field.nullable: + # strip the dummy option-type node + akcontent = remove_optiontype(akcontent) + contents.append(akcontent) + + out = ak.contents.RecordArray( + contents, keys, length=len(paarray), parameters=None + ) + return popbuffers_finalize( + out, paarray, validbits, generate_bitmasks, fix_offsets=False + ) + + elif isinstance(storage_type, pyarrow.lib.UnionType): + if isinstance(storage_type, pyarrow.lib.SparseUnionType): + assert storage_type.num_buffers == 2 + validbits = buffers.pop(0) + nptags = numpy.frombuffer(buffers.pop(0), dtype=np.int8) + npindex = numpy.arange(len(nptags), dtype=np.int32) + else: + assert storage_type.num_buffers == 3 + validbits = buffers.pop(0) + nptags = numpy.frombuffer(buffers.pop(0), dtype=np.int8) + npindex = numpy.frombuffer(buffers.pop(0), dtype=np.int32) + + akcontents = [] + for i in range(storage_type.num_fields): + field = storage_type[i] + akcontent = popbuffers( + paarray.field(i), field.type, buffers, generate_bitmasks + ) + + if not field.nullable: + # strip the dummy option-type node + akcontent = remove_optiontype(akcontent) + akcontents.append(akcontent) + + out = ak.contents.UnionArray.simplified( + ak.index.Index8(nptags), + ak.index.Index32(npindex), + akcontents, + parameters=None, + ) + return popbuffers_finalize(out, paarray, None, generate_bitmasks) + + elif storage_type == pyarrow.null(): + validbits = buffers.pop(0) + assert storage_type.num_fields == 0 + 
+ # This is already an option-type and offsets-corrected, so no popbuffers_finalize. + return ak.contents.IndexedOptionArray( + ak.index.Index64(numpy.full(len(paarray), -1, dtype=np.int64)), + ak.contents.EmptyArray(parameters=None), + parameters=None, + ) + + elif storage_type == pyarrow.bool_(): + assert storage_type.num_buffers == 2 + validbits = buffers.pop(0) + bitdata = buffers.pop(0) + + bytedata = numpy.unpackbits( + numpy.frombuffer(bitdata, dtype=np.uint8), bitorder="little" + ) + + out = ak.contents.NumpyArray( + bytedata.view(np.bool_), + parameters=None, + backend=NumpyBackend.instance(), + ) + return popbuffers_finalize(out, paarray, validbits, generate_bitmasks) + + elif isinstance(storage_type, pyarrow.lib.DataType): + assert storage_type.num_buffers == 2 + validbits = buffers.pop(0) + data = buffers.pop(0) + + to64, dt = _pyarrow_to_numpy_dtype.get(str(storage_type), (False, None)) + if to64: + data = numpy.astype(numpy.frombuffer(data, dtype=np.int32), dtype=np.int64) + if dt is None: + dt = storage_type.to_pandas_dtype() + + out = ak.contents.NumpyArray( + numpy.frombuffer(data, dtype=dt), + parameters=None, + backend=NumpyBackend.instance(), + ) + return popbuffers_finalize(out, paarray, validbits, generate_bitmasks) + + else: + raise TypeError(f"unrecognized Arrow array type: {storage_type!r}") + + +def handle_arrow(obj, generate_bitmasks, pass_empty_field): + buffers = obj.buffers() + out = popbuffers(obj, obj.type, buffers, generate_bitmasks) + assert len(buffers) == 0 + return out + + +def pyarrow_to_awkward( + pyarrow_array: pyarrow.lib.Array, + generate_bitmasks=False, + highlevel=True, + behavior=None, + attrs=None, +): + ctx = ak._layout.HighLevelContext(behavior=behavior, attrs=attrs).finalize() + + out = handle_arrow(pyarrow_array, generate_bitmasks, True) + if isinstance(out, ak.contents.UnmaskedArray): + out = remove_optiontype(out) + + def remove_revertable(layout, **kwargs): + if hasattr(layout, "__pyarrow_original"): + del 
layout.__pyarrow_original + + ak._do.recursively_apply(out, remove_revertable) + + return ctx.wrap(out, highlevel=highlevel) + + +if __name__ == "__main__": + df = cudf.DataFrame({"record": [{"inner": [[3], [1, 2]], "simple": [8, None]}] * 6}) + + From f481c88c41dd6d1c88bd8101d20210072f8ab2b3 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 16 Feb 2024 12:12:40 -0600 Subject: [PATCH 2/5] add a modest test suite --- studies/cudf-to-awkward.py | 106 +++++++++++++++++++++++++++++-------- 1 file changed, 85 insertions(+), 21 deletions(-) diff --git a/studies/cudf-to-awkward.py b/studies/cudf-to-awkward.py index 8e271c85ef..eb6386e0b1 100644 --- a/studies/cudf-to-awkward.py +++ b/studies/cudf-to-awkward.py @@ -1,12 +1,40 @@ import cudf -import cupy as cp -import pyarrow as pa +import pyarrow +import cupy +import numpy + import awkward as ak +from awkward._backends.numpy import NumpyBackend ######################### stripped-down copy of src/awkward/_connect/pyarrow.py +_string_like = ( + pyarrow.string(), + pyarrow.large_string(), + pyarrow.binary(), + pyarrow.large_binary(), +) + +_pyarrow_to_numpy_dtype = { + pyarrow.date32(): (True, numpy.dtype("M8[D]")), + pyarrow.date64(): (False, numpy.dtype("M8[ms]")), + pyarrow.time32("s"): (True, numpy.dtype("M8[s]")), + pyarrow.time32("ms"): (True, numpy.dtype("M8[ms]")), + pyarrow.time64("us"): (False, numpy.dtype("M8[us]")), + pyarrow.time64("ns"): (False, numpy.dtype("M8[ns]")), + pyarrow.timestamp("s"): (False, numpy.dtype("M8[s]")), + pyarrow.timestamp("ms"): (False, numpy.dtype("M8[ms]")), + pyarrow.timestamp("us"): (False, numpy.dtype("M8[us]")), + pyarrow.timestamp("ns"): (False, numpy.dtype("M8[ns]")), + pyarrow.duration("s"): (False, numpy.dtype("m8[s]")), + pyarrow.duration("ms"): (False, numpy.dtype("m8[ms]")), + pyarrow.duration("us"): (False, numpy.dtype("m8[us]")), + pyarrow.duration("ns"): (False, numpy.dtype("m8[ns]")), +} + + def revertable(modified, original): modified.__pyarrow_original = original 
return modified @@ -26,14 +54,14 @@ def popbuffers_finalize(out, array, validbits, generate_bitmasks, fix_offsets=Tr if validbits is None and generate_bitmasks: # ceildiv(len(out), 8) = -(len(out) // -8) - validbits = numpy.full(-(len(out) // -8), np.uint8(0xFF), dtype=np.uint8) + validbits = numpy.full(-(len(out) // -8), numpy.uint8(0xFF), dtype=numpy.uint8) if validbits is None: return revertable(ak.contents.UnmaskedArray.simplified(out), out) else: return revertable( ak.contents.BitMaskedArray.simplified( - ak.index.IndexU8(numpy.frombuffer(validbits, dtype=np.uint8)), + ak.index.IndexU8(numpy.frombuffer(validbits, dtype=numpy.uint8)), out, valid_when=True, length=len(out), @@ -103,9 +131,9 @@ def popbuffers(paarray, storage_type, buffers, generate_bitmasks): paoffsets = buffers.pop(0) if isinstance(storage_type, pyarrow.lib.LargeListType): - akoffsets = ak.index.Index64(numpy.frombuffer(paoffsets, dtype=np.int64)) + akoffsets = ak.index.Index64(numpy.frombuffer(paoffsets, dtype=numpy.int64)) else: - akoffsets = ak.index.Index32(numpy.frombuffer(paoffsets, dtype=np.int32)) + akoffsets = ak.index.Index32(numpy.frombuffer(paoffsets, dtype=numpy.int32)) akcontent = popbuffers( paarray.values, storage_type.value_type, buffers, generate_bitmasks @@ -142,7 +170,7 @@ def popbuffers(paarray, storage_type, buffers, generate_bitmasks): out = ak.contents.RegularArray( ak.contents.NumpyArray( - numpy.frombuffer(pacontent, dtype=np.uint8), + numpy.frombuffer(pacontent, dtype=numpy.uint8), parameters=sub_parameters, backend=NumpyBackend.instance(), ), @@ -158,9 +186,9 @@ def popbuffers(paarray, storage_type, buffers, generate_bitmasks): pacontent = buffers.pop(0) if storage_type in _string_like[::2]: - akoffsets = ak.index.Index32(numpy.frombuffer(paoffsets, dtype=np.int32)) + akoffsets = ak.index.Index32(numpy.frombuffer(paoffsets, dtype=numpy.int32)) else: - akoffsets = ak.index.Index64(numpy.frombuffer(paoffsets, dtype=np.int64)) + akoffsets = 
ak.index.Index64(numpy.frombuffer(paoffsets, dtype=numpy.int64)) if storage_type in _string_like[:2]: parameters = {"__array__": "string"} @@ -172,7 +200,7 @@ def popbuffers(paarray, storage_type, buffers, generate_bitmasks): out = ak.contents.ListOffsetArray( akoffsets, ak.contents.NumpyArray( - numpy.frombuffer(pacontent, dtype=np.uint8), + numpy.frombuffer(pacontent, dtype=numpy.uint8), parameters=sub_parameters, backend=NumpyBackend.instance(), ), @@ -210,13 +238,13 @@ def popbuffers(paarray, storage_type, buffers, generate_bitmasks): if isinstance(storage_type, pyarrow.lib.SparseUnionType): assert storage_type.num_buffers == 2 validbits = buffers.pop(0) - nptags = numpy.frombuffer(buffers.pop(0), dtype=np.int8) - npindex = numpy.arange(len(nptags), dtype=np.int32) + nptags = numpy.frombuffer(buffers.pop(0), dtype=numpy.int8) + npindex = numpy.arange(len(nptags), dtype=numpy.int32) else: assert storage_type.num_buffers == 3 validbits = buffers.pop(0) - nptags = numpy.frombuffer(buffers.pop(0), dtype=np.int8) - npindex = numpy.frombuffer(buffers.pop(0), dtype=np.int32) + nptags = numpy.frombuffer(buffers.pop(0), dtype=numpy.int8) + npindex = numpy.frombuffer(buffers.pop(0), dtype=numpy.int32) akcontents = [] for i in range(storage_type.num_fields): @@ -244,7 +272,7 @@ def popbuffers(paarray, storage_type, buffers, generate_bitmasks): # This is already an option-type and offsets-corrected, so no popbuffers_finalize. 
return ak.contents.IndexedOptionArray( - ak.index.Index64(numpy.full(len(paarray), -1, dtype=np.int64)), + ak.index.Index64(numpy.full(len(paarray), -1, dtype=numpy.int64)), ak.contents.EmptyArray(parameters=None), parameters=None, ) @@ -255,11 +283,11 @@ def popbuffers(paarray, storage_type, buffers, generate_bitmasks): bitdata = buffers.pop(0) bytedata = numpy.unpackbits( - numpy.frombuffer(bitdata, dtype=np.uint8), bitorder="little" + numpy.frombuffer(bitdata, dtype=numpy.uint8), bitorder="little" ) out = ak.contents.NumpyArray( - bytedata.view(np.bool_), + bytedata.view(numpy.bool_), parameters=None, backend=NumpyBackend.instance(), ) @@ -272,7 +300,9 @@ def popbuffers(paarray, storage_type, buffers, generate_bitmasks): to64, dt = _pyarrow_to_numpy_dtype.get(str(storage_type), (False, None)) if to64: - data = numpy.astype(numpy.frombuffer(data, dtype=np.int32), dtype=np.int64) + data = numpy.astype( + numpy.frombuffer(data, dtype=numpy.int32), dtype=numpy.int64 + ) if dt is None: dt = storage_type.to_pandas_dtype() @@ -317,6 +347,40 @@ def remove_revertable(layout, **kwargs): if __name__ == "__main__": - df = cudf.DataFrame({"record": [{"inner": [[3], [1, 2]], "simple": [8, None]}] * 6}) - - + # tests numerics, lists, records, and option-type, but not union-type + examples = [ + [False, True, True], # booleans are special (1-bit) + [1.1, 2.2, 3.3], + [[False, True, True], [], [True, False]], + [[1, 2, 3], [], [4, 5]], + [[[1, 2], [3]], [], [[]], [[4], [], [5, 6, 7]], [[8, 9]]], + [{"x": 1}, {"x": 2}, {"x": 3}], + [{"x": 1.1, "y": []}, {"x": 2.2, "y": [1]}, {"x": 3.3, "y": [1, 2]}], + [[{"x": 1}, {"x": 2}, {"x": 3}], [], [{"x": 4}, {"x": 5}]], + [False, True, None, True], + [1.1, 2.2, None, 3.3], + [[False, True, None, True], [], [True, False]], + [[False, True, True], None, [], [True, False]], + [[1, 2, None, 3], [], [4, 5]], + [[1, 2, 3], None, [], [4, 5]], + [[[1, 2, None], [3]], [], [[]], [[4], [], [5, 6, 7]], [[8, 9]]], + [[[1, 2], None, [3]], [], [[]], 
[[4], [], [5, 6, 7]], [[8, 9]]], + [[[1, 2], [3]], None, [], [[]], [[4], [], [5, 6, 7]], [[8, 9]]], + [{"x": 1}, {"x": None}, {"x": 3}], + [{"x": 1}, {"x": 2}, None, {"x": 3}], + [{"x": 1.1, "y": []}, {"x": None, "y": [1]}, {"x": 3.3, "y": [1, 2]}], + [{"x": 1.1, "y": []}, {"x": 2.2, "y": [1, None]}, {"x": 3.3, "y": [1, 2]}], + [{"x": 1.1, "y": []}, {"x": 2.2, "y": [1]}, None, {"x": 3.3, "y": [1, 2]}], + [[{"x": 1}, {"x": None}, {"x": 3}], [], [{"x": 4}, {"x": 5}]], + [[{"x": 1}, {"x": 2}, None, {"x": 3}], [], [{"x": 4}, {"x": 5}]], + [[{"x": 1}, {"x": 2}, {"x": 3}], None, [], [{"x": 4}, {"x": 5}]], + ] + + for example in examples: + df = cudf.DataFrame({"column": example}) + + pyarrow_array = df._data["column"].to_arrow() + assert pyarrow_array.tolist() == example + + awkward_array = pyarrow_to_awkward(pyarrow_array) + assert awkward_array.tolist() == example From bac7ab8020479b92c73303e571618de0119b6ffe Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 16 Feb 2024 13:00:49 -0600 Subject: [PATCH 3/5] numbers and lists are working --- studies/cudf-to-awkward.py | 243 ++++++++++++++++++++++++++++++------- 1 file changed, 200 insertions(+), 43 deletions(-) diff --git a/studies/cudf-to-awkward.py b/studies/cudf-to-awkward.py index eb6386e0b1..5815e4fbfb 100644 --- a/studies/cudf-to-awkward.py +++ b/studies/cudf-to-awkward.py @@ -5,6 +5,7 @@ import awkward as ak from awkward._backends.numpy import NumpyBackend +from awkward._backends.cupy import CupyBackend ######################### stripped-down copy of src/awkward/_connect/pyarrow.py @@ -71,13 +72,13 @@ def popbuffers_finalize(out, array, validbits, generate_bitmasks, fix_offsets=Tr ) -def popbuffers(paarray, storage_type, buffers, generate_bitmasks): +def popbuffers(paarray, arrow_type, buffers, generate_bitmasks): ### Beginning of the big if-elif-elif chain! 
- if isinstance(storage_type, pyarrow.lib.DictionaryType): + if isinstance(arrow_type, pyarrow.lib.DictionaryType): masked_index = popbuffers( paarray.indices, - storage_type.index_type, + arrow_type.index_type, buffers, generate_bitmasks, ) @@ -106,53 +107,53 @@ def popbuffers(paarray, storage_type, buffers, generate_bitmasks): ), ) - elif isinstance(storage_type, pyarrow.lib.FixedSizeListType): - assert storage_type.num_buffers == 1 + elif isinstance(arrow_type, pyarrow.lib.FixedSizeListType): + assert arrow_type.num_buffers == 1 validbits = buffers.pop(0) akcontent = popbuffers( - paarray.values, storage_type.value_type, buffers, generate_bitmasks + paarray.values, arrow_type.value_type, buffers, generate_bitmasks ) - if not storage_type.value_field.nullable: + if not arrow_type.value_field.nullable: # strip the dummy option-type node akcontent = remove_optiontype(akcontent) out = ak.contents.RegularArray( akcontent, - storage_type.list_size, + arrow_type.list_size, parameters=None, ) return popbuffers_finalize(out, paarray, validbits, generate_bitmasks) - elif isinstance(storage_type, (pyarrow.lib.LargeListType, pyarrow.lib.ListType)): - assert storage_type.num_buffers == 2 + elif isinstance(arrow_type, (pyarrow.lib.LargeListType, pyarrow.lib.ListType)): + assert arrow_type.num_buffers == 2 validbits = buffers.pop(0) paoffsets = buffers.pop(0) - if isinstance(storage_type, pyarrow.lib.LargeListType): + if isinstance(arrow_type, pyarrow.lib.LargeListType): akoffsets = ak.index.Index64(numpy.frombuffer(paoffsets, dtype=numpy.int64)) else: akoffsets = ak.index.Index32(numpy.frombuffer(paoffsets, dtype=numpy.int32)) akcontent = popbuffers( - paarray.values, storage_type.value_type, buffers, generate_bitmasks + paarray.values, arrow_type.value_type, buffers, generate_bitmasks ) - if not storage_type.value_field.nullable: + if not arrow_type.value_field.nullable: # strip the dummy option-type node akcontent = remove_optiontype(akcontent) out = 
ak.contents.ListOffsetArray(akoffsets, akcontent, parameters=None) return popbuffers_finalize(out, paarray, validbits, generate_bitmasks) - elif isinstance(storage_type, pyarrow.lib.MapType): + elif isinstance(arrow_type, pyarrow.lib.MapType): # FIXME: make a ListOffsetArray of 2-tuples with __array__ == "sorted_map". # (Make sure the keys are sorted). raise NotImplementedError elif isinstance( - storage_type, (pyarrow.lib.Decimal128Type, pyarrow.lib.Decimal256Type) + arrow_type, (pyarrow.lib.Decimal128Type, pyarrow.lib.Decimal256Type) ): # Note: Decimal128Type and Decimal256Type are subtypes of FixedSizeBinaryType. # NumPy doesn't support decimal: https://github.com/numpy/numpy/issues/9789 @@ -160,8 +161,8 @@ def popbuffers(paarray, storage_type, buffers, generate_bitmasks): "Arrow arrays containing pyarrow.decimal128 or pyarrow.decimal256 types can't be converted into Awkward Arrays" ) - elif isinstance(storage_type, pyarrow.lib.FixedSizeBinaryType): - assert storage_type.num_buffers == 2 + elif isinstance(arrow_type, pyarrow.lib.FixedSizeBinaryType): + assert arrow_type.num_buffers == 2 validbits = buffers.pop(0) pacontent = buffers.pop(0) @@ -174,23 +175,23 @@ def popbuffers(paarray, storage_type, buffers, generate_bitmasks): parameters=sub_parameters, backend=NumpyBackend.instance(), ), - storage_type.byte_width, + arrow_type.byte_width, parameters=parameters, ) return popbuffers_finalize(out, paarray, validbits, generate_bitmasks) - elif storage_type in _string_like: - assert storage_type.num_buffers == 3 + elif arrow_type in _string_like: + assert arrow_type.num_buffers == 3 validbits = buffers.pop(0) paoffsets = buffers.pop(0) pacontent = buffers.pop(0) - if storage_type in _string_like[::2]: + if arrow_type in _string_like[::2]: akoffsets = ak.index.Index32(numpy.frombuffer(paoffsets, dtype=numpy.int32)) else: akoffsets = ak.index.Index64(numpy.frombuffer(paoffsets, dtype=numpy.int64)) - if storage_type in _string_like[:2]: + if arrow_type in 
_string_like[:2]: parameters = {"__array__": "string"} sub_parameters = {"__array__": "char"} else: @@ -208,14 +209,14 @@ def popbuffers(paarray, storage_type, buffers, generate_bitmasks): ) return popbuffers_finalize(out, paarray, validbits, generate_bitmasks) - elif isinstance(storage_type, pyarrow.lib.StructType): - assert storage_type.num_buffers == 1 + elif isinstance(arrow_type, pyarrow.lib.StructType): + assert arrow_type.num_buffers == 1 validbits = buffers.pop(0) keys = [] contents = [] - for i in range(storage_type.num_fields): - field = storage_type[i] + for i in range(arrow_type.num_fields): + field = arrow_type[i] field_name = field.name keys.append(field_name) @@ -234,21 +235,21 @@ def popbuffers(paarray, storage_type, buffers, generate_bitmasks): out, paarray, validbits, generate_bitmasks, fix_offsets=False ) - elif isinstance(storage_type, pyarrow.lib.UnionType): - if isinstance(storage_type, pyarrow.lib.SparseUnionType): - assert storage_type.num_buffers == 2 + elif isinstance(arrow_type, pyarrow.lib.UnionType): + if isinstance(arrow_type, pyarrow.lib.SparseUnionType): + assert arrow_type.num_buffers == 2 validbits = buffers.pop(0) nptags = numpy.frombuffer(buffers.pop(0), dtype=numpy.int8) npindex = numpy.arange(len(nptags), dtype=numpy.int32) else: - assert storage_type.num_buffers == 3 + assert arrow_type.num_buffers == 3 validbits = buffers.pop(0) nptags = numpy.frombuffer(buffers.pop(0), dtype=numpy.int8) npindex = numpy.frombuffer(buffers.pop(0), dtype=numpy.int32) akcontents = [] - for i in range(storage_type.num_fields): - field = storage_type[i] + for i in range(arrow_type.num_fields): + field = arrow_type[i] akcontent = popbuffers( paarray.field(i), field.type, buffers, generate_bitmasks ) @@ -266,9 +267,9 @@ def popbuffers(paarray, storage_type, buffers, generate_bitmasks): ) return popbuffers_finalize(out, paarray, None, generate_bitmasks) - elif storage_type == pyarrow.null(): + elif arrow_type == pyarrow.null(): validbits = 
buffers.pop(0) - assert storage_type.num_fields == 0 + assert arrow_type.num_fields == 0 # This is already an option-type and offsets-corrected, so no popbuffers_finalize. return ak.contents.IndexedOptionArray( @@ -277,8 +278,8 @@ def popbuffers(paarray, storage_type, buffers, generate_bitmasks): parameters=None, ) - elif storage_type == pyarrow.bool_(): - assert storage_type.num_buffers == 2 + elif arrow_type == pyarrow.bool_(): + assert arrow_type.num_buffers == 2 validbits = buffers.pop(0) bitdata = buffers.pop(0) @@ -293,18 +294,18 @@ def popbuffers(paarray, storage_type, buffers, generate_bitmasks): ) return popbuffers_finalize(out, paarray, validbits, generate_bitmasks) - elif isinstance(storage_type, pyarrow.lib.DataType): - assert storage_type.num_buffers == 2 + elif isinstance(arrow_type, pyarrow.lib.DataType): + assert arrow_type.num_buffers == 2 validbits = buffers.pop(0) data = buffers.pop(0) - to64, dt = _pyarrow_to_numpy_dtype.get(str(storage_type), (False, None)) + to64, dt = _pyarrow_to_numpy_dtype.get(str(arrow_type), (False, None)) if to64: data = numpy.astype( numpy.frombuffer(data, dtype=numpy.int32), dtype=numpy.int64 ) if dt is None: - dt = storage_type.to_pandas_dtype() + dt = arrow_type.to_pandas_dtype() out = ak.contents.NumpyArray( numpy.frombuffer(data, dtype=dt), @@ -314,10 +315,10 @@ def popbuffers(paarray, storage_type, buffers, generate_bitmasks): return popbuffers_finalize(out, paarray, validbits, generate_bitmasks) else: - raise TypeError(f"unrecognized Arrow array type: {storage_type!r}") + raise TypeError(f"unrecognized Arrow array type: {arrow_type!r}") -def handle_arrow(obj, generate_bitmasks, pass_empty_field): +def handle_arrow(obj, generate_bitmasks): buffers = obj.buffers() out = popbuffers(obj, obj.type, buffers, generate_bitmasks) assert len(buffers) == 0 @@ -333,7 +334,7 @@ def pyarrow_to_awkward( ): ctx = ak._layout.HighLevelContext(behavior=behavior, attrs=attrs).finalize() - out = handle_arrow(pyarrow_array, 
generate_bitmasks, True) + out = handle_arrow(pyarrow_array, generate_bitmasks) if isinstance(out, ak.contents.UnmaskedArray): out = remove_optiontype(out) @@ -346,6 +347,142 @@ def remove_revertable(layout, **kwargs): return ctx.wrap(out, highlevel=highlevel) +######################### equivalent for CuDF + + +def recurse_finalize( + out: ak.contents.Content, + column: cudf.core.column.column.ColumnBase, + validbits: None | cudf.core.buffer.buffer.Buffer, + generate_bitmasks: bool, + fix_offsets: bool = True, +): + if validbits is None: + return revertable(ak.contents.UnmaskedArray.simplified(out), out) + else: + return revertable( + ak.contents.BitMaskedArray.simplified( + ak.index.IndexU8(cupy.asarray(validbits)), + out, + valid_when=True, + length=len(out), + lsb_order=True, + ), + out, + ) + + +def recurse( + column: cudf.core.column.column.ColumnBase, + arrow_type: pyarrow.lib.DataType, + generate_bitmasks: bool, +): + if isinstance(arrow_type, pyarrow.lib.DictionaryType): + raise NotImplementedError + + elif isinstance(arrow_type, pyarrow.lib.FixedSizeListType): + raise NotImplementedError + + elif isinstance(arrow_type, (pyarrow.lib.LargeListType, pyarrow.lib.ListType)): + validbits = column.base_mask + paoffsets = column.offsets.base_data + + if isinstance(arrow_type, pyarrow.lib.LargeListType): + akoffsets = ak.index.Index64(cupy.asarray(paoffsets).view(cupy.int64)) + else: + akoffsets = ak.index.Index32(cupy.asarray(paoffsets).view(cupy.int32)) + + akcontent = recurse( + column.base_children[-1], arrow_type.value_type, generate_bitmasks + ) + + if not arrow_type.value_field.nullable: + # strip the dummy option-type node + akcontent = remove_optiontype(akcontent) + + out = ak.contents.ListOffsetArray(akoffsets, akcontent, parameters=None) + return recurse_finalize(out, column, validbits, generate_bitmasks) + + elif isinstance(arrow_type, pyarrow.lib.MapType): + raise NotImplementedError + + elif isinstance( + arrow_type, (pyarrow.lib.Decimal128Type, 
pyarrow.lib.Decimal256Type) + ): + # Note: Decimal128Type and Decimal256Type are subtypes of FixedSizeBinaryType. + # NumPy doesn't support decimal: https://github.com/numpy/numpy/issues/9789 + raise ValueError( + "Arrow arrays containing pyarrow.decimal128 or pyarrow.decimal256 types can't be converted into Awkward Arrays" + ) + + elif isinstance(arrow_type, pyarrow.lib.FixedSizeBinaryType): + raise NotImplementedError + + elif arrow_type in _string_like: + raise NotImplementedError + + elif isinstance(arrow_type, pyarrow.lib.StructType): + raise NotImplementedError + + elif isinstance(arrow_type, pyarrow.lib.UnionType): + raise NotImplementedError + + elif arrow_type == pyarrow.null(): + raise NotImplementedError + + elif arrow_type == pyarrow.bool_(): + raise NotImplementedError + + elif isinstance(arrow_type, pyarrow.lib.DataType): + validbits = column.base_mask + dt = arrow_type.to_pandas_dtype() + + out = ak.contents.NumpyArray( + cupy.asarray(column.base_data).view(dt), + parameters=None, + backend=CupyBackend.instance(), + ) + return recurse_finalize(out, column, validbits, generate_bitmasks) + + else: + raise TypeError(f"unrecognized Arrow array type: {arrow_type!r}") + + +def handle_cudf(cudf_series: cudf.core.series.Series, generate_bitmasks): + column = cudf_series._data[cudf_series.name] + dtype = column.dtype + if isinstance(dtype, numpy.dtype): + arrow_type = pyarrow.from_numpy_dtype(dtype) + else: + arrow_type = dtype.to_arrow() + return recurse(column, arrow_type, generate_bitmasks) + + +def cudf_to_awkward( + cudf_series: cudf.core.series.Series, + generate_bitmasks=False, + highlevel=True, + behavior=None, + attrs=None, +): + ctx = ak._layout.HighLevelContext(behavior=behavior, attrs=attrs).finalize() + + out = handle_cudf(cudf_series, generate_bitmasks) + if isinstance(out, ak.contents.UnmaskedArray): + out = remove_optiontype(out) + + def remove_revertable(layout, **kwargs): + if hasattr(layout, "__pyarrow_original"): + del 
layout.__pyarrow_original + + ak._do.recursively_apply(out, remove_revertable) + + return ctx.wrap(out, highlevel=highlevel) + + +######################### testing + + if __name__ == "__main__": # tests numerics, lists, records, and option-type, but not union-type examples = [ @@ -384,3 +521,23 @@ def remove_revertable(layout, **kwargs): awkward_array = pyarrow_to_awkward(pyarrow_array) assert awkward_array.tolist() == example + + examples = [ + [1.1, 2.2, 3.3], + [[1, 2, 3], [], [4, 5]], + [[[1, 2], [3]], [], [[]], [[4], [], [5, 6, 7]], [[8, 9]]], + [1.1, 2.2, None, 3.3], + [[1, 2, None, 3], [], [4, 5]], + [[1, 2, 3], None, [], [4, 5]], + [[[1, 2, None], [3]], [], [[]], [[4], [], [5, 6, 7]], [[8, 9]]], + [[[1, 2], None, [3]], [], [[]], [[4], [], [5, 6, 7]], [[8, 9]]], + [[[1, 2], [3]], None, [], [[]], [[4], [], [5, 6, 7]], [[8, 9]]], + ] + + for example in examples: + print(f"---- {example}") + df = cudf.DataFrame({"column": example}) + + awkward_array = cudf_to_awkward(df["column"]) + assert ak.backend(awkward_array) == "cuda" + assert awkward_array.tolist() == example From ed58d62bd5531567cf4d8ab52d64e47a116eb8f5 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 16 Feb 2024 16:10:37 -0600 Subject: [PATCH 4/5] everything that I can test works --- studies/cudf-to-awkward.py | 52 +++++++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 15 deletions(-) diff --git a/studies/cudf-to-awkward.py b/studies/cudf-to-awkward.py index 5815e4fbfb..05a36298b0 100644 --- a/studies/cudf-to-awkward.py +++ b/studies/cudf-to-awkward.py @@ -357,6 +357,10 @@ def recurse_finalize( generate_bitmasks: bool, fix_offsets: bool = True, ): + # Every buffer from Arrow must be offsets-corrected. 
+ if fix_offsets and (column.offset != 0 or len(column) != len(out)): + out = out[column.offset : column.offset + len(column)] + if validbits is None: return revertable(ak.contents.UnmaskedArray.simplified(out), out) else: @@ -422,7 +426,25 @@ def recurse( raise NotImplementedError elif isinstance(arrow_type, pyarrow.lib.StructType): - raise NotImplementedError + validbits = column.base_mask + + keys = [] + contents = [] + for i in range(arrow_type.num_fields): + field = arrow_type[i] + field_name = field.name + keys.append(field_name) + + akcontent = recurse(column.base_children[i], field.type, generate_bitmasks) + if not field.nullable: + # strip the dummy option-type node + akcontent = remove_optiontype(akcontent) + contents.append(akcontent) + + out = ak.contents.RecordArray( + contents, keys, length=len(column), parameters=None + ) + return recurse_finalize(out, column, validbits, generate_bitmasks) elif isinstance(arrow_type, pyarrow.lib.UnionType): raise NotImplementedError @@ -431,7 +453,19 @@ def recurse( raise NotImplementedError elif arrow_type == pyarrow.bool_(): - raise NotImplementedError + validbits = column.base_mask + + ## boolean data from CuDF differs from Arrow: it's represented as bytes, not bits! 
+ # bitdata = column.base_data + # bytedata = cupy.unpackbits(cupy.asarray(bitdata), bitorder="little") + bytedata = cupy.asarray(column.base_data) + + out = ak.contents.NumpyArray( + cupy.asarray(bytedata).view(cupy.bool_), + parameters=None, + backend=CupyBackend.instance(), + ) + return recurse_finalize(out, column, validbits, generate_bitmasks) elif isinstance(arrow_type, pyarrow.lib.DataType): validbits = column.base_mask @@ -522,22 +556,10 @@ def remove_revertable(layout, **kwargs): awkward_array = pyarrow_to_awkward(pyarrow_array) assert awkward_array.tolist() == example - examples = [ - [1.1, 2.2, 3.3], - [[1, 2, 3], [], [4, 5]], - [[[1, 2], [3]], [], [[]], [[4], [], [5, 6, 7]], [[8, 9]]], - [1.1, 2.2, None, 3.3], - [[1, 2, None, 3], [], [4, 5]], - [[1, 2, 3], None, [], [4, 5]], - [[[1, 2, None], [3]], [], [[]], [[4], [], [5, 6, 7]], [[8, 9]]], - [[[1, 2], None, [3]], [], [[]], [[4], [], [5, 6, 7]], [[8, 9]]], - [[[1, 2], [3]], None, [], [[]], [[4], [], [5, 6, 7]], [[8, 9]]], - ] - for example in examples: print(f"---- {example}") df = cudf.DataFrame({"column": example}) awkward_array = cudf_to_awkward(df["column"]) assert ak.backend(awkward_array) == "cuda" - assert awkward_array.tolist() == example + assert awkward_array.tolist() == example, awkward_array.show(type=True) From 2145d8536c9fa72a2f712af06b61231397f2c294 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Fri, 16 Feb 2024 17:24:56 -0600 Subject: [PATCH 5/5] implemented everything that can be implemented --- studies/cudf-to-awkward.py | 170 +++++++++++++++++++++++++++++++------ 1 file changed, 142 insertions(+), 28 deletions(-) diff --git a/studies/cudf-to-awkward.py b/studies/cudf-to-awkward.py index 05a36298b0..bfe3cd42e4 100644 --- a/studies/cudf-to-awkward.py +++ b/studies/cudf-to-awkward.py @@ -119,11 +119,7 @@ def popbuffers(paarray, arrow_type, buffers, generate_bitmasks): # strip the dummy option-type node akcontent = remove_optiontype(akcontent) - out = ak.contents.RegularArray( - 
akcontent, - arrow_type.list_size, - parameters=None, - ) + out = ak.contents.RegularArray(akcontent, arrow_type.list_size, parameters=None) return popbuffers_finalize(out, paarray, validbits, generate_bitmasks) elif isinstance(arrow_type, (pyarrow.lib.LargeListType, pyarrow.lib.ListType)): @@ -381,11 +377,61 @@ def recurse( arrow_type: pyarrow.lib.DataType, generate_bitmasks: bool, ): - if isinstance(arrow_type, pyarrow.lib.DictionaryType): - raise NotImplementedError + if isinstance(column, cudf.core.column.CategoricalColumn): + validbits = column.base_mask + + paindex = column.base_children[-1] + masked_index = recurse(paindex, arrow_type_of(paindex), generate_bitmasks) + index = masked_index.content.data + + if not isinstance(masked_index, ak.contents.UnmaskedArray): + mask = masked_index.mask_as_bool(valid_when=False) + if mask.any(): + index = cupy.asarray(index, copy=True) + index[mask] = -1 + + pacats = column.categories + content = recurse(pacats, arrow_type_of(pacats), generate_bitmasks) + + if index.dtype == cupy.dtype(cupy.int64): + akindex1 = ak.index.Index64(index) + akindex2 = akindex1 + elif index.dtype == cupy.dtype(cupy.uint32): + akindex1 = ak.index.Index64(index.astype(cupy.int64)) + akindex2 = ak.index.IndexU32(index) + elif index.dtype == cupy.dtype(cupy.int32): + akindex1 = ak.index.Index32(index) + akindex2 = akindex1 + else: + akindex1 = ak.index.Index64(index.astype(cupy.int64)) + akindex2 = akindex1 + + return revertable( + ak.contents.IndexedOptionArray.simplified( + akindex1, + content, + parameters={"__array__": "categorical"}, + ), + ak.contents.IndexedArray( + akindex2, + remove_optiontype(content) if content.is_option else content, + parameters={"__array__": "categorical"}, + ), + ) elif isinstance(arrow_type, pyarrow.lib.FixedSizeListType): - raise NotImplementedError + validbits = column.base_mask + + akcontent = recurse( + column.base_children[-1], arrow_type.value_type, generate_bitmasks + ) + + if not 
arrow_type.value_field.nullable: + # strip the dummy option-type node + akcontent = remove_optiontype(akcontent) + + out = ak.contents.RegularArray(akcontent, arrow_type.list_size, parameters=None) + return recurse_finalize(out, column, validbits, generate_bitmasks) elif isinstance(arrow_type, (pyarrow.lib.LargeListType, pyarrow.lib.ListType)): validbits = column.base_mask @@ -408,6 +454,8 @@ def recurse( return recurse_finalize(out, column, validbits, generate_bitmasks) elif isinstance(arrow_type, pyarrow.lib.MapType): + # FIXME: make a ListOffsetArray of 2-tuples with __array__ == "sorted_map". + # (Make sure the keys are sorted). raise NotImplementedError elif isinstance( @@ -420,10 +468,51 @@ def recurse( ) elif isinstance(arrow_type, pyarrow.lib.FixedSizeBinaryType): - raise NotImplementedError + validbits = column.base_mask + pacontent = column.base_data + + parameters = {"__array__": "bytestring"} + sub_parameters = {"__array__": "byte"} + + out = ak.contents.RegularArray( + ak.contents.NumpyArray( + cupy.asarray(pacontent), + parameters=sub_parameters, + backend=CupyBackend.instance(), + ), + arrow_type.byte_width, + parameters=parameters, + ) + return recurse_finalize(out, column, validbits, generate_bitmasks) elif arrow_type in _string_like: - raise NotImplementedError + validbits = column.base_mask + + paoffsets = column.base_children[-1] + pacontent = column.base_data + + if arrow_type in _string_like[::2]: + akoffsets = ak.index.Index32(cupy.asarray(paoffsets).view(cupy.int32)) + else: + akoffsets = ak.index.Index64(cupy.asarray(paoffsets).view(cupy.int64)) + + if arrow_type in _string_like[:2]: + parameters = {"__array__": "string"} + sub_parameters = {"__array__": "char"} + else: + parameters = {"__array__": "bytestring"} + sub_parameters = {"__array__": "byte"} + + out = ak.contents.ListOffsetArray( + akoffsets, + ak.contents.NumpyArray( + cupy.asarray(pacontent), + parameters=sub_parameters, + backend=CupyBackend.instance(), + ), + 
parameters=parameters, + ) + return recurse_finalize(out, column, validbits, generate_bitmasks) elif isinstance(arrow_type, pyarrow.lib.StructType): validbits = column.base_mask @@ -450,7 +539,14 @@ def recurse( raise NotImplementedError elif arrow_type == pyarrow.null(): - raise NotImplementedError + validbits = column.base_mask + + # This is already an option-type and offsets-corrected, so no recurse_finalize. + return ak.contents.IndexedOptionArray( + ak.index.Index64(cupy.full(len(column), -1, dtype=cupy.int64)), + ak.contents.EmptyArray(parameters=None), + parameters=None, + ) elif arrow_type == pyarrow.bool_(): validbits = column.base_mask @@ -469,7 +565,12 @@ def recurse( elif isinstance(arrow_type, pyarrow.lib.DataType): validbits = column.base_mask - dt = arrow_type.to_pandas_dtype() + + to64, dt = _pyarrow_to_numpy_dtype.get(str(arrow_type), (False, None)) + if to64: + data = cupy.asarray(data).view(cupy.int32).astype(cupy.int64) + if dt is None: + dt = arrow_type.to_pandas_dtype() out = ak.contents.NumpyArray( cupy.asarray(column.base_data).view(dt), @@ -482,14 +583,28 @@ def recurse( raise TypeError(f"unrecognized Arrow array type: {arrow_type!r}") -def handle_cudf(cudf_series: cudf.core.series.Series, generate_bitmasks): - column = cudf_series._data[cudf_series.name] +def arrow_type_of(column): dtype = column.dtype - if isinstance(dtype, numpy.dtype): - arrow_type = pyarrow.from_numpy_dtype(dtype) + + if isinstance(column, cudf.core.column.StringColumn): + return pyarrow.string() + + elif isinstance(column, cudf.core.column.CategoricalColumn): + return None # deal with it in `recurse` for nesting-generality + + elif isinstance(dtype, numpy.dtype): + if dtype == numpy.dtype(object): + raise TypeError("Python object type encountered in CuDF Series") + else: + return pyarrow.from_numpy_dtype(dtype) + else: - arrow_type = dtype.to_arrow() - return recurse(column, arrow_type, generate_bitmasks) + return dtype.to_arrow() + + +def handle_cudf(cudf_series:
cudf.core.series.Series, generate_bitmasks): + column = cudf_series._data[cudf_series.name] + return recurse(column, arrow_type_of(column), generate_bitmasks) def cudf_to_awkward( @@ -528,6 +643,9 @@ def remove_revertable(layout, **kwargs): [{"x": 1}, {"x": 2}, {"x": 3}], [{"x": 1.1, "y": []}, {"x": 2.2, "y": [1]}, {"x": 3.3, "y": [1, 2]}], [[{"x": 1}, {"x": 2}, {"x": 3}], [], [{"x": 4}, {"x": 5}]], + ["This", "is", "a", "string", "array", ".", ""], + [["This", "is", "a"], ["nested"], ["string", "array", ".", ""]], + [None, None, None, None, None], [False, True, None, True], [1.1, 2.2, None, 3.3], [[False, True, None, True], [], [True, False]], @@ -545,21 +663,17 @@ def remove_revertable(layout, **kwargs): [[{"x": 1}, {"x": None}, {"x": 3}], [], [{"x": 4}, {"x": 5}]], [[{"x": 1}, {"x": 2}, None, {"x": 3}], [], [{"x": 4}, {"x": 5}]], [[{"x": 1}, {"x": 2}, {"x": 3}], None, [], [{"x": 4}, {"x": 5}]], + ["This", "is", "a", None, "string", "array", ".", ""], + [["This", "is", "a", None], ["nested"], ["string", "array", ".", ""]], + [["This", "is", "a"], None, ["nested"], ["string", "array", ".", ""]], + numpy.array(["2024-01-01", "2024-01-02"], dtype="datetime64[s]"), + numpy.array([1, 2, 3], dtype="timedelta64[s]"), ] - for example in examples: - df = cudf.DataFrame({"column": example}) - - pyarrow_array = df._data["column"].to_arrow() - assert pyarrow_array.tolist() == example - - awkward_array = pyarrow_to_awkward(pyarrow_array) - assert awkward_array.tolist() == example - for example in examples: print(f"---- {example}") df = cudf.DataFrame({"column": example}) awkward_array = cudf_to_awkward(df["column"]) assert ak.backend(awkward_array) == "cuda" - assert awkward_array.tolist() == example, awkward_array.show(type=True) + assert awkward_array.tolist() == list(example), awkward_array.show(type=True)