Skip to content

Commit

Permalink
Arrow/Parquet: add support for reading list (or map) of struct (relates to #8606)
Browse files Browse the repository at this point in the history
  • Loading branch information
rouault committed Oct 25, 2023
1 parent cd2a054 commit 99abd8a
Show file tree
Hide file tree
Showing 8 changed files with 625 additions and 251 deletions.
216 changes: 215 additions & 1 deletion autotest/generate_parquet_test_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,26 @@ def generate_test_parquet():
],
type=pa.list_(pa.float64()),
)
list_decimal128 = pa.array(
[
[decimal.Decimal("1234.567")],
[decimal.Decimal("-1234.567")],
None,
[decimal.Decimal("1234.567")],
[decimal.Decimal("-1234.567")],
],
type=pa.list_(pa.decimal128(7, 3)),
)
list_decimal256 = pa.array(
[
[decimal.Decimal("1234.567")],
[decimal.Decimal("-1234.567")],
None,
[decimal.Decimal("1234.567")],
[decimal.Decimal("-1234.567")],
],
type=pa.list_(pa.decimal256(7, 3)),
)
list_string = pa.array(
[
None
Expand All @@ -290,6 +310,17 @@ def generate_test_parquet():
for i in range(5)
]
)
list_large_string = pa.array(
[
None
if i == 2
else [
"".join(["%c" % (65 + j + k) for k in range(1 + j)]) for j in range(i)
]
for i in range(5)
],
type=pa.list_(pa.large_string()),
)
fixed_size_list_boolean = pa.array(
[[True, False], [False, True], [True, False], [False, True], [True, False]],
type=pa.list_(pa.bool_(), 2),
Expand Down Expand Up @@ -332,6 +363,8 @@ def generate_test_parquet():
[{"a": 1, "b": 2.5, "c": {"d": "e", "f": "g"}, "h": [5, 6], "i": 3}] * 5
)

list_struct = pa.array([[{"a": 1, "b": 2.5}, {"a": 3, "c": 4.5}]] * 5)

# struct_val = { "a": 5 }
# for i in range(123):
# struct_val = { "a": struct_val }
Expand Down Expand Up @@ -387,10 +420,52 @@ def generate_test_parquet():
[[("x", 1.5), ("y", None)], [("z", 3)], None, [], []],
type=pa.map_(pa.string(), pa.float64()),
)
map_decimal128 = pa.array(
[
[("x", decimal.Decimal("1234.567")), ("y", None)],
[("z", decimal.Decimal("-1234.567"))],
None,
[],
[],
],
type=pa.map_(pa.string(), pa.decimal128(7, 3)),
)
map_decimal256 = pa.array(
[
[("x", decimal.Decimal("1234.567")), ("y", None)],
[("z", decimal.Decimal("-1234.567"))],
None,
[],
[],
],
type=pa.map_(pa.string(), pa.decimal256(7, 3)),
)
map_string = pa.array(
[[("x", "x_val"), ("y", None)], [("z", "z_val")], None, [], []],
type=pa.map_(pa.string(), pa.string()),
)
map_large_string = pa.array(
[[("x", "x_val"), ("y", None)], [("z", "z_val")], None, [], []],
type=pa.map_(pa.string(), pa.large_string()),
)
map_list_string = pa.array(
[[("x", ["x_val"]), ("y", None)], [("z", [None, "z_val"])], None, [], []],
type=pa.map_(pa.string(), pa.list_(pa.string())),
)
map_large_list_string = pa.array(
[[("x", ["x_val"]), ("y", None)], [("z", [None, "z_val"])], None, [], []],
type=pa.map_(pa.string(), pa.large_list(pa.string())),
)
map_fixed_size_list_string = pa.array(
[
[("x", ["x_val", None]), ("y", [None, None])],
[("z", [None, "z_val"])],
None,
[],
[],
],
type=pa.map_(pa.string(), pa.list_(pa.string(), 2)),
)

indices = pa.array([0, 1, 2, None, 2], type=pa.int32())
dictionary = pa.array(["foo", "bar", "baz"])
Expand Down Expand Up @@ -453,7 +528,10 @@ def generate_test_parquet():
"list_int64",
"list_float32",
"list_float64",
"list_decimal128",
"list_decimal256",
"list_string",
"list_large_string",
"fixed_size_list_boolean",
"fixed_size_list_uint8",
"fixed_size_list_int8",
Expand All @@ -467,6 +545,7 @@ def generate_test_parquet():
"fixed_size_list_float64",
"fixed_size_list_string",
"struct_field",
"list_struct",
"map_boolean",
"map_uint8",
"map_int8",
Expand All @@ -478,8 +557,13 @@ def generate_test_parquet():
"map_int64",
"map_float32",
"map_float64",
"map_decimal128",
"map_decimal256",
"map_string",
# "map_list",
"map_large_string",
"map_list_string",
"map_large_list_string",
"map_fixed_size_list_string",
"dict",
"geometry",
]
Expand Down Expand Up @@ -620,8 +704,10 @@ def generate_parquet_wkt_with_dict():


def generate_nested_types():
import decimal
import pathlib

import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq

Expand Down Expand Up @@ -818,11 +904,122 @@ def generate_nested_types():
type=pa.map_(pa.string(), pa.map_(pa.string(), pa.string())),
)

list_list_bool = pa.array(
[[[True], None, [False, None, True]], None, [[False]], [], []],
type=pa.list_(pa.list_(pa.bool_())),
)

list_list_uint8 = pa.array(
[[[1], None, [2, None, 3]], None, [[4]], [], []],
type=pa.list_(pa.list_(pa.uint8())),
)

list_list_int8 = pa.array(
[[[1], None, [-2, None, 3]], None, [[-4]], [], []],
type=pa.list_(pa.list_(pa.int8())),
)

list_list_uint16 = pa.array(
[[[1], None, [2, None, 3]], None, [[4]], [], []],
type=pa.list_(pa.list_(pa.uint16())),
)

list_list_int16 = pa.array(
[[[1], None, [-2, None, 3]], None, [[-4]], [], []],
type=pa.list_(pa.list_(pa.int16())),
)

list_list_uint32 = pa.array(
[[[1], None, [2, None, 3]], None, [[4]], [], []],
type=pa.list_(pa.list_(pa.uint32())),
)

list_list_int32 = pa.array(
[[[1], None, [-2, None, 3]], None, [[-4]], [], []],
type=pa.list_(pa.list_(pa.int32())),
)

list_list_uint64 = pa.array(
[[[1], None, [2, None, 3]], None, [[4]], [], []],
type=pa.list_(pa.list_(pa.uint64())),
)

list_list_int64 = pa.array(
[[[1], None, [-2, None, 3]], None, [[-4]], [], []],
type=pa.list_(pa.list_(pa.int64())),
)

list_list_float16 = pa.array(
[
[[np.float16(1)], None, [np.float16(2), None, np.float16(3)]],
None,
[[np.float16(4)]],
[],
[],
],
type=pa.list_(pa.list_(pa.float16())),
)

list_list_float32 = pa.array(
[[[1], None, [-2, None, 3]], None, [[-4]], [], []],
type=pa.list_(pa.list_(pa.float32())),
)

list_list_float64 = pa.array(
[[[1], None, [-2, None, 3]], None, [[-4]], [], []],
type=pa.list_(pa.list_(pa.float64())),
)

list_list_decimal128 = pa.array(
[
[
[decimal.Decimal("1234.567")],
None,
[decimal.Decimal("-1234.567"), None, decimal.Decimal("1234.567")],
],
None,
[[decimal.Decimal("-1234.567")]],
[],
[],
],
type=pa.list_(pa.list_(pa.decimal128(7, 3))),
)

list_list_decimal256 = pa.array(
[
[
[decimal.Decimal("1234.567")],
None,
[decimal.Decimal("-1234.567"), None, decimal.Decimal("1234.567")],
],
None,
[[decimal.Decimal("-1234.567")]],
[],
[],
],
type=pa.list_(pa.list_(pa.decimal256(7, 3))),
)

list_list_string = pa.array(
[[["a"], None, ["b", None, "cd"]], None, [["efg"]], [], []],
type=pa.list_(pa.list_(pa.string())),
)

list_list_large_string = pa.array(
[[["a"], None, ["b", None, "cd"]], None, [["efg"]], [], []],
type=pa.list_(pa.list_(pa.large_string())),
)

list_large_list_string = pa.array(
[[["a"], None, ["b", None, "cd"]], None, [["efg"]], [], []],
type=pa.list_(pa.large_list(pa.string())),
)

list_fixed_size_list_string = pa.array(
[[["a", "b"]], None, [["e", "f"]], [["g", "h"]], [["i", "j"]]],
type=pa.list_(pa.list_(pa.string(), 2)),
)

list_map_string = pa.array(
[[[("a", "b"), ("c", "d")], [("e", "f")]], None, [None], [], []],
type=pa.list_(pa.map_(pa.string(), pa.string())),
Expand Down Expand Up @@ -852,7 +1049,24 @@ def generate_nested_types():
"map_map_float32",
"map_map_float64",
"map_map_string",
"list_list_bool",
"list_list_uint8",
"list_list_int8",
"list_list_uint16",
"list_list_int16",
"list_list_uint32",
"list_list_int32",
"list_list_uint64",
"list_list_int64",
# "list_list_float16",
"list_list_float32",
"list_list_float64",
"list_list_decimal128",
"list_list_decimal256",
"list_list_string",
"list_list_large_string",
"list_large_list_string",
"list_fixed_size_list_string",
"list_map_string",
]

Expand Down
Binary file modified autotest/ogr/data/arrow/test.feather
Binary file not shown.
Binary file modified autotest/ogr/data/parquet/all_geoms.parquet
Binary file not shown.
Binary file modified autotest/ogr/data/parquet/nested_types.parquet
Binary file not shown.
Binary file modified autotest/ogr/data/parquet/test.parquet
Binary file not shown.
Binary file modified autotest/ogr/data/parquet/test_single_group.parquet
Binary file not shown.
Loading

0 comments on commit 99abd8a

Please sign in to comment.