diff --git a/autotest/generate_parquet_test_file.py b/autotest/generate_parquet_test_file.py index 2768254d2c26..9ad1ce882ee3 100644 --- a/autotest/generate_parquet_test_file.py +++ b/autotest/generate_parquet_test_file.py @@ -280,6 +280,26 @@ def generate_test_parquet(): ], type=pa.list_(pa.float64()), ) + list_decimal128 = pa.array( + [ + [decimal.Decimal("1234.567")], + [decimal.Decimal("-1234.567")], + None, + [decimal.Decimal("1234.567")], + [decimal.Decimal("-1234.567")], + ], + type=pa.list_(pa.decimal128(7, 3)), + ) + list_decimal256 = pa.array( + [ + [decimal.Decimal("1234.567")], + [decimal.Decimal("-1234.567")], + None, + [decimal.Decimal("1234.567")], + [decimal.Decimal("-1234.567")], + ], + type=pa.list_(pa.decimal256(7, 3)), + ) list_string = pa.array( [ None @@ -290,6 +310,17 @@ def generate_test_parquet(): for i in range(5) ] ) + list_large_string = pa.array( + [ + None + if i == 2 + else [ + "".join(["%c" % (65 + j + k) for k in range(1 + j)]) for j in range(i) + ] + for i in range(5) + ], + type=pa.list_(pa.large_string()), + ) fixed_size_list_boolean = pa.array( [[True, False], [False, True], [True, False], [False, True], [True, False]], type=pa.list_(pa.bool_(), 2), @@ -332,6 +363,8 @@ def generate_test_parquet(): [{"a": 1, "b": 2.5, "c": {"d": "e", "f": "g"}, "h": [5, 6], "i": 3}] * 5 ) + list_struct = pa.array([[{"a": 1, "b": 2.5}, {"a": 3, "c": 4.5}]] * 5) + # struct_val = { "a": 5 } # for i in range(123): # struct_val = { "a": struct_val } @@ -387,10 +420,52 @@ def generate_test_parquet(): [[("x", 1.5), ("y", None)], [("z", 3)], None, [], []], type=pa.map_(pa.string(), pa.float64()), ) + map_decimal128 = pa.array( + [ + [("x", decimal.Decimal("1234.567")), ("y", None)], + [("z", decimal.Decimal("-1234.567"))], + None, + [], + [], + ], + type=pa.map_(pa.string(), pa.decimal128(7, 3)), + ) + map_decimal256 = pa.array( + [ + [("x", decimal.Decimal("1234.567")), ("y", None)], + [("z", decimal.Decimal("-1234.567"))], + None, + [], + [], + ], + type=pa.map_(pa.string(), pa.decimal256(7, 3)), + ) map_string = pa.array( [[("x", "x_val"), ("y", None)], [("z", "z_val")], None, [], []], type=pa.map_(pa.string(), pa.string()), ) + map_large_string = pa.array( + [[("x", "x_val"), ("y", None)], [("z", "z_val")], None, [], []], + type=pa.map_(pa.string(), pa.large_string()), + ) + map_list_string = pa.array( + [[("x", ["x_val"]), ("y", None)], [("z", [None, "z_val"])], None, [], []], + type=pa.map_(pa.string(), pa.list_(pa.string())), + ) + map_large_list_string = pa.array( + [[("x", ["x_val"]), ("y", None)], [("z", [None, "z_val"])], None, [], []], + type=pa.map_(pa.string(), pa.large_list(pa.string())), + ) + map_fixed_size_list_string = pa.array( + [ + [("x", ["x_val", None]), ("y", [None, None])], + [("z", [None, "z_val"])], + None, + [], + [], + ], + type=pa.map_(pa.string(), pa.list_(pa.string(), 2)), + ) indices = pa.array([0, 1, 2, None, 2], type=pa.int32()) dictionary = pa.array(["foo", "bar", "baz"]) @@ -453,7 +528,10 @@ def generate_test_parquet(): "list_int64", "list_float32", "list_float64", + "list_decimal128", + "list_decimal256", "list_string", + "list_large_string", "fixed_size_list_boolean", "fixed_size_list_uint8", "fixed_size_list_int8", @@ -467,6 +545,7 @@ def generate_test_parquet(): "fixed_size_list_float64", "fixed_size_list_string", "struct_field", + "list_struct", "map_boolean", "map_uint8", "map_int8", @@ -478,8 +557,13 @@ def generate_test_parquet(): "map_int64", "map_float32", "map_float64", + "map_decimal128", + "map_decimal256", "map_string", - # "map_list", + "map_large_string", + "map_list_string", + "map_large_list_string", + "map_fixed_size_list_string", "dict", "geometry", ] @@ -620,8 +704,10 @@ def generate_parquet_wkt_with_dict(): def generate_nested_types(): + import decimal import pathlib + import numpy as np import pyarrow as pa import pyarrow.parquet as pq @@ -818,11 +904,122 @@ def generate_nested_types(): type=pa.map_(pa.string(), pa.map_(pa.string(), pa.string())), ) + list_list_bool = pa.array( + [[[True], None, [False, None, True]], None, [[False]], [], []], + type=pa.list_(pa.list_(pa.bool_())), + ) + + list_list_uint8 = pa.array( + [[[1], None, [2, None, 3]], None, [[4]], [], []], + type=pa.list_(pa.list_(pa.uint8())), + ) + + list_list_int8 = pa.array( + [[[1], None, [-2, None, 3]], None, [[-4]], [], []], + type=pa.list_(pa.list_(pa.int8())), + ) + + list_list_uint16 = pa.array( + [[[1], None, [2, None, 3]], None, [[4]], [], []], + type=pa.list_(pa.list_(pa.uint16())), + ) + + list_list_int16 = pa.array( + [[[1], None, [-2, None, 3]], None, [[-4]], [], []], + type=pa.list_(pa.list_(pa.int16())), + ) + + list_list_uint32 = pa.array( + [[[1], None, [2, None, 3]], None, [[4]], [], []], + type=pa.list_(pa.list_(pa.uint32())), + ) + + list_list_int32 = pa.array( + [[[1], None, [-2, None, 3]], None, [[-4]], [], []], + type=pa.list_(pa.list_(pa.int32())), + ) + + list_list_uint64 = pa.array( + [[[1], None, [2, None, 3]], None, [[4]], [], []], + type=pa.list_(pa.list_(pa.uint64())), + ) + + list_list_int64 = pa.array( + [[[1], None, [-2, None, 3]], None, [[-4]], [], []], + type=pa.list_(pa.list_(pa.int64())), + ) + + list_list_float16 = pa.array( + [ + [[np.float16(1)], None, [np.float16(2), None, np.float16(3)]], + None, + [[np.float16(4)]], + [], + [], + ], + type=pa.list_(pa.list_(pa.float16())), + ) + + list_list_float32 = pa.array( + [[[1], None, [-2, None, 3]], None, [[-4]], [], []], + type=pa.list_(pa.list_(pa.float32())), + ) + + list_list_float64 = pa.array( + [[[1], None, [-2, None, 3]], None, [[-4]], [], []], + type=pa.list_(pa.list_(pa.float64())), + ) + + list_list_decimal128 = pa.array( + [ + [ + [decimal.Decimal("1234.567")], + None, + [decimal.Decimal("-1234.567"), None, decimal.Decimal("1234.567")], + ], + None, + [[decimal.Decimal("-1234.567")]], + [], + [], + ], + type=pa.list_(pa.list_(pa.decimal128(7, 3))), + ) + + list_list_decimal256 = pa.array( + [ + [ + [decimal.Decimal("1234.567")], + None, + [decimal.Decimal("-1234.567"), None, decimal.Decimal("1234.567")], + ], + None, + [[decimal.Decimal("-1234.567")]], + [], + [], + ], + type=pa.list_(pa.list_(pa.decimal256(7, 3))), + ) + list_list_string = pa.array( [[["a"], None, ["b", None, "cd"]], None, [["efg"]], [], []], type=pa.list_(pa.list_(pa.string())), ) + list_list_large_string = pa.array( + [[["a"], None, ["b", None, "cd"]], None, [["efg"]], [], []], + type=pa.list_(pa.list_(pa.large_string())), + ) + + list_large_list_string = pa.array( + [[["a"], None, ["b", None, "cd"]], None, [["efg"]], [], []], + type=pa.list_(pa.large_list(pa.string())), + ) + + list_fixed_size_list_string = pa.array( + [[["a", "b"]], None, [["e", "f"]], [["g", "h"]], [["i", "j"]]], + type=pa.list_(pa.list_(pa.string(), 2)), + ) + list_map_string = pa.array( [[[("a", "b"), ("c", "d")], [("e", "f")]], None, [None], [], []], type=pa.list_(pa.map_(pa.string(), pa.string())), @@ -852,7 +1049,24 @@ def generate_nested_types(): "map_map_float32", "map_map_float64", "map_map_string", + "list_list_bool", + "list_list_uint8", + "list_list_int8", + "list_list_uint16", + "list_list_int16", + "list_list_uint32", + "list_list_int32", + "list_list_uint64", + "list_list_int64", + # "list_list_float16", + "list_list_float32", + "list_list_float64", + "list_list_decimal128", + "list_list_decimal256", "list_list_string", + "list_list_large_string", + "list_large_list_string", + "list_fixed_size_list_string", "list_map_string", ] diff --git a/autotest/ogr/data/arrow/test.feather b/autotest/ogr/data/arrow/test.feather index b4e222f8c6ea..00493d31e6d7 100644 Binary files a/autotest/ogr/data/arrow/test.feather and b/autotest/ogr/data/arrow/test.feather differ diff --git a/autotest/ogr/data/parquet/all_geoms.parquet b/autotest/ogr/data/parquet/all_geoms.parquet index 297e3f15754a..b9731a068014 100644 Binary files a/autotest/ogr/data/parquet/all_geoms.parquet and b/autotest/ogr/data/parquet/all_geoms.parquet differ diff --git a/autotest/ogr/data/parquet/nested_types.parquet b/autotest/ogr/data/parquet/nested_types.parquet index b3ba80887c6b..c18e9441d0d5 100644 Binary files a/autotest/ogr/data/parquet/nested_types.parquet and b/autotest/ogr/data/parquet/nested_types.parquet differ diff --git a/autotest/ogr/data/parquet/test.parquet b/autotest/ogr/data/parquet/test.parquet index b1322fdafa8c..69b4e1190666 100644 Binary files a/autotest/ogr/data/parquet/test.parquet and b/autotest/ogr/data/parquet/test.parquet differ diff --git a/autotest/ogr/data/parquet/test_single_group.parquet b/autotest/ogr/data/parquet/test_single_group.parquet index d8536c302b92..7efdc001dfce 100644 Binary files a/autotest/ogr/data/parquet/test_single_group.parquet and b/autotest/ogr/data/parquet/test_single_group.parquet differ diff --git a/autotest/ogr/ogr_parquet.py b/autotest/ogr/ogr_parquet.py index 355becca404f..89ea7e3073d9 100755 --- a/autotest/ogr/ogr_parquet.py +++ b/autotest/ogr/ogr_parquet.py @@ -121,7 +121,7 @@ def _check_test_parquet( assert srs is not None assert srs.GetAuthorityCode(None) == "4326" assert lyr_defn.GetGeomFieldDefn(0).GetType() == ogr.wkbPoint - assert lyr_defn.GetFieldCount() == 73 + assert lyr_defn.GetFieldCount() == 83 got_field_defns = [ ( lyr_defn.GetFieldDefn(i).GetName(), @@ -176,7 +176,10 @@ def _check_test_parquet( ("list_int64", "Integer64List", "None", 0, 0), ("list_float32", "RealList", "Float32", 0, 0), ("list_float64", "RealList", "None", 0, 0), + ("list_decimal128", "RealList", "None", 0, 0), + ("list_decimal256", "RealList", "None", 0, 0), ("list_string", "StringList", "None", 0, 0), + ("list_large_string", "StringList", "None", 0, 0), ("fixed_size_list_boolean", "IntegerList", "Boolean", 0, 0), ("fixed_size_list_uint8", "IntegerList", "None", 0, 0), ("fixed_size_list_int8", "IntegerList", "None", 0, 0), @@ -195,6 +198,7 @@ def _check_test_parquet( ("struct_field.c.f", "String", "None", 0, 0), ("struct_field.h", "Integer64List", "None", 0, 0), ("struct_field.i", "Integer64", "None", 0, 0), + ("list_struct", "String", "JSON", 0, 0), ("map_boolean", "String", "JSON", 0, 0), ("map_uint8", "String", "JSON", 0, 0), ("map_int8", "String", "JSON", 0, 0), @@ -206,7 +210,13 @@ def _check_test_parquet( ("map_int64", "String", "JSON", 0, 0), ("map_float32", "String", "JSON", 0, 0), ("map_float64", "String", "JSON", 0, 0), + ("map_decimal128", "String", "JSON", 0, 0), + ("map_decimal256", "String", "JSON", 0, 0), ("map_string", "String", "JSON", 0, 0), + ("map_large_string", "String", "JSON", 0, 0), + ("map_list_string", "String", "JSON", 0, 0), + ("map_large_list_string", "String", "JSON", 0, 0), + ("map_fixed_size_list_string", "String", "JSON", 0, 0), ("dict", "Integer", "None", 0, 0), ] assert got_field_defns == expected_field_defns @@ -280,7 +290,10 @@ def _check_test_parquet( assert f["list_int64"] == [] assert f["list_float32"] == [] assert f["list_float64"] == [] + assert f["list_decimal128"] == [1234.567] + assert f["list_decimal256"] == [1234.567] assert f["list_string"] is None + assert f["list_large_string"] is None assert f["fixed_size_list_boolean"] == [1, 0] assert f["fixed_size_list_uint8"] == [0, 1] assert f["fixed_size_list_int8"] == [0, 1] @@ -301,6 +314,7 @@ def _check_test_parquet( assert f["struct_field.c.f"] == "g" assert f["struct_field.h"] == [5, 6] assert f["struct_field.i"] == 3 + assert f["list_struct"] == """[{"a":1,"b":2.5,"c":null},{"a":3,"b":null,"c":4.5}]""" assert f["map_boolean"] == '{"x":null,"y":true}' assert f["map_uint8"] == '{"x":1,"y":null}' assert f["map_int8"] == '{"x":1,"y":null}' @@ -308,11 +322,17 @@ def _check_test_parquet( assert f["map_int16"] == '{"x":1,"y":null}' assert f["map_uint32"] == '{"x":4000000000,"y":null}' assert f["map_int32"] == '{"x":2000000000,"y":null}' - assert f["map_uint64"] == '{"x":4000000000000.0,"y":null}' + assert f["map_uint64"] == '{"x":4000000000000,"y":null}' assert f["map_int64"] == '{"x":-2000000000000,"y":null}' assert f["map_float32"] == '{"x":1.5,"y":null}' assert f["map_float64"] == '{"x":1.5,"y":null}' + assert f["map_decimal128"] == '{"x":1234.567,"y":null}' + assert f["map_decimal256"] == '{"x":1234.567,"y":null}' assert f["map_string"] == '{"x":"x_val","y":null}' + assert f["map_large_string"] == '{"x":"x_val","y":null}' + assert f["map_list_string"] == '{"x":["x_val"],"y":null}' + assert f["map_large_list_string"] == '{"x":["x_val"],"y":null}' + assert f["map_fixed_size_list_string"] == '{"x":["x_val",null],"y":[null,null]}' assert f["dict"] == 0 assert f.GetGeometryRef().ExportToWkt() == "POINT (0 2)" @@ -2728,12 +2748,32 @@ def test_ogr_parquet_nested_types(): assert f["map_map_int16"] == """{"a":{"b":1,"c":null,"d":2},"e":null}""" assert f["map_map_uint32"] == """{"a":{"b":1,"c":null,"d":2},"e":null}""" assert f["map_map_int32"] == """{"a":{"b":1,"c":null,"d":2},"e":null}""" - assert f["map_map_uint64"] == """{"a":{"b":1.0,"c":null,"d":2.0},"e":null}""" + assert f["map_map_uint64"] == """{"a":{"b":1,"c":null,"d":2},"e":null}""" assert f["map_map_int64"] == """{"a":{"b":1,"c":null,"d":2},"e":null}""" assert f["map_map_float32"] == """{"a":{"b":1.0,"c":null,"d":2.0},"e":null}""" assert f["map_map_float64"] == """{"a":{"b":1.0,"c":null,"d":2.0},"e":null}""" assert f["map_map_string"] == """{"a":{"b":"c","d":null},"e":null}""" + assert f["list_list_bool"] == """[[true],null,[false,null,true]]""" + assert f["list_list_uint8"] == """[[1],null,[2,null,3]]""" + assert f["list_list_int8"] == """[[1],null,[-2,null,3]]""" + assert f["list_list_uint16"] == """[[1],null,[2,null,3]]""" + assert f["list_list_int16"] == """[[1],null,[-2,null,3]]""" + assert f["list_list_uint32"] == """[[1],null,[2,null,3]]""" + assert f["list_list_int32"] == """[[1],null,[-2,null,3]]""" + assert f["list_list_uint64"] == """[[1],null,[2,null,3]]""" + assert f["list_list_int64"] == """[[1],null,[-2,null,3]]""" + assert f["list_list_float32"] == """[[1.0],null,[-2.0,null,3.0]]""" + assert f["list_list_float64"] == """[[1.0],null,[-2.0,null,3.0]]""" + assert ( + f["list_list_decimal128"] == """[[1234.567],null,[-1234.567,null,1234.567]]""" + ) + assert ( + f["list_list_decimal256"] == """[[1234.567],null,[-1234.567,null,1234.567]]""" + ) assert f["list_list_string"] == """[["a"],null,["b",null,"cd"]]""" + assert f["list_list_large_string"] == """[["a"],null,["b",null,"cd"]]""" + assert f["list_large_list_string"] == """[["a"],null,["b",null,"cd"]]""" + assert f["list_fixed_size_list_string"] == """[["a","b"]]""" assert f["list_map_string"] == """[{"a":"b","c":"d"},{"e":"f"}]""" f = lyr.GetNextFeature() @@ -2760,7 +2800,12 @@ def test_ogr_parquet_nested_types(): assert f["map_map_float32"] is None assert f["map_map_float64"] is None assert f["map_map_string"] is None + assert f["list_list_decimal128"] is None + assert f["list_list_decimal256"] is None assert f["list_list_string"] is None + assert f["list_list_large_string"] is None + assert f["list_large_list_string"] is None + assert f["list_fixed_size_list_string"] is None assert f["list_map_string"] is None f = lyr.GetNextFeature() @@ -2782,12 +2827,17 @@ def test_ogr_parquet_nested_types(): assert f["map_map_int16"] == """{"f":{"g":3}}""" assert f["map_map_uint32"] == """{"f":{"g":3}}""" assert f["map_map_int32"] == """{"f":{"g":3}}""" - assert f["map_map_uint64"] == """{"f":{"g":3.0}}""" + assert f["map_map_uint64"] == """{"f":{"g":3}}""" assert f["map_map_int64"] == """{"f":{"g":3}}""" assert f["map_map_float32"] == """{"f":{"g":3.0}}""" assert f["map_map_float64"] == """{"f":{"g":3.0}}""" assert f["map_map_string"] == """{"f":{"g":"h"}}""" + assert f["list_list_decimal128"] == """[[-1234.567]]""" + assert f["list_list_decimal256"] == """[[-1234.567]]""" assert f["list_list_string"] == """[["efg"]]""" + assert f["list_list_large_string"] == """[["efg"]]""" + assert f["list_large_list_string"] == """[["efg"]]""" + assert f["list_fixed_size_list_string"] == """[["e","f"]]""" assert f["list_map_string"] == """[null]""" f = lyr.GetNextFeature() diff --git a/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp b/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp index 3d55222c2de0..bee830cc11ec 100644 --- a/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp +++ b/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp @@ -182,11 +182,17 @@ inline bool OGRArrowLayer::IsHandledListOrMapType( itemTypeId == arrow::Type::HALF_FLOAT || itemTypeId == arrow::Type::FLOAT || itemTypeId == arrow::Type::DOUBLE || + itemTypeId == arrow::Type::DECIMAL128 || + itemTypeId == arrow::Type::DECIMAL256 || itemTypeId == arrow::Type::STRING || + itemTypeId == arrow::Type::LARGE_STRING || + itemTypeId == arrow::Type::STRUCT || (itemTypeId == arrow::Type::MAP && IsHandledMapType( std::static_pointer_cast(valueType))) || - (itemTypeId == arrow::Type::LIST && + ((itemTypeId == arrow::Type::LIST || + itemTypeId == arrow::Type::LARGE_LIST || + itemTypeId == arrow::Type::FIXED_SIZE_LIST) && IsHandledListType( std::static_pointer_cast(valueType))); } @@ -355,9 +361,12 @@ inline bool OGRArrowLayer::MapArrowTypeToOGR( eSubType = OFSTFloat32; break; case arrow::Type::DOUBLE: + case arrow::Type::DECIMAL128: + case arrow::Type::DECIMAL256: eType = OFTRealList; break; case arrow::Type::STRING: + case arrow::Type::LARGE_STRING: eType = OFTStringList; break; default: @@ -910,153 +919,375 @@ OGRArrowLayer::GetGeometryTypeFromString(const std::string &osType) return eGeomType; } +static CPLJSONObject GetObjectAsJSON(const arrow::Array *array, + const size_t nIdx); + /************************************************************************/ -/* ReadList() */ +/* AddToArray() */ /************************************************************************/ -static CPLJSONObject ReadMap(const arrow::MapArray *array, int64_t nIdxInArray); +static void AddToArray(CPLJSONArray &oArray, const arrow::Array *array, + const size_t nIdx) +{ + switch (array->type()->id()) + { + case arrow::Type::BOOL: + { + oArray.Add( + static_cast(array)->Value(nIdx)); + break; + } + case arrow::Type::UINT8: + { + oArray.Add( + static_cast(array)->Value(nIdx)); + break; + } + case arrow::Type::INT8: + { + oArray.Add( + static_cast(array)->Value(nIdx)); + break; + } + case arrow::Type::UINT16: + { + oArray.Add( + static_cast(array)->Value(nIdx)); + break; + } + case arrow::Type::INT16: + { + oArray.Add( + static_cast(array)->Value(nIdx)); + break; + } + case arrow::Type::INT32: + { + oArray.Add( + static_cast(array)->Value(nIdx)); + break; + } + case arrow::Type::UINT32: + { + oArray.Add(static_cast( + static_cast(array)->Value(nIdx))); + break; + } + case arrow::Type::INT64: + { + oArray.Add(static_cast( + static_cast(array)->Value(nIdx))); + break; + } + case arrow::Type::UINT64: + { + oArray.Add(static_cast( + static_cast(array)->Value(nIdx))); + break; + } + case arrow::Type::HALF_FLOAT: + { + oArray.Add( + static_cast(array)->Value(nIdx)); + break; + } + case arrow::Type::FLOAT: + { + oArray.Add( + static_cast(array)->Value(nIdx)); + break; + } + case arrow::Type::DOUBLE: + { + oArray.Add( + static_cast(array)->Value(nIdx)); + break; + } + case arrow::Type::DECIMAL128: + case arrow::Type::DECIMAL256: + { + oArray.Add(CPLAtof(static_cast(array) + ->FormatValue(nIdx) + .c_str())); + break; + } + case arrow::Type::STRING: + { + oArray.Add( + static_cast(array)->GetString( + nIdx)); + break; + } + case arrow::Type::LARGE_STRING: + { + oArray.Add( + static_cast(array)->GetString( + nIdx)); + break; + } + case arrow::Type::LIST: + case arrow::Type::LARGE_LIST: + case arrow::Type::FIXED_SIZE_LIST: + case arrow::Type::MAP: + case arrow::Type::STRUCT: + { + oArray.Add(GetObjectAsJSON(array, nIdx)); + break; + } -template -static CPLJSONArray ReadList(const ArrayType *array, int64_t nIdxInArray) + default: + { + CPLDebug("ARROW", "AddToArray(): unexpected data type %s", + array->type()->ToString().c_str()); + break; + } + } +} + +/************************************************************************/ +/* GetListAsJSON() */ +/************************************************************************/ + +template +static CPLJSONObject GetListAsJSON(const ArrowType *array, + const size_t nIdxInArray) { const auto values = std::static_pointer_cast(array->values()); const auto nIdxStart = array->value_offset(nIdxInArray); - const int nCount = array->value_length(nIdxInArray); + const auto nCount = array->value_length(nIdxInArray); CPLJSONArray oArray; - for (int k = 0; k < nCount; k++) + for (auto k = decltype(nCount){0}; k < nCount; k++) { if (values->IsNull(nIdxStart + k)) oArray.AddNull(); else - oArray.Add(static_cast(values->Value(nIdxStart + k))); + AddToArray(oArray, values.get(), nIdxStart + k); } return oArray; } -template -static CPLJSONArray ReadList(const ArrayType *array, int64_t nIdxInArray) +/************************************************************************/ +/* AddToDict() */ +/************************************************************************/ + +static void AddToDict(CPLJSONObject &oDict, const std::string &osKey, + const arrow::Array *array, const size_t nIdx) { - switch (array->value_type()->id()) + switch (array->type()->id()) { case arrow::Type::BOOL: { - return ReadList(array, nIdxInArray); + oDict.Add( + osKey, + static_cast(array)->Value(nIdx)); + break; } case arrow::Type::UINT8: { - return ReadList(array, nIdxInArray); + oDict.Add( + osKey, + static_cast(array)->Value(nIdx)); + break; } case arrow::Type::INT8: { - return ReadList(array, nIdxInArray); + oDict.Add( + osKey, + static_cast(array)->Value(nIdx)); + break; } case arrow::Type::UINT16: { - return ReadList(array, nIdxInArray); + oDict.Add( + osKey, + static_cast(array)->Value(nIdx)); + break; } case arrow::Type::INT16: { - return ReadList(array, nIdxInArray); + oDict.Add( + osKey, + static_cast(array)->Value(nIdx)); + break; } case arrow::Type::INT32: { - return ReadList(array, nIdxInArray); + oDict.Add( + osKey, + static_cast(array)->Value(nIdx)); + break; } case arrow::Type::UINT32: { - return ReadList(array, nIdxInArray); + oDict.Add(osKey, + static_cast( + static_cast(array)->Value( + nIdx))); + break; } case arrow::Type::INT64: { - return ReadList(array, nIdxInArray); + oDict.Add(osKey, + static_cast( + static_cast(array)->Value( + nIdx))); + break; } case arrow::Type::UINT64: { - return ReadList(array, nIdxInArray); + oDict.Add(osKey, + static_cast( + static_cast(array)->Value( + nIdx))); + break; } case arrow::Type::HALF_FLOAT: { - return ReadList(array, nIdxInArray); + oDict.Add( + osKey, + static_cast(array)->Value(nIdx)); + break; } case arrow::Type::FLOAT: { - return ReadList(array, nIdxInArray); + oDict.Add( + osKey, + static_cast(array)->Value(nIdx)); + break; } case arrow::Type::DOUBLE: { - return ReadList(array, nIdxInArray); + oDict.Add( + osKey, + static_cast(array)->Value(nIdx)); + break; } - case arrow::Type::STRING: + case arrow::Type::DECIMAL128: + case arrow::Type::DECIMAL256: { - CPLJSONArray oArray; - const auto values = - std::static_pointer_cast(array->values()); - const auto nIdxStart = array->value_offset(nIdxInArray); - const int nCount = array->value_length(nIdxInArray); - for (int k = 0; k < nCount; k++) - { - if (values->IsNull(nIdxStart + k)) - { - oArray.AddNull(); - } - else - { - oArray.Add(values->GetString(nIdxStart + k)); - } - } - return oArray; + oDict.Add(osKey, + CPLAtof(static_cast(array) + ->FormatValue(nIdx) + .c_str())); + break; } - case arrow::Type::LIST: + case arrow::Type::STRING: { - CPLJSONArray oArray; - const auto values = - std::static_pointer_cast(array->values()); - const auto nIdxStart = array->value_offset(nIdxInArray); - const int nCount = array->value_length(nIdxInArray); - for (int k = 0; k < nCount; k++) - { - if (values->IsNull(nIdxStart + k)) - { - oArray.AddNull(); - } - else - { - oArray.Add(ReadList(values.get(), nIdxStart + k)); - } - } - return oArray; + oDict.Add(osKey, + static_cast(array)->GetString( + nIdx)); + break; } - + case arrow::Type::LARGE_STRING: + { + oDict.Add(osKey, static_cast(array) + ->GetString(nIdx)); + break; + } + case arrow::Type::LIST: + case arrow::Type::LARGE_LIST: + case arrow::Type::FIXED_SIZE_LIST: case arrow::Type::MAP: + case arrow::Type::STRUCT: { - CPLJSONArray oArray; - const auto values = - std::static_pointer_cast(array->values()); - const auto nIdxStart = array->value_offset(nIdxInArray); - const int nCount = array->value_length(nIdxInArray); - for (int k = 0; k < nCount; k++) - { - if (values->IsNull(nIdxStart + k)) - { - oArray.AddNull(); - } - else - { - oArray.Add(ReadMap(values.get(), nIdxStart + k)); - } - } - return oArray; + oDict.Add(osKey, GetObjectAsJSON(array, nIdx)); + break; } default: { - CPLDebug("ARROW", "ReadList(): unexpected data type %s", - array->values()->type()->ToString().c_str()); + CPLDebug("ARROW", "AddToDict(): unexpected data type %s", + array->type()->ToString().c_str()); break; } } - return CPLJSONArray(); +} + +/************************************************************************/ +/* GetMapAsJSON() */ +/************************************************************************/ + +static CPLJSONObject GetMapAsJSON(const arrow::Array *array, + const size_t nIdxInArray) +{ + const auto mapArray = static_cast(array); + const auto keys = + std::static_pointer_cast(mapArray->keys()); + const auto values = mapArray->items(); + const auto nIdxStart = mapArray->value_offset(nIdxInArray); + const int nCount = mapArray->value_length(nIdxInArray); + CPLJSONObject oRoot; + for (int k = 0; k < nCount; k++) + { + if (!keys->IsNull(nIdxStart + k)) + { + const auto osKey = keys->GetString(nIdxStart + k); + if (!values->IsNull(nIdxStart + k)) + AddToDict(oRoot, osKey, values.get(), nIdxStart + k); + else + oRoot.AddNull(osKey); + } + } + return oRoot; +} + +/************************************************************************/ +/* GetStructureAsJSON() */ +/************************************************************************/ + +static CPLJSONObject GetStructureAsJSON(const arrow::Array *array, + const size_t nIdxInArray) +{ + CPLJSONObject oRoot; + const auto structArray = static_cast(array); + const auto structArrayType = structArray->type(); + for (int i = 0; i < structArrayType->num_fields(); ++i) + { + const auto field = structArray->field(i); + if (!field->IsNull(nIdxInArray)) + { + AddToDict(oRoot, structArrayType->field(i)->name(), field.get(), + nIdxInArray); + } + else + oRoot.AddNull(structArrayType->field(i)->name()); + } + + return oRoot; +} + +/************************************************************************/ +/* GetObjectAsJSON() */ +/************************************************************************/ + +static CPLJSONObject GetObjectAsJSON(const arrow::Array *array, + const size_t nIdxInArray) +{ + switch (array->type()->id()) + { + case arrow::Type::MAP: + return GetMapAsJSON(array, nIdxInArray); + case arrow::Type::LIST: + return GetListAsJSON(static_cast(array), + nIdxInArray); + case arrow::Type::LARGE_LIST: + return GetListAsJSON( + static_cast(array), nIdxInArray); + case arrow::Type::FIXED_SIZE_LIST: + return GetListAsJSON( + static_cast(array), + nIdxInArray); + case arrow::Type::STRUCT: + return GetStructureAsJSON(array, nIdxInArray); + default: + { + CPLError(CE_Failure, CPLE_AppDefined, + "GetObjectAsJSON(): unhandled value format: %s", + array->type()->ToString().c_str()); + return CPLJSONObject(); + } + } } template @@ -1167,6 +1398,26 @@ static void ReadList(OGRFeature *poFeature, int i, int64_t nIdxInArray, array); break; } + case arrow::Type::DECIMAL128: + case arrow::Type::DECIMAL256: + { + const auto values = + std::static_pointer_cast(array->values()); + const auto nIdxStart = array->value_offset(nIdxInArray); + const int nCount = array->value_length(nIdxInArray); + std::vector aValues; + aValues.reserve(nCount); + for (int k = 0; k < nCount; k++) + { + if (values->IsNull(nIdxStart + k)) + aValues.push_back(std::numeric_limits::quiet_NaN()); + else + aValues.push_back( + CPLAtof(values->FormatValue(nIdxStart + k).c_str())); + } + poFeature->SetField(i, nCount, aValues.data()); + break; + } case arrow::Type::STRING: { const auto values = @@ -1185,12 +1436,33 @@ static void ReadList(OGRFeature *poFeature, int i, int64_t nIdxInArray, poFeature->SetField(i, aosList.List()); break; } - + case arrow::Type::LARGE_STRING: + { + const auto values = + std::static_pointer_cast( + array->values()); + const auto nIdxStart = array->value_offset(nIdxInArray); + const auto nCount = array->value_length(nIdxInArray); + CPLStringList aosList; + for (auto k = decltype(nCount){0}; k < nCount; k++) + { + if (values->IsNull(nIdxStart + k)) + aosList.AddString( + ""); // we cannot have null strings in a list + else + aosList.AddString(values->GetString(nIdxStart + k).c_str()); + } + poFeature->SetField(i, aosList.List()); + break; + } case arrow::Type::LIST: + case arrow::Type::LARGE_LIST: + case arrow::Type::FIXED_SIZE_LIST: case arrow::Type::MAP: + case arrow::Type::STRUCT: { poFeature->SetField(i, - ReadList(array, nIdxInArray) + GetListAsJSON(array, nIdxInArray) .Format(CPLJSONObject::PrettyFormat::Plain) .c_str()); break; @@ -1205,160 +1477,6 @@ static void ReadList(OGRFeature *poFeature, int i, int64_t nIdxInArray, } } -/************************************************************************/ -/* ReadMap() */ -/************************************************************************/ - -template -static CPLJSONObject ReadMap(const arrow::MapArray *array, int64_t nIdxInArray) -{ - const auto keys = - std::static_pointer_cast(array->keys()); - const auto values = std::static_pointer_cast(array->items()); - const auto nIdxStart = array->value_offset(nIdxInArray); - const int nCount = array->value_length(nIdxInArray); - CPLJSONObject oRoot; - for (int k = 0; k < nCount; k++) - { - if (!keys->IsNull(nIdxStart + k)) - { - const auto osKey = keys->GetString(nIdxStart + k); - if (!values->IsNull(nIdxStart + k)) - oRoot.Add(osKey, - static_cast(values->Value(nIdxStart + k))); - else - oRoot.AddNull(osKey); - } - } - return oRoot; -} - -static CPLJSONObject ReadMap(const arrow::MapArray *array, int64_t nIdxInArray) -{ - const auto mapType = - static_cast(array->data()->type.get()); - const auto itemTypeId = mapType->item_type()->id(); - if (mapType->key_type()->id() == arrow::Type::STRING) - { - if (itemTypeId == arrow::Type::BOOL) - { - return ReadMap(array, nIdxInArray); - } - else if (itemTypeId == arrow::Type::UINT8) - { - return ReadMap(array, nIdxInArray); - } - else if (itemTypeId == arrow::Type::INT8) - { - return ReadMap(array, nIdxInArray); - } - else if (itemTypeId == arrow::Type::UINT16) - { - return ReadMap(array, nIdxInArray); - } - else if (itemTypeId == arrow::Type::INT16) - { - return ReadMap(array, nIdxInArray); - } - else if (itemTypeId == arrow::Type::UINT32) - { - return ReadMap(array, nIdxInArray); - } - else if (itemTypeId == arrow::Type::INT32) - { - return ReadMap(array, nIdxInArray); - } - else if (itemTypeId == arrow::Type::UINT64) - { - return ReadMap(array, nIdxInArray); - } - else if (itemTypeId == arrow::Type::INT64) - { - return ReadMap(array, nIdxInArray); - } - else if (itemTypeId == arrow::Type::FLOAT) - { - return ReadMap(array, nIdxInArray); - } - else if (itemTypeId == arrow::Type::DOUBLE) - { - return ReadMap(array, nIdxInArray); - } - else if (itemTypeId == arrow::Type::STRING) - { - const auto keys = - std::static_pointer_cast(array->keys()); - const auto values = - std::static_pointer_cast(array->items()); - const auto nIdxStart = array->value_offset(nIdxInArray); - const int nCount = array->value_length(nIdxInArray); - CPLJSONObject oRoot; - for (int k = 0; k < nCount; k++) - { - if (!keys->IsNull(nIdxStart + k)) - { - const auto osKey = keys->GetString(nIdxStart + k); - if (!values->IsNull(nIdxStart + k)) - oRoot.Add(osKey, values->GetString(nIdxStart + k)); - else - oRoot.AddNull(osKey); - } - } - return oRoot; - } - else if (itemTypeId == arrow::Type::LIST) - { - const auto keys = - std::static_pointer_cast(array->keys()); - const auto values = - std::static_pointer_cast(array->items()); - const auto nIdxStart = array->value_offset(nIdxInArray); - const int nCount = array->value_length(nIdxInArray); - CPLJSONObject oRoot; - for (int k = 0; k < nCount; k++) - { - if (!keys->IsNull(nIdxStart + k)) - { - const auto osKey = keys->GetString(nIdxStart + k); - if (!values->IsNull(nIdxStart + k)) - oRoot.Add(osKey, ReadList(values.get(), nIdxStart + k)); - else - oRoot.AddNull(osKey); - } - } - return oRoot; - } - else if (itemTypeId == arrow::Type::MAP) - { - const auto keys = - std::static_pointer_cast(array->keys()); - const auto values = - std::static_pointer_cast(array->items()); - const auto nIdxStart = array->value_offset(nIdxInArray); - const int nCount = array->value_length(nIdxInArray); - CPLJSONObject oRoot; - for (int k = 0; k < nCount; k++) - { - if (!keys->IsNull(nIdxStart + k)) - { - const auto osKey = keys->GetString(nIdxStart + k); - if (!values->IsNull(nIdxStart + k)) - oRoot.Add(osKey, ReadMap(values.get(), nIdxStart + k)); - else - oRoot.AddNull(osKey); - } - } - return oRoot; - } - else - { - CPLDebug("ARROW", "ReadMap(): unexpected data type %s", - array->items()->type()->ToString().c_str()); - } - } - return CPLJSONObject(); -} - /************************************************************************/ /* SetPointsOfLine() */ /************************************************************************/ @@ -1763,18 +1881,10 @@ inline OGRFeature *OGRArrowLayer::ReadFeature( } case arrow::Type::DECIMAL128: - { - const auto castArray = - static_cast(array); - poFeature->SetField( - i, CPLAtof(castArray->FormatValue(nIdxInBatch).c_str())); - break; - } - case arrow::Type::DECIMAL256: { const auto castArray = - static_cast(array); + static_cast(array); poFeature->SetField( i, CPLAtof(castArray->FormatValue(nIdxInBatch).c_str())); break; @@ -1838,7 +1948,7 @@ inline OGRFeature *OGRArrowLayer::ReadFeature( const auto castArray = static_cast(array); poFeature->SetField( - i, ReadMap(castArray, nIdxInBatch) + i, GetMapAsJSON(castArray, nIdxInBatch) .Format(CPLJSONObject::PrettyFormat::Plain) .c_str()); break;