From 0d6b1b848014ce4870bc0cc72408c18d021df787 Mon Sep 17 00:00:00 2001 From: "jay.narale" Date: Fri, 22 Mar 2024 02:18:31 +0000 Subject: [PATCH] Fix Parquet Complex type handling --- velox/dwio/parquet/reader/ParquetReader.cpp | 37 ++++++++++++++++--- .../dwio/parquet/reader/ParquetTypeWithId.cpp | 15 ++++---- velox/dwio/parquet/reader/ParquetTypeWithId.h | 6 +++ .../tests/reader/ParquetTableScanTest.cpp | 13 +++---- 4 files changed, 49 insertions(+), 22 deletions(-) diff --git a/velox/dwio/parquet/reader/ParquetReader.cpp b/velox/dwio/parquet/reader/ParquetReader.cpp index 72f36943896f2..0ef7b65189b09 100644 --- a/velox/dwio/parquet/reader/ParquetReader.cpp +++ b/velox/dwio/parquet/reader/ParquetReader.cpp @@ -235,6 +235,8 @@ std::shared_ptr ReaderBase::getParquetColumnInfo( auto& schema = fileMetaData_->schema; uint32_t curSchemaIdx = schemaIdx; auto& schemaElement = schema[curSchemaIdx]; + bool isRepeated = false; + bool isOptional = false; if (schemaElement.__isset.repetition_type) { if (schemaElement.repetition_type != @@ -244,6 +246,11 @@ std::shared_ptr ReaderBase::getParquetColumnInfo( if (schemaElement.repetition_type == thrift::FieldRepetitionType::REPEATED) { maxRepeat++; + isRepeated = true; + } + if (schemaElement.repetition_type == + thrift::FieldRepetitionType::OPTIONAL) { + isOptional = true; } } @@ -296,7 +303,9 @@ std::shared_ptr ReaderBase::getParquetColumnInfo( std::nullopt, std::nullopt, maxRepeat, - maxDefine); + maxDefine, + isOptional, + isRepeated); } // For backward-compatibility, a group annotated with MAP_KEY_VALUE @@ -309,6 +318,10 @@ std::shared_ptr ReaderBase::getParquetColumnInfo( VELOX_CHECK_EQ(children.size(), 1); const auto& child = children[0]; auto grandChildren = child->getChildren(); + isRepeated = true; + // This level will not have the "isRepeated" info in the parquet schema since parquet schema will have a child layer which will have the "repeated info" + // which we are ignoring here, hence we set the isRepeated to true + // eg https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists return std::make_shared( child->type(), std::move(grandChildren), @@ -319,7 +332,9 @@ std::shared_ptr ReaderBase::getParquetColumnInfo( std::nullopt, std::nullopt, maxRepeat + 1, - maxDefine); + maxDefine, + isOptional, + isRepeated); } default: @@ -346,7 +361,9 @@ std::shared_ptr ReaderBase::getParquetColumnInfo( std::nullopt, std::nullopt, maxRepeat, - maxDefine); + maxDefine, + isOptional, + isRepeated); } else if (children.size() == 2) { // children of MAP auto childrenCopy = children; @@ -361,7 +378,9 @@ std::shared_ptr ReaderBase::getParquetColumnInfo( std::nullopt, std::nullopt, maxRepeat, - maxDefine); + maxDefine, + isOptional, + isRepeated); } } else { // Row type @@ -376,7 +395,9 @@ std::shared_ptr ReaderBase::getParquetColumnInfo( std::nullopt, std::nullopt, maxRepeat, - maxDefine); + maxDefine, + isOptional, + isRepeated); } } } else { // leaf node @@ -403,6 +424,8 @@ std::shared_ptr ReaderBase::getParquetColumnInfo( logicalType_, maxRepeat, maxDefine, + isOptional, + isRepeated, precision, scale, type_length); @@ -423,7 +446,9 @@ std::shared_ptr ReaderBase::getParquetColumnInfo( std::nullopt, std::nullopt, maxRepeat, - maxDefine - 1); + maxDefine - 1, + isOptional, + isRepeated); } return leafTypePtr; } diff --git a/velox/dwio/parquet/reader/ParquetTypeWithId.cpp b/velox/dwio/parquet/reader/ParquetTypeWithId.cpp index 2846391f6f779..1114b7db103f2 100644 --- a/velox/dwio/parquet/reader/ParquetTypeWithId.cpp +++ b/velox/dwio/parquet/reader/ParquetTypeWithId.cpp @@ -51,15 +51,14 @@ bool ParquetTypeWithId::hasNonRepeatedLeaf() const { } LevelMode ParquetTypeWithId::makeLevelInfo(LevelInfo& info) const { - int16_t repeatedAncestor = 0; - for (auto parent = parquetParent(); parent; - parent = parent->parquetParent()) { - if (parent->type()->kind() == TypeKind::ARRAY || - parent->type()->kind() == TypeKind::MAP) { - repeatedAncestor = parent->maxDefine_; - break; + int16_t repeatedAncestor = maxDefine_; + auto node = this; + do { + if (node->isOptional_) { + repeatedAncestor--; } - } + node = node->parquetParent(); + } while (node && !node->isRepeated_); bool isList = type()->kind() == TypeKind::ARRAY; bool isStruct = type()->kind() == TypeKind::ROW; bool isMap = type()->kind() == TypeKind::MAP; diff --git a/velox/dwio/parquet/reader/ParquetTypeWithId.h b/velox/dwio/parquet/reader/ParquetTypeWithId.h index 4fc6b347f2213..720d074ee3cfb 100644 --- a/velox/dwio/parquet/reader/ParquetTypeWithId.h +++ b/velox/dwio/parquet/reader/ParquetTypeWithId.h @@ -45,6 +45,8 @@ class ParquetTypeWithId : public dwio::common::TypeWithId { std::optional logicalType, uint32_t maxRepeat, uint32_t maxDefine, + bool isOptional, + bool isRepeated, int32_t precision = 0, int32_t scale = 0, int32_t typeLength = 0) @@ -54,6 +56,8 @@ class ParquetTypeWithId : public dwio::common::TypeWithId { logicalType_(std::move(logicalType)), maxRepeat_(maxRepeat), maxDefine_(maxDefine), + isOptional_(isOptional), + isRepeated_(isRepeated), precision_(precision), scale_(scale), typeLength_(typeLength) {} @@ -79,6 +83,8 @@ class ParquetTypeWithId : public dwio::common::TypeWithId { const std::optional logicalType_; const uint32_t maxRepeat_; const uint32_t maxDefine_; + const bool isOptional_; + const bool isRepeated_; const int32_t precision_; const int32_t scale_; const int32_t typeLength_; diff --git a/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp b/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp index 1ec63e3e793a3..a2e436957dbb0 100644 --- a/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp +++ b/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp @@ -303,7 +303,7 @@ TEST_F(ParquetTableScanTest, singleRowStruct) { } // Core dump and incorrect result are fixed. -TEST_F(ParquetTableScanTest, DISABLED_array) { +TEST_F(ParquetTableScanTest, array) { auto vector = makeArrayVector({{1, 2, 3}}); loadData( @@ -320,8 +320,7 @@ TEST_F(ParquetTableScanTest, DISABLED_array) { } // Optional array with required elements. -// Incorrect result. -TEST_F(ParquetTableScanTest, DISABLED_optArrayReqEle) { +TEST_F(ParquetTableScanTest, optArrayReqEle) { auto vector = makeArrayVector({}); loadData( @@ -342,7 +341,7 @@ TEST_F(ParquetTableScanTest, DISABLED_optArrayReqEle) { // Required array with required elements. // Core dump is fixed, but the result is incorrect. -TEST_F(ParquetTableScanTest, DISABLED_reqArrayReqEle) { +TEST_F(ParquetTableScanTest, reqArrayReqEle) { auto vector = makeArrayVector({}); loadData( @@ -362,8 +361,7 @@ TEST_F(ParquetTableScanTest, DISABLED_reqArrayReqEle) { } // Required array with optional elements. -// Incorrect result. -TEST_F(ParquetTableScanTest, DISABLED_reqArrayOptEle) { +TEST_F(ParquetTableScanTest, reqArrayOptEle) { auto vector = makeArrayVector({}); loadData( @@ -383,8 +381,7 @@ TEST_F(ParquetTableScanTest, DISABLED_reqArrayOptEle) { } // Required array with legacy format. -// Incorrect result. -TEST_F(ParquetTableScanTest, DISABLED_reqArrayLegacy) { +TEST_F(ParquetTableScanTest, reqArrayLegacy) { auto vector = makeArrayVector({}); loadData(