Skip to content

Commit

Permalink
[OPPRO-170] Filter validation for Parquet reader at runtime (oap-proj…
Browse files Browse the repository at this point in the history
…ect#27)

* Filter validation for Parquet reader at runtime

* Style

* Style

* Format

Removed special handling for avg (oap-project#31)

[OPPRO-173] Make batch size configurable (oap-project#32)

support dwrf format
  • Loading branch information
zhztheplayer authored and zhejiangxiaomai committed Sep 22, 2022
1 parent 0c771a9 commit b33c235
Show file tree
Hide file tree
Showing 28 changed files with 316 additions and 1,579 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# limitations under the License.
cmake_minimum_required(VERSION 3.10)
set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMake" ${CMAKE_MODULE_PATH})

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
# set the project name
project(velox)

Expand Down
4 changes: 3 additions & 1 deletion third_party/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,9 @@ if(VELOX_ENABLE_ARROW)
-DARROW_WITH_UTF8PROC=OFF
-DCMAKE_INSTALL_PREFIX=${ARROW_PREFIX}/install
-DARROW_BUILD_STATIC=ON
-DThrift_SOURCE=${THRIFT_SOURCE})
-DThrift_SOURCE=BUNDLED
-DARROW_DEPENDENCY_SOURCE=BUNDLED
-Dre2_SOURCE=AUTO)
set(ARROW_LIBDIR ${ARROW_PREFIX}/install/${CMAKE_INSTALL_LIBDIR})

add_library(thrift STATIC IMPORTED GLOBAL)
Expand Down
2 changes: 1 addition & 1 deletion velox/connectors/hive/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

add_library(velox_hive_connector HiveConnector.cpp FileHandle.cpp)

target_link_libraries(velox_hive_connector velox_connector
target_link_libraries(velox_hive_connector velox_connector velox_dwio_common_exception
velox_dwio_dwrf_reader velox_dwio_dwrf_writer velox_file)

add_library(velox_hive_partition_function HivePartitionFunction.cpp)
Expand Down
2 changes: 1 addition & 1 deletion velox/dwio/dwrf/common/Checksum.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

#include <boost/crc.hpp>
#define XXH_INLINE_ALL
#include <xxhash.h>
#include "velox/external/xxhash/xxhash.h"

namespace facebook::velox::dwrf {

Expand Down
4 changes: 2 additions & 2 deletions velox/dwio/parquet/writer/Writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@
*/

#include "velox/dwio/parquet/writer/Writer.h"
#include <arrow/c/bridge.h> // @manual
#include <arrow/c/bridge.h>
#include <arrow/table.h> // @manual
#include "velox/vector/arrow/Bridge.h"
#include "velox/vector/arrow/c/Bridge.h"

namespace facebook::velox::parquet {

Expand Down
8 changes: 7 additions & 1 deletion velox/exec/TableScan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ TableScan::TableScan(
"TableScan"),
tableHandle_(tableScanNode->tableHandle()),
columnHandles_(tableScanNode->assignments()),
driverCtx_(driverCtx) {
driverCtx_(driverCtx),
preferredBatchSize_(driverCtx->queryConfig().preferredOutputBatchSize()) {
connector_ = connector::getConnector(tableHandle_->connectorId());
}

Expand Down Expand Up @@ -139,6 +140,11 @@ bool TableScan::isFinished() {
}

void TableScan::setBatchSize() {
if (preferredBatchSize_ != 1024) {
// Not the default value.
readBatchSize_ = preferredBatchSize_;
return;
}
constexpr int64_t kMB = 1 << 20;
auto estimate = dataSource_->estimatedRowSize();
if (estimate == connector::DataSource::kUnknownRowSize) {
Expand Down
2 changes: 2 additions & 0 deletions velox/exec/TableScan.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ class TableScan : public SourceOperator {
std::unordered_map<column_index_t, std::shared_ptr<common::Filter>>
pendingDynamicFilters_;
int32_t readBatchSize_{kDefaultBatchSize};
// A preferred batch size from configuration.
uint32_t preferredBatchSize_;

// String shown in ExceptionContext inside DataSource and LazyVector loading.
std::string debugString_;
Expand Down
2 changes: 1 addition & 1 deletion velox/external/duckdb/duckdb.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -397,7 +397,7 @@ namespace duckdb {
DUCKDB_API void DuckDBAssertInternal(bool condition, const char *condition_name, const char *file, int linenr);
}

#define D_ASSERT(condition) duckdb::DuckDBAssertInternal(bool(condition), #condition, __FILE__, __LINE__)
#define D_ASSERT(condition) ::duckdb::DuckDBAssertInternal(bool(condition), #condition, __FILE__, __LINE__)

#endif

Expand Down
2 changes: 1 addition & 1 deletion velox/substrait/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ set(SRCS
SubstraitParser.cpp
SubstraitToVeloxExpr.cpp
SubstraitToVeloxPlan.cpp
TypeUtils.cpp
TypeUtils.cpp_out
SubstraitExtensionCollector.cpp
VeloxToSubstraitExpr.cpp
VeloxToSubstraitPlan.cpp
Expand Down
41 changes: 8 additions & 33 deletions velox/substrait/SubstraitParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ std::shared_ptr<SubstraitParser::SubstraitType> SubstraitParser::parseType(
switch (substraitType.kind_case()) {
case ::substrait::Type::KindCase::kBool: {
typeName = "BOOLEAN";
nullability = sType.bool_().nullability();
nullability = substraitType.bool_().nullability();
break;
}
case ::substrait::Type::KindCase::kI8: {
Expand Down Expand Up @@ -115,7 +115,7 @@ std::shared_ptr<SubstraitParser::SubstraitType> SubstraitParser::parseType(
}
case ::substrait::Type::KindCase::kDate: {
typeName = "DATE";
nullability = sType.date().nullability();
nullability = substraitType.date().nullability();
break;
}
default:
Expand Down Expand Up @@ -217,39 +217,14 @@ std::string SubstraitParser::findSubstraitFuncSpec(
return map[id];
}

std::string SubstraitParser::getFunctionName(
const std::string& functionSpec) const {
std::string SubstraitParser::getSubFunctionName(
const std::string& subFuncSpec) const {
// Get the position of ":" in the function name.
std::size_t pos = functionSpec.find(":");
std::size_t pos = subFuncSpec.find(":");
if (pos == std::string::npos) {
return functionSpec;
return subFuncSpec;
}
return functionSpec.substr(0, pos);
}

void SubstraitParser::getFunctionTypes(
const std::string& functionSpec,
std::vector<std::string>& types) const {
types.clear();
// Get the position of ":" in the function name.
std::size_t pos = functionSpec.find(":");
// Get the parameter types.
std::string funcTypes;
if (pos == std::string::npos) {
return;
} else {
if (pos == functionSpec.size() - 1) {
return;
}
funcTypes = functionSpec.substr(pos + 1);
}
// Split the types with delimiter.
std::string delimiter = "_";
while ((pos = funcTypes.find(delimiter)) != std::string::npos) {
types.emplace_back(funcTypes.substr(0, pos));
funcTypes.erase(0, pos + delimiter.length());
}
types.emplace_back(funcTypes);
return subFuncSpec.substr(0, pos);
}

void SubstraitParser::getSubFunctionTypes(
Expand Down Expand Up @@ -293,7 +268,7 @@ std::string SubstraitParser::mapToVeloxFunction(

// If not finding the mapping from Substrait function name to Velox function
// name, the original Substrait function name will be used.
return substraitFunction;
return subFunc;
}

} // namespace facebook::velox::substrait
9 changes: 2 additions & 7 deletions velox/substrait/SubstraitParser.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,18 +65,13 @@ class SubstraitParser {
/// Currently, the input types in the function specification are not used. But
/// in the future, they should be used for the validation according the
/// specifications in Substrait yaml files.
const std::string& findFunctionSpec(
std::string findSubstraitFuncSpec(
const std::unordered_map<uint64_t, std::string>& functionMap,
uint64_t id) const;

/// Extracts the function name for a function from specified compound name.
/// When the input is a simple name, it will be returned.
std::string getFunctionName(const std::string& functionSpec) const;

/// Extracts argument types for a function from specified compound name.
void getFunctionTypes(
const std::string& functionSpec,
std::vector<std::string>& types) const;
std::string getSubFunctionName(const std::string& functionSpec) const;

/// This function is used get the types from the compound name.
void getSubFunctionTypes(
Expand Down
Loading

0 comments on commit b33c235

Please sign in to comment.