Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DUCKDB] Native duckdb lance reader #347

Merged
merged 20 commits into from
Dec 5, 2022
Merged
1 change: 1 addition & 0 deletions .clang-format
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ BasedOnStyle: Google
ColumnLimit: 100
BinPackArguments: false
BinPackParameters: false
ReferenceAlignment: Left
---
Language: Proto
BasedOnStyle: Google
16 changes: 16 additions & 0 deletions .github/workflows/duckdb.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,20 @@ jobs:
defaults:
run:
working-directory: ./integration/duckdb
env:
ArrowVersion: 10.0.1-1
steps:
- uses: actions/checkout@v2
- name: ccache
uses: hendrikmuhs/ccache-action@v1
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y -V ca-certificates lsb-release wget
wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
sudo apt update
sudo apt install -y -V libarrow-dev=${ArrowVersion} libarrow-dataset-dev=${ArrowVersion} libparquet-dev=${ArrowVersion}
- name: Cmake
run: cmake -B build
- name: Build
Expand All @@ -30,6 +40,12 @@ jobs:
working-directory: ./integration/duckdb
steps:
- uses: actions/checkout@v2
- name: Install dependencies
run: |
brew update
cd $(brew --repository)
git checkout 3.6.8 # Arrow 10.0
brew install apache-arrow
- name: Cmake
run: cmake -B build
- name: Build
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/lance/format/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
protobuf_generate_cpp(
PROTO_SRCS
PROTO_HDRS
${CMAKE_SOURCE_DIR}/../protos/format.proto
${PROJECT_SOURCE_DIR}/../protos/format.proto
)

add_library(
Expand Down
19 changes: 16 additions & 3 deletions integration/duckdb/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ if(POLICY CMP0135)
cmake_policy(SET CMP0135 NEW)
endif()

add_compile_options(-mf16c) # opencv
#add_compile_options(-mf16c) # opencv

project(lance_duckdb CXX)
option(LANCE_BUILD_PYTORCH "Build with PyTorch" TRUE)
Expand Down Expand Up @@ -88,7 +88,7 @@ endif()

FetchContent_MakeAvailable(${available_contents})

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED True)

include_directories(${duckdb_SOURCE_DIR}/src/include)
Expand All @@ -109,12 +109,24 @@ if(LANCE_BUILD_PYTORCH)
include_directories(${OpenCV_INCLUDE_DIRS})
endif()

# Add lance core as dependency
find_package(Arrow REQUIRED)
find_package(ArrowDataset REQUIRED)
include_directories(${CMAKE_BINARY_DIR}/lance/src ../../cpp/include ../../cpp/src)
add_subdirectory(../../cpp lance)

include_directories(src)

set(LANCE_EXT_SOURCE_COMMON
src/lance/duckdb/lance_reader.cc
src/lance/duckdb/lance_reader.h
src/lance/duckdb/lance-extension.cc
src/lance/duckdb/lance.cc
src/lance/duckdb/lance.h
src/lance/duckdb/list_functions.cc
src/lance/duckdb/vector_functions.cc)
src/lance/duckdb/list_functions.h
src/lance/duckdb/vector_functions.cc
)

set(LANCE_EXT_SOURCE_ML
src/lance/duckdb/ml/catalog.cc
Expand All @@ -132,6 +144,7 @@ endif()
# add_library(lance_extension STATIC ${LANCE_EXT_SOURCES})
set(PARAMETERS "-warnings")
build_loadable_extension(lance ${PARAMETERS} ${LANCE_EXT_SOURCES})
target_link_libraries(lance_loadable_extension lance ArrowDataset::arrow_dataset_shared fmt::fmt)

if(LANCE_BUILD_PYTORCH)
target_link_libraries(lance_loadable_extension "${TORCH_LIBRARIES}"
Expand Down
12 changes: 10 additions & 2 deletions integration/duckdb/src/lance/duckdb/lance-extension.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,10 @@

#include <duckdb.hpp>

#include "lance/duckdb/lance_reader.h"
#include "lance/duckdb/list_functions.h"
#include "lance/duckdb/vector_functions.h"
#include "lance/duckdb/ml/functions.h"
#include "lance/duckdb/vector_functions.h"

namespace duckdb {

Expand All @@ -29,6 +30,7 @@ void LanceExtension::Load(::duckdb::DuckDB &db) {
con.BeginTransaction();
auto &context = *con.context;
auto &catalog = ::duckdb::Catalog::GetCatalog(context);
auto &config = DBConfig::GetConfig(*db.instance);

for (auto &func : lance::duckdb::GetListFunctions()) {
catalog.CreateFunction(context, func.get());
Expand All @@ -46,11 +48,17 @@ void LanceExtension::Load(::duckdb::DuckDB &db) {
catalog.CreateTableFunction(context, func.get());
}

auto scan_func = lance::duckdb::GetLanceReaderFunction();
::duckdb::CreateTableFunctionInfo scan(scan_func);
catalog.CreateTableFunction(context, &scan);

config.replacement_scans.emplace_back(lance::duckdb::LanceScanReplacement);

con.Commit();
}

/// Name under which this extension is known to DuckDB.
std::string LanceExtension::Name() {
  return "lance";
}
};
}; // namespace duckdb

extern "C" {

Expand Down
2 changes: 2 additions & 0 deletions integration/duckdb/src/lance/duckdb/lance-extension.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@ namespace duckdb {

/// DuckDB extension entry point for Lance.
class LanceExtension : public Extension {
public:

/// Register the Lance functions with the catalog of the given database.
///
/// \param db the DuckDB database the extension is being loaded into
void Load(DuckDB &db) override;

/// \return the name of this extension
std::string Name() override;
};

Expand Down
98 changes: 98 additions & 0 deletions integration/duckdb/src/lance/duckdb/lance.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
// Copyright 2022 Lance Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

#include "lance/duckdb/lance.h"

#include <arrow/type.h>

#include <duckdb/common/exception.hpp>
#include <vector>

namespace lance::duckdb {

namespace {

/// A dictionary type is exposed to DuckDB as the logical type of the dictionary's value type.
inline ::duckdb::LogicalType ToLogicalType(const ::arrow::DictionaryType& dtype) {
  const auto& value_type = *dtype.value_type();
  return lance::duckdb::ToLogicalType(value_type);
}

/// Build a DuckDB STRUCT type whose members mirror the fields of an Arrow struct type.
///
/// Every field keeps its name; its type is converted with lance::duckdb::ToLogicalType.
inline ::duckdb::LogicalType ToLogicalType(const ::arrow::StructType& struct_type) {
  ::duckdb::child_list_t<::duckdb::LogicalType> members;
  for (int idx = 0; idx < struct_type.num_fields(); ++idx) {
    const auto& field = struct_type.field(idx);
    auto member_type = lance::duckdb::ToLogicalType(*field->type());
    members.emplace_back(field->name(), std::move(member_type));
  }
  return ::duckdb::LogicalType::STRUCT(members);
}

/// Convert an Arrow list-like type into a DuckDB LIST of the converted value type.
///
/// \tparam L the concrete Arrow list class that `dtype` is cast to; it must expose `value_type()`
template <typename L>
inline ::duckdb::LogicalType ToLogicalType(const ::arrow::DataType& dtype) {
  const auto& element_type = *dynamic_cast<const L&>(dtype).value_type();
  return ::duckdb::LogicalType::LIST(lance::duckdb::ToLogicalType(element_type));
}

} // namespace

/// \brief Convert an Arrow data type into the corresponding DuckDB logical type.
///
/// Integer types map to the DuckDB integer type of the same width and signedness. HALF_FLOAT is
/// widened to FLOAT. String and binary types (including their "large" variants) map to VARCHAR
/// and BLOB. A dictionary type maps to the type of its values, and struct, list and
/// fixed-size-list types are converted recursively.
///
/// \param arrow_type the Arrow type to convert
/// \return the DuckDB logical type for `arrow_type`
/// \throws ::duckdb::InvalidInputException if `arrow_type` has no supported mapping
::duckdb::LogicalType ToLogicalType(const ::arrow::DataType& arrow_type) {
  switch (arrow_type.id()) {
    case ::arrow::Type::BOOL:
      return ::duckdb::LogicalType::BOOLEAN;
    case ::arrow::Type::INT8:
      return ::duckdb::LogicalType::TINYINT;
    case ::arrow::Type::UINT8:
      return ::duckdb::LogicalType::UTINYINT;
    case ::arrow::Type::INT16:
      return ::duckdb::LogicalType::SMALLINT;
    case ::arrow::Type::UINT16:
      return ::duckdb::LogicalType::USMALLINT;
    case ::arrow::Type::INT32:
      return ::duckdb::LogicalType::INTEGER;
    case ::arrow::Type::UINT32:
      return ::duckdb::LogicalType::UINTEGER;
    case ::arrow::Type::INT64:
      return ::duckdb::LogicalType::BIGINT;
    case ::arrow::Type::UINT64:
      // 64-bit unsigned values need UBIGINT; the 32-bit UINTEGER cannot hold them.
      return ::duckdb::LogicalType::UBIGINT;
    case ::arrow::Type::FLOAT:
    case ::arrow::Type::HALF_FLOAT:
      return ::duckdb::LogicalType::FLOAT;
    case ::arrow::Type::DOUBLE:
      return ::duckdb::LogicalType::DOUBLE;
    case ::arrow::Type::STRING:
    case ::arrow::Type::LARGE_STRING:
      return ::duckdb::LogicalType::VARCHAR;
    case ::arrow::Type::BINARY:
    case ::arrow::Type::LARGE_BINARY:
      return ::duckdb::LogicalType::BLOB;
    case ::arrow::Type::TIME32:
    case ::arrow::Type::TIME64:
      return ::duckdb::LogicalType::TIME;
    case ::arrow::Type::TIMESTAMP:
      return ::duckdb::LogicalType::TIMESTAMP;
    case ::arrow::Type::DATE32:
    case ::arrow::Type::DATE64:
      return ::duckdb::LogicalType::DATE;
    case ::arrow::Type::DICTIONARY:
      return ToLogicalType(dynamic_cast<const ::arrow::DictionaryType&>(arrow_type));
    case ::arrow::Type::STRUCT:
      return ToLogicalType(dynamic_cast<const ::arrow::StructType&>(arrow_type));
    case ::arrow::Type::LIST:
      return ToLogicalType<::arrow::ListType>(arrow_type);
    case ::arrow::Type::FIXED_SIZE_LIST:
      return ToLogicalType<::arrow::FixedSizeListType>(arrow_type);
    default:
      throw ::duckdb::InvalidInputException("Does not support type: %s",
                                            arrow_type.ToString().c_str());
  }
}

} // namespace lance::duckdb
47 changes: 47 additions & 0 deletions integration/duckdb/src/lance/duckdb/lance.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
// Copyright 2022 Lance Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

#pragma once

/// \brief Lance Core Adaptors and utilities

#include <arrow/result.h>
#include <arrow/status.h>
#include <arrow/type_fwd.h>

#include <duckdb/common/exception.hpp>
#include <duckdb/common/types.hpp>

namespace lance::duckdb {

/// \brief Unwrap an Arrow result, turning a failure into a DuckDB exception.
///
/// \tparam T the value type carried by the result
/// \tparam E the exception type to throw on failure; it is constructed from the status message
/// \param result the Arrow result to consume
/// \return the value held by `result`, moved out of it
/// \throws E if `result` does not hold a value
template <typename T, typename E = ::duckdb::IOException>
T GetResult(::arrow::Result<T>&& result) {
  if (!result.ok()) {
    throw E(result.status().message());
  }
  return std::move(result).ValueOrDie();
}

/// \brief Throw a DuckDB exception if an Arrow status reports a failure.
///
/// \tparam E the exception type to throw; it is constructed from the status message
/// \param status the Arrow status to inspect
/// \throws E if `status` is not OK
template <typename E = ::duckdb::IOException>
void CheckStatus(const ::arrow::Status& status) {
  if (status.ok()) {
    return;
  }
  throw E(status.message());
}

/// \brief Convert Arrow and Lance types into a DuckDB logical type.
///
/// \param arrow_type the Arrow data type to convert
/// \return the DuckDB logical type that corresponds to `arrow_type`
/// \throws ::duckdb::InvalidInputException if `arrow_type` has no supported mapping
::duckdb::LogicalType ToLogicalType(const ::arrow::DataType& arrow_type);

} // namespace lance::duckdb
Loading