Skip to content

Commit

Permalink
Project skips scan if the filter covers all the columns (#117)
Browse files Browse the repository at this point in the history
  • Loading branch information
eddyxu authored Aug 18, 2022
1 parent beb1d92 commit a71b508
Show file tree
Hide file tree
Showing 8 changed files with 163 additions and 5 deletions.
3 changes: 2 additions & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ function(add_lance_test test_name)
target_link_libraries(${test_name}
Catch2::Catch2WithMain
lance
$<TARGET_OBJECTS:lance_testing>
)
target_include_directories(${test_name} SYSTEM PRIVATE ${ARROW_INCLUDE_DIR})
target_include_directories(${test_name} SYSTEM PRIVATE ${PARQUET_INCLUDE_DIR})
Expand All @@ -107,7 +108,7 @@ endfunction()
if (NOT CMAKE_BUILD_TYPE STREQUAL "Release")
include(CTest)
include(Catch)
set(test_libs Catch2::Catch2WithMain)
set(test_libs Catch2::Catch2WithMain $<TARGET_OBJECTS:lance_testing>)
enable_testing()
endif ()

Expand Down
4 changes: 4 additions & 0 deletions cpp/src/lance/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,7 @@ add_subdirectory(arrow)
add_subdirectory(encodings)
add_subdirectory(format)
add_subdirectory(io)

if (CMAKE_BUILD_TYPE STREQUAL Debug)
add_subdirectory(testing)
endif ()
1 change: 1 addition & 0 deletions cpp/src/lance/io/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,5 @@ add_dependencies(io format)

add_lance_test(filter_test)
add_lance_test(limit_test)
add_lance_test(project_test)
add_lance_test(reader_test)
14 changes: 10 additions & 4 deletions cpp/src/lance/io/project.cc
Original file line number Diff line number Diff line change
Expand Up @@ -80,10 +80,16 @@ ::arrow::Result<std::shared_ptr<::arrow::RecordBatch>> Project::Execute(
std::static_pointer_cast<decltype(indices)::element_type>(indices->Slice(offset, len));
values = values->Slice(offset, len);
}
ARROW_ASSIGN_OR_RAISE(auto batch, reader->ReadBatch(*scan_schema_, batch_id, indices));
assert(values->num_rows() == batch->num_rows());
ARROW_ASSIGN_OR_RAISE(auto merged, lance::arrow::MergeRecordBatches(values, batch));
return merged;
if (scan_schema_->fields().empty()) {
// No extra columns other than the filtered columns need to be read, for example,
// SELECT id FROM t WHERE id > 10
return values;
} else {
ARROW_ASSIGN_OR_RAISE(auto batch, reader->ReadBatch(*scan_schema_, batch_id, indices));
assert(values->num_rows() == batch->num_rows());
ARROW_ASSIGN_OR_RAISE(auto merged, lance::arrow::MergeRecordBatches(values, batch));
return merged;
}
} else {
// Read without filter.
if (limit_) {
Expand Down
60 changes: 60 additions & 0 deletions cpp/src/lance/io/project_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
// Copyright 2022 Lance Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lance/io/project.h"

#include <arrow/compute/exec/expression.h>
#include <arrow/dataset/dataset.h>
#include <arrow/table.h>
#include <arrow/type.h>

#include <catch2/catch_test_macros.hpp>
#include <string>
#include <vector>

#include "lance/arrow/scanner.h"
#include "lance/arrow/stl.h"
#include "lance/format/schema.h"
#include "lance/io/filter.h"
#include "lance/io/limit.h"
#include "lance/io/reader.h"
#include "lance/testing/io.h"

TEST_CASE("Project schema") {
auto schema =
::arrow::schema({arrow::field("k", arrow::int16()), arrow::field("v", arrow::int32())});
auto arr = arrow::StructArray::Make(::arrow::ArrayVector({
lance::arrow::ToArray<int16_t>({1, 2, 3, 4}).ValueOrDie(),
lance::arrow::ToArray<int32_t>({10, 20, 30, 40}).ValueOrDie(),
}),
std::vector<std::string>({"k", "v"}))
.ValueOrDie();
auto tbl =
arrow::Table::FromRecordBatches({arrow::RecordBatch::FromStructArray(arr).ValueOrDie()})
.ValueOrDie();
auto dataset = std::make_shared<arrow::dataset::InMemoryDataset>(tbl);

auto scan_builder = lance::arrow::ScannerBuilder(dataset);
scan_builder.Project({"v"});
scan_builder.Filter(
arrow::compute::equal(arrow::compute::field_ref("v"), arrow::compute::literal(20)));
auto scanner = scan_builder.Finish().ValueOrDie();

auto lance_schema = lance::format::Schema(schema);
auto project = lance::io::Project::Make(lance_schema, scanner->options()).ValueOrDie();

auto reader = lance::testing::MakeReader(tbl).ValueOrDie();
auto batch = project->Execute(reader, 0).ValueOrDie();
CHECK(batch->GetColumnByName("v")->Equals(lance::arrow::ToArray({20}).ValueOrDie()));
}
21 changes: 21 additions & 0 deletions cpp/src/lance/testing/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Copyright Lance Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

add_library(
lance_testing
OBJECT
io.cc
io.h
)
target_include_directories(lance_testing SYSTEM PRIVATE ${Protobuf_INCLUDE_DIR})
35 changes: 35 additions & 0 deletions cpp/src/lance/testing/io.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
// Copyright 2022 Lance Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lance/testing/io.h"

#include <arrow/io/api.h>
#include <arrow/result.h>

#include "lance/arrow/writer.h"
#include "lance/io/reader.h"

namespace lance::testing {

::arrow::Result<std::shared_ptr<io::FileReader>> MakeReader(
const std::shared_ptr<::arrow::Table>& table) {
auto sink = ::arrow::io::BufferOutputStream::Create().ValueOrDie();
ARROW_RETURN_NOT_OK(lance::arrow::WriteTable(*table, sink));
auto infile = make_shared<::arrow::io::BufferReader>(sink->Finish().ValueOrDie());
auto reader = std::make_shared<io::FileReader>(infile);
ARROW_RETURN_NOT_OK(reader->Open());
return reader;
}

} // namespace lance::testing
30 changes: 30 additions & 0 deletions cpp/src/lance/testing/io.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
// Copyright 2022 Lance Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <arrow/result.h>
#include <arrow/table.h>

#include <memory>

#include "lance/io/reader.h"

namespace lance::testing {

/// Make lance::io::FileReader from an Arrow Table.
::arrow::Result<std::shared_ptr<lance::io::FileReader>> MakeReader(
const std::shared_ptr<::arrow::Table>& table);

} // namespace lance::testing

0 comments on commit a71b508

Please sign in to comment.