Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Plain encoding supports writing boolean array #120

Merged
merged 6 commits into from
Aug 22, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cpp/src/lance/arrow/stl.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ ToArray(const std::vector<T>& vec, ::arrow::MemoryPool* pool = ::arrow::default_
using ArrayType = typename ::arrow::TypeTraits<ArrowType>::ArrayType;
typename ::arrow::TypeTraits<ArrowType>::BuilderType builder(pool);
ARROW_RETURN_NOT_OK(builder.Reserve(vec.size()));
for (auto& v : vec) {
for (const auto& v : vec) {
ARROW_RETURN_NOT_OK(builder.Append(v));
}
auto result = builder.Finish();
Expand Down
9 changes: 8 additions & 1 deletion cpp/src/lance/arrow/writer_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
#include "lance/io/reader.h"

using arrow::ArrayBuilder;
using arrow::BooleanBuilder;
using arrow::Int32Builder;
using arrow::LargeBinaryBuilder;
using arrow::ListBuilder;
Expand All @@ -58,11 +59,13 @@ auto CocoDataset() {
auto schema = arrow::schema({arrow::field("filename", arrow::utf8()),
arrow::field("split", arrow::utf8()),
arrow::field("width", arrow::int32()),
arrow::field("valid", arrow::boolean()),
arrow::field("image", image_type),
arrow::field("annotations", arrow::list(annotationType))});

StringBuilder stringBuilder;
Int32Builder intBuilder;
BooleanBuilder booleanBuilder;

CHECK(stringBuilder.AppendValues({"1.jpg", "2.jpg", "3.jpg", "4.jpg"}).ok());
auto filenameArr = stringBuilder.Finish().ValueOrDie();
Expand All @@ -76,6 +79,10 @@ auto CocoDataset() {
auto width = intBuilder.Finish().ValueOrDie();
intBuilder.Reset();

CHECK(booleanBuilder.AppendValues(std::vector<bool>({true, true, false, true})).ok());
auto valid = booleanBuilder.Finish().ValueOrDie();
booleanBuilder.Reset();

auto uriBuilder = std::make_shared<StringBuilder>();
auto imageBuilder = std::make_shared<StructBuilder>(
image_type, arrow::default_memory_pool(), vector<shared_ptr<ArrayBuilder>>({uriBuilder}));
Expand Down Expand Up @@ -115,7 +122,7 @@ auto CocoDataset() {
auto annotations = listBuilder.Finish().ValueOrDie();
listBuilder.Reset();

auto table = arrow::Table::Make(schema, {filenameArr, split, width, images, annotations});
auto table = arrow::Table::Make(schema, {filenameArr, split, width, valid, images, annotations});
return table;
}

Expand Down
51 changes: 50 additions & 1 deletion cpp/src/lance/encodings/plain.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,30 @@ namespace lance::encodings {

PlainEncoder::PlainEncoder(std::shared_ptr<::arrow::io::OutputStream> out) : Encoder(out) {}

::arrow::Status WriteBooleanArray(const std::shared_ptr<::arrow::io::OutputStream>& out,
const std::shared_ptr<::arrow::BooleanArray>& arr) {
// TODO: Boolean array is not necessarily aligned with the byte boundary:
// See ::arrow::BooleanArray::Value(int64)
// Is there a faster way to write / shift the boolean array?
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would any of the bit packing / unpacking utilities from Arrow work? https://github.com/apache/arrow/tree/master/cpp/src/arrow/util

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks. will keep looking it in following PRs

::arrow::BooleanBuilder builder;
ARROW_RETURN_NOT_OK(builder.Reserve(arr->length()));
for (int i = 0; i < arr->length(); i++) {
ARROW_RETURN_NOT_OK(builder.Append(arr->Value(i)));
}
ARROW_ASSIGN_OR_RAISE(auto written_arr, builder.Finish());
return out->Write(std::dynamic_pointer_cast<::arrow::BooleanArray>(written_arr)->values());
}

::arrow::Result<int64_t> PlainEncoder::Write(std::shared_ptr<::arrow::Array> arr) {
auto data_type = arr->type();

ARROW_ASSIGN_OR_RAISE(auto value_offset, out_->Tell());
// TODO: support more types.
switch (data_type->id()) {
case ::arrow::Type::BOOL:
ARROW_RETURN_NOT_OK(
WriteBooleanArray(out_, std::dynamic_pointer_cast<::arrow::BooleanArray>(arr)));
break;
case ::arrow::Type::INT8:
ARROW_RETURN_NOT_OK(out_->Write(std::static_pointer_cast<::arrow::Int8Array>(arr)->values()));
break;
Expand Down Expand Up @@ -145,6 +163,37 @@ class PlainDecoderImpl : public Decoder {
using BuilderType = typename ::arrow::TypeTraits<T>::BuilderType;
};

class BooleanPlainDecoderImpl : public PlainDecoderImpl<::arrow::BooleanType> {
public:
using PlainDecoderImpl::PlainDecoderImpl;

/// Get one single scalar value from the column.
::arrow::Result<std::shared_ptr<::arrow::Scalar>> GetScalar(int64_t idx) const override {
int64_t offset = idx / 8;
uint8_t byte;
ARROW_RETURN_NOT_OK(infile_->ReadAt(position_ + offset, 1, &byte));
return std::make_shared<::arrow::BooleanScalar>(
::arrow::bit_util::GetBitFromByte(byte, idx % 8));
}

::arrow::Result<std::shared_ptr<::arrow::Array>> ToArray(
int32_t start, std::optional<int32_t> length) const override {
if (!length.has_value()) {
length = length_ - start;
}
if (start + length.value() > length_ || start > length_) {
return ::arrow::Status::IndexError(
fmt::format("PlainDecoder::ToArray: out of range: start={}, length={}, page_length={}\n",
start,
length.value(),
length_));
}
int64_t byte_length = ::arrow::bit_util::BytesForBits(length.value());
ARROW_ASSIGN_OR_RAISE(auto buf, infile_->ReadAt(position_ + start / 8, byte_length));
return std::make_shared<::arrow::BooleanArray>(length.value(), buf);
}
};

} // namespace

PlainDecoder::PlainDecoder(std::shared_ptr<::arrow::io::RandomAccessFile> infile,
Expand All @@ -156,7 +205,7 @@ PlainDecoder::~PlainDecoder() {}
::arrow::Status PlainDecoder::Init() {
switch (type_->id()) {
case ::arrow::Type::BOOL:
impl_.reset(new PlainDecoderImpl<::arrow::BooleanType>(infile_, type_));
impl_.reset(new BooleanPlainDecoderImpl(infile_, type_));
break;
case ::arrow::Type::INT8:
impl_.reset(new PlainDecoderImpl<::arrow::Int8Type>(infile_, type_));
Expand Down
32 changes: 32 additions & 0 deletions cpp/src/lance/encodings/plain_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include "lance/arrow/stl.h"

using arrow::Int32Builder;
using lance::arrow::ToArray;

TEST_CASE("Test Write Int32 array") {
auto arr = lance::arrow::ToArray({1, 2, 3, 4, 5, 6, 7, 8}).ValueOrDie();
Expand Down Expand Up @@ -66,3 +67,34 @@ TEST_CASE("Test take plain values") {
INFO("Indices " << indices->ToString() << " Actual " << actual->ToString());
CHECK(actual->Equals(indices));
}

TEST_CASE("Write boolean array") {
arrow::BooleanBuilder builder;
for (int i = 0; i < 10; i++) {
CHECK(builder.Append(i % 3 == 0).ok());
}
auto arr = builder.Finish().ValueOrDie();

auto sink = arrow::io::BufferOutputStream::Create().ValueOrDie();
lance::encodings::PlainEncoder encoder(sink);
auto offset = encoder.Write(arr).ValueOrDie();

auto infile = make_shared<arrow::io::BufferReader>(sink->Finish().ValueOrDie());
lance::encodings::PlainDecoder decoder(infile, arr->type());
CHECK(decoder.Init().ok());
decoder.Reset(offset, arr->length());

auto actual = decoder.ToArray().ValueOrDie();
CHECK(arr->Equals(actual));

for (int i = 0; i < arr->length(); i++) {
INFO("Expected: " << arr->GetScalar(i).ValueOrDie()->ToString()
<< " Got: " << decoder.GetScalar(i).ValueOrDie()->ToString());
CHECK(arr->GetScalar(i).ValueOrDie()->Equals(decoder.GetScalar(i).ValueOrDie()));
}

auto indices = ToArray({0, 3, 6}).ValueOrDie();
CHECK(lance::arrow::ToArray({true, true, true})
.ValueOrDie()
->Equals(decoder.Take(indices).ValueOrDie()));
}
4 changes: 4 additions & 0 deletions cpp/src/lance/io/writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,10 @@ ::arrow::Status FileWriter::WriteArray(const std::shared_ptr<format::Field>& fie

::arrow::Status FileWriter::WritePrimitiveArray(const std::shared_ptr<format::Field>& field,
const std::shared_ptr<::arrow::Array>& arr) {
if (arr->length() == 0) {
return ::arrow::Status::OK();
}

auto field_id = field->id();
auto encoder = field->GetEncoder(destination_);
auto type = field->type();
Expand Down