Skip to content

Commit

Permalink
fix: Optimize ContainerRowSerde deserialization for string, array and map (#12362)
Browse files Browse the repository at this point in the history

Summary:

1. Optimize `deserializeString` in case of inlined strings
2. Optimize `deserializeArray` to avoid heap allocation for intermediate null-like data
3. Inline `FlatVector<StringView>::setNoCopy`

Differential Revision: D69750091
  • Loading branch information
Yuhta authored and facebook-github-bot committed Feb 17, 2025
1 parent 730842b commit 80add21
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 30 deletions.
39 changes: 28 additions & 11 deletions velox/exec/ContainerRowSerde.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -224,14 +224,17 @@ void deserializeString(
VELOX_CHECK_EQ(result.encoding(), VectorEncoding::Simple::FLAT);
auto values = result.asUnchecked<FlatVector<StringView>>();
auto size = in.read<int32_t>();
auto buffer = values->getBufferWithSpace(size);
auto start = buffer->asMutable<char>() + buffer->size();
in.readBytes(start, size);
// If the string is not inlined in string view, we need to advance the buffer.
if (not StringView::isInline(size)) {
if (StringView::isInline(size)) {
char data[StringView::kInlineSize];
in.readBytes(data, size);
values->setNoCopy(index, StringView(data, size));
} else {
auto* buffer = values->getBufferWithSpace(size);
auto* start = buffer->asMutable<char>() + buffer->size();
in.readBytes(start, size);
buffer->setSize(buffer->size() + size);
values->setNoCopy(index, StringView(start, size));
}
values->setNoCopy(index, StringView(start, size));
}

template <>
Expand All @@ -250,12 +253,17 @@ void deserializeOne<TypeKind::VARBINARY>(
deserializeString(in, index, result);
}

std::vector<uint64_t> readNulls(ByteInputStream& in, int32_t size) {
void readNulls(ByteInputStream& in, int32_t size, uint64_t* out) {
auto n = bits::nwords(size);
std::vector<uint64_t> nulls(n);
for (auto i = 0; i < n; ++i) {
nulls[i] = in.read<uint64_t>();
out[i] = in.read<uint64_t>();
}
}

std::vector<uint64_t> readNulls(ByteInputStream& in, int32_t size) {
auto n = bits::nwords(size);
std::vector<uint64_t> nulls(n);
readNulls(in, size, nulls.data());
return nulls;
}

Expand Down Expand Up @@ -292,11 +300,20 @@ vector_size_t deserializeArray(
BaseVector& elements,
vector_size_t& offset) {
auto size = in.read<int32_t>();
auto nulls = readNulls(in, size);
offset = elements.size();
elements.resize(offset + size);
uint64_t smallNulls;
std::vector<uint64_t> largeNulls;
const uint64_t* nulls;
if (size <= 64) {
readNulls(in, size, &smallNulls);
nulls = &smallNulls;
} else {
largeNulls = readNulls(in, size);
nulls = largeNulls.data();
}
for (auto i = 0; i < size; ++i) {
if (bits::isBitSet(nulls.data(), i)) {
if (bits::isBitSet(nulls, i)) {
elements.setNull(i + offset, true);
} else {
deserializeSwitch(in, i + offset, elements);
Expand Down
17 changes: 0 additions & 17 deletions velox/vector/FlatVector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,23 +119,6 @@ void FlatVector<StringView>::set(vector_size_t idx, StringView value) {
}
}

/// Stores `value` at row `idx` by assigning the StringView itself; the bytes
/// it refers to are not copied.
///
/// For types that require buffer allocation, this should be called only if
/// value is inlined or if value is already allocated in a buffer within the
/// vector. Used by StringWriter to allow UDFs to write directly into the
/// buffers and avoid copying.
///
/// @param idx Row to write; checked with VELOX_DCHECK_LT against
/// BaseVector::length_.
/// @param value String view stored as-is into rawValues_.
template <>
void FlatVector<StringView>::setNoCopy(
const vector_size_t idx,
const StringView& value) {
// The target row must lie within the current vector length.
VELOX_DCHECK_LT(idx, BaseVector::length_);
// NOTE(review): presumably guarantees the values buffer exists before the
// write below; ensureValues() is defined outside this view -- confirm.
ensureValues();
// The values buffer is expected not to be a view at this point.
VELOX_DCHECK(!values_->isView());
// Clear the null flag for this row, but only when a nulls buffer is present.
if (BaseVector::nulls_) {
BaseVector::setNull(idx, false);
}
rawValues_[idx] = value;
}

template <>
void FlatVector<StringView>::acquireSharedStringBuffers(
const BaseVector* source) {
Expand Down
16 changes: 14 additions & 2 deletions velox/vector/FlatVector.h
Original file line number Diff line number Diff line change
Expand Up @@ -610,10 +610,22 @@ Range<bool> FlatVector<bool>::asRange() const;
template <>
void FlatVector<StringView>::set(vector_size_t idx, StringView value);

/// For types that require buffer allocation, this should be called only if
/// value is inlined or if value is already allocated in a buffer within the
/// vector. Used by StringWriter to allow UDFs to write directly into the
/// buffers and avoid copying.
template <>
void FlatVector<StringView>::setNoCopy(
inline void FlatVector<StringView>::setNoCopy(
const vector_size_t idx,
const StringView& value);
const StringView& value) {
VELOX_DCHECK_LT(idx, BaseVector::length_);
ensureValues();
VELOX_DCHECK(!values_->isView());
if (BaseVector::nulls_) {
BaseVector::setNull(idx, false);
}
rawValues_[idx] = value;
}

template <>
void FlatVector<bool>::set(vector_size_t idx, bool value);
Expand Down

0 comments on commit 80add21

Please sign in to comment.