Skip to content

Commit

Permalink
Improve roaring UDF
Browse files Browse the repository at this point in the history
  • Loading branch information
jsjant committed Jan 4, 2025
1 parent 8bd8b57 commit 082076c
Show file tree
Hide file tree
Showing 8 changed files with 254 additions and 0 deletions.
144 changes: 144 additions & 0 deletions ydb/library/yql/udfs/common/roaring/roaring.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <contrib/libs/croaring/include/roaring/memory.h>
#include <contrib/libs/croaring/include/roaring/roaring.h>

#include <util/generic/array_ref.h>
#include <util/generic/vector.h>
#include <util/string/builder.h>
#include <util/system/yassert.h>
Expand All @@ -30,6 +31,11 @@ namespace {
{
}

// Wraps an already constructed bitmap. The pointer is stored in Roaring and
// is released with roaring_bitmap_free() by this class's destructor, so the
// caller must not free it after handing it over.
TRoaringWrapper(roaring_bitmap_t* bitmap)
: Roaring(bitmap)
{
}

// Releases the bitmap held in Roaring.
~TRoaringWrapper() {
roaring_bitmap_free(Roaring);
}
Expand Down Expand Up @@ -105,6 +111,47 @@ namespace {
}
};

// UDF "AndNotWithBinary": subtracts, in place, the bitmap obtained by passing
// the string in args[1] through DeserializePortable() from the bitmap
// resource in args[0], and returns args[0].
class TRoaringAndNotWithBinary: public TBoxedValue {
public:
    TRoaringAndNotWithBinary() {
    }

    // Name under which the function is registered in the module.
    static TStringRef Name() {
        return TStringRef::Of("AndNotWithBinary");
    }

private:
    TUnboxedValue Run(const IValueBuilder* valueBuilder,
                      const TUnboxedValuePod* args) const override {
        Y_UNUSED(valueBuilder);

        // Build the temporary right-hand operand from its serialized form.
        auto subtrahend = DeserializePortable(args[1].AsStringRef());

        // The difference is written straight into the left-hand resource.
        auto minuend = GetBitmapFromArg(args[0]);
        roaring_bitmap_andnot_inplace(minuend, subtrahend);

        // The temporary operand belongs to this call only; drop it now.
        roaring_bitmap_free(subtrahend);

        return args[0];
    }
};

// UDF "AndNot": removes from the bitmap resource in args[0] every value that
// is present in the bitmap resource in args[1]. The left-hand bitmap is
// modified in place and args[0] is returned.
class TRoaringAndNot: public TBoxedValue {
public:
    TRoaringAndNot() {
    }

    // Name under which the function is registered in the module.
    static TStringRef Name() {
        return TStringRef::Of("AndNot");
    }

private:
    TUnboxedValue Run(const IValueBuilder* valueBuilder,
                      const TUnboxedValuePod* args) const override {
        Y_UNUSED(valueBuilder);
        roaring_bitmap_t* lhs = GetBitmapFromArg(args[0]);
        const roaring_bitmap_t* rhs = GetBitmapFromArg(args[1]);

        if (lhs == rhs) {
            // roaring_bitmap_andnot_inplace() requires distinct operands.
            // When both arguments refer to the same bitmap the difference
            // A \ A is empty by definition, so just clear it.
            roaring_bitmap_clear(lhs);
        } else {
            roaring_bitmap_andnot_inplace(lhs, rhs);
        }
        return args[0];
    }
};

class TRoaringOr: public TBoxedValue {
public:
TRoaringOr() {
Expand Down Expand Up @@ -223,6 +270,46 @@ namespace {
TSourcePosition Pos_;
};

// UDF "FromUint32List": builds a new bitmap resource containing every ui32
// element of the list passed in args[0].
class TRoaringFromUint32List: public TBoxedValue {
public:
    // pos is the source position prepended to error messages.
    TRoaringFromUint32List(TSourcePosition pos)
        : Pos_(pos)
    {
    }

    // Name under which the function is registered in the module.
    static TStringRef Name() {
        return TStringRef::Of("FromUint32List");
    }

private:
    TUnboxedValue Run(const IValueBuilder* valueBuilder,
                      const TUnboxedValuePod* args) const override {
        Y_UNUSED(valueBuilder);
        try {
            auto* bitmap = roaring_bitmap_create();

            // Hand the bitmap to its owning wrapper before filling it, so
            // that it is released during unwinding if reading the list
            // throws. Previously the raw pointer leaked on that path.
            TUnboxedValue result(TUnboxedValuePod(new TRoaringWrapper(bitmap)));

            const auto vector = args[0];
            const auto* elements = vector.GetElements();
            if (elements) {
                // Fast path: the list exposes its elements as an array.
                for (const auto& value : TArrayRef{elements, vector.GetListLength()}) {
                    roaring_bitmap_add(bitmap, value.Get<ui32>());
                }
            } else {
                // Fallback: walk the list through its iterator.
                TUnboxedValue value;
                const auto it = vector.GetListIterator();
                while (it.Next(value)) {
                    roaring_bitmap_add(bitmap, value.Get<ui32>());
                }
            }

            return result;
        } catch (const std::exception& e) {
            // NOTE(review): UdfTerminate presumably does not return - confirm.
            UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).data());
        }
    }
    TSourcePosition Pos_;
};

class TRoaringSerialize: public TBoxedValue {
public:
TRoaringSerialize() {
Expand Down Expand Up @@ -266,6 +353,25 @@ namespace {
}
};

// UDF "RunOptimize": calls roaring_bitmap_run_optimize() on the bitmap
// resource in args[0] and returns that same resource.
class TRoaringRunOptimize: public TBoxedValue {
public:
    TRoaringRunOptimize() {
    }

    // Name under which the function is registered in the module.
    static TStringRef Name() {
        return TStringRef::Of("RunOptimize");
    }

private:
    TUnboxedValue Run(const IValueBuilder* valueBuilder,
                      const TUnboxedValuePod* args) const override {
        Y_UNUSED(valueBuilder);

        const TUnboxedValuePod& resource = args[0];
        // The boolean result of the optimization call is intentionally
        // not inspected; the resource is returned either way.
        roaring_bitmap_run_optimize(GetBitmapFromArg(resource));
        return resource;
    }
};

class TRoaringModule: public IUdfModule {
public:
TRoaringModule() {
Expand All @@ -282,6 +388,7 @@ namespace {
void GetAllFunctions(IFunctionsSink& sink) const final {
sink.Add(TRoaringSerialize::Name());
sink.Add(TRoaringDeserialize::Name());
sink.Add(TRoaringFromUint32List::Name());

sink.Add(TRoaringCardinality::Name());

Expand All @@ -292,6 +399,11 @@ namespace {

sink.Add(TRoaringAndWithBinary::Name());
sink.Add(TRoaringAnd::Name());

sink.Add(TRoaringAndNotWithBinary::Name());
sink.Add(TRoaringAndNot::Name());

sink.Add(TRoaringRunOptimize::Name());
}

void CleanupOnTerminate() const final {
Expand All @@ -312,6 +424,12 @@ namespace {
if (!typesOnly) {
builder.Implementation(new TRoaringDeserialize(builder.GetSourcePosition()));
}
} else if (TRoaringFromUint32List::Name() == name) {
builder.Returns<TResource<RoaringResourceName>>().Args()->Add<TListType<ui32>>();

if (!typesOnly) {
builder.Implementation(new TRoaringFromUint32List(builder.GetSourcePosition()));
}
} else if (TRoaringSerialize::Name() == name) {
builder.Returns(builder.SimpleType<char*>())
.Args()
Expand Down Expand Up @@ -372,6 +490,32 @@ namespace {
if (!typesOnly) {
builder.Implementation(new TRoaringAnd());
}
} else if (TRoaringAndNotWithBinary::Name() == name) {
builder.Returns<TResource<RoaringResourceName>>()
.Args()
->Add<TAutoMap<TResource<RoaringResourceName>>>()
.Add<TAutoMap<char*>>();

if (!typesOnly) {
builder.Implementation(new TRoaringAndNotWithBinary());
}
} else if (TRoaringAndNot::Name() == name) {
builder.Returns<TResource<RoaringResourceName>>()
.Args()
->Add<TAutoMap<TResource<RoaringResourceName>>>()
.Add<TAutoMap<TResource<RoaringResourceName>>>();

if (!typesOnly) {
builder.Implementation(new TRoaringAndNot());
}
} else if (TRoaringRunOptimize::Name() == name) {
builder.Returns<TResource<RoaringResourceName>>()
.Args()
->Add<TAutoMap<TResource<RoaringResourceName>>>();

if (!typesOnly) {
builder.Implementation(new TRoaringRunOptimize());
}
} else {
TStringBuilder sb;
sb << "Unknown function: " << name.Data();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,10 @@
{
"uri": "file://test.test_union_/results.txt"
}
],
"test.test[run_optimize]": [
{
"uri": "file://test.test_run_optimize_/results.txt"
}
]
}
Original file line number Diff line number Diff line change
Expand Up @@ -102,5 +102,36 @@
]
}
]
};
{
"Write" = [
{
"Type" = [
"ListType";
[
"StructType";
[
[
"AndNotList";
[
"ListType";
[
"DataType";
"Uint32"
]
]
]
]
]
];
"Data" = [
[
[
"42"
]
]
]
}
]
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
[
{
"Write" = [
{
"Type" = [
"ListType";
[
"StructType";
[
[
"RunOptimizeList";
[
"ListType";
[
"DataType";
"Uint32"
]
]
]
]
]
];
"Data" = [
[
[
"10";
"42";
"567"
]
]
]
}
]
}
]
Original file line number Diff line number Diff line change
Expand Up @@ -172,5 +172,38 @@
]
}
]
};
{
"Write" = [
{
"Type" = [
"ListType";
[
"StructType";
[
[
"DeserializedList";
[
"ListType";
[
"DataType";
"Uint32"
]
]
]
]
]
];
"Data" = [
[
[
"10";
"42";
"567"
]
]
]
}
]
}
]
2 changes: 2 additions & 0 deletions ydb/library/yql/udfs/common/roaring/test/cases/intersect.sql
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
-- And: combine two bitmaps deserialized from the `left` and `right` columns.
SELECT Roaring::Uint32List(Roaring::And(Roaring::Deserialize(left), Roaring::Deserialize(right))) AS AndList FROM Input;
-- AndWithBinary: same operation, with the second operand passed in serialized form.
SELECT Roaring::Uint32List(Roaring::AndWithBinary(Roaring::Deserialize(right), left)) AS AndWithBinaryList FROM Input;
-- AndWithBinary with NULL as the serialized operand.
SELECT Roaring::Uint32List(Roaring::AndWithBinary(Roaring::Deserialize(right), NULL)) AS AndWithBinaryListEmpty FROM Input;

-- AndNot: bitmap built from (1, 10, 42) minus bitmap built from (10, 1).
SELECT Roaring::Uint32List(Roaring::AndNot(Roaring::FromUint32List(AsList(1, 10, 42)), Roaring::FromUint32List(AsList(10, 1)))) AS AndNotList;
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
-- RunOptimize: build a bitmap from (10, 567, 42), run RunOptimize on it and list its values.
SELECT Roaring::Uint32List(Roaring::RunOptimize(Roaring::FromUint32List(AsList(10, 567, 42)))) AS RunOptimizeList;
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,6 @@ FROM Input;

SELECT ListTake(ListSkip(Roaring::Uint32List(Roaring::Deserialize(binaryString)), 10), 1) AS EmptyList
FROM Input;

SELECT Roaring::Uint32List(Roaring::FromUint32List(AsList(10, 567, 42))) AS DeserializedList
FROM Input;

0 comments on commit 082076c

Please sign in to comment.