Skip to content

Commit

Permalink
feat: Implement read support for String/Binary View types
Browse files Browse the repository at this point in the history
  • Loading branch information
WillAyd committed Aug 23, 2024
1 parent 31feee9 commit a66af4f
Show file tree
Hide file tree
Showing 7 changed files with 114 additions and 5 deletions.
2 changes: 1 addition & 1 deletion meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ if get_option('tests')
ipc_test_files = {
'ipc-decoder': {
'src': 'decoder',
'deps': [nanoarrow_ipc_dep, arrow_dep, gtest_dep],
'deps': [nanoarrow_ipc_dep, arrow_dep, gtest_dep, gmock_dep],
'timeout': 30,
},
'ipc-reader': {
Expand Down
12 changes: 9 additions & 3 deletions src/nanoarrow/common/array.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <errno.h>
#include <inttypes.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
Expand Down Expand Up @@ -701,9 +702,15 @@ static int ArrowArrayViewSetArrayInternal(struct ArrowArrayView* array_view,
array_view->length = array->length;
array_view->null_count = array->null_count;

const bool fixed_nbuffers = (array_view->storage_type == NANOARROW_TYPE_STRING_VIEW ||
array_view->storage_type == NANOARROW_TYPE_BINARY_VIEW)
? 0
: 1;

int64_t buffers_required = 0;
for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) {
if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_NONE) {
if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_NONE &&
fixed_nbuffers) {
break;
}

Expand All @@ -720,8 +727,7 @@ static int ArrowArrayViewSetArrayInternal(struct ArrowArrayView* array_view,
}
}

// Check the number of buffers
if (buffers_required != array->n_buffers) {
if (buffers_required != array->n_buffers && fixed_nbuffers) {
ArrowErrorSet(error,
"Expected array with %" PRId64 " buffer(s) but found %" PRId64
" buffer(s)",
Expand Down
25 changes: 25 additions & 0 deletions src/nanoarrow/common/array_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3054,6 +3054,11 @@ void TestGetFromBinary(BuilderClass& builder) {
ARROW_EXPECT_OK(builder.Append("1234"));
ARROW_EXPECT_OK(builder.AppendNulls(2));
ARROW_EXPECT_OK(builder.Append("four"));

if constexpr (!std::is_same_v<BuilderClass, FixedSizeBinaryBuilder>) {
ARROW_EXPECT_OK(builder.Append("this_is_a_relatively_long_string"));
}

auto maybe_arrow_array = builder.Finish();
ARROW_EXPECT_OK(maybe_arrow_array);
auto arrow_array = maybe_arrow_array.ValueUnsafe();
Expand All @@ -3075,6 +3080,20 @@ void TestGetFromBinary(BuilderClass& builder) {
EXPECT_EQ(buffer_view.size_bytes, strlen("four"));
EXPECT_EQ(memcmp(buffer_view.data.as_char, "four", buffer_view.size_bytes), 0);

if constexpr (!std::is_same_v<BuilderClass, FixedSizeBinaryBuilder>) {
string_view = ArrowArrayViewGetStringUnsafe(&array_view, 4);
EXPECT_EQ(string_view.size_bytes, strlen("this_is_a_relatively_long_string"));
EXPECT_EQ(memcmp(string_view.data, "this_is_a_relatively_long_string",
string_view.size_bytes),
0);

buffer_view = ArrowArrayViewGetBytesUnsafe(&array_view, 4);
EXPECT_EQ(buffer_view.size_bytes, strlen("this_is_a_relatively_long_string"));
EXPECT_EQ(memcmp(buffer_view.data.as_char, "this_is_a_relatively_long_string",
buffer_view.size_bytes),
0);
}

ArrowArrayViewReset(&array_view);
ArrowArrayRelease(&array);
ArrowSchemaRelease(&schema);
Expand All @@ -3095,6 +3114,12 @@ TEST(ArrayViewTest, ArrayViewTestGetString) {

auto fixed_size_builder = FixedSizeBinaryBuilder(fixed_size_binary(4));
TestGetFromBinary<FixedSizeBinaryBuilder>(fixed_size_builder);

auto string_view_builder = StringViewBuilder();
TestGetFromBinary<StringViewBuilder>(string_view_builder);

auto binary_view_builder = BinaryViewBuilder();
TestGetFromBinary<BinaryViewBuilder>(binary_view_builder);
}

TEST(ArrayViewTest, ArrayViewTestGetIntervalYearMonth) {
Expand Down
48 changes: 48 additions & 0 deletions src/nanoarrow/common/inline_array.h
Original file line number Diff line number Diff line change
Expand Up @@ -914,6 +914,22 @@ static inline double ArrowArrayViewGetDoubleUnsafe(
}
}

// Number of bytes in the `data` member of the `ref` (out-of-line) layout below.
// NOTE(review): presumably these are the leading bytes of the value, as in the
// Arrow binary view layout -- confirm against the format specification.
#define ARROW_VIEW_PREFIX_SIZE 4
// Number of bytes in the `data` member of the `inlined` layout below. The
// string/bytes getters treat any element whose size is <= this value as being
// stored entirely inside the view element itself.
#define ARROW_VIEW_INLINE_SIZE 12

/// \brief One element of the views buffer of a string view / binary view array
///
/// The getters copy sizeof(union ArrowBinaryViewType) bytes out of
/// buffer_views[1] at element index i and then interpret them through one of
/// the two layouts below. Both layouts begin with `size`, and the getters read
/// `inlined.size` first to decide which layout applies.
union ArrowBinaryViewType { // TODO: C++ impl uses alignas which comes in C11
// Layout used when size <= ARROW_VIEW_INLINE_SIZE: the value bytes follow the
// size field directly.
struct {
// Length of the value in bytes
int32_t size;
// The value bytes themselves
uint8_t data[ARROW_VIEW_INLINE_SIZE];
} inlined;
// Layout used when size > ARROW_VIEW_INLINE_SIZE: the value lives in a
// separate data buffer.
struct {
// Length of the value in bytes
int32_t size;
// ARROW_VIEW_PREFIX_SIZE bytes stored alongside the reference; not read by
// the getters in this header.
uint8_t data[ARROW_VIEW_PREFIX_SIZE];
// Index of the data buffer holding the value; the getters add 2 to this
// before indexing array_view->buffer_views.
int32_t buffer_index;
// Byte offset of the value; ArrowArrayViewGetStringUnsafe() adds this to the
// start of the referenced data buffer.
int32_t offset;
} ref;
};

static inline struct ArrowStringView ArrowArrayViewGetStringUnsafe(
const struct ArrowArrayView* array_view, int64_t i) {
i += array_view->offset;
Expand All @@ -938,6 +954,22 @@ static inline struct ArrowStringView ArrowArrayViewGetStringUnsafe(
view.size_bytes = array_view->layout.element_size_bits[1] / 8;
view.data = array_view->buffer_views[1].data.as_char + (i * view.size_bytes);
break;
case NANOARROW_TYPE_STRING_VIEW:
case NANOARROW_TYPE_BINARY_VIEW: {
const union ArrowBufferViewData value_view = array_view->buffer_views[1].data;
union ArrowBinaryViewType bvt;
const size_t idx = sizeof(union ArrowBinaryViewType) * i;
memcpy(&bvt, value_view.as_uint8 + idx, sizeof(union ArrowBinaryViewType));
const int32_t inline_size = bvt.inlined.size;
view.size_bytes = inline_size;
if (inline_size <= ARROW_VIEW_INLINE_SIZE) {
view.data = value_view.as_char + idx + sizeof(int32_t);
} else {
const int32_t buf_index = bvt.ref.buffer_index + 2;
view.data = array_view->buffer_views[buf_index].data.as_char + bvt.ref.offset;
}
break;
}
default:
view.data = NULL;
view.size_bytes = 0;
Expand Down Expand Up @@ -972,6 +1004,22 @@ static inline struct ArrowBufferView ArrowArrayViewGetBytesUnsafe(
view.data.as_uint8 =
array_view->buffer_views[1].data.as_uint8 + (i * view.size_bytes);
break;
case NANOARROW_TYPE_STRING_VIEW:
case NANOARROW_TYPE_BINARY_VIEW: {
  // View types store one fixed-size union ArrowBinaryViewType per element in
  // buffer_views[1]. Copy the element out with memcpy rather than casting the
  // buffer pointer so that no alignment assumption is made about the buffer.
  const union ArrowBufferViewData value_view = array_view->buffer_views[1].data;
  union ArrowBinaryViewType bvt;
  const size_t idx = sizeof(union ArrowBinaryViewType) * i;
  memcpy(&bvt, value_view.as_uint8 + idx, sizeof(union ArrowBinaryViewType));
  // Both layouts of the union start with the size, so it can be read before
  // deciding whether the value is inlined or referenced.
  const int32_t inline_size = bvt.inlined.size;
  view.size_bytes = inline_size;
  if (inline_size <= ARROW_VIEW_INLINE_SIZE) {
    // Short value: the bytes live in the view element itself, right after the
    // int32 size field.
    view.data.as_uint8 = value_view.as_uint8 + idx + sizeof(int32_t);
  } else {
    // Long value: the bytes live in a separate data buffer; the element's
    // buffer_index is shifted by 2 to index array_view->buffer_views. The
    // value starts `offset` bytes into that buffer, so the offset must be
    // applied here exactly as ArrowArrayViewGetStringUnsafe() does; returning
    // the buffer start would yield the wrong bytes for every value that is not
    // the first one stored in its data buffer.
    const int32_t buf_index = bvt.ref.buffer_index + 2;
    view.data.as_uint8 =
        array_view->buffer_views[buf_index].data.as_uint8 + bvt.ref.offset;
  }
  break;
}
default:
view.data.data = NULL;
view.size_bytes = 0;
Expand Down
8 changes: 7 additions & 1 deletion src/nanoarrow/common/inline_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -451,7 +451,9 @@ enum ArrowType {
NANOARROW_TYPE_LARGE_BINARY,
NANOARROW_TYPE_LARGE_LIST,
NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO,
NANOARROW_TYPE_RUN_END_ENCODED
NANOARROW_TYPE_RUN_END_ENCODED,
NANOARROW_TYPE_BINARY_VIEW,
NANOARROW_TYPE_STRING_VIEW
};

/// \brief Get a string value of an enum ArrowType value
Expand Down Expand Up @@ -540,6 +542,10 @@ static inline const char* ArrowTypeString(enum ArrowType type) {
return "interval_month_day_nano";
case NANOARROW_TYPE_RUN_END_ENCODED:
return "run_end_encoded";
case NANOARROW_TYPE_BINARY_VIEW:
return "binary_view";
case NANOARROW_TYPE_STRING_VIEW:
return "string_view";
default:
return NULL;
}
Expand Down
20 changes: 20 additions & 0 deletions src/nanoarrow/common/schema.c
Original file line number Diff line number Diff line change
Expand Up @@ -1019,6 +1019,24 @@ static ArrowErrorCode ArrowSchemaViewParse(struct ArrowSchemaView* schema_view,
return EINVAL;
}

// view types
case 'v': {
switch (format[1]) {
case 'u':
ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_STRING_VIEW);
*format_end_out = format + 2;
return NANOARROW_OK;
case 'z':
ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_BINARY_VIEW);
*format_end_out = format + 2;
return NANOARROW_OK;
default:
ArrowErrorSet(error, "Expected 'u', or 'z' following 'v' but found '%s'",
format + 1);
return EINVAL;
}
}

default:
ArrowErrorSet(error, "Unknown format: '%s'", format);
return EINVAL;
Expand Down Expand Up @@ -1150,6 +1168,8 @@ static ArrowErrorCode ArrowSchemaViewValidate(struct ArrowSchemaView* schema_vie
case NANOARROW_TYPE_TIME32:
case NANOARROW_TYPE_TIME64:
case NANOARROW_TYPE_DURATION:
case NANOARROW_TYPE_BINARY_VIEW:
case NANOARROW_TYPE_STRING_VIEW:
return ArrowSchemaViewValidateNChildren(schema_view, 0, error);

case NANOARROW_TYPE_FIXED_SIZE_BINARY:
Expand Down
4 changes: 4 additions & 0 deletions src/nanoarrow/testing/testing.cc
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,7 @@ ArrowErrorCode WriteData(std::ostream& out, const ArrowArrayView* value,

case NANOARROW_TYPE_STRING:
case NANOARROW_TYPE_LARGE_STRING:
case NANOARROW_TYPE_STRING_VIEW:
WriteString(out, ArrowArrayViewGetStringUnsafe(value, 0));
for (int64_t i = 1; i < value->length; i++) {
out << ", ";
Expand Down Expand Up @@ -404,6 +405,9 @@ ArrowErrorCode WriteTypeFromView(std::ostream& out, const ArrowSchemaView* field
case NANOARROW_TYPE_LARGE_STRING:
out << R"("name": "largeutf8")";
break;
case NANOARROW_TYPE_STRING_VIEW:
out << R"("name": "stringview")";
break;
case NANOARROW_TYPE_BINARY:
out << R"("name": "binary")";
break;
Expand Down

0 comments on commit a66af4f

Please sign in to comment.