From 4a53e2905cb9777897df7791630bbee31c29c1a0 Mon Sep 17 00:00:00 2001 From: Jeff Raymakers Date: Sat, 20 Jul 2024 17:14:24 -0700 Subject: [PATCH] string handling --- alt/bindings/src/duckdb.d.ts | 20 +++++++--- alt/bindings/src/duckdb_node_bindings.cpp | 29 ++++++++++---- alt/bindings/test/query.test.ts | 48 ++++++++++++++++++++++- 3 files changed, 81 insertions(+), 16 deletions(-) diff --git a/alt/bindings/src/duckdb.d.ts b/alt/bindings/src/duckdb.d.ts index 50e6e8d7..535fa896 100644 --- a/alt/bindings/src/duckdb.d.ts +++ b/alt/bindings/src/duckdb.d.ts @@ -602,10 +602,10 @@ export function data_chunk_set_size(chunk: DataChunk, size: number): void; export function vector_get_column_type(vector: Vector): LogicalType; // void *duckdb_vector_get_data(duckdb_vector vector) -export function vector_get_data(vector: Vector, byteCount: number): Buffer; +export function vector_get_data(vector: Vector, byte_count: number): Uint8Array; // uint64_t *duckdb_vector_get_validity(duckdb_vector vector) -export function vector_get_validity(vector: Vector, byteCount: number): Buffer; +export function vector_get_validity(vector: Vector, byte_count: number): Uint8Array; // void duckdb_vector_ensure_validity_writable(duckdb_vector vector) export function vector_ensure_validity_writable(vector: Vector): void; @@ -634,16 +634,16 @@ export function struct_vector_get_child(vector: Vector, index: number): Vector; export function array_vector_get_child(vector: Vector): Vector; // bool duckdb_validity_row_is_valid(uint64_t *validity, idx_t row) -export function validity_row_is_valid(validity: Buffer, row_index: number): boolean; +export function validity_row_is_valid(validity: Uint8Array, row_index: number): boolean; // void duckdb_validity_set_row_validity(uint64_t *validity, idx_t row, bool valid) -export function validity_set_row_validity(validity: Buffer, row_index: number, valid: boolean): void; +export function validity_set_row_validity(validity: Uint8Array, row_index: number, valid: 
boolean): void; // void duckdb_validity_set_row_invalid(uint64_t *validity, idx_t row) -export function validity_set_row_invalid(validity: Buffer, row_index: number): void; +export function validity_set_row_invalid(validity: Uint8Array, row_index: number): void; // void duckdb_validity_set_row_valid(uint64_t *validity, idx_t row) -export function validity_set_row_valid(validity: Buffer, row_index: number): void; +export function validity_set_row_valid(validity: Uint8Array, row_index: number): void; // duckdb_state duckdb_appender_create(duckdb_connection connection, const char *schema, const char *table, duckdb_appender *out_appender) export function appender_create(connection: Connection, schema: string, table: string): Appender; @@ -739,3 +739,11 @@ export function append_data_chunk(appender: Appender, chunk: DataChunk): State; // duckdb_data_chunk duckdb_fetch_chunk(duckdb_result result) export function fetch_chunk(result: Result): Promise; + +// ADDED +/** + * Read a pointer from `array_buffer` at `pointer_offset`, then read and return `byte_count` bytes from that pointer. + * + * Used to read from `duckdb_string_t`s with non-inlined data that are embedded in VARCHAR, BLOB, and BIT vectors. 
+ */ +export function get_data_from_pointer(array_buffer: ArrayBuffer, pointer_offset: number, byte_count: number): Uint8Array; diff --git a/alt/bindings/src/duckdb_node_bindings.cpp b/alt/bindings/src/duckdb_node_bindings.cpp index 687e79f1..a7b3663d 100644 --- a/alt/bindings/src/duckdb_node_bindings.cpp +++ b/alt/bindings/src/duckdb_node_bindings.cpp @@ -477,6 +477,8 @@ class DuckDBNodeAddon : public Napi::Addon { // TODO: duckdb_validity_set_row_valid InstanceMethod("fetch_chunk", &DuckDBNodeAddon::fetch_chunk), + + InstanceMethod("get_data_from_pointer", &DuckDBNodeAddon::get_data_from_pointer), }); } @@ -941,23 +943,23 @@ class DuckDBNodeAddon : public Napi::Addon { // TODO // void *duckdb_vector_get_data(duckdb_vector vector) - // function vector_get_data(vector: Vector, length: number): Buffer + // function vector_get_data(vector: Vector, byte_count: number): Uint8Array Napi::Value vector_get_data(const Napi::CallbackInfo& info) { auto env = info.Env(); auto vector = GetVectorFromExternal(env, info[0]); - auto byteCount = info[1].As().Uint32Value(); + auto byte_count = info[1].As().Uint32Value(); void *data = duckdb_vector_get_data(vector); - return Napi::Buffer::NewOrCopy(env, reinterpret_cast(data), byteCount); + return Napi::Buffer::NewOrCopy(env, reinterpret_cast(data), byte_count); } // uint64_t *duckdb_vector_get_validity(duckdb_vector vector) - // function vector_get_validity(vector: Vector, byteCount: number): Buffer + // function vector_get_validity(vector: Vector, byte_count: number): Uint8Array Napi::Value vector_get_validity(const Napi::CallbackInfo& info) { auto env = info.Env(); auto vector = GetVectorFromExternal(env, info[0]); - auto byteCount = info[1].As().Uint32Value(); + auto byte_count = info[1].As().Uint32Value(); uint64_t *data = duckdb_vector_get_validity(vector); - return Napi::Buffer::NewOrCopy(env, reinterpret_cast(data), byteCount); + return Napi::Buffer::NewOrCopy(env, reinterpret_cast(data), byte_count); } // void 
duckdb_vector_ensure_validity_writable(duckdb_vector vector) @@ -1010,10 +1012,10 @@ class DuckDBNodeAddon : public Napi::Addon { } // bool duckdb_validity_row_is_valid(uint64_t *validity, idx_t row) - // function validity_row_is_valid(validity: Buffer, row_index: number): boolean + // function validity_row_is_valid(validity: Uint8Array, row_index: number): boolean Napi::Value validity_row_is_valid(const Napi::CallbackInfo& info) { auto env = info.Env(); - auto validity = reinterpret_cast(info[0].As>().Data()); + auto validity = reinterpret_cast(info[0].As().Data()); auto row_index = info[1].As().Uint32Value(); auto valid = duckdb_validity_row_is_valid(validity, row_index); return Napi::Boolean::New(env, valid); @@ -1070,6 +1072,17 @@ class DuckDBNodeAddon : public Napi::Addon { return worker->Promise(); } + // ADDED + // function get_data_from_pointer(array_buffer: ArrayBuffer, pointer_offset: number, byte_count: number): Uint8Array + Napi::Value get_data_from_pointer(const Napi::CallbackInfo& info) { + auto env = info.Env(); + auto data = reinterpret_cast(info[0].As().Data()); + auto pointer_offset = info[1].As().Uint32Value(); + auto byte_count = info[2].As().Uint32Value(); + auto pointer_pointer = reinterpret_cast(data + pointer_offset); + auto pointer = *pointer_pointer; + return Napi::Buffer::NewOrCopy(env, pointer, byte_count); + } }; diff --git a/alt/bindings/test/query.test.ts b/alt/bindings/test/query.test.ts index c212e93c..85f2aaf6 100644 --- a/alt/bindings/test/query.test.ts +++ b/alt/bindings/test/query.test.ts @@ -5,11 +5,34 @@ function isValid(validity: BigUint64Array, bit: number): boolean { return (validity[Math.floor(bit / 64)] & (1n << BigInt(bit % 64))) !== 0n; } -function expectValidity(validity_bytes: Buffer, validity: BigUint64Array, bit: number, expected: boolean) { +function expectValidity(validity_bytes: Uint8Array, validity: BigUint64Array, bit: number, expected: boolean) { expect(duckdb.validity_row_is_valid(validity_bytes, 
bit)).toBe(expected); expect(isValid(validity, bit)).toBe(expected); } +/** + * Gets the bytes either in or referenced by a `duckdb_string_t` + * that is at `string_byte_offset` of the given `DataView`. + */ +function getStringBytes(dv: DataView, string_byte_offset: number): Uint8Array { + const length_in_bytes = dv.getUint32(string_byte_offset, true); + if (length_in_bytes <= 12) { + return new Uint8Array(dv.buffer, dv.byteOffset + string_byte_offset + 4, length_in_bytes); + } else { + return duckdb.get_data_from_pointer(dv.buffer, dv.byteOffset + string_byte_offset + 8, length_in_bytes); + } +} + +const decoder = new TextDecoder(); + +/** + * Gets the UTF-8 string either in or referenced by a `duckdb_string_t` + * that is at `string_byte_offset` of the given `DataView`. + */ +function getVarchar(dv: DataView, string_byte_offset: number): string { + return decoder.decode(getStringBytes(dv, string_byte_offset)); +} + suite('query', () => { test('basic select', async () => { const db = await duckdb.open(); @@ -74,8 +97,14 @@ suite('query', () => { expect(duckdb.column_count(res)).toBe(53); expect(duckdb.column_name(res, 0)).toBe('bool'); expect(duckdb.column_type(res, 0)).toBe(duckdb.Type.BOOLEAN); + expect(duckdb.column_name(res, 27)).toBe('varchar'); + expect(duckdb.column_type(res, 27)).toBe(duckdb.Type.VARCHAR); expect(duckdb.column_name(res, 33)).toBe('int_array'); expect(duckdb.column_type(res, 33)).toBe(duckdb.Type.LIST); + expect(duckdb.column_name(res, 40)).toBe('struct'); + expect(duckdb.column_type(res, 40)).toBe(duckdb.Type.STRUCT); + expect(duckdb.column_name(res, 45)).toBe('fixed_int_array'); + expect(duckdb.column_type(res, 45)).toBe(duckdb.Type.ARRAY); expect(duckdb.column_name(res, 52)).toBe('list_of_fixed_int_array'); expect(duckdb.column_type(res, 52)).toBe(duckdb.Type.LIST); const chunk = await duckdb.fetch_chunk(res); @@ -100,6 +129,21 @@ suite('query', () => { expectValidity(bool_validity_bytes, bool_validity, 2, false); + // varchar + const 
varchar_vector = duckdb.data_chunk_get_vector(chunk, 27); + const varchar_validity_bytes = duckdb.vector_get_validity(varchar_vector, 8); + const varchar_validity = new BigUint64Array(varchar_validity_bytes.buffer, 0, 1); + const varchar_data = duckdb.vector_get_data(varchar_vector, 3*16); + const varchar_dv = new DataView(varchar_data.buffer); + + expectValidity(varchar_validity_bytes, varchar_validity, 0, true); + expect(getVarchar(varchar_dv, 0*16)).toBe('🦆🦆🦆🦆🦆🦆'); + + expectValidity(varchar_validity_bytes, varchar_validity, 1, true); + expect(getVarchar(varchar_dv, 1*16)).toBe('goo\0se'); + + expectValidity(varchar_validity_bytes, varchar_validity, 2, false); + // int_array const int_array_vector = duckdb.data_chunk_get_vector(chunk, 33); const int_array_validity_bytes = duckdb.vector_get_validity(int_array_vector, 8); @@ -168,7 +212,7 @@ suite('query', () => { expect(struct_child0_dv.getInt32(1*4, true)).toBe(42); expectValidity(struct_child1_validity_bytes, struct_child1_validity, 1, true); expect(struct_child1_dv.getInt32(1*16, true)).toBe(24); - // TODO: validate string contents + expect(getVarchar(struct_child1_dv, 1*16)).toBe('🦆🦆🦆🦆🦆🦆'); expectValidity(struct_validity_bytes, struct_validity, 2, false); expectValidity(struct_child0_validity_bytes, struct_child0_validity, 2, false);