Skip to content

Commit

Permalink
temp/WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
johnkerl committed Jan 30, 2025
1 parent 2f12828 commit b90317c
Show file tree
Hide file tree
Showing 4 changed files with 146 additions and 8 deletions.
2 changes: 1 addition & 1 deletion apis/python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ def run(self):
str(tiledb_dir / "lib"),
]

CXX_FLAGS = ["-O3"]
CXX_FLAGS = ["-g", "-O0"]

if platform.machine() == "x86_64":
CXX_FLAGS.append("-mavx2")
Expand Down
24 changes: 20 additions & 4 deletions apis/python/src/tiledbsoma/common.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@

namespace tiledbsoma {

using namespace pybind11::literals; // to bring in the `_a` literal

std::unordered_map<tiledb_datatype_t, std::string> _tdb_to_np_name_dtype = {
{TILEDB_INT32, "int32"},
{TILEDB_INT64, "int64"},
Expand Down Expand Up @@ -185,22 +187,36 @@ bool is_tdb_str(tiledb_datatype_t type) {
/**
 * Builds a pyarrow.Table from every column held in `buffers`, with a
 * "pandas" entry attached to the table's schema metadata.
 *
 * Each column is converted with ArrowAdapter::to_arrow and imported into
 * pyarrow through `pyarrow.Array._import_from_c`. The table is assembled
 * twice: once so that pyarrow derives the schema, and a second time with
 * that schema after ArrowAdapter::set_metadata_for_pandas has annotated it.
 *
 * @param buffers Column buffers to convert; columns appear in the order
 *        returned by `buffers->names()`.
 * @return The resulting pyarrow.Table as a py::object.
 */
py::object _buffer_to_table(std::shared_ptr<ArrayBuffers> buffers) {
    auto pa = py::module::import("pyarrow");
    auto pa_table_from_arrays = pa.attr("Table").attr("from_arrays");
    auto pa_array_importer = pa.attr("Array").attr("_import_from_c");
    auto pa_schema_importer = pa.attr("Schema").attr("_import_from_c");

    py::list array_list;
    py::list names;

    for (auto& name : buffers->names()) {
        auto column = buffers->at(name);
        auto [pa_array, pa_schema] = ArrowAdapter::to_arrow(column);
        auto array = pa_array_importer(
            py::capsule(pa_array.get()), py::capsule(pa_schema.get()));
        array_list.append(array);
        names.append(name);
    }

    // First pass: build the table from names only, so that pyarrow derives
    // the schema which is then exported and annotated below.
    auto py_arrow_table = pa_table_from_arrays(array_list, names);

    // Round-trip the schema through the Arrow C data interface so that the
    // C++ helper can attach the "pandas" metadata entry to it.
    ArrowSchema c_arrow_schema;
    const auto c_arrow_schema_ptr = reinterpret_cast<uintptr_t>(
        &c_arrow_schema);

    py_arrow_table.attr("schema").attr("_export_to_c")(c_arrow_schema_ptr);
    // NOTE(review): the helper replaces the metadata pointer of a schema
    // that pyarrow exported; confirm that the replacement buffer is freed
    // once pyarrow has re-imported the schema.
    ArrowAdapter::set_metadata_for_pandas(&c_arrow_schema);
    auto py_arrow_schema = pa_schema_importer(py::capsule(&c_arrow_schema));

    // Second pass: same arrays, now with the annotated schema. The schema
    // already carries the column names.
    return pa_table_from_arrays(array_list, "schema"_a = py_arrow_schema);
}

std::optional<py::object> to_table(
Expand Down
125 changes: 122 additions & 3 deletions libtiledbsoma/src/utils/arrow_adapter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@
#include "../soma/column_buffer.h"
#include "arrow_adapter.h"
#include "logger.h"
#include "nlohmann/json.hpp"
#include "util.h"
#include "version.h"

#include "../soma/soma_attribute.h"
#include "../soma/soma_dimension.h"
Expand Down Expand Up @@ -348,6 +350,110 @@ json ArrowAdapter::_get_filter_list_json(FilterList filter_list) {
return filter_list_as_json;
}

// Lookup tables used by ArrowAdapter::set_metadata_for_pandas. Both are keyed
// by the Arrow C-data-interface format string of a column; the first gives
// the value emitted as "pandas_type", the second the value emitted as
// "numpy_type". Formats that are absent from a table are reported as
// "object" by the caller.
//
// The tables are read-only, hence `const`.
//
// NOTE(review): the large-string/large-binary formats ("U"/"Z") and the
// timestamp formats ("tss:", "tsm:", "tsu:", "tsn:") have no entry yet.
// NOTE(review): the pandas metadata specification appears to name string and
// binary columns "unicode"/"bytes" with numpy_type "object" -- confirm
// whether "string"/"binary" is what is intended here.
// TODO: decide on the final location of these tables.
// clang-format off
static const std::map<std::string, std::string> arrow_format_to_pandas_type = {
    {"c", "int8"},
    {"s", "int16"},
    {"i", "int32"},
    {"l", "int64"},
    {"C", "uint8"},
    {"S", "uint16"},
    {"I", "uint32"},
    {"L", "uint64"},
    {"f", "float32"},
    {"g", "float64"},
    {"u", "string"},
    {"z", "binary"},
    {"b", "bool"},
    // { "tss:", xxx}, // TILEDB_DATETIME_SEC,
    // { "tsm:", xxx}, // TILEDB_DATETIME_MS,
    // { "tsu:", xxx}, // TILEDB_DATETIME_US,
    // { "tsn:", xxx}, // TILEDB_DATETIME_NS,
};

static const std::map<std::string, std::string> arrow_format_to_numpy_type = {
    {"c", "int8"},
    {"s", "int16"},
    {"i", "int32"},
    {"l", "int64"},
    {"C", "uint8"},
    {"S", "uint16"},
    {"I", "uint32"},
    {"L", "uint64"},
    {"f", "float32"},
    {"g", "float64"},
    {"u", "string"},
    {"z", "binary"},
    {"b", "bool"},
};
// clang-format on

/**
 * Replaces the metadata of `arrow_schema` with a single "pandas" key whose
 * value is a JSON document describing the schema's child columns.
 *
 * The JSON document contains:
 *  - "columns": one object per non-null child, carrying the child's name
 *    (as both "name" and "field_name"), a "pandas_type" and a "numpy_type"
 *    looked up from the child's Arrow format string ("object" when the
 *    format has no table entry), and a null "metadata";
 *  - "creator": the library name and the version string;
 *  - "pandas_version": the version of the pandas metadata layout targeted.
 *
 * Attaching the metadata is best-effort: when `arrow_schema` is null, or a
 * nanoarrow call reports failure, the function returns without attaching
 * anything rather than reading an incompletely-built buffer.
 *
 * @param arrow_schema Parent (struct) schema to annotate. Its previous
 *        metadata pointer is discarded, not freed.
 */
void ArrowAdapter::set_metadata_for_pandas(ArrowSchema* arrow_schema) {
    if (arrow_schema == nullptr) {
        return;
    }

    // Forget any prior metadata pointer up front.
    // NOTE(review): presumably this keeps ArrowSchemaSetMetadata from
    // freeing memory that nanoarrow did not allocate (for example a schema
    // exported by pyarrow) -- confirm against the nanoarrow sources.
    arrow_schema->metadata = nullptr;

    // Shared lookup for both tables: unknown formats map to "object".
    const auto type_name_for = [](const auto& table,
                                  const std::string& format) -> std::string {
        const auto entry = table.find(format);
        return entry != table.end() ? entry->second : std::string("object");
    };

    nlohmann::json creator = {
        {"library", "tiledbsoma"},
        // This gets us "2.28.0", not "1.15.5" ... where that latter is
        // not available directly in C++ (unless we take it as a function
        // argument).
        {"version", tiledbsoma::version::as_string().c_str()}};

    std::vector<nlohmann::json> columns;
    if (arrow_schema->children != nullptr) {
        // n_children is 64-bit; use a matching index type.
        for (int64_t i = 0; i < arrow_schema->n_children; i++) {
            const ArrowSchema* child = arrow_schema->children[i];
            if (child == nullptr) {
                continue;
            }

            // A null format/name must not reach std::string or
            // nlohmann::json, both of which require a valid C string.
            const std::string arrow_format = child->format != nullptr ?
                                                 child->format :
                                                 "";
            const std::string column_name = child->name != nullptr ?
                                                child->name :
                                                "";

            // clang-format off
            nlohmann::json column = {
                {"name", column_name},
                {"field_name", column_name},
                {"pandas_type", type_name_for(arrow_format_to_pandas_type, arrow_format)},
                {"numpy_type", type_name_for(arrow_format_to_numpy_type, arrow_format)},
                {"metadata", nullptr},
            };
            // clang-format on
            columns.push_back(column);
        }
    }

    // clang-format off
    nlohmann::json pandas_info = {
        {"columns", columns},
        {"creator", creator},
        // Announce that this is the API version we're conforming to
        {"pandas_version", "2.2.3"}
    };
    // clang-format on

    // Keep the serialized JSON alive for as long as the view onto it is used.
    const std::string pandas_json = pandas_info.dump(2);

    nanoarrow::UniqueBuffer buffer;
    if (ArrowMetadataBuilderInit(buffer.get(), nullptr) != NANOARROW_OK) {
        return;
    }
    if (ArrowMetadataBuilderAppend(
            buffer.get(),
            ArrowCharView("pandas"),
            ArrowCharView(pandas_json.c_str())) != NANOARROW_OK) {
        return;
    }

    // The builder's buffer already holds the encoded key/value block, so it
    // is handed over directly instead of via a temporary std::string copy.
    if (ArrowSchemaSetMetadata(
            arrow_schema, reinterpret_cast<const char*>(buffer->data)) !=
        NANOARROW_OK) {
        // Leave the schema explicitly without metadata on failure.
        arrow_schema->metadata = nullptr;
    }
}

std::unique_ptr<ArrowSchema> ArrowAdapter::arrow_schema_from_tiledb_array(
std::shared_ptr<Context> ctx, std::shared_ptr<Array> tiledb_array) {
auto tiledb_schema = tiledb_array->schema();
Expand All @@ -357,7 +463,7 @@ std::unique_ptr<ArrowSchema> ArrowAdapter::arrow_schema_from_tiledb_array(
std::unique_ptr<ArrowSchema> arrow_schema = std::make_unique<ArrowSchema>();
arrow_schema->format = strdup("+s");
arrow_schema->name = strdup("parent");
arrow_schema->metadata = nullptr;

arrow_schema->flags = 0;
arrow_schema->n_children = ndim + nattr;
arrow_schema->dictionary = nullptr;
Expand Down Expand Up @@ -453,6 +559,11 @@ std::unique_ptr<ArrowSchema> ArrowAdapter::arrow_schema_from_tiledb_array(
child->release = &ArrowAdapter::release_schema;
}

// XXX copy here from TileDB-Array metadata to Arrow-table metadata
// arrow_schema->metadata = nullptr;
// XXX TODO: paste in GitHub issue link
set_metadata_for_pandas(arrow_schema.get());

return arrow_schema;
}

Expand Down Expand Up @@ -1559,7 +1670,7 @@ std::unique_ptr<ArrowSchema> ArrowAdapter::make_arrow_schema(
auto arrow_schema = std::make_unique<ArrowSchema>();
arrow_schema->format = "+s"; // structure, i.e. non-leaf node
arrow_schema->name = strdup("parent");
arrow_schema->metadata = nullptr;

arrow_schema->flags = 0;
arrow_schema->n_children = num_names; // non-leaf node
arrow_schema->children = (ArrowSchema**)malloc(
Expand Down Expand Up @@ -1607,6 +1718,10 @@ std::unique_ptr<ArrowSchema> ArrowAdapter::make_arrow_schema(
}
}

// XXX
// arrow_schema->metadata = nullptr;
set_metadata_for_pandas(arrow_schema.get());

return arrow_schema;
}

Expand All @@ -1615,7 +1730,7 @@ std::unique_ptr<ArrowSchema> ArrowAdapter::make_arrow_schema_parent(
auto arrow_schema = std::make_unique<ArrowSchema>();
arrow_schema->format = "+s"; // structure, i.e. non-leaf node
arrow_schema->name = strdup("parent");
arrow_schema->metadata = nullptr;

arrow_schema->flags = 0;
arrow_schema->n_children = num_columns; // non-leaf node
arrow_schema->children = (ArrowSchema**)malloc(
Expand All @@ -1632,6 +1747,10 @@ std::unique_ptr<ArrowSchema> ArrowAdapter::make_arrow_schema_parent(
"[ArrowAdapter] make_arrow_schema n_children {}",
arrow_schema->n_children));

// XXX
// arrow_schema->metadata = nullptr;
set_metadata_for_pandas(arrow_schema.get());

return arrow_schema;
}

Expand Down
3 changes: 3 additions & 0 deletions libtiledbsoma/src/utils/arrow_adapter.h
Original file line number Diff line number Diff line change
Expand Up @@ -1191,6 +1191,9 @@ class ArrowAdapter {
}
}

// Work in progress. Resets the metadata of `arrow_schema` and then sets it
// to a single "pandas" key whose value is a JSON document listing, for each
// non-null child of the schema, the column name together with a pandas type
// and a numpy type looked up from the child's Arrow format string, plus a
// "creator" object and a "pandas_version" string.
static void set_metadata_for_pandas(ArrowSchema* arrow_schema);

private:
static std::pair<const void*, std::size_t> _get_data_and_length(
Enumeration& enmr, const void* dst);
Expand Down

0 comments on commit b90317c

Please sign in to comment.