Skip to content

Commit

Permalink
Migrate dataframe domain setters
Browse files Browse the repository at this point in the history
  • Loading branch information
XanthosXanthopoulos committed Feb 3, 2025
1 parent 97640fd commit 921daca
Show file tree
Hide file tree
Showing 3 changed files with 114 additions and 249 deletions.
285 changes: 40 additions & 245 deletions libtiledbsoma/src/soma/soma_array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1132,21 +1132,13 @@ StatusAndReason SOMAArray::_can_set_domain_helper(
// domain isn't outside the core domain, which is immutable. For
// old-style dataframes, if the requested domain fits in the array's
// core domain, it's good to go as a new soma domain.
auto domain_check = _can_set_dataframe_domainish_subhelper(
newdomain, false, function_name_for_messages);
if (!domain_check.first) {
return domain_check;
}

// For new-style dataframes, we need to additionally that the the
// requested soma domain (core current domain) isn't a downsize of the
// current one.
if (has_current_domain()) {
auto current_domain_check = _can_set_dataframe_domainish_subhelper(
newdomain, true, function_name_for_messages);
if (!current_domain_check.first) {
return current_domain_check;
}
auto current_domain_check = _can_set_dataframe_domainish_subhelper(
newdomain, function_name_for_messages);
if (!current_domain_check.first) {
return current_domain_check;
}

return std::pair(true, "");
Expand All @@ -1156,140 +1148,47 @@ StatusAndReason SOMAArray::_can_set_domain_helper(
// the user's requested soma domain against the core current domain or core
// (max) domain.
StatusAndReason SOMAArray::_can_set_dataframe_domainish_subhelper(
const ArrowTable& newdomain,
bool check_current_domain,
std::string function_name_for_messages) {
Domain domain = arr_->schema().domain();

ArrowArray* new_domain_array = newdomain.first.get();
ArrowSchema* new_domain_schema = newdomain.second.get();

if (new_domain_schema->n_children != domain.ndim()) {
const ArrowTable& newdomain, std::string function_name_for_messages) {
if (newdomain.second->n_children != static_cast<int64_t>(ndim())) {
return std::pair(
false,
std::format(
"{}: requested domain has ndim={} but the dataframe has "
"ndim={}",
function_name_for_messages,
new_domain_schema->n_children,
domain.ndim()));
newdomain.second->n_children,
ndim()));
}

if (new_domain_schema->n_children != new_domain_array->n_children) {
if (newdomain.second->n_children != newdomain.first->n_children) {
return std::pair(
false,
std::format(
"{}: internal coding error", function_name_for_messages));
}

for (unsigned i = 0; i < domain.ndim(); i++) {
const auto& dim = domain.dimension(i);

StatusAndReason status_and_reason;

switch (dim.type()) {
case TILEDB_STRING_ASCII:
case TILEDB_STRING_UTF8:
case TILEDB_CHAR:
status_and_reason =
_can_set_dataframe_domainish_slot_checker_string(
check_current_domain, newdomain, dim.name());
break;
case TILEDB_BOOL:
status_and_reason =
_can_set_dataframe_domainish_slot_checker_non_string<bool>(
check_current_domain, newdomain, dim.name());
break;
case TILEDB_INT8:
status_and_reason =
_can_set_dataframe_domainish_slot_checker_non_string<
int8_t>(check_current_domain, newdomain, dim.name());
break;
case TILEDB_UINT8:
status_and_reason =
_can_set_dataframe_domainish_slot_checker_non_string<
uint8_t>(check_current_domain, newdomain, dim.name());
break;
case TILEDB_INT16:
status_and_reason =
_can_set_dataframe_domainish_slot_checker_non_string<
int16_t>(check_current_domain, newdomain, dim.name());
break;
case TILEDB_UINT16:
status_and_reason =
_can_set_dataframe_domainish_slot_checker_non_string<
uint16_t>(check_current_domain, newdomain, dim.name());
break;
case TILEDB_INT32:
status_and_reason =
_can_set_dataframe_domainish_slot_checker_non_string<
int32_t>(check_current_domain, newdomain, dim.name());
break;
case TILEDB_UINT32:
status_and_reason =
_can_set_dataframe_domainish_slot_checker_non_string<
uint32_t>(check_current_domain, newdomain, dim.name());
break;
case TILEDB_INT64:
case TILEDB_DATETIME_YEAR:
case TILEDB_DATETIME_MONTH:
case TILEDB_DATETIME_WEEK:
case TILEDB_DATETIME_DAY:
case TILEDB_DATETIME_HR:
case TILEDB_DATETIME_MIN:
case TILEDB_DATETIME_SEC:
case TILEDB_DATETIME_MS:
case TILEDB_DATETIME_US:
case TILEDB_DATETIME_NS:
case TILEDB_DATETIME_PS:
case TILEDB_DATETIME_FS:
case TILEDB_DATETIME_AS:
case TILEDB_TIME_HR:
case TILEDB_TIME_MIN:
case TILEDB_TIME_SEC:
case TILEDB_TIME_MS:
case TILEDB_TIME_US:
case TILEDB_TIME_NS:
case TILEDB_TIME_PS:
case TILEDB_TIME_FS:
case TILEDB_TIME_AS:
status_and_reason =
_can_set_dataframe_domainish_slot_checker_non_string<
int64_t>(check_current_domain, newdomain, dim.name());
break;
case TILEDB_UINT64:
status_and_reason =
_can_set_dataframe_domainish_slot_checker_non_string<
uint64_t>(check_current_domain, newdomain, dim.name());
break;
case TILEDB_FLOAT32:
status_and_reason =
_can_set_dataframe_domainish_slot_checker_non_string<float>(
check_current_domain, newdomain, dim.name());
break;
case TILEDB_FLOAT64:
status_and_reason =
_can_set_dataframe_domainish_slot_checker_non_string<
double>(check_current_domain, newdomain, dim.name());
break;
default:
throw TileDBSOMAError(std::format(
"{}: saw invalid TileDB type when attempting to cast "
"domain information: {}",
function_name_for_messages,
tiledb::impl::type_to_str(dim.type())));
}
std::optional<NDRectangle>
ndrect = has_current_domain() ?
std::make_optional<NDRectangle>(
tiledb::ArraySchemaExperimental::current_domain(
*ctx_->tiledb_ctx(), arr_->schema())
.ndrectangle()) :
std::nullopt;

if (status_and_reason.first == false) {
return std::pair(
false,
std::format(
"{} for {}: {}",
function_name_for_messages,
dim.name(),
status_and_reason.second));
for (const auto& column :
columns_ | std::views::filter(
[](const auto& col) { return col->isIndexColumn(); })) {
auto status = column->can_set_current_domain_slot(
ndrect,
ArrowAdapter::get_table_any_column_by_name<2>(
newdomain, column->name(), 0));

if (status.first == false) {
status.second = std::format(
"[{}] {}", function_name_for_messages, status.second);
}
}

return std::pair(true, "");
}

Expand Down Expand Up @@ -1317,136 +1216,32 @@ void SOMAArray::_set_domain_helper(
}
}

Domain domain = arr_->schema().domain();

ArrowArray* new_domain_array = newdomain.first.get();
ArrowSchema* new_domain_schema = newdomain.second.get();

if (new_domain_schema->n_children != domain.ndim()) {
if (newdomain.second->n_children != static_cast<int64_t>(ndim())) {
throw TileDBSOMAError(std::format(
"{}: requested domain has ndim={} but the dataframe has "
"ndim={}",
function_name_for_messages,
new_domain_schema->n_children,
domain.ndim()));
newdomain.second->n_children,
ndim()));
}

if (new_domain_schema->n_children != new_domain_array->n_children) {
if (newdomain.second->n_children != newdomain.first->n_children) {
throw TileDBSOMAError(std::format(
"{}: internal coding error", function_name_for_messages));
}

auto tctx = ctx_->tiledb_ctx();
NDRectangle ndrect(*tctx, domain);
NDRectangle ndrect(*tctx, arr_->schema().domain());
CurrentDomain new_current_domain(*tctx);
ArraySchemaEvolution schema_evolution(*tctx);

for (unsigned i = 0; i < domain.ndim(); i++) {
const Dimension& dim = domain.dimension(i);
const std::string dim_name = dim.name();

switch (dim.type()) {
case TILEDB_STRING_ASCII:
case TILEDB_STRING_UTF8:
case TILEDB_CHAR:
case TILEDB_GEOM_WKB:
case TILEDB_GEOM_WKT: {
auto lo_hi = ArrowAdapter::get_table_string_column_by_index(
newdomain, i);
if (lo_hi[0] == "" && lo_hi[1] == "") {
// Don't care -> as big as possible.
// See comments in soma_array.h.
ndrect.set_range(dim_name, "", "\x7f");
} else {
throw TileDBSOMAError(std::format(
"domain (\"{}\", \"{}\") cannot be set for "
"string index columns: please use "
"(\"\", \"\")",
lo_hi[0],
lo_hi[1]));
}
} break;

case TILEDB_INT8: {
auto lo_hi = ArrowAdapter::get_table_non_string_column_by_index<
int8_t>(newdomain, i);
ndrect.set_range<int8_t>(dim_name, lo_hi[0], lo_hi[1]);
} break;
case TILEDB_BOOL:
case TILEDB_UINT8: {
auto lo_hi = ArrowAdapter::get_table_non_string_column_by_index<
uint8_t>(newdomain, i);
ndrect.set_range<uint8_t>(dim_name, lo_hi[0], lo_hi[1]);
} break;
case TILEDB_INT16: {
auto lo_hi = ArrowAdapter::get_table_non_string_column_by_index<
int16_t>(newdomain, i);
ndrect.set_range<int16_t>(dim_name, lo_hi[0], lo_hi[1]);
} break;
case TILEDB_UINT16: {
auto lo_hi = ArrowAdapter::get_table_non_string_column_by_index<
uint16_t>(newdomain, i);
ndrect.set_range<uint16_t>(dim_name, lo_hi[0], lo_hi[1]);
} break;
case TILEDB_INT32: {
auto lo_hi = ArrowAdapter::get_table_non_string_column_by_index<
int32_t>(newdomain, i);
ndrect.set_range<int32_t>(dim_name, lo_hi[0], lo_hi[1]);
} break;
case TILEDB_UINT32: {
auto lo_hi = ArrowAdapter::get_table_non_string_column_by_index<
uint32_t>(newdomain, i);
ndrect.set_range<uint32_t>(dim_name, lo_hi[0], lo_hi[1]);
} break;
case TILEDB_INT64:
case TILEDB_DATETIME_YEAR:
case TILEDB_DATETIME_MONTH:
case TILEDB_DATETIME_WEEK:
case TILEDB_DATETIME_DAY:
case TILEDB_DATETIME_HR:
case TILEDB_DATETIME_MIN:
case TILEDB_DATETIME_SEC:
case TILEDB_DATETIME_MS:
case TILEDB_DATETIME_US:
case TILEDB_DATETIME_NS:
case TILEDB_DATETIME_PS:
case TILEDB_DATETIME_FS:
case TILEDB_DATETIME_AS:
case TILEDB_TIME_HR:
case TILEDB_TIME_MIN:
case TILEDB_TIME_SEC:
case TILEDB_TIME_MS:
case TILEDB_TIME_US:
case TILEDB_TIME_NS:
case TILEDB_TIME_PS:
case TILEDB_TIME_FS:
case TILEDB_TIME_AS: {
auto lo_hi = ArrowAdapter::get_table_non_string_column_by_index<
int64_t>(newdomain, i);
ndrect.set_range<int64_t>(dim_name, lo_hi[0], lo_hi[1]);
} break;
case TILEDB_UINT64: {
auto lo_hi = ArrowAdapter::get_table_non_string_column_by_index<
uint64_t>(newdomain, i);
ndrect.set_range<uint64_t>(dim_name, lo_hi[0], lo_hi[1]);
} break;
case TILEDB_FLOAT32: {
auto lo_hi = ArrowAdapter::get_table_non_string_column_by_index<
float>(newdomain, i);
ndrect.set_range<float>(dim_name, lo_hi[0], lo_hi[1]);
} break;
case TILEDB_FLOAT64: {
auto lo_hi = ArrowAdapter::get_table_non_string_column_by_index<
double>(newdomain, i);
ndrect.set_range<double>(dim_name, lo_hi[0], lo_hi[1]);
} break;
default:
throw TileDBSOMAError(std::format(
"{}: internal error: unhandled type {} for {}.",
function_name_for_messages,
tiledb::impl::type_to_str(dim.type()),
dim_name));
}
for (const auto& column :
columns_ | std::views::filter(
[](const auto& col) { return col->isIndexColumn(); })) {
column->set_current_domain_slot(
ndrect,
ArrowAdapter::get_table_any_column_by_name<2>(
newdomain, column->name(), 0));
}

new_current_domain.set_ndrectangle(ndrect);
Expand Down
4 changes: 1 addition & 3 deletions libtiledbsoma/src/soma/soma_array.h
Original file line number Diff line number Diff line change
Expand Up @@ -1230,9 +1230,7 @@ class SOMAArray : public SOMAObject {
* This is a code-dedupe helper for can_upgrade_domain.
*/
StatusAndReason _can_set_dataframe_domainish_subhelper(
const ArrowTable& newdomain,
bool check_current_domain,
std::string function_name_for_messages);
const ArrowTable& newdomain, std::string function_name_for_messages);

/**
* This is a code-dedupe helper for can_resize_soma_joinid_shape and
Expand Down
Loading

0 comments on commit 921daca

Please sign in to comment.