Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(java): support add columns via sql expressions #3287

Merged
merged 4 commits into from
Jan 7, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 70 additions & 1 deletion java/core/lance-jni/src/blocking_dataset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@ use jni::sys::{jbyteArray, jlong};
use jni::{objects::JObject, JNIEnv};
use lance::dataset::builder::DatasetBuilder;
use lance::dataset::transaction::Operation;
use lance::dataset::{ColumnAlteration, Dataset, ProjectionRequest, ReadParams, WriteParams};
use lance::dataset::{
ColumnAlteration, Dataset, NewColumnTransform, ProjectionRequest, ReadParams, WriteParams,
};
use lance::io::{ObjectStore, ObjectStoreParams};
use lance::table::format::Fragment;
use lance::table::format::Index;
Expand Down Expand Up @@ -934,3 +936,70 @@ fn inner_alter_columns(
RT.block_on(dataset_guard.inner.alter_columns(&column_alterations))?;
Ok(())
}

/// JNI entry point for the Java native method
/// `com.lancedb.lance.Dataset.nativeAddColumnsBySqlExpressions`.
///
/// Forwards every argument to `inner_add_columns_by_sql_expressions` and passes the
/// resulting `Result` to `ok_or_throw_without_return!`.
///
/// * `java_dataset` - the Java `Dataset` instance the method was invoked on.
/// * `sql_expressions` - a Java `SqlExpressions` object.
/// * `batch_size` - a Java `Optional<Long>`.
// NOTE(review): `ok_or_throw_without_return!` is defined outside this view; presumably it
// raises a Java exception on `Err` and returns nothing — confirm at the macro definition.
#[no_mangle]
pub extern "system" fn Java_com_lancedb_lance_Dataset_nativeAddColumnsBySqlExpressions(
mut env: JNIEnv,
java_dataset: JObject,
sql_expressions: JObject, // SqlExpressions
batch_size: JObject, // Optional<Long>
) {
ok_or_throw_without_return!(
env,
inner_add_columns_by_sql_expressions(&mut env, java_dataset, sql_expressions, batch_size)
)
}

fn inner_add_columns_by_sql_expressions(
env: &mut JNIEnv,
java_dataset: JObject,
sql_expressions: JObject, // SqlExpressions
batch_size: JObject, // Optional<Long>
) -> Result<()> {
let sql_expressions_obj = env
.get_field(sql_expressions, "sqlExpressions", "Ljava/util/List;")?
.l()?;

let sql_expressions_obj_list = env.get_list(&sql_expressions_obj)?;
let mut expressions: Vec<(String, String)> = Vec::new();

let mut iterator = sql_expressions_obj_list.iter(env)?;

while let Some(item) = iterator.next(env)? {
let name = env
.call_method(&item, "getName", "()Ljava/lang/String;", &[])?
.l()?;
let value = env
.call_method(&item, "getExpression", "()Ljava/lang/String;", &[])?
.l()?;
let key_str: String = env.get_string(&JString::from(name))?.into();
let value_str: String = env.get_string(&JString::from(value))?.into();
expressions.push((key_str, value_str));
}

let rust_transform = NewColumnTransform::SqlExpressions(expressions);

let batch_size = if env.call_method(&batch_size, "isPresent", "()Z", &[])?.z()? {
let batch_size_value = env.get_long_opt(&batch_size)?;
match batch_size_value {
Some(value) => Some(
value
.try_into()
.map_err(|_| Error::input_error("Batch size conversion error".to_string()))?,
),
None => None,
}
} else {
None
};

let mut dataset_guard =
unsafe { env.get_rust_field::<_, _, BlockingDataset>(java_dataset, NATIVE_DATASET) }?;

RT.block_on(
dataset_guard
.inner
.add_columns(rust_transform, None, batch_size),
)?;
Ok(())
}
18 changes: 18 additions & 0 deletions java/core/src/main/java/com/lancedb/lance/Dataset.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import com.lancedb.lance.ipc.LanceScanner;
import com.lancedb.lance.ipc.ScanOptions;
import com.lancedb.lance.schema.ColumnAlteration;
import com.lancedb.lance.schema.SqlExpressions;

import org.apache.arrow.c.ArrowArrayStream;
import org.apache.arrow.c.ArrowSchema;
Expand Down Expand Up @@ -267,6 +268,23 @@ public static native Dataset commitOverwrite(
*/
public static native void drop(String path, Map<String, String> storageOptions);

/**
 * Adds new columns to this dataset, each defined by a named SQL expression. The native call is
 * made while holding the dataset's write lock.
 *
 * @param sqlExpressions the named SQL expressions describing the columns to add
 * @param batchSize the number of rows to read at a time from the source dataset when applying
 *     the transform
 */
public void addColumns(SqlExpressions sqlExpressions, Optional<Long> batchSize) {
  try (LockManager.WriteLock lock = lockManager.acquireWriteLock()) {
    // A zero handle means the native dataset has already been released.
    final boolean datasetOpen = nativeDatasetHandle != 0;
    Preconditions.checkArgument(datasetOpen, "Dataset is closed");
    nativeAddColumnsBySqlExpressions(sqlExpressions, batchSize);
  }
}

/**
 * Native counterpart of {@link #addColumns(SqlExpressions, Optional)}; the implementation lives
 * in the JNI library.
 *
 * @param sqlExpressions the named SQL expressions describing the columns to add
 * @param batchSize the optional number of rows to read at a time when applying the transform
 */
private native void nativeAddColumnsBySqlExpressions(
SqlExpressions sqlExpressions, Optional<Long> batchSize);

/**
* Drop columns from the dataset.
*
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*/

package com.lancedb.lance.schema;

import java.util.ArrayList;
import java.util.List;

/**
 * An ordered collection of named SQL expressions used to add columns to a dataset.
 *
 * <p>Each entry pairs a name with an expression string:
 *
 * <ul>
 *   <li><b>name</b> - the name of the new column.
 *   <li><b>expression</b> - a SQL expression string. It may reference existing columns in the
 *       dataset, and its result becomes the value of the new column.
 * </ul>
 *
 * <p>Instances are created through {@link Builder}.
 */
public class SqlExpressions {

  // Native code reads this field by name ("sqlExpressions") and type (java.util.List);
  // do not rename or retype it.
  private final List<SqlExpression> sqlExpressions;

  private SqlExpressions(List<SqlExpression> sqlExpressions) {
    this.sqlExpressions = sqlExpressions;
  }

  /**
   * Returns the expressions in insertion order.
   *
   * @return the list backing this instance
   */
  public List<SqlExpression> getSqlExpressions() {
    return sqlExpressions;
  }

  /**
   * A single (name, expression) pair. Native code calls {@link #getName()} and {@link
   * #getExpression()} by name, so those accessors must keep their signatures.
   */
  public static class SqlExpression {

    private String name;
    private String expression;

    public SqlExpression() {}

    /** @return the name of the new column */
    public String getName() {
      return name;
    }

    /** @param name the name of the new column */
    public void setName(String name) {
      this.name = name;
    }

    /** @return the SQL expression string */
    public String getExpression() {
      return expression;
    }

    /** @param expression the SQL expression string */
    public void setExpression(String expression) {
      this.expression = expression;
    }
  }

  /** Accumulates expressions and produces {@link SqlExpressions} instances. */
  public static class Builder {

    // The builder owns its own list so that instances returned by build() are not affected by
    // later withExpression() calls.
    private final List<SqlExpression> expressions;

    public Builder() {
      this.expressions = new ArrayList<>();
    }

    /**
     * Appends one named expression.
     *
     * @param name the name of the new column
     * @param expr the SQL expression string
     * @return this builder, for chaining
     */
    public Builder withExpression(String name, String expr) {
      SqlExpression expression = new SqlExpression();
      expression.setName(name);
      expression.setExpression(expr);
      this.expressions.add(expression);
      return this;
    }

    /**
     * Creates a new {@link SqlExpressions} holding a snapshot of the expressions added so far.
     *
     * @return a new instance, independent of further changes to this builder
     */
    public SqlExpressions build() {
      return new SqlExpressions(new ArrayList<>(this.expressions));
    }
  }
}
52 changes: 52 additions & 0 deletions java/core/src/test/java/com/lancedb/lance/DatasetTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
package com.lancedb.lance;

import com.lancedb.lance.schema.ColumnAlteration;
import com.lancedb.lance.schema.SqlExpressions;

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
Expand All @@ -30,6 +31,10 @@
import java.nio.channels.ClosedChannelException;
import java.nio.file.Path;
import java.util.*;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Optional;
import java.util.stream.Collectors;

import static org.junit.jupiter.api.Assertions.*;
Expand Down Expand Up @@ -297,6 +302,53 @@ void testAlterColumns() {
}
}

@Test
void testAddColumnBySqlExpressions() {
  // The anonymous-class trick yields this method's own name, giving each test its own directory.
  String methodName = new Object() {}.getClass().getEnclosingMethod().getName();
  String path = tempDir.resolve(methodName).toString();
  try (RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) {
    TestUtils.SimpleTestDataset simpleDataset =
        new TestUtils.SimpleTestDataset(rootAllocator, path);
    dataset = simpleDataset.createEmptyDataset();

    // Step 1: add "double_id" and check the resulting column count and names.
    SqlExpressions.Builder doubleBuilder = new SqlExpressions.Builder();
    doubleBuilder.withExpression("double_id", "id * 2");
    dataset.addColumns(doubleBuilder.build(), Optional.empty());

    Schema expectedAfterDouble =
        new Schema(
            Arrays.asList(
                Field.nullable("id", new ArrowType.Int(32, true)),
                Field.nullable("name", new ArrowType.Utf8()),
                Field.nullable("double_id", new ArrowType.Int(32, true))),
            null);

    int expectedCountAfterDouble = expectedAfterDouble.getFields().size();
    assertEquals(expectedCountAfterDouble, dataset.getSchema().getFields().size());

    List<String> expectedNamesAfterDouble =
        expectedAfterDouble.getFields().stream().map(Field::getName).collect(Collectors.toList());
    List<String> actualNamesAfterDouble =
        dataset.getSchema().getFields().stream()
            .map(Field::getName)
            .collect(Collectors.toList());
    assertEquals(expectedNamesAfterDouble, actualNamesAfterDouble);

    // Step 2: add "triple_id" on top of the previous result and check again.
    SqlExpressions.Builder tripleBuilder = new SqlExpressions.Builder();
    tripleBuilder.withExpression("triple_id", "id * 3");
    dataset.addColumns(tripleBuilder.build(), Optional.empty());

    Schema expectedAfterTriple =
        new Schema(
            Arrays.asList(
                Field.nullable("id", new ArrowType.Int(32, true)),
                Field.nullable("name", new ArrowType.Utf8()),
                Field.nullable("double_id", new ArrowType.Int(32, true)),
                Field.nullable("triple_id", new ArrowType.Int(32, true))),
            null);

    int expectedCountAfterTriple = expectedAfterTriple.getFields().size();
    assertEquals(expectedCountAfterTriple, dataset.getSchema().getFields().size());

    List<String> expectedNamesAfterTriple =
        expectedAfterTriple.getFields().stream().map(Field::getName).collect(Collectors.toList());
    List<String> actualNamesAfterTriple =
        dataset.getSchema().getFields().stream()
            .map(Field::getName)
            .collect(Collectors.toList());
    assertEquals(expectedNamesAfterTriple, actualNamesAfterTriple);
  }
}

@Test
void testDropPath() {
String testMethodName = new Object() {}.getClass().getEnclosingMethod().getName();
Expand Down
Loading