-
Notifications
You must be signed in to change notification settings - Fork 1.8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add examples for Delta Kernel API usage
- Loading branch information
1 parent
115abb3
commit 5f9b4e2
Showing
6 changed files
with
873 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
##
## Runs the Delta Kernel example programs against the golden tables located in
## the <repo-root>/connectors/golden-tables/src/main/resources/golden directory.
##
## Make sure to run this script from <repo-root> so that the relative paths
## used for referring to the golden tables work.
##
## Optional environment variables:
##   EXTRA_MAVEN_REPO    passed to Maven as -Dstaging.repo.url (default "___")
##   STANDALONE_VERSION  passed to Maven as -Ddelta-kernel.version
##                       (default "3.0.0-SNAPSHOT")
##

BASEDIR=$(pwd)
echo "$BASEDIR"
GOLDEN_TABLE_DIR="${BASEDIR}/connectors/golden-tables/src/main/resources/golden"

# Stop right away when the script was not started from <repo-root>; otherwise
# every Maven invocation below would run in the wrong directory.
cd kernel/examples/table-reader || exit 1

SINGLE_THREAD_READER="io.delta.kernel.examples.SingleThreadedTableReader"
MULTI_THREADED_READER="io.delta.kernel.examples.MultiThreadedTableReader"

# Builds the example project and runs the given main class once per argument
# string.
#   $1    fully qualified name of the main class to execute
#   $2..  one command line (as a single string) per run
run_reader_tests() {
  local main_class="$1"
  shift

  local test_args
  for test_args in "$@"
  do
    mvn package exec:java \
      -Dexec.cleanupDaemonThreads=false \
      -Dexec.mainClass="${main_class}" \
      -Dstaging.repo.url="${EXTRA_MAVEN_REPO:-___}" \
      -Ddelta-kernel.version="${STANDALONE_VERSION:-3.0.0-SNAPSHOT}" \
      -Dexec.args="${test_args}"
  done
}

declare -a tests_single_threaded=(
  "--table=${GOLDEN_TABLE_DIR}/data-reader-primitives --columns=as_int,as_long --limit=5"
  "--table=${GOLDEN_TABLE_DIR}/data-reader-primitives --columns=as_int,as_long,as_double,as_string --limit=20"
  "--table=${GOLDEN_TABLE_DIR}/data-reader-partition-values --columns=as_string,as_byte,as_list_of_records,as_nested_struct --limit=20"
)

run_reader_tests "${SINGLE_THREAD_READER}" "${tests_single_threaded[@]}"

declare -a tests_multi_threaded=(
  "--table=${GOLDEN_TABLE_DIR}/data-reader-primitives --columns=as_int,as_long --limit=5 --parallelism=5"
  "--table=${GOLDEN_TABLE_DIR}/data-reader-primitives --columns=as_int,as_long,as_double,as_string --limit=20 --parallelism=20"
  "--table=${GOLDEN_TABLE_DIR}/data-reader-partition-values --columns=as_string,as_byte,as_list_of_records,as_nested_struct --limit=20 --parallelism=2"
)

# The multi-threaded reader must be driven by its own test list so that the
# --parallelism cases are actually exercised.
run_reader_tests "${MULTI_THREADED_READER}" "${tests_multi_threaded[@]}"
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
<?xml version="1.0" encoding="UTF-8"?>

<!--
  Copyright (2021) The Delta Lake Project Authors.
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at
  http://www.apache.org/licenses/LICENSE-2.0
  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
-->

<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <!-- Coordinates of the table-reader example project. -->
  <groupId>org.example</groupId>
  <artifactId>table-reader</artifactId>
  <version>3.0.0-SNAPSHOT</version>

  <!--
    Build settings. As with any Maven property, each value can be overridden on
    the command line with -D<name>=<value>.
  -->
  <properties>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
    <staging.repo.url>""</staging.repo.url>
    <delta-kernel.version>3.0.0-SNAPSHOT</delta-kernel.version>
    <hadoop.version>3.3.1</hadoop.version>
  </properties>

  <!-- Additional repository whose URL is taken from the staging.repo.url property. -->
  <repositories>
    <repository>
      <id>staging-repo</id>
      <url>${staging.repo.url}</url>
    </repository>
  </repositories>

  <dependencies>
    <!-- Delta Kernel artifacts, both at the version given by delta-kernel.version. -->
    <dependency>
      <groupId>io.delta</groupId>
      <artifactId>delta-kernel-api</artifactId>
      <version>${delta-kernel.version}</version>
    </dependency>

    <dependency>
      <groupId>io.delta</groupId>
      <artifactId>delta-kernel-default</artifactId>
      <version>${delta-kernel.version}</version>
    </dependency>

    <!-- Hadoop client artifacts, both at the version given by hadoop.version. -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client-runtime</artifactId>
      <version>${hadoop.version}</version>
    </dependency>

    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client-api</artifactId>
      <version>${hadoop.version}</version>
    </dependency>

    <!-- Apache Commons CLI. -->
    <dependency>
      <groupId>commons-cli</groupId>
      <artifactId>commons-cli</artifactId>
      <version>1.5.0</version>
    </dependency>

    <!-- Jackson databind. -->
    <dependency>
      <groupId>com.fasterxml.jackson.core</groupId>
      <artifactId>jackson-databind</artifactId>
      <version>2.13.5</version>
    </dependency>

  </dependencies>
</project>
193 changes: 193 additions & 0 deletions
193
kernel/examples/table-reader/src/main/java/io/delta/kernel/examples/BaseTableReader.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,193 @@ | ||
/* | ||
* Copyright (2023) The Delta Lake Project Authors. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package io.delta.kernel.examples; | ||
|
||
import java.io.IOException; | ||
import java.util.Arrays; | ||
import java.util.List; | ||
import java.util.Optional; | ||
import java.util.stream.Collectors; | ||
import java.util.stream.IntStream; | ||
import org.apache.commons.cli.CommandLine; | ||
import org.apache.commons.cli.CommandLineParser; | ||
import org.apache.commons.cli.DefaultParser; | ||
import org.apache.commons.cli.HelpFormatter; | ||
import org.apache.commons.cli.Option; | ||
import org.apache.commons.cli.Options; | ||
import org.apache.commons.cli.ParseException; | ||
import org.apache.hadoop.conf.Configuration; | ||
|
||
import io.delta.kernel.TableNotFoundException; | ||
import io.delta.kernel.client.DefaultTableClient; | ||
import io.delta.kernel.client.TableClient; | ||
import io.delta.kernel.data.ColumnVector; | ||
import io.delta.kernel.data.ColumnarBatch; | ||
import io.delta.kernel.data.DataReadResult; | ||
import io.delta.kernel.data.vector.VectorUtils; | ||
import io.delta.kernel.types.StructField; | ||
import io.delta.kernel.types.StructType; | ||
|
||
import static java.lang.String.format; | ||
import static java.util.Objects.requireNonNull; | ||
|
||
/** | ||
* Base class for reading Delta Lake tables using the Delta Kernel APIs. | ||
*/ | ||
public abstract class BaseTableReader | ||
{ | ||
public static final int DEFAULT_LIMIT = 20; | ||
|
||
protected final String tablePath; | ||
protected final TableClient tableClient; | ||
|
||
public BaseTableReader(String tablePath) | ||
{ | ||
this.tablePath = requireNonNull(tablePath); | ||
this.tableClient = DefaultTableClient.create(new Configuration()); | ||
} | ||
|
||
/** | ||
* Show the given {@code limit} rows containing the given columns from the table. | ||
* | ||
* @param limit Max number of rows to show. | ||
* @param columnsOpt If null, show all columns in the table. | ||
* @throws TableNotFoundException | ||
* @throws IOException | ||
*/ | ||
public abstract void show(int limit, Optional<List<String>> columnsOpt) | ||
throws TableNotFoundException, IOException; | ||
|
||
/** | ||
* Utility method to return a pruned schema that contains the given {@code columns} from | ||
* {@code baseSchema} | ||
*/ | ||
protected static StructType pruneSchema(StructType baseSchema, Optional<List<String>> columns) | ||
{ | ||
if (!columns.isPresent()) { | ||
return baseSchema; | ||
} | ||
List<StructField> selectedFields = columns.get().stream().map(column -> { | ||
if (baseSchema.indexOf(column) == -1) { | ||
throw new IllegalArgumentException( | ||
format("Column %s is not found in table", column)); | ||
} | ||
return baseSchema.get(column); | ||
}).collect(Collectors.toList()); | ||
|
||
return new StructType(selectedFields); | ||
} | ||
|
||
protected static int printData(DataReadResult dataReadResult, int maxRowsToPrint) | ||
{ | ||
int printedRowCount = 0; | ||
ColumnarBatch data = dataReadResult.getData(); | ||
Optional<ColumnVector> selectionVector = dataReadResult.getSelectionVector(); | ||
for (int rowId = 0; rowId < data.getSize(); rowId++) { | ||
if (!selectionVector.isPresent() || selectionVector.get().getBoolean(rowId)) { | ||
printRow(data, rowId); | ||
printedRowCount++; | ||
if (printedRowCount == maxRowsToPrint) { | ||
break; | ||
} | ||
} | ||
} | ||
return printedRowCount; | ||
} | ||
|
||
protected static void printSchema(StructType schema) | ||
{ | ||
System.out.printf(formatter(schema.length()), schema.fieldNames().toArray(new String[0])); | ||
} | ||
|
||
protected static void printRow(ColumnarBatch batch, int rowId) | ||
{ | ||
int numCols = batch.getSchema().length(); | ||
Object[] rowValues = IntStream.range(0, numCols).mapToObj(colOrdinal -> { | ||
ColumnVector columnVector = batch.getColumnVector(colOrdinal); | ||
return VectorUtils.getValueAsObject(columnVector, rowId); | ||
}).toArray(); | ||
|
||
// TODO: Need to handle the Row, Map, Array, Timestamp, Date types specially to | ||
// print them in the format they need. Copy this code from Spark CLI. | ||
|
||
System.out.printf(formatter(numCols), rowValues); | ||
} | ||
|
||
private static String formatter(int length) | ||
{ | ||
return IntStream.range(0, length) | ||
.mapToObj(i -> "%20s") | ||
.collect(Collectors.joining("|")) + "\n"; | ||
} | ||
|
||
/** | ||
* Minimum command line options for any implementation of this reader. | ||
*/ | ||
protected static Options baseOptions() | ||
{ | ||
return new Options() | ||
.addRequiredOption("t", "table", true, "Fully qualified table path") | ||
.addOption("c", "columns", true, | ||
"Comma separated list of columns to read from the table. " + | ||
"Ex. --columns=id,name,address") | ||
.addOption( | ||
Option.builder() | ||
.option("l") | ||
.longOpt("limit") | ||
.hasArg(true) | ||
.desc("Maximum number of rows to read from the table (default 20).") | ||
.type(Number.class) | ||
.build() | ||
); | ||
} | ||
|
||
/** | ||
* Helper method to parse the command line arguments. | ||
*/ | ||
protected static CommandLine parseArgs(Options options, String[] args) | ||
{ | ||
CommandLineParser cliParser = new DefaultParser(); | ||
|
||
try { | ||
return cliParser.parse(options, args); | ||
} | ||
catch (ParseException parseException) { | ||
new HelpFormatter().printHelp( | ||
"java " + SingleThreadedTableReader.class.getCanonicalName(), | ||
options, | ||
true | ||
); | ||
} | ||
System.exit(-1); | ||
return null; | ||
} | ||
|
||
protected static Optional<List<String>> parseColumnList(CommandLine cli, String optionName) | ||
{ | ||
return Optional.ofNullable(cli.getOptionValue(optionName)) | ||
.map(colString -> Arrays.asList(colString.split(",[ ]*"))); | ||
} | ||
|
||
protected static int parseInt(CommandLine cli, String optionName, int defaultValue) | ||
throws ParseException | ||
{ | ||
return Optional.ofNullable(cli.getParsedOptionValue(optionName)) | ||
.map(Number.class::cast) | ||
.map(Number::intValue) | ||
.orElse(defaultValue); | ||
} | ||
} | ||
|
Oops, something went wrong.