[WIP] Add examples for Delta Kernel API usage
Anonymous committed Jul 21, 2023
1 parent c78daef commit 1d0c173
Showing 5 changed files with 791 additions and 0 deletions.
79 changes: 79 additions & 0 deletions kernel/examples/table-reader/pom.xml
@@ -0,0 +1,79 @@
<?xml version="1.0" encoding="UTF-8"?>

<!--Copyright (2021) The Delta Lake Project Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.-->

<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<groupId>org.example</groupId>
<artifactId>table-reader</artifactId>
<version>3.0.0-SNAPSHOT</version>

<properties>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<staging.repo.url>""</staging.repo.url>
<delta-kernel.version>3.0.0-SNAPSHOT</delta-kernel.version>
<hadoop.version>3.3.1</hadoop.version>
</properties>

<repositories>
<repository>
<id>staging-repo</id>
<url>${staging.repo.url}</url>
</repository>
</repositories>

<dependencies>
<dependency>
<groupId>io.delta</groupId>
<artifactId>delta-kernel-api</artifactId>
<version>${delta-kernel.version}</version>
</dependency>

<dependency>
<groupId>io.delta</groupId>
<artifactId>delta-kernel-default</artifactId>
<version>${delta-kernel.version}</version>
</dependency>

<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client-runtime</artifactId>
<version>${hadoop.version}</version>
</dependency>

<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client-api</artifactId>
<version>${hadoop.version}</version>
</dependency>

<dependency>
<groupId>commons-cli</groupId>
<artifactId>commons-cli</artifactId>
<version>1.5.0</version>
</dependency>

<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.13.5</version>
</dependency>

</dependencies>
</project>
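
Note that staging.repo.url defaults to the literal string "" in the properties above, so resolving the Kernel SNAPSHOT artifacts from a staging repository presumably requires overriding the property at build time. A hypothetical invocation (the URL is a placeholder, not a value from this commit):

    mvn package -Dstaging.repo.url=<staging-repository-url>
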
188 changes: 188 additions & 0 deletions kernel/examples/table-reader/src/main/java/io/delta/kernel/examples/BaseTableReader.java
@@ -0,0 +1,188 @@
package io.delta.kernel.examples;

import java.io.Closeable;
import java.io.IOException;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configuration;

import io.delta.kernel.TableNotFoundException;
import io.delta.kernel.client.DefaultTableClient;
import io.delta.kernel.client.TableClient;
import io.delta.kernel.data.ColumnVector;
import io.delta.kernel.data.ColumnarBatch;
import io.delta.kernel.data.DataReadResult;
import io.delta.kernel.data.vector.VectorUtils;
import io.delta.kernel.types.StructField;
import io.delta.kernel.types.StructType;
import io.delta.kernel.utils.Utils;

import static java.lang.String.format;
import static java.util.Objects.requireNonNull;

/**
* Base class for reading Delta Lake tables using the Delta Kernel APIs.
*/
public abstract class BaseTableReader
{
public static final long DEFAULT_LIMIT = 20L;

protected final String tablePath;
protected final TableClient tableClient;

public BaseTableReader(String tablePath)
{
this.tablePath = requireNonNull(tablePath);
this.tableClient = DefaultTableClient.create(new Configuration());
}

/**
 * Show at most {@code limit} rows containing the given columns from the table.
*
* @param limit Max number of rows to show.
 * @param columnsOpt If empty, show all columns in the table.
 * @throws TableNotFoundException if no Delta table exists at the given table path
 * @throws IOException if an error occurs while reading the table data
*/
public abstract void show(
long limit,
Optional<List<String>> columnsOpt
) throws TableNotFoundException, IOException;

/**
 * Utility method to return a pruned schema that contains only the given {@code columns}
 * from {@code baseSchema}.
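 * For example, pruning a schema of fields (id, name, address) with
 * {@code Optional.of(Arrays.asList("id", "name"))} yields a schema of (id, name).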
*/
protected static StructType pruneSchema(StructType baseSchema, Optional<List<String>> columns)
{
if (!columns.isPresent()) {
return baseSchema;
}
List<StructField> selectedFields = columns.get().stream().map(column -> {
if (baseSchema.indexOf(column) == -1) {
throw new IllegalArgumentException(
format("Column %s is not found in table", column));
}
return baseSchema.get(column);
}).collect(Collectors.toList());

return new StructType(selectedFields);
}

/**
 * Utility method to print the data in the given data batches. At most {@code maxRows}
 * rows are printed.
 *
 * @param schema Schema of the data being printed.
 * @param dataReadResults Data batches (each with an optional selection vector) to print.
 * @param maxRows Maximum number of rows to print.
*/
protected static void printData(
StructType schema,
List<DataReadResult> dataReadResults,
long maxRows)
{
printSchema(schema);
long printedRowCount = 0;
try {
for (DataReadResult dataReadResult : dataReadResults) {
ColumnarBatch data = dataReadResult.getData();
Optional<ColumnVector> selectionVector = dataReadResult.getSelectionVector();
for (int rowId = 0; rowId < data.getSize(); rowId++) {
if (!selectionVector.isPresent() ||
selectionVector.get().getBoolean(rowId)) {
// Print the row
printRow(data, rowId);
printedRowCount++;
if (printedRowCount == maxRows) {
return;
}
}
}
}
}
finally {
for (DataReadResult dataReadResult : dataReadResults) {
Utils.closeCloseables(
// TODO: ColumnarBatch doesn't have close method - add one
// (Closeable) dataReadResult.getData(),
(Closeable) dataReadResult.getSelectionVector().orElse(null));
}
}
}

protected static void printSchema(StructType schema)
{
System.out.printf(formatter(schema.length()), schema.fieldNames().toArray(new String[0]));
}

protected static void printRow(ColumnarBatch batch, int rowId)
{
int numCols = batch.getSchema().length();
Object[] rowValues = IntStream.range(0, numCols).mapToObj(colOrdinal -> {
ColumnVector columnVector = batch.getColumnVector(colOrdinal);
return VectorUtils.getValueAsObject(columnVector, rowId);
}).toArray();

// TODO: Need to handle the Row, Map, Array, Timestamp, Date types specially to
// print them in the format they need. Copy this code from Spark CLI.

System.out.printf(formatter(numCols), rowValues);
}

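// Builds a printf-style format string, e.g. formatter(3) returns "%20s|%20s|%20s\n".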
private static String formatter(int length)
{
return IntStream.range(0, length)
.mapToObj(i -> "%20s")
.collect(Collectors.joining("|")) + "\n";
}

/**
* Minimum command line options for any implementation of this reader.
*/
protected static Options baseOptions() {
return new Options()
.addRequiredOption("t", "table", true, "Fully qualified table path")
.addOption("c", "columns", true,
"Comma separated list of columns to read from the table. " +
"Ex. --columns=id,name,address")
.addOption(
Option.builder()
.option("l")
.longOpt("limit")
.hasArg(true)
.desc("Maximum number of rows to read from the table (default 20).")
.type(Number.class)
.build()
);
}

/**
* Helper method to parse the command line arguments.
*/
protected static CommandLine parseArgs(Options options, String[] args)
{
CommandLineParser cliParser = new DefaultParser();

try {
return cliParser.parse(options, args);
}
catch (ParseException parseException) {
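    // Invalid arguments: print the usage help, then fall through to exit.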
new HelpFormatter().printHelp(
"java " + SingleThreadedTableReader.class.getCanonicalName(),
options,
true
);
}
System.exit(-1);
return null;
}
}
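
For context, here is a minimal, hypothetical sketch of how a concrete reader might wire these helpers together. Only baseOptions(), parseArgs(...), DEFAULT_LIMIT, pruneSchema(...), printData(...), and the show(...) contract come from the file above; the class ExampleTableReader, its main method, and the empty show body are illustrative placeholders, not code from this commit, and the actual Kernel scan logic is deliberately omitted.

package io.delta.kernel.examples;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;

import org.apache.commons.cli.CommandLine;

import io.delta.kernel.TableNotFoundException;

/** Hypothetical reader: the scan that would produce DataReadResults is omitted. */
public class ExampleTableReader extends BaseTableReader
{
    public ExampleTableReader(String tablePath)
    {
        super(tablePath);
    }

    @Override
    public void show(long limit, Optional<List<String>> columnsOpt)
        throws TableNotFoundException, IOException
    {
        // A real implementation would build a scan with `tableClient`, collect the
        // DataReadResults for the schema pruned via pruneSchema(...), and pass them
        // to printData(schema, dataReadResults, limit).
    }

    public static void main(String[] args) throws Exception
    {
        // parseArgs prints the usage help and exits if the arguments are invalid.
        CommandLine cli = parseArgs(baseOptions(), args);

        String tablePath = cli.getOptionValue("table");
        Optional<List<String>> columns = Optional.ofNullable(cli.getOptionValue("columns"))
            .map(csv -> Arrays.asList(csv.split(",")));
        // The "limit" option was declared with type Number, so commons-cli parses it
        // into a Number; fall back to DEFAULT_LIMIT when the option is absent.
        long limit = cli.hasOption("limit")
            ? ((Number) cli.getParsedOptionValue("limit")).longValue()
            : DEFAULT_LIMIT;

        new ExampleTableReader(tablePath).show(limit, columns);
    }
}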
