From 7434e2fa91325c064ae418ded6329c75decb866a Mon Sep 17 00:00:00 2001 From: Jacques Nadeau Date: Tue, 7 Jan 2025 14:03:59 -1000 Subject: [PATCH] feat: introduce Iceberg table type using metadata file (#758) Adds Iceberg table type and first sub-variety, reading manifest files directly. --- proto/substrait/algebra.proto | 24 ++++++++++++++++++++++++ site/docs/relations/logical_relations.md | 14 ++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/proto/substrait/algebra.proto b/proto/substrait/algebra.proto index 80e88ded1..16f4b5ff9 100644 --- a/proto/substrait/algebra.proto +++ b/proto/substrait/algebra.proto @@ -114,6 +114,7 @@ message ReadRel { LocalFiles local_files = 6; NamedTable named_table = 7; ExtensionTable extension_table = 8; + IcebergTable iceberg_table = 9; } // A base table. The list of string is used to represent namespacing (e.g., mydb.mytable). @@ -123,6 +124,29 @@ message ReadRel { substrait.extensions.AdvancedExtension advanced_extension = 10; } + // Read an Iceberg Table + message IcebergTable { + oneof table_type { + MetadataFileRead direct = 1; + // future: add catalog table types (e.g. rest api, latest metadata in path, etc) + } + + // Read an Iceberg table using a metadata file. Implicit assumption: required credentials are already known by plan consumer. + message MetadataFileRead { + // the specific uri of a metadata file (e.g. s3://mybucket/mytable/<version>-<uuid>.metadata.json) + string metadata_uri = 1; + + // snapshot options. if none set, uses the current snapshot listed in the metadata file + oneof snapshot { + // the snapshot id to read. + string snapshot_id = 2; + + // the timestamp that should be used to select the snapshot (Time passed in microseconds since 1970-01-01 00:00:00.000000 in UTC) + int64 snapshot_timestamp = 3; + } + } + } + // A table composed of expressions.
message VirtualTable { repeated Expression.Literal.Struct values = 1 [deprecated = true]; diff --git a/site/docs/relations/logical_relations.md b/site/docs/relations/logical_relations.md index aeb2a061d..be51bfae1 100644 --- a/site/docs/relations/logical_relations.md +++ b/site/docs/relations/logical_relations.md @@ -95,6 +95,20 @@ possible approach is that a chunk should only be read if the midpoint of the chu %%% proto.algebra.ReadRel %%% ``` +#### Iceberg Table Type + +An Iceberg Table is a table built on [Apache Iceberg](https://iceberg.apache.org/). Iceberg tables can be read by either directly reading a [metadata file](https://iceberg.apache.org/spec/#table-metadata) or by consulting a [catalog](https://iceberg.apache.org/concepts/catalog/). + +##### Metadata File Reading + +Points to an [Iceberg metadata file](https://iceberg.apache.org/spec/#table-metadata) and uses that as a starting point for reading an Iceberg table. This is the simplest form of Iceberg table access but should be used only for reads. (Writes often also need to update an external catalog.) + +| Property | Description | Required | +| -------- | ---------------------------------------------------------------- | ----------------------- | +| metadata_uri | A URI for an Iceberg metadata file. The current snapshot listed in this file is read unless snapshot_id or snapshot_timestamp is set. | Required | +| snapshot_id | The snapshot that should be read using id. If not provided, the current snapshot is read. Only one of snapshot_id or snapshot_timestamp should be set. | Optional | +| snapshot_timestamp | The snapshot that should be read using timestamp (microseconds since 1970-01-01 00:00:00.000000 UTC). If not provided, the current snapshot is read. Only one of snapshot_id or snapshot_timestamp should be set. | Optional | + ## Filter Operation