From c2b18d1c44afcab49ab1cc3b4105c0d819aead30 Mon Sep 17 00:00:00 2001 From: Rahil Chertara Date: Fri, 9 Feb 2024 13:31:24 -0800 Subject: [PATCH 01/12] Add PrePlanTable and PlanTable Endpoints to open api spec --- open-api/rest-catalog-open-api.py | 162 +++++++++++- open-api/rest-catalog-open-api.yaml | 396 +++++++++++++++++++++++++++- 2 files changed, 552 insertions(+), 6 deletions(-) diff --git a/open-api/rest-catalog-open-api.py b/open-api/rest-catalog-open-api.py index c5af1940c324..f978bc108f0e 100644 --- a/open-api/rest-catalog-open-api.py +++ b/open-api/rest-catalog-open-api.py @@ -80,7 +80,7 @@ class Namespace(BaseModel): class PageToken(BaseModel): __root__: Optional[str] = Field( None, - description='An opaque token that allows clients to make use of pagination for list APIs (e.g. ListTables). Clients may initiate the first paginated request by sending an empty query parameter `pageToken` to the server.\nServers that support pagination should identify the `pageToken` parameter and return a `next-page-token` in the response if there are more results available. After the initial request, the value of `next-page-token` from each response must be used as the `pageToken` parameter value for the next request. The server must return `null` value for the `next-page-token` in the last response.\nServers that support pagination must return all results in a single response with the value of `next-page-token` set to `null` if the query parameter `pageToken` is not set in the request.\nServers that do not support pagination should ignore the `pageToken` parameter and return all results in a single response. The `next-page-token` must be omitted from the response.\nClients must interpret either `null` or missing response value of `next-page-token` as the end of the listing results.', + description='An opaque token that allows clients to make use of pagination for list APIs (e.g. ListTables) as well as for scan-planning APIs (e.g PlanTable). 
Clients may initiate the first paginated request by sending an empty query parameter `pageToken` to the server.\nServers that support pagination should identify the `pageToken` parameter and return a `next-page-token` in the response if there are more results available. After the initial request, the value of `next-page-token` from each response must be used as the `pageToken` parameter value for the next request. The server must return `null` value for the `next-page-token` in the last response.\nServers that support pagination must return all results in a single response with the value of `next-page-token` set to `null` if the query parameter `pageToken` is not set in the request.\nServers that do not support pagination should ignore the `pageToken` parameter and return all results in a single response. The `next-page-token` must be omitted from the response.\nClients must interpret either `null` or missing response value of `next-page-token` as the end of the listing results.', ) @@ -97,6 +97,8 @@ class ExpressionType(BaseModel): __root__: str = Field( ..., example=[ + 'true', + 'false', 'eq', 'and', 'or', @@ -118,6 +120,14 @@ class ExpressionType(BaseModel): ) +class TrueExpression(BaseModel): + type: ExpressionType + + +class FalseExpression(BaseModel): + type: ExpressionType + + class Reference(BaseModel): __root__: str = Field(..., example=['column-name']) @@ -777,8 +787,8 @@ class ContentFile(BaseModel): file_path: str = Field(..., alias='file-path') file_format: FileFormat = Field(..., alias='file-format') spec_id: int = Field(..., alias='spec-id') - partition: Optional[List[PrimitiveTypeValue]] = Field( - None, + partition: List[PrimitiveTypeValue] = Field( + ..., description='A list of partition field values ordered based on the fields of the partition spec specified by the `spec-id`', example=[1, 'bar'], ) @@ -808,6 +818,34 @@ class EqualityDeleteFile(ContentFile): ) +class BaseTableScanContext(BaseModel): + type: str + + +class FieldName(BaseModel): + 
__root__: str = Field( + ..., + description='A field name that follows the Iceberg naming standard, and can be used in APIs like Java `Schema#findField(String name)`.\nThe nested field name follows these rules - nested struct fields are named by concatenating field names at each struct level using dot (`.`) delimiter, e.g. employer.contact_info.address.zip_code - nested fields in a map key are named using the keyword `key`, e.g. employee_address_map.key.first_name - nested fields in a map value are named using the keyword `value`, e.g. employee_address_map.value.zip_code - nested fields in a list are named using the keyword `element`, e.g. employees.element.first_name', + ) + + +class SelectedFieldNames(BaseModel): + """ + A list of fields in schema that are selected in a table scan. When not specified, all columns in the requested schema should be selected. + """ + + __root__: List[FieldName] = Field( + ..., + description='A list of fields in schema that are selected in a table scan. When not specified, all columns in the requested schema should be selected.', + ) + + +class PlanTask(BaseModel): + """ + An opaque JSON object that contains information provided by the REST server to be utilized by clients for distributed table scan planning; should be supplied as input in `PlanTable` operation. 
+ """ + + class CreateNamespaceRequest(BaseModel): namespace: Namespace properties: Optional[Dict[str, str]] = Field( @@ -852,6 +890,11 @@ class ViewRequirement(BaseModel): __root__: AssertViewUUID = Field(..., discriminator='type') +class PreplanTableResult(BaseModel): + plan_tasks: List[PlanTask] = Field(..., alias='plan-tasks') + next_page_token: Optional[PageToken] = Field(None, alias='next-page-token') + + class ReportMetricsRequest2(CommitReport): report_type: str = Field(..., alias='report-type') @@ -968,6 +1011,8 @@ class Type(BaseModel): class Expression(BaseModel): __root__: Union[ + TrueExpression, + FalseExpression, AndOrExpression, NotExpression, SetExpression, @@ -1111,6 +1156,11 @@ class LoadTableResult(BaseModel): config: Optional[Dict[str, str]] = None +class PlanTableResult(BaseModel): + file_scan_tasks: List[FileScanTask] = Field(..., alias='file-scan-tasks') + next_page_token: Optional[PageToken] = Field(None, alias='next-page-token') + + class CommitTableRequest(BaseModel): identifier: Optional[TableIdentifier] = Field( None, @@ -1197,6 +1247,98 @@ class CommitTableResponse(BaseModel): metadata: TableMetadata +class PreplanTableRequest(BaseModel): + table_scan_context: TableScanContext = Field(..., alias='table-scan-context') + + +class PlanTableRequest(BaseModel): + table_scan_context: TableScanContext = Field(..., alias='table-scan-context') + plan_task: Optional[PlanTask] = Field(None, alias='plan-task') + stats_fields: Optional[List[FieldName]] = Field( + None, + alias='stats-fields', + description='A list of fields that the client requests the server to send statistics in each `FileScanTask` returned in the response', + ) + + +class TableScanContext(BaseModel): + __root__: Union[SnapshotScanContext, IncrementalSnapshotScanContext] + + +class SnapshotScanContext(BaseTableScanContext): + """ + context for scanning data in a specific snapshot + """ + + type: Literal['snapshot-scan'] + select: Optional[SelectedFieldNames] = None + filter: 
Optional[Filter] = None + case_sensitive: Optional[bool] = Field( + True, + alias='case-sensitive', + description='If field selection and filtering should be case sensitive', + ) + snapshot_id: Optional[int] = Field( + None, + alias='snapshot-id', + description='The ID of the snapshot to use for the table scan. If not specified, the snapshot at the main branch head will be used.', + ) + use_snapshot_schema: Optional[bool] = Field( + False, + alias='use-snapshot-schema', + description='If the schema of the specific snapshot should be used instead of the table schema.', + ) + + +class IncrementalSnapshotScanContext(BaseTableScanContext): + """ + Context for scanning data appended in a range of snapshots. The scan always follows the schema of the snapshot at the main branch head. + """ + + type: Literal['incremental-snapshot-scan'] + select: Optional[SelectedFieldNames] = None + filter: Optional[Filter] = None + case_sensitive: Optional[bool] = Field( + True, + alias='case-sensitive', + description='If field selection and filtering should be case sensitive', + ) + start_snapshot_id: int = Field( + ..., + alias='start-snapshot-id', + description='The ID of the starting snapshot of the incremental scan', + ) + inclusive_start: Optional[bool] = Field( + False, + alias='inclusive-start', + description='If the data appended in the start snapshot should be included in the scan', + ) + end_snapshot_id: Optional[int] = Field( + None, + alias='end-snapshot-id', + description='The ID of the inclusive ending snapshot of the incremental scan. 
If not specified, the snapshot at the main branch head will be used as the end snapshot.', + ) + + +class FileScanTask(BaseModel): + data_file: DataFile = Field(..., alias='data-file') + position_delete_files: Optional[List[PositionDeleteFile]] = Field( + None, + alias='position-delete-files', + description='a list of position delete files that should be applied to the data file during a scan', + ) + equality_delete_files: Optional[List[EqualityDeleteFile]] = Field( + None, + alias='equality-delete-files', + description='a list of equality delete files that should be applied to the data file during a scan', + ) + residual_filter: Filter = Field( + ..., + alias='residual-filter', + description='the filters should be applied to rows in this file scan task', + ) + + class Schema(StructType): schema_id: Optional[int] = Field(None, alias='schema-id') identifier_field_ids: Optional[List[int]] = Field( @@ -1204,6 +1346,12 @@ class Schema(StructType): ) +class Filter(BaseModel): + """ + an unbounded expression to describe the filters to apply to a table scan, default to `TrueExpression` meaning that nothing is filtered. 
+ """ + + class ReportMetricsRequest1(ScanReport): report_type: str = Field(..., alias='report-type') @@ -1215,6 +1363,14 @@ class ReportMetricsRequest1(ScanReport): TableMetadata.update_forward_refs() ViewMetadata.update_forward_refs() AddSchemaUpdate.update_forward_refs() +PlanTableResult.update_forward_refs() CreateTableRequest.update_forward_refs() CreateViewRequest.update_forward_refs() ReportMetricsRequest.update_forward_refs() +PreplanTableRequest.update_forward_refs() +PlanTableRequest.update_forward_refs() +TableScanContext.update_forward_refs() +SnapshotScanContext.update_forward_refs() +IncrementalSnapshotScanContext.update_forward_refs() +FileScanTask.update_forward_refs() +Filter.update_forward_refs() diff --git a/open-api/rest-catalog-open-api.yaml b/open-api/rest-catalog-open-api.yaml index 4bb73cd44120..f15d68ae9882 100644 --- a/open-api/rest-catalog-open-api.yaml +++ b/open-api/rest-catalog-open-api.yaml @@ -541,6 +541,130 @@ paths: 5XX: $ref: '#/components/responses/ServerErrorResponse' + /v1/{prefix}/namespaces/{namespace}/tables/{table}/preplan: + parameters: + - $ref: '#/components/parameters/prefix' + - $ref: '#/components/parameters/namespace' + - $ref: '#/components/parameters/table' + post: + tags: + - Catalog API + summary: Prepare a list of tasks that can be used to distribute table scan planning + description: + Prepare a list of tasks that can be used to distribute table scan planning based on a set of table scan criteria + such as selected columns, filters, snapshot range, case sensitivity, etc. + + This API returns a list of `plan-task`s, and each of them can be used in the `PlanTable` API + to request a subset of all file scan tasks in a table scan. + This mechanism allows clients to distribute and parallelize the entire table scan planning process. 
+ operationId: PreplanTable + parameters: + - $ref: '#/components/parameters/page-token' + - $ref: '#/components/parameters/page-size' + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/PreplanTableRequest' + responses: + 200: + $ref: '#/components/responses/PreplanTableResponse' + 400: + $ref: '#/components/responses/BadRequestErrorResponse' + 401: + $ref: '#/components/responses/UnauthorizedResponse' + 403: + $ref: '#/components/responses/ForbiddenResponse' + 404: + description: + Not Found + - NoSuchTableException, the table does not exist + - NoSuchNamespaceException, the namespace does not exist + content: + application/json: + schema: + $ref: '#/components/schemas/IcebergErrorResponse' + examples: + TableDoesNotExist: + $ref: '#/components/examples/NoSuchTableError' + NamespaceDoesNotExist: + $ref: '#/components/examples/NoSuchNamespaceError' + 406: + $ref: '#/components/responses/UnsupportedOperationResponse' + 419: + $ref: '#/components/responses/AuthenticationTimeoutResponse' + 503: + $ref: '#/components/responses/ServiceUnavailableResponse' + 5XX: + $ref: '#/components/responses/ServerErrorResponse' + + /v1/{prefix}/namespaces/{namespace}/tables/{table}/plan: + parameters: + - $ref: '#/components/parameters/prefix' + - $ref: '#/components/parameters/namespace' + - $ref: '#/components/parameters/table' + post: + tags: + - Catalog API + summary: Perform scan planning against a table + operationId: PlanTable + parameters: + - $ref: '#/components/parameters/page-token' + - $ref: '#/components/parameters/page-size' + description: + Perform scan planning against a table based on a set of table scan criteria such as selected columns, filters, + snapshot range, case sensitivity, etc. + + An optional `plan-task` can be provided to request only a subset of file scan tasks. + The `plan-task` can be retrieved by invoking the `PreplanTable` endpoint. 
+ + If preplanning using the `PreplanTable` endpoint is required before hitting this endpoint but the client fails + to supply a `plan-task` in the request, then a `421 Misdirected Request` response should be returned to + indicate this requirement. + + If planning a table scan produces too many file scan tasks and the server is unable to return them within its + response size limit, then a `422 Unprocessable Content` response should be returned to indicate that the client + should first attempt to preplan the specific table scan to distribute the planning process and make the content + processable by the server. + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/PlanTableRequest' + responses: + 200: + $ref: '#/components/responses/PlanTableResponse' + 400: + $ref: '#/components/responses/BadRequestErrorResponse' + 401: + $ref: '#/components/responses/UnauthorizedResponse' + 403: + $ref: '#/components/responses/ForbiddenResponse' + 404: + description: + Not Found + - NoSuchTableException, the table does not exist + - NoSuchNamespaceException, the namespace does not exist + content: + application/json: + schema: + $ref: '#/components/schemas/IcebergErrorResponse' + examples: + TableDoesNotExist: + $ref: '#/components/examples/NoSuchTableError' + NamespaceDoesNotExist: + $ref: '#/components/examples/NoSuchNamespaceError' + 419: + $ref: '#/components/responses/AuthenticationTimeoutResponse' + 421: + $ref: '#/components/responses/MisdirectedRequestResponse' + 422: + $ref: '#/components/responses/UnprocessableContentResponse' + 503: + $ref: '#/components/responses/ServiceUnavailableResponse' + 5XX: + $ref: '#/components/responses/ServerErrorResponse' + /v1/{prefix}/namespaces/{namespace}/register: parameters: - $ref: '#/components/parameters/prefix' @@ -629,7 +753,7 @@ paths: The snapshots to return in the body of the metadata. Setting the value to `all` would return the full set of snapshots currently valid for the table. 
Setting the value to `refs` would load all snapshots referenced by branches or tags. - + Default if no param is provided is `all`. required: false schema: @@ -1618,8 +1742,8 @@ components: PageToken: description: - An opaque token that allows clients to make use of pagination for list APIs - (e.g. ListTables). Clients may initiate the first paginated request by sending an empty + An opaque token that allows clients to make use of pagination for list APIs (e.g. ListTables) + as well as for scan-planning APIs (e.g PlanTable). Clients may initiate the first paginated request by sending an empty query parameter `pageToken` to the server. Servers that support pagination should identify the `pageToken` parameter and return a @@ -1758,6 +1882,8 @@ components: Expression: oneOf: + - $ref: '#/components/schemas/TrueExpression' + - $ref: '#/components/schemas/FalseExpression' - $ref: '#/components/schemas/AndOrExpression' - $ref: '#/components/schemas/NotExpression' - $ref: '#/components/schemas/SetExpression' @@ -1767,6 +1893,8 @@ components: ExpressionType: type: string example: + - "true" + - "false" - "eq" - "and" - "or" @@ -1785,6 +1913,24 @@ components: - "is-nan" - "not-nan" + TrueExpression: + type: object + required: + - type + properties: + type: + $ref: '#/components/schemas/ExpressionType' + enum: ["true"] + + FalseExpression: + type: object + required: + - type + properties: + type: + $ref: '#/components/schemas/ExpressionType' + enum: ["false"] + AndOrExpression: type: object required: @@ -2774,6 +2920,30 @@ components: additionalProperties: type: string + PreplanTableResult: + type: object + required: + - plan-tasks + properties: + plan-tasks: + type: array + items: + $ref: '#/components/schemas/PlanTask' + next-page-token: + $ref: '#/components/schemas/PageToken' + + PlanTableResult: + type: object + required: + - file-scan-tasks + properties: + file-scan-tasks: + type: array + items: + $ref: '#/components/schemas/FileScanTask' + next-page-token: + $ref: 
'#/components/schemas/PageToken' + CommitTableRequest: type: object required: @@ -3544,6 +3714,7 @@ components: type: object required: - spec-id + - partition - content - file-path - file-format @@ -3647,6 +3818,176 @@ components: type: integer description: "List of equality field IDs" + PreplanTableRequest: + type: object + required: + - table-scan-context + properties: + table-scan-context: + $ref: '#/components/schemas/TableScanContext' + + PlanTableRequest: + type: object + required: + - table-scan-context + properties: + table-scan-context: + $ref: '#/components/schemas/TableScanContext' + plan-task: + $ref: '#/components/schemas/PlanTask' + stats-fields: + description: + A list of fields that the client requests the server to send statistics + in each `FileScanTask` returned in the response + type: array + items: + $ref: '#/components/schemas/FieldName' + + TableScanContext: + anyOf: + - $ref: '#/components/schemas/SnapshotScanContext' + - $ref: '#/components/schemas/IncrementalSnapshotScanContext' + + BaseTableScanContext: + discriminator: + propertyName: type + mapping: + snapshot-scan: '#/components/schemas/SnapshotScanContext' + incremental-snapshot-scan: '#/components/schemas/IncrementalSnapshotScanContext' + type: object + required: + - type + properties: + type: + type: string + + SnapshotScanContext: + description: context for scanning data in a specific snapshot + type: object + allOf: + - $ref: '#/components/schemas/BaseTableScanContext' + required: + - type + properties: + type: + type: string + enum: ["snapshot-scan"] + select: + $ref: '#/components/schemas/SelectedFieldNames' + filter: + $ref: '#/components/schemas/Filter' + case-sensitive: + description: If field selection and filtering should be case sensitive + type: boolean + default: true + snapshot-id: + description: + The ID of the snapshot to use for the table scan. + If not specified, the snapshot at the main branch head will be used. 
+ type: integer + format: int64 + use-snapshot-schema: + description: + If the schema of the specific snapshot should be used instead of the table schema. + type: boolean + default: false + + IncrementalSnapshotScanContext: + description: + Context for scanning data appended in a range of snapshots. + The scan always follows the schema of the snapshot at the main branch head. + type: object + allOf: + - $ref: '#/components/schemas/BaseTableScanContext' + required: + - type + - start-snapshot-id + properties: + type: + type: string + enum: ["incremental-snapshot-scan"] + select: + $ref: '#/components/schemas/SelectedFieldNames' + filter: + $ref: '#/components/schemas/Filter' + case-sensitive: + description: If field selection and filtering should be case sensitive + type: boolean + default: true + start-snapshot-id: + description: The ID of the starting snapshot of the incremental scan + type: integer + format: int64 + inclusive-start: + description: If the data appended in the start snapshot should be included in the scan + type: boolean + default: false + end-snapshot-id: + description: + The ID of the inclusive ending snapshot of the incremental scan. + If not specified, the snapshot at the main branch head will be used as the end snapshot. + type: integer + format: int64 + + FieldName: + description: + A field name that follows the Iceberg naming standard, and can be used in APIs like + Java `Schema#findField(String name)`. + + The nested field name follows these rules + - nested struct fields are named by concatenating field names at each struct level using dot (`.`) delimiter, + e.g. employer.contact_info.address.zip_code + - nested fields in a map key are named using the keyword `key`, e.g. employee_address_map.key.first_name + - nested fields in a map value are named using the keyword `value`, e.g. employee_address_map.value.zip_code + - nested fields in a list are named using the keyword `element`, e.g. 
employees.element.first_name + type: string + + SelectedFieldNames: + description: + A list of fields in schema that are selected in a table scan. + When not specified, all columns in the requested schema should be selected. + type: array + items: + $ref: '#/components/schemas/FieldName' + + Filter: + description: + an unbounded expression to describe the filters to apply to a table scan, + default to `TrueExpression` meaning that nothing is filtered. + allOf: + - $ref: '#/components/schemas/Expression' + default: { "type": "true" } + + PlanTask: + description: + An opaque JSON object that contains information provided by the REST server + to be utilized by clients for distributed table scan planning; should be supplied + as input in `PlanTable` operation. + type: object + + FileScanTask: + type: object + required: + - data-file + - residual-filter + properties: + data-file: + $ref: '#/components/schemas/DataFile' + position-delete-files: + description: a list of position delete files that should be applied to the data file during a scan + type: array + items: + $ref: '#/components/schemas/PositionDeleteFile' + equality-delete-files: + description: a list of equality delete files that should be applied to the data file during a scan + type: array + items: + $ref: '#/components/schemas/EqualityDeleteFile' + residual-filter: + description: the filters should be applied to rows in this file scan task + allOf: + - $ref: '#/components/schemas/Filter' + ############################# # Reusable Response Objects # ############################# @@ -3809,6 +4150,41 @@ components: } } + # Note that this is a representative example response for use as a shorthand in the spec. + # The fields `message` and `type` as indicated here are not presently prescriptive. + MisdirectedRequestResponse: + description: + Misdirected Request. The request was directed to a server that is not able to produce a response. 
+ content: + application/json: + schema: + $ref: '#/components/schemas/IcebergErrorResponse' + example: { + "error": { + "message": "Fail to plan table scan: must perform preplan before planning a table scan", + "type": "PreplanTableRequiredException", + "code": 421 + } + } + + # Note that this is a representative example response for use as a shorthand in the spec. + # The fields `message` and `type` as indicated here are not presently prescriptive. + UnprocessableContentResponse: + description: + Unprocessable Content. The server understands the content type of the request entity, and the syntax of the + request entity is correct, but it was unable to process the contained instructions. + content: + application/json: + schema: + $ref: '#/components/schemas/IcebergErrorResponse' + example: { + "error": { + "message": "Fail to plan table scan: too many file scan tasks. Please use preplan to distribute table scan.", + "type": "TooManyFileScanTasksException", + "code": 422 + } + } + ServiceUnavailableResponse: description: The service is not ready to handle the request. The client should wait and retry. 
@@ -3863,6 +4239,20 @@ components: schema: $ref: '#/components/schemas/LoadTableResult' + PreplanTableResponse: + description: Result of preplanning a table scan + content: + application/json: + schema: + $ref: '#/components/schemas/PreplanTableResult' + + PlanTableResponse: + description: Result of planning a table scan + content: + application/json: + schema: + $ref: '#/components/schemas/PlanTableResult' + LoadTableResponse: description: Table metadata result when loading a table content: From b8c45facfab509dc8d77ae23d19c2f19ea1c2e61 Mon Sep 17 00:00:00 2001 From: Rahil Chertara Date: Thu, 25 Jul 2024 00:22:42 -0700 Subject: [PATCH 02/12] Revise spec based on ryan recent comments --- open-api/rest-catalog-open-api.py | 217 ++++++++++++++++++---------- open-api/rest-catalog-open-api.yaml | 191 +++++++++++------------- 2 files changed, 227 insertions(+), 181 deletions(-) diff --git a/open-api/rest-catalog-open-api.py b/open-api/rest-catalog-open-api.py index f978bc108f0e..2f1f289b1c75 100644 --- a/open-api/rest-catalog-open-api.py +++ b/open-api/rest-catalog-open-api.py @@ -818,8 +818,8 @@ class EqualityDeleteFile(ContentFile): ) -class BaseTableScanContext(BaseModel): - type: str +class DeleteFile(BaseModel): + __root__: Union[PositionDeleteFile, EqualityDeleteFile] class FieldName(BaseModel): @@ -829,17 +829,6 @@ class FieldName(BaseModel): ) -class SelectedFieldNames(BaseModel): - """ - A list of fields in schema that are selected in a table scan. When not specified, all columns in the requested schema should be selected. - """ - - __root__: List[FieldName] = Field( - ..., - description='A list of fields in schema that are selected in a table scan. When not specified, all columns in the requested schema should be selected.', - ) - - class PlanTask(BaseModel): """ An opaque JSON object that contains information provided by the REST server to be utilized by clients for distributed table scan planning; should be supplied as input in `PlanTable` operation. 
@@ -1158,6 +1147,11 @@ class LoadTableResult(BaseModel): class PlanTableResult(BaseModel): file_scan_tasks: List[FileScanTask] = Field(..., alias='file-scan-tasks') + delete_files: Optional[List[DeleteFile]] = Field( + None, + alias='delete-files', + description='A list of delete files that can be either positional or equality. If the client does not recognize the type of delete file being returned by the service it should immediately throw an exception that it does not support this type.', + ) next_page_token: Optional[PageToken] = Field(None, alias='next-page-token') @@ -1247,95 +1241,181 @@ class CommitTableResponse(BaseModel): metadata: TableMetadata -class PreplanTableRequest(BaseModel): - table_scan_context: TableScanContext = Field(..., alias='table-scan-context') +class PreplanTableRequest1(BaseModel): + snapshot_id: int = Field( + ..., + alias='snapshot-id', + description='The ID of the snapshot to use for the table scan.', + ) + select: Optional[List[FieldName]] = Field( + None, + description='A list of fields in schema that are selected in a table scan. When not specified, all columns in the requested schema should be selected.', + ) + filter: Optional[Expression] = Field( + None, + description='an unbounded expression to describe the filters to apply to a table scan,', + ) + case_sensitive: Optional[bool] = Field( + True, + alias='case-sensitive', + description='If field selection and filtering should be case sensitive', + ) + use_snapshot_schema: Optional[bool] = Field( + False, + alias='use-snapshot-schema', + description='If the client is performing time travel, the snapshot schema should be used. 
For clients performing a plan for a branch, should default to using the table schema.', + ) + start_snapshot_id: Optional[int] = Field( + None, + alias='start-snapshot-id', + description='The ID of the starting snapshot of the incremental scan', + ) + end_snapshot_id: Optional[int] = Field( + None, + alias='end-snapshot-id', + description='The ID of the inclusive ending snapshot of the incremental scan. If not specified, the snapshot at the main branch head will be used as the end snapshot.', + ) -class PlanTableRequest(BaseModel): - table_scan_context: TableScanContext = Field(..., alias='table-scan-context') - plan_task: Optional[PlanTask] = Field(None, alias='plan-task') - stats_fields: Optional[List[FieldName]] = Field( +class PreplanTableRequest2(BaseModel): + snapshot_id: Optional[int] = Field( None, - alias='stats-fields', - description='A list of fields that the client requests the server to send statistics in each `FileScanTask` returned in the response', + alias='snapshot-id', + description='The ID of the snapshot to use for the table scan.', + ) + select: Optional[List[FieldName]] = Field( + None, + description='A list of fields in schema that are selected in a table scan. When not specified, all columns in the requested schema should be selected.', + ) + filter: Optional[Expression] = Field( + None, + description='an unbounded expression to describe the filters to apply to a table scan,', + ) + case_sensitive: Optional[bool] = Field( + True, + alias='case-sensitive', + description='If field selection and filtering should be case sensitive', + ) + use_snapshot_schema: Optional[bool] = Field( + False, + alias='use-snapshot-schema', + description='If the client is performing time travel, the snapshot schema should be used. 
For clients performing a plan for a branch, should default to using the table schema.', + ) + start_snapshot_id: int = Field( + ..., + alias='start-snapshot-id', + description='The ID of the starting snapshot of the incremental scan', + ) + end_snapshot_id: int = Field( + ..., + alias='end-snapshot-id', + description='The ID of the inclusive ending snapshot of the incremental scan. If not specified, the snapshot at the main branch head will be used as the end snapshot.', ) -class TableScanContext(BaseModel): - __root__: Union[SnapshotScanContext, IncrementalSnapshotScanContext] - +class PreplanTableRequest(BaseModel): + __root__: Union[PreplanTableRequest1, PreplanTableRequest2] -class SnapshotScanContext(BaseTableScanContext): - """ - context for scanning data in a specific snapshot - """ - type: Literal['snapshot-scan'] - select: Optional[SelectedFieldNames] = None - filter: Optional[Filter] = None +class PlanTableRequest1(BaseModel): + plan_task: Optional[PlanTask] = Field(None, alias='plan-task') + snapshot_id: int = Field( + ..., + alias='snapshot-id', + description='The ID of the snapshot to use for the table scan.', + ) + select: Optional[List[FieldName]] = Field( + None, + description='A list of fields in schema that are selected in a table scan. When not specified, all columns in the requested schema should be selected.', + ) + filter: Optional[Expression] = Field( + None, + description='an unbounded expression to describe the filters to apply to a table scan,', + ) case_sensitive: Optional[bool] = Field( True, alias='case-sensitive', description='If field selection and filtering should be case sensitive', ) - snapshot_id: Optional[int] = Field( - None, - alias='snapshot-id', - description='The ID of the snapshot to use for the table scan. 
If not specified, the snapshot at the main branch head will be used.', - ) use_snapshot_schema: Optional[bool] = Field( False, alias='use-snapshot-schema', - description='If the schema of the specific snapshot should be used instead of the table schema.', + description='If the client is performing time travel, the snapshot schema should be used. For clients performing a plan for a branch, should default to using the table schema.', + ) + start_snapshot_id: Optional[int] = Field( + None, + alias='start-snapshot-id', + description='The ID of the starting snapshot of the incremental scan', + ) + end_snapshot_id: Optional[int] = Field( + None, + alias='end-snapshot-id', + description='The ID of the inclusive ending snapshot of the incremental scan. If not specified, the snapshot at the main branch head will be used as the end snapshot.', + ) + stats_fields: Optional[List[FieldName]] = Field( + None, + alias='stats-fields', + description='A list of fields that the client requests the server to send statistics in each `FileScanTask` returned in the response', ) -class IncrementalSnapshotScanContext(BaseTableScanContext): - """ - Context for scanning data appended in a range of snapshots. The scan always follows the schema of the snapshot at the main branch head. - """ - - type: Literal['incremental-snapshot-scan'] - select: Optional[SelectedFieldNames] = None - filter: Optional[Filter] = None +class PlanTableRequest2(BaseModel): + plan_task: Optional[PlanTask] = Field(None, alias='plan-task') + snapshot_id: Optional[int] = Field( + None, + alias='snapshot-id', + description='The ID of the snapshot to use for the table scan.', + ) + select: Optional[List[FieldName]] = Field( + None, + description='A list of fields in schema that are selected in a table scan. 
When not specified, all columns in the requested schema should be selected.', + ) + filter: Optional[Expression] = Field( + None, + description='an unbounded expression to describe the filters to apply to a table scan,', + ) case_sensitive: Optional[bool] = Field( True, alias='case-sensitive', description='If field selection and filtering should be case sensitive', ) + use_snapshot_schema: Optional[bool] = Field( + False, + alias='use-snapshot-schema', + description='If the client is performing time travel, the snapshot schema should be used. For clients performing a plan for a branch, should default to using the table schema.', + ) start_snapshot_id: int = Field( ..., alias='start-snapshot-id', description='The ID of the starting snapshot of the incremental scan', ) - inclusive_start: Optional[bool] = Field( - False, - alias='inclusive-start', - description='If the data appended in the start snapshot should be included in the scan', - ) - end_snapshot_id: Optional[int] = Field( - None, + end_snapshot_id: int = Field( + ..., alias='end-snapshot-id', description='The ID of the inclusive ending snapshot of the incremental scan. 
If not specified, the snapshot at the main branch head will be used as the end snapshot.', ) + stats_fields: Optional[List[FieldName]] = Field( + None, + alias='stats-fields', + description='A list of fields that the client requests the server to send statistics in each `FileScanTask` returned in the response', + ) + + +class PlanTableRequest(BaseModel): + __root__: Union[PlanTableRequest1, PlanTableRequest2] class FileScanTask(BaseModel): data_file: DataFile = Field(..., alias='data-file') - position_delete_files: Optional[List[PositionDeleteFile]] = Field( + delete_files_references: Optional[List[int]] = Field( None, - alias='position-delete-files', - description='a list of position delete files that should be applied to the data file during a scan', + alias='delete-files-references', + description='A list of positional indices that correspond to a delete files array.', ) - equality_delete_files: Optional[List[EqualityDeleteFile]] = Field( + residual_filter: Optional[Expression] = Field( None, - alias='equality-delete-files', - description='a list of equality delete files that should be applied to the data file during a scan', - ) - residual_filter: Filter = Field( - ..., alias='residual-filter', - description='the filters should be applied to rows in this file scan task', + description='An optional filter to be applied to rows in this file scan task. If the residual is not present, the client should calculate this or the original filter should be used.', ) @@ -1346,12 +1426,6 @@ class Schema(StructType): ) -class Filter(BaseModel): - """ - an unbounded expression to describe the filters to apply to a table scan, default to `TrueExpression` meaning that nothing is filtered. 
- """ - - class ReportMetricsRequest1(ScanReport): report_type: str = Field(..., alias='report-type') @@ -1367,10 +1441,3 @@ class ReportMetricsRequest1(ScanReport): CreateTableRequest.update_forward_refs() CreateViewRequest.update_forward_refs() ReportMetricsRequest.update_forward_refs() -PreplanTableRequest.update_forward_refs() -PlanTableRequest.update_forward_refs() -TableScanContext.update_forward_refs() -SnapshotScanContext.update_forward_refs() -IncrementalSnapshotScanContext.update_forward_refs() -FileScanTask.update_forward_refs() -Filter.update_forward_refs() diff --git a/open-api/rest-catalog-open-api.yaml b/open-api/rest-catalog-open-api.yaml index f15d68ae9882..d2fa9580c041 100644 --- a/open-api/rest-catalog-open-api.yaml +++ b/open-api/rest-catalog-open-api.yaml @@ -2941,6 +2941,14 @@ components: type: array items: $ref: '#/components/schemas/FileScanTask' + delete-files: + description: + A list of delete files that can be either positional or equality. + If the client does not recognize the type of delete file being returned by the service + it should immediately throw an exception that it does not support this type. 
+ type: array + items: + $ref: '#/components/schemas/DeleteFile' next-page-token: $ref: '#/components/schemas/PageToken' @@ -3818,116 +3826,111 @@ components: type: integer description: "List of equality field IDs" - PreplanTableRequest: - type: object - required: - - table-scan-context - properties: - table-scan-context: - $ref: '#/components/schemas/TableScanContext' + DeleteFile: + oneOf: + - $ref: '#/components/schemas/PositionDeleteFile' + - $ref: '#/components/schemas/EqualityDeleteFile' - PlanTableRequest: + PreplanTableRequest: type: object - required: - - table-scan-context + oneOf: + - required: + - snapshot-id + - required: + - start-snapshot-id + - end-snapshot-id properties: - table-scan-context: - $ref: '#/components/schemas/TableScanContext' - plan-task: - $ref: '#/components/schemas/PlanTask' - stats-fields: + snapshot-id: description: - A list of fields that the client requests the server to send statistics - in each `FileScanTask` returned in the response + The ID of the snapshot to use for the table scan. + type: integer + format: int64 + select: + description: + A list of fields in schema that are selected in a table scan. + When not specified, all columns in the requested schema should be selected. 
type: array items: $ref: '#/components/schemas/FieldName' - - TableScanContext: - anyOf: - - $ref: '#/components/schemas/SnapshotScanContext' - - $ref: '#/components/schemas/IncrementalSnapshotScanContext' - - BaseTableScanContext: - discriminator: - propertyName: type - mapping: - snapshot-scan: '#/components/schemas/SnapshotScanContext' - incremental-snapshot-scan: '#/components/schemas/IncrementalSnapshotScanContext' - type: object - required: - - type - properties: - type: - type: string - - SnapshotScanContext: - description: context for scanning data in a specific snapshot - type: object - allOf: - - $ref: '#/components/schemas/BaseTableScanContext' - required: - - type - properties: - type: - type: string - enum: ["snapshot-scan"] - select: - $ref: '#/components/schemas/SelectedFieldNames' filter: - $ref: '#/components/schemas/Filter' + description: + an unbounded expression to describe the filters to apply to a table scan, + $ref: '#/components/schemas/Expression' case-sensitive: description: If field selection and filtering should be case sensitive type: boolean default: true - snapshot-id: - description: - The ID of the snapshot to use for the table scan. - If not specified, the snapshot at the main branch head will be used. - type: integer - format: int64 use-snapshot-schema: description: - If the schema of the specific snapshot should be used instead of the table schema. + If the client is performing time travel, the snapshot schema should be used. + For clients performing a plan for a branch, should default to using the table schema. type: boolean default: false + start-snapshot-id: + description: The ID of the starting snapshot of the incremental scan + type: integer + format: int64 + end-snapshot-id: + description: + The ID of the inclusive ending snapshot of the incremental scan. + If not specified, the snapshot at the main branch head will be used as the end snapshot. 
+ type: integer + format: int64 - IncrementalSnapshotScanContext: - description: - Context for scanning data appended in a range of snapshots. - The scan always follows the schema of the snapshot at the main branch head. + PlanTableRequest: type: object - allOf: - - $ref: '#/components/schemas/BaseTableScanContext' - required: - - type - - start-snapshot-id + oneOf: + - required: + - snapshot-id + - required: + - start-snapshot-id + - end-snapshot-id properties: - type: - type: string - enum: ["incremental-snapshot-scan"] + plan-task: + $ref: '#/components/schemas/PlanTask' + snapshot-id: + description: + The ID of the snapshot to use for the table scan. + type: integer + format: int64 select: - $ref: '#/components/schemas/SelectedFieldNames' + description: + A list of fields in schema that are selected in a table scan. + When not specified, all columns in the requested schema should be selected. + type: array + items: + $ref: '#/components/schemas/FieldName' filter: - $ref: '#/components/schemas/Filter' + description: + an unbounded expression to describe the filters to apply to a table scan, + $ref: '#/components/schemas/Expression' case-sensitive: description: If field selection and filtering should be case sensitive type: boolean default: true + use-snapshot-schema: + description: + If the client is performing time travel, the snapshot schema should be used. + For clients performing a plan for a branch, should default to using the table schema. + type: boolean + default: false start-snapshot-id: description: The ID of the starting snapshot of the incremental scan type: integer format: int64 - inclusive-start: - description: If the data appended in the start snapshot should be included in the scan - type: boolean - default: false end-snapshot-id: description: The ID of the inclusive ending snapshot of the incremental scan. If not specified, the snapshot at the main branch head will be used as the end snapshot. 
type: integer format: int64 + stats-fields: + description: + A list of fields that the client requests the server to send statistics + in each `FileScanTask` returned in the response + type: array + items: + $ref: '#/components/schemas/FieldName' FieldName: description: @@ -3942,22 +3945,6 @@ components: - nested fields in a list are named using the keyword `element`, e.g. employees.element.first_name type: string - SelectedFieldNames: - description: - A list of fields in schema that are selected in a table scan. - When not specified, all columns in the requested schema should be selected. - type: array - items: - $ref: '#/components/schemas/FieldName' - - Filter: - description: - an unbounded expression to describe the filters to apply to a table scan, - default to `TrueExpression` meaning that nothing is filtered. - allOf: - - $ref: '#/components/schemas/Expression' - default: { "type": "true" } - PlanTask: description: An opaque JSON object that contains information provided by the REST server @@ -3969,24 +3956,20 @@ components: type: object required: - data-file - - residual-filter properties: data-file: $ref: '#/components/schemas/DataFile' - position-delete-files: - description: a list of position delete files that should be applied to the data file during a scan - type: array - items: - $ref: '#/components/schemas/PositionDeleteFile' - equality-delete-files: - description: a list of equality delete files that should be applied to the data file during a scan + delete-files-references: + description: A list of positional indices that correspond to a delete files array. type: array items: - $ref: '#/components/schemas/EqualityDeleteFile' + type: integer residual-filter: - description: the filters should be applied to rows in this file scan task + description: + An optional filter to be applied to rows in this file scan task. + If the residual is not present, the client should calculate this or the original filter should be used. 
allOf: - - $ref: '#/components/schemas/Filter' + - $ref: '#/components/schemas/Expression' ############################# # Reusable Response Objects # @@ -4150,8 +4133,6 @@ components: } } - # Note that this is a representative example response for use as a shorthand in the spec. - # The fields `message` and `type` as indicated here are not presently prescriptive. MisdirectedRequestResponse: description: Misdirected Request. The request was directed to a server that is not able to produce a response. @@ -4167,8 +4148,6 @@ components: } } - # Note that this is a representative example response for use as a shorthand in the spec. - # The fields `message` and `type` as indicated here are not presently prescriptive. UnprocessableContentResponse: description: Unprocessable Content. The server understands the content type of the request entity, and the syntax of the From b22036f40485b77c0f07a693702cfd6c216d052a Mon Sep 17 00:00:00 2001 From: Rahil Chertara Date: Fri, 26 Jul 2024 00:11:08 -0700 Subject: [PATCH 03/12] add discriminator for DeleteFile --- open-api/rest-catalog-open-api.py | 10 ++++++---- open-api/rest-catalog-open-api.yaml | 15 ++++++++++----- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/open-api/rest-catalog-open-api.py b/open-api/rest-catalog-open-api.py index 2f1f289b1c75..e4867a67082d 100644 --- a/open-api/rest-catalog-open-api.py +++ b/open-api/rest-catalog-open-api.py @@ -818,10 +818,6 @@ class EqualityDeleteFile(ContentFile): ) -class DeleteFile(BaseModel): - __root__: Union[PositionDeleteFile, EqualityDeleteFile] - - class FieldName(BaseModel): __root__: str = Field( ..., @@ -937,6 +933,12 @@ class DataFile(ContentFile): ) +class DeleteFile(BaseModel): + __root__: Union[PositionDeleteFile, EqualityDeleteFile] = Field( + ..., discriminator='content' + ) + + class Term(BaseModel): __root__: Union[Reference, TransformTerm] diff --git a/open-api/rest-catalog-open-api.yaml b/open-api/rest-catalog-open-api.yaml index 
d2fa9580c041..22799eb9dfdf 100644 --- a/open-api/rest-catalog-open-api.yaml +++ b/open-api/rest-catalog-open-api.yaml @@ -3801,6 +3801,16 @@ components: - $ref: '#/components/schemas/ValueMap' description: "Map of column id to upper bound primitive type values" + DeleteFile: + discriminator: + propertyName: content + mapping: + position-deletes: '#/components/schemas/PositionDeleteFile' + equality-deletes: '#/components/schemas/EqualityDeleteFile' + oneOf: + - $ref: '#/components/schemas/PositionDeleteFile' + - $ref: '#/components/schemas/EqualityDeleteFile' + PositionDeleteFile: allOf: - $ref: '#/components/schemas/ContentFile' @@ -3826,11 +3836,6 @@ components: type: integer description: "List of equality field IDs" - DeleteFile: - oneOf: - - $ref: '#/components/schemas/PositionDeleteFile' - - $ref: '#/components/schemas/EqualityDeleteFile' - PreplanTableRequest: type: object oneOf: From c86351b455c5e286e252bd1fea16c58f00e215ce Mon Sep 17 00:00:00 2001 From: Rahil Chertara Date: Sun, 28 Jul 2024 17:56:35 -0700 Subject: [PATCH 04/12] Remove required props for snapshot id and clarify description --- open-api/rest-catalog-open-api.py | 96 ++--------------------------- open-api/rest-catalog-open-api.yaml | 18 ++---- 2 files changed, 11 insertions(+), 103 deletions(-) diff --git a/open-api/rest-catalog-open-api.py b/open-api/rest-catalog-open-api.py index e4867a67082d..6135acc5bdc7 100644 --- a/open-api/rest-catalog-open-api.py +++ b/open-api/rest-catalog-open-api.py @@ -1243,9 +1243,9 @@ class CommitTableResponse(BaseModel): metadata: TableMetadata -class PreplanTableRequest1(BaseModel): - snapshot_id: int = Field( - ..., +class PreplanTableRequest(BaseModel): + snapshot_id: Optional[int] = Field( + None, alias='snapshot-id', description='The ID of the snapshot to use for the table scan.', ) @@ -1279,7 +1279,8 @@ class PreplanTableRequest1(BaseModel): ) -class PreplanTableRequest2(BaseModel): +class PlanTableRequest(BaseModel): + plan_task: Optional[PlanTask] =
Field(None, alias='plan-task') snapshot_id: Optional[int] = Field( None, alias='snapshot-id', @@ -1303,47 +1304,6 @@ class PreplanTableRequest2(BaseModel): alias='use-snapshot-schema', description='If the client is performing time travel, the snapshot schema should be used. For clients performing a plan for a branch, should default to using the table schema.', ) - start_snapshot_id: int = Field( - ..., - alias='start-snapshot-id', - description='The ID of the starting snapshot of the incremental scan', - ) - end_snapshot_id: int = Field( - ..., - alias='end-snapshot-id', - description='The ID of the inclusive ending snapshot of the incremental scan. If not specified, the snapshot at the main branch head will be used as the end snapshot.', - ) - - -class PreplanTableRequest(BaseModel): - __root__: Union[PreplanTableRequest1, PreplanTableRequest2] - - -class PlanTableRequest1(BaseModel): - plan_task: Optional[PlanTask] = Field(None, alias='plan-task') - snapshot_id: int = Field( - ..., - alias='snapshot-id', - description='The ID of the snapshot to use for the table scan.', - ) - select: Optional[List[FieldName]] = Field( - None, - description='A list of fields in schema that are selected in a table scan. When not specified, all columns in the requested schema should be selected.', - ) - filter: Optional[Expression] = Field( - None, - description='an unbounded expression to describe the filters to apply to a table scan,', - ) - case_sensitive: Optional[bool] = Field( - True, - alias='case-sensitive', - description='If field selection and filtering should be case sensitive', - ) - use_snapshot_schema: Optional[bool] = Field( - False, - alias='use-snapshot-schema', - description='If the client is performing time travel, the snapshot schema should be used. 
For clients performing a plan for a branch, should default to using the table schema.', - ) start_snapshot_id: Optional[int] = Field( None, alias='start-snapshot-id', @@ -1361,52 +1321,6 @@ class PlanTableRequest1(BaseModel): ) -class PlanTableRequest2(BaseModel): - plan_task: Optional[PlanTask] = Field(None, alias='plan-task') - snapshot_id: Optional[int] = Field( - None, - alias='snapshot-id', - description='The ID of the snapshot to use for the table scan.', - ) - select: Optional[List[FieldName]] = Field( - None, - description='A list of fields in schema that are selected in a table scan. When not specified, all columns in the requested schema should be selected.', - ) - filter: Optional[Expression] = Field( - None, - description='an unbounded expression to describe the filters to apply to a table scan,', - ) - case_sensitive: Optional[bool] = Field( - True, - alias='case-sensitive', - description='If field selection and filtering should be case sensitive', - ) - use_snapshot_schema: Optional[bool] = Field( - False, - alias='use-snapshot-schema', - description='If the client is performing time travel, the snapshot schema should be used. For clients performing a plan for a branch, should default to using the table schema.', - ) - start_snapshot_id: int = Field( - ..., - alias='start-snapshot-id', - description='The ID of the starting snapshot of the incremental scan', - ) - end_snapshot_id: int = Field( - ..., - alias='end-snapshot-id', - description='The ID of the inclusive ending snapshot of the incremental scan. 
If not specified, the snapshot at the main branch head will be used as the end snapshot.', - ) - stats_fields: Optional[List[FieldName]] = Field( - None, - alias='stats-fields', - description='A list of fields that the client requests the server to send statistics in each `FileScanTask` returned in the response', - ) - - -class PlanTableRequest(BaseModel): - __root__: Union[PlanTableRequest1, PlanTableRequest2] - - class FileScanTask(BaseModel): data_file: DataFile = Field(..., alias='data-file') delete_files_references: Optional[List[int]] = Field( diff --git a/open-api/rest-catalog-open-api.yaml b/open-api/rest-catalog-open-api.yaml index 22799eb9dfdf..76ba0ad0853a 100644 --- a/open-api/rest-catalog-open-api.yaml +++ b/open-api/rest-catalog-open-api.yaml @@ -554,6 +554,9 @@ paths: Prepare a list of tasks that can be used to distribute table scan planning based on a set of table scan criteria such as selected columns, filters, snapshot range, case sensitivity, etc. + Requires that client specifies only a `snapshot-id` for a regular scan, or for performing incremental scans only provide + a `start-snapshot-id` and an `end-snapshot-id`. + This API returns a list of `plan-task`s, and each of them can be used in the `PlanTable` API to request a subset of all file scan tasks in a table scan. This mechanism allows clients to distribute and parallelize the entire table scan planning process. @@ -615,6 +618,9 @@ paths: Perform scan planning against a table based on a set of table scan criteria such as selected columns, filters, snapshot range, case sensitivity, etc. + Requires that client specifies only a `snapshot-id` for a regular scan, or for performing incremental scans only provide + a `start-snapshot-id` and an `end-snapshot-id`. + An optional `plan-task` can be provided to request only a subset of file scan tasks. The `plan-task` can be retrieved by invoking the `PreplanTable` endpoint. 
@@ -3838,12 +3844,6 @@ components: PreplanTableRequest: type: object - oneOf: - - required: - - snapshot-id - - required: - - start-snapshot-id - - end-snapshot-id properties: snapshot-id: description: @@ -3884,12 +3884,6 @@ components: PlanTableRequest: type: object - oneOf: - - required: - - snapshot-id - - required: - - start-snapshot-id - - end-snapshot-id properties: plan-task: $ref: '#/components/schemas/PlanTask' From 8594f43c8a891dcb077b21c0a78d546013a6ddf5 Mon Sep 17 00:00:00 2001 From: Rahil Chertara Date: Sat, 24 Aug 2024 15:50:25 -0400 Subject: [PATCH 05/12] Improvements to scan-planning design --- open-api/rest-catalog-open-api.py | 107 ++++---- open-api/rest-catalog-open-api.yaml | 384 ++++++++++++++++++---------- 2 files changed, 303 insertions(+), 188 deletions(-) diff --git a/open-api/rest-catalog-open-api.py b/open-api/rest-catalog-open-api.py index 6135acc5bdc7..e3e91dffd488 100644 --- a/open-api/rest-catalog-open-api.py +++ b/open-api/rest-catalog-open-api.py @@ -80,7 +80,7 @@ class Namespace(BaseModel): class PageToken(BaseModel): __root__: Optional[str] = Field( None, - description='An opaque token that allows clients to make use of pagination for list APIs (e.g. ListTables) as well as for scan-planning APIs (e.g PlanTable). Clients may initiate the first paginated request by sending an empty query parameter `pageToken` to the server.\nServers that support pagination should identify the `pageToken` parameter and return a `next-page-token` in the response if there are more results available. After the initial request, the value of `next-page-token` from each response must be used as the `pageToken` parameter value for the next request. 
The server must return `null` value for the `next-page-token` in the last response.\nServers that support pagination must return all results in a single response with the value of `next-page-token` set to `null` if the query parameter `pageToken` is not set in the request.\nServers that do not support pagination should ignore the `pageToken` parameter and return all results in a single response. The `next-page-token` must be omitted from the response.\nClients must interpret either `null` or missing response value of `next-page-token` as the end of the listing results.', + description='An opaque token that allows clients to make use of pagination for list APIs (e.g. ListTables). Clients may initiate the first paginated request by sending an empty query parameter `pageToken` to the server.\nServers that support pagination should identify the `pageToken` parameter and return a `next-page-token` in the response if there are more results available. After the initial request, the value of `next-page-token` from each response must be used as the `pageToken` parameter value for the next request. The server must return `null` value for the `next-page-token` in the last response.\nServers that support pagination must return all results in a single response with the value of `next-page-token` set to `null` if the query parameter `pageToken` is not set in the request.\nServers that do not support pagination should ignore the `pageToken` parameter and return all results in a single response. 
The `next-page-token` must be omitted from the response.\nClients must interpret either `null` or missing response value of `next-page-token` as the end of the listing results.', ) @@ -818,6 +818,18 @@ class EqualityDeleteFile(ContentFile): ) +class GetTasksStatusRequest(BaseModel): + plan_id: str = Field( + ..., alias='plan-id', description='id used to track status of `planTable`' + ) + + +class CancelPlanRequest(BaseModel): + plan_id: str = Field( + ..., alias='plan-id', description='id used to cancel `planTable` operation' + ) + + class FieldName(BaseModel): __root__: str = Field( ..., @@ -827,10 +839,16 @@ class FieldName(BaseModel): class PlanTask(BaseModel): """ - An opaque JSON object that contains information provided by the REST server to be utilized by clients for distributed table scan planning; should be supplied as input in `PlanTable` operation. + An opaque JSON object that contains information provided by the REST server to be utilized by clients for distributed table scan planning; should be supplied as input in `RetrieveTasks` operation. """ +class PlanStatus(BaseModel): + __root__: Literal['started', 'cancelled', 'failed'] = Field( + ..., description='Represents the current status of the `planTable` operation.' + ) + + class CreateNamespaceRequest(BaseModel): namespace: Namespace properties: Optional[Dict[str, str]] = Field( @@ -875,9 +893,12 @@ class ViewRequirement(BaseModel): __root__: AssertViewUUID = Field(..., discriminator='type') -class PreplanTableResult(BaseModel): - plan_tasks: List[PlanTask] = Field(..., alias='plan-tasks') - next_page_token: Optional[PageToken] = Field(None, alias='next-page-token') +class CancelPlanResult(BaseModel): + """ + Used to indicate state of cancellation. If successful should return "cancelled" state. 
+ """ + + cancel_status: Optional[PlanStatus] = Field(None, alias='cancel-status') class ReportMetricsRequest2(CommitReport): @@ -939,6 +960,10 @@ class DeleteFile(BaseModel): ) +class RetrieveTasksRequest(BaseModel): + plan_task: PlanTask = Field(..., alias='plan-task') + + class Term(BaseModel): __root__: Union[Reference, TransformTerm] @@ -1148,13 +1173,34 @@ class LoadTableResult(BaseModel): class PlanTableResult(BaseModel): - file_scan_tasks: List[FileScanTask] = Field(..., alias='file-scan-tasks') - delete_files: Optional[List[DeleteFile]] = Field( - None, - alias='delete-files', - description='A list of delete files that can be either positional or equality. If the client does not recognize the type of delete file being returned by the service it should immediately throw an exception that it does not support this type.', + """ + If the plan has not finished return a `plan-id`. If finished, the response will contain a list of `FileScanTask`, a list of `PlanTask`, or both. + """ + + file_scan_tasks: Optional[List[FileScanTask]] = Field(None, alias='file-scan-tasks') + plan_tasks: Optional[List[PlanTask]] = Field(None, alias='plan-tasks') + plan_id: Optional[str] = Field( + None, alias='plan-id', description='id used to track progress of the plan' ) - next_page_token: Optional[PageToken] = Field(None, alias='next-page-token') + + +class GetTasksStatusResult(BaseModel): + """ + If the plan has not finished return a `plan-status`. If the plan has finished can return a list of `FileScanTask`, a list of `PlanTask`, or both. + """ + + file_scan_tasks: Optional[List[FileScanTask]] = Field(None, alias='file-scan-tasks') + plan_tasks: Optional[List[PlanTask]] = Field(None, alias='plan-tasks') + plan_status: Optional[PlanStatus] = Field(None, alias='plan-status') + + +class RetrieveTasksResult(BaseModel): + """ + Used to fetch file scan tasks for a given `planTask`. Can also return additional plan-tasks. 
+ """ + + file_scan_tasks: Optional[List[FileScanTask]] = Field(None, alias='file-scan-tasks') + plan_tasks: Optional[List[PlanTask]] = Field(None, alias='plan-tasks') class CommitTableRequest(BaseModel): @@ -1243,44 +1289,7 @@ class CommitTableResponse(BaseModel): metadata: TableMetadata -class PreplanTableRequest(BaseModel): - snapshot_id: Optional[int] = Field( - None, - alias='snapshot-id', - description='The ID of the snapshot to use for the table scan.', - ) - select: Optional[List[FieldName]] = Field( - None, - description='A list of fields in schema that are selected in a table scan. When not specified, all columns in the requested schema should be selected.', - ) - filter: Optional[Expression] = Field( - None, - description='an unbounded expression to describe the filters to apply to a table scan,', - ) - case_sensitive: Optional[bool] = Field( - True, - alias='case-sensitive', - description='If field selection and filtering should be case sensitive', - ) - use_snapshot_schema: Optional[bool] = Field( - False, - alias='use-snapshot-schema', - description='If the client is performing time travel, the snapshot schema should be used. For clients performing a plan for a branch, should default to using the table schema.', - ) - start_snapshot_id: Optional[int] = Field( - None, - alias='start-snapshot-id', - description='The ID of the starting snapshot of the incremental scan', - ) - end_snapshot_id: Optional[int] = Field( - None, - alias='end-snapshot-id', - description='The ID of the inclusive ending snapshot of the incremental scan. 
If not specified, the snapshot at the main branch head will be used as the end snapshot.', - ) - - class PlanTableRequest(BaseModel): - plan_task: Optional[PlanTask] = Field(None, alias='plan-task') snapshot_id: Optional[int] = Field( None, alias='snapshot-id', @@ -1354,6 +1363,8 @@ class ReportMetricsRequest1(ScanReport): ViewMetadata.update_forward_refs() AddSchemaUpdate.update_forward_refs() PlanTableResult.update_forward_refs() +GetTasksStatusResult.update_forward_refs() +RetrieveTasksResult.update_forward_refs() CreateTableRequest.update_forward_refs() CreateViewRequest.update_forward_refs() ReportMetricsRequest.update_forward_refs() diff --git a/open-api/rest-catalog-open-api.yaml b/open-api/rest-catalog-open-api.yaml index 76ba0ad0853a..a16117cddb32 100644 --- a/open-api/rest-catalog-open-api.yaml +++ b/open-api/rest-catalog-open-api.yaml @@ -541,7 +541,7 @@ paths: 5XX: $ref: '#/components/responses/ServerErrorResponse' - /v1/{prefix}/namespaces/{namespace}/tables/{table}/preplan: + /v1/{prefix}/namespaces/{namespace}/tables/{table}/plan: parameters: - $ref: '#/components/parameters/prefix' - $ref: '#/components/parameters/namespace' @@ -549,29 +549,28 @@ paths: post: tags: - Catalog API - summary: Prepare a list of tasks that can be used to distribute table scan planning + summary: Returns either a list of `PlanTask`, a list of `FileScanTask`. If planning is not complete returns a `plan-id`. description: - Prepare a list of tasks that can be used to distribute table scan planning based on a set of table scan criteria + Prepares either a list of `PlanTask` that can be used to distribute table scan planning, or a list of `FileScanTask`, based on a set of table scan criteria such as selected columns, filters, snapshot range, case sensitivity, etc. + In the event that the plan tasks or file scan tasks are not ready to be served, the service will return a `plan-id`, + which can be used as input in the `GetTasksStatus`. 
- Requires that client specifies only a `snapshot-id` for a regular scan, or for performing incremental scans only provide + Requires that the client specifies only a `snapshot-id` for a snapshot scan, or for performing incremental scans only provide a `start-snapshot-id` and an `end-snapshot-id`. - - This API returns a list of `plan-task`s, and each of them can be used in the `PlanTable` API - to request a subset of all file scan tasks in a table scan. + + Each `PlanTask`, can be used as input in the `RetrieveTasks` API + to request a subset of file scan tasks in a table scan. This mechanism allows clients to distribute and parallelize the entire table scan planning process. - operationId: PreplanTable - parameters: - - $ref: '#/components/parameters/page-token' - - $ref: '#/components/parameters/page-size' + operationId: planTable requestBody: content: application/json: schema: - $ref: '#/components/schemas/PreplanTableRequest' + $ref: '#/components/schemas/PlanTableRequest' responses: 200: - $ref: '#/components/responses/PreplanTableResponse' + $ref: '#/components/responses/PlanTableResponse' 400: $ref: '#/components/responses/BadRequestErrorResponse' 401: @@ -601,45 +600,125 @@ paths: 5XX: $ref: '#/components/responses/ServerErrorResponse' - /v1/{prefix}/namespaces/{namespace}/tables/{table}/plan: + /v1/{prefix}/namespaces/{namespace}/tables/{table}/plan/{id}: parameters: - $ref: '#/components/parameters/prefix' - $ref: '#/components/parameters/namespace' - $ref: '#/components/parameters/table' - post: + - $ref: '#/components/parameters/id' + + get: tags: - Catalog API - summary: Perform scan planning against a table - operationId: PlanTable - parameters: - - $ref: '#/components/parameters/page-token' - - $ref: '#/components/parameters/page-size' + summary: Uses a `plan-id` as input to get status of `planTable`. Returns a list of `PlanTask`, a list of `FileScanTask`, or both when plan is complete. 
+ operationId: GetTaskStatus description: - Perform scan planning against a table based on a set of table scan criteria such as selected columns, filters, - snapshot range, case sensitivity, etc. - - Requires that client specifies only a `snapshot-id` for a regular scan, or for performing incremental scans only provide - a `start-snapshot-id` and an `end-snapshot-id`. - - An optional `plan-task` can be provided to request only a subset of file scan tasks. - The `plan-task` can be retrieved by invoking the `PreplanTable` endpoint. - - If preplanning using the `PreplanTable` endpoint is required before hitting this endpoint but the client fails - to supply a `plan-task` in the request, then a `421 Misdirected Request` response should be returned to - indicate this requirement. + Gets the status of a plan by using a `plan-id` which can be obtained from `planTable`. + If the plan is not completed, returns a `plan-status` representing the state of the plan. + If the plan is completed, returns a list of `PlanTask` or `FileScanTask`. + + If an invalid `plan-id` is provided, the service will return a `404 NoSuchPlanIdError` exception indicating that the requested plan does not exist. 
+ responses: + 200: + $ref: '#/components/responses/GetTasksStatusResponse' + 400: + $ref: '#/components/responses/BadRequestErrorResponse' + 401: + $ref: '#/components/responses/UnauthorizedResponse' + 403: + $ref: '#/components/responses/ForbiddenResponse' + 404: + description: + Not Found + - NoSuchPlanIdException, the plan-id does not exist + - NoSuchTableException, the table does not exist + - NoSuchNamespaceException, the namespace does not exist + content: + application/json: + schema: + $ref: '#/components/schemas/IcebergErrorResponse' + examples: + PlanIdDoesNotExist: + $ref: '#/components/examples/NoSuchPlanIdError' + TableDoesNotExist: + $ref: '#/components/examples/NoSuchTableError' + NamespaceDoesNotExist: + $ref: '#/components/examples/NoSuchNamespaceError' + 419: + $ref: '#/components/responses/AuthenticationTimeoutResponse' + 503: + $ref: '#/components/responses/ServiceUnavailableResponse' + 5XX: + $ref: '#/components/responses/ServerErrorResponse' + + delete: + tags: + - Catalog API + summary: Uses a `plan-id` as input to cancel a `planTable` operation. + operationId: CancelPlan + description: + Uses a `plan-id` as input to cancel a `planTable` operation. + If successful, should return a state that says "cancelled". - If planning a table scan produces too many file scan tasks and the server is unable to return them within its - response size limit, then a `422 Unprocessable Content` response should be returned to indicate that the client - should first attempt to preplan the specific table scan to distribute the planning process and make the content - processable by the server. + If an invalid `plan-id` is provided, the service will return a `404 NoSuchPlanIdError` exception indicating that the requested plan does not exist. 
+ responses: + 200: + $ref: '#/components/responses/CancelPlanResponse' + 400: + $ref: '#/components/responses/BadRequestErrorResponse' + 401: + $ref: '#/components/responses/UnauthorizedResponse' + 403: + $ref: '#/components/responses/ForbiddenResponse' + 404: + description: + Not Found + - NoSuchPlanIdException, the plan-id does not exist + - NoSuchTableException, the table does not exist + - NoSuchNamespaceException, the namespace does not exist + content: + application/json: + schema: + $ref: '#/components/schemas/IcebergErrorResponse' + examples: + PlanIdDoesNotExist: + $ref: '#/components/examples/NoSuchPlanIdError' + TableDoesNotExist: + $ref: '#/components/examples/NoSuchTableError' + NamespaceDoesNotExist: + $ref: '#/components/examples/NoSuchNamespaceError' + 419: + $ref: '#/components/responses/AuthenticationTimeoutResponse' + 503: + $ref: '#/components/responses/ServiceUnavailableResponse' + 5XX: + $ref: '#/components/responses/ServerErrorResponse' + + + /v1/{prefix}/namespaces/{namespace}/tables/{table}/tasks: + parameters: + - $ref: '#/components/parameters/prefix' + - $ref: '#/components/parameters/namespace' + - $ref: '#/components/parameters/table' + + post: + tags: + - Catalog API + summary: Retrieves additional `PlanTask`s and `FileScanTask`s from the service. + operationId: RetrieveTasks + description: + Requires a client to provide a `PlanTask` which can be obtained from `GetTaskStatus`, + in order to fetch additional `PlanTask`s and `FileScanTask`s from a service. + + If an invalid plan-task is provided to the service, the service will return a `404 NoSuchPlanTaskError` exception. 
requestBody: content: application/json: schema: - $ref: '#/components/schemas/PlanTableRequest' + $ref: '#/components/schemas/RetrieveTasksRequest' responses: 200: - $ref: '#/components/responses/PlanTableResponse' + $ref: '#/components/responses/RetrieveTasksResponse' 400: $ref: '#/components/responses/BadRequestErrorResponse' 401: @@ -649,6 +728,7 @@ paths: 404: description: Not Found + - NoSuchPlanTaskException, the plan-task does not exist - NoSuchTableException, the table does not exist - NoSuchNamespaceException, the namespace does not exist content: @@ -656,21 +736,21 @@ paths: schema: $ref: '#/components/schemas/IcebergErrorResponse' examples: + PlanTaskDoesNotExist: + $ref: '#/components/examples/NoSuchPlanTaskError' TableDoesNotExist: $ref: '#/components/examples/NoSuchTableError' NamespaceDoesNotExist: $ref: '#/components/examples/NoSuchNamespaceError' 419: $ref: '#/components/responses/AuthenticationTimeoutResponse' - 421: - $ref: '#/components/responses/MisdirectedRequestResponse' - 422: - $ref: '#/components/responses/UnprocessableContentResponse' 503: $ref: '#/components/responses/ServiceUnavailableResponse' 5XX: $ref: '#/components/responses/ServerErrorResponse' + + /v1/{prefix}/namespaces/{namespace}/register: parameters: - $ref: '#/components/parameters/prefix' @@ -1593,6 +1673,14 @@ components: type: string example: "sales" + id: + name: id + in: path + description: A plan id + required: true + schema: + type: string + view: name: view in: path @@ -1748,8 +1836,8 @@ components: PageToken: description: - An opaque token that allows clients to make use of pagination for list APIs (e.g. ListTables) - as well as for scan-planning APIs (e.g PlanTable). Clients may initiate the first paginated request by sending an empty + An opaque token that allows clients to make use of pagination for list APIs + (e.g. ListTables). Clients may initiate the first paginated request by sending an empty query parameter `pageToken` to the server. 
Servers that support pagination should identify the `pageToken` parameter and return a @@ -2926,37 +3014,59 @@ components: additionalProperties: type: string - PreplanTableResult: + PlanTableResult: type: object - required: - - plan-tasks + description: If the plan has not finished return a `plan-id`. + If finished, the response will contain a list of `FileScanTask`, a list of `PlanTask`, or both. + properties: + file-scan-tasks: + type: array + items: + $ref: '#/components/schemas/FileScanTask' + plan-tasks: + type: array + items: + $ref: '#/components/schemas/PlanTask' + plan-id: + description: id used to track progress of the plan + type: string + + GetTasksStatusResult: + type: object + description: If the plan has not finished return a `plan-status`. + If the plan has finished can return a list of `FileScanTask`, a list of `PlanTask`, or both. properties: + file-scan-tasks: + type: array + items: + $ref: '#/components/schemas/FileScanTask' plan-tasks: type: array items: $ref: '#/components/schemas/PlanTask' - next-page-token: - $ref: '#/components/schemas/PageToken' + plan-status: + $ref: '#/components/schemas/PlanStatus' - PlanTableResult: + + RetrieveTasksResult: type: object - required: - - file-scan-tasks + description: Used to fetch file scan tasks for a given `planTask`. Can also return additional plan-tasks. properties: file-scan-tasks: type: array items: $ref: '#/components/schemas/FileScanTask' - delete-files: - description: - A list of delete files that can be either positional or equality. - If the client does not recognize the type of delete file being returned by the service - it should immediately throw an exception that it does not support this type. + plan-tasks: type: array items: - $ref: '#/components/schemas/DeleteFile' - next-page-token: - $ref: '#/components/schemas/PageToken' + $ref: '#/components/schemas/PlanTask' + + CancelPlanResult: + type: object + description: Used to indicate state of cancellation. 
If successful should return "cancelled" state. + properties: + cancel-status: + $ref: '#/components/schemas/PlanStatus' CommitTableRequest: type: object @@ -3842,51 +3952,9 @@ components: type: integer description: "List of equality field IDs" - PreplanTableRequest: - type: object - properties: - snapshot-id: - description: - The ID of the snapshot to use for the table scan. - type: integer - format: int64 - select: - description: - A list of fields in schema that are selected in a table scan. - When not specified, all columns in the requested schema should be selected. - type: array - items: - $ref: '#/components/schemas/FieldName' - filter: - description: - an unbounded expression to describe the filters to apply to a table scan, - $ref: '#/components/schemas/Expression' - case-sensitive: - description: If field selection and filtering should be case sensitive - type: boolean - default: true - use-snapshot-schema: - description: - If the client is performing time travel, the snapshot schema should be used. - For clients performing a plan for a branch, should default to using the table schema. - type: boolean - default: false - start-snapshot-id: - description: The ID of the starting snapshot of the incremental scan - type: integer - format: int64 - end-snapshot-id: - description: - The ID of the inclusive ending snapshot of the incremental scan. - If not specified, the snapshot at the main branch head will be used as the end snapshot. - type: integer - format: int64 - PlanTableRequest: type: object properties: - plan-task: - $ref: '#/components/schemas/PlanTask' snapshot-id: description: The ID of the snapshot to use for the table scan. 
@@ -3931,6 +3999,32 @@ components: items: $ref: '#/components/schemas/FieldName' + GetTasksStatusRequest: + type: object + required: + - plan-id + properties: + plan-id: + description: id used to track status of `planTable` + type: string + + RetrieveTasksRequest: + type: object + required: + - plan-task + properties: + plan-task: + $ref: '#/components/schemas/PlanTask' + + CancelPlanRequest: + type: object + required: + - plan-id + properties: + plan-id: + description: id used to cancel `planTable` operation + type: string + FieldName: description: A field name that follows the Iceberg naming standard, and can be used in APIs like @@ -3948,9 +4042,15 @@ components: description: An opaque JSON object that contains information provided by the REST server to be utilized by clients for distributed table scan planning; should be supplied - as input in `PlanTable` operation. + as input in `RetrieveTasks` operation. type: object + PlanStatus: + description: + Represents the current status of the `planTable` operation. + type: string + enum: [started, cancelled, failed] + FileScanTask: type: object required: @@ -4132,37 +4232,6 @@ components: } } - MisdirectedRequestResponse: - description: - Misdirected Request. The request was directed to a server that is not able to produce a response. - content: - application/json: - schema: - $ref: '#/components/schemas/IcebergErrorResponse' - example: { - "error": { - "message": "Fail to plan table scan: must perform preplan before planning a table scan", - "type": "PreplanTableRequiredException", - "code": 421 - } - } - - UnprocessableContentResponse: - description: - Unprocessable Content. The server understands the content type of the request entity, and the syntax of the - request entity is correct, but it was unable to process the contained instructions. 
- content: - application/json: - schema: - $ref: '#/components/schemas/IcebergErrorResponse' - example: { - "error": { - "message": "Fail to plan table scan: too many file scan tasks. Please use preplan to distribute table scan.", - "type": "TooManyFileScanTasksException", - "code": 422 - } - } - ServiceUnavailableResponse: description: The service is not ready to handle the request. The client should wait and retry. @@ -4217,19 +4286,34 @@ components: schema: $ref: '#/components/schemas/LoadTableResult' - PreplanTableResponse: - description: Result of preplanning a table scan + PlanTableResponse: + description: Result of planning a table scan content: application/json: schema: - $ref: '#/components/schemas/PreplanTableResult' + $ref: '#/components/schemas/PlanTableResult' - PlanTableResponse: - description: Result of planning a table scan + GetTasksStatusResponse: + description: Result of checking status of a plan content: application/json: schema: - $ref: '#/components/schemas/PlanTableResult' + $ref: '#/components/schemas/GetTasksStatusResult' + + RetrieveTasksResponse: + description: Result of retrieving additional plan tasks and file scan tasks. 
+ content: + application/json: + schema: + $ref: '#/components/schemas/RetrieveTasksResult' + + CancelPlanResponse: + description: Result of a cancellation of a plan + content: + application/json: + schema: + $ref: '#/components/schemas/CancelPlanResult' + LoadTableResponse: description: Table metadata result when loading a table @@ -4308,6 +4392,26 @@ components: } } + NoSuchPlanIdError: + summary: The plan id does not exist + value: { + "error": { + "message": "The plan id does not exist", + "type": "NoSuchPlanIdException", + "code": 404 + } + } + + NoSuchPlanTaskError: + summary: The plan task does not exist + value: { + "error": { + "message": "The plan task does not exist", + "type": "NoSuchPlanTaskException", + "code": 404 + } + } + NoSuchTableError: summary: The requested table does not exist value: { From 1d5d5d39f88935b9e3f46cd0f96548dd1104beee Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Wed, 28 Aug 2024 16:11:10 -0700 Subject: [PATCH 06/12] Update scan planning endpoints. (#2) * Update scan planning endpoints. * Fix review issues. * Use 204 * Fix formatting --- open-api/rest-catalog-open-api.yaml | 355 ++++++++++++++++------------ 1 file changed, 200 insertions(+), 155 deletions(-) diff --git a/open-api/rest-catalog-open-api.yaml b/open-api/rest-catalog-open-api.yaml index a16117cddb32..ce1628eb795a 100644 --- a/open-api/rest-catalog-open-api.yaml +++ b/open-api/rest-catalog-open-api.yaml @@ -549,28 +549,57 @@ paths: post: tags: - Catalog API - summary: Returns either a list of `PlanTask`, a list of `FileScanTask`. If planning is not complete returns a `plan-id`. + summary: Submit a scan for planning description: - Prepares either a list of `PlanTask` that can be used to distribute table scan planning, or a list of `FileScanTask`, based on a set of table scan criteria - such as selected columns, filters, snapshot range, case sensitivity, etc. 
- In the event that the plan tasks or file scan tasks are not ready to be served, the service will return a `plan-id`, - which can be used as input in the `GetTasksStatus`. - - Requires that the client specifies only a `snapshot-id` for a snapshot scan, or for performing incremental scans only provide - a `start-snapshot-id` and an `end-snapshot-id`. - - Each `PlanTask`, can be used as input in the `RetrieveTasks` API - to request a subset of file scan tasks in a table scan. - This mechanism allows clients to distribute and parallelize the entire table scan planning process. - operationId: planTable + Submits a scan for server-side planning. + + + Point-in-time scans are planned by passing snapshot-id to identify the + table snapshot to scan. Incremental scans are planned by passing both + start-snapshot-id and end-snapshot-id. Requests that include both point + in time config properties and incremental config properties are + invalid. If the request does not include either incremental or + point-in-time config properties, scan planning should produce a + point-in-time scan of the latest snapshot in the table's main branch. + + + Responses must include a valid status + + - When "completed" the planning operation has produced plan-tasks and + file-scan-tasks that must be returned in the response + + - When "submitted" the response must include a plan-id used to poll + fetchPlanningResult to fetch the planning result when it is ready + + - When "failed" the response must be a valid error response + + - Status "cancelled" is not a valid status from this endpoint + + + The response for a "completed" planning operation includes two types of + tasks (file scan tasks and plan tasks) and both may be included in the + response. Tasks must not be included for any other response status. + + + Responses that include a plan-id indicate that the service is holding + state or performing work for the client. 
+ + + - Clients should use the plan-id to fetch results from + fetchPlanningResult when the response status is "submitted" + + - Clients should inform the service if planning results are no longer + needed by calling cancelPlanning. Cancellation is not necessary after + fetchScanTasks has been used to fetch scan tasks for each plan task. + operationId: planTableScan requestBody: content: application/json: schema: - $ref: '#/components/schemas/PlanTableRequest' + $ref: '#/components/schemas/PlanTableScanRequest' responses: 200: - $ref: '#/components/responses/PlanTableResponse' + $ref: '#/components/responses/PlanTableScanResponse' 400: $ref: '#/components/responses/BadRequestErrorResponse' 401: @@ -600,27 +629,41 @@ paths: 5XX: $ref: '#/components/responses/ServerErrorResponse' - /v1/{prefix}/namespaces/{namespace}/tables/{table}/plan/{id}: + /v1/{prefix}/namespaces/{namespace}/tables/{table}/plan/{plan-id}: parameters: - $ref: '#/components/parameters/prefix' - $ref: '#/components/parameters/namespace' - $ref: '#/components/parameters/table' - - $ref: '#/components/parameters/id' + - $ref: '#/components/parameters/plan-id' get: tags: - Catalog API - summary: Uses a `plan-id` as input to get status of `planTable`. Returns a list of `PlanTask`, a list of `FileScanTask`, or both when plan is complete. - operationId: GetTaskStatus + summary: Fetches the result of scan planning for a plan-id + operationId: fetchPlanningResult description: - Gets the status of a plan by using a `plan-id` which can be obtained from `planTable`. - If the plan is not completed, returns a `plan-status` representing the state of the plan. - If the plan is completed, returns a list of `PlanTask` or `FileScanTask`. - - If an invalid `plan-id` is provided, the service will return a `404 NoSuchPlanIdError` exception indicating that the requested plan does not exist. + Fetches the result of scan planning for a plan-id. 
+ + + Responses must include a valid status + + - When "completed" the planning operation has produced plan-tasks and + file-scan-tasks that must be returned in the response + + - When "submitted" the planning operation has not completed; the client + should wait to call this endpoint again to fetch a completed response + + - When "failed" the response must be a valid error response + + - When "cancelled" the plan-id is invalid and should be discarded + + + The response for a "completed" planning operation includes two types of + tasks (file scan tasks and plan tasks) and both may be included in the + response. Tasks must not be included for any other response status. responses: 200: - $ref: '#/components/responses/GetTasksStatusResponse' + $ref: '#/components/responses/FetchPlanningResultResponse' 400: $ref: '#/components/responses/BadRequestErrorResponse' 401: @@ -654,16 +697,27 @@ paths: delete: tags: - Catalog API - summary: Uses a `plan-id` as input to cancel a `planTable` operation. - operationId: CancelPlan + summary: Cancels scan planning for a plan-id + operationId: cancelPlanning description: - Uses a `plan-id` as input to cancel a `planTable` operation. - If successful, should return a state that says "cancelled". - - If an invalid `plan-id` is provided, the service will return a `404 NoSuchPlanIdError` exception indicating that the requested plan does not exist. + Cancels scan planning for a plan-id. + + + This notifies the service that it can release resources held for the + scan. Clients should cancel scans that are no longer needed, either + while the plan-id returns a "submitted" status or while there are + remaining plan tasks that have not been fetched. 
+ + + Cancellation is not necessary when + + - Scan tasks for each plan task have been fetched using fetchScanTasks + + - A plan-id has produced a "failed" or "cancelled" status from + planTableScan or fetchPlanningResult responses: - 200: - $ref: '#/components/responses/CancelPlanResponse' + 204: + description: Success, no content 400: $ref: '#/components/responses/BadRequestErrorResponse' 401: @@ -673,7 +727,6 @@ paths: 404: description: Not Found - - NoSuchPlanIdException, the plan-id does not exist - NoSuchTableException, the table does not exist - NoSuchNamespaceException, the namespace does not exist content: @@ -681,8 +734,6 @@ paths: schema: $ref: '#/components/schemas/IcebergErrorResponse' examples: - PlanIdDoesNotExist: - $ref: '#/components/examples/NoSuchPlanIdError' TableDoesNotExist: $ref: '#/components/examples/NoSuchTableError' NamespaceDoesNotExist: @@ -704,21 +755,17 @@ paths: post: tags: - Catalog API - summary: Retrieves additional `PlanTask`s and `FileScanTask`s from the service. - operationId: RetrieveTasks - description: - Requires a client to provide a `PlanTask` which can be obtained from `GetTaskStatus`, - in order to fetch additional `PlanTask`s and `FileScanTask`s from a service. - - If an invalid plan-task is provided to the service, the service will return a `404 NoSuchPlanTaskError` exception. + summary: Fetches result tasks for a plan task + operationId: fetchScanTasks + description: Fetches result tasks for a plan task. 
requestBody: content: application/json: schema: - $ref: '#/components/schemas/RetrieveTasksRequest' + $ref: '#/components/schemas/FetchScanTasksRequest' responses: 200: - $ref: '#/components/responses/RetrieveTasksResponse' + $ref: '#/components/responses/FetchScanTasksResponse' 400: $ref: '#/components/responses/BadRequestErrorResponse' 401: @@ -1673,10 +1720,10 @@ components: type: string example: "sales" - id: - name: id + plan-id: + name: plan-id in: path - description: A plan id + description: ID used to track a planning request required: true schema: type: string @@ -3014,28 +3061,31 @@ components: additionalProperties: type: string - PlanTableResult: + ScanTasks: type: object - description: If the plan has not finished return a `plan-id`. - If finished, the response will contain a list of `FileScanTask`, a list of `PlanTask`, or both. + description: + Scan and planning tasks for server-side scan planning + + + - `plan-tasks` contains opaque units of planning work + + - `file-scan-tasks` contains a partial list of table scan tasks + + - `delete-files` contains delete files referenced by file scan tasks + + + Each plan task must be passed to the fetchScanTasks endpoint to fetch + the file scan tasks for the plan task. + + + The list of delete files must contain all delete files referenced by + the file scan tasks. properties: - file-scan-tasks: + delete-files: + description: Delete files referenced by file scan tasks type: array items: - $ref: '#/components/schemas/FileScanTask' - plan-tasks: - type: array - items: - $ref: '#/components/schemas/PlanTask' - plan-id: - description: id used to track progress of the plan - type: string - - GetTasksStatusResult: - type: object - description: If the plan has not finished return a `plan-status`. - If the plan has finished can return a list of `FileScanTask`, a list of `PlanTask`, or both. 
- properties: + $ref: '#/components/schemas/DeleteFile' file-scan-tasks: type: array items: @@ -3044,29 +3094,51 @@ components: type: array items: $ref: '#/components/schemas/PlanTask' - plan-status: - $ref: '#/components/schemas/PlanStatus' + EmptyResult: + type: object + description: Empty object result - RetrieveTasksResult: + PlanningResult: type: object - description: Used to fetch file scan tasks for a given `planTask`. Can also return additional plan-tasks. + description: Result of server-side scan planning + discriminator: + propertyName: status + mapping: + completed: '#/components/schemas/ScanTasks' + submitted: '#/components/schemas/EmptyResult' + cancelled: '#/components/schemas/EmptyResult' + failed: '#/components/schemas/IcebergErrorResponse' + oneOf: + - $ref: '#/components/schemas/ScanTasks' + - $ref: '#/components/schemas/IcebergErrorResponse' + - $ref: '#/components/schemas/EmptyResult' + required: + - status properties: - file-scan-tasks: - type: array - items: - $ref: '#/components/schemas/FileScanTask' - plan-tasks: - type: array - items: - $ref: '#/components/schemas/PlanTask' + status: + $ref: '#/components/schemas/PlanStatus' + + PlanStatus: + description: Status of a server-side planning operation + type: string + enum: [completed, submitted, cancelled, failed] - CancelPlanResult: + PlanTableScanResult: type: object - description: Used to indicate state of cancellation. If successful should return "cancelled" state. 
+ description: Response schema for planTableScan + allOf: + - $ref: '#/components/schemas/PlanningResult' properties: - cancel-status: - $ref: '#/components/schemas/PlanStatus' + plan-id: + description: ID used to track a planning request + type: string + + FetchScanTasksResult: + type: object + description: Response schema for fetchScanTasks + allOf: + - $ref: '#/components/schemas/ScanTasks' CommitTableRequest: type: object @@ -3858,8 +3930,8 @@ components: items: $ref: '#/components/schemas/PrimitiveTypeValue' description: - "A list of partition field values ordered based on the fields of the partition spec specified by the - `spec-id`" + A list of partition field values ordered based on the fields of + the partition spec specified by the `spec-id` example: [1, "bar"] file-size-in-bytes: type: integer @@ -3952,63 +4024,71 @@ components: type: integer description: "List of equality field IDs" - PlanTableRequest: + PlanTableScanRequest: type: object properties: snapshot-id: description: - The ID of the snapshot to use for the table scan. + Identifier for the snapshot to scan in a point-in-time scan type: integer format: int64 select: - description: - A list of fields in schema that are selected in a table scan. - When not specified, all columns in the requested schema should be selected. + description: List of selected schema fields type: array items: $ref: '#/components/schemas/FieldName' filter: description: - an unbounded expression to describe the filters to apply to a table scan, + Expression used to filter the table data $ref: '#/components/schemas/Expression' case-sensitive: - description: If field selection and filtering should be case sensitive + description: Enables case sensitive field matching for filter and select type: boolean default: true use-snapshot-schema: description: - If the client is performing time travel, the snapshot schema should be used. - For clients performing a plan for a branch, should default to using the table schema. 
+ Whether to use the schema at the time the snapshot was written. + + When time travelling, the snapshot schema should be used (true). + When scanning a branch, the table schema should be used (false). type: boolean default: false start-snapshot-id: - description: The ID of the starting snapshot of the incremental scan + description: Starting snapshot ID for an incremental scan (exclusive) type: integer format: int64 end-snapshot-id: description: - The ID of the inclusive ending snapshot of the incremental scan. - If not specified, the snapshot at the main branch head will be used as the end snapshot. + Ending snapshot ID for an incremental scan (inclusive). + + Required when start-snapshot-id is specified. type: integer format: int64 stats-fields: description: - A list of fields that the client requests the server to send statistics - in each `FileScanTask` returned in the response + List of fields for which the service should send column stats. type: array items: $ref: '#/components/schemas/FieldName' - GetTasksStatusRequest: - type: object - required: - - plan-id - properties: - plan-id: - description: id used to track status of `planTable` - type: string + FieldName: + description: + A full field name (including parent field names), such as those passed + in APIs like Java `Schema#findField(String name)`. - RetrieveTasksRequest: + The nested field name follows these rules + - Nested struct fields are named by concatenating field names at each + struct level using dot (`.`) delimiter, e.g. + employer.contact_info.address.zip_code + - Nested fields in a map key are named using the keyword `key`, e.g. + employee_address_map.key.first_name + - Nested fields in a map value are named using the keyword `value`, + e.g. employee_address_map.value.zip_code + - Nested fields in a list are named using the keyword `element`, e.g. 
+ employees.element.first_name + type: string + + FetchScanTasksRequest: type: object required: - plan-task @@ -4016,41 +4096,12 @@ components: plan-task: $ref: '#/components/schemas/PlanTask' - CancelPlanRequest: - type: object - required: - - plan-id - properties: - plan-id: - description: id used to cancel `planTable` operation - type: string - - FieldName: - description: - A field name that follows the Iceberg naming standard, and can be used in APIs like - Java `Schema#findField(String name)`. - - The nested field name follows these rules - - nested struct fields are named by concatenating field names at each struct level using dot (`.`) delimiter, - e.g. employer.contact_info.address.zip_code - - nested fields in a map key are named using the keyword `key`, e.g. employee_address_map.key.first_name - - nested fields in a map value are named using the keyword `value`, e.g. employee_address_map.value.zip_code - - nested fields in a list are named using the keyword `element`, e.g. employees.element.first_name - type: string - PlanTask: description: - An opaque JSON object that contains information provided by the REST server - to be utilized by clients for distributed table scan planning; should be supplied - as input in `RetrieveTasks` operation. + An opaque JSON object provided by the REST server that represents a + unit of work to produce file scan tasks for scan planning. type: object - PlanStatus: - description: - Represents the current status of the `planTable` operation. - type: string - enum: [started, cancelled, failed] - FileScanTask: type: object required: @@ -4058,15 +4109,17 @@ components: properties: data-file: $ref: '#/components/schemas/DataFile' - delete-files-references: - description: A list of positional indices that correspond to a delete files array. 
+ delete-file-references: + description: A list of indices in the delete files array (0-based) type: array items: type: integer residual-filter: description: An optional filter to be applied to rows in this file scan task. - If the residual is not present, the client should calculate this or the original filter should be used. + + If the residual is not present, the client must produce the + residual or use the original filter. allOf: - $ref: '#/components/schemas/Expression' @@ -4286,34 +4339,26 @@ components: schema: $ref: '#/components/schemas/LoadTableResult' - PlanTableResponse: + PlanTableScanResponse: description: Result of planning a table scan content: application/json: schema: - $ref: '#/components/schemas/PlanTableResult' + $ref: '#/components/schemas/PlanTableScanResult' - GetTasksStatusResponse: + FetchPlanningResultResponse: description: Result of checking status of a plan content: application/json: schema: - $ref: '#/components/schemas/GetTasksStatusResult' + $ref: '#/components/schemas/PlanningResult' - RetrieveTasksResponse: + FetchScanTasksResponse: description: Result of retrieving additional plan tasks and file scan tasks. content: application/json: schema: - $ref: '#/components/schemas/RetrieveTasksResult' - - CancelPlanResponse: - description: Result of a cancellation of a plan - content: - application/json: - schema: - $ref: '#/components/schemas/CancelPlanResult' - + $ref: '#/components/schemas/FetchScanTasksResult' LoadTableResponse: description: Table metadata result when loading a table From b2837f61dc8161cf906f7865fdfca343cab3f748 Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Thu, 29 Aug 2024 09:06:58 -0700 Subject: [PATCH 07/12] Update rest-catalog-open-api.yaml Minor clarification. 
--- open-api/rest-catalog-open-api.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/open-api/rest-catalog-open-api.yaml b/open-api/rest-catalog-open-api.yaml index ce1628eb795a..8ea65e0910cc 100644 --- a/open-api/rest-catalog-open-api.yaml +++ b/open-api/rest-catalog-open-api.yaml @@ -565,8 +565,9 @@ paths: Responses must include a valid status - - When "completed" the planning operation has produced plan-tasks and - file-scan-tasks that must be returned in the response + - When "completed" the planning operation has produced plan tasks and + file scan tasks that must be returned in the response (not fetched + later by calling fetchPlanningResult) - When "submitted" the response must include a plan-id used to poll fetchPlanningResult to fetch the planning result when it is ready From 9c920620d231195e6be6e88f5b6964be4719d713 Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Thu, 29 Aug 2024 21:03:38 -0700 Subject: [PATCH 08/12] Fix yaml for python codegen. (#3) * Fix yaml for python codegen. * Add updated python. 
--- open-api/rest-catalog-open-api.py | 157 +++++++++++++++++----------- open-api/rest-catalog-open-api.yaml | 105 ++++++++++++++----- 2 files changed, 177 insertions(+), 85 deletions(-) diff --git a/open-api/rest-catalog-open-api.py b/open-api/rest-catalog-open-api.py index e3e91dffd488..e268b3d6d9af 100644 --- a/open-api/rest-catalog-open-api.py +++ b/open-api/rest-catalog-open-api.py @@ -451,6 +451,12 @@ class AssertViewUUID(BaseModel): uuid: str +class PlanStatus(BaseModel): + __root__: Literal['completed', 'submitted', 'cancelled', 'failed'] = Field( + ..., description='Status of a server-side planning operation' + ) + + class RegisterTableRequest(BaseModel): name: str metadata_location: str = Field(..., alias='metadata-location') @@ -818,37 +824,19 @@ class EqualityDeleteFile(ContentFile): ) -class GetTasksStatusRequest(BaseModel): - plan_id: str = Field( - ..., alias='plan-id', description='id used to track status of `planTable`' - ) - - -class CancelPlanRequest(BaseModel): - plan_id: str = Field( - ..., alias='plan-id', description='id used to cancel `planTable` operation' - ) - - class FieldName(BaseModel): __root__: str = Field( ..., - description='A field name that follows the Iceberg naming standard, and can be used in APIs like Java `Schema#findField(String name)`.\nThe nested field name follows these rules - nested struct fields are named by concatenating field names at each struct level using dot (`.`) delimiter, e.g. employer.contact_info.address.zip_code - nested fields in a map key are named using the keyword `key`, e.g. employee_address_map.key.first_name - nested fields in a map value are named using the keyword `value`, e.g. employee_address_map.value.zip_code - nested fields in a list are named using the keyword `element`, e.g. 
employees.element.first_name', + description='A full field name (including parent field names), such as those passed in APIs like Java `Schema#findField(String name)`.\nThe nested field name follows these rules - Nested struct fields are named by concatenating field names at each struct level using dot (`.`) delimiter, e.g. employer.contact_info.address.zip_code - Nested fields in a map key are named using the keyword `key`, e.g. employee_address_map.key.first_name - Nested fields in a map value are named using the keyword `value`, e.g. employee_address_map.value.zip_code - Nested fields in a list are named using the keyword `element`, e.g. employees.element.first_name', ) class PlanTask(BaseModel): """ - An opaque JSON object that contains information provided by the REST server to be utilized by clients for distributed table scan planning; should be supplied as input in `RetrieveTasks` operation. + An opaque JSON object provided by the REST server that represents a unit of work to produce file scan tasks for scan planning. """ -class PlanStatus(BaseModel): - __root__: Literal['started', 'cancelled', 'failed'] = Field( - ..., description='Represents the current status of the `planTable` operation.' - ) - - class CreateNamespaceRequest(BaseModel): namespace: Namespace properties: Optional[Dict[str, str]] = Field( @@ -893,12 +881,27 @@ class ViewRequirement(BaseModel): __root__: AssertViewUUID = Field(..., discriminator='type') -class CancelPlanResult(BaseModel): +class FailedPlanningResult(IcebergErrorResponse): """ - Used to indicate state of cancellation. If successful should return "cancelled" state. 
+ Failed server-side planning result """ - cancel_status: Optional[PlanStatus] = Field(None, alias='cancel-status') + status: Literal['failed'] + + +class AsyncPlanningResult(BaseModel): + status: Literal['submitted'] + plan_id: Optional[str] = Field( + None, alias='plan-id', description='ID used to track a planning request' + ) + + +class EmptyResult(BaseModel): + """ + Empty server-side planning result + """ + + status: Literal['cancelled'] class ReportMetricsRequest2(CommitReport): @@ -960,7 +963,7 @@ class DeleteFile(BaseModel): ) -class RetrieveTasksRequest(BaseModel): +class FetchScanTasksRequest(BaseModel): plan_task: PlanTask = Field(..., alias='plan-task') @@ -1172,35 +1175,47 @@ class LoadTableResult(BaseModel): config: Optional[Dict[str, str]] = None -class PlanTableResult(BaseModel): - """ - If the plan has not finished return a `plan-id`. If finished, the response will contain a list of `FileScanTask`, a list of `PlanTask`, or both. +class ScanTasks(BaseModel): """ + Scan and planning tasks for server-side scan planning - file_scan_tasks: Optional[List[FileScanTask]] = Field(None, alias='file-scan-tasks') - plan_tasks: Optional[List[PlanTask]] = Field(None, alias='plan-tasks') - plan_id: Optional[str] = Field( - None, alias='plan-id', description='id used to track progress of the plan' - ) + - `plan-tasks` contains opaque units of planning work + - `file-scan-tasks` contains a partial list of table scan tasks + - `delete-files` contains delete files referenced by file scan tasks + Each plan task must be passed to the fetchScanTasks endpoint to fetch the file scan tasks for the plan task. -class GetTasksStatusResult(BaseModel): - """ - If the plan has not finished return a `plan-status`. If the plan has finished can return a list of `FileScanTask`, a list of `PlanTask`, or both. + The list of delete files must contain all delete files referenced by the file scan tasks. 
""" + delete_files: Optional[List[DeleteFile]] = Field( + None, + alias='delete-files', + description='Delete files referenced by file scan tasks', + ) file_scan_tasks: Optional[List[FileScanTask]] = Field(None, alias='file-scan-tasks') plan_tasks: Optional[List[PlanTask]] = Field(None, alias='plan-tasks') - plan_status: Optional[PlanStatus] = Field(None, alias='plan-status') -class RetrieveTasksResult(BaseModel): - """ - Used to fetch file scan tasks for a given `planTask`. Can also return additional plan-tasks. - """ +class FetchPlanningResult(BaseModel): + __root__: Union[CompletedPlanningResult, FailedPlanningResult, EmptyResult] = Field( + ..., + description='Result of server-side scan planning for fetchPlanningResult', + discriminator='status', + ) - file_scan_tasks: Optional[List[FileScanTask]] = Field(None, alias='file-scan-tasks') - plan_tasks: Optional[List[PlanTask]] = Field(None, alias='plan-tasks') + +class PlanTableScanResult(BaseModel): + __root__: Union[ + CompletedPlanningWithIDResult, + FailedPlanningResult, + AsyncPlanningResult, + EmptyResult, + ] = Field( + ..., + description='Result of server-side scan planning for planTableScan', + discriminator='status', + ) class CommitTableRequest(BaseModel): @@ -1289,58 +1304,56 @@ class CommitTableResponse(BaseModel): metadata: TableMetadata -class PlanTableRequest(BaseModel): +class PlanTableScanRequest(BaseModel): snapshot_id: Optional[int] = Field( None, alias='snapshot-id', - description='The ID of the snapshot to use for the table scan.', + description='Identifier for the snapshot to scan in a point-in-time scan', ) select: Optional[List[FieldName]] = Field( - None, - description='A list of fields in schema that are selected in a table scan. 
When not specified, all columns in the requested schema should be selected.', + None, description='List of selected schema fields' ) filter: Optional[Expression] = Field( - None, - description='an unbounded expression to describe the filters to apply to a table scan,', + None, description='Expression used to filter the table data' ) case_sensitive: Optional[bool] = Field( True, alias='case-sensitive', - description='If field selection and filtering should be case sensitive', + description='Enables case sensitive field matching for filter and select', ) use_snapshot_schema: Optional[bool] = Field( False, alias='use-snapshot-schema', - description='If the client is performing time travel, the snapshot schema should be used. For clients performing a plan for a branch, should default to using the table schema.', + description='Whether to use the schema at the time the snapshot was written.\nWhen time travelling, the snapshot schema should be used (true). When scanning a branch, the table schema should be used (false).', ) start_snapshot_id: Optional[int] = Field( None, alias='start-snapshot-id', - description='The ID of the starting snapshot of the incremental scan', + description='Starting snapshot ID for an incremental scan (exclusive)', ) end_snapshot_id: Optional[int] = Field( None, alias='end-snapshot-id', - description='The ID of the inclusive ending snapshot of the incremental scan. 
If not specified, the snapshot at the main branch head will be used as the end snapshot.', + description='Ending snapshot ID for an incremental scan (inclusive).\nRequired when start-snapshot-id is specified.', ) stats_fields: Optional[List[FieldName]] = Field( None, alias='stats-fields', - description='A list of fields that the client requests the server to send statistics in each `FileScanTask` returned in the response', + description='List of fields for which the service should send column stats.', ) class FileScanTask(BaseModel): data_file: DataFile = Field(..., alias='data-file') - delete_files_references: Optional[List[int]] = Field( + delete_file_references: Optional[List[int]] = Field( None, - alias='delete-files-references', - description='A list of positional indices that correspond to a delete files array.', + alias='delete-file-references', + description='A list of indices in the delete files array (0-based)', ) residual_filter: Optional[Expression] = Field( None, alias='residual-filter', - description='An optional filter to be applied to rows in this file scan task. 
If the residual is not present, the client should calculate this or the original filter should be used.', + description='An optional filter to be applied to rows in this file scan task.\nIf the residual is not present, the client must produce the residual or use the original filter.', ) @@ -1351,10 +1364,31 @@ class Schema(StructType): ) +class CompletedPlanningResult(ScanTasks): + """ + Completed server-side planning result + """ + + status: Literal['completed'] + + +class FetchScanTasksResult(ScanTasks): + """ + Response schema for fetchScanTasks + """ + + class ReportMetricsRequest1(ScanReport): report_type: str = Field(..., alias='report-type') +class CompletedPlanningWithIDResult(CompletedPlanningResult): + plan_id: Optional[str] = Field( + None, alias='plan-id', description='ID used to track a planning request' + ) + status: Literal['completed'] + + StructField.update_forward_refs() ListType.update_forward_refs() MapType.update_forward_refs() @@ -1362,9 +1396,12 @@ class ReportMetricsRequest1(ScanReport): TableMetadata.update_forward_refs() ViewMetadata.update_forward_refs() AddSchemaUpdate.update_forward_refs() -PlanTableResult.update_forward_refs() -GetTasksStatusResult.update_forward_refs() -RetrieveTasksResult.update_forward_refs() +ScanTasks.update_forward_refs() +FetchPlanningResult.update_forward_refs() +PlanTableScanResult.update_forward_refs() CreateTableRequest.update_forward_refs() CreateViewRequest.update_forward_refs() ReportMetricsRequest.update_forward_refs() +CompletedPlanningResult.update_forward_refs() +FetchScanTasksResult.update_forward_refs() +CompletedPlanningWithIDResult.update_forward_refs() diff --git a/open-api/rest-catalog-open-api.yaml b/open-api/rest-catalog-open-api.yaml index 8ea65e0910cc..6fd05b7e52ab 100644 --- a/open-api/rest-catalog-open-api.yaml +++ b/open-api/rest-catalog-open-api.yaml @@ -3096,44 +3096,99 @@ components: items: $ref: '#/components/schemas/PlanTask' - EmptyResult: + CompletedPlanningResult: type: object - 
description: Empty object result + description: Completed server-side planning result + allOf: + - $ref: '#/components/schemas/ScanTasks' + - type: object + required: + - status + properties: + status: + $ref: '#/components/schemas/PlanStatus' + enum: ["completed"] - PlanningResult: + CompletedPlanningWithIDResult: type: object - description: Result of server-side scan planning - discriminator: - propertyName: status - mapping: - completed: '#/components/schemas/ScanTasks' - submitted: '#/components/schemas/EmptyResult' - cancelled: '#/components/schemas/EmptyResult' - failed: '#/components/schemas/IcebergErrorResponse' - oneOf: - - $ref: '#/components/schemas/ScanTasks' + allOf: + - $ref: '#/components/schemas/CompletedPlanningResult' + - type: object + properties: + plan-id: + description: ID used to track a planning request + type: string + + FailedPlanningResult: + type: object + description: Failed server-side planning result + allOf: - $ref: '#/components/schemas/IcebergErrorResponse' - - $ref: '#/components/schemas/EmptyResult' + - type: object + required: + - status + properties: + status: + $ref: '#/components/schemas/PlanStatus' + enum: ["failed"] + + AsyncPlanningResult: + type: object required: - status properties: status: $ref: '#/components/schemas/PlanStatus' + enum: ["submitted"] + plan-id: + description: ID used to track a planning request + type: string + + EmptyResult: + type: object + description: Empty server-side planning result + required: + - status + properties: + status: + $ref: '#/components/schemas/PlanStatus' + enum: ["submitted", "cancelled"] PlanStatus: description: Status of a server-side planning operation type: string - enum: [completed, submitted, cancelled, failed] + enum: ["completed", "submitted", "cancelled", "failed"] + + FetchPlanningResult: + type: object + description: Result of server-side scan planning for fetchPlanningResult + discriminator: + propertyName: status + mapping: + completed: 
'#/components/schemas/CompletedPlanningResult' + submitted: '#/components/schemas/EmptyResult' + cancelled: '#/components/schemas/EmptyResult' + failed: '#/components/schemas/FailedPlanningResult' + oneOf: + - $ref: '#/components/schemas/CompletedPlanningResult' + - $ref: '#/components/schemas/FailedPlanningResult' + - $ref: '#/components/schemas/EmptyResult' PlanTableScanResult: type: object - description: Response schema for planTableScan - allOf: - - $ref: '#/components/schemas/PlanningResult' - properties: - plan-id: - description: ID used to track a planning request - type: string + description: Result of server-side scan planning for planTableScan + discriminator: + propertyName: status + mapping: + completed: '#/components/schemas/CompletedPlanningWithIDResult' + submitted: '#/components/schemas/AsyncPlanningResult' + cancelled: '#/components/schemas/EmptyResult' + failed: '#/components/schemas/FailedPlanningResult' + oneOf: + - $ref: '#/components/schemas/CompletedPlanningWithIDResult' + - $ref: '#/components/schemas/FailedPlanningResult' + - $ref: '#/components/schemas/AsyncPlanningResult' + - $ref: '#/components/schemas/EmptyResult' FetchScanTasksResult: type: object @@ -4341,18 +4396,18 @@ components: $ref: '#/components/schemas/LoadTableResult' PlanTableScanResponse: - description: Result of planning a table scan + description: Result of submitting a table scan to plan content: application/json: schema: $ref: '#/components/schemas/PlanTableScanResult' FetchPlanningResultResponse: - description: Result of checking status of a plan + description: Result of fetching a submitted scan planning operation content: application/json: schema: - $ref: '#/components/schemas/PlanningResult' + $ref: '#/components/schemas/FetchPlanningResult' FetchScanTasksResponse: description: Result of retrieving additional plan tasks and file scan tasks. 
From d9154460e12ae0cf2f15ec434a8308ba7976bc69 Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Fri, 30 Aug 2024 16:10:46 -0700 Subject: [PATCH 09/12] Fix yaml for some editors. Co-authored-by: Daniel Weeks --- open-api/rest-catalog-open-api.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/open-api/rest-catalog-open-api.yaml b/open-api/rest-catalog-open-api.yaml index 6fd05b7e52ab..0c65cf605d19 100644 --- a/open-api/rest-catalog-open-api.yaml +++ b/open-api/rest-catalog-open-api.yaml @@ -550,7 +550,7 @@ paths: tags: - Catalog API summary: Submit a scan for planning - description: + description: > Submits a scan for server-side planning. @@ -642,7 +642,7 @@ paths: - Catalog API summary: Fetches the result of scan planning for a plan-id operationId: fetchPlanningResult - description: + description: > Fetches the result of scan planning for a plan-id. @@ -700,7 +700,7 @@ paths: - Catalog API summary: Cancels scan planning for a plan-id operationId: cancelPlanning - description: + description: > Cancels scan planning for a plan-id. 
From 2dc3fa552ece9d271ab278766108b52f6091df77 Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Tue, 3 Sep 2024 15:01:59 -0700 Subject: [PATCH 10/12] Update open-api/rest-catalog-open-api.yaml Co-authored-by: Daniel Weeks --- open-api/rest-catalog-open-api.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/open-api/rest-catalog-open-api.yaml b/open-api/rest-catalog-open-api.yaml index 0c65cf605d19..da87c95d7dc0 100644 --- a/open-api/rest-catalog-open-api.yaml +++ b/open-api/rest-catalog-open-api.yaml @@ -3144,7 +3144,7 @@ components: description: ID used to track a planning request type: string - EmptyResult: + EmptyPlanningResult: type: object description: Empty server-side planning result required: From 64c0e85e9ab020e36800d4b71cfe14520bd8e5a2 Mon Sep 17 00:00:00 2001 From: Rahil Chertara Date: Tue, 3 Sep 2024 15:26:48 -0700 Subject: [PATCH 11/12] fix yaml issues --- open-api/rest-catalog-open-api.py | 15 +++++++++------ open-api/rest-catalog-open-api.yaml | 16 ++++++++-------- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/open-api/rest-catalog-open-api.py b/open-api/rest-catalog-open-api.py index e268b3d6d9af..31fbf33d43f5 100644 --- a/open-api/rest-catalog-open-api.py +++ b/open-api/rest-catalog-open-api.py @@ -832,9 +832,10 @@ class FieldName(BaseModel): class PlanTask(BaseModel): - """ - An opaque JSON object provided by the REST server that represents a unit of work to produce file scan tasks for scan planning. 
- """ + __root__: str = Field( + ..., + description='An opaque string provided by the REST server that represents a unit of work to produce file scan tasks for scan planning.', + ) class CreateNamespaceRequest(BaseModel): @@ -896,7 +897,7 @@ class AsyncPlanningResult(BaseModel): ) -class EmptyResult(BaseModel): +class EmptyPlanningResult(BaseModel): """ Empty server-side planning result """ @@ -1198,7 +1199,9 @@ class ScanTasks(BaseModel): class FetchPlanningResult(BaseModel): - __root__: Union[CompletedPlanningResult, FailedPlanningResult, EmptyResult] = Field( + __root__: Union[ + CompletedPlanningResult, FailedPlanningResult, EmptyPlanningResult + ] = Field( ..., description='Result of server-side scan planning for fetchPlanningResult', discriminator='status', @@ -1210,7 +1213,7 @@ class PlanTableScanResult(BaseModel): CompletedPlanningWithIDResult, FailedPlanningResult, AsyncPlanningResult, - EmptyResult, + EmptyPlanningResult, ] = Field( ..., description='Result of server-side scan planning for planTableScan', diff --git a/open-api/rest-catalog-open-api.yaml b/open-api/rest-catalog-open-api.yaml index da87c95d7dc0..1ba67dadf14f 100644 --- a/open-api/rest-catalog-open-api.yaml +++ b/open-api/rest-catalog-open-api.yaml @@ -887,7 +887,7 @@ paths: The snapshots to return in the body of the metadata. Setting the value to `all` would return the full set of snapshots currently valid for the table. Setting the value to `refs` would load all snapshots referenced by branches or tags. - + Default if no param is provided is `all`. 
required: false schema: @@ -3166,13 +3166,13 @@ components: propertyName: status mapping: completed: '#/components/schemas/CompletedPlanningResult' - submitted: '#/components/schemas/EmptyResult' - cancelled: '#/components/schemas/EmptyResult' + submitted: '#/components/schemas/EmptyPlanningResult' + cancelled: '#/components/schemas/EmptyPlanningResult' failed: '#/components/schemas/FailedPlanningResult' oneOf: - $ref: '#/components/schemas/CompletedPlanningResult' - $ref: '#/components/schemas/FailedPlanningResult' - - $ref: '#/components/schemas/EmptyResult' + - $ref: '#/components/schemas/EmptyPlanningResult' PlanTableScanResult: type: object @@ -3182,13 +3182,13 @@ components: mapping: completed: '#/components/schemas/CompletedPlanningWithIDResult' submitted: '#/components/schemas/AsyncPlanningResult' - cancelled: '#/components/schemas/EmptyResult' + cancelled: '#/components/schemas/EmptyPlanningResult' failed: '#/components/schemas/FailedPlanningResult' oneOf: - $ref: '#/components/schemas/CompletedPlanningWithIDResult' - $ref: '#/components/schemas/FailedPlanningResult' - $ref: '#/components/schemas/AsyncPlanningResult' - - $ref: '#/components/schemas/EmptyResult' + - $ref: '#/components/schemas/EmptyPlanningResult' FetchScanTasksResult: type: object @@ -4154,9 +4154,9 @@ components: PlanTask: description: - An opaque JSON object provided by the REST server that represents a + An opaque string provided by the REST server that represents a unit of work to produce file scan tasks for scan planning. 
- type: object + type: string FileScanTask: type: object From 961a37817ca9b0d0e385850cfe1cfe3e53b7e345 Mon Sep 17 00:00:00 2001 From: Rahil Chertara Date: Mon, 9 Sep 2024 17:53:41 -0700 Subject: [PATCH 12/12] address flyrain comments --- open-api/rest-catalog-open-api.py | 5 +++-- open-api/rest-catalog-open-api.yaml | 12 +++++------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/open-api/rest-catalog-open-api.py b/open-api/rest-catalog-open-api.py index 31fbf33d43f5..6d71de7c069c 100644 --- a/open-api/rest-catalog-open-api.py +++ b/open-api/rest-catalog-open-api.py @@ -834,7 +834,7 @@ class FieldName(BaseModel): class PlanTask(BaseModel): __root__: str = Field( ..., - description='An opaque string provided by the REST server that represents a unit of work to produce file scan tasks for scan planning.', + description='An opaque string provided by the REST server that represents a unit of work to produce file scan tasks for scan planning. This allows clients to fetch tasks across multiple requests to accommodate large result sets.', ) @@ -1181,12 +1181,13 @@ class ScanTasks(BaseModel): Scan and planning tasks for server-side scan planning - `plan-tasks` contains opaque units of planning work - - `file-scan-tasks` contains a partial list of table scan tasks + - `file-scan-tasks` contains a partial or complete list of table scan tasks - `delete-files` contains delete files referenced by file scan tasks Each plan task must be passed to the fetchScanTasks endpoint to fetch the file scan tasks for the plan task. The list of delete files must contain all delete files referenced by the file scan tasks. 
+ """ delete_files: Optional[List[DeleteFile]] = Field( diff --git a/open-api/rest-catalog-open-api.yaml b/open-api/rest-catalog-open-api.yaml index 1ba67dadf14f..9523681940ae 100644 --- a/open-api/rest-catalog-open-api.yaml +++ b/open-api/rest-catalog-open-api.yaml @@ -563,7 +563,7 @@ paths: point-in-time scan of the latest snapshot in the table's main branch. - Responses must include a valid status + Responses must include a valid status listed below. A "cancelled" status is considered invalid for this endpoint. - When "completed" the planning operation has produced plan tasks and file scan tasks that must be returned in the response (not fetched @@ -574,9 +574,6 @@ paths: - When "failed" the response must be a valid error response - - Status "cancelled" is not a valid status from this endpoint - - The response for a "completed" planning operation includes two types of tasks (file scan tasks and plan tasks) and both may be included in the response. Tasks must not be included for any other response status. @@ -3064,13 +3061,13 @@ components: ScanTasks: type: object - description: + description: > Scan and planning tasks for server-side scan planning - `plan-tasks` contains opaque units of planning work - - `file-scan-tasks` contains a partial list of table scan tasks + - `file-scan-tasks` contains a partial or complete list of table scan tasks - `delete-files` contains delete files referenced by file scan tasks @@ -4155,7 +4152,8 @@ components: PlanTask: description: An opaque string provided by the REST server that represents a - unit of work to produce file scan tasks for scan planning. + unit of work to produce file scan tasks for scan planning. This allows + clients to fetch tasks across multiple requests to accommodate large result sets. type: string FileScanTask: