From e988d5808486e18efb0e08a56d919c8108d85782 Mon Sep 17 00:00:00 2001 From: Piotr Skrydalewicz Date: Wed, 20 Nov 2024 15:49:10 +0100 Subject: [PATCH 1/5] Bumped pyiceberg dependency to 0.8.0 --- metadata-ingestion/setup.py | 2 +- .../src/datahub/ingestion/source/iceberg/iceberg.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 2469af74b0334..3908a24cea799 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -249,7 +249,7 @@ iceberg_common = { # Iceberg Python SDK - "pyiceberg>=0.4,<0.7", + "pyiceberg>=0.8.0", } mssql_common = { diff --git a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py index 258a4b9ad6daf..e8f401f23d398 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py +++ b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py @@ -192,9 +192,7 @@ def _process_dataset(dataset_path: Identifier) -> Iterable[MetadataWorkUnit]: table = thread_local.local_catalog.load_table(dataset_path) time_taken = timer.elapsed_seconds() self.report.report_table_load_time(time_taken) - LOGGER.debug( - f"Loaded table: {table.identifier}, time taken: {time_taken}" - ) + LOGGER.debug(f"Loaded table: {table.name()}, time taken: {time_taken}") yield from self._create_iceberg_workunit(dataset_name, table) except NoSuchPropertyException as e: self.report.report_warning( From bafab23f560b75646c2d626b2cdd055de8e6455e Mon Sep 17 00:00:00 2001 From: Piotr Skrydalewicz Date: Thu, 28 Nov 2024 13:20:48 +0100 Subject: [PATCH 2/5] Added iceberg recipe definition to the UI --- .../src/app/ingest/source/builder/sources.json | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/datahub-web-react/src/app/ingest/source/builder/sources.json b/datahub-web-react/src/app/ingest/source/builder/sources.json index c20869a1c849c..f7661af88e34b 100644 --- 
a/datahub-web-react/src/app/ingest/source/builder/sources.json +++ b/datahub-web-react/src/app/ingest/source/builder/sources.json @@ -317,5 +317,13 @@ "displayName": "CassandraDB", "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/cassandra", "recipe": "source:\n type: cassandra\n config:\n # Credentials for on prem cassandra\n contact_point: localhost\n port: 9042\n username: admin\n password: password\n\n # Or\n # Credentials Astra Cloud\n #cloud_config:\n # secure_connect_bundle: Path to Secure Connect Bundle (.zip)\n # token: Application Token\n\n # Optional Allow / Deny extraction of particular keyspaces.\n keyspace_pattern:\n allow: [.*]\n\n # Optional Allow / Deny extraction of particular tables.\n table_pattern:\n allow: [.*]" + }, + { + "urn": "urn:li:dataPlatform:iceberg", + "name": "iceberg", + "displayName": "Iceberg", + "description": "Ingest databases and tables from any Iceberg catalog implementation", + "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/iceberg", + "recipe": "source:\n type: \"iceberg\"\n config:\n env: dev\n # each thread will open internet connections to fetch manifest files independently, \n # this value needs to be adjusted with ulimit\n processing_threads: 1 \n # a single catalog definition with a form of a dictionary\n catalog: \n demo: # name of the catalog\n type: \"rest\" # other types are available\n uri: \"uri\"\n s3.access-key-id: \"access-key\"\n s3.secret-access-key: \"secret-access-key\"\n s3.region: \"aws-region\"\n profiling:\n enabled: false\n" } ] From 835d33b24d5011da8a3f4da75f80be550962d27e Mon Sep 17 00:00:00 2001 From: Piotr Skrydalewicz Date: Thu, 28 Nov 2024 13:28:36 +0100 Subject: [PATCH 3/5] Added handling of additional exception, clarified documentation --- .../src/datahub/ingestion/source/iceberg/iceberg.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py 
b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py index e8f401f23d398..5931873f54236 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py +++ b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py @@ -9,6 +9,7 @@ NoSuchIcebergTableError, NoSuchNamespaceError, NoSuchPropertyException, + NoSuchTableError, ) from pyiceberg.schema import Schema, SchemaVisitorPerPrimitiveType, visit from pyiceberg.table import Table @@ -104,7 +105,7 @@ @capability(SourceCapability.DESCRIPTIONS, "Enabled by default.") @capability( SourceCapability.OWNERSHIP, - "Optionally enabled via configuration by specifying which Iceberg table property holds user or group ownership.", + "Automatically ingests ownership information from table properties based on `user_ownership_property` and `group_ownership_property`", ) @capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion") class IcebergSource(StatefulIngestionSourceBase): @@ -204,12 +205,20 @@ def _process_dataset(dataset_path: Identifier) -> Iterable[MetadataWorkUnit]: ) except NoSuchIcebergTableError as e: self.report.report_warning( - "no-iceberg-table", + "not-an-iceberg-table", f"Failed to create workunit for {dataset_name}. {e}", ) LOGGER.warning( f"NoSuchIcebergTableError while processing table {dataset_path}, skipping it.", ) + except NoSuchTableError as e: + self.report.report_warning( + "no-such-table", + f"Failed to create workunit for {dataset_name}. 
{e}", + ) + LOGGER.warning( + f"NoSuchTableError while processing table {dataset_path}, skipping it.", + ) except Exception as e: self.report.report_failure("general", f"Failed to create workunit: {e}") LOGGER.exception( From 59fa1a92b3eaaccc0a13213f7649e1ce4120daac Mon Sep 17 00:00:00 2001 From: Piotr Skrydalewicz Date: Thu, 28 Nov 2024 14:34:19 +0100 Subject: [PATCH 4/5] Reverting dependency bump for iceberg --- metadata-ingestion/setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 3908a24cea799..8efc2ddccbd20 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -249,7 +249,8 @@ iceberg_common = { # Iceberg Python SDK - "pyiceberg>=0.8.0", + # Kept at 0.4.0 due to higher versions requiring pydantic>2, as soon as we are fine with it, bump this dependency + "pyiceberg>=0.4.0", } mssql_common = { From 151627a1e813f9a8289a9a0d252ebb5fb089ff27 Mon Sep 17 00:00:00 2001 From: Piotr Skrydalewicz Date: Thu, 28 Nov 2024 14:56:09 +0100 Subject: [PATCH 5/5] Filled in empty doc section --- metadata-ingestion/docs/sources/iceberg/iceberg.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/docs/sources/iceberg/iceberg.md b/metadata-ingestion/docs/sources/iceberg/iceberg.md index 7e40315a2e319..92aac5ffa6ce5 100644 --- a/metadata-ingestion/docs/sources/iceberg/iceberg.md +++ b/metadata-ingestion/docs/sources/iceberg/iceberg.md @@ -18,6 +18,8 @@ This ingestion source maps the following Source System Concepts to DataHub Conce ## Troubleshooting -### [Common Issue] +### Exceptions while increasing `processing_threads` -[Provide description of common issues with this integration and steps to resolve] +Each processing thread will open several files/sockets to download manifest files from blob storage. 
If you experience +exceptions when increasing the `processing_threads` configuration parameter, try raising the limit of open +files (e.g. using `ulimit` on Linux).