Skip to content

Commit

Permalink
fix(ingest): presto-on-hive - Adding catalog name to the presto on hive urn (#6024)
Browse files Browse the repository at this point in the history

Co-authored-by: Shirshanka Das <[email protected]>
  • Loading branch information
treff7es and shirshanka authored Oct 19, 2022
1 parent e54f376 commit 0545f3c
Show file tree
Hide file tree
Showing 6 changed files with 4,864 additions and 2,305 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,11 @@ class PrestoOnHiveConfig(BasicSQLAlchemyConfig):
description="Dataset Subtype name to be 'Table' or 'View' Valid options: ['True', 'False']",
)

include_catalog_name_in_ids: bool = Field(
default=False,
description="Add the Presto catalog name (e.g. hive) to the generated dataset urns. `urn:li:dataset:(urn:li:dataPlatform:hive,hive.user.logging_events,PROD)` versus `urn:li:dataset:(urn:li:dataPlatform:hive,user.logging_events,PROD)`",
)

def get_sql_alchemy_url(self, uri_opts: Optional[Dict[str, Any]] = None) -> str:
if not ((self.host_port and self.scheme) or self.sqlalchemy_uri):
raise ValueError("host_port and schema or connect_uri required.")
Expand Down Expand Up @@ -407,9 +412,17 @@ def loop_tables(
iter_res = self._alchemy_client.execute_query(statement)

for key, group in groupby(iter_res, self._get_table_key):
db_name = self.get_db_name(inspector)
schema_name = (
f"{db_name}.{key.schema}"
if self.config.include_catalog_name_in_ids
else key.schema
)

dataset_name = self.get_identifier(
schema=key.schema, entity=key.table, inspector=inspector
schema=schema_name, entity=key.table, inspector=inspector
)

self.report.report_entity_scanned(dataset_name, ent_type="table")

if not sql_config.table_pattern.allowed(dataset_name):
Expand Down Expand Up @@ -521,8 +534,14 @@ def get_hive_view_columns(self, inspector: Inspector) -> Iterable[ViewDataset]:

iter_res = self._alchemy_client.execute_query(statement)
for key, group in groupby(iter_res, self._get_table_key):
db_name = self.get_db_name(inspector)
schema_name = (
f"{db_name}.{key.schema}"
if self.config.include_catalog_name_in_ids
else key.schema
)
dataset_name = self.get_identifier(
schema=key.schema, entity=key.table, inspector=inspector
schema=schema_name, entity=key.table, inspector=inspector
)
columns = list(group)

Expand Down Expand Up @@ -553,8 +572,16 @@ def get_presto_view_columns(self, inspector: Inspector) -> Iterable[ViewDataset]

iter_res = self._alchemy_client.execute_query(statement)
for row in iter_res:
db_name = self.get_db_name(inspector)
schema_name = (
f"{db_name}.{row['schema']}"
if self.config.include_catalog_name_in_ids
else row["schema"]
)
dataset_name = self.get_identifier(
schema=row["schema"], entity=row["name"], inspector=inspector
schema=schema_name,
entity=row["name"],
inspector=inspector,
)

columns, view_definition = self._get_presto_view_column_metadata(
Expand Down
Loading

0 comments on commit 0545f3c

Please sign in to comment.