feat(ingest): add --log-file option and show CLI logs in UI report #7118

Merged · 8 commits · Jan 26, 2023
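The new global --log-file flag lets the CLI write its logs to a file, and the rest of the PR embeds those logs in the ingestion report shown in the UI. As a usage sketch (hypothetical paths; since the flag is defined on the datahub group it goes before the subcommand):

    datahub --log-file ./ingestion.log ingest -c recipe.yml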
18 changes: 10 additions & 8 deletions metadata-ingestion/setup.py
@@ -122,6 +122,8 @@ def get_long_description():
"greenlet",
}

sqllineage_lib = "sqllineage==1.3.6"
Collaborator: nice refactor!


aws_common = {
# AWS Python SDK
"boto3",
@@ -143,7 +145,7 @@ def get_long_description():
# See https://github.com/joshtemple/lkml/issues/73.
"lkml>=1.3.0b5",
"sql-metadata==2.2.2",
"sqllineage==1.3.6",
sqllineage_lib,
"GitPython>2",
}

@@ -165,7 +167,7 @@ def get_long_description():
"sqlalchemy-redshift",
"psycopg2-binary",
"GeoAlchemy2",
"sqllineage==1.3.6",
sqllineage_lib,
*path_spec_common,
}

@@ -255,18 +257,18 @@ def get_long_description():
"gql>=3.3.0",
"gql[requests]>=3.3.0",
},
"great-expectations": sql_common | {"sqllineage==1.3.6"},
"great-expectations": sql_common | {sqllineage_lib},
# Source plugins
# PyAthena is pinned with exact version because we use private method in PyAthena
"athena": sql_common | {"PyAthena[SQLAlchemy]==2.4.1"},
"azure-ad": set(),
"bigquery": sql_common
| bigquery_common
| {"sqllineage==1.3.6", "sql_metadata", "sqlalchemy-bigquery>=1.4.1"},
| {sqllineage_lib, "sql_metadata", "sqlalchemy-bigquery>=1.4.1"},
"bigquery-beta": sql_common
| bigquery_common
| {
"sqllineage==1.3.6",
sqllineage_lib,
"sql_metadata",
"sqlalchemy-bigquery>=1.4.1",
}, # deprecated, but keeping the extra for backwards compatibility
@@ -310,8 +312,8 @@ def get_long_description():
"ldap": {"python-ldap>=2.4"},
"looker": looker_common,
"lookml": looker_common,
"metabase": {"requests", "sqllineage==1.3.6"},
"mode": {"requests", "sqllineage==1.3.6", "tenacity>=8.0.1"},
"metabase": {"requests", sqllineage_lib},
"mode": {"requests", sqllineage_lib, "tenacity>=8.0.1"},
"mongodb": {"pymongo[srv]>=3.11", "packaging"},
"mssql": sql_common | {"sqlalchemy-pytds>=0.3"},
"mssql-odbc": sql_common | {"pyodbc"},
@@ -325,7 +327,7 @@ def get_long_description():
"presto-on-hive": sql_common
| {"psycopg2-binary", "acryl-pyhive[hive]>=0.6.12", "pymysql>=1.0.2"},
"pulsar": {"requests"},
"redash": {"redash-toolbelt", "sql-metadata", "sqllineage==1.3.6"},
"redash": {"redash-toolbelt", "sql-metadata", sqllineage_lib},
"redshift": sql_common | redshift_common,
"redshift-usage": sql_common | usage_common | redshift_common,
"s3": {*s3_base, *data_lake_profiling},
7 changes: 3 additions & 4 deletions metadata-ingestion/src/datahub/cli/cli_utils.py
@@ -88,8 +88,7 @@ def write_gms_config(
# ok to fail on this
previous_config = {}
log.debug(
f"Failed to retrieve config from file {DATAHUB_CONFIG_PATH}. This isn't fatal.",
e,
f"Failed to retrieve config from file {DATAHUB_CONFIG_PATH}: {e}. This isn't fatal."
)
config_dict = {**previous_config, **config.dict()}
else:
@@ -687,8 +686,6 @@ def get_aspects_for_entity(
aspect_py_class: Optional[Type[Any]] = _get_pydantic_class_from_aspect_name(
aspect_name
)
if aspect_name == "unknown":
print(f"Failed to find aspect_name for class {aspect_name}")

aspect_dict = a["value"]
if not typed:
@@ -699,6 +696,8 @@
aspect_map[aspect_name] = aspect_py_class.from_obj(post_json_obj)
except Exception as e:
log.error(f"Error on {json.dumps(aspect_dict)}", e)
else:
log.debug(f"Failed to find class for aspect {aspect_name}")

if aspects:
return {k: v for (k, v) in aspect_map.items() if k in aspects}
2 changes: 1 addition & 1 deletion metadata-ingestion/src/datahub/cli/delete_cli.py
@@ -187,7 +187,7 @@ def delete(
remove_references: bool = False

if (not force) and references_count > 0:
print(
click.echo(
f"This urn was referenced in {references_count} other aspects across your metadata graph:"
)
click.echo(
2 changes: 1 addition & 1 deletion metadata-ingestion/src/datahub/cli/ingest_cli.py
@@ -393,5 +393,5 @@ def rollback(
writer.writerow([row.get("urn")])

except IOError as e:
print(e)
logger.exception(f"Unable to save rollback failure report: {e}")
Collaborator: Thank you

sys.exit(f"Unable to write reports to {report_dir}")
4 changes: 2 additions & 2 deletions metadata-ingestion/src/datahub/cli/migrate.py
@@ -262,7 +262,7 @@ def dataplatform2instance_func(
delete_cli._delete_one_urn(src_entity_urn, soft=not hard, run_id=run_id)
migration_report.on_entity_migrated(src_entity_urn, "status") # type: ignore

print(f"{migration_report}")
click.echo(f"{migration_report}")
Collaborator: omg thank you

migrate_containers(
dry_run=dry_run,
env=env,
@@ -372,7 +372,7 @@ def migrate_containers(
delete_cli._delete_one_urn(src_urn, soft=not hard, run_id=run_id)
migration_report.on_entity_migrated(src_urn, "status") # type: ignore

print(f"{migration_report}")
click.echo(f"{migration_report}")


def get_containers_for_migration(env: str) -> List[Any]:
80 changes: 30 additions & 50 deletions metadata-ingestion/src/datahub/entrypoints.py
@@ -2,6 +2,7 @@
import os
import platform
import sys
from typing import Optional

import click
import stackprinter
@@ -25,21 +26,11 @@
from datahub.cli.telemetry import telemetry as telemetry_cli
from datahub.cli.timeline_cli import timeline
from datahub.telemetry import telemetry
from datahub.utilities.logging_manager import configure_logging
from datahub.utilities.server_config_util import get_gms_config

logger = logging.getLogger(__name__)

# Configure some loggers.
logging.getLogger("urllib3").setLevel(logging.ERROR)
logging.getLogger("snowflake").setLevel(level=logging.WARNING)
# logging.getLogger("botocore").setLevel(logging.INFO)
# logging.getLogger("google").setLevel(logging.INFO)

# Configure logger.
BASE_LOGGING_FORMAT = (
"[%(asctime)s] %(levelname)-8s {%(name)s:%(lineno)d} - %(message)s"
)
logging.basicConfig(format=BASE_LOGGING_FORMAT)
_logging_configured = None

MAX_CONTENT_WIDTH = 120

@@ -58,6 +49,12 @@
default=False,
help="Enable debug logging.",
)
@click.option(
"--log-file",
type=click.Path(dir_okay=False),
default=None,
help="Enable debug logging.",
)
@click.option(
"--debug-vars/--no-debug-vars",
type=bool,
@@ -79,35 +76,31 @@
)
@click.pass_context
def datahub(
ctx: click.Context, debug: bool, debug_vars: bool, detect_memory_leaks: bool
ctx: click.Context,
debug: bool,
log_file: Optional[str],
debug_vars: bool,
detect_memory_leaks: bool,
) -> None:
if debug_vars:
# debug_vars implies debug. This option isn't actually used here, but instead
# read directly from the command line arguments in the main entrypoint.
debug = True

# Insulate 'datahub' and all child loggers from inadvertent changes to the
# root logger by the external site packages that we import.
# (Eg: https://github.com/reata/sqllineage/commit/2df027c77ea0a8ea4909e471dcd1ecbf4b8aeb2f#diff-30685ea717322cd1e79c33ed8d37903eea388e1750aa00833c33c0c5b89448b3R11
# changes the root logger's handler level to WARNING, causing any message below
# WARNING level to be dropped after this module is imported, irrespective
# of the logger's logging level! The lookml source was affected by this).

# 1. Create 'datahub' parent logger.
datahub_logger = logging.getLogger("datahub")
# 2. Setup the stream handler with formatter.
stream_handler = logging.StreamHandler()
formatter = logging.Formatter(BASE_LOGGING_FORMAT)
stream_handler.setFormatter(formatter)
datahub_logger.addHandler(stream_handler)
# 3. Turn off propagation to the root handler.
datahub_logger.propagate = False
# 4. Adjust log-levels.
if debug or get_boolean_env_variable("DATAHUB_DEBUG", False):
logging.getLogger().setLevel(logging.INFO)
datahub_logger.setLevel(logging.DEBUG)
logging.getLogger("datahub_classify").setLevel(logging.DEBUG)
else:
logging.getLogger().setLevel(logging.WARNING)
datahub_logger.setLevel(logging.INFO)
debug = debug or get_boolean_env_variable("DATAHUB_DEBUG", False)

# Note that we're purposely leaking the context manager here.
# Technically we should wrap this with ctx.with_resource(). However, we have
# some important error logging in the main() wrapper function that we don't
# want to miss. If we wrap this with ctx.with_resource(), then click would
# clean it up before those error handlers are processed.
# So why is this ok? Because we're leaking a context manager, this will
# still get cleaned up automatically when the memory is reclaimed, which is
# worst-case at program exit.
global _logging_configured
_logging_configured = None # see if we can force python to GC this
_logging_configured = configure_logging(debug=debug, log_file=log_file)
_logging_configured.__enter__()

# Setup the context for the memory_leak_detector decorator.
ctx.ensure_object(dict)
@@ -232,16 +225,3 @@ def main(**kwargs):
)
logger.debug(f"GMS config {get_gms_config()}")
sys.exit(1)


def _get_pretty_chained_message(exc: Exception) -> str:
pretty_msg = f"{exc.__class__.__name__} {exc}"
tmp_exc = exc.__cause__
indent = "\n\t\t"
while tmp_exc:
pretty_msg = (
f"{pretty_msg} due to {indent}{tmp_exc.__class__.__name__}{tmp_exc}"
)
tmp_exc = tmp_exc.__cause__
indent += "\t"
return pretty_msg
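The comment block in the datahub() group above explains why the logging context manager is entered by hand and deliberately never exited. A minimal sketch of that pattern, using a hypothetical logging_context() stand-in (the real configure_logging in datahub.utilities.logging_manager is not part of this diff), with the formatter string taken from the removed BASE_LOGGING_FORMAT:

import contextlib
import logging
import sys
from typing import Iterator, Optional


@contextlib.contextmanager
def logging_context(debug: bool, log_file: Optional[str]) -> Iterator[None]:
    # Hypothetical stand-in for configure_logging(): install handlers on the
    # "datahub" logger and undo the changes when the context exits.
    datahub_logger = logging.getLogger("datahub")
    handlers = [logging.StreamHandler(sys.stderr)]
    if log_file:
        handlers.append(logging.FileHandler(log_file))
    formatter = logging.Formatter(
        "[%(asctime)s] %(levelname)-8s {%(name)s:%(lineno)d} - %(message)s"
    )
    for handler in handlers:
        handler.setFormatter(formatter)
        datahub_logger.addHandler(handler)
    datahub_logger.setLevel(logging.DEBUG if debug else logging.INFO)
    datahub_logger.propagate = False  # insulate from root-logger changes by imports
    try:
        yield
    finally:
        for handler in handlers:
            datahub_logger.removeHandler(handler)
            handler.close()


# Entering the context manager by hand, without `with`, keeps the handlers
# installed for the rest of the process, so error handling that runs after the
# click group returns is still logged; teardown effectively happens at exit.
_logging_configured = logging_context(debug=True, log_file="ingestion.log")
_logging_configured.__enter__()
logging.getLogger("datahub").info("logs now go to stderr and ingestion.log")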
@@ -24,8 +24,10 @@
ExecutionRequestInputClass,
ExecutionRequestResultClass,
ExecutionRequestSourceClass,
StructuredExecutionReportClass,
_Aspect,
)
from datahub.utilities.logging_manager import get_log_buffer
from datahub.utilities.urns.urn import Urn

logger = logging.getLogger(__name__)
@@ -143,7 +145,6 @@ def __init__(self, sink: Sink, report_recipe: bool, ctx: PipelineContext) -> None:
# Emit the dataHubIngestionSourceInfo aspect
self._emit_aspect(
entity_urn=self.ingestion_source_urn,
aspect_name="dataHubIngestionSourceInfo",
aspect_value=source_info_aspect,
)

@@ -154,17 +155,12 @@ def _get_recipe_to_report(self, ctx: PipelineContext) -> str:
else:
return json.dumps(redact_raw_config(ctx.pipeline_config._raw_dict))

def _emit_aspect(
self, entity_urn: Urn, aspect_name: str, aspect_value: _Aspect
) -> None:
def _emit_aspect(self, entity_urn: Urn, aspect_value: _Aspect) -> None:
self.sink.write_record_async(
RecordEnvelope(
record=MetadataChangeProposalWrapper(
entityType=entity_urn.get_type(),
entityUrn=str(entity_urn),
aspectName=aspect_name,
aspect=aspect_value,
changeType="UPSERT",
),
metadata={},
),
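Dropping aspect_name (and changeType) here relies on MetadataChangeProposalWrapper filling those in itself; to my understanding it derives aspectName from the aspect's class and defaults changeType to UPSERT. Roughly:

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import StatusClass

# aspectName ("status") is derived from the aspect's class, and changeType
# defaults to UPSERT, so neither needs to be passed explicitly.
mcp = MetadataChangeProposalWrapper(
    entityType="dataset",
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,db.table,PROD)",
    aspect=StatusClass(removed=False),
)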
@@ -190,7 +186,6 @@ def on_start(self, ctx: PipelineContext) -> None:
# Emit the dataHubExecutionRequestInput aspect
self._emit_aspect(
entity_urn=self.execution_request_input_urn,
aspect_name="dataHubExecutionRequestInput",
aspect_value=execution_input_aspect,
)

@@ -200,18 +195,29 @@
report: Dict[str, Any],
ctx: PipelineContext,
) -> None:
# Prepare a nicely formatted summary
structured_report_str = json.dumps(report, indent=2)
summary = f"~~~~ Ingestion Report ~~~~\n{structured_report_str}\n\n"
summary += "~~~~ Ingestion Logs ~~~~\n"
summary += get_log_buffer().format_lines()
Collaborator: ah so here we leverage the in memory buffer. nice
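get_log_buffer() pulls the recent CLI log lines out of an in-memory handler so they can be appended to the report. The actual logging_manager module is not shown in this PR; a minimal sketch of what such a buffer could look like:

import collections
import logging


class InMemoryLogBuffer(logging.Handler):
    # Hypothetical sketch of a bounded in-memory log handler; the real
    # datahub.utilities.logging_manager implementation is not in this diff.
    def __init__(self, maxlen: int = 1000) -> None:
        super().__init__()
        self._lines = collections.deque(maxlen=maxlen)

    def emit(self, record: logging.LogRecord) -> None:
        self._lines.append(self.format(record))

    def format_lines(self) -> str:
        return "\n".join(self._lines)


_buffer = InMemoryLogBuffer()
_buffer.setFormatter(logging.Formatter("[%(asctime)s] %(levelname)-8s - %(message)s"))


def get_log_buffer() -> InMemoryLogBuffer:
    # Hypothetical accessor matching the name used in the diff above.
    return _buffer


logger = logging.getLogger("datahub")
logger.addHandler(_buffer)
logger.warning("this line would show up under the ingestion-logs section")
print(get_log_buffer().format_lines())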


# Construct the dataHubExecutionRequestResult aspect
structured_report = StructuredExecutionReportClass(
type="CLI_INGEST",
serializedValue=structured_report_str,
contentType="application/json",
)
execution_result_aspect = ExecutionRequestResultClass(
status=status,
startTimeMs=self.start_time_ms,
durationMs=self.get_cur_time_in_ms() - self.start_time_ms,
report=json.dumps(report, indent=2),
report=summary,
Collaborator: Nice! So in this case, we are now sending the structured report to the correct field. From here, we can continue to add to that report and get it into our result aspect.

Collaborator: (Sets ourselves up for much richer reports)

Collaborator (Author): yup exactly - this way the structured report shows up in the correct field for both managed ingestion and cli ingestion

note that the structured report field is basically a string right now - we may need to rethink that in the future

structuredReport=structured_report,
)

# Emit the dataHubExecutionRequestResult aspect
self._emit_aspect(
entity_urn=self.execution_request_input_urn,
aspect_name="dataHubExecutionRequestResult",
aspect_value=execution_result_aspect,
)
self.sink.close()