feat(airflow): support kafka hook in the airflow plugin #6508

Merged (1 commit) on Nov 22, 2022
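What this change does: the plugin previously logged "Emitted Datahub DataFlow/dataJob ..." immediately after calling emit(). That is accurate for the synchronous REST emitter, but the Kafka-based emitter hands the record to a producer and reports delivery asynchronously, so the old log line could claim success before, or even despite, a failed send. The diff below adds a _make_emit_callback helper and passes the resulting delivery callback to emit(), so the outcome is logged only once it is actually known. The following is a minimal, self-contained sketch of that callback pattern; FakeKafkaEmitter and its emit() signature are invented stand-ins for illustration, not the DataHub emitter API (only the (Optional[Exception], str) callback shape comes from the diff).

import logging
import threading
from typing import Callable, Optional

logging.basicConfig(level=logging.INFO)


def make_emit_callback(
    logger: logging.Logger, goal: str
) -> Callable[[Optional[Exception], str], None]:
    # Same shape as the plugin's _make_emit_callback: the emitter calls us back
    # with (error, message) once delivery has actually succeeded or failed.
    def callback(err: Optional[Exception], msg: str) -> None:
        if not err:
            logger.info(f"Sent metadata to datahub {goal}")
        else:
            logger.error(f"Failed to send {goal} to datahub: {msg}", exc_info=err)

    return callback


class FakeKafkaEmitter:
    # Hypothetical stand-in: emit() returns immediately and the delivery result
    # arrives later on another thread, which is why logging success right after
    # the emit() call would be misleading for a Kafka-backed emitter.
    def emit(
        self, payload: str, callback: Callable[[Optional[Exception], str], None]
    ) -> None:
        def deliver() -> None:
            # Pretend the broker rejects empty payloads.
            err = ValueError("empty payload") if not payload else None
            callback(err, payload)

        threading.Timer(0.1, deliver).start()


if __name__ == "__main__":
    log = logging.getLogger("demo")
    emitter = FakeKafkaEmitter()
    emitter.emit("dataflow-snapshot", make_emit_callback(log, "DataFlow demo"))
    emitter.emit("", make_emit_callback(log, "dataJob demo"))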
32 changes: 24 additions & 8 deletions metadata-ingestion/src/datahub_provider/_plugin.py
@@ -1,8 +1,9 @@
 from datahub_provider._airflow_compat import Operator

+import contextlib
 import logging
 import traceback
-from typing import Any, Iterable, List
+from typing import Any, Callable, Iterable, List, Optional

 from airflow.configuration import conf
 from airflow.lineage import PIPELINE_OUTLETS
@@ -105,6 +106,19 @@ def get_inlets_from_task(task: BaseOperator, context: Any) -> Iterable[Any]:
     return inlets


+def _make_emit_callback(
+    logger: logging.Logger,
+    goal: str,
+) -> Callable[[Optional[Exception], str], None]:
+    def kafka_callback(err: Optional[Exception], msg: str) -> None:
+        if not err:
+            logger.info(f"Sent metadata to datahub {goal}")
+        else:
+            logger.error(f"Failed to send {goal} to datahub: {msg}", exc_info=err)
+
+    return kafka_callback
+
+
 def datahub_task_status_callback(context, status):
     ti = context["ti"]
     task: "BaseOperator" = ti.task
@@ -126,9 +140,9 @@ def datahub_task_status_callback(context, status):
         capture_tags=context["_datahub_config"].capture_tags_info,
         capture_owner=context["_datahub_config"].capture_ownership_info,
     )
-    dataflow.emit(emitter)
-
-    task.log.info(f"Emitted Datahub DataFlow: {dataflow}")
+    dataflow.emit(
+        emitter, callback=_make_emit_callback(task.log, f"Datahub DataFlow {dataflow}")
+    )

     datajob = AirflowGenerator.generate_datajob(
         cluster=context["_datahub_config"].cluster,
@@ -145,8 +159,9 @@ def datahub_task_status_callback(context, status):
     for outlet in task_outlets:
         datajob.outlets.append(outlet.urn)

-    task.log.info(f"Emitted Datahub dataJob: {datajob}")
-    datajob.emit(emitter)
+    datajob.emit(
+        emitter, callback=_make_emit_callback(task.log, f"Datahub dataJob {datajob}")
+    )

     if context["_datahub_config"].capture_executions:
         dpi = AirflowGenerator.run_datajob(
@@ -207,8 +222,9 @@ def datahub_pre_execution(context):
     for outlet in task_outlets:
         datajob.outlets.append(outlet.urn)

-    datajob.emit(emitter)
-    task.log.info(f"Emitting Datahub DataJob: {datajob}")
+    datajob.emit(
+        emitter, callback=_make_emit_callback(task.log, f"Datahub dataJob {datajob}")
+    )

     if context["_datahub_config"].capture_executions:
         dpi = AirflowGenerator.run_datajob(
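Not visible in this file is how the Kafka hook gets selected: the plugin presumably builds its emitter from whichever DataHub connection Airflow is configured to use. As a rough sketch of how the Kafka path might be exercised, assuming the provider registers a datahub_kafka connection type and that the plugin resolves a configurable connection id (the conn_id and broker address below are placeholders; the DataHub Airflow docs are the authoritative reference):

# Sketch only: register a Kafka-backed DataHub connection via Airflow's ORM.
# The conn_type "datahub_kafka", the conn_id, and the broker address are
# assumptions for illustration, not taken from this diff.
from airflow import settings
from airflow.models import Connection

conn = Connection(
    conn_id="datahub_kafka_default",  # placeholder; point the plugin's configured conn_id at it
    conn_type="datahub_kafka",        # assumed to be registered by datahub_provider's Kafka hook
    host="broker:9092",               # placeholder Kafka bootstrap server
)

session = settings.Session()
if not session.query(Connection).filter(Connection.conn_id == conn.conn_id).first():
    session.add(conn)
    session.commit()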