From e695654cca9ba320e2cb6989887473960fed0e6e Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Thu, 26 Dec 2024 16:49:10 +0530 Subject: [PATCH 1/2] fix(ingest/gc): misc fixes in gc source --- .../datahub/ingestion/source/gc/datahub_gc.py | 21 +++++-- .../source/gc/execution_request_cleanup.py | 61 +++++++++++++++---- .../source_report/ingestion_stage.py | 1 + 3 files changed, 66 insertions(+), 17 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py b/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py index 4eecbb4d9d717..0bb1537b5c096 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py @@ -34,6 +34,7 @@ SoftDeletedEntitiesCleanupConfig, SoftDeletedEntitiesReport, ) +from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport logger = logging.getLogger(__name__) @@ -86,6 +87,7 @@ class DataHubGcSourceReport( DataProcessCleanupReport, SoftDeletedEntitiesReport, DatahubExecutionRequestCleanupReport, + IngestionStageReport, ): expired_tokens_revoked: int = 0 @@ -139,31 +141,38 @@ def get_workunits_internal( ) -> Iterable[MetadataWorkUnit]: if self.config.cleanup_expired_tokens: try: + self.report.report_ingestion_stage_start("Expired Token Cleanup") self.revoke_expired_tokens() except Exception as e: self.report.failure("While trying to cleanup expired token ", exc=e) if self.config.truncate_indices: try: + self.report.report_ingestion_stage_start("Truncate Indices") self.truncate_indices() except Exception as e: self.report.failure("While trying to truncate indices ", exc=e) if self.config.soft_deleted_entities_cleanup.enabled: try: + self.report.report_ingestion_stage_start( + "Soft Deleted Entities Cleanup" + ) self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities() except Exception as e: self.report.failure( "While trying to cleanup soft deleted entities ", exc=e ) - if self.config.execution_request_cleanup.enabled: - try: - self.execution_request_cleanup.run() - except Exception as e: - self.report.failure("While trying to cleanup execution request ", exc=e) if self.config.dataprocess_cleanup.enabled: try: + self.report.report_ingestion_stage_start("Data Process Cleanup") yield from self.dataprocess_cleanup.get_workunits_internal() except Exception as e: self.report.failure("While trying to cleanup data process ", exc=e) + if self.config.execution_request_cleanup.enabled: + try: + self.report.report_ingestion_stage_start("Execution request Cleanup") + self.execution_request_cleanup.run() + except Exception as e: + self.report.failure("While trying to cleanup execution request ", exc=e) yield from [] def truncate_indices(self) -> None: @@ -281,6 +290,8 @@ def revoke_expired_tokens(self) -> None: list_access_tokens = expired_tokens_res.get("listAccessTokens", {}) tokens = list_access_tokens.get("tokens", []) total = list_access_tokens.get("total", 0) + if tokens == []: + break for token in tokens: self.report.expired_tokens_revoked += 1 token_id = token["id"] diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py b/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py index 3baf858e44cdc..08b99aa120cc4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py @@ -1,3 +1,4 @@ +import datetime import logging import time from typing import Any, Dict, Iterator, Optional @@ -42,16 +43,28 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel): description="Global switch for this cleanup task", ) + runtime_limit_seconds: int = Field( + default=3600, + description="Maximum runtime in seconds for the cleanup task", + ) + + max_read_errors: int = Field( + default=10, + description="Maximum number of read errors before aborting", + ) + def keep_history_max_milliseconds(self): return self.keep_history_max_days * 24 * 3600 * 1000 class DatahubExecutionRequestCleanupReport(SourceReport): - execution_request_cleanup_records_read: int = 0 - execution_request_cleanup_records_preserved: int = 0 - execution_request_cleanup_records_deleted: int = 0 - execution_request_cleanup_read_errors: int = 0 - execution_request_cleanup_delete_errors: int = 0 + ergc_records_read: int = 0 + ergc_records_preserved: int = 0 + ergc_records_deleted: int = 0 + ergc_read_errors: int = 0 + ergc_delete_errors: int = 0 + ergc_start_time: Optional[datetime.datetime] = None + ergc_end_time: Optional[datetime.datetime] = None class CleanupRecord(BaseModel): @@ -124,6 +137,13 @@ def _scroll_execution_requests( params.update(overrides) while True: + if self._reached_runtime_limit(): + break + if self.report.ergc_read_errors >= self.config.max_read_errors: + logger.error( + f"ergc({self.instance_id}): too many read errors, aborting." + ) + break try: url = f"{self.graph.config.server}/openapi/v2/entity/{DATAHUB_EXECUTION_REQUEST_ENTITY_NAME}" response = self.graph._session.get(url, headers=headers, params=params) @@ -141,7 +161,7 @@ def _scroll_execution_requests( logger.error( f"ergc({self.instance_id}): failed to fetch next batch of execution requests: {e}" ) - self.report.execution_request_cleanup_read_errors += 1 + self.report.ergc_read_errors += 1 def _scroll_garbage_records(self): state: Dict[str, Dict] = {} @@ -150,7 +170,7 @@ def _scroll_garbage_records(self): running_guard_timeout = now_ms - 30 * 24 * 3600 * 1000 for entry in self._scroll_execution_requests(): - self.report.execution_request_cleanup_records_read += 1 + self.report.ergc_records_read += 1 key = entry.ingestion_source # Always delete corrupted records @@ -171,7 +191,7 @@ def _scroll_garbage_records(self): # Do not delete if number of requests is below minimum if state[key]["count"] < self.config.keep_history_min_count: - self.report.execution_request_cleanup_records_preserved += 1 + self.report.ergc_records_preserved += 1 continue # Do not delete if number of requests do not exceed allowed maximum, @@ -179,7 +199,7 @@ def _scroll_garbage_records(self): if (state[key]["count"] < self.config.keep_history_max_count) and ( entry.requested_at > state[key]["cutoffTimestamp"] ): - self.report.execution_request_cleanup_records_preserved += 1 + self.report.ergc_records_preserved += 1 continue # Do not delete if status is RUNNING or PENDING and created within last month. If the record is >month old and it did not @@ -188,7 +208,7 @@ def _scroll_garbage_records(self): "RUNNING", "PENDING", ]: - self.report.execution_request_cleanup_records_preserved += 1 + self.report.ergc_records_preserved += 1 continue # Otherwise delete current record @@ -200,7 +220,7 @@ def _scroll_garbage_records(self): f"record timestamp: {entry.requested_at}." ) ) - self.report.execution_request_cleanup_records_deleted += 1 + self.report.ergc_records_deleted += 1 yield entry def _delete_entry(self, entry: CleanupRecord) -> None: @@ -210,17 +230,31 @@ def _delete_entry(self, entry: CleanupRecord) -> None: ) self.graph.delete_entity(entry.urn, True) except Exception as e: - self.report.execution_request_cleanup_delete_errors += 1 + self.report.ergc_delete_errors += 1 logger.error( f"ergc({self.instance_id}): failed to delete ExecutionRequest {entry.request_id}: {e}" ) + def _reached_runtime_limit(self) -> bool: + if ( + self.config.runtime_limit_seconds + and self.report.ergc_start_time + and ( + datetime.datetime.now() - self.report.ergc_start_time + >= datetime.timedelta(seconds=self.config.runtime_limit_seconds) + ) + ): + logger.info(f"ergc({self.instance_id}): max runtime reached.") + return True + return False + def run(self) -> None: if not self.config.enabled: logger.info( f"ergc({self.instance_id}): ExecutionRequest cleaner is disabled." ) return + self.report.ergc_start_time = datetime.datetime.now() logger.info( ( @@ -232,8 +266,11 @@ def run(self) -> None: ) for entry in self._scroll_garbage_records(): + if self._reached_runtime_limit(): + break self._delete_entry(entry) + self.report.ergc_end_time = datetime.datetime.now() logger.info( f"ergc({self.instance_id}): Finished cleanup of ExecutionRequest records." ) diff --git a/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py b/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py index 42b3b648bd298..ce683e64b3f46 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py +++ b/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py @@ -42,4 +42,5 @@ def report_ingestion_stage_start(self, stage: str) -> None: self._timer = PerfTimer() self.ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}" + logger.info(f"Stage started: {self.ingestion_stage}") self._timer.start() From ebd8e08993c9d0f38afa61dccb77fae7790c409e Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Thu, 26 Dec 2024 17:12:42 +0530 Subject: [PATCH 2/2] minor change --- .../src/datahub/ingestion/source/gc/datahub_gc.py | 2 ++ .../datahub/ingestion/source/gc/execution_request_cleanup.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py b/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py index 0bb1537b5c096..168b787b85e8b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py @@ -173,6 +173,8 @@ def get_workunits_internal( self.execution_request_cleanup.run() except Exception as e: self.report.failure("While trying to cleanup execution request ", exc=e) + # Otherwise last stage's duration does not get calculated. + self.report.report_ingestion_stage_start("End") yield from [] def truncate_indices(self) -> None: diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py b/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py index 08b99aa120cc4..170a6ada3e336 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py @@ -140,7 +140,7 @@ def _scroll_execution_requests( if self._reached_runtime_limit(): break if self.report.ergc_read_errors >= self.config.max_read_errors: - logger.error( + self.report.failure( f"ergc({self.instance_id}): too many read errors, aborting." ) break