awslabs · cnfait · Jun 20, 2024 · Jun 20, 2024
diff --git a/sdlf-stage-ecsfargate/.gitignore b/sdlf-stage-ecsfargate/.gitignore
@@ -0,0 +1,21 @@
+# Packaged Templates
+output/
+
+# Editors
+.vscode/
+.idea/
+
+# Mac/OSX
+.DS_Store
+
+# Windows
+Thumbs.db
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# Environments
+.env
+.venv
diff --git a/sdlf-stage-ecsfargate/lambda/error/src/lambda_function.py b/sdlf-stage-ecsfargate/lambda/error/src/lambda_function.py
@@ -0,0 +1,30 @@
+import json
+import os
+
+from datalake_library.commons import init_logger
+from datalake_library.configuration.resource_configs import SQSConfiguration
+from datalake_library.interfaces.sqs_interface import SQSInterface
+
+# Logger obtained from the datalake_library helper.
+logger = init_logger(__name__)
+
+# Deployment settings are read once at import time; a missing variable raises
+# KeyError before the handler is ever invoked.
+team, dataset, pipeline, pipeline_stage, org, domain, env = (
+    os.environ[name]
+    for name in ("TEAM", "DATASET", "PIPELINE", "PIPELINE_STAGE", "ORG", "DOMAIN", "ENV")
+)
+
+
+def lambda_handler(event, context):
+    """Send the payload of a failed execution to the stage's dead-letter queue
+
+    Arguments:
+        event {dict|str} -- Payload of the failed execution; a string is decoded as JSON first
+        context {object} -- Lambda context (unused)
+
+    Raises:
+        Exception -- any error is logged and then re-raised
+    """
+    try:
+        payload = json.loads(event) if isinstance(event, str) else event
+
+        dlq_name = SQSConfiguration(team, pipeline, pipeline_stage).get_stage_dlq_name
+        dlq = SQSInterface(dlq_name)
+
+        logger.info("Execution Failed. Sending original payload to DLQ")
+        # The payload is always re-serialised, even when it arrived as a string.
+        dlq.send_message_to_fifo_queue(json.dumps(payload), "failed")
+    except Exception as exc:
+        logger.error("Fatal error", exc_info=True)
+        raise exc
diff --git a/sdlf-stage-ecsfargate/lambda/postupdate-metadata/src/lambda_function.py b/sdlf-stage-ecsfargate/lambda/postupdate-metadata/src/lambda_function.py
@@ -0,0 +1,53 @@
+import os
+
+from datalake_library import octagon
+from datalake_library.commons import init_logger
+from datalake_library.octagon import peh
+
+# Logger obtained from the datalake_library helper.
+logger = init_logger(__name__)
+
+# Deployment settings are read once at import time; a missing variable raises
+# KeyError before the handler is ever invoked.
+team, dataset, pipeline, pipeline_stage, org, domain, env = (
+    os.environ[name]
+    for name in ("TEAM", "DATASET", "PIPELINE", "PIPELINE_STAGE", "ORG", "DOMAIN", "ENV")
+)
+
+
+def lambda_handler(event, context):
+    """Close the pipeline execution once every record has been processed
+
+    Arguments:
+        event {list} -- List of lists of record dicts from the previous step;
+            the first record of the first list carries the `peh_id`
+        context {object} -- Lambda context; the second-to-last dash-separated
+            token of its function name, title-cased, is used as the component name
+
+    Returns:
+        None
+
+    Raises:
+        Exception -- if any record lacks a truthy "processed" flag, or on any
+            other error (logged and re-raised)
+    """
+    # Bound before the try block so the error handler can tell whether a client
+    # exists. Without this, a failure while parsing the function name or building
+    # the client surfaces as UnboundLocalError and hides the real error.
+    octagon_client = None
+    try:
+        logger.info("Initializing Octagon client")
+        component = context.function_name.split("-")[-2].title()
+        octagon_client = octagon.OctagonClient().with_run_lambda(True).with_configuration_instance(env).build()
+        peh_id = event[0][0]["peh_id"]
+        peh.PipelineExecutionHistoryAPI(octagon_client).retrieve_pipeline_execution(peh_id)
+
+        # A record counts as processed only when it has a truthy "processed" value.
+        all_processed = all(record.get("processed") for records in event for record in records)
+        if not all_processed:
+            raise Exception("Failure: Processing failed for one or more record")
+
+        octagon_client.update_pipeline_execution(
+            status="{} {} Processing".format(pipeline_stage, component), component=component
+        )
+        octagon_client.end_pipeline_execution_success()
+
+    except Exception as e:
+        logger.error("Fatal error", exc_info=True)
+        # `component` is assigned before the client is built, so it is always
+        # bound whenever the client is.
+        if octagon_client is not None:
+            octagon_client.end_pipeline_execution_failed(
+                component=component, issue_comment=f"{pipeline_stage} {component} Error: {repr(e)}"
+            )
+        raise e
diff --git a/sdlf-stage-ecsfargate/lambda/redrive/src/lambda_function.py b/sdlf-stage-ecsfargate/lambda/redrive/src/lambda_function.py
@@ -0,0 +1,27 @@
+import os
+
+from datalake_library.commons import init_logger
+from datalake_library.configuration.resource_configs import SQSConfiguration
+from datalake_library.interfaces.sqs_interface import SQSInterface
+
+# Logger used by the handler below, obtained from the datalake_library helper.
+logger = init_logger(__name__)
+
+
+def lambda_handler(event, context):
+    """Re-send messages received from the stage's dead-letter queue to the stage queue
+
+    Arguments:
+        event {dict} -- Lambda event (unused)
+        context {object} -- Lambda context (unused)
+
+    Returns:
+        None
+
+    Raises:
+        Exception -- any error is logged and then re-raised
+    """
+    try:
+        settings = os.environ
+        sqs_config = SQSConfiguration(settings["TEAM"], settings["PIPELINE"], settings["STAGE"])
+        dlq = SQSInterface(sqs_config.get_stage_dlq_name)
+        # NOTE(review): the argument 1 is presumably the maximum number of messages -- confirm
+        dlq_messages = dlq.receive_messages(1)
+        if dlq_messages:
+            logger.info("Received {} messages".format(len(dlq_messages)))
+            stage_queue = SQSInterface(sqs_config.get_stage_queue_name)
+            for dlq_message in dlq_messages:
+                stage_queue.send_message_to_fifo_queue(dlq_message["Body"], "redrive")
+                logger.info("Redrive message succeeded")
+        else:
+            logger.info("No messages found in {}".format(sqs_config.get_stage_dlq_name))
+    except Exception as exc:
+        logger.error("Fatal error", exc_info=True)
+        raise exc
diff --git a/sdlf-stage-ecsfargate/lambda/routing/src/lambda_function.py b/sdlf-stage-ecsfargate/lambda/routing/src/lambda_function.py
@@ -0,0 +1,135 @@
+import json
+import os
+
+from datalake_library import octagon
+from datalake_library.commons import init_logger
+from datalake_library.configuration.resource_configs import (
+    DynamoConfiguration,
+    SQSConfiguration,
+    StateMachineConfiguration,
+)
+from datalake_library.interfaces.dynamo_interface import DynamoInterface
+from datalake_library.interfaces.sqs_interface import SQSInterface
+from datalake_library.interfaces.states_interface import StatesInterface
+
+# Logger obtained from the datalake_library helper.
+logger = init_logger(__name__)
+
+# Deployment settings are read once at import time; a missing variable raises
+# KeyError before the handler is ever invoked.
+team, dataset, pipeline, pipeline_stage, org, domain, env = (
+    os.environ[name]
+    for name in ("TEAM", "DATASET", "PIPELINE", "PIPELINE_STAGE", "ORG", "DOMAIN", "ENV")
+)
+
+
+def pipeline_start(octagon_client, event):
+    """Start a pipeline execution through the Octagon client and return its id
+
+    Arguments:
+        octagon_client {object} -- Built Octagon client
+        event {dict} -- Triggering event, stored as the execution's comment
+
+    Returns:
+        Identifier returned by `start_pipeline_execution`
+    """
+    execution_names = {
+        "pipeline_name": f"{team}-{pipeline}-{pipeline_stage}",
+        "dataset_name": f"{team}-{dataset}",
+    }
+    # TODO test maximum size of the event passed as comment
+    peh_id = octagon_client.start_pipeline_execution(comment=event, **execution_names)
+    logger.info(f"peh_id: {peh_id}")
+    return peh_id
+
+
+# sdlf-stage-* stages support three types of trigger:
+# event: run stage when an event received on the team's event bus matches the configured event pattern
+# event-schedule: store events received on the team's event bus matching the configured event pattern, then process them on the configured schedule
+# schedule: run stage on the configured schedule, without any event as input
+def get_source_records(event, dynamo_interface):
+    """Collect the records to process, depending on how the stage was triggered
+
+    Arguments:
+        event {dict} -- Lambda event. `trigger_type == "schedule"` without
+            `event_pattern` is a plain schedule, with `event_pattern` it is an
+            event-schedule; otherwise a `Records` key marks an event trigger
+        dynamo_interface {DynamoInterface} -- Used to read the pipeline's
+            `min_items_process` / `max_items_process` settings
+
+    Returns:
+        {list} -- `[event]` for a schedule, the JSON-decoded queue messages for
+            an event-schedule, or the JSON-decoded `body` of each entry in
+            `event["Records"]` for an event trigger
+
+    Raises:
+        Exception -- if the trigger type cannot be determined
+    """
+    if event.get("trigger_type") == "schedule":
+        if "event_pattern" not in event:
+            logger.info("Stage trigger: schedule")
+            return [event]
+
+        logger.info("Stage trigger: event-schedule")
+        pipeline_info = dynamo_interface.get_pipelines_table_item(f"{team}-{pipeline}-{pipeline_stage}")
+        # Read the "pipeline" entry defensively once, so an item without it
+        # falls back to the defaults below instead of raising KeyError.
+        pipeline_details = pipeline_info.get("pipeline", {})
+        logger.info(f"Pipeline is {pipeline}, stage is {pipeline_stage}")
+        logger.info(f"Details from DynamoDB: {pipeline_details}")
+        min_items_to_process = pipeline_details.get("min_items_process", 1)
+        max_items_to_process = pipeline_details.get("max_items_process", 100)
+
+        sqs_config = SQSConfiguration(team, pipeline, pipeline_stage)
+        queue_interface = SQSInterface(sqs_config.get_stage_queue_name)
+        logger.info(f"Querying {team}-{pipeline}-{pipeline_stage} objects waiting for processing")
+        messages = queue_interface.receive_min_max_messages(min_items_to_process, max_items_to_process)
+        logger.info(f"{len(messages)} Objects ready for processing")
+
+        return [json.loads(message) for message in messages]
+
+    if "Records" in event:
+        logger.info("Stage trigger: event")
+        return [json.loads(record["body"]) for record in event["Records"]]
+
+    raise Exception("Unable to ascertain trigger type (schedule, event-schedule or event)")
+
+
+def enrich_records(records, metadata):
+    """Return a new list where each record is merged with the shared metadata
+
+    Arguments:
+        records {list} -- Record dicts to enrich; they are not modified
+        metadata {dict} -- Keys added to every record; on a key clash the
+            metadata value wins
+
+    Returns:
+        {list} -- One new dict per input record, in the same order
+    """
+    return [{**record, **metadata} for record in records]
+
+
+def get_transform_details(dynamo_interface):
+    """Look up the ECS Fargate cluster and task ARN configured for this stage
+
+    Arguments:
+        dynamo_interface {DynamoInterface} -- Used to read the transform table
+            item keyed by "<team>-<dataset>"
+
+    Returns:
+        {tuple} -- (ecsfargate_cluster, ecsfargate_arn); each is an empty string
+            when the item has no entry for this pipeline and stage, or the entry
+            lacks that key
+    """
+    transform_info = dynamo_interface.get_transform_table_item(f"{team}-{dataset}")
+    logger.info(f"Pipeline is {pipeline}, stage is {pipeline_stage}")
+
+    pipelines = transform_info.get("pipeline", {})
+    # Short-circuit keeps the second lookup from running for an unknown pipeline.
+    if pipeline not in pipelines or pipeline_stage not in pipelines[pipeline]:
+        return "", ""
+
+    stage_details = pipelines[pipeline][pipeline_stage]
+    logger.info(f"Details from DynamoDB: {stage_details}")
+    return stage_details.get("ecsfargate_cluster", ""), stage_details.get("ecsfargate_arn", "")
+
+
+def lambda_handler(event, context):
+    """Gather source records and start the stage's state machine
+
+    Arguments:
+        event {dict} -- Lambda event describing the trigger (schedule,
+            event-schedule or event)
+        context {object} -- Lambda context; its function name supplies the
+            component name used when reporting a failure
+
+    Returns:
+        None
+
+    Raises:
+        Exception -- any error is logged, reported to Octagon when a client
+            exists, and then re-raised
+    """
+    # Bound before the try block so the error handler can tell whether a client
+    # exists. Without this, a failure while building the client surfaces as
+    # UnboundLocalError and hides the real error.
+    octagon_client = None
+    try:
+        octagon_client = octagon.OctagonClient().with_run_lambda(True).with_configuration_instance(env).build()
+        dynamo_config = DynamoConfiguration()
+        dynamo_interface = DynamoInterface(dynamo_config)
+        peh_id = pipeline_start(octagon_client, event)
+        records = get_source_records(event, dynamo_interface)
+        ecsfargate_cluster, ecsfargate_transform = get_transform_details(
+            dynamo_interface
+        )  # allow customising the ecsfargate task transform and cluster through sdlf-dataset's pPipelineDetails
+        metadata = dict(peh_id=peh_id, ecsfargate_transform=ecsfargate_transform, ecsfargate_cluster=ecsfargate_cluster)
+        records = enrich_records(records, metadata)
+
+        if records:
+            # A scheduled run passes the triggering event itself as the only
+            # record, and that event carries "trigger_type".
+            if records[0].get("trigger_type"):
+                logger.info("Starting State Machine Execution (scheduled run without source events)")
+            else:
+                logger.info(f"Starting State Machine Execution (processing {len(records)} source events)")
+            state_config = StateMachineConfiguration(team, pipeline, pipeline_stage)
+            StatesInterface().run_state_machine(state_config.get_stage_state_machine_arn, json.dumps(records))
+            octagon_client.update_pipeline_execution(
+                status=f"{pipeline_stage} Transform Processing", component="Transform"
+            )
+        else:
+            logger.info("Nothing to process, exiting pipeline")
+            octagon_client.end_pipeline_execution_success()
+
+    except Exception as e:
+        logger.error("Fatal error", exc_info=True)
+        if octagon_client is not None:
+            component = context.function_name.split("-")[-2].title()
+            octagon_client.end_pipeline_execution_failed(
+                component=component,
+                issue_comment=f"{pipeline_stage} {component} Error: {repr(e)}",
+            )
+        raise e
diff --git a/sdlf-stage-ecsfargate/state-machine/stage-ecsfargate.asl.json b/sdlf-stage-ecsfargate/state-machine/stage-ecsfargate.asl.json
@@ -0,0 +1,112 @@
+{
+  "Comment": "Simple ECS Fargate-based transform",
+  "StartAt": "Try",
+  "States": {
+    "Try": {
+      "Type": "Parallel",
+      "Branches": [
+        {
+          "StartAt": "Pass",
+          "States": {
+            "Pass": {
+              "Type": "Pass",
+              "Next": "Records",
+              "Parameters": {
+                "Items.$": "States.StringToJson($)"
+              }
+            },
+            "Records": {
+              "Type": "Map",
+              "ItemProcessor": {
+                "ProcessorConfig": {
+                  "Mode": "DISTRIBUTED",
+                  "ExecutionType": "STANDARD"
+                },
+                "StartAt": "Execute ECS Fargate Transformation",
+                "States": {
+                  "Execute ECS Fargate Transformation": {
+                    "Type": "Task",
+                    "Resource": "arn:aws:states:::ecs:runTask.sync",
+                    "Parameters": {
+                      "LaunchType": "FARGATE",
+                      "Cluster.$": "$.Items[0].ecsfargate_cluster",
+                      "TaskDefinition.$": "$.Items[0].ecsfargate_transform"
+                    },
+                    "End": true
+                  }
+                }
+              },
+              "Next": "Post-update Catalog",
+              "Label": "Records",
+              "MaxConcurrency": 50,
+              "ToleratedFailurePercentage": 100,
+              "ItemBatcher": {
+                "MaxItemsPerBatch": 1
+              },
+              "InputPath": "$.Items"
+            },
+            "Post-update Catalog": {
+              "Type": "Task",
+              "Resource": "arn:aws:states:::lambda:invoke",
+              "ResultPath": null,
+              "Parameters": {
+                "Payload.$": "$",
+                "FunctionName": "${lStep3}:$LATEST"
+              },
+              "Retry": [
+                {
+                  "ErrorEquals": [
+                    "Lambda.ServiceException",
+                    "Lambda.AWSLambdaException",
+                    "Lambda.SdkClientException",
+                    "Lambda.TooManyRequestsException"
+                  ],
+                  "IntervalSeconds": 2,
+                  "MaxAttempts": 6,
+                  "BackoffRate": 2
+                }
+              ],
+              "End": true
+            }
+          }
+        }
+      ],
+      "End": true,
+      "Catch": [
+        {
+          "ErrorEquals": [
+            "States.ALL"
+          ],
+          "ResultPath": null,
+          "Next": "Error"
+        }
+      ]
+    },
+    "Error": {
+      "Type": "Task",
+      "Resource": "arn:aws:states:::lambda:invoke",
+      "OutputPath": "$.Payload",
+      "Parameters": {
+        "Payload.$": "$",
+        "FunctionName": "${lError}:$LATEST"
+      },
+      "Retry": [
+        {
+          "ErrorEquals": [
+            "Lambda.ServiceException",
+            "Lambda.AWSLambdaException",
+            "Lambda.SdkClientException",
+            "Lambda.TooManyRequestsException"
+          ],
+          "IntervalSeconds": 2,
+          "MaxAttempts": 6,
+          "BackoffRate": 2
+        }
+      ],
+      "Next": "Fail"
+    },
+    "Fail": {
+      "Type": "Fail"
+    }
+  }
+}