Skip to content

Commit

Permalink
Demo for serverless integration with Textract
Browse files Browse the repository at this point in the history
  • Loading branch information
Vinod Kumar authored and Vinod Kumar committed Nov 17, 2024
1 parent 9331c49 commit cfae526
Show file tree
Hide file tree
Showing 10 changed files with 985 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,6 @@
webapp/node_modules/
webapp/package-lock.json
iac/aws/terraform/creating-custom-vpc/.terraform/
iac/demo/textract/.terraform.lock.hcl
iac/demo/textract/.terraform/*

38 changes: 38 additions & 0 deletions iac/demo/textract/lambda_function.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import boto3
import json
import os

# AWS SDK clients are created at module scope so they are reused
# across warm Lambda invocations instead of rebuilt per call.
s3_client = boto3.client('s3')
textract_client = boto3.client('textract')
sqs_client = boto3.client('sqs')

# Destination queue URL; raises KeyError at import time if unset.
SQS_QUEUE_URL = os.environ['SQS_QUEUE_URL']

def lambda_handler(event, context):
    """Extract text from a newly uploaded S3 document via Textract and
    forward the result to SQS.

    Triggered by an S3 ObjectCreated event. Calls the synchronous
    Textract DetectDocumentText API on the uploaded object, joins the
    LINE blocks into a single string, and publishes a JSON message
    {bucket, key, text} to the queue named by SQS_QUEUE_URL.

    Parameters:
        event: S3 event notification payload (expects at least one
            record under event['Records']).
        context: Lambda context object (unused).

    Returns:
        dict with 'statusCode' 200 and a JSON-encoded confirmation body.
    """
    # S3 event notifications URL-encode the object key (spaces arrive
    # as '+', etc.); decode it or Textract/S3 will report NoSuchKey for
    # any key containing spaces or special characters.
    from urllib.parse import unquote_plus

    record = event['Records'][0]
    bucket_name = record['s3']['bucket']['name']
    object_key = unquote_plus(record['s3']['object']['key'])

    # Synchronous Textract call. NOTE(review): DetectDocumentText does
    # not process multi-page PDFs; those need the async
    # StartDocumentTextDetection API -- confirm against the input corpus.
    response = textract_client.detect_document_text(
        Document={'S3Object': {'Bucket': bucket_name, 'Name': object_key}}
    )

    # Keep only LINE blocks (whole lines of text), skipping PAGE/WORD.
    text_blocks = [block['Text'] for block in response['Blocks'] if block['BlockType'] == 'LINE']
    extracted_text = '\n'.join(text_blocks)

    # Hand the extracted text to SQS for downstream CSV generation.
    sqs_client.send_message(
        QueueUrl=SQS_QUEUE_URL,
        MessageBody=json.dumps({
            'bucket': bucket_name,
            'key': object_key,
            'text': extracted_text
        })
    )

    return {
        'statusCode': 200,
        'body': json.dumps('Text extracted and sent to SQS')
    }
Binary file added iac/demo/textract/lambda_function.zip
Binary file not shown.
179 changes: 179 additions & 0 deletions iac/demo/textract/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
provider "aws" {
  # Region and named CLI profile come from variables so the same
  # configuration can target different accounts/regions.
  region  = var.region
  profile = var.aws_profile
}

# S3 Bucket
# S3 Bucket receiving uploads and the generated CSV output.
resource "aws_s3_bucket" "textract_bucket" {
  bucket = var.s3_bucket_name

  # The inline `acl` argument is deprecated in AWS provider v4+ (the
  # aws_s3_bucket_acl resource replaces it), and newly created buckets
  # have ACLs disabled and are private by default -- so no ACL setting
  # is needed to keep this bucket private.
  tags = {
    Name = "TextractBucket"
  }
}

resource "aws_s3_bucket_notification" "bucket_notification" {
  bucket = aws_s3_bucket.textract_bucket.id

  # Invoke the Textract Lambda for uploaded PDFs.
  # NOTE(review): the Lambda calls the synchronous DetectDocumentText
  # API, which does not handle multi-page PDFs -- confirm inputs are
  # single-page or switch to the async Textract APIs.
  lambda_function {
    lambda_function_arn = aws_lambda_function.textract_lambda.arn
    events              = ["s3:ObjectCreated:*"]
    filter_prefix       = ""
    filter_suffix       = ".pdf"
  }

  # Same trigger for JPEG images.
  lambda_function {
    lambda_function_arn = aws_lambda_function.textract_lambda.arn
    events              = ["s3:ObjectCreated:*"]
    filter_suffix       = ".jpg"
  }

  # The invoke permission must exist before S3 will accept this
  # notification configuration.
  depends_on = [aws_lambda_permission.allow_s3]
}

# SQS Queue
# SQS queue carrying extracted-text messages between the two Lambdas.
resource "aws_sqs_queue" "textract_queue" {
  name = var.sqs_queue_name

  # Must be >= the consuming Lambda's timeout (60s); the SQS default of
  # 30s makes creation of the event source mapping fail with
  # "Queue visibility timeout ... is less than Function timeout".
  # AWS recommends at least 6x the function timeout.
  visibility_timeout_seconds = 360

  tags = {
    Name = "TextractQueue"
  }
}

# IAM Role for Lambda
# Execution role shared by both Lambda functions in this stack.
resource "aws_iam_role" "lambda_role" {
  name = "lambda-textract-role"

  # Trust policy: only the Lambda service may assume this role.
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Action = "sts:AssumeRole"
        Effect = "Allow"
        Principal = {
          Service = "lambda.amazonaws.com"
        }
      }
    ]
  })
}

# IAM Policy for Lambda
# Permissions for the Textract-invoking Lambda.
resource "aws_iam_role_policy" "lambda_policy" {
  name = "lambda-textract-policy"
  role = aws_iam_role.lambda_role.id
  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        # Read uploaded documents from the bucket.
        Effect   = "Allow"
        Action   = ["s3:GetObject"]
        Resource = "${aws_s3_bucket.textract_bucket.arn}/*"
      },
      {
        # Publish extracted text to the queue.
        Effect   = "Allow"
        Action   = ["sqs:SendMessage"]
        Resource = aws_sqs_queue.textract_queue.arn
      },
      {
        # NOTE(review): Textract actions are granted on "*" here --
        # presumably because Textract lacks resource-level scoping;
        # verify against the service authorization reference.
        Effect   = "Allow"
        Action   = ["textract:DetectDocumentText", "textract:AnalyzeDocument"]
        Resource = "*"
      },
      {
        # CloudWatch Logs for the function's own logging.
        Effect = "Allow"
        Action = [
          "logs:CreateLogGroup",
          "logs:CreateLogStream",
          "logs:PutLogEvents"
        ]
        Resource = "*"
      }
    ]
  })
}

# Lambda Function
# Lambda triggered by S3 uploads; runs lambda_function.py.
resource "aws_lambda_function" "textract_lambda" {
  # Deployment package is a local zip; the source hash forces a
  # redeploy whenever the zip contents change.
  filename         = "lambda_function.zip"
  function_name    = var.lambda_function_name
  role             = aws_iam_role.lambda_role.arn
  # Module "lambda_function", function "lambda_handler".
  handler          = "lambda_function.lambda_handler"
  # NOTE(review): python3.9 is an older runtime -- confirm it is still
  # supported by Lambda before deploying.
  runtime          = "python3.9"
  source_code_hash = filebase64sha256("lambda_function.zip")
  timeout          = 60

  environment {
    variables = {
      # Destination queue URL read by lambda_function.py.
      SQS_QUEUE_URL = aws_sqs_queue.textract_queue.id
    }
  }
}

# Allow S3 to invoke Lambda
# Resource policy allowing S3 to invoke the Textract Lambda.
resource "aws_lambda_permission" "allow_s3" {
  statement_id  = "AllowS3Invoke"
  action        = "lambda:InvokeFunction"
  function_name = aws_lambda_function.textract_lambda.function_name
  principal     = "s3.amazonaws.com"
  # Restrict invocation to events originating from this bucket.
  source_arn    = aws_s3_bucket.textract_bucket.arn
}


# IAM Policy for Lambda Function 2
# Permissions for the SQS-to-CSV Lambda, attached to the same shared
# execution role as the Textract policy.
resource "aws_iam_role_policy" "lambda_policy_csv" {
  name = "lambda-csv-policy"
  role = aws_iam_role.lambda_role.id
  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        # Write generated CSV files into the bucket.
        Effect   = "Allow"
        Action   = ["s3:PutObject"]
        Resource = "${aws_s3_bucket.textract_bucket.arn}/*"
      },
      {
        # Required by the SQS event source mapping to poll the queue
        # and delete processed messages.
        Effect   = "Allow"
        Action   = ["sqs:ReceiveMessage", "sqs:DeleteMessage", "sqs:GetQueueAttributes"]
        Resource = aws_sqs_queue.textract_queue.arn
      },
      {
        # CloudWatch Logs for the function's own logging.
        Effect = "Allow"
        Action = [
          "logs:CreateLogGroup",
          "logs:CreateLogStream",
          "logs:PutLogEvents"
        ]
        Resource = "*"
      }
    ]
  })
}

# Lambda Function 2 to process SQS messages and generate CSV
# Lambda that consumes SQS messages and writes CSVs back to S3; runs
# sqs_to_csv_lambda.py.
resource "aws_lambda_function" "sqs_to_csv_lambda" {
  filename         = "sqs_to_csv_lambda.zip"
  function_name    = var.lambda_function_name_2
  # Shares the execution role with the Textract function.
  role             = aws_iam_role.lambda_role.arn
  handler          = "sqs_to_csv_lambda.lambda_handler"
  runtime          = "python3.9"
  source_code_hash = filebase64sha256("sqs_to_csv_lambda.zip")
  timeout          = 60

  environment {
    variables = {
      # Destination bucket and key prefix read by sqs_to_csv_lambda.py.
      CSV_S3_BUCKET = aws_s3_bucket.textract_bucket.id
      CSV_S3_PREFIX = var.csv_s3_prefix
    }
  }
}

# SQS Event Source Mapping for Lambda Function 2
# Wire the queue to the CSV Lambda.
resource "aws_lambda_event_source_mapping" "sqs_trigger" {
  event_source_arn = aws_sqs_queue.textract_queue.arn
  function_name    = aws_lambda_function.sqs_to_csv_lambda.arn
  # Up to 10 messages delivered per invocation.
  batch_size       = 10
  enabled          = true
  # NOTE(review): the queue's visibility timeout must be >= the
  # function timeout (60s) for this mapping to be created -- verify
  # the queue configuration.
}

19 changes: 19 additions & 0 deletions iac/demo/textract/outputs.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Name of the S3 bucket receiving uploads and generated CSVs.
output "s3_bucket_name" {
  value = aws_s3_bucket.textract_bucket.id
}

# ARN of the Textract-invoking Lambda function.
output "lambda_function_arn" {
  value = aws_lambda_function.textract_lambda.arn
}

# URL of the extracted-text queue (an aws_sqs_queue's `id` is its URL).
output "sqs_queue_url" {
  value = aws_sqs_queue.textract_queue.id
}

# ARN of the SQS-to-CSV Lambda function.
output "csv_lambda_function_arn" {
  value = aws_lambda_function.sqs_to_csv_lambda.arn
}

# Key prefix under which CSV results are written.
output "csv_s3_prefix" {
  value = var.csv_s3_prefix
}
44 changes: 44 additions & 0 deletions iac/demo/textract/sqs_to_csv_lambda.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import boto3
import csv
import os
import json
from io import StringIO
import datetime

# AWS SDK clients at module scope so they are reused across warm
# Lambda invocations.
s3_client = boto3.client('s3')
sqs_client = boto3.client('sqs')

# Output bucket and key prefix for the generated CSVs; raises
# KeyError at import time if either variable is unset.
CSV_S3_BUCKET = os.environ['CSV_S3_BUCKET']
CSV_S3_PREFIX = os.environ['CSV_S3_PREFIX']

def lambda_handler(event, context):
    """Consume Textract result messages from SQS and write each one to
    S3 as a single-row CSV file.

    Each SQS record body is JSON of the form {bucket, key, text}. For
    every record, a CSV with header Bucket/Key/ExtractedText and one
    data row is uploaded to CSV_S3_BUCKET under CSV_S3_PREFIX, with the
    source key flattened and a timestamp appended to the filename.

    Parameters:
        event: SQS event payload containing one or more records.
        context: Lambda context object (unused).

    Returns:
        dict with 'statusCode' 200 and a confirmation 'body' string.
    """
    for sqs_record in event['Records']:
        # Decode the message produced by the Textract Lambda.
        payload = json.loads(sqs_record['body'])
        src_bucket = payload['bucket']
        src_key = payload['key']
        text = payload['text']

        # Render the header and single data row into an in-memory CSV.
        buffer = StringIO()
        writer = csv.writer(buffer)
        writer.writerows([
            ['Bucket', 'Key', 'ExtractedText'],
            [src_bucket, src_key, text],
        ])

        # Flatten the source key and append a timestamp so repeated
        # uploads of the same object produce distinct filenames.
        stamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        flat_key = src_key.replace('/', '_')
        csv_filename = f"{CSV_S3_PREFIX}{flat_key}_{stamp}.csv"

        # Upload the CSV to the output bucket.
        s3_client.put_object(
            Bucket=CSV_S3_BUCKET,
            Key=csv_filename,
            Body=buffer.getvalue()
        )

        print(f"CSV file saved to S3: {csv_filename}")

    return {
        'statusCode': 200,
        'body': 'CSV files created and saved to S3'
    }
Binary file added iac/demo/textract/sqs_to_csv_lambda.zip
Binary file not shown.
Loading

0 comments on commit cfae526

Please sign in to comment.