-
Notifications
You must be signed in to change notification settings - Fork 24
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Demo for serverless integration with Textract
- Loading branch information
Vinod Kumar
authored and
Vinod Kumar
committed
Nov 17, 2024
1 parent
9331c49
commit cfae526
Showing
10 changed files
with
985 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
import boto3 | ||
import json | ||
import os | ||
|
||
s3_client = boto3.client('s3') | ||
textract_client = boto3.client('textract') | ||
sqs_client = boto3.client('sqs') | ||
|
||
SQS_QUEUE_URL = os.environ['SQS_QUEUE_URL'] | ||
|
||
def lambda_handler(event, context): | ||
# Get S3 bucket and object key from event | ||
bucket_name = event['Records'][0]['s3']['bucket']['name'] | ||
object_key = event['Records'][0]['s3']['object']['key'] | ||
|
||
# Call Textract to extract text | ||
response = textract_client.detect_document_text( | ||
Document={'S3Object': {'Bucket': bucket_name, 'Name': object_key}} | ||
) | ||
|
||
# Extract text blocks | ||
text_blocks = [block['Text'] for block in response['Blocks'] if block['BlockType'] == 'LINE'] | ||
extracted_text = '\n'.join(text_blocks) | ||
|
||
# Send extracted text to SQS | ||
sqs_client.send_message( | ||
QueueUrl=SQS_QUEUE_URL, | ||
MessageBody=json.dumps({ | ||
'bucket': bucket_name, | ||
'key': object_key, | ||
'text': extracted_text | ||
}) | ||
) | ||
|
||
return { | ||
'statusCode': 200, | ||
'body': json.dumps('Text extracted and sent to SQS') | ||
} |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,179 @@ | ||
provider "aws" { | ||
region = var.region | ||
profile = var.aws_profile | ||
} | ||
|
||
# S3 Bucket | ||
resource "aws_s3_bucket" "textract_bucket" { | ||
bucket = var.s3_bucket_name | ||
acl = "private" | ||
|
||
tags = { | ||
Name = "TextractBucket" | ||
} | ||
} | ||
|
||
resource "aws_s3_bucket_notification" "bucket_notification" { | ||
bucket = aws_s3_bucket.textract_bucket.id | ||
|
||
lambda_function { | ||
lambda_function_arn = aws_lambda_function.textract_lambda.arn | ||
events = ["s3:ObjectCreated:*"] | ||
filter_prefix = "" | ||
filter_suffix = ".pdf" | ||
} | ||
|
||
lambda_function { | ||
lambda_function_arn = aws_lambda_function.textract_lambda.arn | ||
events = ["s3:ObjectCreated:*"] | ||
filter_suffix = ".jpg" | ||
} | ||
|
||
depends_on = [aws_lambda_permission.allow_s3] | ||
} | ||
|
||
# SQS Queue | ||
resource "aws_sqs_queue" "textract_queue" { | ||
name = var.sqs_queue_name | ||
|
||
tags = { | ||
Name = "TextractQueue" | ||
} | ||
} | ||
|
||
# IAM Role for Lambda | ||
resource "aws_iam_role" "lambda_role" { | ||
name = "lambda-textract-role" | ||
|
||
assume_role_policy = jsonencode({ | ||
Version = "2012-10-17" | ||
Statement = [ | ||
{ | ||
Action = "sts:AssumeRole" | ||
Effect = "Allow" | ||
Principal = { | ||
Service = "lambda.amazonaws.com" | ||
} | ||
} | ||
] | ||
}) | ||
} | ||
|
||
# IAM Policy for Lambda | ||
resource "aws_iam_role_policy" "lambda_policy" { | ||
name = "lambda-textract-policy" | ||
role = aws_iam_role.lambda_role.id | ||
policy = jsonencode({ | ||
Version = "2012-10-17" | ||
Statement = [ | ||
{ | ||
Effect = "Allow" | ||
Action = ["s3:GetObject"] | ||
Resource = "${aws_s3_bucket.textract_bucket.arn}/*" | ||
}, | ||
{ | ||
Effect = "Allow" | ||
Action = ["sqs:SendMessage"] | ||
Resource = aws_sqs_queue.textract_queue.arn | ||
}, | ||
{ | ||
Effect = "Allow" | ||
Action = ["textract:DetectDocumentText", "textract:AnalyzeDocument"] | ||
Resource = "*" | ||
}, | ||
{ | ||
Effect = "Allow" | ||
Action = [ | ||
"logs:CreateLogGroup", | ||
"logs:CreateLogStream", | ||
"logs:PutLogEvents" | ||
] | ||
Resource = "*" | ||
} | ||
] | ||
}) | ||
} | ||
|
||
# Lambda Function | ||
resource "aws_lambda_function" "textract_lambda" { | ||
filename = "lambda_function.zip" | ||
function_name = var.lambda_function_name | ||
role = aws_iam_role.lambda_role.arn | ||
handler = "lambda_function.lambda_handler" | ||
runtime = "python3.9" | ||
source_code_hash = filebase64sha256("lambda_function.zip") | ||
timeout = 60 | ||
|
||
environment { | ||
variables = { | ||
SQS_QUEUE_URL = aws_sqs_queue.textract_queue.id | ||
} | ||
} | ||
} | ||
|
||
# Allow S3 to invoke Lambda | ||
resource "aws_lambda_permission" "allow_s3" { | ||
statement_id = "AllowS3Invoke" | ||
action = "lambda:InvokeFunction" | ||
function_name = aws_lambda_function.textract_lambda.function_name | ||
principal = "s3.amazonaws.com" | ||
source_arn = aws_s3_bucket.textract_bucket.arn | ||
} | ||
|
||
|
||
# IAM Policy for Lambda Function 2 | ||
resource "aws_iam_role_policy" "lambda_policy_csv" { | ||
name = "lambda-csv-policy" | ||
role = aws_iam_role.lambda_role.id | ||
policy = jsonencode({ | ||
Version = "2012-10-17" | ||
Statement = [ | ||
{ | ||
Effect = "Allow" | ||
Action = ["s3:PutObject"] | ||
Resource = "${aws_s3_bucket.textract_bucket.arn}/*" | ||
}, | ||
{ | ||
Effect = "Allow" | ||
Action = ["sqs:ReceiveMessage", "sqs:DeleteMessage", "sqs:GetQueueAttributes"] | ||
Resource = aws_sqs_queue.textract_queue.arn | ||
}, | ||
{ | ||
Effect = "Allow" | ||
Action = [ | ||
"logs:CreateLogGroup", | ||
"logs:CreateLogStream", | ||
"logs:PutLogEvents" | ||
] | ||
Resource = "*" | ||
} | ||
] | ||
}) | ||
} | ||
|
||
# Lambda Function 2 to process SQS messages and generate CSV | ||
resource "aws_lambda_function" "sqs_to_csv_lambda" { | ||
filename = "sqs_to_csv_lambda.zip" | ||
function_name = var.lambda_function_name_2 | ||
role = aws_iam_role.lambda_role.arn | ||
handler = "sqs_to_csv_lambda.lambda_handler" | ||
runtime = "python3.9" | ||
source_code_hash = filebase64sha256("sqs_to_csv_lambda.zip") | ||
timeout = 60 | ||
|
||
environment { | ||
variables = { | ||
CSV_S3_BUCKET = aws_s3_bucket.textract_bucket.id | ||
CSV_S3_PREFIX = var.csv_s3_prefix | ||
} | ||
} | ||
} | ||
|
||
# SQS Event Source Mapping for Lambda Function 2 | ||
resource "aws_lambda_event_source_mapping" "sqs_trigger" { | ||
event_source_arn = aws_sqs_queue.textract_queue.arn | ||
function_name = aws_lambda_function.sqs_to_csv_lambda.arn | ||
batch_size = 10 | ||
enabled = true | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
output "s3_bucket_name" { | ||
value = aws_s3_bucket.textract_bucket.id | ||
} | ||
|
||
output "lambda_function_arn" { | ||
value = aws_lambda_function.textract_lambda.arn | ||
} | ||
|
||
output "sqs_queue_url" { | ||
value = aws_sqs_queue.textract_queue.id | ||
} | ||
|
||
output "csv_lambda_function_arn" { | ||
value = aws_lambda_function.sqs_to_csv_lambda.arn | ||
} | ||
|
||
output "csv_s3_prefix" { | ||
value = var.csv_s3_prefix | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
import boto3 | ||
import csv | ||
import os | ||
import json | ||
from io import StringIO | ||
import datetime | ||
|
||
s3_client = boto3.client('s3') | ||
sqs_client = boto3.client('sqs') | ||
|
||
CSV_S3_BUCKET = os.environ['CSV_S3_BUCKET'] | ||
CSV_S3_PREFIX = os.environ['CSV_S3_PREFIX'] | ||
|
||
def lambda_handler(event, context): | ||
for record in event['Records']: | ||
# Parse the SQS message | ||
message_body = json.loads(record['body']) | ||
bucket = message_body['bucket'] | ||
key = message_body['key'] | ||
extracted_text = message_body['text'] | ||
|
||
# Prepare CSV data | ||
csv_buffer = StringIO() | ||
csv_writer = csv.writer(csv_buffer) | ||
csv_writer.writerow(['Bucket', 'Key', 'ExtractedText']) | ||
csv_writer.writerow([bucket, key, extracted_text]) | ||
|
||
# Define the CSV file path and name | ||
timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S") | ||
csv_filename = f"{CSV_S3_PREFIX}{key.replace('/', '_')}_{timestamp}.csv" | ||
|
||
# Upload CSV to S3 | ||
s3_client.put_object( | ||
Bucket=CSV_S3_BUCKET, | ||
Key=csv_filename, | ||
Body=csv_buffer.getvalue() | ||
) | ||
|
||
print(f"CSV file saved to S3: {csv_filename}") | ||
|
||
return { | ||
'statusCode': 200, | ||
'body': 'CSV files created and saved to S3' | ||
} |
Binary file not shown.
Oops, something went wrong.