emr serverless python dependencies (#250)
cloutierMat authored Jul 18, 2024
1 parent 367ff58 commit ec34fa0
Showing 14 changed files with 477 additions and 1 deletion.
3 changes: 2 additions & 1 deletion .gitignore
@@ -5,6 +5,7 @@ node_modules/
.project
.settings/
target/
volume/

.idea/

@@ -13,7 +14,7 @@ __pycache__/
*.log

.terraform/
terraform.tfstate
terraform.tfstate*
.terraform.lock*

.venv/
58 changes: 58 additions & 0 deletions emr-serverless-python-dependencies/Dockerfile-aws
@@ -0,0 +1,58 @@
# This is a multi-stage Dockerfile that can be used to build many different types of
# bundled dependencies for PySpark projects.
# The `base` stage installs generic tools necessary for packaging.
#
# There are `export-` and `build-` stages for the different types of projects.
# - python-packages - Generic support for Python projects with pyproject.toml
# - poetry - Support for Poetry projects
#
# This Dockerfile is generated automatically as part of the emr-cli tool.
# Feel free to modify it for your needs, but leave the `build-` and `export-`
# stages related to your project.
#
# To build manually, you can use the following command, assuming
# the Docker BuildKit backend is enabled. https://docs.docker.com/build/buildkit/
#
# Example for building a poetry project and saving the output to dist/ folder
# docker build --target export-poetry --output dist .


## ----------------------------------------------------------------------------
## Base stage for python development
## ----------------------------------------------------------------------------
FROM --platform=linux/amd64 amazonlinux:2 AS base

RUN yum install -y python3 tar gzip

ENV VIRTUAL_ENV=/opt/venv
RUN python3 -m venv $VIRTUAL_ENV
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

# EMR 6.x uses Python 3.7 - limit Poetry version to 1.5.1
ENV POETRY_VERSION=1.5.1
RUN python3 -m pip install --upgrade pip
RUN curl -sSL https://install.python-poetry.org | python3 -

ENV PATH="$PATH:/root/.local/bin"

WORKDIR /app

COPY . .

# Test stage - installs test dependencies defined in pyproject.toml
FROM base as test
RUN python3 -m pip install .[test]


## ----------------------------------------------------------------------------
## Build and export stages for Poetry Python projects
## ----------------------------------------------------------------------------
# Build stage for poetry
FROM base as build-poetry
RUN poetry self add poetry-plugin-bundle && \
    poetry bundle venv dist/bundle && \
    tar -czvf dist/pyspark_deps.tar.gz -C dist/bundle . && \
    rm -rf dist/bundle

FROM scratch as export-poetry
COPY --from=build-poetry /app/dist/pyspark_deps.tar.gz /
30 changes: 30 additions & 0 deletions emr-serverless-python-dependencies/Dockerfile-localstack
@@ -0,0 +1,30 @@
## ----------------------------------------------------------------------------
## Base stage for python development
## ----------------------------------------------------------------------------
FROM --platform=linux/amd64 localstack/localstack:latest AS base

ENV VIRTUAL_ENV=/opt/venv
RUN python3 -m venv $VIRTUAL_ENV
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

# EMR 6.x uses Python 3.7 - limit Poetry version to 1.5.1
ENV POETRY_VERSION=1.5.1
RUN python3 -m pip install --upgrade pip
RUN curl -sSL https://install.python-poetry.org | python3 -

ENV PATH="$PATH:/root/.local/bin"

WORKDIR /app

COPY . .

## ----------------------------------------------------------------------------
## Build and export stages for standard Python projects
## ----------------------------------------------------------------------------
# Build stage - installs required dependencies and creates a venv package
FROM base as build-poetry
RUN poetry self add poetry-plugin-bundle && \
    poetry bundle venv dist/bundle

FROM scratch as export-poetry
COPY --from=build-poetry /app/dist/bundle /pyspark_env/
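
Unlike the AWS Dockerfile above, this variant exports the unpacked virtual environment instead of a tarball. A minimal sketch of building it by hand, mirroring the `build` target in the Makefile below and assuming the Docker BuildKit backend is enabled:

```
# Writes the bundled venv to ./pyspark_env/
docker build . --file Dockerfile-localstack --output .
```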
48 changes: 48 additions & 0 deletions emr-serverless-python-dependencies/Makefile
@@ -0,0 +1,48 @@
export AWS_ACCESS_KEY_ID ?= test
export AWS_SECRET_ACCESS_KEY ?= test
export AWS_DEFAULT_REGION = us-east-1

init:
	terraform workspace new local &
	terraform workspace new aws &
	terraform init

build:
	docker build . --file Dockerfile-localstack --output .

build-aws:
	docker build . --file Dockerfile-aws --output .

deploy:
	docker-compose up --detach
	terraform workspace select local
	AWS_ENDPOINT_URL=https://localhost.localstack.cloud:4566 terraform apply --auto-approve

deploy-aws:
	terraform workspace select aws
	terraform apply --auto-approve

run:
	terraform workspace select local
	./start_job.sh local

run-aws:
	terraform workspace select aws
	./start_job.sh aws

stop:
	docker-compose down

destroy:
	terraform workspace select local
	./stop-application.sh
	terraform destroy --auto-approve

destroy-aws:
	terraform workspace select aws
	./stop-application.sh aws
	terraform destroy --auto-approve

test-ci:
	make init build deploy run; return_code=`echo $$?`;\
	make stop; exit $$return_code;
69 changes: 69 additions & 0 deletions emr-serverless-python-dependencies/README.md
@@ -0,0 +1,69 @@
# EMR Serverless with Python dependencies

[AWS has this example](https://github.com/aws-samples/emr-serverless-samples/tree/main/examples/pyspark/dependencies) of how to add Python dependencies to an EMR job. Unfortunately, the same pattern isn't currently possible on LocalStack. This project serves as an example of a workaround that still lets you add your own dependencies and modules to your EMR Spark jobs.

## Requirements
- Make
- Terraform ~>1.9.1
- [LocalStack](https://github.com/localstack/localstack)
- [awslocal](https://github.com/localstack/awscli-local)

## Init

This will initialize Terraform and the Terraform workspaces.

```
make init
```

## Build

This builds the Python dependencies for the Spark job. Here is the first difference with AWS: instead of packaging the dependencies as we do for AWS, we save the environment to the project folder so it can be mounted into the LocalStack container.

```
# For LocalStack, we create a pyspark_env/ folder
make build
# For AWS, we create pyspark_deps.tar.gz
make build-aws
```
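
Under the hood these targets are thin wrappers around `docker build` (see the Makefile). The roughly equivalent manual invocations, assuming the Docker BuildKit backend is enabled, would be:

```
# LocalStack: exports the unpacked venv to ./pyspark_env/
docker build . --file Dockerfile-localstack --output .
# AWS: exports pyspark_deps.tar.gz to the project root
docker build . --file Dockerfile-aws --output .
```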

## Deploy

Creates the following resources:
- IAM role
- IAM policy
- S3 bucket
- EMR Serverless application

```
# Start LocalStack using docker-compose and apply the Terraform configuration
LOCALSTACK_AUTH_TOKEN=<your_auth_token> make deploy
# Apply the Terraform configuration to AWS
make deploy-aws
```
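
Once `make deploy` has finished, you can optionally sanity-check the resources created in LocalStack with `awslocal` (the exact names depend on the Terraform configuration):

```
awslocal emr-serverless list-applications
awslocal s3 ls
```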

## Run job

We can finally run our Spark job. Notice the differences in `start_job.sh` between LocalStack and AWS. For AWS, we add `spark.archives` to the configuration and reference the interpreter inside the unpacked archive as `environment/bin/python`. For LocalStack, we instead rely on the volume mounted into the container and use the absolute path `/tmp/environment/bin/python`.

```
# LocalStack
make run
# AWS
make run-aws
```
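
For reference, here is a minimal sketch of the `start-job-run` call that `start_job.sh` issues against LocalStack. The application ID, role ARN, and bucket are placeholders that the script actually reads from the Terraform outputs, and the AWS variant additionally passes a `spark.archives` entry and the `./environment/bin/python` interpreter path instead:

```
# Hypothetical sketch - real values come from the Terraform outputs
awslocal emr-serverless start-job-run \
  --application-id <application_id> \
  --execution-role-arn <role_arn> \
  --job-driver '{
    "sparkSubmit": {
      "entryPoint": "s3://<bucket>/entrypoint.py",
      "sparkSubmitParameters": "--conf spark.emr-serverless.driverEnv.PYSPARK_PYTHON=/tmp/environment/bin/python"
    }
  }'
```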

## Destroy

Finally, we can destroy the environment, making sure to stop the EMR Serverless application first.

```
# LocalStack
make destroy
# AWS
make destroy-aws
```
22 changes: 22 additions & 0 deletions emr-serverless-python-dependencies/docker-compose.yml
@@ -0,0 +1,22 @@
services:
  localstack:
    container_name: "${LOCALSTACK_DOCKER_NAME:-localstack-main}"
    # Using this image will significantly decrease the job execution time
    # image: localstack/localstack-pro:latest-bigdata
    image: localstack/localstack-pro:latest
    ports:
      - "127.0.0.1:4566:4566"            # LocalStack Gateway
      - "127.0.0.1:4510-4559:4510-4559"  # external services port range
      - "127.0.0.1:443:443"              # LocalStack HTTPS Gateway (Pro)
    environment:
      # Activate LocalStack Pro: https://docs.localstack.cloud/getting-started/auth-token/
      - LOCALSTACK_AUTH_TOKEN=${LOCALSTACK_AUTH_TOKEN:-}  # required for Pro
      - LOCALSTACK_API_KEY=${LOCALSTACK_API_KEY:-}  # required for CI
      # LocalStack configuration: https://docs.localstack.cloud/references/configuration/
      - DEBUG=${DEBUG:-0}
      - PERSISTENCE=${PERSISTENCE:-0}
      - HIVE_DEFAULT_VERSION=3.1.3
    volumes:
      - "${LOCALSTACK_VOLUME_DIR:-./volume}:/var/lib/localstack"
      - "/var/run/docker.sock:/var/run/docker.sock"
      - "./pyspark_env:/tmp/environment"
@@ -0,0 +1,51 @@
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Sid": "ReadAccessForEMRSamples",
      "Effect": "Allow",
      "Action": [
        "s3:GetObject",
        "s3:ListBucket"
      ],
      "Resource": [
        "arn:aws:s3:::*.elasticmapreduce",
        "arn:aws:s3:::*.elasticmapreduce/*"
      ]
    },
    {
      "Sid": "FullAccessToOutputBucket",
      "Effect": "Allow",
      "Action": [
        "s3:PutObject",
        "s3:GetObject",
        "s3:ListBucket",
        "s3:DeleteObject"
      ],
      "Resource": [
        "arn:aws:s3:::${bucket}",
        "arn:aws:s3:::${bucket}/*"
      ]
    },
    {
      "Sid": "GlueCreateAndReadDataCatalog",
      "Effect": "Allow",
      "Action": [
        "glue:GetDatabase",
        "glue:CreateDatabase",
        "glue:GetDataBases",
        "glue:CreateTable",
        "glue:GetTable",
        "glue:UpdateTable",
        "glue:DeleteTable",
        "glue:GetTables",
        "glue:GetPartition",
        "glue:GetPartitions",
        "glue:CreatePartition",
        "glue:BatchCreatePartition",
        "glue:GetUserDefinedFunctions"
      ],
      "Resource": ["*"]
    }
  ]
}
@@ -0,0 +1,11 @@
{
  "Version": "2012-10-17",
  "Statement": [{
    "Sid": "EMRServerlessTrustPolicy",
    "Action": "sts:AssumeRole",
    "Effect": "Allow",
    "Principal": {
      "Service": "emr-serverless.amazonaws.com"
    }
  }]
}
9 changes: 9 additions & 0 deletions emr-serverless-python-dependencies/entrypoint.py
@@ -0,0 +1,9 @@
from jobs.spark_run import SparkRun

# Import typer to validate that it is available in the bundled environment
import typer

if __name__ == "__main__":
    spark_runner = SparkRun()
    spark_runner.run()
    spark_runner.stop()
22 changes: 22 additions & 0 deletions emr-serverless-python-dependencies/jobs/spark_run.py
@@ -0,0 +1,22 @@
from pyspark.sql import SparkSession
from pyspark.sql.functions import col


class SparkRun:

    def __init__(self) -> None:
        self.spark = SparkSession.builder.appName("ExtremeWeather").getOrCreate()

    def run(self) -> None:
        df = self.spark.createDataFrame(
            [
                ("sue", 32),
                ("li", 3),
                ("bob", 75),
                ("heo", 13),
            ],
            ["first_name", "age"],
        )
        print(df.select(col("first_name"), col("age")).first())

    def stop(self):
        self.spark.stop()
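
As a quick local sanity check outside EMR Serverless, the entrypoint can also be run directly. This is a hedged example that assumes `pyspark` and `typer` are installed and that you run it from the project root:

```
python entrypoint.py
# Expected to print something like: Row(first_name='sue', age=32)
```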