Week 3 Project Submission #53

Open
wants to merge 16 commits into base: master
27 changes: 20 additions & 7 deletions week_1/project/week_1.py
@@ -2,7 +2,7 @@
from datetime import datetime
from typing import List

from dagster import In, Nothing, Out, job, op, usable_as_dagster_type
from dagster import In, Nothing, Out, job, op, usable_as_dagster_type, get_dagster_logger
from pydantic import BaseModel


@@ -50,16 +50,29 @@ def get_s3_data(context):
return output


@op
def process_data():
pass
@op(
ins={"stocks": In(dagster_type=List[Stock])},
out={"agg_max": Out(dagster_type=Aggregation)},
description="Determine the stock with the highest value"
)
def process_data(stocks: List[Stock]) -> Aggregation:
# Find the stock with the highest value
max_stock = max(stocks, key=lambda x: x.high)
return Aggregation(date=max_stock.date, high=max_stock.high)


@op
def put_redis_data():
@op(
ins={"agg_max": In(dagster_type=Aggregation)},
out=None,
description="Write to Redis"
)
def put_redis_data(agg_max: Aggregation) -> None:
# Log the output
logger = get_dagster_logger()
logger.info(f"Write {agg_max} to Redis.")
pass


@job
def week_1_pipeline():
pass
put_redis_data(process_data(get_s3_data()))
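
A quick local smoke test of the job above (a sketch: it assumes week 1's get_s3_data, not shown in full here, takes an "s3_key" op config, and the CSV path is a placeholder):

if __name__ == "__main__":
    result = week_1_pipeline.execute_in_process(
        run_config={"ops": {"get_s3_data": {"config": {"s3_key": "week_1/data/stock.csv"}}}}
    )
    assert result.success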
50 changes: 37 additions & 13 deletions week_2/dagster_ucr/project/week_2.py
@@ -1,30 +1,54 @@
from typing import List

from dagster import In, Nothing, Out, ResourceDefinition, graph, op
from dagster import In, Nothing, Out, ResourceDefinition, graph, op, get_dagster_logger
from dagster_ucr.project.types import Aggregation, Stock
from dagster_ucr.resources import mock_s3_resource, redis_resource, s3_resource


@op
def get_s3_data():
pass
@op(
config_schema={"s3_key": str},
required_resource_keys={"s3"},
out={"stocks": Out(dagster_type=List[Stock])},
tags={"kind": "s3"},
description="Get a list of stocks from an S3 file",
)
def get_s3_data(context) -> List[Stock]:
stocks = context.resources.s3.get_data(context.op_config["s3_key"])
return [Stock.from_list(stock) for stock in stocks]


@op
def process_data():
# Use your op from week 1
pass
@op(
ins={"stocks": In(dagster_type=List[Stock])},
out={"agg_max": Out(dagster_type=Aggregation)},
tags={"kind": "transformation"},
description="Determine the stock with the highest value"
)
def process_data(stocks: List[Stock]) -> Aggregation:
# Find the stock with the highest value
max_stock = max(stocks, key=lambda x: x.high)
# Log the output
logger = get_dagster_logger()
logger.info(f"Higest stock is: {max_stock}")
return Aggregation(date=max_stock.date, high=max_stock.high)


@op
def put_redis_data():
pass
@op(
ins={"agg_max": In(dagster_type=Aggregation)},
required_resource_keys={"redis"},
out=Out(dagster_type=Nothing),
tags={"kind": "redis"},
description="Write to Redis"
)
def put_redis_data(context, agg_max: Aggregation) -> None:
data = context.resources.redis.put_data(str(agg_max.date), str(agg_max.high))
context.log.info(f"Write {data} to Redis.")


@graph
def week_2_pipeline():
# Use your graph from week 1
pass
s3_data = get_s3_data()
highest_stock = process_data(s3_data)
put_redis_data(highest_stock)


local = {
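
For local testing, the graph above is typically wired into a job with the mock resources imported at the top of the file. A minimal sketch (the job name and the use of ResourceDefinition.mock_resource for Redis are assumptions, not part of this PR):

week_2_pipeline_local = week_2_pipeline.to_job(
    name="week_2_pipeline_local",
    resource_defs={
        "s3": mock_s3_resource,
        "redis": ResourceDefinition.mock_resource(),
    },
)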
34 changes: 28 additions & 6 deletions week_2/dagster_ucr/resources.py
@@ -91,13 +91,35 @@ def mock_s3_resource():
return s3_mock


@resource
def s3_resource():
@resource(
config_schema={
"bucket": Field(String),
"access_key": Field(String),
"secret_key": Field(String),
"endpoint_url": Field(String),
},
description="A resource that can connect to S3."
)
def s3_resource(context) -> S3:
"""This resource defines a S3 client"""
pass
return S3(
bucket=context.resource_config["bucket"],
access_key=context.resource_config["access_key"],
secret_key=context.resource_config["secret_key"],
endpoint_url=context.resource_config["endpoint_url"],
)


@resource
def redis_resource():
@resource(
config_schema={
"host": Field(String),
"port": Field(Int),
},
description="A resource that connects to Redis.",
)
def redis_resource(context) -> Redis:
"""This resource defines a Redis client"""
pass
return Redis(
host=context.resource_config["host"],
port=context.resource_config["port"],
)
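
For reference, run config that satisfies the two config schemas above might look like the following (bucket, keys, endpoint, host, and port are placeholder values, not the course's actual settings):

resource_run_config = {
    "resources": {
        "s3": {
            "config": {
                "bucket": "dagster",
                "access_key": "test",
                "secret_key": "test",
                "endpoint_url": "http://localstack:4566",
            }
        },
        "redis": {"config": {"host": "redis", "port": 6379}},
    }
}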
108 changes: 84 additions & 24 deletions week_3/project/week_3.py
@@ -13,34 +13,57 @@
op,
sensor,
static_partitioned_config,
get_dagster_logger,
)
from project.resources import mock_s3_resource, redis_resource, s3_resource
from project.sensors import get_s3_keys
from project.types import Aggregation, Stock


@op
def get_s3_data():
# Use your ops from week 2
pass


@op
def process_data():
# Use your ops from week 2
pass
@op(
config_schema={"s3_key": str},
required_resource_keys={"s3"},
out={"stocks": Out(dagster_type=List[Stock])},
tags={"kind": "s3"},
description="Get a list of stocks from an S3 file",
)
def get_s3_data(context) -> List[Stock]:
stocks = context.resources.s3.get_data(context.op_config["s3_key"])
return [Stock.from_list(stock) for stock in stocks]


@op
def put_redis_data():
# Use your ops from week 2
pass
@op(
ins={"stocks": In(dagster_type=List[Stock])},
out={"agg_max": Out(dagster_type=Aggregation)},
tags={"kind": "transformation"},
description="Determine the stock with the highest value"
)
def process_data(stocks: List[Stock]) -> Aggregation:
# Find the stock with the highest value
max_stock = max(stocks, key=lambda x: x.high)
# Log the output
logger = get_dagster_logger()
logger.info(f"Higest stock is: {max_stock}")
return Aggregation(date=max_stock.date, high=max_stock.high)


@op(
ins={"agg_max": In(dagster_type=Aggregation)},
required_resource_keys={"redis"},
out=Out(dagster_type=Nothing),
tags={"kind": "redis"},
description="Write to Redis"
)
def put_redis_data(context, agg_max: Aggregation) -> None:
data = context.resources.redis.put_data(str(agg_max.date), str(agg_max.high))
context.log.info(f"Write {data} to Redis.")


@graph
def week_3_pipeline():
# Use your graph from week 2
pass
s3_data = get_s3_data()
highest_stock = process_data(s3_data)
put_redis_data(highest_stock)


local = {
@@ -69,8 +92,16 @@ def week_3_pipeline():
}


def docker_config():
pass
# Create a fixed number of partitions (1-10)
partition_keys = [str(i) for i in range(1, 11)]

@static_partitioned_config(partition_keys=partition_keys)
def docker_config(partition_key: str):
key = f'prefix/stock_{partition_key}.csv'
return {
"resources": {**docker["resources"]},
"ops": {"get_s3_data": {"config": {"s3_key": key }}}
}
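
A quick sanity check of what a single partition resolves to (a sketch; assumes PartitionedConfig.get_run_config_for_partition_key is available in the installed Dagster version):

if __name__ == "__main__":
    cfg = docker_config.get_run_config_for_partition_key("3")
    assert cfg["ops"]["get_s3_data"]["config"]["s3_key"] == "prefix/stock_3.csv"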


local_week_3_pipeline = week_3_pipeline.to_job(
@@ -89,14 +120,43 @@ def docker_config():
"s3": s3_resource,
"redis": redis_resource,
},
op_retry_policy=RetryPolicy(max_retries=10, delay=1)
)

# Schedule for local: Every 15 minutes
local_week_3_schedule = ScheduleDefinition(job=local_week_3_pipeline, cron_schedule="*/15 * * * *")

local_week_3_schedule = None # Add your schedule

docker_week_3_schedule = None # Add your schedule
# Schedule for docker: Start of every hour
docker_week_3_schedule = ScheduleDefinition(job=docker_week_3_pipeline, cron_schedule="0 * * * *")

Review comment:

The code passes the tests, but I believe this definition is incorrect. Try to turn on this schedule in Dagit, and it will complain about incorrect configuration. See this discussion - https://data-orchestration.slack.com/archives/C03SB6TAXQV/p1661960430498179
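
One way to address this would be to give the docker schedule explicit per-partition run config instead of relying on the job's default config. A sketch, not verified against the course tests, and assuming PartitionedConfig.get_run_config_for_partition_key exists in the installed Dagster version:

from dagster import RunRequest, schedule


@schedule(job=docker_week_3_pipeline, cron_schedule="0 * * * *")
def docker_week_3_schedule_alt(context):
    # Request one run per static partition, each with its resolved config.
    for partition_key in partition_keys:
        yield RunRequest(
            run_key=partition_key,
            run_config=docker_config.get_run_config_for_partition_key(partition_key),
        )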



@sensor
def docker_week_3_sensor():
pass
@sensor(
job=docker_week_3_pipeline,
minimum_interval_seconds=30,
description="Check S3 bucket for new files every 30 seconds."
)
def docker_week_3_sensor(context):

# Check for new files in the S3 bucket.
new_files = get_s3_keys(
bucket="dagster",
prefix="prefix",
endpoint_url="http://host.docker.internal:4566"
)

if not new_files:
yield SkipReason("No new s3 files found in bucket.")
return

log = get_dagster_logger()
log.info(f"RunRequest for {new_files}")

# RunRequest for every new file.
for new_file in new_files:
yield RunRequest(
run_key=new_file,
run_config={
"resources": {**docker["resources"]},
@ianyoung I did exactly the same as you:) However, the code can be simplified as
{**docker, "ops": {"get_s3_data": {"config": {"s3_key": new_file}}},
}

Author
Good point @yjw868. Now updated. Thanks!

"ops": {"get_s3_data": {"config": {"s3_key": new_file}}},
}
)
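
A rough way to exercise the sensor outside Dagit (a sketch: it assumes the localstack endpoint above is reachable from wherever this runs, and uses Dagster's build_sensor_context testing helper):

if __name__ == "__main__":
    from dagster import build_sensor_context

    for request in docker_week_3_sensor(build_sensor_context()):
        print(request)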