fix: bug on AutoMLInput to allow PipelineVariable (aws#736)
Co-authored-by: Xinlu Tu <[email protected]>
2 people authored and mufiAmazon committed Nov 15, 2022
1 parent 8989ae3 commit cb45a4b
Showing 6 changed files with 123 additions and 37 deletions.
38 changes: 23 additions & 15 deletions src/sagemaker/automl/automl.py
@@ -22,6 +22,7 @@
from sagemaker.job import _Job
from sagemaker.session import Session
from sagemaker.utils import name_from_base
from sagemaker.workflow.entities import PipelineVariable
from sagemaker.workflow.pipeline_context import runnable_by_pipeline

logger = logging.getLogger("sagemaker")
@@ -44,18 +45,20 @@ def __init__(
):
"""Convert an S3 Uri or a list of S3 Uri to an AutoMLInput object.
:param inputs (str, list[str]): a string or a list of strings that points to (a)
S3 location(s) where input data is stored.
:param target_attribute_name (str): the target attribute name for regression
or classification.
:param compression (str): if training data is compressed, the compression type.
The default value is None.
:param channel_type (str): The channel type an enum to specify
whether the input resource is for training or validation.
Valid values: training or validation.
:param content_type (str): The content type of the data from the input source.
:param s3_data_type (str): The data type for S3 data source.
Valid values: ManifestFile or S3Prefix.
Args:
inputs (str, list[str], PipelineVariable):
a string, a list of strings, or a PipelineVariable that points to (a)
S3 location(s) where input data is stored.
target_attribute_name (str): the target attribute name for regression
or classification.
compression (str): if training data is compressed, the compression type.
The default value is None.
channel_type (str): The channel type an enum to specify
whether the input resource is for training or validation.
Valid values: training or validation.
content_type (str): The content type of the data from the input source.
s3_data_type (str): The data type for S3 data source.
Valid values: ManifestFile or S3Prefix.
"""
self.inputs = inputs
self.target_attribute_name = target_attribute_name
@@ -70,6 +73,8 @@ def to_request_dict(self):
auto_ml_input = []
if isinstance(self.inputs, string_types):
self.inputs = [self.inputs]
if isinstance(self.inputs, PipelineVariable):
self.inputs = [self.inputs]
for entry in self.inputs:
input_entry = {
"DataSource": {"S3DataSource": {"S3DataType": "S3Prefix", "S3Uri": entry}},
@@ -106,7 +111,7 @@ def __init__(
max_candidates: Optional[int] = None,
max_runtime_per_training_job_in_seconds: Optional[int] = None,
total_job_runtime_in_seconds: Optional[int] = None,
job_objective: Optional[str] = None,
job_objective: Optional[Dict[str, str]] = None,
generate_candidate_definitions_only: Optional[bool] = False,
tags: Optional[List[Dict[str, str]]] = None,
content_type: Optional[str] = None,
@@ -142,8 +147,9 @@ def __init__(
that each training job executed inside hyperparameter tuning
is allowed to run as part of a hyperparameter tuning job.
total_job_runtime_in_seconds (int): the total wait time of an AutoML job.
job_objective (str): Defines the objective metric
job_objective (dict[str, str]): Defines the objective metric
used to measure the predictive quality of an AutoML job.
In the format of: {"MetricName": str}
generate_candidate_definitions_only (bool): Whether to generate
possible candidates without training the models.
tags (List[dict[str, str]]): The list of tags to attach to this
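
As a quick usage note for the corrected job_objective type documented above, a hedged sketch follows; the metric name "F1" mirrors the unit-test constant further down, and role and session are assumed to be defined elsewhere.

# Sketch only: job_objective is a dict of the form {"MetricName": <metric>},
# not a bare string.
from sagemaker import AutoML

auto_ml = AutoML(
    role=role,                      # assumed IAM role ARN
    target_attribute_name="y",      # illustrative target column
    sagemaker_session=session,      # assumed sagemaker.session.Session
    job_objective={"MetricName": "F1"},
)
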
@@ -969,8 +975,10 @@ def _prepare_auto_ml_stop_condition(
Returns (dict): an AutoML CompletionCriteria.
"""
stopping_condition = {"MaxCandidates": max_candidates}
stopping_condition = {}

if max_candidates is not None:
stopping_condition["MaxCandidates"] = max_candidates
if max_runtime_per_training_job_in_seconds is not None:
stopping_condition[
"MaxRuntimePerTrainingJobInSeconds"
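
To make the effect of the rewritten stop condition concrete, here is a simplified, self-contained re-statement of the hunk above. It is a sketch, not the actual SDK helper, which takes additional arguments such as the total job runtime.

# Simplified sketch: nothing is added to the completion criteria unless the
# caller actually supplied a value.
def _stop_condition_sketch(max_candidates=None,
                           max_runtime_per_training_job_in_seconds=None):
    stopping_condition = {}
    # Previously "MaxCandidates" was set unconditionally, so an unset value
    # ended up in the request as MaxCandidates=None.
    if max_candidates is not None:
        stopping_condition["MaxCandidates"] = max_candidates
    if max_runtime_per_training_job_in_seconds is not None:
        stopping_condition["MaxRuntimePerTrainingJobInSeconds"] = (
            max_runtime_per_training_job_in_seconds
        )
    return stopping_condition

assert _stop_condition_sketch() == {}
assert _stop_condition_sketch(max_candidates=1) == {"MaxCandidates": 1}
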
12 changes: 6 additions & 6 deletions src/sagemaker/workflow/automl_step.py
@@ -71,15 +71,15 @@ def __init__(

root_property = Properties(step_name=name, shape_name="DescribeAutoMLJobResponse")

best_candidate_properties = Properties(step_name=name, path="bestCandidateProperties")
best_candidate_properties.__dict__["modelInsightsJsonReportPath"] = Properties(
step_name=name, path="bestCandidateProperties.modelInsightsJsonReportPath"
best_candidate_properties = Properties(step_name=name, path="BestCandidateProperties")
best_candidate_properties.__dict__["ModelInsightsJsonReportPath"] = Properties(
step_name=name, path="BestCandidateProperties.ModelInsightsJsonReportPath"
)
best_candidate_properties.__dict__["explainabilityJsonReportPath"] = Properties(
step_name=name, path="bestCandidateProperties.explainabilityJsonReportPath"
best_candidate_properties.__dict__["ExplainabilityJsonReportPath"] = Properties(
step_name=name, path="BestCandidateProperties.ExplainabilityJsonReportPath"
)

root_property.__dict__["bestCandidateProperties"] = best_candidate_properties
root_property.__dict__["BestCandidateProperties"] = best_candidate_properties
self._properties = root_property

@property
57 changes: 48 additions & 9 deletions tests/integ/sagemaker/workflow/test_automl_steps.py
@@ -14,14 +14,15 @@

import os

import boto3
import pytest
from botocore.exceptions import WaiterError

from sagemaker.workflow import ParameterString
from sagemaker.workflow.automl_step import AutoMLStep
from sagemaker.automl.automl import AutoML, AutoMLInput

from sagemaker import utils, get_execution_role
from sagemaker.utils import unique_name_from_base
from sagemaker import utils, get_execution_role, ModelMetrics, MetricsSource
from sagemaker.workflow.model_step import ModelStep
from sagemaker.workflow.pipeline import Pipeline

@@ -50,10 +51,8 @@ def test_automl_step(pipeline_session, role, pipeline_name):
role=role,
target_attribute_name=TARGET_ATTRIBUTE_NAME,
sagemaker_session=pipeline_session,
max_candidates=1,
mode=MODE,
)
job_name = unique_name_from_base("auto-ml", max_length=32)
s3_input_training = pipeline_session.upload_data(
path=TRAINING_DATA, key_prefix=PREFIX + "/input"
)
@@ -72,27 +71,56 @@
)
inputs = [input_training, input_validation]

step_args = auto_ml.fit(inputs=inputs, job_name=job_name)
step_args = auto_ml.fit(inputs=inputs)

automl_step = AutoMLStep(
name="MyAutoMLStep",
step_args=step_args,
)

automl_model = automl_step.get_best_auto_ml_model(sagemaker_session=pipeline_session, role=role)

step_args_create_model = automl_model.create(
instance_type="c4.4xlarge",
)

automl_model_step = ModelStep(
name="MyAutoMLModelStep",
step_args=step_args_create_model,
)

model_package_group_name = ParameterString(
name="ModelPackageName", default_value="AutoMlModelPackageGroup"
)
model_approval_status = ParameterString(name="ModelApprovalStatus", default_value="Approved")
model_metrics = ModelMetrics(
model_statistics=MetricsSource(
s3_uri=automl_step.properties.BestCandidateProperties.ModelInsightsJsonReportPath,
content_type="application/json",
),
explainability=MetricsSource(
s3_uri=automl_step.properties.BestCandidateProperties.ExplainabilityJsonReportPath,
content_type="application/json",
),
)
step_args_register_model = automl_model.register(
content_types=["text/csv"],
response_types=["text/csv"],
inference_instances=["ml.m5.xlarge"],
transform_instances=["ml.m5.xlarge"],
model_package_group_name=model_package_group_name,
approval_status=model_approval_status,
model_metrics=model_metrics,
)
register_model_step = ModelStep(
name="ModelRegistrationStep", step_args=step_args_register_model
)

pipeline = Pipeline(
name=pipeline_name,
steps=[automl_step, automl_model_step],
parameters=[
model_approval_status,
model_package_group_name,
],
steps=[automl_step, automl_model_step, register_model_step],
sagemaker_session=pipeline_session,
)

@@ -114,9 +142,20 @@ def test_automl_step(pipeline_session, role, pipeline_name):
assert step["Metadata"]["AutoMLJob"]["Arn"] is not None

assert has_automl_job
assert len(execution_steps) == 2
assert len(execution_steps) == 3
finally:
try:
sagemaker_client = boto3.client("sagemaker")
for model_package in sagemaker_client.list_model_packages(
ModelPackageGroupName="AutoMlModelPackageGroup"
)["ModelPackageSummaryList"]:
sagemaker_client.delete_model_package(
ModelPackageName=model_package["ModelPackageArn"]
)
sagemaker_client.delete_model_package_group(
ModelPackageGroupName="AutoMlModelPackageGroup"
)

pipeline.delete()
except Exception:
pass
1 change: 0 additions & 1 deletion tests/integ/test_auto_ml.py
@@ -78,7 +78,6 @@ def test_auto_ml_fit_local_input(sagemaker_session):
role=ROLE,
target_attribute_name=TARGET_ATTRIBUTE_NAME,
sagemaker_session=sagemaker_session,
max_candidates=1,
generate_candidate_definitions_only=True,
)

44 changes: 42 additions & 2 deletions tests/unit/sagemaker/automl/test_auto_ml.py
@@ -18,6 +18,7 @@
from mock import Mock, patch
from sagemaker import AutoML, AutoMLJob, AutoMLInput, CandidateEstimator, PipelineModel
from sagemaker.predictor import Predictor
from sagemaker.workflow.functions import Join

MODEL_DATA = "s3://bucket/model.tar.gz"
MODEL_IMAGE = "mi"
@@ -52,7 +53,7 @@
MAX_RUNTIME_PER_TRAINING_JOB = 3600
TOTAL_JOB_RUNTIME = 36000
TARGET_OBJECTIVE = "0.01"
JOB_OBJECTIVE = {"fake job objective"}
JOB_OBJECTIVE = {"MetricName": "F1"}
TAGS = [{"Name": "some-tag", "Value": "value-for-tag"}]
CONTENT_TYPE = "x-application/vnd.amazon+parquet"
S3_DATA_TYPE = "ManifestFile"
@@ -503,7 +504,46 @@ def test_auto_ml_default_fit(strftime, sagemaker_session):
],
"output_config": {"S3OutputPath": DEFAULT_OUTPUT_PATH},
"auto_ml_job_config": {
"CompletionCriteria": {"MaxCandidates": DEFAULT_MAX_CANDIDATES},
"CompletionCriteria": {},
"SecurityConfig": {
"EnableInterContainerTrafficEncryption": ENCRYPT_INTER_CONTAINER_TRAFFIC
},
},
"role": ROLE,
"job_name": DEFAULT_JOB_NAME,
"problem_type": None,
"job_objective": None,
"generate_candidate_definitions_only": GENERATE_CANDIDATE_DEFINITIONS_ONLY,
"tags": None,
}


@patch("time.strftime", return_value=TIMESTAMP)
def test_auto_ml_default_fit_with_pipeline_variable(strftime, sagemaker_session):
auto_ml = AutoML(
role=ROLE,
target_attribute_name=TARGET_ATTRIBUTE_NAME,
sagemaker_session=sagemaker_session,
)
inputs = Join(on="/", values=[DEFAULT_S3_INPUT_DATA, "ProcessingJobName"])
auto_ml.fit(inputs=AutoMLInput(inputs=inputs, target_attribute_name=TARGET_ATTRIBUTE_NAME))
sagemaker_session.auto_ml.assert_called_once()
_, args = sagemaker_session.auto_ml.call_args
assert args == {
"input_config": [
{
"DataSource": {
"S3DataSource": {
"S3DataType": "S3Prefix",
"S3Uri": Join(on="/", values=["s3://mybucket/data", "ProcessingJobName"]),
}
},
"TargetAttributeName": TARGET_ATTRIBUTE_NAME,
}
],
"output_config": {"S3OutputPath": DEFAULT_OUTPUT_PATH},
"auto_ml_job_config": {
"CompletionCriteria": {},
"SecurityConfig": {
"EnableInterContainerTrafficEncryption": ENCRYPT_INTER_CONTAINER_TRAFFIC
},
8 changes: 4 additions & 4 deletions tests/unit/sagemaker/workflow/test_automl_step.py
@@ -231,11 +231,11 @@ def test_single_automl_step_with_parameter(pipeline_session):
step_args=step_args,
)

assert automl_step.properties.bestCandidateProperties.modelInsightsJsonReportPath.expr == {
"Get": "Steps.MyAutoMLStep.bestCandidateProperties.modelInsightsJsonReportPath"
assert automl_step.properties.BestCandidateProperties.ModelInsightsJsonReportPath.expr == {
"Get": "Steps.MyAutoMLStep.BestCandidateProperties.ModelInsightsJsonReportPath"
}
assert automl_step.properties.bestCandidateProperties.explainabilityJsonReportPath.expr == {
"Get": "Steps.MyAutoMLStep.bestCandidateProperties.explainabilityJsonReportPath"
assert automl_step.properties.BestCandidateProperties.ExplainabilityJsonReportPath.expr == {
"Get": "Steps.MyAutoMLStep.BestCandidateProperties.ExplainabilityJsonReportPath"
}

pipeline = Pipeline(
