Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

r/sagemaker_endpoint - add rolling update #32418

Merged
merged 5 commits into from
Aug 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .changelog/32418.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
```release-note:enhancement
resource/aws_sagemaker_endpoint: Make `deployment_config.blue_green_update_policy` optional
```

```release-note:enhancement
resource/aws_sagemaker_endpoint: Add `deployment_config.rolling_update_policy` argument
```
128 changes: 121 additions & 7 deletions internal/service/sagemaker/endpoint.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,12 @@ func ResourceEndpoint() *schema.Resource {
},
"blue_green_update_policy": {
Type: schema.TypeList,
Required: true,
Optional: true,
MaxItems: 1,
ExactlyOneOf: []string{
"deployment_config.0.blue_green_update_policy",
"deployment_config.0.rolling_update_policy",
},
Elem: &schema.Resource{
Schema: map[string]*schema.Schema{
"maximum_execution_timeout_in_seconds": {
Expand Down Expand Up @@ -146,6 +150,67 @@ func ResourceEndpoint() *schema.Resource {
},
},
},
"rolling_update_policy": {
Type: schema.TypeList,
Optional: true,
MaxItems: 1,
ExactlyOneOf: []string{
"deployment_config.0.blue_green_update_policy",
"deployment_config.0.rolling_update_policy",
},
Elem: &schema.Resource{
Schema: map[string]*schema.Schema{
"maximum_batch_size": {
Type: schema.TypeList,
Required: true,
MaxItems: 1,
Elem: &schema.Resource{
Schema: map[string]*schema.Schema{
"type": {
Type: schema.TypeString,
Required: true,
ValidateFunc: validation.StringInSlice(sagemaker.CapacitySizeType_Values(), false),
},
"value": {
Type: schema.TypeInt,
Required: true,
ValidateFunc: validation.IntAtLeast(1),
},
},
},
},
"maximum_execution_timeout_in_seconds": {
Type: schema.TypeInt,
Optional: true,
ValidateFunc: validation.IntBetween(600, 14400),
},
"rollback_maximum_batch_size": {
Type: schema.TypeList,
Optional: true,
MaxItems: 1,
Elem: &schema.Resource{
Schema: map[string]*schema.Schema{
"type": {
Type: schema.TypeString,
Required: true,
ValidateFunc: validation.StringInSlice(sagemaker.CapacitySizeType_Values(), false),
},
"value": {
Type: schema.TypeInt,
Required: true,
ValidateFunc: validation.IntAtLeast(1),
},
},
},
},
"wait_interval_in_seconds": {
Type: schema.TypeInt,
Required: true,
ValidateFunc: validation.IntBetween(0, 3600),
},
},
},
},
},
},
},
Expand Down Expand Up @@ -313,6 +378,10 @@ func expandEndpointDeploymentConfig(configured []interface{}) *sagemaker.Deploym
c.AutoRollbackConfiguration = expandEndpointDeploymentConfigAutoRollbackConfig(v)
}

if v, ok := m["rolling_update_policy"].([]interface{}); ok && len(v) > 0 {
c.RollingUpdatePolicy = expandEndpointDeploymentConfigRollingUpdatePolicy(v)
}

return c
}

Expand All @@ -329,6 +398,10 @@ func flattenEndpointDeploymentConfig(configured *sagemaker.DeploymentConfig) []m
cfg["auto_rollback_configuration"] = flattenEndpointDeploymentConfigAutoRollbackConfig(configured.AutoRollbackConfiguration)
}

if configured.RollingUpdatePolicy != nil {
cfg["rolling_update_policy"] = flattenEndpointDeploymentConfigRollingUpdatePolicy(configured.RollingUpdatePolicy)
}

return []map[string]interface{}{cfg}
}

Expand Down Expand Up @@ -381,11 +454,11 @@ func expandEndpointDeploymentConfigTrafficRoutingConfiguration(configured []inte
}

if v, ok := m["canary_size"].([]interface{}); ok && len(v) > 0 {
c.CanarySize = expandEndpointDeploymentConfigTrafficRoutingConfigurationCapacitySize(v)
c.CanarySize = expandEndpointDeploymentCapacitySize(v)
}

if v, ok := m["linear_step_size"].([]interface{}); ok && len(v) > 0 {
c.LinearStepSize = expandEndpointDeploymentConfigTrafficRoutingConfigurationCapacitySize(v)
c.LinearStepSize = expandEndpointDeploymentCapacitySize(v)
}

return c
Expand All @@ -402,17 +475,17 @@ func flattenEndpointDeploymentConfigTrafficRoutingConfiguration(configured *sage
}

if configured.CanarySize != nil {
cfg["canary_size"] = flattenEndpointDeploymentConfigTrafficRoutingConfigurationCapacitySize(configured.CanarySize)
cfg["canary_size"] = flattenEndpointDeploymentCapacitySize(configured.CanarySize)
}

if configured.LinearStepSize != nil {
cfg["linear_step_size"] = flattenEndpointDeploymentConfigTrafficRoutingConfigurationCapacitySize(configured.LinearStepSize)
cfg["linear_step_size"] = flattenEndpointDeploymentCapacitySize(configured.LinearStepSize)
}

return []map[string]interface{}{cfg}
}

func expandEndpointDeploymentConfigTrafficRoutingConfigurationCapacitySize(configured []interface{}) *sagemaker.CapacitySize {
func expandEndpointDeploymentCapacitySize(configured []interface{}) *sagemaker.CapacitySize {
if len(configured) == 0 {
return nil
}
Expand All @@ -427,7 +500,7 @@ func expandEndpointDeploymentConfigTrafficRoutingConfigurationCapacitySize(confi
return c
}

func flattenEndpointDeploymentConfigTrafficRoutingConfigurationCapacitySize(configured *sagemaker.CapacitySize) []map[string]interface{} {
func flattenEndpointDeploymentCapacitySize(configured *sagemaker.CapacitySize) []map[string]interface{} {
if configured == nil {
return []map[string]interface{}{}
}
Expand Down Expand Up @@ -466,6 +539,47 @@ func flattenEndpointDeploymentConfigAutoRollbackConfig(configured *sagemaker.Aut
return []map[string]interface{}{cfg}
}

// expandEndpointDeploymentConfigRollingUpdatePolicy converts the
// "rolling_update_policy" block of a deployment config into a
// sagemaker.RollingUpdatePolicy.
//
// configured is the list value of the block; only its first element is read.
// nil is returned when the list is empty or when the first element is not a
// map (for example a nil entry), rather than panicking on the type assertion.
func expandEndpointDeploymentConfigRollingUpdatePolicy(configured []interface{}) *sagemaker.RollingUpdatePolicy {
	if len(configured) == 0 {
		return nil
	}

	// Comma-ok assertion: a nil or mistyped element yields no policy instead
	// of a runtime panic.
	m, ok := configured[0].(map[string]interface{})
	if !ok {
		return nil
	}

	c := &sagemaker.RollingUpdatePolicy{}

	// Zero is an accepted wait interval (the schema validates 0-3600), so the
	// value is copied whenever it is present, without a "> 0" guard.
	if v, ok := m["wait_interval_in_seconds"].(int); ok {
		c.WaitIntervalInSeconds = aws.Int64(int64(v))
	}

	// The timeout is optional; a zero value is treated as "not set".
	if v, ok := m["maximum_execution_timeout_in_seconds"].(int); ok && v > 0 {
		c.MaximumExecutionTimeoutInSeconds = aws.Int64(int64(v))
	}

	if v, ok := m["maximum_batch_size"].([]interface{}); ok && len(v) > 0 {
		c.MaximumBatchSize = expandEndpointDeploymentCapacitySize(v)
	}

	if v, ok := m["rollback_maximum_batch_size"].([]interface{}); ok && len(v) > 0 {
		c.RollbackMaximumBatchSize = expandEndpointDeploymentCapacitySize(v)
	}

	return c
}

// flattenEndpointDeploymentConfigRollingUpdatePolicy converts a
// sagemaker.RollingUpdatePolicy into the single-element list form stored under
// the "rolling_update_policy" attribute. A nil policy produces an empty,
// non-nil list.
func flattenEndpointDeploymentConfigRollingUpdatePolicy(configured *sagemaker.RollingUpdatePolicy) []map[string]interface{} {
	if configured == nil {
		return []map[string]interface{}{}
	}

	policy := make(map[string]interface{}, 4)

	// Scalar settings; pointer fields are dereferenced via aws.Int64Value.
	policy["maximum_execution_timeout_in_seconds"] = aws.Int64Value(configured.MaximumExecutionTimeoutInSeconds)
	policy["wait_interval_in_seconds"] = aws.Int64Value(configured.WaitIntervalInSeconds)

	// Nested capacity-size blocks share the common capacity-size flattener.
	policy["maximum_batch_size"] = flattenEndpointDeploymentCapacitySize(configured.MaximumBatchSize)
	policy["rollback_maximum_batch_size"] = flattenEndpointDeploymentCapacitySize(configured.RollbackMaximumBatchSize)

	return []map[string]interface{}{policy}
}

func expandEndpointDeploymentConfigAutoRollbackConfigAlarms(configured []interface{}) []*sagemaker.Alarm {
if len(configured) == 0 {
return nil
Expand Down
89 changes: 86 additions & 3 deletions internal/service/sagemaker/endpoint_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ func TestAccSageMakerEndpoint_deploymentConfig(t *testing.T) {
})
}

func TestAccSageMakerEndpoint_deploymentConfig_full(t *testing.T) {
func TestAccSageMakerEndpoint_deploymentConfig_blueGreen(t *testing.T) {
ctx := acctest.Context(t)
if testing.Short() {
t.Skip("skipping long-running test in short mode")
Expand All @@ -187,7 +187,7 @@ func TestAccSageMakerEndpoint_deploymentConfig_full(t *testing.T) {
CheckDestroy: testAccCheckEndpointDestroy(ctx),
Steps: []resource.TestStep{
{
Config: testAccEndpointConfig_deploymentFull(rName),
Config: testAccEndpointConfig_deploymentBlueGreen(rName),
Check: resource.ComposeTestCheckFunc(
testAccCheckEndpointExists(ctx, resourceName),
resource.TestCheckResourceAttr(resourceName, "name", rName),
Expand All @@ -204,6 +204,46 @@ func TestAccSageMakerEndpoint_deploymentConfig_full(t *testing.T) {
resource.TestCheckResourceAttr(resourceName, "deployment_config.0.blue_green_update_policy.0.traffic_routing_configuration.0.linear_step_size.#", "1"),
resource.TestCheckResourceAttr(resourceName, "deployment_config.0.blue_green_update_policy.0.traffic_routing_configuration.0.linear_step_size.0.type", "INSTANCE_COUNT"),
resource.TestCheckResourceAttr(resourceName, "deployment_config.0.blue_green_update_policy.0.traffic_routing_configuration.0.linear_step_size.0.value", "1"),
resource.TestCheckResourceAttr(resourceName, "deployment_config.0.rolling_update_policy.#", "0"),
),
},
{
ResourceName: resourceName,
ImportState: true,
ImportStateVerify: true,
},
},
})
}

func TestAccSageMakerEndpoint_deploymentConfig_rolling(t *testing.T) {
ctx := acctest.Context(t)
if testing.Short() {
t.Skip("skipping long-running test in short mode")
}

rName := sdkacctest.RandomWithPrefix(acctest.ResourcePrefix)
resourceName := "aws_sagemaker_endpoint.test"

resource.ParallelTest(t, resource.TestCase{
PreCheck: func() { acctest.PreCheck(ctx, t) },
ErrorCheck: acctest.ErrorCheck(t, sagemaker.EndpointsID),
ProtoV5ProviderFactories: acctest.ProtoV5ProviderFactories,
CheckDestroy: testAccCheckEndpointDestroy(ctx),
Steps: []resource.TestStep{
{
Config: testAccEndpointConfig_deploymentRolling(rName),
Check: resource.ComposeTestCheckFunc(
testAccCheckEndpointExists(ctx, resourceName),
resource.TestCheckResourceAttr(resourceName, "name", rName),
resource.TestCheckResourceAttr(resourceName, "deployment_config.#", "1"),
resource.TestCheckResourceAttr(resourceName, "deployment_config.0.auto_rollback_configuration.#", "1"),
resource.TestCheckResourceAttr(resourceName, "deployment_config.0.auto_rollback_configuration.0.alarms.#", "1"),
resource.TestCheckResourceAttr(resourceName, "deployment_config.0.blue_green_update_policy.#", "0"),
resource.TestCheckResourceAttr(resourceName, "deployment_config.0.rolling_update_policy.#", "1"),
resource.TestCheckResourceAttr(resourceName, "deployment_config.0.rolling_update_policy.0.wait_interval_in_seconds", "60"),
resource.TestCheckResourceAttr(resourceName, "deployment_config.0.rolling_update_policy.0.maximum_batch_size.0.type", "CAPACITY_PERCENT"),
resource.TestCheckResourceAttr(resourceName, "deployment_config.0.rolling_update_policy.0.maximum_batch_size.0.value", "5"),
),
},
{
Expand Down Expand Up @@ -444,7 +484,7 @@ resource "aws_sagemaker_endpoint" "test" {
`, rName, tType, wait)
}

func testAccEndpointConfig_deploymentFull(rName string) string {
func testAccEndpointConfig_deploymentBlueGreen(rName string) string {
return testAccEndpointConfig_Base(rName) + fmt.Sprintf(`
resource "aws_cloudwatch_metric_alarm" "test" {
alarm_name = %[1]q
Expand Down Expand Up @@ -489,3 +529,46 @@ resource "aws_sagemaker_endpoint" "test" {
}
`, rName)
}

// testAccEndpointConfig_deploymentRolling returns the base endpoint test
// configuration for rName followed by a CloudWatch metric alarm and an
// aws_sagemaker_endpoint whose deployment_config combines an auto-rollback
// alarm with a rolling_update_policy block. rName is substituted for every
// %[1]q verb in the template.
func testAccEndpointConfig_deploymentRolling(rName string) string {
	const rollingTemplate = `
resource "aws_cloudwatch_metric_alarm" "test" {
alarm_name = %[1]q
comparison_operator = "GreaterThanOrEqualToThreshold"
evaluation_periods = "2"
metric_name = "CPUUtilization"
namespace = "AWS/EC2"
period = "120"
statistic = "Average"
threshold = "80"
alarm_description = "This metric monitors ec2 cpu utilization"
insufficient_data_actions = []

dimensions = {
InstanceId = "i-abc123"
}
}

resource "aws_sagemaker_endpoint" "test" {
endpoint_config_name = aws_sagemaker_endpoint_configuration.test.name
name = %[1]q

deployment_config {
auto_rollback_configuration {
alarms {
alarm_name = aws_cloudwatch_metric_alarm.test.alarm_name
}
}

rolling_update_policy {
wait_interval_in_seconds = 60

maximum_batch_size {
type = "CAPACITY_PERCENT"
value = 5
}
}
}
}
`

	base := testAccEndpointConfig_Base(rName)
	return base + fmt.Sprintf(rollingTemplate, rName)
}
20 changes: 19 additions & 1 deletion website/docs/r/sagemaker_endpoint.html.markdown
Original file line number Diff line number Diff line change
Expand Up @@ -36,22 +36,40 @@ This resource supports the following arguments:

### Deployment Config

* `blue_green_update_policy` - (Required) Update policy for a blue/green deployment. If this update policy is specified, SageMaker creates a new fleet during the deployment while maintaining the old fleet. See [Blue Green Update Config](#blue-green-update-config).
* `blue_green_update_policy` - (Optional) Update policy for a blue/green deployment. If this update policy is specified, SageMaker creates a new fleet during the deployment while maintaining the old fleet. SageMaker flips traffic to the new fleet according to the specified traffic routing configuration. Only one update policy should be used in the deployment configuration. If no update policy is specified, SageMaker uses a blue/green deployment strategy with all at once traffic shifting by default. See [Blue Green Update Config](#blue-green-update-config).
* `auto_rollback_configuration` - (Optional) Automatic rollback configuration for handling endpoint deployment failures and recovery. See [Auto Rollback Configuration](#auto-rollback-configuration).
* `rolling_update_policy` - (Optional) Specifies a rolling deployment strategy for updating a SageMaker endpoint. See [Rolling Update Policy](#rolling-update-policy).

#### Blue Green Update Config

* `traffic_routing_configuration` - (Required) Defines the traffic routing strategy to shift traffic from the old fleet to the new fleet during an endpoint deployment. See [Traffic Routing Configuration](#traffic-routing-configuration).
* `maximum_execution_timeout_in_seconds` - (Optional) Maximum execution timeout for the deployment. Note that the timeout value should be larger than the total waiting time specified in `termination_wait_in_seconds` and `wait_interval_in_seconds`. Valid values are between `600` and `14400`.
* `termination_wait_in_seconds` - (Optional) Additional waiting time in seconds after the completion of an endpoint deployment before terminating the old endpoint fleet. Default is `0`. Valid values are between `0` and `3600`.

#### Rolling Update Policy

* `maximum_batch_size` - (Required) Batch size for each rolling step to provision capacity and turn on traffic on the new endpoint fleet, and terminate capacity on the old endpoint fleet. Value must be between 5% and 50% of the variant's total instance count. See [Maximum Batch Size](#maximum-batch-size).
* `maximum_execution_timeout_in_seconds` - (Optional) The time limit for the total deployment. Exceeding this limit causes a timeout. Valid values are between `600` and `14400`.
* `rollback_maximum_batch_size` - (Optional) Batch size for rollback to the old endpoint fleet. Each rolling step provisions capacity and turns on traffic on the old endpoint fleet, and terminates capacity on the new endpoint fleet. If this field is absent, the default value is 100% of total capacity, which means the whole capacity of the old fleet is brought up at once during rollback. See [Rollback Maximum Batch Size](#rollback-maximum-batch-size).
* `wait_interval_in_seconds` - (Required) The length of the baking period, during which SageMaker monitors alarms for each batch on the new fleet. Valid values are between `0` and `3600`.

##### Traffic Routing Configuration

* `type` - (Required) Traffic routing strategy type. Valid values are: `ALL_AT_ONCE`, `CANARY`, and `LINEAR`.
* `wait_interval_in_seconds` - (Required) The waiting time (in seconds) between incremental steps to turn on traffic on the new endpoint fleet. Valid values are between `0` and `3600`.
* `canary_size` - (Optional) Batch size for the first step to turn on traffic on the new endpoint fleet. Value must be less than or equal to 50% of the variant's total instance count. See [Canary Size](#canary-size).
* `linear_step_size` - (Optional) Batch size for each step to turn on traffic on the new endpoint fleet. Value must be 10-50% of the variant's total instance count. See [Linear Step Size](#linear-step-size).

###### Maximum Batch Size

* `type` - (Required) Specifies the endpoint capacity type. Valid values are: `INSTANCE_COUNT` or `CAPACITY_PERCENT`.
* `value` - (Required) Defines the capacity size, either as a number of instances or a capacity percentage.

###### Rollback Maximum Batch Size

* `type` - (Required) Specifies the endpoint capacity type. Valid values are: `INSTANCE_COUNT` or `CAPACITY_PERCENT`.
* `value` - (Required) Defines the capacity size, either as a number of instances or a capacity percentage.

###### Canary Size

* `type` - (Required) Specifies the endpoint capacity type. Valid values are: `INSTANCE_COUNT` or `CAPACITY_PERCENT`.
Expand Down