Skip to content

Commit

Permalink
feat: Add support for Auto Scaling Group Instance Refresh for self-ma…
Browse files Browse the repository at this point in the history
…naged worker groups (#1224)

Co-authored-by: Thierno IB. BARRY <[email protected]>
  • Loading branch information
bashims and barryib authored May 17, 2021
1 parent 32f70af commit 68e9df9
Show file tree
Hide file tree
Showing 9 changed files with 360 additions and 5 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ eks-admin-cluster-role-binding.yaml
eks-admin-service-account.yaml
config-map-aws-auth*.yaml
kubeconfig_*
.idea

#################################################################
# Default .gitignore content for all terraform-aws-modules below
Expand Down
5 changes: 2 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ An example of harming update was the removal of several commonly used, but depre

By default, this module manages the `aws-auth` configmap for you (`manage_aws_auth=true`). To avoid the following [issue](https://github.com/aws/containers-roadmap/issues/654) where the EKS creation is `ACTIVE` but not ready. We implemented a "retry" logic with a fork of the http provider https://github.com/terraform-aws-modules/terraform-provider-http. This fork adds the support of a self-signed CA certificate. The original PR can be found at https://github.com/hashicorp/terraform-provider-http/pull/29.

Setting `instance_refresh_enabled` to true will recreate your worker nodes without draining them first. It is recommended to install [aws-node-termination-handler](https://github.com/aws/aws-node-termination-handler) for proper node draining. See the complete example in [instance_refresh](examples/instance_refresh).

## Usage example

A full example leveraging other community modules is contained in the [examples/basic directory](https://github.com/terraform-aws-modules/terraform-aws-eks/tree/master/examples/basic).
Expand Down Expand Up @@ -155,7 +157,6 @@ MIT Licensed. See [LICENSE](https://github.com/terraform-aws-modules/terraform-a
| <a name="provider_http"></a> [http](#provider\_http) | >= 2.3.0 |
| <a name="provider_kubernetes"></a> [kubernetes](#provider\_kubernetes) | >= 1.11.1 |
| <a name="provider_local"></a> [local](#provider\_local) | >= 1.4 |
| <a name="provider_random"></a> [random](#provider\_random) | >= 2.1 |

## Modules

Expand Down Expand Up @@ -202,8 +203,6 @@ MIT Licensed. See [LICENSE](https://github.com/terraform-aws-modules/terraform-a
| [aws_security_group_rule.workers_ingress_self](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/security_group_rule) | resource |
| [kubernetes_config_map.aws_auth](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/config_map) | resource |
| [local_file.kubeconfig](https://registry.terraform.io/providers/hashicorp/local/latest/docs/resources/file) | resource |
| [random_pet.workers](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/pet) | resource |
| [random_pet.workers_launch_template](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/pet) | resource |
| [aws_ami.eks_worker](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ami) | data source |
| [aws_ami.eks_worker_windows](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ami) | data source |
| [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source |
Expand Down
234 changes: 234 additions & 0 deletions examples/instance_refresh/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,234 @@
# Instance Refresh example: provisions an EKS cluster whose self-managed
# worker group uses ASG Instance Refresh, plus the aws-node-termination-handler
# wiring (SQS queue, EventBridge rule, IRSA role) needed to drain nodes.

provider "aws" {
  region = var.region
}

# Account ID, used below to pre-compute the SQS queue ARN for the queue policy.
data "aws_caller_identity" "current" {}

# Read back the cluster created by module.eks so the kubernetes and helm
# providers can authenticate against it.
data "aws_eks_cluster" "cluster" {
  name = module.eks.cluster_id
}

data "aws_eks_cluster_auth" "cluster" {
  name = module.eks.cluster_id
}

provider "kubernetes" {
  host                   = data.aws_eks_cluster.cluster.endpoint
  cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority.0.data)
  token                  = data.aws_eks_cluster_auth.cluster.token
  load_config_file       = false
}

provider "helm" {
  kubernetes {
    host                   = data.aws_eks_cluster.cluster.endpoint
    cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority.0.data)
    token                  = data.aws_eks_cluster_auth.cluster.token
  }
}

data "aws_availability_zones" "available" {
}

locals {
  # Random suffix keeps repeated runs of this example from colliding on names.
  cluster_name = "test-refresh-${random_string.suffix.result}"
}

resource "random_string" "suffix" {
  length  = 8
  special = false
}

# Minimal VPC for the example: public subnets only, one per AZ.
# Workers are launched with public IPs (see module.eks below).
module "vpc" {
  source  = "terraform-aws-modules/vpc/aws"
  version = "~> 3.0.0"

  name                 = local.cluster_name
  cidr                 = "10.0.0.0/16"
  azs                  = data.aws_availability_zones.available.names
  public_subnets       = ["10.0.4.0/24", "10.0.5.0/24", "10.0.6.0/24"]
  enable_dns_hostnames = true
}

# IAM policy for the aws-node-termination-handler: read-only EC2/ASG
# discovery, permission to complete the termination lifecycle hook on this
# cluster's worker ASGs, and permission to consume messages from its queue.
data "aws_iam_policy_document" "node_term" {
  statement {
    effect = "Allow"
    actions = [
      "ec2:DescribeInstances",
      "autoscaling:DescribeAutoScalingInstances",
      "autoscaling:DescribeTags",
    ]
    # Describe* calls do not support resource-level scoping.
    resources = [
      "*",
    ]
  }
  statement {
    effect = "Allow"
    actions = [
      "autoscaling:CompleteLifecycleAction",
    ]
    # Scoped to the worker ASGs created by module.eks.
    resources = module.eks.workers_asg_arns
  }
  statement {
    effect = "Allow"
    actions = [
      "sqs:DeleteMessage",
      "sqs:ReceiveMessage"
    ]
    resources = [
      module.node_term_sqs.sqs_queue_arn
    ]
  }
}

resource "aws_iam_policy" "node_term" {
  name   = "node-term-${local.cluster_name}"
  policy = data.aws_iam_policy_document.node_term.json
}

# NOTE(review): this policy is attached both to the worker node role here and
# to the IRSA role (module.node_term_role) below — presumably so the handler
# works with either credential path; confirm whether both are required.
resource "aws_iam_role_policy_attachment" "node_term_policy" {
  policy_arn = aws_iam_policy.node_term.arn
  role       = module.eks.worker_iam_role_name
}

# Queue policy: allow EventBridge (and SQS) to deliver ASG lifecycle events.
# The queue ARN is built by hand — the queue is named local.cluster_name —
# because this document is an input to the queue module itself, so referencing
# the module's output here would create a dependency cycle.
# NOTE(review): the "aws" partition is hard-coded; GovCloud/China deployments
# would need this adjusted.
data "aws_iam_policy_document" "node_term_events" {
  statement {
    effect = "Allow"
    principals {
      type = "Service"
      identifiers = [
        "events.amazonaws.com",
        "sqs.amazonaws.com",
      ]
    }
    actions = [
      "sqs:SendMessage",
    ]
    resources = [
      "arn:aws:sqs:${var.region}:${data.aws_caller_identity.current.account_id}:${local.cluster_name}",
    ]
  }
}

# SQS queue the termination handler polls for lifecycle notifications.
module "node_term_sqs" {
  source                    = "terraform-aws-modules/sqs/aws"
  version                   = "~> 3.0.0"
  name                      = local.cluster_name
  message_retention_seconds = 300
  policy                    = data.aws_iam_policy_document.node_term_events.json
}

# Route ASG "EC2 Instance-terminate Lifecycle Action" events for this
# cluster's worker ASGs into the SQS queue above.
resource "aws_cloudwatch_event_rule" "node_term_event_rule" {
  name        = "${local.cluster_name}-nth-rule"
  description = "Node termination event rule"
  event_pattern = jsonencode(
    {
      "source" : [
        "aws.autoscaling"
      ],
      "detail-type" : [
        "EC2 Instance-terminate Lifecycle Action"
      ]
      "resources" : module.eks.workers_asg_arns
    }
  )
}

resource "aws_cloudwatch_event_target" "node_term_event_target" {
  rule      = aws_cloudwatch_event_rule.node_term_event_rule.name
  target_id = "ANTHandler"
  arn       = module.node_term_sqs.sqs_queue_arn
}

# IRSA role assumed by the termination handler's Kubernetes service account.
module "node_term_role" {
  source                        = "terraform-aws-modules/iam/aws//modules/iam-assumable-role-with-oidc"
  version                       = "4.1.0"
  create_role                   = true
  role_description              = "IRSA role for ANTH, cluster ${local.cluster_name}"
  role_name_prefix              = local.cluster_name
  provider_url                  = replace(module.eks.cluster_oidc_issuer_url, "https://", "")
  role_policy_arns              = [aws_iam_policy.node_term.arn]
  oidc_fully_qualified_subjects = ["system:serviceaccount:${var.namespace}:${var.serviceaccount}"]
}

# Install aws-node-termination-handler in SQS (queue-processor) mode so it
# cordons and drains nodes when the instance refresh terminates them.
resource "helm_release" "anth" {
  depends_on = [
    module.eks
  ]

  name             = "aws-node-termination-handler"
  namespace        = var.namespace
  repository       = "https://aws.github.io/eks-charts"
  chart            = "aws-node-termination-handler"
  version          = var.aws_node_termination_handler_chart_version
  create_namespace = true

  set {
    name  = "awsRegion"
    value = var.region
  }
  set {
    name  = "serviceAccount.name"
    value = var.serviceaccount
  }
  set {
    # Dots inside the annotation key are escaped so Helm does not split on them.
    name  = "serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn"
    value = module.node_term_role.iam_role_arn
    type  = "string"
  }
  set {
    name  = "enableSqsTerminationDraining"
    value = "true"
  }
  set {
    # sqs_queue_id is the queue URL in the upstream SQS module's outputs.
    name  = "queueURL"
    value = module.node_term_sqs.sqs_queue_id
  }
  set {
    name  = "logLevel"
    value = "DEBUG"
  }
}

# Creating the lifecycle-hook outside of the ASG resource's `initial_lifecycle_hook`
# ensures that node termination does not require the lifecycle action to be completed,
# and thus allows the ASG to be destroyed cleanly.
resource "aws_autoscaling_lifecycle_hook" "node_term" {
  name                   = "node_term-${local.cluster_name}"
  autoscaling_group_name = module.eks.workers_asg_names[0]
  lifecycle_transition   = "autoscaling:EC2_INSTANCE_TERMINATING"
  heartbeat_timeout      = 300
  # CONTINUE lets termination proceed even if the handler never completes the hook.
  default_result         = "CONTINUE"
}

module "eks" {
  source          = "../.."
  cluster_name    = local.cluster_name
  cluster_version = "1.19"
  subnets         = module.vpc.public_subnets
  vpc_id          = module.vpc.vpc_id
  enable_irsa     = true
  worker_groups_launch_template = [
    {
      name                     = "refresh"
      asg_max_size             = 2
      asg_desired_capacity     = 2
      instance_refresh_enabled = true
      # Refresh also when a propagated tag changes, in addition to the
      # implicit launch-template trigger.
      instance_refresh_triggers = ["tag"]
      public_ip                 = true
      metadata_http_put_response_hop_limit = 3
      tags = [
        {
          # Opts this ASG's instances into management by the termination handler.
          key                 = "aws-node-termination-handler/managed"
          value               = ""
          propagate_at_launch = true
        },
        {
          key                 = "foo"
          value               = "buzz"
          propagate_at_launch = true
        },
      ]
    },
  ]
}
34 changes: 34 additions & 0 deletions examples/instance_refresh/outputs.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Outputs for the instance_refresh example: cluster connection details plus
# the SQS queue the termination handler consumes.
output "cluster_endpoint" {
  description = "Endpoint for EKS control plane."
  value       = module.eks.cluster_endpoint
}

output "cluster_security_group_id" {
  description = "Security group ids attached to the cluster control plane."
  value       = module.eks.cluster_security_group_id
}

output "kubectl_config" {
  description = "kubectl config as generated by the module."
  value       = module.eks.kubeconfig
}

output "config_map_aws_auth" {
  description = "A kubernetes configuration to authenticate to this EKS cluster."
  value       = module.eks.config_map_aws_auth
}

output "region" {
  description = "AWS region."
  value       = var.region
}

output "sqs_queue_asg_notification_arn" {
  description = "SQS queue ASG notification ARN"
  value       = module.node_term_sqs.sqs_queue_arn
}

# sqs_queue_id from the upstream SQS module is the queue URL.
output "sqs_queue_asg_notification_url" {
  description = "SQS queue ASG notification URL"
  value       = module.node_term_sqs.sqs_queue_id
}
18 changes: 18 additions & 0 deletions examples/instance_refresh/variables.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Input variables for the instance_refresh example. All have defaults, so the
# example runs with no required input; explicit `type` constraints catch
# malformed overrides at plan time.

variable "region" {
  description = "AWS region to deploy the example into."
  type        = string
  default     = "us-west-2"
}

variable "aws_node_termination_handler_chart_version" {
  description = "Version of the aws-node-termination-handler Helm chart to install."
  type        = string
  default     = "0.15.0"
}

variable "namespace" {
  description = "Namespace for the aws-node-termination-handler."
  type        = string
  default     = "kube-system"
}

variable "serviceaccount" {
  description = "Serviceaccount for the aws-node-termination-handler."
  type        = string
  default     = "aws-node-termination-handler"
}
11 changes: 11 additions & 0 deletions examples/instance_refresh/versions.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Minimum Terraform and provider versions for the instance_refresh example.
terraform {
  required_version = ">= 0.13.1"

  required_providers {
    aws        = ">= 3.22.0"
    local      = ">= 1.4"
    random     = ">= 2.1"
    kubernetes = "~> 1.11"
    helm       = "~> 2.1.2"
  }
}
7 changes: 6 additions & 1 deletion local.tf
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ locals {
asg_max_size = "3" # Maximum worker capacity in the autoscaling group.
asg_min_size = "1" # Minimum worker capacity in the autoscaling group. NOTE: A change in this parameter will affect asg_desired_capacity: e.g. raising it to 2 will change asg_desired_capacity to 2, but bringing it back to 1 will not lower asg_desired_capacity again.
asg_force_delete = false # Enable forced deletion for the autoscaling group.
asg_initial_lifecycle_hooks = [] # Initital lifecycle hook for the autoscaling group.
asg_initial_lifecycle_hooks = [] # Initial lifecycle hook for the autoscaling group.
default_cooldown = null # The amount of time, in seconds, after a scaling activity completes before another scaling activity can start.
health_check_type = null # Controls how health checking is done. Valid values are "EC2" or "ELB".
health_check_grace_period = null # Time in seconds after instance comes into service before checking health.
Expand Down Expand Up @@ -95,6 +95,11 @@ locals {
spot_max_price = "" # Maximum price per unit hour that the user is willing to pay for the Spot instances. Default is the on-demand price
max_instance_lifetime = 0 # Maximum number of seconds instances can run in the ASG. 0 is unlimited.
elastic_inference_accelerator = null # Type of elastic inference accelerator to be attached. Example values are eia1.medium, eia2.large, etc.
instance_refresh_enabled = false # Enable instance refresh for the worker autoscaling group.
instance_refresh_strategy = "Rolling" # Strategy to use for instance refresh. Default is 'Rolling', which is currently the only valid value.
instance_refresh_min_healthy_percentage = 90 # The amount of capacity in the ASG that must remain healthy during an instance refresh, as a percentage of the ASG's desired capacity.
instance_refresh_instance_warmup = null # The number of seconds until a newly launched instance is configured and ready to use. Defaults to the ASG's health check grace period.
instance_refresh_triggers = [] # Set of additional property names that will trigger an Instance Refresh. A refresh will always be triggered by a change in any of launch_configuration, launch_template, or mixed_instances_policy.
}

workers_group_defaults = merge(
Expand Down
27 changes: 27 additions & 0 deletions workers.tf
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,33 @@ resource "aws_autoscaling_group" "workers" {
}
}

# Optionally enable ASG Instance Refresh for this worker group.
# logic duplicated in workers_launch_template.tf
dynamic "instance_refresh" {
  # dynamic-with-conditional-list idiom: emit the block exactly once when the
  # group (or the defaults) enables instance refresh, otherwise not at all.
  for_each = lookup(var.worker_groups[count.index],
    "instance_refresh_enabled",
    local.workers_group_defaults["instance_refresh_enabled"]) ? [1] : []
  content {
    strategy = lookup(
      var.worker_groups[count.index], "instance_refresh_strategy",
      local.workers_group_defaults["instance_refresh_strategy"]
    )
    preferences {
      # Seconds until a new instance is considered ready; null falls back to
      # the ASG's health check grace period (per workers_group_defaults).
      instance_warmup = lookup(
        var.worker_groups[count.index], "instance_refresh_instance_warmup",
        local.workers_group_defaults["instance_refresh_instance_warmup"]
      )
      min_healthy_percentage = lookup(
        var.worker_groups[count.index], "instance_refresh_min_healthy_percentage",
        local.workers_group_defaults["instance_refresh_min_healthy_percentage"]
      )
    }
    # Additional property names that trigger a refresh, on top of the
    # always-on launch_configuration/launch_template/mixed_instances_policy triggers.
    triggers = lookup(
      var.worker_groups[count.index], "instance_refresh_triggers",
      local.workers_group_defaults["instance_refresh_triggers"]
    )
  }
}

lifecycle {
create_before_destroy = true
ignore_changes = [desired_capacity]
Expand Down
Loading

0 comments on commit 68e9df9

Please sign in to comment.