From c6da22c78f60a8643a6c76f97c93724f4e1f4e5a Mon Sep 17 00:00:00 2001 From: Bryant Biggs Date: Sat, 2 Nov 2024 13:33:13 +0000 Subject: [PATCH] feat: Add support for creating `efa-only` network interfaces (#3196) --- .pre-commit-config.yaml | 2 +- modules/eks-managed-node-group/README.md | 2 + modules/eks-managed-node-group/main.tf | 3 +- modules/eks-managed-node-group/variables.tf | 13 ++ modules/self-managed-node-group/README.md | 2 + modules/self-managed-node-group/main.tf | 5 +- modules/self-managed-node-group/variables.tf | 13 ++ node_groups.tf | 4 + tests/eks-managed-node-group/main.tf | 101 +++++++++----- tests/self-managed-node-group/main.tf | 130 ++++++++++++------- 10 files changed, 196 insertions(+), 79 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4f0721881c..baacea01bb 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/antonbabenko/pre-commit-terraform - rev: v1.96.1 + rev: v1.96.2 hooks: - id: terraform_fmt - id: terraform_docs diff --git a/modules/eks-managed-node-group/README.md b/modules/eks-managed-node-group/README.md index 709885898d..ace9106c3d 100644 --- a/modules/eks-managed-node-group/README.md +++ b/modules/eks-managed-node-group/README.md @@ -132,9 +132,11 @@ module "eks_managed_node_group" { | [disable\_api\_termination](#input\_disable\_api\_termination) | If true, enables EC2 instance termination protection | `bool` | `null` | no | | [disk\_size](#input\_disk\_size) | Disk size in GiB for nodes. Defaults to `20`. Only valid when `use_custom_launch_template` = `false` | `number` | `null` | no | | [ebs\_optimized](#input\_ebs\_optimized) | If true, the launched EC2 instance(s) will be EBS-optimized | `bool` | `null` | no | +| [efa\_indices](#input\_efa\_indices) | The indices of the network interfaces that should be EFA-enabled. Only valid when `enable_efa_support` = `true` | `list(number)` |
<pre>[<br/>  0<br/>]</pre>
| no | | [elastic\_gpu\_specifications](#input\_elastic\_gpu\_specifications) | The elastic GPU to attach to the instance | `any` | `{}` | no | | [elastic\_inference\_accelerator](#input\_elastic\_inference\_accelerator) | Configuration block containing an Elastic Inference Accelerator to attach to the instance | `map(string)` | `{}` | no | | [enable\_bootstrap\_user\_data](#input\_enable\_bootstrap\_user\_data) | Determines whether the bootstrap configurations are populated within the user data template. Only valid when using a custom AMI via `ami_id` | `bool` | `false` | no | +| [enable\_efa\_only](#input\_enable\_efa\_only) | Determines whether to enable EFA (`false`, default) or EFA and EFA-only (`true`) network interfaces. Note: requires vpc-cni version `v1.18.4` or later | `bool` | `false` | no | | [enable\_efa\_support](#input\_enable\_efa\_support) | Determines whether to enable Elastic Fabric Adapter (EFA) support | `bool` | `false` | no | | [enable\_monitoring](#input\_enable\_monitoring) | Enables/disables detailed monitoring | `bool` | `true` | no | | [enclave\_options](#input\_enclave\_options) | Enable Nitro Enclaves on launched instances | `map(string)` | `{}` | no | diff --git a/modules/eks-managed-node-group/main.tf b/modules/eks-managed-node-group/main.tf index 42c221dc91..c1581439d2 100644 --- a/modules/eks-managed-node-group/main.tf +++ b/modules/eks-managed-node-group/main.tf @@ -44,13 +44,14 @@ locals { efa_instance_type = try(element(var.instance_types, 0), "") num_network_cards = try(data.aws_ec2_instance_type.this[0].maximum_network_cards, 0) + # Primary network interface must be EFA, remaining can be EFA or EFA-only efa_network_interfaces = [ for i in range(local.num_network_cards) : { associate_public_ip_address = false delete_on_termination = true device_index = i == 0 ? 0 : 1 network_card_index = i - interface_type = "efa" + interface_type = var.enable_efa_only ? contains(concat([0], var.efa_indices), i) ? 
"efa" : "efa-only" : "efa" } ] diff --git a/modules/eks-managed-node-group/variables.tf b/modules/eks-managed-node-group/variables.tf index bb60b85665..e0ee435785 100644 --- a/modules/eks-managed-node-group/variables.tf +++ b/modules/eks-managed-node-group/variables.tf @@ -285,6 +285,19 @@ variable "enable_efa_support" { default = false } +# TODO - make this true by default at next breaking change (remove variable, only pass indices) +variable "enable_efa_only" { + description = "Determines whether to enable EFA (`false`, default) or EFA and EFA-only (`true`) network interfaces. Note: requires vpc-cni version `v1.18.4` or later" + type = bool + default = false +} + +variable "efa_indices" { + description = "The indices of the network interfaces that should be EFA-enabled. Only valid when `enable_efa_support` = `true`" + type = list(number) + default = [0] +} + variable "network_interfaces" { description = "Customize network interfaces to be attached at instance boot time" type = list(any) diff --git a/modules/self-managed-node-group/README.md b/modules/self-managed-node-group/README.md index c8961a104b..d2c53be59a 100644 --- a/modules/self-managed-node-group/README.md +++ b/modules/self-managed-node-group/README.md @@ -120,8 +120,10 @@ module "self_managed_node_group" { | [desired\_size\_type](#input\_desired\_size\_type) | The unit of measurement for the value specified for `desired_size`. Supported for attribute-based instance type selection only. Valid values: `units`, `vcpu`, `memory-mib` | `string` | `null` | no | | [disable\_api\_termination](#input\_disable\_api\_termination) | If true, enables EC2 instance termination protection | `bool` | `null` | no | | [ebs\_optimized](#input\_ebs\_optimized) | If true, the launched EC2 instance will be EBS-optimized | `bool` | `null` | no | +| [efa\_indices](#input\_efa\_indices) | The indices of the network interfaces that should be EFA-enabled. Only valid when `enable_efa_support` = `true` | `list(number)` |
<pre>[<br/>  0<br/>]</pre>
| no | | [elastic\_gpu\_specifications](#input\_elastic\_gpu\_specifications) | The elastic GPU to attach to the instance | `any` | `{}` | no | | [elastic\_inference\_accelerator](#input\_elastic\_inference\_accelerator) | Configuration block containing an Elastic Inference Accelerator to attach to the instance | `map(string)` | `{}` | no | +| [enable\_efa\_only](#input\_enable\_efa\_only) | Determines whether to enable EFA (`false`, default) or EFA and EFA-only (`true`) network interfaces. Note: requires vpc-cni version `v1.18.4` or later | `bool` | `false` | no | | [enable\_efa\_support](#input\_enable\_efa\_support) | Determines whether to enable Elastic Fabric Adapter (EFA) support | `bool` | `false` | no | | [enable\_monitoring](#input\_enable\_monitoring) | Enables/disables detailed monitoring | `bool` | `true` | no | | [enabled\_metrics](#input\_enabled\_metrics) | A list of metrics to collect. The allowed values are `GroupDesiredCapacity`, `GroupInServiceCapacity`, `GroupPendingCapacity`, `GroupMinSize`, `GroupMaxSize`, `GroupInServiceInstances`, `GroupPendingInstances`, `GroupStandbyInstances`, `GroupStandbyCapacity`, `GroupTerminatingCapacity`, `GroupTerminatingInstances`, `GroupTotalCapacity`, `GroupTotalInstances` | `list(string)` | `[]` | no | diff --git a/modules/self-managed-node-group/main.tf b/modules/self-managed-node-group/main.tf index 127591081b..484a92e340 100644 --- a/modules/self-managed-node-group/main.tf +++ b/modules/self-managed-node-group/main.tf @@ -90,7 +90,7 @@ module "user_data" { ################################################################################ data "aws_ec2_instance_type" "this" { - count = local.enable_efa_support ? 1 : 0 + count = var.create && var.enable_efa_support ? 
1 : 0 instance_type = var.instance_type } @@ -101,13 +101,14 @@ locals { instance_type_provided = var.instance_type != "" num_network_cards = try(data.aws_ec2_instance_type.this[0].maximum_network_cards, 0) + # Primary network interface must be EFA, remaining can be EFA or EFA-only efa_network_interfaces = [ for i in range(local.num_network_cards) : { associate_public_ip_address = false delete_on_termination = true device_index = i == 0 ? 0 : 1 network_card_index = i - interface_type = "efa" + interface_type = var.enable_efa_only ? contains(concat([0], var.efa_indices), i) ? "efa" : "efa-only" : "efa" } ] diff --git a/modules/self-managed-node-group/variables.tf b/modules/self-managed-node-group/variables.tf index fd2216d300..9076dab5af 100644 --- a/modules/self-managed-node-group/variables.tf +++ b/modules/self-managed-node-group/variables.tf @@ -334,6 +334,19 @@ variable "enable_efa_support" { default = false } +# TODO - make this true by default at next breaking change (remove variable, only pass indices) +variable "enable_efa_only" { + description = "Determines whether to enable EFA (`false`, default) or EFA and EFA-only (`true`) network interfaces. Note: requires vpc-cni version `v1.18.4` or later" + type = bool + default = false +} + +variable "efa_indices" { + description = "The indices of the network interfaces that should be EFA-enabled. 
Only valid when `enable_efa_support` = `true`" + type = list(number) + default = [0] +} + variable "metadata_options" { description = "Customize the metadata options for the instance" type = map(string) diff --git a/node_groups.tf b/node_groups.tf index 4ac638100a..a8e499abfa 100644 --- a/node_groups.tf +++ b/node_groups.tf @@ -375,6 +375,8 @@ module "eks_managed_node_group" { metadata_options = try(each.value.metadata_options, var.eks_managed_node_group_defaults.metadata_options, local.metadata_options) enable_monitoring = try(each.value.enable_monitoring, var.eks_managed_node_group_defaults.enable_monitoring, true) enable_efa_support = try(each.value.enable_efa_support, var.eks_managed_node_group_defaults.enable_efa_support, false) + enable_efa_only = try(each.value.enable_efa_only, var.eks_managed_node_group_defaults.enable_efa_only, false) + efa_indices = try(each.value.efa_indices, var.eks_managed_node_group_defaults.efa_indices, [0]) create_placement_group = try(each.value.create_placement_group, var.eks_managed_node_group_defaults.create_placement_group, false) placement = try(each.value.placement, var.eks_managed_node_group_defaults.placement, {}) placement_group_az = try(each.value.placement_group_az, var.eks_managed_node_group_defaults.placement_group_az, null) @@ -526,6 +528,8 @@ module "self_managed_node_group" { metadata_options = try(each.value.metadata_options, var.self_managed_node_group_defaults.metadata_options, local.metadata_options) enable_monitoring = try(each.value.enable_monitoring, var.self_managed_node_group_defaults.enable_monitoring, true) enable_efa_support = try(each.value.enable_efa_support, var.self_managed_node_group_defaults.enable_efa_support, false) + enable_efa_only = try(each.value.enable_efa_only, var.self_managed_node_group_defaults.enable_efa_only, false) + efa_indices = try(each.value.efa_indices, var.self_managed_node_group_defaults.efa_indices, [0]) network_interfaces = try(each.value.network_interfaces, 
var.self_managed_node_group_defaults.network_interfaces, []) placement = try(each.value.placement, var.self_managed_node_group_defaults.placement, {}) maintenance_options = try(each.value.maintenance_options, var.self_managed_node_group_defaults.maintenance_options, {}) diff --git a/tests/eks-managed-node-group/main.tf b/tests/eks-managed-node-group/main.tf index 9c59f30507..7292e765dc 100644 --- a/tests/eks-managed-node-group/main.tf +++ b/tests/eks-managed-node-group/main.tf @@ -74,7 +74,7 @@ module "eks" { control_plane_subnet_ids = module.vpc.intra_subnets eks_managed_node_group_defaults = { - ami_type = "AL2_x86_64" + ami_type = "AL2023_x86_64_STANDARD" instance_types = ["m6i.large", "m5.large", "m5n.large", "m5zn.large"] } @@ -184,7 +184,7 @@ module "eks" { # Use a custom AMI custom_ami = { - ami_type = "AL2_ARM_64" + ami_type = "AL2023_ARM_64_STANDARD" # Current default AMI used by managed node groups - pseudo "custom" ami_id = data.aws_ami.eks_default_arm.image_id @@ -211,13 +211,28 @@ module "eks" { ami_id = data.aws_ami.eks_default.image_id enable_bootstrap_user_data = true - pre_bootstrap_user_data = <<-EOT - export FOO=bar - EOT - - post_bootstrap_user_data = <<-EOT - echo "you are free little kubelet!" 
- EOT + cloudinit_pre_nodeadm = [{ + content = <<-EOT + --- + apiVersion: node.eks.aws/v1alpha1 + kind: NodeConfig + spec: + kubelet: + config: + shutdownGracePeriod: 30s + featureGates: + DisableKubeletCloudCredentialProviders: true + EOT + content_type = "application/node.eks.aws" + }] + + # This is only possible with a custom AMI or self-managed node group + cloudinit_post_nodeadm = [{ + content = <<-EOT + echo "All done" + EOT + content_type = "text/x-shellscript; charset=\"us-ascii\"" + }] capacity_type = "SPOT" force_update_version = true @@ -227,14 +242,6 @@ module "eks" { GithubOrg = "terraform-aws-modules" } - taints = [ - { - key = "dedicated" - value = "gpuGroup" - effect = "NO_SCHEDULE" - } - ] - update_config = { max_unavailable_percentage = 33 # or set `max_unavailable` } @@ -306,19 +313,53 @@ module "eks" { # Can be enabled when appropriate for testing/validation create = false - ami_type = "AL2_x86_64_GPU" - instance_types = ["trn1n.32xlarge"] + # The EKS AL2023 NVIDIA AMI provides all of the necessary components + # for accelerated workloads w/ EFA + ami_type = "AL2023_x86_64_NVIDIA" + instance_types = ["p5e.48xlarge"] - enable_efa_support = true - pre_bootstrap_user_data = <<-EOT - # Mount NVME instance store volumes since they are typically - # available on instances that support EFA - setup-local-disks raid0 - EOT + # Mount instance store volumes in RAID-0 for kubelet and containerd + # https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0 + cloudinit_pre_nodeadm = [ + { + content_type = "application/node.eks.aws" + content = <<-EOT + --- + apiVersion: node.eks.aws/v1alpha1 + kind: NodeConfig + spec: + instance: + localStorage: + strategy: RAID0 + EOT + } + ] + + # This will: + # 1. Create a placement group to place the instances close to one another + # 2. Ignore subnets that reside in AZs that do not support the instance type + # 3. 
Expose all of the available EFA interfaces on the launch template + enable_efa_support = true + enable_efa_only = true + efa_indices = [0, 4, 8, 12] - min_size = 2 - max_size = 2 - desired_size = 2 + min_size = 1 + max_size = 1 + desired_size = 1 + + labels = { + "vpc.amazonaws.com/efa.present" = "true" + "nvidia.com/gpu.present" = "true" + } + + taints = { + # Ensure only GPU workloads are scheduled on this node group + gpu = { + key = "nvidia.com/gpu" + value = "true" + effect = "NO_SCHEDULE" + } + } } } @@ -532,7 +573,7 @@ data "aws_ami" "eks_default" { filter { name = "name" - values = ["amazon-eks-node-${local.cluster_version}-v*"] + values = ["amazon-eks-node-al2023-x86_64-standard-${local.cluster_version}-v*"] } } @@ -542,7 +583,7 @@ data "aws_ami" "eks_default_arm" { filter { name = "name" - values = ["amazon-eks-arm64-node-${local.cluster_version}-v*"] + values = ["amazon-eks-node-al2023-arm64-standard-${local.cluster_version}-v*"] } } diff --git a/tests/self-managed-node-group/main.tf b/tests/self-managed-node-group/main.tf index b2c1423445..dee3274dc4 100644 --- a/tests/self-managed-node-group/main.tf +++ b/tests/self-managed-node-group/main.tf @@ -61,6 +61,9 @@ module "eks" { } self_managed_node_group_defaults = { + ami_type = "AL2023_x86_64_STANDARD" + ami_id = data.aws_ami.eks_default.image_id + # enable discovery of autoscaling groups by cluster-autoscaler autoscaling_group_tags = { "k8s.io/cluster-autoscaler/enabled" : true, @@ -72,29 +75,6 @@ module "eks" { # Default node group - as provisioned by the module defaults default_node_group = {} - # AL2023 node group utilizing new user data format which utilizes nodeadm - # to join nodes to the cluster (instead of /etc/eks/bootstrap.sh) - al2023_nodeadm = { - ami_type = "AL2023_x86_64_STANDARD" - - cloudinit_pre_nodeadm = [ - { - content_type = "application/node.eks.aws" - content = <<-EOT - --- - apiVersion: node.eks.aws/v1alpha1 - kind: NodeConfig - spec: - kubelet: - config: - shutdownGracePeriod: 
30s - featureGates: - DisableKubeletCloudCredentialProviders: true - EOT - } - ] - } - # Bottlerocket node group bottlerocket = { name = "bottlerocket-self-mng" @@ -138,8 +118,18 @@ module "eks" { max_size = 5 desired_size = 2 - ami_type = "AL2_x86_64" - bootstrap_extra_args = "--kubelet-extra-args '--node-labels=node.kubernetes.io/lifecycle=spot'" + cloudinit_pre_nodeadm = [{ + content = <<-EOT + --- + apiVersion: node.eks.aws/v1alpha1 + kind: NodeConfig + spec: + kubelet: + flags: + - --node-labels=node.kubernetes.io/lifecycle=spot + EOT + content_type = "application/node.eks.aws" + }] use_mixed_instances_policy = true mixed_instances_policy = { @@ -173,16 +163,18 @@ module "eks" { max_size = 7 desired_size = 1 - ami_id = data.aws_ami.eks_default.id - ami_type = "AL2_x86_64" - - pre_bootstrap_user_data = <<-EOT - export FOO=bar - EOT - - post_bootstrap_user_data = <<-EOT - echo "you are free little kubelet!" - EOT + cloudinit_pre_nodeadm = [{ + content = <<-EOT + --- + apiVersion: node.eks.aws/v1alpha1 + kind: NodeConfig + spec: + kubelet: + flags: + - --node-labels=node.kubernetes.io/lifecycle=spot + EOT + content_type = "application/node.eks.aws" + }] instance_type = "m6i.large" @@ -215,9 +207,23 @@ module "eks" { max_size = 2 desired_size = 1 - ami_type = "AL2_x86_64" bootstrap_extra_args = "--kubelet-extra-args '--node-labels=node.kubernetes.io/lifecycle=spot'" + cloudinit_pre_nodeadm = [{ + content = <<-EOT + --- + apiVersion: node.eks.aws/v1alpha1 + kind: NodeConfig + spec: + kubelet: + config: + shutdownGracePeriod: 30s + featureGates: + DisableKubeletCloudCredentialProviders: true + EOT + content_type = "application/node.eks.aws" + }] + instance_type = null # launch template configuration @@ -290,19 +296,53 @@ module "eks" { # Can be enabled when appropriate for testing/validation create = false - ami_type = "AL2_x86_64_GPU" - instance_type = "trn1n.32xlarge" + # The EKS AL2023 NVIDIA AMI provides all of the necessary components + # for accelerated 
workloads w/ EFA + ami_type = "AL2023_x86_64_NVIDIA" + instance_type = "p5e.48xlarge" - enable_efa_support = true - pre_bootstrap_user_data = <<-EOT - # Mount NVME instance store volumes since they are typically - # available on instances that support EFA - setup-local-disks raid0 - EOT + # Mount instance store volumes in RAID-0 for kubelet and containerd + # https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0 + cloudinit_pre_nodeadm = [ + { + content_type = "application/node.eks.aws" + content = <<-EOT + --- + apiVersion: node.eks.aws/v1alpha1 + kind: NodeConfig + spec: + instance: + localStorage: + strategy: RAID0 + EOT + } + ] + + # This will: + # 1. Create a placement group to place the instances close to one another + # 2. Ignore subnets that reside in AZs that do not support the instance type + # 3. Expose all of the available EFA interfaces on the launch template + enable_efa_support = true + enable_efa_only = true + efa_indices = [0, 4, 8, 12] min_size = 2 max_size = 2 desired_size = 2 + + labels = { + "vpc.amazonaws.com/efa.present" = "true" + "nvidia.com/gpu.present" = "true" + } + + taints = { + # Ensure only GPU workloads are scheduled on this node group + gpu = { + key = "nvidia.com/gpu" + value = "true" + effect = "NO_SCHEDULE" + } + } } } @@ -354,7 +394,7 @@ data "aws_ami" "eks_default" { filter { name = "name" - values = ["amazon-eks-node-${local.cluster_version}-v*"] + values = ["amazon-eks-node-al2023-x86_64-standard-${local.cluster_version}-v*"] } }