From c6da22c78f60a8643a6c76f97c93724f4e1f4e5a Mon Sep 17 00:00:00 2001
From: Bryant Biggs <bryantbiggs@gmail.com>
Date: Sat, 2 Nov 2024 13:33:13 +0000
Subject: [PATCH] feat: Add support for creating `efa-only` network interfaces
 (#3196)

---
 .pre-commit-config.yaml                      |   2 +-
 modules/eks-managed-node-group/README.md     |   2 +
 modules/eks-managed-node-group/main.tf       |   3 +-
 modules/eks-managed-node-group/variables.tf  |  13 ++
 modules/self-managed-node-group/README.md    |   2 +
 modules/self-managed-node-group/main.tf      |   5 +-
 modules/self-managed-node-group/variables.tf |  13 ++
 node_groups.tf                               |   4 +
 tests/eks-managed-node-group/main.tf         | 101 +++++++++-----
 tests/self-managed-node-group/main.tf        | 130 ++++++++++++-------
 10 files changed, 196 insertions(+), 79 deletions(-)
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 4f0721881c..baacea01bb 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/antonbabenko/pre-commit-terraform
-    rev: v1.96.1
+    rev: v1.96.2
     hooks:
       - id: terraform_fmt
       - id: terraform_docs
diff --git a/modules/eks-managed-node-group/README.md b/modules/eks-managed-node-group/README.md
index 709885898d..ace9106c3d 100644
--- a/modules/eks-managed-node-group/README.md
+++ b/modules/eks-managed-node-group/README.md
@@ -132,9 +132,11 @@ module "eks_managed_node_group" {
 | <a name="input_disable_api_termination"></a> [disable\_api\_termination](#input\_disable\_api\_termination) | If true, enables EC2 instance termination protection | `bool` | `null` | no |
 | <a name="input_disk_size"></a> [disk\_size](#input\_disk\_size) | Disk size in GiB for nodes. Defaults to `20`. Only valid when `use_custom_launch_template` = `false` | `number` | `null` | no |
 | <a name="input_ebs_optimized"></a> [ebs\_optimized](#input\_ebs\_optimized) | If true, the launched EC2 instance(s) will be EBS-optimized | `bool` | `null` | no |
+| <a name="input_efa_indices"></a> [efa\_indices](#input\_efa\_indices) | The indices of the network interfaces that should be EFA-enabled. Only valid when `enable_efa_support` = `true` | `list(number)` | <pre>[<br/>  0<br/>]</pre> | no |
 | <a name="input_elastic_gpu_specifications"></a> [elastic\_gpu\_specifications](#input\_elastic\_gpu\_specifications) | The elastic GPU to attach to the instance | `any` | `{}` | no |
 | <a name="input_elastic_inference_accelerator"></a> [elastic\_inference\_accelerator](#input\_elastic\_inference\_accelerator) | Configuration block containing an Elastic Inference Accelerator to attach to the instance | `map(string)` | `{}` | no |
 | <a name="input_enable_bootstrap_user_data"></a> [enable\_bootstrap\_user\_data](#input\_enable\_bootstrap\_user\_data) | Determines whether the bootstrap configurations are populated within the user data template. Only valid when using a custom AMI via `ami_id` | `bool` | `false` | no |
+| <a name="input_enable_efa_only"></a> [enable\_efa\_only](#input\_enable\_efa\_only) | Determines whether to enable EFA (`false`, default) or EFA and EFA-only (`true`) network interfaces. Note: requires vpc-cni version `v1.18.4` or later | `bool` | `false` | no |
 | <a name="input_enable_efa_support"></a> [enable\_efa\_support](#input\_enable\_efa\_support) | Determines whether to enable Elastic Fabric Adapter (EFA) support | `bool` | `false` | no |
 | <a name="input_enable_monitoring"></a> [enable\_monitoring](#input\_enable\_monitoring) | Enables/disables detailed monitoring | `bool` | `true` | no |
 | <a name="input_enclave_options"></a> [enclave\_options](#input\_enclave\_options) | Enable Nitro Enclaves on launched instances | `map(string)` | `{}` | no |
diff --git a/modules/eks-managed-node-group/main.tf b/modules/eks-managed-node-group/main.tf
index 42c221dc91..c1581439d2 100644
--- a/modules/eks-managed-node-group/main.tf
+++ b/modules/eks-managed-node-group/main.tf
@@ -44,13 +44,14 @@ locals {
   efa_instance_type = try(element(var.instance_types, 0), "")
   num_network_cards = try(data.aws_ec2_instance_type.this[0].maximum_network_cards, 0)
 
+  # Primary network interface must be EFA, remaining can be EFA or EFA-only
   efa_network_interfaces = [
     for i in range(local.num_network_cards) : {
       associate_public_ip_address = false
       delete_on_termination       = true
       device_index                = i == 0 ? 0 : 1
       network_card_index          = i
-      interface_type              = "efa"
+      interface_type              = var.enable_efa_only && !contains(concat([0], var.efa_indices), i) ? "efa-only" : "efa"
     }
   ]
 
diff --git a/modules/eks-managed-node-group/variables.tf b/modules/eks-managed-node-group/variables.tf
index bb60b85665..e0ee435785 100644
--- a/modules/eks-managed-node-group/variables.tf
+++ b/modules/eks-managed-node-group/variables.tf
@@ -285,6 +285,19 @@ variable "enable_efa_support" {
   default     = false
 }
 
+# TODO - make this true by default at next breaking change (remove variable, only pass indices)
+variable "enable_efa_only" {
+  description = "Determines whether to enable EFA (`false`, default) or EFA and EFA-only (`true`) network interfaces. Note: requires vpc-cni version `v1.18.4` or later"
+  type        = bool
+  default     = false
+}
+
+variable "efa_indices" {
+  description = "The indices of the network interfaces that should be EFA-enabled. Only valid when `enable_efa_support` = `true`"
+  type        = list(number)
+  default     = [0]
+}
+
 variable "network_interfaces" {
   description = "Customize network interfaces to be attached at instance boot time"
   type        = list(any)
diff --git a/modules/self-managed-node-group/README.md b/modules/self-managed-node-group/README.md
index c8961a104b..d2c53be59a 100644
--- a/modules/self-managed-node-group/README.md
+++ b/modules/self-managed-node-group/README.md
@@ -120,8 +120,10 @@ module "self_managed_node_group" {
 | <a name="input_desired_size_type"></a> [desired\_size\_type](#input\_desired\_size\_type) | The unit of measurement for the value specified for `desired_size`. Supported for attribute-based instance type selection only. Valid values: `units`, `vcpu`, `memory-mib` | `string` | `null` | no |
 | <a name="input_disable_api_termination"></a> [disable\_api\_termination](#input\_disable\_api\_termination) | If true, enables EC2 instance termination protection | `bool` | `null` | no |
 | <a name="input_ebs_optimized"></a> [ebs\_optimized](#input\_ebs\_optimized) | If true, the launched EC2 instance will be EBS-optimized | `bool` | `null` | no |
+| <a name="input_efa_indices"></a> [efa\_indices](#input\_efa\_indices) | The indices of the network interfaces that should be EFA-enabled. Only valid when `enable_efa_support` = `true` | `list(number)` | <pre>[<br/>  0<br/>]</pre> | no |
 | <a name="input_elastic_gpu_specifications"></a> [elastic\_gpu\_specifications](#input\_elastic\_gpu\_specifications) | The elastic GPU to attach to the instance | `any` | `{}` | no |
 | <a name="input_elastic_inference_accelerator"></a> [elastic\_inference\_accelerator](#input\_elastic\_inference\_accelerator) | Configuration block containing an Elastic Inference Accelerator to attach to the instance | `map(string)` | `{}` | no |
+| <a name="input_enable_efa_only"></a> [enable\_efa\_only](#input\_enable\_efa\_only) | Determines whether to enable EFA (`false`, default) or EFA and EFA-only (`true`) network interfaces. Note: requires vpc-cni version `v1.18.4` or later | `bool` | `false` | no |
 | <a name="input_enable_efa_support"></a> [enable\_efa\_support](#input\_enable\_efa\_support) | Determines whether to enable Elastic Fabric Adapter (EFA) support | `bool` | `false` | no |
 | <a name="input_enable_monitoring"></a> [enable\_monitoring](#input\_enable\_monitoring) | Enables/disables detailed monitoring | `bool` | `true` | no |
 | <a name="input_enabled_metrics"></a> [enabled\_metrics](#input\_enabled\_metrics) | A list of metrics to collect. The allowed values are `GroupDesiredCapacity`, `GroupInServiceCapacity`, `GroupPendingCapacity`, `GroupMinSize`, `GroupMaxSize`, `GroupInServiceInstances`, `GroupPendingInstances`, `GroupStandbyInstances`, `GroupStandbyCapacity`, `GroupTerminatingCapacity`, `GroupTerminatingInstances`, `GroupTotalCapacity`, `GroupTotalInstances` | `list(string)` | `[]` | no |
diff --git a/modules/self-managed-node-group/main.tf b/modules/self-managed-node-group/main.tf
index 127591081b..484a92e340 100644
--- a/modules/self-managed-node-group/main.tf
+++ b/modules/self-managed-node-group/main.tf
@@ -90,7 +90,7 @@ module "user_data" {
 ################################################################################
 
 data "aws_ec2_instance_type" "this" {
-  count = local.enable_efa_support ? 1 : 0
+  count = var.create && var.enable_efa_support ? 1 : 0
 
   instance_type = var.instance_type
 }
@@ -101,13 +101,14 @@ locals {
   instance_type_provided = var.instance_type != ""
   num_network_cards      = try(data.aws_ec2_instance_type.this[0].maximum_network_cards, 0)
 
+  # Primary network interface must be EFA, remaining can be EFA or EFA-only
   efa_network_interfaces = [
     for i in range(local.num_network_cards) : {
       associate_public_ip_address = false
       delete_on_termination       = true
       device_index                = i == 0 ? 0 : 1
       network_card_index          = i
-      interface_type              = "efa"
+      interface_type              = var.enable_efa_only && !contains(concat([0], var.efa_indices), i) ? "efa-only" : "efa"
     }
   ]
 
diff --git a/modules/self-managed-node-group/variables.tf b/modules/self-managed-node-group/variables.tf
index fd2216d300..9076dab5af 100644
--- a/modules/self-managed-node-group/variables.tf
+++ b/modules/self-managed-node-group/variables.tf
@@ -334,6 +334,19 @@ variable "enable_efa_support" {
   default     = false
 }
 
+# TODO - make this true by default at next breaking change (remove variable, only pass indices)
+variable "enable_efa_only" {
+  description = "Determines whether to enable EFA (`false`, default) or EFA and EFA-only (`true`) network interfaces. Note: requires vpc-cni version `v1.18.4` or later"
+  type        = bool
+  default     = false
+}
+
+variable "efa_indices" {
+  description = "The indices of the network interfaces that should be EFA-enabled. Only valid when `enable_efa_support` = `true`"
+  type        = list(number)
+  default     = [0]
+}
+
 variable "metadata_options" {
   description = "Customize the metadata options for the instance"
   type        = map(string)
diff --git a/node_groups.tf b/node_groups.tf
index 4ac638100a..a8e499abfa 100644
--- a/node_groups.tf
+++ b/node_groups.tf
@@ -375,6 +375,8 @@ module "eks_managed_node_group" {
   metadata_options                   = try(each.value.metadata_options, var.eks_managed_node_group_defaults.metadata_options, local.metadata_options)
   enable_monitoring                  = try(each.value.enable_monitoring, var.eks_managed_node_group_defaults.enable_monitoring, true)
   enable_efa_support                 = try(each.value.enable_efa_support, var.eks_managed_node_group_defaults.enable_efa_support, false)
+  enable_efa_only                    = try(each.value.enable_efa_only, var.eks_managed_node_group_defaults.enable_efa_only, false)
+  efa_indices                        = try(each.value.efa_indices, var.eks_managed_node_group_defaults.efa_indices, [0])
   create_placement_group             = try(each.value.create_placement_group, var.eks_managed_node_group_defaults.create_placement_group, false)
   placement                          = try(each.value.placement, var.eks_managed_node_group_defaults.placement, {})
   placement_group_az                 = try(each.value.placement_group_az, var.eks_managed_node_group_defaults.placement_group_az, null)
@@ -526,6 +528,8 @@ module "self_managed_node_group" {
   metadata_options                   = try(each.value.metadata_options, var.self_managed_node_group_defaults.metadata_options, local.metadata_options)
   enable_monitoring                  = try(each.value.enable_monitoring, var.self_managed_node_group_defaults.enable_monitoring, true)
   enable_efa_support                 = try(each.value.enable_efa_support, var.self_managed_node_group_defaults.enable_efa_support, false)
+  enable_efa_only                    = try(each.value.enable_efa_only, var.self_managed_node_group_defaults.enable_efa_only, false)
+  efa_indices                        = try(each.value.efa_indices, var.self_managed_node_group_defaults.efa_indices, [0])
   network_interfaces                 = try(each.value.network_interfaces, var.self_managed_node_group_defaults.network_interfaces, [])
   placement                          = try(each.value.placement, var.self_managed_node_group_defaults.placement, {})
   maintenance_options                = try(each.value.maintenance_options, var.self_managed_node_group_defaults.maintenance_options, {})
diff --git a/tests/eks-managed-node-group/main.tf b/tests/eks-managed-node-group/main.tf
index 9c59f30507..7292e765dc 100644
--- a/tests/eks-managed-node-group/main.tf
+++ b/tests/eks-managed-node-group/main.tf
@@ -74,7 +74,7 @@ module "eks" {
   control_plane_subnet_ids = module.vpc.intra_subnets
 
   eks_managed_node_group_defaults = {
-    ami_type       = "AL2_x86_64"
+    ami_type       = "AL2023_x86_64_STANDARD"
     instance_types = ["m6i.large", "m5.large", "m5n.large", "m5zn.large"]
   }
 
@@ -184,7 +184,7 @@ module "eks" {
 
     # Use a custom AMI
     custom_ami = {
-      ami_type = "AL2_ARM_64"
+      ami_type = "AL2023_ARM_64_STANDARD"
       # Current default AMI used by managed node groups - pseudo "custom"
       ami_id = data.aws_ami.eks_default_arm.image_id
 
@@ -211,13 +211,28 @@ module "eks" {
       ami_id                     = data.aws_ami.eks_default.image_id
       enable_bootstrap_user_data = true
 
-      pre_bootstrap_user_data = <<-EOT
-        export FOO=bar
-      EOT
-
-      post_bootstrap_user_data = <<-EOT
-        echo "you are free little kubelet!"
-      EOT
+      cloudinit_pre_nodeadm = [{
+        content      = <<-EOT
+          ---
+          apiVersion: node.eks.aws/v1alpha1
+          kind: NodeConfig
+          spec:
+            kubelet:
+              config:
+                shutdownGracePeriod: 30s
+                featureGates:
+                  DisableKubeletCloudCredentialProviders: true
+        EOT
+        content_type = "application/node.eks.aws"
+      }]
+
+      # This is only possible with a custom AMI or self-managed node group
+      cloudinit_post_nodeadm = [{
+        content      = <<-EOT
+          echo "All done"
+        EOT
+        content_type = "text/x-shellscript; charset=\"us-ascii\""
+      }]
 
       capacity_type        = "SPOT"
       force_update_version = true
@@ -227,14 +242,6 @@ module "eks" {
         GithubOrg  = "terraform-aws-modules"
       }
 
-      taints = [
-        {
-          key    = "dedicated"
-          value  = "gpuGroup"
-          effect = "NO_SCHEDULE"
-        }
-      ]
-
       update_config = {
         max_unavailable_percentage = 33 # or set `max_unavailable`
       }
@@ -306,19 +313,53 @@ module "eks" {
       # Can be enabled when appropriate for testing/validation
       create = false
 
-      ami_type       = "AL2_x86_64_GPU"
-      instance_types = ["trn1n.32xlarge"]
+      # The EKS AL2023 NVIDIA AMI provides all of the necessary components
+      # for accelerated workloads w/ EFA
+      ami_type       = "AL2023_x86_64_NVIDIA"
+      instance_types = ["p5e.48xlarge"]
 
-      enable_efa_support      = true
-      pre_bootstrap_user_data = <<-EOT
-        # Mount NVME instance store volumes since they are typically
-        # available on instances that support EFA
-        setup-local-disks raid0
-      EOT
+      # Mount instance store volumes in RAID-0 for kubelet and containerd
+      # https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0
+      cloudinit_pre_nodeadm = [
+        {
+          content_type = "application/node.eks.aws"
+          content      = <<-EOT
+            ---
+            apiVersion: node.eks.aws/v1alpha1
+            kind: NodeConfig
+            spec:
+              instance:
+                localStorage:
+                  strategy: RAID0
+          EOT
+        }
+      ]
+
+      # This will:
+      # 1. Create a placement group to place the instances close to one another
+      # 2. Ignore subnets that reside in AZs that do not support the instance type
+      # 3. Expose all of the available EFA interfaces on the launch template
+      enable_efa_support = true
+      enable_efa_only    = true
+      efa_indices        = [0, 4, 8, 12]
 
-      min_size     = 2
-      max_size     = 2
-      desired_size = 2
+      min_size     = 1
+      max_size     = 1
+      desired_size = 1
+
+      labels = {
+        "vpc.amazonaws.com/efa.present" = "true"
+        "nvidia.com/gpu.present"        = "true"
+      }
+
+      taints = {
+        # Ensure only GPU workloads are scheduled on this node group
+        gpu = {
+          key    = "nvidia.com/gpu"
+          value  = "true"
+          effect = "NO_SCHEDULE"
+        }
+      }
     }
   }
 
@@ -532,7 +573,7 @@ data "aws_ami" "eks_default" {
 
   filter {
     name   = "name"
-    values = ["amazon-eks-node-${local.cluster_version}-v*"]
+    values = ["amazon-eks-node-al2023-x86_64-standard-${local.cluster_version}-v*"]
   }
 }
 
@@ -542,7 +583,7 @@ data "aws_ami" "eks_default_arm" {
 
   filter {
     name   = "name"
-    values = ["amazon-eks-arm64-node-${local.cluster_version}-v*"]
+    values = ["amazon-eks-node-al2023-arm64-standard-${local.cluster_version}-v*"]
   }
 }
 
diff --git a/tests/self-managed-node-group/main.tf b/tests/self-managed-node-group/main.tf
index b2c1423445..dee3274dc4 100644
--- a/tests/self-managed-node-group/main.tf
+++ b/tests/self-managed-node-group/main.tf
@@ -61,6 +61,9 @@ module "eks" {
   }
 
   self_managed_node_group_defaults = {
+    ami_type = "AL2023_x86_64_STANDARD"
+    ami_id   = data.aws_ami.eks_default.image_id
+
     # enable discovery of autoscaling groups by cluster-autoscaler
     autoscaling_group_tags = {
       "k8s.io/cluster-autoscaler/enabled" : true,
@@ -72,29 +75,6 @@ module "eks" {
     # Default node group - as provisioned by the module defaults
     default_node_group = {}
 
-    # AL2023 node group utilizing new user data format which utilizes nodeadm
-    # to join nodes to the cluster (instead of /etc/eks/bootstrap.sh)
-    al2023_nodeadm = {
-      ami_type = "AL2023_x86_64_STANDARD"
-
-      cloudinit_pre_nodeadm = [
-        {
-          content_type = "application/node.eks.aws"
-          content      = <<-EOT
-            ---
-            apiVersion: node.eks.aws/v1alpha1
-            kind: NodeConfig
-            spec:
-              kubelet:
-                config:
-                  shutdownGracePeriod: 30s
-                  featureGates:
-                    DisableKubeletCloudCredentialProviders: true
-          EOT
-        }
-      ]
-    }
-
     # Bottlerocket node group
     bottlerocket = {
       name = "bottlerocket-self-mng"
@@ -138,8 +118,18 @@ module "eks" {
       max_size     = 5
       desired_size = 2
 
-      ami_type             = "AL2_x86_64"
-      bootstrap_extra_args = "--kubelet-extra-args '--node-labels=node.kubernetes.io/lifecycle=spot'"
+      cloudinit_pre_nodeadm = [{
+        content      = <<-EOT
+          ---
+          apiVersion: node.eks.aws/v1alpha1
+          kind: NodeConfig
+          spec:
+            kubelet:
+              flags:
+                - --node-labels=node.kubernetes.io/lifecycle=spot
+        EOT
+        content_type = "application/node.eks.aws"
+      }]
 
       use_mixed_instances_policy = true
       mixed_instances_policy = {
@@ -173,16 +163,18 @@ module "eks" {
       max_size     = 7
       desired_size = 1
 
-      ami_id   = data.aws_ami.eks_default.id
-      ami_type = "AL2_x86_64"
-
-      pre_bootstrap_user_data = <<-EOT
-        export FOO=bar
-      EOT
-
-      post_bootstrap_user_data = <<-EOT
-        echo "you are free little kubelet!"
-      EOT
+      cloudinit_pre_nodeadm = [{
+        content      = <<-EOT
+          ---
+          apiVersion: node.eks.aws/v1alpha1
+          kind: NodeConfig
+          spec:
+            kubelet:
+              flags:
+                - --node-labels=node.kubernetes.io/lifecycle=spot
+        EOT
+        content_type = "application/node.eks.aws"
+      }]
 
       instance_type = "m6i.large"
 
@@ -215,9 +207,23 @@ module "eks" {
         max_size     = 2
         desired_size = 1
 
-        ami_type             = "AL2_x86_64"
         bootstrap_extra_args = "--kubelet-extra-args '--node-labels=node.kubernetes.io/lifecycle=spot'"
 
+        cloudinit_pre_nodeadm = [{
+          content_type = "application/node.eks.aws"
+          content      = <<-EOT
+            ---
+            apiVersion: node.eks.aws/v1alpha1
+            kind: NodeConfig
+            spec:
+              kubelet:
+                config:
+                  shutdownGracePeriod: 30s
+                  featureGates:
+                    DisableKubeletCloudCredentialProviders: true
+          EOT
+        }]
+
         instance_type = null
 
         # launch template configuration
@@ -290,19 +296,53 @@ module "eks" {
       # Can be enabled when appropriate for testing/validation
       create = false
 
-      ami_type      = "AL2_x86_64_GPU"
-      instance_type = "trn1n.32xlarge"
+      # The EKS AL2023 NVIDIA AMI provides all of the necessary components
+      # for accelerated workloads w/ EFA
+      ami_type      = "AL2023_x86_64_NVIDIA"
+      instance_type = "p5e.48xlarge" # self-managed node groups take a single `instance_type` string
 
-      enable_efa_support      = true
-      pre_bootstrap_user_data = <<-EOT
-        # Mount NVME instance store volumes since they are typically
-        # available on instances that support EFA
-        setup-local-disks raid0
-      EOT
+      # Mount instance store volumes in RAID-0 for kubelet and containerd
+      # https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0
+      cloudinit_pre_nodeadm = [
+        {
+          content_type = "application/node.eks.aws"
+          content      = <<-EOT
+            ---
+            apiVersion: node.eks.aws/v1alpha1
+            kind: NodeConfig
+            spec:
+              instance:
+                localStorage:
+                  strategy: RAID0
+          EOT
+        }
+      ]
+
+      # This will:
+      # 1. Create a placement group to place the instances close to one another
+      # 2. Ignore subnets that reside in AZs that do not support the instance type
+      # 3. Expose all of the available EFA interfaces on the launch template
+      enable_efa_support = true
+      enable_efa_only    = true
+      efa_indices        = [0, 4, 8, 12]
 
       min_size     = 2
       max_size     = 2
       desired_size = 2
+
+      labels = {
+        "vpc.amazonaws.com/efa.present" = "true"
+        "nvidia.com/gpu.present"        = "true"
+      }
+
+      taints = {
+        # Ensure only GPU workloads are scheduled on this node group
+        gpu = {
+          key    = "nvidia.com/gpu"
+          value  = "true"
+          effect = "NO_SCHEDULE"
+        }
+      }
     }
   }
 
@@ -354,7 +394,7 @@ data "aws_ami" "eks_default" {
 
   filter {
     name   = "name"
-    values = ["amazon-eks-node-${local.cluster_version}-v*"]
+    values = ["amazon-eks-node-al2023-x86_64-standard-${local.cluster_version}-v*"]
   }
 }