From 1b3a0f6ebc9b07c86a3c6aadbda98058213c18e6 Mon Sep 17 00:00:00 2001 From: Dalton Hubble Date: Sun, 8 Nov 2020 10:51:42 -0800 Subject: [PATCH] Add experimental Fedora CoreOS arm64 support on AWS * Add experimental `arch` variable to Fedora CoreOS AWS, accepting amd64 (default) or arm64 to support native arm64/aarch64 clusters or mixed/hybrid clusters with a worker pool of arm64 workers * Add `daemonset_tolerations` variable to cluster module (experimental) * Add `node_taints` variable to workers module * Requires flannel CNI and experimental Poseidon-built arm64 Fedora CoreOS AMIs (published to us-east-1, us-east-2, and us-west-1) WARN: * Our AMIs are experimental, may be removed at any time, and will be removed when Fedora CoreOS publishes official arm64 AMIs. Do NOT use in production Related: * https://github.com/poseidon/typhoon/pull/682 --- CHANGES.md | 9 ++ aws/fedora-coreos/kubernetes/ami.tf | 24 ++++ aws/fedora-coreos/kubernetes/bootstrap.tf | 1 + aws/fedora-coreos/kubernetes/controllers.tf | 6 +- .../kubernetes/fcc/controller.yaml | 3 +- aws/fedora-coreos/kubernetes/variables.tf | 12 ++ aws/fedora-coreos/kubernetes/workers.tf | 1 + aws/fedora-coreos/kubernetes/workers/ami.tf | 24 ++++ .../kubernetes/workers/fcc/worker.yaml | 3 + .../kubernetes/workers/variables.tf | 14 +++ .../kubernetes/workers/workers.tf | 3 +- docs/advanced/arm64.md | 116 ++++++++++++++++++ docs/advanced/overview.md | 1 + mkdocs.yml | 1 + requirements.txt | 2 +- 15 files changed, 214 insertions(+), 6 deletions(-) create mode 100644 docs/advanced/arm64.md diff --git a/CHANGES.md b/CHANGES.md index 2e2940c88..afe2a20ed 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -15,6 +15,15 @@ Notable changes between versions. * Allow a snippet with a systemd dropin to set an alternate image (e.g. 
mirror) * Fix local node delete oneshot on node shutdown ([#856](https://github.com/poseidon/typhoon/pull/855)) +#### AWS + +* Add experimental Fedora CoreOS arm64 support ([docs](https://typhoon.psdn.io/advanced/arm64/), [#875](https://github.com/poseidon/typhoon/pull/875)) + * Allow arm64 full-cluster or mixed/hybrid cluster with worker pools + * Add `arch` variable to cluster module + * Add `daemonset_tolerations` variable to cluster module + * Add `node_taints` variable to workers module + * Requires flannel CNI provider and use of experimental AMI (see docs) + ### Flatcar Linux * Rename `container-linux` modules to `flatcar-linux` ([#858](https://github.com/poseidon/typhoon/issues/858)) (**action required**) diff --git a/aws/fedora-coreos/kubernetes/ami.tf b/aws/fedora-coreos/kubernetes/ami.tf index a7ab184bd..2ac01d446 100644 --- a/aws/fedora-coreos/kubernetes/ami.tf +++ b/aws/fedora-coreos/kubernetes/ami.tf @@ -18,3 +18,27 @@ data "aws_ami" "fedora-coreos" { values = ["Fedora CoreOS ${var.os_stream} *"] } } + +# Experimental Fedora CoreOS arm64 / aarch64 AMIs from Poseidon +# WARNING: These AMIs will be removed when Fedora CoreOS publishes arm64 AMIs +# and may be removed for any reason before then as well. Do not use in production. 
+data "aws_ami" "fedora-coreos-arm" { + most_recent = true + owners = ["099663496933"] + + filter { + name = "architecture" + values = ["arm64"] + } + + filter { + name = "virtualization-type" + values = ["hvm"] + } + + filter { + name = "name" + values = ["fedora-coreos-*"] + } +} + diff --git a/aws/fedora-coreos/kubernetes/bootstrap.tf b/aws/fedora-coreos/kubernetes/bootstrap.tf index 88515684f..d357034f4 100644 --- a/aws/fedora-coreos/kubernetes/bootstrap.tf +++ b/aws/fedora-coreos/kubernetes/bootstrap.tf @@ -12,6 +12,7 @@ module "bootstrap" { cluster_domain_suffix = var.cluster_domain_suffix enable_reporting = var.enable_reporting enable_aggregation = var.enable_aggregation + daemonset_tolerations = var.daemonset_tolerations trusted_certs_dir = "/etc/pki/tls/certs" } diff --git a/aws/fedora-coreos/kubernetes/controllers.tf b/aws/fedora-coreos/kubernetes/controllers.tf index 2fd253db7..1ab26aaa6 100644 --- a/aws/fedora-coreos/kubernetes/controllers.tf +++ b/aws/fedora-coreos/kubernetes/controllers.tf @@ -22,9 +22,8 @@ resource "aws_instance" "controllers" { } instance_type = var.controller_type - - ami = data.aws_ami.fedora-coreos.image_id - user_data = data.ct_config.controller-ignitions.*.rendered[count.index] + ami = var.arch == "arm64" ? data.aws_ami.fedora-coreos-arm.image_id : data.aws_ami.fedora-coreos.image_id + user_data = data.ct_config.controller-ignitions.*.rendered[count.index] # storage root_block_device { @@ -63,6 +62,7 @@ data "template_file" "controller-configs" { vars = { # Cannot use cyclic dependencies on controllers or their DNS records + etcd_arch = var.arch == "arm64" ? "-arm64" : "" etcd_name = "etcd${count.index}" etcd_domain = "${var.cluster_name}-etcd${count.index}.${var.dns_zone}" # etcd0=https://cluster-etcd0.example.com,etcd1=https://cluster-etcd1.example.com,... 
diff --git a/aws/fedora-coreos/kubernetes/fcc/controller.yaml b/aws/fedora-coreos/kubernetes/fcc/controller.yaml index eaa912ded..4147b14d8 100644 --- a/aws/fedora-coreos/kubernetes/fcc/controller.yaml +++ b/aws/fedora-coreos/kubernetes/fcc/controller.yaml @@ -12,7 +12,7 @@ systemd: Wants=network-online.target network.target After=network-online.target [Service] - Environment=ETCD_IMAGE=quay.io/coreos/etcd:v3.4.12 + Environment=ETCD_IMAGE=quay.io/coreos/etcd:v3.4.12${etcd_arch} Type=exec ExecStartPre=/bin/mkdir -p /var/lib/etcd ExecStartPre=-/usr/bin/podman rm etcd @@ -214,6 +214,7 @@ storage: ETCD_PEER_CERT_FILE=/etc/ssl/certs/etcd/peer.crt ETCD_PEER_KEY_FILE=/etc/ssl/certs/etcd/peer.key ETCD_PEER_CLIENT_CERT_AUTH=true + ETCD_UNSUPPORTED_ARCH=arm64 passwd: users: - name: core diff --git a/aws/fedora-coreos/kubernetes/variables.tf b/aws/fedora-coreos/kubernetes/variables.tf index fa47cbe5a..7557e919b 100644 --- a/aws/fedora-coreos/kubernetes/variables.tf +++ b/aws/fedora-coreos/kubernetes/variables.tf @@ -155,3 +155,15 @@ variable "cluster_domain_suffix" { default = "cluster.local" } +variable "arch" { + type = string + description = "Container architecture (amd64 or arm64)" + default = "amd64" +} + +variable "daemonset_tolerations" { + type = list(string) + description = "List of additional taint keys kube-system DaemonSets should tolerate (e.g. 
['custom-role', 'gpu-role'])" + default = [] +} + diff --git a/aws/fedora-coreos/kubernetes/workers.tf b/aws/fedora-coreos/kubernetes/workers.tf index dcfc05d9a..0ec9cdb6b 100644 --- a/aws/fedora-coreos/kubernetes/workers.tf +++ b/aws/fedora-coreos/kubernetes/workers.tf @@ -9,6 +9,7 @@ module "workers" { worker_count = var.worker_count instance_type = var.worker_type os_stream = var.os_stream + arch = var.arch disk_size = var.disk_size spot_price = var.worker_price target_groups = var.worker_target_groups diff --git a/aws/fedora-coreos/kubernetes/workers/ami.tf b/aws/fedora-coreos/kubernetes/workers/ami.tf index a7ab184bd..2ac01d446 100644 --- a/aws/fedora-coreos/kubernetes/workers/ami.tf +++ b/aws/fedora-coreos/kubernetes/workers/ami.tf @@ -18,3 +18,27 @@ data "aws_ami" "fedora-coreos" { values = ["Fedora CoreOS ${var.os_stream} *"] } } + +# Experimental Fedora CoreOS arm64 / aarch64 AMIs from Poseidon +# WARNING: These AMIs will be removed when Fedora CoreOS publishes arm64 AMIs +# and may be removed for any reason before then as well. Do not use in production. 
+data "aws_ami" "fedora-coreos-arm" { + most_recent = true + owners = ["099663496933"] + + filter { + name = "architecture" + values = ["arm64"] + } + + filter { + name = "virtualization-type" + values = ["hvm"] + } + + filter { + name = "name" + values = ["fedora-coreos-*"] + } +} + diff --git a/aws/fedora-coreos/kubernetes/workers/fcc/worker.yaml b/aws/fedora-coreos/kubernetes/workers/fcc/worker.yaml index 50f274d53..389da5a5c 100644 --- a/aws/fedora-coreos/kubernetes/workers/fcc/worker.yaml +++ b/aws/fedora-coreos/kubernetes/workers/fcc/worker.yaml @@ -68,6 +68,9 @@ systemd: %{~ for label in split(",", node_labels) ~} --node-labels=${label} \ %{~ endfor ~} + %{~ if node_taints != "" ~} + --register-with-taints=${node_taints} \ + %{~ endif ~} --pod-manifest-path=/etc/kubernetes/manifests \ --read-only-port=0 \ --rotate-certificates \ diff --git a/aws/fedora-coreos/kubernetes/workers/variables.tf b/aws/fedora-coreos/kubernetes/workers/variables.tf index 76b33cbf9..645bae38c 100644 --- a/aws/fedora-coreos/kubernetes/workers/variables.tf +++ b/aws/fedora-coreos/kubernetes/workers/variables.tf @@ -108,3 +108,17 @@ variable "node_labels" { description = "List of initial node labels" default = [] } + +variable "node_taints" { + type = list(string) + description = "List of initial node taints (e.g. ['arch=arm64:NoSchedule'])" + default = [] +} + +# experimental, unsupported + +variable "arch" { + type = string + description = "Container architecture (amd64 or arm64)" + default = "amd64" +} diff --git a/aws/fedora-coreos/kubernetes/workers/workers.tf b/aws/fedora-coreos/kubernetes/workers/workers.tf index 39f9a4a4e..4144f6649 100644 --- a/aws/fedora-coreos/kubernetes/workers/workers.tf +++ b/aws/fedora-coreos/kubernetes/workers/workers.tf @@ -44,7 +44,7 @@ resource "aws_autoscaling_group" "workers" { # Worker template resource "aws_launch_configuration" "worker" { - image_id = data.aws_ami.fedora-coreos.image_id + image_id = var.arch == "arm64" ? 
data.aws_ami.fedora-coreos-arm.image_id : data.aws_ami.fedora-coreos.image_id instance_type = var.instance_type spot_price = var.spot_price > 0 ? var.spot_price : null enable_monitoring = false @@ -86,6 +86,7 @@ data "template_file" "worker-config" { cluster_dns_service_ip = cidrhost(var.service_cidr, 10) cluster_domain_suffix = var.cluster_domain_suffix node_labels = join(",", var.node_labels) + node_taints = join(",", var.node_taints) } } diff --git a/docs/advanced/arm64.md b/docs/advanced/arm64.md new file mode 100644 index 000000000..499620717 --- /dev/null +++ b/docs/advanced/arm64.md @@ -0,0 +1,116 @@ +# ARM64 + +!!! warning + ARM64 support is experimental + +Typhoon has experimental support for ARM64 with Fedora CoreOS on AWS. Full clusters can be created with ARM64 controller and worker nodes. Or worker pools of ARM64 nodes can be attached to an AMD64 cluster to create a hybrid/mixed architecture cluster. + +!!! note + Currently, CNI networking must be set to flannel. + +## AMIs + +In lieu of official Fedora CoreOS ARM64 AMIs, Poseidon publishes experimental ARM64 AMIs to a few regions (us-east-1, us-east-2, us-west-1). These AMIs may be **removed** at any time and will be replaced when Fedora CoreOS publishes equivalents. + +!!! note + AMIs are only published to a few regions, and AWS availability of ARM instance types varies. + +## Cluster + +Create a cluster with ARM64 controller and worker nodes. Container workloads must be `arm64` compatible and use `arm64` container images. + +```tf +module "gravitas" { + source = "git::https://github.com/poseidon/typhoon//aws/fedora-coreos/kubernetes?ref=v1.19.4" + + # AWS + cluster_name = "gravitas" + dns_zone = "aws.example.com" + dns_zone_id = "Z3PAABBCFAKEC0" + + # configuration + ssh_authorized_key = "ssh-rsa AAAAB3Nz..." 
+ + # optional + arch = "arm64" + networking = "flannel" + worker_count = 2 + worker_price = "0.0168" + + controller_type = "t4g.small" + worker_type = "t4g.small" +} +``` + +Verify the cluster has only arm64 (`aarch64`) nodes. + +``` +$ kubectl get nodes -o wide +NAME STATUS ROLES AGE VERSION INTERNAL-IP EXTERNAL-IP OS-IMAGE KERNEL-VERSION CONTAINER-RUNTIME +ip-10-0-12-178 Ready <none> 101s v1.19.4 10.0.12.178 <none> Fedora CoreOS 32.20201104.dev.0 5.8.17-200.fc32.aarch64 docker://19.3.11 +ip-10-0-18-93 Ready <none> 102s v1.19.4 10.0.18.93 <none> Fedora CoreOS 32.20201104.dev.0 5.8.17-200.fc32.aarch64 docker://19.3.11 +ip-10-0-90-10 Ready <none> 104s v1.19.4 10.0.90.10 <none> Fedora CoreOS 32.20201104.dev.0 5.8.17-200.fc32.aarch64 docker://19.3.11 +``` + +## Hybrid + +Create a hybrid/mixed arch cluster by defining an AWS cluster. Then define a [worker pool](worker-pools.md#aws) with ARM64 workers. Optional taints are added to aid in scheduling. + +=== "Cluster (amd64)" + + ```tf + module "gravitas" { + source = "git::https://github.com/poseidon/typhoon//aws/fedora-coreos/kubernetes?ref=v1.19.4" + + # AWS + cluster_name = "gravitas" + dns_zone = "aws.example.com" + dns_zone_id = "Z3PAABBCFAKEC0" + + # configuration + ssh_authorized_key = "ssh-rsa AAAAB3Nz..." 
+ + # optional + networking = "flannel" + worker_count = 2 + worker_price = "0.021" + + daemonset_tolerations = ["arch"] # important + } + ``` + +=== "Worker Pool (arm64)" + + ```tf + module "gravitas-arm64" { + source = "git::https://github.com/poseidon/typhoon//aws/fedora-coreos/kubernetes/workers?ref=v1.19.4" + + # AWS + vpc_id = module.gravitas.vpc_id + subnet_ids = module.gravitas.subnet_ids + security_groups = module.gravitas.worker_security_groups + + # configuration + name = "gravitas-arm64" + kubeconfig = module.gravitas.kubeconfig + ssh_authorized_key = var.ssh_authorized_key + + # optional + arch = "arm64" + instance_type = "t4g.small" + spot_price = "0.0168" + node_taints = ["arch=arm64:NoSchedule"] + } + ``` + +Verify amd64 (`x86_64`) and arm64 (`aarch64`) nodes are present. + +``` +$ kubectl get nodes -o wide +NAME STATUS ROLES AGE VERSION INTERNAL-IP EXTERNAL-IP OS-IMAGE KERNEL-VERSION CONTAINER-RUNTIME +ip-10-0-14-73 Ready <none> 116s v1.19.4 10.0.14.73 <none> Fedora CoreOS 32.20201018.3.0 5.8.15-201.fc32.x86_64 docker://19.3.11 +ip-10-0-17-167 Ready <none> 104s v1.19.4 10.0.17.167 <none> Fedora CoreOS 32.20201018.3.0 5.8.15-201.fc32.x86_64 docker://19.3.11 +ip-10-0-47-166 Ready <none> 110s v1.19.4 10.0.47.166 <none> Fedora CoreOS 32.20201104.dev.0 5.8.17-200.fc32.aarch64 docker://19.3.11 +ip-10-0-7-237 Ready <none> 111s v1.19.4 10.0.7.237 <none> Fedora CoreOS 32.20201018.3.0 5.8.15-201.fc32.x86_64 docker://19.3.11 +``` + diff --git a/docs/advanced/overview.md b/docs/advanced/overview.md index 79ad7bdef..63cff7966 100644 --- a/docs/advanced/overview.md +++ b/docs/advanced/overview.md @@ -2,5 +2,6 @@ Typhoon clusters offer several advanced features for skilled users. 
+* [ARM64](arm64.md) * [Customization](customization.md) * [Worker Pools](worker-pools.md) diff --git a/mkdocs.yml b/mkdocs.yml index 1cca3d1f2..5d599f54e 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -78,6 +78,7 @@ nav: - 'FAQ': 'topics/faq.md' - 'Advanced': - 'Overview': 'advanced/overview.md' + - 'ARM64': 'advanced/arm64.md' - 'Customization': 'advanced/customization.md' - 'Worker Pools': 'advanced/worker-pools.md' - 'Addons': diff --git a/requirements.txt b/requirements.txt index c52163d89..5bbcd3bed 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ mkdocs==1.1.2 -mkdocs-material==6.1.0 +mkdocs-material==6.1.4 pygments==2.6.1 pymdown-extensions==7.1.0