diff --git a/README.md b/README.md index 3230053..a11e76f 100644 --- a/README.md +++ b/README.md @@ -112,17 +112,17 @@ and can be considered as some kind of one-time "migrations". > It is recommended to investigate the scripts logic before applying to a K8S cluster. -#### Upgrading to EKS 1.27 +#### Requirements -The scripts support upgrading K8S from a minimal version of `1.23` to `1.27`. - -**Requirements:** - -* [RMK](https://github.com/edenlabllc/rmk) >= v0.41.0 +* [RMK](https://github.com/edenlabllc/rmk) >= v0.44.2 * [AWS CLI](https://aws.amazon.com/cli/) >= 2.9 -* [eksctl](https://eksctl.io/) >= v0.160.0 +* [eksctl](https://eksctl.io/) >= v0.190.0 * [yq](https://mikefarah.gitbook.io/yq) >= v4.35.2 +#### Upgrading EKS from 1.23 to 1.27 + +The scripts support upgrading K8S from a minimal version of `1.23` to `1.27`. + > The current upgrade covers 4 minor versions, therefore the logic is complex. For the next versions, > it might have been simplified greatly, when upgrading to the closest version only, e.g. from `1.27` to `1.28`. @@ -176,3 +176,15 @@ configs: inject: disabled # ... ``` + +#### Upgrading EKS from 1.27 to 1.29 + +The scripts support upgrading K8S from a minimal version of `1.27` to `1.29`. + +The list of scripts: +- [upgrade-all.sh](bin/k8s-upgrade/1.29/upgrade-all.sh) - Initialize [RMK](https://github.com/edenlabllc/rmk) configuration, calling rest of scripts one by one (the main upgrade script). +- [upgrade-releases.sh](bin/k8s-upgrade/1.29/upgrade-releases.sh) - Upgrade all releases. The following subscripts are executed: + - [upgrade-ebs-csi-snapshot-scheduler.sh](bin/k8s-upgrade/1.29/upgrade-ebs-csi-snapshot-scheduler.sh) - Upgrade [EBS CSI snapshot scheduler](https://backube.github.io/snapscheduler/) to the latest version. +- [upgrade-cluster.sh](bin/k8s-upgrade/1.29/upgrade-cluster.sh) - Upgrade the K8S control plane and system worker node components (1 K8S version per iteration). +- [upgrade-nodes.sh](bin/k8s-upgrade/1.29/upgrade-nodes.sh) - Rolling-update all the K8S worker nodes. +[upgrade-ebs-csi-snapshot-scheduler.sh](bin/k8s-upgrade/1.29/upgrade-ebs-csi-snapshot-scheduler.sh) diff --git a/bin/dagster-presync-hook.sh b/bin/dagster-presync-hook.sh index 50b3563..9460ee0 100755 --- a/bin/dagster-presync-hook.sh +++ b/bin/dagster-presync-hook.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +# DEPRECATED: removed in favour of secrets-sync-operator + set -e NAMESPACE=${1:-dagster} diff --git a/bin/k8s-upgrade/1.27/run-tests.sh b/bin/k8s-upgrade/1.27/run-tests.sh index 89bc9fd..6f21a2e 100755 --- a/bin/k8s-upgrade/1.27/run-tests.sh +++ b/bin/k8s-upgrade/1.27/run-tests.sh @@ -2,44 +2,4 @@ set -e -export PATH="${HOME}/.local/bin:${PATH}" - -# Note: In future, fhir-postgres, elt-postgres might be added. - -readonly POSTGRES_NAMESPACE="postgres" -readonly POSTGRES_RELEASE_NAME="postgres" - -# Example output: -#- Cluster: postgres-cluster -# Host: 10.1.2.38 -# Member: postgres-cluster-0 -# Role: Leader -# State: running -# TL: 7 -#- Cluster: postgres-cluster -# Host: 10.1.6.248 -# Lag in MB: 0 -# Member: postgres-cluster-1 -# Role: Sync Standby -# State: running -# TL: 7 -echo "Showing information about Patroni cluster and its members of ${POSTGRES_RELEASE_NAME}..." -readonly POSTGRES_CLUSTER_LIST="$(kubectl -n "${POSTGRES_NAMESPACE}" exec -it -c postgres "${POSTGRES_RELEASE_NAME}-cluster-0" -- patronictl list -f yaml)" -echo "${POSTGRES_CLUSTER_LIST}" - -echo "Checking all the members are running..." -if [[ "$(echo "${POSTGRES_CLUSTER_LIST}" | yq '([.[] | select(.State == "running")] | length) == (. | length)')" == "true" ]]; then - echo "OK." -else - >&2 echo "ERROR: Not all the members are running." - exit 1 -fi - -echo "Checking all the members have correct roles..." -if [[ "$(echo "${POSTGRES_CLUSTER_LIST}" | yq '([.[] | select(.Role == "Leader")] | length) == 1')" == "true" ]] \ - && [[ "$(echo "${POSTGRES_CLUSTER_LIST}" | yq '([.[] | select(.Role == "Sync Standby")] | length) == 1')" == "true" ]]; then - echo "OK." -else - >&2 echo "ERROR: The roles are not \"Leader\" and \"Sync Standby\"." - exit 1 -fi +"$(dirname "${BASH_SOURCE}")/../run-tests.sh" diff --git a/bin/k8s-upgrade/1.27/upgrade-cluster.sh b/bin/k8s-upgrade/1.27/upgrade-cluster.sh index e265a12..b9c87ad 100755 --- a/bin/k8s-upgrade/1.27/upgrade-cluster.sh +++ b/bin/k8s-upgrade/1.27/upgrade-cluster.sh @@ -2,54 +2,7 @@ set -e -export PATH="${HOME}/.local/bin:${PATH}" - -readonly NAME="$(rmk -ll error config view | yq '.name')" -CLUSTER_NAME="$(rmk -ll error config view | yq '.exported-vars.env.CLUSTER_NAME')" -if [[ "${CLUSTER_NAME}" == "null" ]]; then - CLUSTER_NAME="${NAME}-eks" -fi -CURRENT_CLUSTER_VERSION="$(eksctl get cluster --name "${CLUSTER_NAME}" -o yaml | yq '.[0].Version')" - -export AWS_PROFILE="$(rmk -ll error config view | yq '.aws.profile')" -export AWS_CONFIG_FILE="${HOME}/.aws/config_${AWS_PROFILE}" -export AWS_SHARED_CREDENTIALS_FILE="${HOME}/.aws/credentials_${AWS_PROFILE}" - -readonly NAMESPACE="kube-system" -readonly KUBE_PROXY_RELEASE_NAME="kube-proxy" -readonly COREDNS_RELEASE_NAME="coredns" - -# https://docs.aws.amazon.com/eks/latest/userguide/managing-kube-proxy.html -KUBE_PROXY_IMAGE_PREFIX="$(kubectl -n "${NAMESPACE}" get daemonset "${KUBE_PROXY_RELEASE_NAME}" -o yaml | yq '.spec.template.spec.containers[0].image')" -KUBE_PROXY_IMAGE_PREFIX="${KUBE_PROXY_IMAGE_PREFIX%:*}" -# https://docs.aws.amazon.com/eks/latest/userguide/managing-coredns.html -COREDNS_IMAGE_PREFIX="$(kubectl -n "${NAMESPACE}" get deployment "${COREDNS_RELEASE_NAME}" -o yaml | yq '.spec.template.spec.containers[0].image')" -COREDNS_IMAGE_PREFIX="${COREDNS_IMAGE_PREFIX%:*}" - -# https://docs.aws.amazon.com/eks/latest/userguide/update-cluster.html -# https://docs.aws.amazon.com/eks/latest/userguide/kubernetes-versions.html -function upgrade_cluster() { - local DESIRED_CLUSTER_VERSION="${1}" - local KUBE_PROXY_VERSION="${2}" - local COREDNS_VERSION="${3}" - - echo - echo "Current cluster version: ${CURRENT_CLUSTER_VERSION}" - echo "Desired cluster version: ${DESIRED_CLUSTER_VERSION}" - if [[ "${CURRENT_CLUSTER_VERSION//./,}" -ge "${DESIRED_CLUSTER_VERSION//./,}" ]]; then - echo "No control plane upgrade needed." - else - eksctl upgrade cluster --name "${CLUSTER_NAME}" --version "${DESIRED_CLUSTER_VERSION}" --approve - CURRENT_CLUSTER_VERSION="${DESIRED_CLUSTER_VERSION}" - fi - - if [[ "${CURRENT_CLUSTER_VERSION//./,}" -eq "${DESIRED_CLUSTER_VERSION//./,}" ]]; then - kubectl -n "${NAMESPACE}" set image daemonset "${KUBE_PROXY_RELEASE_NAME}" kube-proxy="${KUBE_PROXY_IMAGE_PREFIX}:${KUBE_PROXY_VERSION}" - kubectl -n "${NAMESPACE}" rollout status daemonset "${KUBE_PROXY_RELEASE_NAME}" - kubectl -n "${NAMESPACE}" set image deployment "${COREDNS_RELEASE_NAME}" coredns="${COREDNS_IMAGE_PREFIX}:${COREDNS_VERSION}" - kubectl -n "${NAMESPACE}" rollout status deployment "${COREDNS_RELEASE_NAME}" - fi -} +source "$(dirname "${BASH_SOURCE}")/../upgrade-cluster.sh" echo "Upgrading K8S cluster iteratively..." upgrade_cluster "1.24" "v1.24.17-minimal-eksbuild.2" "v1.9.3-eksbuild.7" diff --git a/bin/k8s-upgrade/1.27/upgrade-nodes.sh b/bin/k8s-upgrade/1.27/upgrade-nodes.sh index 0876cc4..4b2bff0 100755 --- a/bin/k8s-upgrade/1.27/upgrade-nodes.sh +++ b/bin/k8s-upgrade/1.27/upgrade-nodes.sh @@ -2,146 +2,4 @@ set -e -# optional argument -# e.g. postgres|minio -# find all possible node group names in etc/**/worker-groups.auto.tfvars of a tenant repository -NODE_GROUP_NAME="${1}" - -export PATH="${HOME}/.local/bin:${PATH}" - -# disable client-side pager -export AWS_PAGER= -export AWS_PROFILE="$(rmk --log-level error config view | yq '.aws.profile')" -export AWS_CONFIG_FILE="${HOME}/.aws/config_${AWS_PROFILE}" -export AWS_SHARED_CREDENTIALS_FILE="${HOME}/.aws/credentials_${AWS_PROFILE}" - -readonly NAME="$(rmk --log-level error config view | yq '.name')" -CLUSTER_NAME="$(rmk --log-level error config view | yq '.exported-vars.env.CLUSTER_NAME')" -if [[ "${CLUSTER_NAME}" == "null" ]]; then - CLUSTER_NAME="${NAME}-eks" -fi - -NODE_GROUP_FILTER="" -if [[ -n "${NODE_GROUP_NAME}" ]]; then - NODE_GROUP_FILTER="Name=tag-value,Values=${CLUSTER_NAME}-${NODE_GROUP_NAME}-eks_asg" -fi - -ASG_TAGS=($(aws autoscaling describe-auto-scaling-groups \ - --filters "Name=tag-key,Values=kubernetes.io/cluster/${CLUSTER_NAME}" ${NODE_GROUP_FILTER} \ - --output yaml | yq '.AutoScalingGroups[].Tags[] | select(.Key == "Name") | .Value')) -ASG_NAMES=() - -if [[ ${#ASG_TAGS[@]} -eq 0 ]]; then - >&2 echo "ERROR: No autoscaling group found." - exit 1 -fi - -echo "Rolling-updating nodes..." - -for ASG_TAG in ${ASG_TAGS[@]}; do - ASG_NAME="$(aws autoscaling describe-auto-scaling-groups \ - --filters "Name=tag-value,Values=${ASG_TAG}" \ - --query 'AutoScalingGroups[0].AutoScalingGroupName' \ - --output text - )" - ASG_NAMES+=("${ASG_NAME}") - # nodes with STS/PVC/PV need up to 10 minutes or more to warm up/check health and mount devices - ASG_UPDATE_TIMEOUT_SECONDS=600 - - # remove prefix and suffix from ASG tag to get node group name - NODE_GROUP_NAME="${ASG_TAG#${CLUSTER_NAME}-}" - NODE_GROUP_NAME="${NODE_GROUP_NAME%-eks_asg}" - IS_NODE_GROUP_STATEFUL="true" - PVC_LABELS=""; - case "${NODE_GROUP_NAME}" in - "clickhouse") PVC_LABELS="clickhouse.altinity.com/chi=clickhouse" ;; - "elt-postgres") PVC_LABELS="cluster-name=elt-postgres-cluster" ;; - "es") PVC_LABELS="elasticsearch.k8s.elastic.co/cluster-name=elastic" ;; - "es-jaeger") PVC_LABELS="elasticsearch.k8s.elastic.co/cluster-name=elastic-jaeger" ;; - "fhir-postgres") PVC_LABELS="cluster-name=fhir-postgres-cluster" ;; - "kafka") PVC_LABELS="app.kubernetes.io/instance=kafka" ;; - "loki-stack") PVC_LABELS="release=loki-stack" ;; - "minio") PVC_LABELS="release=minio" ;; - "mongodb") PVC_LABELS="app.kubernetes.io/instance=mongodb" ;; - "postgres") PVC_LABELS="cluster-name=postgres-cluster" ;; - "redis") PVC_LABELS="app.kubernetes.io/instance=redis" ;; - *) IS_NODE_GROUP_STATEFUL="false"; ASG_UPDATE_TIMEOUT_SECONDS=60 ;; - esac - - echo - echo "Node group name: ${NODE_GROUP_NAME}" - echo "Stateful: ${IS_NODE_GROUP_STATEFUL}" - echo "ASG tag: ${ASG_TAG}" - echo "ASG name: ${ASG_NAME}" - echo "ASG update timeout: ${ASG_UPDATE_TIMEOUT_SECONDS}s" - - if [[ "${IS_NODE_GROUP_STATEFUL}" == "true" && "${PVC_LABELS}" != "" ]]; then - echo "PVC labels: ${PVC_LABELS}" - - PV_NAMES="$(kubectl get pvc --all-namespaces -l "${PVC_LABELS}" -o yaml | yq '.items[].spec.volumeName')" - echo "PV names: ${PV_NAMES}" - - # adding pv-dummy to return list of items even for cases when we have only 1 PV found - ASG_AZS="$(kubectl get pv pv-dummy ${PV_NAMES} --ignore-not-found -o yaml | yq '.items[].spec.nodeAffinity.required.nodeSelectorTerms[0].matchExpressions[0].values[0]' | sort | uniq)" - echo "ASG availability zones: ${ASG_AZS}" - - ASG_SUBNETS="" - for ASG_AZ in ${ASG_AZS}; do - echo "Getting private subnet for ${ASG_AZ}..." - ASG_SUBNET="$(aws ec2 describe-subnets --filters "Name=tag-value,Values=${NAME}-vpc-private-${ASG_AZ}" --output yaml | yq '.Subnets[0].SubnetId')" - echo "Subnet ID: ${ASG_SUBNET}" - ASG_SUBNETS="${ASG_SUBNETS} ${ASG_SUBNET}" - done - echo "ASG subnets: ${ASG_SUBNETS}" - - aws autoscaling update-auto-scaling-group --auto-scaling-group-name "${ASG_NAME}" \ - --availability-zones ${ASG_AZS} \ - --vpc-zone-identifier "${ASG_SUBNETS// /,}" \ - --default-cooldown ${ASG_UPDATE_TIMEOUT_SECONDS} \ - --default-instance-warmup ${ASG_UPDATE_TIMEOUT_SECONDS} \ - --health-check-grace-period ${ASG_UPDATE_TIMEOUT_SECONDS} || true - else - echo "No ASG AZ update needed for stateless node group." - fi - - # rolling-update node group OR skip in case it is being updated already - echo "Starting instance refresh..." - aws autoscaling start-instance-refresh --auto-scaling-group-name "${ASG_NAME}" || true -done - -echo -echo "Checking instance refresh status.." -while true; do - IN_PROGRESS_ASG_COUNT="${#ASG_NAMES[@]}" - for ASG_NAME in ${ASG_NAMES[@]}; do - ASG_INSTANCE_REFRESH="$(aws autoscaling describe-instance-refreshes \ - --auto-scaling-group-name "${ASG_NAME}" \ - --max-records 1 \ - --output yaml | yq '.InstanceRefreshes[0] | select(.Status != "Successful") | .AutoScalingGroupName')" - if [[ -n "${ASG_INSTANCE_REFRESH}" && "${ASG_INSTANCE_REFRESH}" != "null" ]]; then - echo "ASG ${ASG_NAME} in progress..." - else - ((IN_PROGRESS_ASG_COUNT--)) - fi - done - - if [[ "${IN_PROGRESS_ASG_COUNT}" -gt 0 ]]; then - sleep 10 - else - break - fi -done -echo "Done." - -echo -echo "Fixing pods with a missing linkerd sidecar after the instance refresh..." -PODS_WITH_MISSING_LINKERD_SIDECAR="$(kubectl get pods --all-namespaces -l "!linkerd.io/control-plane-ns" -o yaml | yq '.items[].metadata | select(.annotations["linkerd.io/inject"] == "enabled") | (.namespace + " " + .name)')" -# iterate over lines ignoring spaces -while IFS= read -r NAMESPACE_WITH_POD; do - if [[ -z "${NAMESPACE_WITH_POD}" ]]; then - # no pods found - break - fi - kubectl delete pod --wait=true -n ${NAMESPACE_WITH_POD} -done <<< "${PODS_WITH_MISSING_LINKERD_SIDECAR}" -echo "Done." +"$(dirname "${BASH_SOURCE}")/../upgrade-nodes.sh" diff --git a/bin/k8s-upgrade/1.29/run-tests.sh b/bin/k8s-upgrade/1.29/run-tests.sh new file mode 100755 index 0000000..6f21a2e --- /dev/null +++ b/bin/k8s-upgrade/1.29/run-tests.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash + +set -e + +"$(dirname "${BASH_SOURCE}")/../run-tests.sh" diff --git a/bin/k8s-upgrade/1.29/upgrade-all.sh b/bin/k8s-upgrade/1.29/upgrade-all.sh new file mode 100755 index 0000000..707ec22 --- /dev/null +++ b/bin/k8s-upgrade/1.29/upgrade-all.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +set -e + +export PATH="${HOME}/.local/bin:${PATH}" + +echo "Initializing cluster configuration..." +rmk update +rmk config init +rmk cluster switch -f + +echo +"$(dirname "${BASH_SOURCE}")/upgrade-releases.sh" + +echo +"$(dirname "${BASH_SOURCE}")/upgrade-cluster.sh" + +echo +"$(dirname "${BASH_SOURCE}")/upgrade-nodes.sh" + +echo +"$(dirname "${BASH_SOURCE}")/run-tests.sh" diff --git a/bin/k8s-upgrade/1.29/upgrade-cluster.sh b/bin/k8s-upgrade/1.29/upgrade-cluster.sh new file mode 100755 index 0000000..5f1e2e1 --- /dev/null +++ b/bin/k8s-upgrade/1.29/upgrade-cluster.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +set -e + +source "$(dirname "${BASH_SOURCE}")/../upgrade-cluster.sh" + +echo "Upgrading K8S cluster iteratively..." +upgrade_cluster "1.28" "v1.28.12-eksbuild.2" "v1.10.1-eksbuild.13" +upgrade_cluster "1.29" "v1.29.0-minimal-eksbuild.1" "v1.11.1-eksbuild.4" + +echo +echo "Provisioning latest AMI IDs and K8S version..." +rmk cluster provision diff --git a/bin/k8s-upgrade/1.29/upgrade-ebs-csi-snapshot-scheduler.sh b/bin/k8s-upgrade/1.29/upgrade-ebs-csi-snapshot-scheduler.sh new file mode 100755 index 0000000..4be54b2 --- /dev/null +++ b/bin/k8s-upgrade/1.29/upgrade-ebs-csi-snapshot-scheduler.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +set -e + +export PATH="${HOME}/.local/bin:${PATH}" + +readonly NAMESPACE="kube-system" +readonly RELEASE_NAME="ebs-csi-snapshot-scheduler" + +readonly CRD_NAME="snapshotschedules.snapscheduler.backube" +readonly CRD_ANNOTATIONS="meta.helm.sh/release-namespace=${NAMESPACE} meta.helm.sh/release-name=${RELEASE_NAME}" +readonly CRD_LABELS="app.kubernetes.io/managed-by=Helm" + +echo "Checking whether ${RELEASE_NAME} release installed..." +if [[ "$(rmk --log-level error release list -l "app=${RELEASE_NAME}" --output json | yq '.[0].installed')" != "true" ]]; then + echo "Skipped." + exit +fi + +echo "Fixing annotations and labels of ${CRD_NAME} CRD of ${RELEASE_NAME} release..." +kubectl -n "${NAMESPACE}" annotate --overwrite customresourcedefinition "${CRD_NAME}" ${CRD_ANNOTATIONS} +kubectl -n "${NAMESPACE}" label --overwrite customresourcedefinition "${CRD_NAME}" ${CRD_LABELS} diff --git a/bin/k8s-upgrade/1.29/upgrade-nodes.sh b/bin/k8s-upgrade/1.29/upgrade-nodes.sh new file mode 100755 index 0000000..4b2bff0 --- /dev/null +++ b/bin/k8s-upgrade/1.29/upgrade-nodes.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash + +set -e + +"$(dirname "${BASH_SOURCE}")/../upgrade-nodes.sh" diff --git a/bin/k8s-upgrade/1.29/upgrade-releases.sh b/bin/k8s-upgrade/1.29/upgrade-releases.sh new file mode 100755 index 0000000..c9c85c3 --- /dev/null +++ b/bin/k8s-upgrade/1.29/upgrade-releases.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +set -e + +export PATH="${HOME}/.local/bin:${PATH}" + +"$(dirname "${BASH_SOURCE}")/upgrade-ebs-csi-snapshot-scheduler.sh" + +echo +echo "Synchronizing all releases..." +rmk release sync diff --git a/bin/k8s-upgrade/1.27/rotate-linkerd-certs.sh b/bin/k8s-upgrade/rotate-linkerd-certs.sh similarity index 100% rename from bin/k8s-upgrade/1.27/rotate-linkerd-certs.sh rename to bin/k8s-upgrade/rotate-linkerd-certs.sh diff --git a/bin/k8s-upgrade/run-tests.sh b/bin/k8s-upgrade/run-tests.sh new file mode 100755 index 0000000..7688766 --- /dev/null +++ b/bin/k8s-upgrade/run-tests.sh @@ -0,0 +1,58 @@ +#!/usr/bin/env bash + +set -e + +export PATH="${HOME}/.local/bin:${PATH}" + +function test_postgres() { + local POSTGRES_NAMESPACE="${1}" + local POSTGRES_RELEASE_NAME="${2}" + + echo + echo "Checking whether ${POSTGRES_RELEASE_NAME} release installed..." + if [[ "$(rmk --log-level error release list -l "app=${POSTGRES_RELEASE_NAME}" --output json | yq '.[0].installed')" != "true" ]]; then + echo "Skipped." + return + fi + + # Example output: + #- Cluster: postgres-cluster + # Host: 10.1.2.38 + # Member: postgres-cluster-0 + # Role: Leader + # State: running + # TL: 7 + #- Cluster: postgres-cluster + # Host: 10.1.6.248 + # Lag in MB: 0 + # Member: postgres-cluster-1 + # Role: Sync Standby + # State: running + # TL: 7 + + echo "Showing information about Patroni cluster and all the members of ${POSTGRES_RELEASE_NAME}..." + POSTGRES_CLUSTER_LIST="$(kubectl -n "${POSTGRES_NAMESPACE}" exec -it -c postgres "${POSTGRES_RELEASE_NAME}-cluster-0" -- patronictl list -f yaml)" + echo "${POSTGRES_CLUSTER_LIST}" + + echo "Checking all the members of ${POSTGRES_RELEASE_NAME} are running..." + if [[ "$(echo "${POSTGRES_CLUSTER_LIST}" | yq '([.[] | select(.State == "running")] | length) == (. | length)')" == "true" ]]; then + echo "OK." + else + >&2 echo "ERROR: Not all the members of ${POSTGRES_RELEASE_NAME} are running." + exit 1 + fi + + echo "Checking all the members of ${POSTGRES_RELEASE_NAME} have correct roles..." + if [[ "$(echo "${POSTGRES_CLUSTER_LIST}" | yq '([.[] | select(.Role == "Leader")] | length) == 1')" == "true" ]] \ + && [[ "$(echo "${POSTGRES_CLUSTER_LIST}" | yq '([.[] | select(.Role == "Sync Standby")] | length) == 1')" == "true" ]]; then + echo "OK." + else + >&2 echo "ERROR: The roles of all the members of ${POSTGRES_RELEASE_NAME} are not \"Leader\" and \"Sync Standby\"." + exit 1 + fi +} + +echo "Checking all postgres releases..." +test_postgres "postgres" "postgres" +test_postgres "postgres" "elt-postgres" +test_postgres "postgres" "fhir-postgres" diff --git a/bin/k8s-upgrade/upgrade-cluster.sh b/bin/k8s-upgrade/upgrade-cluster.sh new file mode 100755 index 0000000..dc35fac --- /dev/null +++ b/bin/k8s-upgrade/upgrade-cluster.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash + +set -e + +export PATH="${HOME}/.local/bin:${PATH}" + +export AWS_PROFILE="$(rmk -ll error config view | yq '.aws.profile')" +export AWS_CONFIG_FILE="${HOME}/.aws/config_${AWS_PROFILE}" +export AWS_SHARED_CREDENTIALS_FILE="${HOME}/.aws/credentials_${AWS_PROFILE}" + +readonly NAME="$(rmk -ll error config view | yq '.name')" +CLUSTER_NAME="$(rmk -ll error config view | yq '.exported-vars.env.CLUSTER_NAME')" +if [[ "${CLUSTER_NAME}" == "null" ]]; then + CLUSTER_NAME="${NAME}-eks" +fi +CURRENT_CLUSTER_VERSION="$(eksctl get cluster --name "${CLUSTER_NAME}" -o yaml | yq '.[0].Version')" + +readonly NAMESPACE="kube-system" +readonly KUBE_PROXY_RELEASE_NAME="kube-proxy" +readonly COREDNS_RELEASE_NAME="coredns" + +# https://docs.aws.amazon.com/eks/latest/userguide/managing-kube-proxy.html +KUBE_PROXY_IMAGE_PREFIX="$(kubectl -n "${NAMESPACE}" get daemonset "${KUBE_PROXY_RELEASE_NAME}" -o yaml | yq '.spec.template.spec.containers[0].image')" +KUBE_PROXY_IMAGE_PREFIX="${KUBE_PROXY_IMAGE_PREFIX%:*}" +# https://docs.aws.amazon.com/eks/latest/userguide/managing-coredns.html +COREDNS_IMAGE_PREFIX="$(kubectl -n "${NAMESPACE}" get deployment "${COREDNS_RELEASE_NAME}" -o yaml | yq '.spec.template.spec.containers[0].image')" +COREDNS_IMAGE_PREFIX="${COREDNS_IMAGE_PREFIX%:*}" + +# https://docs.aws.amazon.com/eks/latest/userguide/update-cluster.html +# https://docs.aws.amazon.com/eks/latest/userguide/kubernetes-versions.html +function upgrade_cluster() { + local DESIRED_CLUSTER_VERSION="${1}" + local KUBE_PROXY_VERSION="${2}" + local COREDNS_VERSION="${3}" + + echo + echo "Current cluster version: ${CURRENT_CLUSTER_VERSION}" + echo "Desired cluster version: ${DESIRED_CLUSTER_VERSION}" + if [[ "${CURRENT_CLUSTER_VERSION//./,}" -ge "${DESIRED_CLUSTER_VERSION//./,}" ]]; then + echo "No control plane upgrade needed." + else + eksctl upgrade cluster --name "${CLUSTER_NAME}" --version "${DESIRED_CLUSTER_VERSION}" --approve + CURRENT_CLUSTER_VERSION="${DESIRED_CLUSTER_VERSION}" + fi + + if [[ "${CURRENT_CLUSTER_VERSION//./,}" -eq "${DESIRED_CLUSTER_VERSION//./,}" ]]; then + kubectl -n "${NAMESPACE}" set image daemonset "${KUBE_PROXY_RELEASE_NAME}" kube-proxy="${KUBE_PROXY_IMAGE_PREFIX}:${KUBE_PROXY_VERSION}" + kubectl -n "${NAMESPACE}" rollout status daemonset "${KUBE_PROXY_RELEASE_NAME}" + kubectl -n "${NAMESPACE}" set image deployment "${COREDNS_RELEASE_NAME}" coredns="${COREDNS_IMAGE_PREFIX}:${COREDNS_VERSION}" + kubectl -n "${NAMESPACE}" rollout status deployment "${COREDNS_RELEASE_NAME}" + fi +} diff --git a/bin/k8s-upgrade/upgrade-nodes.sh b/bin/k8s-upgrade/upgrade-nodes.sh new file mode 100755 index 0000000..0876cc4 --- /dev/null +++ b/bin/k8s-upgrade/upgrade-nodes.sh @@ -0,0 +1,147 @@ +#!/usr/bin/env bash + +set -e + +# optional argument +# e.g. postgres|minio +# find all possible node group names in etc/**/worker-groups.auto.tfvars of a tenant repository +NODE_GROUP_NAME="${1}" + +export PATH="${HOME}/.local/bin:${PATH}" + +# disable client-side pager +export AWS_PAGER= +export AWS_PROFILE="$(rmk --log-level error config view | yq '.aws.profile')" +export AWS_CONFIG_FILE="${HOME}/.aws/config_${AWS_PROFILE}" +export AWS_SHARED_CREDENTIALS_FILE="${HOME}/.aws/credentials_${AWS_PROFILE}" + +readonly NAME="$(rmk --log-level error config view | yq '.name')" +CLUSTER_NAME="$(rmk --log-level error config view | yq '.exported-vars.env.CLUSTER_NAME')" +if [[ "${CLUSTER_NAME}" == "null" ]]; then + CLUSTER_NAME="${NAME}-eks" +fi + +NODE_GROUP_FILTER="" +if [[ -n "${NODE_GROUP_NAME}" ]]; then + NODE_GROUP_FILTER="Name=tag-value,Values=${CLUSTER_NAME}-${NODE_GROUP_NAME}-eks_asg" +fi + +ASG_TAGS=($(aws autoscaling describe-auto-scaling-groups \ + --filters "Name=tag-key,Values=kubernetes.io/cluster/${CLUSTER_NAME}" ${NODE_GROUP_FILTER} \ + --output yaml | yq '.AutoScalingGroups[].Tags[] | select(.Key == "Name") | .Value')) +ASG_NAMES=() + +if [[ ${#ASG_TAGS[@]} -eq 0 ]]; then + >&2 echo "ERROR: No autoscaling group found." + exit 1 +fi + +echo "Rolling-updating nodes..." + +for ASG_TAG in ${ASG_TAGS[@]}; do + ASG_NAME="$(aws autoscaling describe-auto-scaling-groups \ + --filters "Name=tag-value,Values=${ASG_TAG}" \ + --query 'AutoScalingGroups[0].AutoScalingGroupName' \ + --output text + )" + ASG_NAMES+=("${ASG_NAME}") + # nodes with STS/PVC/PV need up to 10 minutes or more to warm up/check health and mount devices + ASG_UPDATE_TIMEOUT_SECONDS=600 + + # remove prefix and suffix from ASG tag to get node group name + NODE_GROUP_NAME="${ASG_TAG#${CLUSTER_NAME}-}" + NODE_GROUP_NAME="${NODE_GROUP_NAME%-eks_asg}" + IS_NODE_GROUP_STATEFUL="true" + PVC_LABELS=""; + case "${NODE_GROUP_NAME}" in + "clickhouse") PVC_LABELS="clickhouse.altinity.com/chi=clickhouse" ;; + "elt-postgres") PVC_LABELS="cluster-name=elt-postgres-cluster" ;; + "es") PVC_LABELS="elasticsearch.k8s.elastic.co/cluster-name=elastic" ;; + "es-jaeger") PVC_LABELS="elasticsearch.k8s.elastic.co/cluster-name=elastic-jaeger" ;; + "fhir-postgres") PVC_LABELS="cluster-name=fhir-postgres-cluster" ;; + "kafka") PVC_LABELS="app.kubernetes.io/instance=kafka" ;; + "loki-stack") PVC_LABELS="release=loki-stack" ;; + "minio") PVC_LABELS="release=minio" ;; + "mongodb") PVC_LABELS="app.kubernetes.io/instance=mongodb" ;; + "postgres") PVC_LABELS="cluster-name=postgres-cluster" ;; + "redis") PVC_LABELS="app.kubernetes.io/instance=redis" ;; + *) IS_NODE_GROUP_STATEFUL="false"; ASG_UPDATE_TIMEOUT_SECONDS=60 ;; + esac + + echo + echo "Node group name: ${NODE_GROUP_NAME}" + echo "Stateful: ${IS_NODE_GROUP_STATEFUL}" + echo "ASG tag: ${ASG_TAG}" + echo "ASG name: ${ASG_NAME}" + echo "ASG update timeout: ${ASG_UPDATE_TIMEOUT_SECONDS}s" + + if [[ "${IS_NODE_GROUP_STATEFUL}" == "true" && "${PVC_LABELS}" != "" ]]; then + echo "PVC labels: ${PVC_LABELS}" + + PV_NAMES="$(kubectl get pvc --all-namespaces -l "${PVC_LABELS}" -o yaml | yq '.items[].spec.volumeName')" + echo "PV names: ${PV_NAMES}" + + # adding pv-dummy to return list of items even for cases when we have only 1 PV found + ASG_AZS="$(kubectl get pv pv-dummy ${PV_NAMES} --ignore-not-found -o yaml | yq '.items[].spec.nodeAffinity.required.nodeSelectorTerms[0].matchExpressions[0].values[0]' | sort | uniq)" + echo "ASG availability zones: ${ASG_AZS}" + + ASG_SUBNETS="" + for ASG_AZ in ${ASG_AZS}; do + echo "Getting private subnet for ${ASG_AZ}..." + ASG_SUBNET="$(aws ec2 describe-subnets --filters "Name=tag-value,Values=${NAME}-vpc-private-${ASG_AZ}" --output yaml | yq '.Subnets[0].SubnetId')" + echo "Subnet ID: ${ASG_SUBNET}" + ASG_SUBNETS="${ASG_SUBNETS} ${ASG_SUBNET}" + done + echo "ASG subnets: ${ASG_SUBNETS}" + + aws autoscaling update-auto-scaling-group --auto-scaling-group-name "${ASG_NAME}" \ + --availability-zones ${ASG_AZS} \ + --vpc-zone-identifier "${ASG_SUBNETS// /,}" \ + --default-cooldown ${ASG_UPDATE_TIMEOUT_SECONDS} \ + --default-instance-warmup ${ASG_UPDATE_TIMEOUT_SECONDS} \ + --health-check-grace-period ${ASG_UPDATE_TIMEOUT_SECONDS} || true + else + echo "No ASG AZ update needed for stateless node group." + fi + + # rolling-update node group OR skip in case it is being updated already + echo "Starting instance refresh..." + aws autoscaling start-instance-refresh --auto-scaling-group-name "${ASG_NAME}" || true +done + +echo +echo "Checking instance refresh status.." +while true; do + IN_PROGRESS_ASG_COUNT="${#ASG_NAMES[@]}" + for ASG_NAME in ${ASG_NAMES[@]}; do + ASG_INSTANCE_REFRESH="$(aws autoscaling describe-instance-refreshes \ + --auto-scaling-group-name "${ASG_NAME}" \ + --max-records 1 \ + --output yaml | yq '.InstanceRefreshes[0] | select(.Status != "Successful") | .AutoScalingGroupName')" + if [[ -n "${ASG_INSTANCE_REFRESH}" && "${ASG_INSTANCE_REFRESH}" != "null" ]]; then + echo "ASG ${ASG_NAME} in progress..." + else + ((IN_PROGRESS_ASG_COUNT--)) + fi + done + + if [[ "${IN_PROGRESS_ASG_COUNT}" -gt 0 ]]; then + sleep 10 + else + break + fi +done +echo "Done." + +echo +echo "Fixing pods with a missing linkerd sidecar after the instance refresh..." +PODS_WITH_MISSING_LINKERD_SIDECAR="$(kubectl get pods --all-namespaces -l "!linkerd.io/control-plane-ns" -o yaml | yq '.items[].metadata | select(.annotations["linkerd.io/inject"] == "enabled") | (.namespace + " " + .name)')" +# iterate over lines ignoring spaces +while IFS= read -r NAMESPACE_WITH_POD; do + if [[ -z "${NAMESPACE_WITH_POD}" ]]; then + # no pods found + break + fi + kubectl delete pod --wait=true -n ${NAMESPACE_WITH_POD} +done <<< "${PODS_WITH_MISSING_LINKERD_SIDECAR}" +echo "Done."