From 1a009eff24515bbb4dc787ff025a88cc73c8529b Mon Sep 17 00:00:00 2001 From: Florian Ruynat <16313165+floryut@users.noreply.github.com> Date: Mon, 25 Oct 2021 16:38:57 +0200 Subject: [PATCH] debug --- .gitlab-ci.yml | 1 - .gitlab-ci/packet.yml | 8 ++++---- README.md | 2 +- roles/download/defaults/main.yml | 2 +- roles/etcd/tasks/configure.yml | 19 +++++++++++++++++++ roles/etcd/tasks/join_etcd_member.yml | 3 +++ .../control-plane/tasks/main.yml | 4 ++++ .../recover_control_plane/etcd/tasks/main.yml | 2 +- 8 files changed, 33 insertions(+), 8 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index e74ae5aaa56..c1fcf61f597 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -51,7 +51,6 @@ before_script: .testcases: &testcases <<: *job - retry: 1 before_script: - update-alternatives --install /usr/bin/python python /usr/bin/python3 1 - ./tests/scripts/rebase.sh diff --git a/.gitlab-ci/packet.yml b/.gitlab-ci/packet.yml index 30d6ba3f34e..f7602afa430 100644 --- a/.gitlab-ci/packet.yml +++ b/.gitlab-ci/packet.yml @@ -241,16 +241,16 @@ packet_debian9-calico-upgrade-once: MITOGEN_ENABLE: "false" packet_ubuntu18-calico-ha-recover: - stage: deploy-part3 - extends: .packet_periodic + stage: unit-tests + extends: .packet_pr when: on_success variables: RECOVER_CONTROL_PLANE_TEST: "true" RECOVER_CONTROL_PLANE_TEST_GROUPS: "etcd[2:],kube_control_plane[1:]" packet_ubuntu18-calico-ha-recover-noquorum: - stage: deploy-part3 - extends: .packet_periodic + stage: unit-tests + extends: .packet_pr when: on_success variables: RECOVER_CONTROL_PLANE_TEST: "true" diff --git a/README.md b/README.md index 40bc2c668f0..3515bc3e7bb 100644 --- a/README.md +++ b/README.md @@ -132,7 +132,7 @@ Note: Upstart/SysV init based OS types are not supported. - Core - [kubernetes](https://github.com/kubernetes/kubernetes) v1.22.2 - - [etcd](https://github.com/coreos/etcd) v3.5.0 + - [etcd](https://github.com/coreos/etcd) v3.5.1 - [docker](https://www.docker.com/) v20.10 (see note) - [containerd](https://containerd.io/) v1.4.9 - [cri-o](http://cri-o.io/) v1.21 (experimental: see [CRI-O Note](docs/cri-o.md). Only on fedora, ubuntu and centos based OS) diff --git a/roles/download/defaults/main.yml b/roles/download/defaults/main.yml index e90ff238873..7a14a15140e 100644 --- a/roles/download/defaults/main.yml +++ b/roles/download/defaults/main.yml @@ -50,7 +50,7 @@ image_arch: "{{host_architecture | default('amd64')}}" # Versions kubeadm_version: "{{ kube_version }}" -etcd_version: v3.5.0 +etcd_version: v3.5.1 crun_version: 1.2 runc_version: v1.0.2 kata_containers_version: 2.2.0 diff --git a/roles/etcd/tasks/configure.yml b/roles/etcd/tasks/configure.yml index 7534e4176c1..3a623249bff 100644 --- a/roles/etcd/tasks/configure.yml +++ b/roles/etcd/tasks/configure.yml @@ -125,6 +125,25 @@ ETCDCTL_CACERT: "{{ etcd_cert_dir }}/ca.pem" ETCDCTL_ENDPOINTS: "{{ etcd_events_access_addresses }}" +- name: Configure | Check members in etcd cluster + shell: "{{ bin_dir }}/etcdctl member list" + register: etcd_members + ignore_errors: true # noqa ignore-errors + changed_when: false + check_mode: no + when: is_etcd_master and etcd_cluster_setup + tags: + - facts + environment: + ETCDCTL_API: 3 + ETCDCTL_CERT: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem" + ETCDCTL_KEY: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem" + ETCDCTL_CACERT: "{{ etcd_cert_dir }}/ca.pem" + ETCDCTL_ENDPOINTS: "{{ etcd_access_addresses }}" + +- debug: + var: etcd_members + - name: Configure | Check if member is in etcd cluster shell: "{{ bin_dir }}/etcdctl member list | grep -q {{ etcd_access_address }}" register: etcd_member_in_cluster diff --git a/roles/etcd/tasks/join_etcd_member.yml b/roles/etcd/tasks/join_etcd_member.yml index 28d259cccd6..22440394f2d 100644 --- a/roles/etcd/tasks/join_etcd_member.yml +++ b/roles/etcd/tasks/join_etcd_member.yml @@ -32,6 +32,9 @@ register: etcd_member_in_cluster changed_when: false check_mode: no + retries: "{{ etcd_retries }}" + delay: "{{ retry_stagger | random + 3 }}" + until: etcd_member_in_cluster.rc == 0 tags: - facts environment: diff --git a/roles/recover_control_plane/control-plane/tasks/main.yml b/roles/recover_control_plane/control-plane/tasks/main.yml index 450e6f36d94..9d0038ea1e2 100644 --- a/roles/recover_control_plane/control-plane/tasks/main.yml +++ b/roles/recover_control_plane/control-plane/tasks/main.yml @@ -16,6 +16,9 @@ - KUBECONFIG: "{{ ansible_env.HOME | default('/root') }}/.kube/config" with_items: "{{ groups['broken_kube_control_plane'] }}" register: delete_broken_kube_masters + until: delete_broken_kube_masters.rc == 0 + retries: 6 + delay: 10 failed_when: false when: groups['broken_kube_control_plane'] @@ -24,6 +27,7 @@ msg: "Unable to delete broken kube_control_plane node: {{ item.item }}" loop: "{{ delete_broken_kube_masters.results }}" changed_when: false + ignore_errors: true when: - groups['broken_kube_control_plane'] - "item.rc != 0 and not 'NotFound' in item.stderr" diff --git a/roles/recover_control_plane/etcd/tasks/main.yml b/roles/recover_control_plane/etcd/tasks/main.yml index e3dc339307d..5c991053f53 100644 --- a/roles/recover_control_plane/etcd/tasks/main.yml +++ b/roles/recover_control_plane/etcd/tasks/main.yml @@ -23,7 +23,7 @@ # When there is an error, everything is printed in stderr_lines, even "is healthy" messages. - name: Set has_quorum fact set_fact: - has_quorum: "{{ etcd_endpoint_health.stderr_lines | select('match', '.*is healthy.*') | list | length >= etcd_endpoint_health.stderr_lines | select('match', '.*is unhealthy.*') | list | length }}" + has_quorum: "{{ etcd_endpoint_health.stderr_lines | select('match', '.*is healthy.*') | list | length >= etcd_endpoint_health.stdout_lines | select('match', '.*is unhealthy.*') | list | length }}" when: - groups['broken_etcd']