From 065b6ff6bcc6ff678f6faaab34a17563a6f5f138 Mon Sep 17 00:00:00 2001
From: Teddy Andrieux
Date: Wed, 20 May 2020 14:26:10 +0200
Subject: [PATCH 01/12] scripts: Re-generate all certificates needed for Salt master

Before this commit, we only regenerated the etcd-client certificates for the
Salt master during restoration, when we should regenerate all of them.

Call `metalk8s.salt.master.certs` instead of only `etcd-client`
---
 scripts/restore.sh.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/restore.sh.in b/scripts/restore.sh.in
index 4dab9195e0..d41f383146 100755
--- a/scripts/restore.sh.in
+++ b/scripts/restore.sh.in
@@ -283,7 +283,7 @@ highstate_bootstrap() {
     )

     "${SALT_CALL}" --retcode-passthrough --state-output=mixed state.sls \
-        metalk8s.salt.master.certs.etcd-client \
+        metalk8s.salt.master.certs \
         saltenv=metalk8s-@@VERSION
     "${SALT_MASTER_CALL[@]}" salt-run --state-output=mixed state.orchestrate \
         metalk8s.orchestrate.deploy_node \

From 273a1b65f12522efa05c576c8f849e3e7609c450 Mon Sep 17 00:00:00 2001
From: Teddy Andrieux
Date: Mon, 25 May 2020 10:28:43 +0200
Subject: [PATCH 02/12] scripts: Re-configure apiserver-proxy on all nodes during restore

The apiserver-proxy may still redirect to an old bootstrap node that no
longer exists, so let's reconfigure apiserver-proxy on every node so that it
no longer talks to the old bootstrap node
---
 scripts/restore.sh.in | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/scripts/restore.sh.in b/scripts/restore.sh.in
index d41f383146..c4dcc1c183 100755
--- a/scripts/restore.sh.in
+++ b/scripts/restore.sh.in
@@ -273,6 +273,10 @@ reconfigure_nodes() {
     # See htpps://github.com/saltstack/salt/issues/20590
     "${SALT_MASTER_CALL[@]}" salt '*' saltutil.refresh_pillar && sleep 20
     "${SALT_MASTER_CALL[@]}" salt '*' mine.update
+
+    "${SALT_MASTER_CALL[@]}" salt -L "$non_bootstrap" --state-output=mixed \
+        state.sls metalk8s.kubernetes.apiserver-proxy \
+        saltenv=metalk8s-@@VERSION
 }

 highstate_bootstrap() {

From 774ce0f5c493492dc651be8035323152c3583219 Mon Sep 17 00:00:00 2001
From: Alexandre Allard
Date: Thu, 23 Jan 2020 10:30:30 +0100
Subject: [PATCH 03/12] tests/ci: add a second node to expansion tests if node2

Deploy two nodes instead of only one during expansion tests, to have HA etcd
and to be able to test the bootstrap restoration

Refs: #1687
---
 eve/main.yml                             |  2 +-
 tests/install/features/expansion.feature | 19 ++++++++++++++++++-
 tests/install/steps/test_expansion.py    |  7 ++++++-
 3 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/eve/main.yml b/eve/main.yml
index fb3984d269..b28a1f1b88 100644
--- a/eve/main.yml
+++ b/eve/main.yml
@@ -1597,7 +1597,7 @@ stages:
           name: Run installation scenarii on the bastion
           env:
             <<: *_env_bastion_tests
-            PYTEST_FILTERS: "install and ci and multinodes"
+            PYTEST_FILTERS: "install and ci and multinodes and not node2"
       - ShellCommand:
           <<: *bastion_tests
           name: Run fast tests on the bastion

diff --git a/tests/install/features/expansion.feature b/tests/install/features/expansion.feature
index 00b8d64fa8..abe8bc75bb 100644
--- a/tests/install/features/expansion.feature
+++ b/tests/install/features/expansion.feature
@@ -1,7 +1,9 @@
 @install @ci @local @multinodes
 Feature: Cluster expansion
-  Scenario: Add one node to the cluster
+  Background:
     Given the Kubernetes API is available
+
+  Scenario: Add one node to the cluster
     When we declare a new "control-plane" node on host "node1"
     Then node "node1" is registered in Kubernetes
     And node "node1" status is "NotReady"
@@ -14,3 +16,18 @@ Feature: Cluster expansion
     And we 
have 1 running pod labeled 'k8s-app=kube-proxy' in namespace 'kube-system' on node 'node1' And we have 1 running pod labeled 'component=etcd' in namespace 'kube-system' on node 'node1' And node "node1" is a member of etcd cluster + + @node2 + Scenario: Add a second node to the cluster + When we declare a new "control-plane" node on host "node2" + Then node "node2" is registered in Kubernetes + And node "node2" status is "NotReady" + When we deploy the node "node2" + Then node "node2" status is "Ready" + And we have 1 running pod labeled 'component=kube-controller-manager' in namespace 'kube-system' on node 'node2' + And we have 1 running pod labeled 'component=kube-scheduler' in namespace 'kube-system' on node 'node2' + And we have 1 running pod labeled 'component=kube-apiserver' in namespace 'kube-system' on node 'node2' + And we have 1 running pod labeled 'k8s-app=calico-node' in namespace 'kube-system' on node 'node2' + And we have 1 running pod labeled 'k8s-app=kube-proxy' in namespace 'kube-system' on node 'node2' + And we have 1 running pod labeled 'component=etcd' in namespace 'kube-system' on node 'node2' + And node "node2" is a member of etcd cluster diff --git a/tests/install/steps/test_expansion.py b/tests/install/steps/test_expansion.py index b555a37433..e7c2588424 100644 --- a/tests/install/steps/test_expansion.py +++ b/tests/install/steps/test_expansion.py @@ -13,7 +13,12 @@ # Scenarios @scenario('../features/expansion.feature', 'Add one node to the cluster') -def test_cluster_expansion(host): +def test_cluster_expansion_1_node(host): + pass + + +@scenario('../features/expansion.feature', 'Add a second node to the cluster') +def test_cluster_expansion_2_nodes(host): pass # When {{{ From 2177dc4248c68201dd90819fcc94ed415707b841 Mon Sep 17 00:00:00 2001 From: Alexandre Allard Date: Thu, 23 Jan 2020 11:09:58 +0100 Subject: [PATCH 04/12] tests/ci: pass backup archive env var to pytest Refs: #1687 --- eve/main.yml | 2 ++ tox.ini | 2 ++ 2 files changed, 4 insertions(+) diff --git a/eve/main.yml b/eve/main.yml index b28a1f1b88..ed540ebce0 100644 --- a/eve/main.yml +++ b/eve/main.yml @@ -297,12 +297,14 @@ models: ISO_MOUNTPOINT: "/var/tmp/metalk8s" TEST_HOSTS_LIST: "bootstrap" PYTEST_FILTERS: "post and ci" + BOOTSTRAP_BACKUP_ARCHIVE: "" command: > ssh -F ssh_config bastion -- "cd metalk8s && export SSH_CONFIG_FILE=\"${SSH_CONFIG_FILE}\" && export ISO_MOUNTPOINT=\"${ISO_MOUNTPOINT}\" && export TEST_HOSTS_LIST=\"${TEST_HOSTS_LIST}\" && + export BOOTSTRAP_BACKUP_ARCHIVE=\"${BOOTSTRAP_BACKUP_ARCHIVE}\" && tox -e tests -- ${PYTEST_ARGS:-""} -m \"${PYTEST_FILTERS}\"" workdir: build/eve/workers/openstack-multiple-nodes/terraform/ haltOnFailure: true diff --git a/tox.ini b/tox.ini index bd8c468df2..15daf452f8 100644 --- a/tox.ini +++ b/tox.ini @@ -94,6 +94,7 @@ passenv = SSH_CONFIG_FILE TEST_HOSTS_LIST ISO_MOUNTPOINT + BOOTSTRAP_BACKUP_ARCHIVE setenv = VAGRANT_CWD={toxinidir} commands_pre = @@ -111,6 +112,7 @@ description = deps = {[testenv:tests]deps} passenv = ISO_MOUNTPOINT + BOOTSTRAP_BACKUP_ARCHIVE commands = pytest \ --iso-root={env:ISO_MOUNTPOINT:_build/root} \ From 9155b13dfcc0d9bccca2f12b596a91e75b08da6f Mon Sep 17 00:00:00 2001 From: Alexandre Allard Date: Fri, 15 May 2020 14:18:03 +0200 Subject: [PATCH 05/12] tests: add a get_grain helper and use it everywhere (cherry picked from commit 1e110a311b7bf11169268611d7f4489b16763f38) --- tests/conftest.py | 13 +++++++++---- tests/post/steps/test_ingress.py | 21 ++++----------------- tests/post/steps/test_ui.py | 8 ++------ 
tests/utils.py | 16 ++++++++++++---- 4 files changed, 27 insertions(+), 31 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 606b512be9..e32598089f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -32,6 +32,7 @@ def version(request, host): 'source %s && echo $VERSION', str(product_path) ) + @pytest.fixture(scope="module") def hostname(host): """Return the result of `hostname` on the `host` fixture. @@ -49,10 +50,14 @@ def nodename(host): Node name need to be equal to the salt minion id so just retrieve the salt minion id """ - with host.sudo(): - return host.check_output( - 'salt-call --local --out txt grains.get id | cut -c 8-' - ) + return utils.get_grain(host, 'id') + + +@pytest.fixture(scope="module") +def control_plane_ip(host): + """Return the Kubernetes control plane IP based on the salt grain + """ + return utils.get_grain(host, 'metalk8s:control_plane_ip') @pytest.fixture(scope="module") diff --git a/tests/post/steps/test_ingress.py b/tests/post/steps/test_ingress.py index 961f8515b2..0a938e49a8 100644 --- a/tests/post/steps/test_ingress.py +++ b/tests/post/steps/test_ingress.py @@ -1,11 +1,11 @@ -import json - import requests import requests.exceptions import pytest from pytest_bdd import given, parsers, scenario, then, when +from tests import utils + @scenario('../features/ingress.feature', 'Access HTTP services') def test_access_http_services(host): @@ -30,14 +30,7 @@ def context(): @given('the node control-plane IP is not equal to its workload-plane IP') def node_control_plane_ip_is_not_equal_to_its_workload_plane_ip(host): - with host.sudo(): - output = host.check_output(' '.join([ - 'salt-call --local', - 'grains.get metalk8s', - '--out json', - ])) - - data = json.loads(output)['local'] + data = utils.get_grain(host, 'metalk8s') assert 'control_plane_ip' in data assert 'workload_plane_ip' in data @@ -65,13 +58,7 @@ def perform_request(host, context, protocol, port, plane): if plane not in grains: raise NotImplementedError - with host.sudo(): - ip_output = host.check_output(' '.join([ - 'salt-call --local', - 'grains.get {grain}'.format(grain=grains[plane]), - '--out json', - ])) - ip = json.loads(ip_output)['local'] + ip = utils.get_grain(host, grains[plane]) try: context['response'] = requests.get( diff --git a/tests/post/steps/test_ui.py b/tests/post/steps/test_ui.py index b3dae6791d..bd6e886e04 100644 --- a/tests/post/steps/test_ui.py +++ b/tests/post/steps/test_ui.py @@ -4,6 +4,7 @@ from pytest_bdd import scenario, then import requests +from tests import utils # Scenarios @scenario('../features/ui_alive.feature', 'Reach the UI') @@ -13,12 +14,7 @@ def test_ui(host): @then("we can reach the UI") def reach_UI(host): - with host.sudo(): - output = host.check_output(' '.join([ - 'salt-call', '--local', '--out=json', - 'grains.get', 'metalk8s:control_plane_ip', - ])) - ip = json.loads(output)['local'] + ip = utils.get_grain(host, 'metalk8s:control_plane_ip') response = requests.get( 'https://{ip}:8443'.format(ip=ip), diff --git a/tests/utils.py b/tests/utils.py index d9cba2578d..968171ca58 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,4 +1,5 @@ import ipaddress +import json import logging import re import testinfra @@ -61,8 +62,15 @@ def get_node_name(nodename, ssh_config=None): """Get a node name (from SSH config).""" if ssh_config is not None: node = testinfra.get_host(nodename, ssh_config=ssh_config) - with node.sudo(): - return node.check_output( - 'salt-call --local --out txt grains.get id | cut -c 8-' - ) + return get_grain(node, 
'id') return nodename + + +def get_grain(host, key): + with host.sudo(): + output = host.check_output( + 'salt-call --local --out=json grains.get "{}"'.format(key) + ) + grain = json.loads(output)['local'] + + return grain From e1945824fb3b74da8ae5f7b9ef395630e8d9fbc9 Mon Sep 17 00:00:00 2001 From: Alexandre Allard Date: Fri, 24 Jan 2020 09:41:27 +0100 Subject: [PATCH 06/12] tests: move check_etcd_role definition to conftest This `then` will be useful in restoration tests also so we move it to a common place Refs: #1687 --- tests/conftest.py | 29 +++++++++++++++++++++++++++ tests/install/steps/test_expansion.py | 29 --------------------------- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index e32598089f..c477d1d3b4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -228,6 +228,13 @@ def check_resource_list(host, resource, namespace): resource, namespace) +@then(parsers.parse('node "{node_name}" is a member of etcd cluster')) +def check_etcd_role(ssh_config, k8s_client, node_name): + """Check if the given node is a member of the etcd cluster.""" + etcd_member_list = etcdctl(k8s_client, ['member', 'list'], ssh_config) + assert node_name in etcd_member_list, \ + 'node {} is not part of the etcd cluster'.format(node_name) + # }}} # Helpers {{{ @@ -239,4 +246,26 @@ def _verify_kubeapi_service(host): if res.rc != 0: pytest.fail(res.stderr) + +def etcdctl(k8s_client, command, ssh_config): + """Run an etcdctl command inside the etcd container.""" + name = 'etcd-{}'.format( + utils.get_node_name('bootstrap', ssh_config) + ) + + etcd_command = [ + 'etcdctl', + '--endpoints', 'https://localhost:2379', + '--ca-file', '/etc/kubernetes/pki/etcd/ca.crt', + '--key-file', '/etc/kubernetes/pki/etcd/server.key', + '--cert-file', '/etc/kubernetes/pki/etcd/server.crt', + ] + command + output = kubernetes.stream.stream( + k8s_client.connect_get_namespaced_pod_exec, + name=name, namespace='kube-system', + command=etcd_command, + stderr=True, stdin=False, stdout=True, tty=False + ) + return output + # }}} diff --git a/tests/install/steps/test_expansion.py b/tests/install/steps/test_expansion.py index e7c2588424..fb6677330d 100644 --- a/tests/install/steps/test_expansion.py +++ b/tests/install/steps/test_expansion.py @@ -87,14 +87,6 @@ def _check_node_status(): ) -@then(parsers.parse('node "{node_name}" is a member of etcd cluster')) -def check_etcd_role(ssh_config, k8s_client, node_name): - """Check if the given node is a member of the etcd cluster.""" - etcd_member_list = etcdctl(k8s_client, ['member', 'list'], ssh_config) - assert node_name in etcd_member_list, \ - 'node {} is not part of the etcd cluster'.format(node_name) - - # }}} # Helpers {{{ @@ -170,25 +162,4 @@ def run_salt_command(host, command, ssh_config): output.stderr ) -def etcdctl(k8s_client, command, ssh_config): - """Run an etcdctl command inside the etcd container.""" - name = 'etcd-{}'.format( - utils.get_node_name('bootstrap', ssh_config) - ) - - etcd_command = [ - 'etcdctl', - '--endpoints', 'https://localhost:2379', - '--ca-file', '/etc/kubernetes/pki/etcd/ca.crt', - '--key-file', '/etc/kubernetes/pki/etcd/server.key', - '--cert-file', '/etc/kubernetes/pki/etcd/server.crt', - ] + command - output = k8s.stream.stream( - k8s_client.connect_get_namespaced_pod_exec, - name=name, namespace='kube-system', - command=etcd_command, - stderr=True, stdin=False, stdout=True, tty=False - ) - return output - # }}} From 5740ee8db78b0daa777a89f46d72e65dedcb33e2 Mon Sep 17 00:00:00 2001 From: 
Teddy Andrieux Date: Mon, 18 May 2020 18:58:16 +0200 Subject: [PATCH 07/12] ci: Add extra_opts for wait running Pods Adding `EXTRA_OPTS` environment variable in `wait_pods_status` helper to allow to add extra arguments like "--namespace" if needed --- eve/main.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/eve/main.yml b/eve/main.yml index ed540ebce0..06c4acee90 100644 --- a/eve/main.yml +++ b/eve/main.yml @@ -43,11 +43,12 @@ models: SLEEP_TIME: "5" STABILIZATION_TIME: "30" STATUS: "Running" + EXTRA_OPTS: "" command: > git checkout "%(prop:branch)s" && sudo eve/wait_pods_status.sh --sleep-time "$SLEEP_TIME" --stabilization-time "$STABILIZATION_TIME" - --status "$STATUS" --retry "$RETRY" + --status "$STATUS" --retry "$RETRY" $EXTRA_OPTS usePTY: true haltOnFailure: true - ShellCommand: &wait_pods_running_ssh @@ -56,12 +57,13 @@ models: <<: *_env_wait_pods_running SSH_CONFIG: ssh_config SSH_HOST: bootstrap + EXTRA_OPTS: "" command: > git checkout "%(prop:branch)s" && scp -F "$SSH_CONFIG" eve/wait_pods_status.sh "$SSH_HOST":/tmp/ && ssh -F "$SSH_CONFIG" "$SSH_HOST" sudo /tmp/wait_pods_status.sh --sleep-time "$SLEEP_TIME" --stabilization-time "$STABILIZATION_TIME" - --status "$STATUS" --retry "$RETRY" + --status "$STATUS" --retry "$RETRY" $EXTRA_OPTS usePTY: true haltOnFailure: true - ShellCommand: &build_all From be18d3e085d0d23b97abb2cbac69e25ec95e4866 Mon Sep 17 00:00:00 2001 From: Alexandre Allard Date: Thu, 6 Feb 2020 10:58:24 +0100 Subject: [PATCH 08/12] ci: create volumes on node1 instead of bootstrap for multi-node We now create AlertManager & Prometheus volumes on node1 instead of the bootstrap node on multi node environment to properly test that storage volume does not need to sit on bootstrap node --- eve/create-volumes.sh | 13 ++++++------ eve/main.yml | 35 +++++++++++++++++++++------------ examples/prometheus-sparse.yaml | 8 ++++---- 3 files changed, 33 insertions(+), 23 deletions(-) diff --git a/eve/create-volumes.sh b/eve/create-volumes.sh index 2d1c25190c..67b3f229b4 100755 --- a/eve/create-volumes.sh +++ b/eve/create-volumes.sh @@ -37,7 +37,7 @@ check_pod_is_in_phase() { [[ $phase = "$expected_phase" ]] } -BOOTSTRAP_NODE_NAME=${BOOTSTRAP_NODE_NAME:-$(salt-call --local --out txt grains.get id | cut -c 8-)} +NODE_NAME=${NODE_NAME:-$(salt-call --local --out txt grains.get id | cut -c 8-)} PRODUCT_TXT=${PRODUCT_TXT:-/vagrant/_build/root/product.txt} MAX_TRIES=300 @@ -55,17 +55,18 @@ KUBECONFIG=${KUBECONFIG:-/etc/kubernetes/admin.conf} export KUBECONFIG echo "Creating storage volumes" -sed "s/BOOTSTRAP_NODE_NAME/${BOOTSTRAP_NODE_NAME}/" "${PRODUCT_MOUNT}/examples/prometheus-sparse.yaml" | \ +sed "s/NODE_NAME/${NODE_NAME}/" \ + "${PRODUCT_MOUNT}/examples/prometheus-sparse.yaml" | \ kubectl apply -f - -echo "Waiting for PV 'bootstrap-alertmanager' to be provisioned" -if ! retry "$MAX_TRIES" check_pv_exists bootstrap-alertmanager; then +echo "Waiting for PV '$NODE_NAME-alertmanager' to be provisioned" +if ! retry "$MAX_TRIES" check_pv_exists "$NODE_NAME-alertmanager"; then echo "PV not created" exit 1 fi -echo "Waiting for PV 'bootstrap-prometheus' to be provisioned" -if ! retry "$MAX_TRIES" check_pv_exists bootstrap-prometheus; then +echo "Waiting for PV '$NODE_NAME-prometheus' to be provisioned" +if ! 
retry "$MAX_TRIES" check_pv_exists "$NODE_NAME-prometheus"; then echo "PV not created" exit 1 fi diff --git a/eve/main.yml b/eve/main.yml index 06c4acee90..e95b3ccbbf 100644 --- a/eve/main.yml +++ b/eve/main.yml @@ -1551,19 +1551,6 @@ stages: /var/tmp/metalk8s/bootstrap.sh --verbose workdir: build/eve/workers/openstack-multiple-nodes/terraform/ haltOnFailure: true - - ShellCommand: - name: Provision Prometheus and AlertManager storage - env: - SSH_CONFIG: >- - eve/workers/openstack-multiple-nodes/terraform/ssh_config - command: > - scp -F $SSH_CONFIG eve/create-volumes.sh bootstrap:/tmp/create-volumes.sh && - ssh -F $SSH_CONFIG bootstrap - sudo env - PRODUCT_TXT=/var/tmp/metalk8s/product.txt - PRODUCT_MOUNT=/var/tmp/metalk8s - /tmp/create-volumes.sh - haltOnFailure: true - ShellCommand: name: Install kubectl on the boostrap node command: > @@ -1592,8 +1579,10 @@ stages: | ssh -F $SSH_CONFIG bastion '(mkdir metalk8s; cd "$_"; tar xf -)' - ShellCommand: <<: *wait_pods_running_ssh + name: Wait for "kube-system" Pods to be in running state env: <<: *_env_wait_pods_running_ssh + EXTRA_OPTS: "--namespace kube-system" SSH_CONFIG: >- eve/workers/openstack-multiple-nodes/terraform/ssh_config - ShellCommand: @@ -1602,6 +1591,26 @@ stages: env: <<: *_env_bastion_tests PYTEST_FILTERS: "install and ci and multinodes and not node2" + - ShellCommand: + name: Provision Prometheus and AlertManager storage + env: + SSH_CONFIG: >- + eve/workers/openstack-multiple-nodes/terraform/ssh_config + command: > + scp -F $SSH_CONFIG eve/create-volumes.sh bootstrap:/tmp/create-volumes.sh && + ssh -F $SSH_CONFIG bootstrap + sudo env + PRODUCT_TXT=/var/tmp/metalk8s/product.txt + PRODUCT_MOUNT=/var/tmp/metalk8s + NODE_NAME=node1 + /tmp/create-volumes.sh + haltOnFailure: true + - ShellCommand: + <<: *wait_pods_running_ssh + env: + <<: *_env_wait_pods_running_ssh + SSH_CONFIG: >- + eve/workers/openstack-multiple-nodes/terraform/ssh_config - ShellCommand: <<: *bastion_tests name: Run fast tests on the bastion diff --git a/examples/prometheus-sparse.yaml b/examples/prometheus-sparse.yaml index f55d5cd9ef..da54db6cec 100644 --- a/examples/prometheus-sparse.yaml +++ b/examples/prometheus-sparse.yaml @@ -5,9 +5,9 @@ apiVersion: storage.metalk8s.scality.com/v1alpha1 kind: Volume metadata: - name: bootstrap-prometheus + name: NODE_NAME-prometheus spec: - nodeName: BOOTSTRAP_NODE_NAME + nodeName: NODE_NAME storageClassName: metalk8s-prometheus sparseLoopDevice: size: 10Gi @@ -19,9 +19,9 @@ spec: apiVersion: storage.metalk8s.scality.com/v1alpha1 kind: Volume metadata: - name: bootstrap-alertmanager + name: NODE_NAME-alertmanager spec: - nodeName: BOOTSTRAP_NODE_NAME + nodeName: NODE_NAME storageClassName: metalk8s-prometheus sparseLoopDevice: size: 1Gi From d97fc2552d6ebf9414243e20c57f770a5bdc9e83 Mon Sep 17 00:00:00 2001 From: Alexandre Allard Date: Mon, 3 Feb 2020 11:48:27 +0100 Subject: [PATCH 09/12] ci: copy bastion pub key to authorized keys to a list of nodes We need to be able to connect to all the nodes to do the expansion from the bastion host, create an helper in eve to do it --- eve/main.yml | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/eve/main.yml b/eve/main.yml index e95b3ccbbf..5839e09cf3 100644 --- a/eve/main.yml +++ b/eve/main.yml @@ -448,6 +448,17 @@ models: done alwaysRun: true + - ShellCommand: ©_bastion_pub_key_ssh + name: Send bastion public key to nodes + command: > + for host in $HOSTS_LIST; do + ssh -F ssh_config bastion "cat .ssh/bastion.pub" | + ssh -F 
ssh_config $host "cat >> .ssh/authorized_keys" + done + env: + HOSTS_LIST: bootstrap + workdir: build/eve/workers/openstack-multiple-nodes/terraform/ + stages: pre-merge: worker: @@ -1504,17 +1515,9 @@ stages: workdir: build/eve/workers/openstack-multiple-nodes/terraform/ haltOnFailure: true - ShellCommand: - # FIXME: find a way to share bastion public key to all spawned - # instances from Terraform - name: Send bastion public key to nodes - command: > - scp -F ssh_config -3 bastion:.ssh/bastion.pub bootstrap:.ssh/ && - ssh -F ssh_config bootstrap - "cat .ssh/bastion.pub >> .ssh/authorized_keys" && - scp -F ssh_config -3 bastion:.ssh/bastion.pub node1:.ssh/ && - ssh -F ssh_config node1 - "cat .ssh/bastion.pub >> .ssh/authorized_keys" - workdir: build/eve/workers/openstack-multiple-nodes/terraform/ + <<: *copy_bastion_pub_key_ssh + env: + HOSTS_LIST: "bootstrap node1" - ShellCommand: # FIXME: find a cleaner way with Terraform. name: Send bastion private key to bootstrap From ca8b522ab9c0deb82e2deb69c721bb6a07f54199 Mon Sep 17 00:00:00 2001 From: Alexandre Allard Date: Mon, 10 Feb 2020 10:36:23 +0100 Subject: [PATCH 10/12] tests: no longer check kube-dns node we don't want to check presence of kube-dns pods on the bootstrap node, because pods can move to other node, especially during restore tests Refs: #1687 --- tests/post/features/pods_alive.feature | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/post/features/pods_alive.feature b/tests/post/features/pods_alive.feature index fe8a3695ab..bd19a7bbe3 100644 --- a/tests/post/features/pods_alive.feature +++ b/tests/post/features/pods_alive.feature @@ -12,7 +12,7 @@ Feature: Pods should be alive And we have 1 running pod labeled 'component=etcd' in namespace 'kube-system' on node 'bootstrap' And we have 1 running pod labeled 'k8s-app=kube-proxy' in namespace 'kube-system' on node 'bootstrap' And we have 1 running pod labeled 'k8s-app=calico-node' in namespace 'kube-system' on node 'bootstrap' - And we have 2 running pod labeled 'k8s-app=kube-dns' in namespace 'kube-system' on node 'bootstrap' + And we have 2 running pod labeled 'k8s-app=kube-dns' in namespace 'kube-system' And we have 1 running pod labeled 'app=salt-master' in namespace 'kube-system' on node 'bootstrap' And we have 1 running pod labeled 'app=repositories' in namespace 'kube-system' on node 'bootstrap' From 4b7a9672d92ede283420e85a5e4bcb93fea2eb69 Mon Sep 17 00:00:00 2001 From: Alexandre Allard Date: Fri, 24 Jan 2020 08:58:11 +0100 Subject: [PATCH 11/12] tests/ci: add tests scenario for bootstrap restoration This scenario run the restore script and ensure that everything is working as expected (pods, ...) after the restoration. 
Refs: #1687 --- eve/main.yml | 6 +++-- tests/post/features/restore.feature | 14 +++++++++++ tests/post/steps/test_restore.py | 36 +++++++++++++++++++++++++++++ tox.ini | 1 + 4 files changed, 55 insertions(+), 2 deletions(-) create mode 100644 tests/post/features/restore.feature create mode 100644 tests/post/steps/test_restore.py diff --git a/eve/main.yml b/eve/main.yml index 5839e09cf3..6b8bfb59cb 100644 --- a/eve/main.yml +++ b/eve/main.yml @@ -951,7 +951,8 @@ stages: name: Run slow tests locally env: <<: *_env_local_tests - PYTEST_FILTERS: "post and ci and not multinode and slow" + PYTEST_FILTERS: >- + post and ci and not multinode and slow and not restore - ShellCommand: name: Cypress test command: bash cypress.sh @@ -1626,7 +1627,8 @@ stages: env: <<: *_env_bastion_tests PYTEST_ARGS: "--suppress-no-test-exit-code" - PYTEST_FILTERS: "post and ci and slow and not bootstrap" + PYTEST_FILTERS: >- + post and ci and slow and not bootstrap and not restore - ShellCommand: <<: *generate_report_over_ssh env: diff --git a/tests/post/features/restore.feature b/tests/post/features/restore.feature new file mode 100644 index 0000000000..bf075c1e01 --- /dev/null +++ b/tests/post/features/restore.feature @@ -0,0 +1,14 @@ +@post @ci @local @slow @restore +Feature: Restore + Scenario: Restore the bootstrap node + When we run the restore + Then the Kubernetes API is available + And we have 1 running pod labeled 'component=kube-controller-manager' in namespace 'kube-system' on node 'bootstrap' + And we have 1 running pod labeled 'component=kube-scheduler' in namespace 'kube-system' on node 'bootstrap' + And we have 1 running pod labeled 'component=kube-apiserver' in namespace 'kube-system' on node 'bootstrap' + And we have 1 running pod labeled 'k8s-app=calico-node' in namespace 'kube-system' on node 'bootstrap' + And we have 1 running pod labeled 'k8s-app=kube-proxy' in namespace 'kube-system' on node 'bootstrap' + And we have 1 running pod labeled 'component=etcd' in namespace 'kube-system' on node 'bootstrap' + And we have 1 running pod labeled 'app=salt-master' in namespace 'kube-system' on node 'bootstrap' + And we have 1 running pod labeled 'app=repositories' in namespace 'kube-system' on node 'bootstrap' + And node "bootstrap" is a member of etcd cluster diff --git a/tests/post/steps/test_restore.py b/tests/post/steps/test_restore.py new file mode 100644 index 0000000000..a769fa8ca7 --- /dev/null +++ b/tests/post/steps/test_restore.py @@ -0,0 +1,36 @@ +import os +import testinfra + +from pytest_bdd import scenario, when + +from tests import utils + + +# Scenarios +@scenario('../features/restore.feature', 'Restore the bootstrap node') +def test_restore(host): + pass + + +# When +@when('we run the restore') +def run_restore(request, host, ssh_config): + iso_root = request.config.getoption("--iso-root") + + backup_archive = os.environ.get('BOOTSTRAP_BACKUP_ARCHIVE') + assert backup_archive, \ + "No BOOTSTRAP_BACKUP_ARCHIVE environment variable defined" + + apiserver_node_ip = utils.get_grain( + testinfra.get_host('node1', ssh_config=ssh_config), + 'metalk8s:control_plane_ip' + ) + + with host.sudo(): + res = host.run( + "%s/restore.sh --backup-file %s --apiserver-node-ip %s", + str(iso_root), + backup_archive, + apiserver_node_ip, + ) + assert res.rc == 0, res.stdout diff --git a/tox.ini b/tox.ini index 15daf452f8..90076b866b 100644 --- a/tox.ini +++ b/tox.ini @@ -134,6 +134,7 @@ markers = volume: tag a BDD feature as related to Volume management bootstrap: tag a BDD feature as related to bootstrap 
solution: tag a BDD feature as related to solution + restore: tag a BDD feature as related to bootstrap node recovery filterwarnings = ignore:encode_point has been deprecated on EllipticCurvePublicNumbers and will be removed in a future version. Please use EllipticCurvePublicKey.public_bytes to obtain both compressed and uncompressed point encoding.:UserWarning ignore:Support for unsafe construction of public numbers from encoded data will be removed in a future version. Please use EllipticCurvePublicKey.from_encoded_point:UserWarning From 0293ddd968353052faa246d3d79f243a2fab7e73 Mon Sep 17 00:00:00 2001 From: Teddy Andrieux Date: Mon, 18 May 2020 15:56:32 +0200 Subject: [PATCH 12/12] ci: add bootstrap restore tests in CI Launch bootstrap restore tests in CI during post-merge stages, this CI step first need a full MetalK8s cluster with at least 3-node etcd cluster Refs: #1687 --- eve/main.yml | 232 ++++++++++++++++++++++++++--- eve/testrail_description_file.yaml | 14 ++ 2 files changed, 224 insertions(+), 22 deletions(-) diff --git a/eve/main.yml b/eve/main.yml index 6b8bfb59cb..773c745656 100644 --- a/eve/main.yml +++ b/eve/main.yml @@ -537,6 +537,7 @@ stages: - single-node-upgrade-centos - single-node-downgrade-centos - single-node-patch-version + - bootstrap-restore waitForFinish: True - ShellCommand: *add_final_status_artifact_success - Upload: *upload_final_status_artifact @@ -1459,7 +1460,7 @@ stages: CLASS_NAME: multi node.centos7 TEST_NAME: 1 bootstrap 1 master,etcd simultaneous_builds: 20 - worker: + worker: &multi_node_worker <<: *single_node_worker flavor: m1.medium path: eve/workers/openstack-multiple-nodes @@ -1490,19 +1491,19 @@ stages: haltOnFailure: true - ShellCommand: *terraform_install - ShellCommand: *terraform_install_check - - ShellCommand: + - ShellCommand: &terraform_init_multi_node <<: *terraform_init workdir: build/eve/workers/openstack-multiple-nodes/terraform/ - - ShellCommand: + - ShellCommand: &terraform_validate_multi_node <<: *terraform_validate workdir: build/eve/workers/openstack-multiple-nodes/terraform/ - - ShellCommand: + - ShellCommand: &terraform_apply_multi_node <<: *terraform_apply workdir: build/eve/workers/openstack-multiple-nodes/terraform/ env: <<: *_env_terraform TF_VAR_nodes_count: "1" - - ShellCommand: + - ShellCommand: &check_ssh_config_bootstrap name: Check SSH config for bootstrap node command: |- if [ ! -f ssh_config ]; then @@ -1519,7 +1520,7 @@ stages: <<: *copy_bastion_pub_key_ssh env: HOSTS_LIST: "bootstrap node1" - - ShellCommand: + - ShellCommand: ©_bastion_priv_key_to_bootstrap # FIXME: find a cleaner way with Terraform. 
name: Send bastion private key to bootstrap command: > @@ -1527,27 +1528,27 @@ stages: scp -F ssh_config -3 bastion:.ssh/bastion bootstrap:./ && ssh -F ssh_config bootstrap "sudo cp bastion /etc/metalk8s/pki/" workdir: build/eve/workers/openstack-multiple-nodes/terraform/ - - ShellCommand: + - ShellCommand: ©_iso_to_bootstrap name: Copy ISO to bootstrap node command: > scp -F ssh_config ../../../../metalk8s.iso bootstrap: workdir: build/eve/workers/openstack-multiple-nodes/terraform/ haltOnFailure: true - - ShellCommand: + - ShellCommand: &create_mountpoint_on_bootstrap name: Create mountpoint in bootstrap node command: > ssh -F ssh_config bootstrap sudo mkdir -p /var/tmp/metalk8s workdir: build/eve/workers/openstack-multiple-nodes/terraform/ haltOnFailure: true - - ShellCommand: + - ShellCommand: &mount_iso_on_bootstrap name: Mount ISO image in bootstrap node command: > ssh -F ssh_config bootstrap sudo mount -o loop metalk8s.iso /var/tmp/metalk8s workdir: build/eve/workers/openstack-multiple-nodes/terraform/ haltOnFailure: true - - ShellCommand: + - ShellCommand: &bootstrap_on_bootstrap name: Start the bootstrap process in bootstrap node command: > ssh -F ssh_config bootstrap @@ -1562,7 +1563,7 @@ stages: sudo yum install -y kubectl --disablerepo=* --enablerepo=metalk8s-kubernetes workdir: build/eve/workers/openstack-multiple-nodes/terraform/ - - ShellCommand: + - ShellCommand: &enable_ipip_ssh name: Enable IPIP env: SSH_CONFIG: >- @@ -1570,7 +1571,7 @@ stages: command: > ssh -F $SSH_CONFIG bootstrap 'bash /home/centos/scripts/enable_ipip.sh' - - ShellCommand: + - ShellCommand: ©_test_source_to_bastion # FIXME: should find a cleaner way to do this (git clone may be # cumbersome, unless we assume the repo is public and don't use # authentication) @@ -1581,7 +1582,7 @@ stages: command: > tar cfp - tox.ini VERSION tests/ buildchain/buildchain/versions.py | ssh -F $SSH_CONFIG bastion '(mkdir metalk8s; cd "$_"; tar xf -)' - - ShellCommand: + - ShellCommand: &wait_kube_system_pods_running_multi_node <<: *wait_pods_running_ssh name: Wait for "kube-system" Pods to be in running state env: @@ -1595,7 +1596,7 @@ stages: env: <<: *_env_bastion_tests PYTEST_FILTERS: "install and ci and multinodes and not node2" - - ShellCommand: + - ShellCommand: &provision_volumes_on_node1 name: Provision Prometheus and AlertManager storage env: SSH_CONFIG: >- @@ -1609,19 +1610,19 @@ stages: NODE_NAME=node1 /tmp/create-volumes.sh haltOnFailure: true - - ShellCommand: + - ShellCommand: &wait_pods_running_multi_node <<: *wait_pods_running_ssh env: <<: *_env_wait_pods_running_ssh SSH_CONFIG: >- eve/workers/openstack-multiple-nodes/terraform/ssh_config - - ShellCommand: + - ShellCommand: &multi_node_fast_tests <<: *bastion_tests name: Run fast tests on the bastion env: <<: *_env_bastion_tests PYTEST_FILTERS: "post and ci and not slow" - - ShellCommand: + - ShellCommand: &multi_node_slow_tests <<: *bastion_tests name: Run slow tests on the bastion env: @@ -1629,16 +1630,16 @@ stages: PYTEST_ARGS: "--suppress-no-test-exit-code" PYTEST_FILTERS: >- post and ci and slow and not bootstrap and not restore - - ShellCommand: + - ShellCommand: &generate_report_multi_node <<: *generate_report_over_ssh - env: + env: &_env_generate_report_multi_node <<: *_env_generate_report_over_ssh HOSTS_LIST: "bootstrap node1" SSH_CONFIG: >- eve/workers/openstack-multiple-nodes/terraform/ssh_config - - ShellCommand: + - ShellCommand: &collect_report_multi_node <<: *collect_report_over_ssh - env: + env: &_env_collect_report_multi_node <<: 
*_env_collect_report_over_ssh HOSTS_LIST: "bootstrap node1" SSH_CONFIG: >- @@ -1658,6 +1659,193 @@ stages: env: STEP_NAME: multiple-nodes-centos DURATION: "14400" - - ShellCommand: + - ShellCommand: &terraform_destroy_multi_node <<: *terraform_destroy workdir: build/eve/workers/openstack-multiple-nodes/terraform/ + + bootstrap-restore: + _metalk8s_internal_info: + junit_info: &_bootstrap_restore_junit_info + TEST_SUITE: install + CLASS_NAME: multi node.centos7 + TEST_NAME: bootstrap restore + simultaneous_builds: 20 + worker: *multi_node_worker + steps: + - Git: *git_pull + - ShellCommand: + <<: *add_final_status_artifact_failed + env: + <<: *_env_final_status_artifact_failed + <<: *_bootstrap_restore_junit_info + STEP_NAME: bootstrap-restore + - ShellCommand: *setup_cache + - ShellCommand: *ssh_ip_setup + - ShellCommand: *retrieve_iso + - ShellCommand: *retrieve_iso_checksum + - ShellCommand: *check_iso_checksum + - ShellCommand: *terraform_install + - ShellCommand: *terraform_install_check + - ShellCommand: *terraform_init_multi_node + - ShellCommand: *terraform_validate_multi_node + - ShellCommand: + <<: *terraform_apply_multi_node + env: &_env_terraform_bootstrap_restore + <<: *_env_terraform + TF_VAR_nodes_count: "2" + - ShellCommand: *check_ssh_config_bootstrap + - ShellCommand: + <<: *copy_bastion_pub_key_ssh + env: + HOSTS_LIST: "bootstrap node1 node2" + - ShellCommand: *copy_bastion_priv_key_to_bootstrap + - ShellCommand: *copy_iso_to_bootstrap + - ShellCommand: *create_mountpoint_on_bootstrap + - ShellCommand: *mount_iso_on_bootstrap + - ShellCommand: *bootstrap_on_bootstrap + - ShellCommand: *enable_ipip_ssh + - ShellCommand: *copy_test_source_to_bastion + - ShellCommand: *wait_kube_system_pods_running_multi_node + - ShellCommand: + <<: *bastion_tests + name: Run installation scenario on the bastion + env: + <<: *_env_bastion_tests + PYTEST_FILTERS: "install and ci and multinodes" + - ShellCommand: *provision_volumes_on_node1 + - ShellCommand: *wait_pods_running_multi_node + - ShellCommand: *multi_node_fast_tests + - ShellCommand: *multi_node_slow_tests + - SetPropertyFromCommand: + name: Set bootstrap backup archive property + property: bootstrap_backup_archive + command: > + ssh -F ssh_config bootstrap + "sudo find /var/lib/metalk8s -name 'backup_*.tar.gz' + -printf '%f\n' | sort | tail -n1" + workdir: build/eve/workers/openstack-multiple-nodes/terraform/ + haltOnFailure: true + - ShellCommand: + name: Retrieve the backup archive from the bootstrap node + command: > + scp -F ssh_config + bootstrap:"/var/lib/metalk8s/%(prop:bootstrap_backup_archive)s" + /tmp + workdir: build/eve/workers/openstack-multiple-nodes/terraform/ + haltOnFailure: true + - ShellCommand: + name: Destroy the bootstrap node + env: *_env_terraform_bootstrap_restore + command: > + terraform destroy -auto-approve + -target openstack_compute_instance_v2.bootstrap + workdir: build/eve/workers/openstack-multiple-nodes/terraform/ + haltOnFailure: true + - SetPropertyFromCommand: + name: Set node1 etcd container id + property: node1_etcd_container_id + command: > + ssh -F ssh_config node1 + sudo crictl ps -q --label io.kubernetes.pod.namespace=kube-system + --label io.kubernetes.container.name=etcd --state Running + workdir: build/eve/workers/openstack-multiple-nodes/terraform/ + haltOnFailure: true + - SetPropertyFromCommand: + name: Set boostrap node etcd member id + property: bootstrap_etcd_member_id + command: > + ssh -F ssh_config node1 + sudo crictl exec -i "%(prop:node1_etcd_container_id)s" sh -c \" + 
ETCDCTL_API=3 etcdctl --endpoints https://127.0.0.1:2379 + --cert /etc/kubernetes/pki/etcd/server.crt + --key /etc/kubernetes/pki/etcd/server.key + --cacert /etc/kubernetes/pki/etcd/ca.crt + member list\" | awk -F ', ' '$3 ~ "bootstrap" { print $1 }' + workdir: build/eve/workers/openstack-multiple-nodes/terraform/ + haltOnFailure: true + - ShellCommand: + name: Remove bootstrap node from etcd members + command: > + ssh -F ssh_config node1 + sudo crictl exec -i "%(prop:node1_etcd_container_id)s" sh -c \" + ETCDCTL_API=3 etcdctl --endpoints https://127.0.0.1:2379 + --cert /etc/kubernetes/pki/etcd/server.crt + --key /etc/kubernetes/pki/etcd/server.key + --cacert /etc/kubernetes/pki/etcd/ca.crt + member remove %(prop:bootstrap_etcd_member_id)s\" + workdir: build/eve/workers/openstack-multiple-nodes/terraform/ + haltOnFailure: true + - ShellCommand: + name: Remove bootstrap node object + command: > + ssh -F ssh_config node1 + sudo kubectl --kubeconfig=/etc/kubernetes/admin.conf delete + node --selector="node-role.kubernetes.io/bootstrap" + workdir: build/eve/workers/openstack-multiple-nodes/terraform/ + haltOnFailure: true + - ShellCommand: + name: Create a new bootstrap node + env: *_env_terraform_bootstrap_restore + command: + terraform apply -auto-approve -refresh + -target openstack_compute_instance_v2.bootstrap + -target null_resource.ssh_config + workdir: build/eve/workers/openstack-multiple-nodes/terraform/ + haltOnFailure: true + - ShellCommand: *copy_bastion_pub_key_ssh + - ShellCommand: + name: Copy the backup archive to the new bootstrap node + command: > + scp -F ssh_config + "/tmp/%(prop:bootstrap_backup_archive)s" bootstrap:/tmp + workdir: build/eve/workers/openstack-multiple-nodes/terraform/ + haltOnFailure: true + - ShellCommand: + <<: *copy_iso_to_bootstrap + name: Copy ISO to the new bootstrap node + - ShellCommand: + <<: *create_mountpoint_on_bootstrap + name: Create mountpoint on the new bootstrap node + - ShellCommand: + <<: *mount_iso_on_bootstrap + name: Mount ISO image on the new bootstrap node + - ShellCommand: + <<: *bastion_tests + name: Run restore tests on the bastion + # NOTE: Increase timeout as restore may take some time + # Since we use pytest we do not have any output until restore totaly + # finished + timeout: 1800 + env: + <<: *_env_bastion_tests + PYTEST_FILTERS: "restore" + BOOTSTRAP_BACKUP_ARCHIVE: "/tmp/%(prop:bootstrap_backup_archive)s" + - ShellCommand: *wait_pods_running_multi_node + - ShellCommand: *multi_node_fast_tests + - ShellCommand: *multi_node_slow_tests + - ShellCommand: + <<: *generate_report_multi_node + env: + <<: *_env_generate_report_multi_node + HOSTS_LIST: "bootstrap node1 node2" + - ShellCommand: + <<: *collect_report_multi_node + env: + <<: *_env_collect_report_multi_node + HOSTS_LIST: "bootstrap node1 node2" + STEP_NAME: bootstrap-restore + - Upload: *upload_report_artifacts + - ShellCommand: + <<: *add_final_status_artifact_success + env: + <<: *_env_final_status_artifact_success + <<: *_bootstrap_restore_junit_info + STEP_NAME: bootstrap-restore + - Upload: *upload_final_status_artifact + - ShellCommand: + <<: *wait_debug + timeout: 14400 + env: + STEP_NAME: bootstrap-restore + DURATION: "14400" + - ShellCommand: *terraform_destroy_multi_node diff --git a/eve/testrail_description_file.yaml b/eve/testrail_description_file.yaml index ef3a5b0a06..a52ec98bcb 100644 --- a/eve/testrail_description_file.yaml +++ b/eve/testrail_description_file.yaml @@ -33,3 +33,17 @@ Downgrade: description: >- Downgrade tests sections: 
*lifecycle_sections + +Install: + description: >- + Installation tests + sections: + Multi Node: + description: >- + Multiple nodes test environment + sub_sections: + CentOs7: + description: >- + CentOs 7 tests + cases: + Bootstrap restore: {}
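The new bootstrap-restore stage boils down to roughly the following sequence
(condensed and illustrative: it reuses the hosts, paths and commands from the
steps above, with build-property plumbing, etcdctl TLS flags and error
handling omitted, and <...> marking values only known at run time):

    # Locate and fetch the most recent backup archive from the old bootstrap
    ssh bootstrap "sudo find /var/lib/metalk8s -name 'backup_*.tar.gz' | sort | tail -n1"
    scp bootstrap:/var/lib/metalk8s/backup_<timestamp>.tar.gz /tmp/

    # Simulate the loss of the bootstrap node
    terraform destroy -auto-approve -target openstack_compute_instance_v2.bootstrap
    # From node1: drop the dead etcd member and the Kubernetes Node object
    etcdctl member remove <bootstrap-member-id>
    kubectl delete node --selector="node-role.kubernetes.io/bootstrap"

    # Re-provision the machine, push the archive and the ISO, then restore
    terraform apply -auto-approve -target openstack_compute_instance_v2.bootstrap
    /var/tmp/metalk8s/restore.sh --backup-file /tmp/backup_<timestamp>.tar.gz \
        --apiserver-node-ip <node1-control-plane-ip>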