From 6ca5e0ed4deb2398034c28c85494489f728957e8 Mon Sep 17 00:00:00 2001 From: howard Date: Thu, 5 Sep 2019 17:26:39 +0800 Subject: [PATCH] local-up-cluster kube-proxy terminated error When using hack/local-up-cluster.sh to deploy a local cluster, it fails with the messages "kube-proxy terminated unexpectedly" and "Failed to retrieve node info: nodes "127.0.0.1" not found" in kube-proxy.log. The root cause of this error is the wrong boot order of two Kubernetes services in local-up-cluster.sh: kube-proxy and the kubelet daemon. When kube-proxy starts, it checks node information, and this information is collected by the kubelet daemon. However, in the shell script, the kube-proxy service starts before the kubelet daemon. This patch changes the boot order of the kubelet daemon and kube-proxy, and checks that the node information is ready before kube-proxy starts. Signed-off-by: Howard Zhang --- hack/lib/util.sh | 18 ++++++++++++++++++ hack/local-up-cluster.sh | 25 ++++++++++++++++++++++--- 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/hack/lib/util.sh b/hack/lib/util.sh index 9c1c2093a1101..8fd29213de599 100755 --- a/hack/lib/util.sh +++ b/hack/lib/util.sh @@ -64,6 +64,24 @@ kube::util::wait_for_url() { return 1 } +# Example: kube::util::wait_for_success 120 5 "kubectl get nodes|grep localhost" +# arguments: wait time, sleep time, shell command +# returns 0 if the shell command succeeds within the wait time, 1 otherwise. 
+kube::util::wait_for_success(){ + local wait_time="$1" + local sleep_time="$2" + local cmd="$3" + while [ "$wait_time" -gt 0 ]; do + if eval "$cmd"; then + return 0 + else + sleep "$sleep_time" + wait_time=$((wait_time-sleep_time)) + fi + done + return 1 +} + # Example: kube::util::trap_add 'echo "in trap DEBUG"' DEBUG # See: http://stackoverflow.com/questions/3338030/multiple-bash-traps-for-the-same-signal kube::util::trap_add() { diff --git a/hack/local-up-cluster.sh b/hack/local-up-cluster.sh index ca641cdbe52f6..6755c5af18fdd 100755 --- a/hack/local-up-cluster.sh +++ b/hack/local-up-cluster.sh @@ -670,6 +670,19 @@ function start_cloud_controller_manager { export CLOUD_CTLRMGR_PID=$! } +function wait_node_ready(){ + # check the nodes information after kubelet daemon start + local nodes_stats="${KUBECTL} --kubeconfig '${CERT_DIR}/admin.kubeconfig' get nodes" + local node_name=$KUBELET_HOST + local system_node_wait_time=30 + local interval_time=2 + kube::util::wait_for_success "$system_node_wait_time" "$interval_time" "$nodes_stats | grep $node_name" + if [ $? 
== "1" ]; then + echo "time out on waiting $node_name info" + exit 1 + fi +} + function start_kubelet { KUBELET_LOG=${LOG_DIR}/kubelet.log mkdir -p "${POD_MANIFEST_PATH}" &>/dev/null || sudo mkdir -p "${POD_MANIFEST_PATH}" @@ -784,6 +797,10 @@ function start_kubelet { function start_kubeproxy { PROXY_LOG=${LOG_DIR}/kube-proxy.log + # wait for kubelet collect node information + echo "wait kubelet ready" + wait_node_ready + cat < /tmp/kube-proxy.yaml apiVersion: kubeproxy.config.k8s.io/v1alpha1 kind: KubeProxyConfiguration @@ -997,9 +1014,6 @@ if [[ "${START_MODE}" != "kubeletonly" ]]; then if [[ "${EXTERNAL_CLOUD_PROVIDER:-}" == "true" ]]; then start_cloud_controller_manager fi - if [[ "${START_MODE}" != "nokubeproxy" ]]; then - start_kubeproxy - fi start_kubescheduler start_kubedns if [[ "${ENABLE_NODELOCAL_DNS:-}" == "true" ]]; then @@ -1025,6 +1039,11 @@ if [[ "${START_MODE}" != "nokubelet" ]]; then esac fi +if [[ "${START_MODE}" != "kubeletonly" ]]; then + if [[ "${START_MODE}" != "nokubeproxy" ]]; then + start_kubeproxy + fi +fi if [[ -n "${PSP_ADMISSION}" && "${AUTHORIZATION_MODE}" = *RBAC* ]]; then create_psp_policy fi