From 6ca5e0ed4deb2398034c28c85494489f728957e8 Mon Sep 17 00:00:00 2001
From: howard <howard.zhang@arm.com>
Date: Thu, 5 Sep 2019 17:26:39 +0800
Subject: [PATCH] local-up-cluster kube-proxy terminated error

When using hack/local-up-cluster.sh deploy local cluster, it
failed with following message "kube-proxy terminated unexpectedly"
and "Failed to retrieve node info: nodes "127.0.0.1" not found" in
kube-proxy.log.

The root reason for this error is miss boot order of kubernetes
services in local-up-cluster.sh, kube-proxy and kubectl daemon.

When starting kube-proxy, it would check node information. And
these information are collected by kubelet daemon. However, in
the shell script, kube-proxy service start before kubelet daemon.

This patch changed the boot order of kubelet daemon and kube-proxy
and check if node stats ready for kube-proxy start.

Signed-off-by: Howard Zhang <howard.zhang@arm.com>
---
 hack/lib/util.sh         | 18 ++++++++++++++++++
 hack/local-up-cluster.sh | 25 ++++++++++++++++++++++---
 2 files changed, 40 insertions(+), 3 deletions(-)
diff --git a/hack/lib/util.sh b/hack/lib/util.sh
index 9c1c2093a1101..8fd29213de599 100755
--- a/hack/lib/util.sh
+++ b/hack/lib/util.sh
@@ -64,6 +64,24 @@ kube::util::wait_for_url() {
   return 1
 }
 
+# Example:  kube::util::wait_for_success 120 5 "kubectl get nodes|grep localhost"
+# arguments: wait time, sleep time, shell command
+# returns 0 if the shell command get output, 1 otherwise.
+kube::util::wait_for_success(){
+  local wait_time="$1"
+  local sleep_time="$2"
+  local cmd="$3"
+  while [ "$wait_time" -gt 0 ]; do
+    if eval "$cmd"; then
+      return 0
+    else
+      sleep "$sleep_time"
+      wait_time=$((wait_time-sleep_time))
+    fi
+  done
+  return 1
+}
+
 # Example:  kube::util::trap_add 'echo "in trap DEBUG"' DEBUG
 # See: http://stackoverflow.com/questions/3338030/multiple-bash-traps-for-the-same-signal
 kube::util::trap_add() {
diff --git a/hack/local-up-cluster.sh b/hack/local-up-cluster.sh
index ca641cdbe52f6..6755c5af18fdd 100755
--- a/hack/local-up-cluster.sh
+++ b/hack/local-up-cluster.sh
@@ -670,6 +670,19 @@ function start_cloud_controller_manager {
     export CLOUD_CTLRMGR_PID=$!
 }
 
+function wait_node_ready(){
+  # check the nodes information after kubelet daemon start
+  local nodes_stats="${KUBECTL} --kubeconfig '${CERT_DIR}/admin.kubeconfig' get nodes"
+  local node_name=$KUBELET_HOST
+  local system_node_wait_time=30
+  local interval_time=2
+  kube::util::wait_for_success "$system_node_wait_time" "$interval_time" "$nodes_stats | grep $node_name"
+  if [ $? == "1" ]; then
+    echo "time out on waiting $node_name info"
+    exit 1
+  fi
+}
+
 function start_kubelet {
     KUBELET_LOG=${LOG_DIR}/kubelet.log
     mkdir -p "${POD_MANIFEST_PATH}" &>/dev/null || sudo mkdir -p "${POD_MANIFEST_PATH}"
@@ -784,6 +797,10 @@ function start_kubelet {
 function start_kubeproxy {
     PROXY_LOG=${LOG_DIR}/kube-proxy.log
 
+    # wait for kubelet collect node information
+    echo "wait kubelet ready"
+    wait_node_ready
+
     cat <<EOF > /tmp/kube-proxy.yaml
 apiVersion: kubeproxy.config.k8s.io/v1alpha1
 kind: KubeProxyConfiguration
@@ -997,9 +1014,6 @@ if [[ "${START_MODE}" != "kubeletonly" ]]; then
   if [[ "${EXTERNAL_CLOUD_PROVIDER:-}" == "true" ]]; then
     start_cloud_controller_manager
   fi
-  if [[ "${START_MODE}" != "nokubeproxy" ]]; then
-    start_kubeproxy
-  fi
   start_kubescheduler
   start_kubedns
   if [[ "${ENABLE_NODELOCAL_DNS:-}" == "true" ]]; then
@@ -1025,6 +1039,11 @@ if [[ "${START_MODE}" != "nokubelet" ]]; then
     esac
 fi
 
+if [[ "${START_MODE}" != "kubeletonly" ]]; then
+  if [[ "${START_MODE}" != "nokubeproxy" ]]; then
+    start_kubeproxy
+  fi
+fi
 if [[ -n "${PSP_ADMISSION}" && "${AUTHORIZATION_MODE}" = *RBAC* ]]; then
   create_psp_policy
 fi