From ab04a33cc5c0dfeb44343e84d322e3f5e9857295 Mon Sep 17 00:00:00 2001
From: Evan Farrar
Date: Mon, 24 Apr 2023 16:31:25 +0000
Subject: [PATCH] Allow BBR post unlock to proceed if CC api is remotely available

Previously the unlock health check failed if any single VM was unhealthy
because it used the healthz endpoint over a local route
---
 .../templates/post-backup-unlock.sh.erb       | 10 ++++++--
 src/capi_utils/monit_utils.sh                 | 24 +++++++++++++++++++
 2 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/jobs/cloud_controller_ng/templates/post-backup-unlock.sh.erb b/jobs/cloud_controller_ng/templates/post-backup-unlock.sh.erb
index 0890a4b707..4a98e62014 100755
--- a/jobs/cloud_controller_ng/templates/post-backup-unlock.sh.erb
+++ b/jobs/cloud_controller_ng/templates/post-backup-unlock.sh.erb
@@ -17,6 +17,12 @@ source /var/vcap/packages/capi_utils/syslog_utils.sh
   <% (1..(p("cc.jobs.local.number_of_workers"))).each do |index| %>
     monit_start_job cloud_controller_worker_local_<%= index %>
   <% end %>
-  wait_for_server_to_become_healthy <%= "localhost:#{p("cc.external_port")}/healthz" %> <%= p("cc.post_bbr_healthcheck_timeout_in_seconds") %>
-  sleep 30
+  # Wait for the local healthz endpoint first. If this VM never becomes
+  # healthy, fall back to the externally routed API so that a single
+  # unhealthy VM does not fail the unlock. The call is tested directly by
+  # `if` so a failed check neither trips errexit nor relies on reading $?
+  # after `set -e` (which always reports 0).
+  if ! wait_for_server_to_become_healthy_without_setminuse <%= "localhost:#{p("cc.external_port")}/healthz" %> <%= p("cc.post_bbr_healthcheck_timeout_in_seconds") %>; then
+    wait_for_server_to_become_healthy <%= "#{p("cc.external_protocol")}://#{p("cc.external_host")}.#{p("system_domain")}/info" %> <%= p("cc.post_bbr_healthcheck_timeout_in_seconds") %>
+  fi
 <% end %>
diff --git a/src/capi_utils/monit_utils.sh b/src/capi_utils/monit_utils.sh
index 9e941d58c2..7550b75822 100755
--- a/src/capi_utils/monit_utils.sh
+++ b/src/capi_utils/monit_utils.sh
@@ -52,6 +52,30 @@ function wait_for_server_to_become_healthy() {
   return 1
 }
 
+# wait_for_server_to_become_healthy_without_setminuse
+#
+# @param url
+# @param timeout
+#
+# Probes url with curl up to timeout times, sleeping one second between
+# attempts. Returns 0 on the first successful probe; otherwise prints a
+# failure message and returns 1. Never changes the caller's errexit setting.
+function wait_for_server_to_become_healthy_without_setminuse() {
+  local url=$1
+  local timeout=$2
+  for _ in $(seq "${timeout}"); do
+    # Testing curl directly in `if` keeps a failed probe from aborting the
+    # script when the caller runs with errexit enabled.
+    if curl -k -f --connect-timeout 1 "${url}" > /dev/null 2>&1; then
+      return 0
+    fi
+    sleep 1
+  done
+
+  echo "Endpoint ${url} failed to become healthy after ${timeout} seconds"
+  return 1
+}
+
 # monit_monitor_job
 #
 # @param job_name