Add HEALTHCHECK support #421

Open · wants to merge 20 commits into main
13 changes: 13 additions & 0 deletions tailscale/DOCS.md
@@ -347,6 +347,19 @@ CGNAT networks). You can test connections with `tailscale ping

When not set, an automatically selected port is used by default.

## Healthcheck

Tailscale is quite resilient and can recover from nearly any network change. If
it fails to recover, the add-on reports itself as unhealthy. Home Assistant
checks the add-on's health every 30 seconds, and if the add-on reports itself
unhealthy 3 times in a row, it is restarted.

The add-on reports itself as unhealthy when:

- It was once online, but has been offline for longer than 5 minutes.

- It can't get online for longer than 1 hour after a (re)start.
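
To see what the add-on is currently reporting, you can query Docker's recorded
health state from the host. This is only an illustrative sketch: the container
name below is an assumption and may differ on your installation.

```bash
# Hypothetical container name; check `docker ps` for the actual add-on slug
docker inspect --format '{{.State.Health.Status}}' addon_core_tailscale
```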

## Changelog & Releases

This repository keeps a change log using [GitHub's releases][releases]
3 changes: 3 additions & 0 deletions tailscale/Dockerfile
@@ -35,6 +35,9 @@ RUN \
# Copy root filesystem
COPY rootfs /

HEALTHCHECK \
CMD healthcheck

# S6 Overlay stage 2 hook
ENV S6_STAGE2_HOOK=/etc/s6-overlay/scripts/stage2_hook.sh

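For context, and not part of the diff itself: a `HEALTHCHECK` instruction
without flags uses Docker's defaults of a 30-second interval, 30-second
timeout, and 3 retries, which lines up with the 30-second / three-failure
behaviour described in DOCS.md above. The healthcheck command's exit status
decides the result: 0 means healthy, 1 means unhealthy. A rough manual test,
assuming a hypothetical container name:

```bash
# Run the same healthcheck script by hand and show its exit status
# (the container name is an assumption; the real add-on slug may differ)
docker exec addon_core_tailscale healthcheck; echo "exit status: $?"
```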
@@ -90,7 +90,7 @@ unset IFS
# Wait for the network to be available and logged in
while ! bashio::fs.socket_exists "/var/run/tailscale/tailscaled.sock" || \
! /opt/tailscale status --json --peers=false --self=false \
-| jq --exit-status '.BackendState == "Running" or .BackendState == "NeedsLogin"' > /dev/null;
+| jq --exit-status '.BackendState == "Running" or .BackendState == "NeedsLogin" or .BackendState == "Stopped"' > /dev/null;
do
sleep 2
done
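A standalone illustration of the widened `jq` gate above; the sample JSON is
made up, but the filter is the one from the diff:

```bash
# "Stopped" now satisfies the startup gate, so the run script no longer blocks
# on that state; the new healthcheck script reports it as unhealthy instead.
echo '{"BackendState": "Stopped"}' \
  | jq --exit-status '.BackendState == "Running" or .BackendState == "NeedsLogin" or .BackendState == "Stopped"' \
  && echo "gate passes"
```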
60 changes: 60 additions & 0 deletions tailscale/rootfs/usr/bin/healthcheck
@@ -0,0 +1,60 @@
#!/command/with-contenv bashio
# shellcheck shell=bash

# Plain (non-JSON) tailscale status returns an error whenever the status is not Running or Starting, so e.g. NeedsLogin and NeedsMachineAuth would make it unhealthy
# The .Health JSON filter returns any problems, so even temporary health problems would make it unhealthy
# This script treats the following situations as unhealthy:
# - .BackendState == "Stopped"
# - it was once online but has been offline for more than HEALTHCHECK_OFFLINE_TIMEOUT seconds
# - it never got online for more than HEALTHCHECK_RESTART_TIMEOUT seconds
# This can handle internal TS bugs like https://github.com/tailscale/tailscale/issues/12021, where TS fails to recover after traffic is rerouted from the normal WAN to a failover WAN

# Redirect the healthcheck's output to the add-on log
# (/proc/1/fd/1 is PID 1's stdout, i.e. the container log)
exec &> /proc/1/fd/1

readonly HEALTHCHECK_OFFLINE_TIMEOUT=300 # 5 minutes
readonly HEALTHCHECK_RESTART_TIMEOUT=3600 # 1 hour

declare status_json
declare backend_state is_self_online
# STARTED_TIMESTAMP is in contenv at /var/run/s6/container_environment
# LAST_ONLINE_TIMESTAMP is in contenv at /var/run/s6/container_environment
# LAST_REPORTED_HEALTH_STATE is in contenv at /var/run/s6/container_environment
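# Note: with-contenv exports every file under /var/run/s6/container_environment
# as an environment variable, so values written by one healthcheck run are
# visible to the next run; that is how state survives between invocations.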

if ! bashio::var.has_value "${STARTED_TIMESTAMP-}"
then
STARTED_TIMESTAMP=$(date +"%s")
printf "${STARTED_TIMESTAMP}" > /var/run/s6/container_environment/STARTED_TIMESTAMP
fi

status_json=$(/opt/tailscale status --json --self=true --peers=false)
backend_state=$(jq -r '.BackendState' <<< "${status_json}")
is_self_online=$(jq -r '.Self.Online' <<< "${status_json}")

if bashio::var.equals "${backend_state}" "Running" && bashio::var.true "${is_self_online}"
then
LAST_ONLINE_TIMESTAMP=$(date +"%s")
printf "${LAST_ONLINE_TIMESTAMP}" > /var/run/s6/container_environment/LAST_ONLINE_TIMESTAMP
fi

if [[ "${backend_state}" == "Stopped" ]] || \
(bashio::var.has_value "${LAST_ONLINE_TIMESTAMP-}" && \
(( $(date +"%s") - ${LAST_ONLINE_TIMESTAMP} > ${HEALTHCHECK_OFFLINE_TIMEOUT} )) ) || \
(! bashio::var.has_value "${LAST_ONLINE_TIMESTAMP-}" && \
(( $(date +"%s") - ${STARTED_TIMESTAMP} > ${HEALTHCHECK_RESTART_TIMEOUT} )) )
then
# Unhealthy
if ! bashio::var.equals "${LAST_REPORTED_HEALTH_STATE-}" "UNHEALTHY"; then
printf "UNHEALTHY" > /var/run/s6/container_environment/LAST_REPORTED_HEALTH_STATE
fi
# Log it always
bashio::log.fatal "Add-on is unhealthy"
bashio::exit.nok
else
# Healthy
if ! bashio::var.equals "${LAST_REPORTED_HEALTH_STATE-}" "HEALTHY"; then
printf "HEALTHY" > /var/run/s6/container_environment/LAST_REPORTED_HEALTH_STATE
# Log it only once
bashio::log.info "Add-on is healthy"
fi
fi
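
Docker still records each probe's exit code and timestamps, even though the
script's own output is redirected to the add-on log, so the health history can
be read back from the host. A sketch, with the same hypothetical container name
as above:

```bash
# Show the most recent healthcheck probe Docker recorded for the container
docker inspect --format '{{json .State.Health.Log}}' addon_core_tailscale | jq '.[-1]'
```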