hardwire healthcheck timeouts

hassio-addons · Dec 6, 2024 · 593ddd2 · 593ddd2
1 parent 5a3ea4d
commit 593ddd2
Show file tree

Hide file tree

Showing 4 changed files with 12 additions and 78 deletions.
diff --git a/tailscale/DOCS.md b/tailscale/DOCS.md
@@ -70,8 +70,6 @@ advertise_routes:
   - 192.168.1.0/24
   - fd12:3456:abcd::/64
 funnel: false
-healthcheck_offline_timeout: 110
-healthcheck_restart_timeout: 3600
 log_level: info
 login_server: "https://controlplane.tailscale.com"
 proxy: false
@@ -191,54 +189,6 @@ port 443 (or the port configured in option `proxy_and_funnel_port`)._
 **Note:** _If you encounter strange browser behaviour or strange error messages,
 try to clear all site related cookies, clear all browser cache, restart browser._
 
-### Option: `healthcheck_offline_timeout`
-
-This option allows you to set timeout in seconds for Tailscale to be offline
-after it was once online.
-
-When not set, this option is disabled by default.
-
-Tailscale is quite resilient and can recover from nearly any network change. But
-in case it fails to recover and remains offline longer than
-healthcheck_offline_timeout seconds, the add-on can be restarted. The check
-happens only after Tailscale was once online, ie. it won't have any effect when
-Tailscale's status is eg. Starting, NeedsLogin or NeedsMachineAuth.
-
-**Note:** _The Stopped status is deemed unhealthy by default._
-
-**Note:** _If the network is down, and this option is set, the add-on will be
-restarted only once. To restart even after an unsuccessful (re)start, see option
-healthcheck_restart_timeout._
-
-**Note:** _The add-on's health is checked by Home Assistant in each 30s, ie. the
-effective resolution of this option is 30s, not 1s._
-
-### Option: `healthcheck_restart_timeout`
-
-This option allows you to set timeout in seconds for Tailscale to be offline
-after (re)start.
-
-The minimum value is 900, ie. 15 minutes, to always give you enough time to
-authenticate when you start the add-on for the very first time.
-
-When not set, this option is disabled by default.
-
-Tailscale is quite resilient and can recover from nearly any network change. But
-in case it fails to recover even after a (re)start and remains offline longer
-than healthcheck_restart_timeout seconds, the add-on can be restarted again. The
-check happens only when Tailscale is starting, ie. it won't have any effect
-after Tailscale is started and got online successfully.
-
-**Note:** _The Stopped status is deemed unhealthy by default._
-
-**Note:** _If the network is down after a (re)start, and this option is set, the
-add-on will be restarted regularly, so do not set this option too low, set it to
-several minutes or even hours. To restart when the network got down after it was
-up, see option healthcheck_offline_timeout._
-
-**Note:** _The add-on's health is checked by Home Assistant in each 30s, ie. the
-effective resolution of this option is 30s, not 1s._
-
 ### Option: `log_level`
 
 Optionally enable tailscaled debug messages in the add-on's log. Turn it on only

diff --git a/tailscale/config.yaml b/tailscale/config.yaml
@@ -35,8 +35,6 @@ schema:
   advertise_routes:
     - "match(^(((25[0-5]|(2[0-4]|1\\d|[1-9]?)\\d)\\.){3}(25[0-5]|(2[0-4]|1\\d|[1-9]?)\\d)\\/(3[0-2]|[12]?\\d)|[a-fA-F\\d.:]+:[a-fA-F\\d.:]+\\/(12[0-8]|(1[01]|[1-9]?)\\d))$)?"
   funnel: bool?
-  healthcheck_offline_timeout: int?
-  healthcheck_restart_timeout: int(900,)?
   log_level: list(trace|debug|info|notice|warning|error|fatal)?
   login_server: url?
   proxy: bool?

diff --git a/tailscale/rootfs/usr/bin/healthcheck b/tailscale/rootfs/usr/bin/healthcheck
@@ -4,22 +4,24 @@
 # Plain (non-json) tailscale status returns error when status is not Running or Starting, so eg. NeedsLogin and NeedsMachineAuth would make it unhealthy
 # The .Health json filter returns any problems, so even temporary health problems would make it unhealthy
 # This script treats the following situations unhealthy:
-# - always: .BackendState == "Stopped"
-# - optionally: if once was online but gets offline for more then healthcheck_offline_timeout seconds (configurable)
-# - optionally: if never gets online for more then healthcheck_restart_timeout seconds (configurable)
+# - .BackendState == "Stopped"
+# - if it was once online but then stays offline for more than HEALTHCHECK_OFFLINE_TIMEOUT seconds
+# - if it has never come online and more than HEALTHCHECK_RESTART_TIMEOUT seconds have passed since start
 #   This can handle internal TS bugs, like https://github.com/tailscale/tailscale/issues/12021 where TS fails to recover from rerouting traffic from normal WAN to failover WAN
 
 # Redirect healthchecks' output to the log
 exec &> /proc/1/fd/1
 
+readonly HEALTHCHECK_OFFLINE_TIMEOUT=300   # 5 minutes
+readonly HEALTHCHECK_RESTART_TIMEOUT=3600  # 1 hour
+
 declare status_json
 declare backend_state is_self_online
 # STARTED_TIMESTAMP is in contenv at /var/run/s6/container_environment
 # LAST_ONLINE_TIMESTAMP is in contenv at /var/run/s6/container_environment
 # LAST_REPORTED_HEALTH_STATE is in contenv at /var/run/s6/container_environment
 
-if bashio::config.has_value "healthcheck_restart_timeout" && \
-  ! bashio::var.has_value "${STARTED_TIMESTAMP-}"
+if ! bashio::var.has_value "${STARTED_TIMESTAMP-}"
 then
   STARTED_TIMESTAMP=$(date +"%s")
   printf "${STARTED_TIMESTAMP}" > /var/run/s6/container_environment/STARTED_TIMESTAMP
@@ -29,20 +31,17 @@ status_json=$(/opt/tailscale status --json --self=true --peers=false)
 backend_state=$(jq -r '.BackendState' <<< "${status_json}")
 is_self_online=$(jq -r '.Self.Online' <<< "${status_json}")
 
-if (bashio::config.has_value "healthcheck_offline_timeout" || bashio::config.has_value "healthcheck_restart_timeout") && \
-  bashio::var.equals "${backend_state}" "Running" && bashio::var.true "${is_self_online}"
+if bashio::var.equals "${backend_state}" "Running" && bashio::var.true "${is_self_online}"
 then
   LAST_ONLINE_TIMESTAMP=$(date +"%s")
   printf "${LAST_ONLINE_TIMESTAMP}" > /var/run/s6/container_environment/LAST_ONLINE_TIMESTAMP
 fi
 
 if [[ "${backend_state}" == "Stopped" ]] || \
-  (bashio::config.has_value "healthcheck_offline_timeout" && \
-    bashio::var.has_value "${LAST_ONLINE_TIMESTAMP-}" && \
-    (( $(date +"%s") - ${LAST_ONLINE_TIMESTAMP} > $(bashio::config "healthcheck_offline_timeout") )) ) || \
-  (bashio::config.has_value "healthcheck_restart_timeout" && \
-    ! bashio::var.has_value "${LAST_ONLINE_TIMESTAMP-}" && \
-    (( $(date +"%s") - ${STARTED_TIMESTAMP} > $(bashio::config "healthcheck_restart_timeout") )) )
+  (bashio::var.has_value "${LAST_ONLINE_TIMESTAMP-}" && \
+    (( $(date +"%s") - ${LAST_ONLINE_TIMESTAMP} > ${HEALTHCHECK_OFFLINE_TIMEOUT} )) ) || \
+  (! bashio::var.has_value "${LAST_ONLINE_TIMESTAMP-}" && \
+    (( $(date +"%s") - ${STARTED_TIMESTAMP} > ${HEALTHCHECK_RESTART_TIMEOUT} )) )
 then
   # Unhealthy
   if ! bashio::var.equals "${LAST_REPORTED_HEALTH_STATE-}" "UNHEALTHY"; then

diff --git a/tailscale/translations/en.yaml b/tailscale/translations/en.yaml
@@ -41,19 +41,6 @@ configuration:
       Home Assistant instance on the wider internet using your Tailscale domain.
       This requires Tailscale Proxy to be enabled.
       When not set, this option is disabled by default.
-  healthcheck_offline_timeout:
-    name: Healthcheck offline timeout [s]
-    description: >-
-      This option allows you to set timeout in seconds for Tailscale to be offline
-      after it was once online.
-      When not set, this option is disabled by default.
-  healthcheck_restart_timeout:
-    name: Healthcheck restart timeout [s]
-    description: >-
-      This option allows you to set timeout in seconds for Tailscale to be offline
-      after (re)start. The minimum value is 900, ie. 15 minutes. Set it to several
-      minutes or even hours.
-      When not set, this option is disabled by default.
   log_level:
     name: Log level
     description: >-