
Commit

Merge pull request #125 from hnousiainen/htn_pglookout_failover_priorities

pglookout: support explicit failover priorities
rdunklau authored Jan 13, 2025

2 parents 60f65b2 + 56fc154 commit f486efe
Showing 5 changed files with 280 additions and 157 deletions.
8 changes: 8 additions & 0 deletions README.rst
@@ -295,6 +295,14 @@ over_warning_limit_command and to create a warning file.

Shell command to execute in case the node has deemed itself in need of promotion

``failover_priorities`` (default ``{}``)

Define the priority of nodes for promotion when there are multiple candidates
with the same replication position. This makes it possible to ensure that all
pglookout instances elect the same standby for promotion, while still allowing
topologies with e.g. less preferred standbys in secondary network locations.
By default, pglookout uses the remote connection ids for the same selection
purpose.
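
For example, to prefer a standby in a primary location over one in a
secondary location when their replication positions are tied (a hypothetical
sketch; instance names and priority values are placeholders)::

    {
        "failover_priorities": {
            "standby-dc1": 10,
            "standby-dc2": 0
        }
    }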

``known_gone_nodes`` (default ``[]``)

Lists nodes that are explicitly known to have left the cluster. If the old
33 changes: 25 additions & 8 deletions pglookout/pglookout.py
@@ -7,6 +7,7 @@
This file is under the Apache License, Version 2.0.
See the file `LICENSE` for details.
"""

from . import logutil, statsd, version
from .cluster_monitor import ClusterMonitor
from .common import convert_xlog_location_to_offset, get_iso_timestamp, parse_iso_datetime
@@ -643,19 +644,35 @@ def do_failover_decision(self, standby_nodes):
if not known_replication_positions:
self.log.warning("No known replication positions, canceling failover consideration")
return
# If there are multiple nodes with the same replication positions pick the one with the "highest" name
# to make sure pglookouts running on all standbys make the same decision. The rationale for picking
# the "highest" node is that there's no obvious way for pglookout to decide which of the nodes is
# "best" beyond looking at replication positions, but picking the highest id supports environments
where nodes are assigned identifiers from an incrementing sequence and where we want to
# promote the latest and greatest node. In static environments node identifiers can be priority
# numbers, with the highest number being the one that should be preferred.
furthest_along_instance = max(known_replication_positions[max(known_replication_positions)])

# Find the instance that is furthest along.
# If there are multiple nodes with the same replication positions, try to identify one to promote either
# via explicit failover priority configuration or pick the one with the "highest" name by sort order.
# The rationale of this logic is to ensure all participating pglookouts running on all standbys make
# the same decision. The "highest" name works well in environments where nodes are assigned identifiers
# from an incrementing sequence and where we want to promote the latest and greatest node.

# First, find the list of instances that share the most recent replication position
furthest_along_instances = known_replication_positions[max(known_replication_positions)]
# Second, sort them by "instance name"
furthest_along_instances = sorted(furthest_along_instances, reverse=True)
# Third, if we have explicit failover priorities, use those to select the instance to promote
if "failover_priorities" in self.config:
highest_priority = max(
self.config["failover_priorities"].get(instance, 0) for instance in furthest_along_instances
)
furthest_along_instances = [
instance
for instance in furthest_along_instances
if self.config["failover_priorities"].get(instance) == highest_priority
]
furthest_along_instance = furthest_along_instances[0]
self.log.warning(
"Node that is furthest along is: %r, all replication positions were: %r",
furthest_along_instance,
sorted(known_replication_positions),
)

total_observers = len(self.connected_observer_nodes) + len(self.disconnected_observer_nodes)
# +1 in the calculation comes from the master node
total_amount_of_nodes = len(standby_nodes) + 1 - len(self.never_promote_these_nodes) + total_observers
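
To make the tie-break concrete, the following standalone sketch mirrors the
selection logic merged above (instance names and priority values are
hypothetical; the priorities dict stands in for the "failover_priorities"
configuration key):

    # Instances sharing the most recent replication position.
    tied = ["standby-dc3", "standby-dc2", "standby-dc1"]

    # Sort by name, highest first, so every pglookout arrives at the same order.
    tied = sorted(tied, reverse=True)

    # Keep only the instances with the highest configured priority (default 0).
    priorities = {"standby-dc1": 10, "standby-dc2": 10, "standby-dc3": 0}
    highest = max(priorities.get(instance, 0) for instance in tied)
    tied = [instance for instance in tied if priorities.get(instance, 0) == highest]

    # The first remaining instance wins the tie-break.
    print(tied[0])  # "standby-dc2": priority 10, and sorts above "standby-dc1"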
1 change: 1 addition & 0 deletions pyproject.toml
@@ -47,6 +47,7 @@ exclude = [
'test/test_lookout.py',
'test/test_pgutil.py',
'test/test_webserver.py',
'test/utils.py',
# Other.
'setup.py',
'version.py',
309 changes: 160 additions & 149 deletions test/test_lookout.py

Large diffs are not rendered by default.

86 changes: 86 additions & 0 deletions test/utils.py
@@ -0,0 +1,86 @@
"""
Utilities for pglookout tests
Copyright (c) 2015 Ohmu Ltd
Copyright (c) 2014 F-Secure
This file is under the Apache License, Version 2.0.
See the file `LICENSE` for details.
"""

from pglookout.common import get_iso_timestamp


def add_to_observer_state(
pgl,
observer_name,
db_name,
pg_last_xlog_receive_location=None,
pg_is_in_recovery=True,
connection=True,
replication_time_lag=None,
fetch_time=None,
db_time=None,
):
    """Record observer_name's view of db_name in pgl.observer_state."""
db_node_state = create_db_node_state(
pg_last_xlog_receive_location,
pg_is_in_recovery,
connection,
replication_time_lag,
fetch_time=fetch_time,
db_time=db_time,
)
update_dict = {
"fetch_time": get_iso_timestamp(), # type: ignore[no-untyped-call]
"connection": True,
db_name: db_node_state,
}
if observer_name in pgl.observer_state:
pgl.observer_state[observer_name].update(update_dict)
else:
pgl.observer_state[observer_name] = update_dict


def create_db_node_state(
pg_last_xlog_receive_location=None,
pg_is_in_recovery=True,
connection=True,
replication_time_lag=None,
fetch_time=None,
db_time=None,
):
    """Build a db node state dict mimicking what pglookout's cluster monitor collects."""
return {
"connection": connection,
"db_time": get_iso_timestamp(db_time),
"fetch_time": get_iso_timestamp(fetch_time),
"pg_is_in_recovery": pg_is_in_recovery,
"pg_last_xact_replay_timestamp": None,
"pg_last_xlog_receive_location": pg_last_xlog_receive_location,
"pg_last_xlog_replay_location": None,
"replication_time_lag": replication_time_lag,
"min_replication_time_lag": 0, # simulate that we've been in sync once
}


def set_instance_cluster_state(
pgl,
*,
instance,
pg_last_xlog_receive_location=None,
pg_is_in_recovery=True,
connection=True,
replication_time_lag=None,
fetch_time=None,
db_time=None,
conn_info=None,
):
    """Set this pglookout's own view of instance and register its connection info."""
db_node_state = create_db_node_state(
pg_last_xlog_receive_location,
pg_is_in_recovery,
connection,
replication_time_lag,
fetch_time=fetch_time,
db_time=db_time,
)
pgl.cluster_state[instance] = db_node_state
pgl.config["remote_conns"][instance] = conn_info or {"host": instance}
