From 0d7b7ba89b0664328127b04c5fd19b288842bae2 Mon Sep 17 00:00:00 2001 From: Anthony Dodd Date: Wed, 2 Mar 2022 11:38:39 -0600 Subject: [PATCH 1/4] Add new static_primary config option In essence, this configuration option will ensure that a Patroni cluster running with a static primary will not demote the master unnecessarily. Transient failures to update the leader lock in the DCS will not cause a demotion when running with a static primary. When running as leader under normal circumstances, DCS exceptions will not cause a demotion when running with `static_primary=thisNode`. Even if replicas are added to the Patroni cluster, Patroni will be able to protect itself from entering into unsafe states by checking the value of static_primary. If the configured static_primary is not the host node, then the replica will refuse to progress to postmaster boot. --- .github/workflows/tests.yaml | 2 ++ docs/ENVIRONMENT.rst | 1 + docs/SETTINGS.rst | 5 +++-- docs/releases.rst | 11 ++++++++++- patroni/config.py | 9 +++++++-- patroni/ha.py | 35 ++++++++++++++++++++++++++++++++--- patroni/validator.py | 1 + patroni/version.py | 2 +- tests/test_config.py | 1 + 9 files changed, 58 insertions(+), 9 deletions(-) diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 534cd64ba..35959ab98 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -15,6 +15,8 @@ jobs: fail-fast: false matrix: os: [ubuntu, windows, macos] + env: + PYTHONWARNINGS: ignore steps: - uses: actions/checkout@v1 diff --git a/docs/ENVIRONMENT.rst b/docs/ENVIRONMENT.rst index dd78d35e4..89f407537 100644 --- a/docs/ENVIRONMENT.rst +++ b/docs/ENVIRONMENT.rst @@ -11,6 +11,7 @@ Global/Universal - **PATRONI\_NAME**: name of the node where the current instance of Patroni is running. Must be unique for the cluster. - **PATRONI\_NAMESPACE**: path within the configuration store where Patroni will keep information about the cluster. 
Default value: "/service" - **PATRONI\_SCOPE**: cluster name +- **PATRONI\_STATIC\_PRIMARY**: enables a few optimizations to ensure that a cluster configured with a static primary will not unnecessarily demote the cluster primary. This is useful for cases where a cluster is running as a single-node cluster. When this value is configured in the DCS, replicas will refuse to boot until the config value is removed. Log --- diff --git a/docs/SETTINGS.rst b/docs/SETTINGS.rst index 7433d4388..fd35ca9a3 100644 --- a/docs/SETTINGS.rst +++ b/docs/SETTINGS.rst @@ -18,7 +18,8 @@ Dynamic configuration is stored in the DCS (Distributed Configuration Store) and - **maximum\_lag\_on\_syncnode**: the maximum bytes a synchronous follower may lag before it is considered as an unhealthy candidate and swapped by healthy asynchronous follower. Patroni utilize the max replica lsn if there is more than one follower, otherwise it will use leader's current wal lsn. Default is -1, Patroni will not take action to swap synchronous unhealthy follower when the value is set to 0 or below. Please set the value high enough so Patroni won't swap synchrounous follower fequently during high transaction volume. - **max\_timelines\_history**: maximum number of timeline history items kept in DCS. Default value: 0. When set to 0, it keeps the full history in DCS. - **master\_start\_timeout**: the amount of time a master is allowed to recover from failures before failover is triggered (in seconds). Default is 300 seconds. When set to 0 failover is done immediately after a crash is detected if possible. When using asynchronous replication a failover can cause lost transactions. Worst case failover time for master failure is: loop\_wait + master\_start\_timeout + loop\_wait, unless master\_start\_timeout is zero, in which case it's just loop\_wait. Set the value according to your durability/availability tradeoff. 
-- **master\_stop\_timeout**: The number of seconds Patroni is allowed to wait when stopping Postgres and effective only when synchronous_mode is enabled. When set to > 0 and the synchronous_mode is enabled, Patroni sends SIGKILL to the postmaster if the stop operation is running for more than the value set by master_stop_timeout. Set the value according to your durability/availability tradeoff. If the parameter is not set or set <= 0, master_stop_timeout does not apply. +- **master\_stop\_timeout**: the number of seconds Patroni is allowed to wait when stopping Postgres and effective only when synchronous_mode is enabled. When set to > 0 and the synchronous_mode is enabled, Patroni sends SIGKILL to the postmaster if the stop operation is running for more than the value set by master_stop_timeout. Set the value according to your durability/availability tradeoff. If the parameter is not set or set <= 0, master_stop_timeout does not apply. +- **static\_primary**: enables a few optimizations to ensure that a cluster configured with a static primary will not unnecessarily demote the cluster primary. This is useful for cases where a cluster is running as a single-node cluster. When this value is configured in the DCS, replicas will refuse to boot until the config value is removed. - **synchronous\_mode**: turns on synchronous replication mode. In this mode a replica will be chosen as synchronous and only the latest leader and synchronous replica are able to participate in leader election. Synchronous mode makes sure that successfully committed transactions will not be lost at failover, at the cost of losing availability for writes when Patroni cannot ensure transaction durability. See :ref:`replication modes documentation <replication_modes>` for details. - **synchronous\_mode\_strict**: prevents disabling synchronous replication if no synchronous replicas are available, blocking all client writes to the master. See :ref:`replication modes documentation <replication_modes>` for details. 
- **postgresql**: @@ -182,7 +183,7 @@ ZooKeeper - **key**: (optional) File with the client key. - **key_password**: (optional) The client key password. - **verify**: (optional) Whether to verify certificate or not. Defaults to ``true``. -- **set_acls**: (optional) If set, configure Kazoo to apply a default ACL to each ZNode that it creates. ACLs will assume 'x509' schema and should be specified as a dictionary with the principal as the key and one or more permissions as a list in the value. Permissions may be one of ``CREATE``, ``READ``, ``WRITE``, ``DELETE`` or ``ADMIN``. For example, ``set_acls: {CN=principal1: [CREATE, READ], CN=principal2: [ALL]}``. +- **set_acls**: (optional) If set, configure Kazoo to apply a default ACL to each ZNode that it creates. ACLs will assume 'x509' schema and should be specified as a dictionary with the principal as the key and one or more permissions as a list in the value. Permissions may be one of ``CREATE``, ``READ``, ``WRITE``, ``DELETE`` or ``ADMIN``. For example, ``set_acls: {CN=principal1: [CREATE, READ], CN=principal2: [ALL]}``. .. note:: It is required to install ``kazoo>=2.6.0`` to support SSL. diff --git a/docs/releases.rst b/docs/releases.rst index fce7805ca..06d1f1890 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -3,6 +3,15 @@ Release notes ============= +Version 2.2.0 +------------- + +**New features** + +- Added support for ``static_primary`` configuration (Anthony Dodd) + + This can be configured using the ``static_primary=<name>`` config value, which enables a few optimizations to ensure that a cluster configured with a static primary will not unnecessarily demote the cluster primary. This is useful for cases where a cluster is running as a single-node cluster. When this value is configured in the DCS, replicas will refuse to boot until the config value is removed. 
+ Version 2.1.3 ------------- @@ -1036,7 +1045,7 @@ Version 1.6.1 - Kill all children along with the callback process before starting the new one (Alexander Kukushkin) - Not doing so makes it hard to implement callbacks in bash and eventually can lead to the situation when two callbacks are running at the same time. + Not doing so makes it hard to implement callbacks in bash and eventually can lead to the situation when two callbacks are running at the same time. - Fix 'start failed' issue (Alexander Kukushkin) diff --git a/patroni/config.py b/patroni/config.py index 8633d4475..453e609ce 100644 --- a/patroni/config.py +++ b/patroni/config.py @@ -65,6 +65,7 @@ class Config(object): 'check_timeline': False, 'master_start_timeout': 300, 'master_stop_timeout': 0, + 'static_primary': None, 'synchronous_mode': False, 'synchronous_mode_strict': False, 'synchronous_node_count': 1, @@ -234,7 +235,7 @@ def _safe_copy_dynamic_configuration(self, dynamic_configuration): if name in self.__DEFAULT_CONFIG['standby_cluster']: config['standby_cluster'][name] = deepcopy(value) elif name in config: # only variables present in __DEFAULT_CONFIG allowed to be overridden from DCS - if name in ('synchronous_mode', 'synchronous_mode_strict'): + if name in ('synchronous_mode', 'synchronous_mode_strict', 'static_primary'): config[name] = value else: config[name] = int(value) @@ -247,7 +248,7 @@ def _build_environment_configuration(): def _popenv(name): return os.environ.pop(PATRONI_ENV_PREFIX + name.upper(), None) - for param in ('name', 'namespace', 'scope'): + for param in ('name', 'namespace', 'scope', 'static_primary'): value = _popenv(param) if value: ret[param] = value @@ -428,6 +429,10 @@ def _build_effective_configuration(self, dynamic_configuration, local_configurat if 'name' not in config and 'name' in pg_config: config['name'] = pg_config['name'] + # if 'static_primary' not in config and 'static_primary' in local_configuration + if 'static_primary' in local_configuration: + 
config['static_primary'] = local_configuration['static_primary'] + updated_fields = ( 'name', 'scope', diff --git a/patroni/ha.py b/patroni/ha.py index 9e0f9b84e..a9fd4a527 100644 --- a/patroni/ha.py +++ b/patroni/ha.py @@ -128,6 +128,19 @@ def is_leader(self): with self._is_leader_lock: return self._is_leader > time.time() + def is_static_primary(self): + """Check if this node is configured as the static primary of the cluster.""" + static_primary = self.patroni.config.get('static_primary') + name = self.patroni.config.get('name') + if static_primary is None or name is None: + return False + return static_primary == name + + def is_static_primary_configured(self): + """Check if the Patroni cluster has been configured with a static primary.""" + static_primary = self.patroni.config.get('static_primary') + return static_primary is not None + def set_is_leader(self, value): with self._is_leader_lock: self._is_leader = time.time() + self.dcs.ttl if value else 0 @@ -689,7 +702,9 @@ def _is_healthiest_node(self, members, check_replication_lag=True): def is_failover_possible(self, members, check_synchronous=True, cluster_lsn=None): ret = False cluster_timeline = self.cluster.timeline - members = [m for m in members if m.name != self.state_handler.name and not m.nofailover and m.api_url] + is_static_primary = self.is_static_primary() + members = [m for m in members if m.name != self.state_handler.name + and not m.nofailover and m.api_url and not is_static_primary] if check_synchronous and self.is_synchronous_mode(): members = [m for m in members if self.cluster.sync.matches(m.name)] if members: @@ -966,7 +981,6 @@ def process_manual_failover_from_leader(self): def process_unhealthy_cluster(self): """Cluster has no leader key""" - if self.is_healthiest_node(): if self.acquire_lock(): failover = self.cluster.failover @@ -991,6 +1005,9 @@ def process_unhealthy_cluster(self): 'promoted self to leader by acquiring session lock' ) else: + if self.is_static_primary(): + return 
'no action as cluster is in static single node config mode' + return self.follow('demoted self after trying and failing to obtain lock', 'following new leader after trying and failing to obtain lock') else: @@ -1003,6 +1020,8 @@ def process_unhealthy_cluster(self): if self.patroni.nofailover: return self.follow('demoting self because I am not allowed to become master', 'following a different leader because I am not allowed to promote') + if self.is_static_primary(): + return 'no action as cluster is in static single node config mode' return self.follow('demoting self because i am not the healthiest node', 'following a different leader because i am not the healthiest node') @@ -1043,6 +1062,9 @@ def process_healthy_cluster(self): if self.state_handler.is_leader(): if self.is_paused(): return 'continue to run as master after failing to update leader lock in DCS' + if self.is_static_primary(): + return 'continue to run as master after failing to update leader lock in DCS \ + due to static_primary config' self.demote('immediate-nolock') return 'demoted self because failed to update leader lock in DCS' else: @@ -1346,6 +1368,12 @@ def _run_cycle(self): self.state_handler.reset_cluster_info_state(None, self.patroni.nofailover) raise + # If the cluster has been configured with a static primary, + # and we are not that primary, then do not proceed. 
+ if self.is_static_primary_configured() and not self.is_static_primary(): + return 'patroni cluster is configured with a static primary, \ + and this node is not the primary, refusing to start' + if self.is_paused(): self.watchdog.disable() self._was_paused = True @@ -1487,7 +1515,8 @@ def _run_cycle(self): except DCSError: dcs_failed = True logger.error('Error communicating with DCS') - if not self.is_paused() and self.state_handler.is_running() and self.state_handler.is_leader(): + if not self.is_paused() and self.state_handler.is_running() \ + and self.state_handler.is_leader() and not self.is_static_primary(): self.demote('offline') return 'demoted self because DCS is not accessible and i was a leader' return 'DCS is not accessible' diff --git a/patroni/validator.py b/patroni/validator.py index 6d2a060c6..796e36b8d 100644 --- a/patroni/validator.py +++ b/patroni/validator.py @@ -313,6 +313,7 @@ def assert_(condition, message="Wrong value"): schema = Schema({ "name": str, + Optional("static_primary"): str, "scope": str, "restapi": { "listen": validate_host_port_listen, diff --git a/patroni/version.py b/patroni/version.py index 2d31b1c32..04188a16d 100644 --- a/patroni/version.py +++ b/patroni/version.py @@ -1 +1 @@ -__version__ = '2.1.3' +__version__ = '2.2.0' diff --git a/tests/test_config.py b/tests/test_config.py index dddae1749..973702d5d 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -31,6 +31,7 @@ def test_reload_local_configuration(self): 'PATRONI_LOGLEVEL': 'ERROR', 'PATRONI_LOG_LOGGERS': 'patroni.postmaster: WARNING, urllib3: DEBUG', 'PATRONI_LOG_FILE_NUM': '5', + 'PATRONI_STATIC_PRIMARY': 'postgres0', 'PATRONI_RESTAPI_USERNAME': 'username', 'PATRONI_RESTAPI_PASSWORD': 'password', 'PATRONI_RESTAPI_LISTEN': '0.0.0.0:8008', From 07c7ff855dfbf9cee65ca9e54de4c29bb3819720 Mon Sep 17 00:00:00 2001 From: Anthony Dodd Date: Wed, 27 Apr 2022 12:59:59 -0500 Subject: [PATCH 2/4] Add behavioral tests for static primary feature --- 
docs/ENVIRONMENT.rst | 1 - docs/SETTINGS.rst | 2 +- docs/releases.rst | 2 +- features/environment.py | 7 +++++++ features/static_primary.feature | 20 ++++++++++++++++++++ features/steps/static_primary.py | 25 +++++++++++++++++++++++++ patroni/config.py | 6 +----- patroni/validator.py | 6 +++--- tests/test_config.py | 10 ++++++++-- 9 files changed, 66 insertions(+), 13 deletions(-) create mode 100644 features/static_primary.feature create mode 100644 features/steps/static_primary.py diff --git a/docs/ENVIRONMENT.rst b/docs/ENVIRONMENT.rst index 89f407537..dd78d35e4 100644 --- a/docs/ENVIRONMENT.rst +++ b/docs/ENVIRONMENT.rst @@ -11,7 +11,6 @@ Global/Universal - **PATRONI\_NAME**: name of the node where the current instance of Patroni is running. Must be unique for the cluster. - **PATRONI\_NAMESPACE**: path within the configuration store where Patroni will keep information about the cluster. Default value: "/service" - **PATRONI\_SCOPE**: cluster name -- **PATRONI\_STATIC\_PRIMARY**: enables a few optimizations to ensure that a cluster configured with a static primary will not unnecessarily demote the cluster primary. This is useful for cases where a cluster is running as a single-node cluster. When this value is configured in the DCS, replicas will refuse to boot until the config value is removed. Log --- diff --git a/docs/SETTINGS.rst b/docs/SETTINGS.rst index fd35ca9a3..f7894949c 100644 --- a/docs/SETTINGS.rst +++ b/docs/SETTINGS.rst @@ -19,7 +19,7 @@ Dynamic configuration is stored in the DCS (Distributed Configuration Store) and - **max\_timelines\_history**: maximum number of timeline history items kept in DCS. Default value: 0. When set to 0, it keeps the full history in DCS. - **master\_start\_timeout**: the amount of time a master is allowed to recover from failures before failover is triggered (in seconds). Default is 300 seconds. When set to 0 failover is done immediately after a crash is detected if possible. 
When using asynchronous replication a failover can cause lost transactions. Worst case failover time for master failure is: loop\_wait + master\_start\_timeout + loop\_wait, unless master\_start\_timeout is zero, in which case it's just loop\_wait. Set the value according to your durability/availability tradeoff. - **master\_stop\_timeout**: the number of seconds Patroni is allowed to wait when stopping Postgres and effective only when synchronous_mode is enabled. When set to > 0 and the synchronous_mode is enabled, Patroni sends SIGKILL to the postmaster if the stop operation is running for more than the value set by master_stop_timeout. Set the value according to your durability/availability tradeoff. If the parameter is not set or set <= 0, master_stop_timeout does not apply. -- **static\_primary**: enables a few optimizations to ensure that a cluster configured with a static primary will not unnecessarily demote the cluster primary. This is useful for cases where a cluster is running as a single-node cluster. When this value is configured in the DCS, replicas will refuse to boot until the config value is removed. +- **static\_primary**: enables a few optimizations to ensure that a cluster configured with a static primary will not unnecessarily demote the cluster primary. This is useful for cases where a cluster is running as a single-node cluster. When this value is set, replicas will refuse to boot until the config value is removed from DCS config. - **synchronous\_mode**: turns on synchronous replication mode. In this mode a replica will be chosen as synchronous and only the latest leader and synchronous replica are able to participate in leader election. Synchronous mode makes sure that successfully committed transactions will not be lost at failover, at the cost of losing availability for writes when Patroni cannot ensure transaction durability. See :ref:`replication modes documentation <replication_modes>` for details. 
- **synchronous\_mode\_strict**: prevents disabling synchronous replication if no synchronous replicas are available, blocking all client writes to the master. See :ref:`replication modes documentation <replication_modes>` for details. - **postgresql**: diff --git a/docs/releases.rst b/docs/releases.rst index 06d1f1890..970699ea0 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -10,7 +10,7 @@ Version 2.2.0 - Added support for ``static_primary`` configuration (Anthony Dodd) - This can be configured using the ``static_primary=<name>`` config value, which enables a few optimizations to ensure that a cluster configured with a static primary will not unnecessarily demote the cluster primary. This is useful for cases where a cluster is running as a single-node cluster. When this value is configured in the DCS, replicas will refuse to boot until the config value is removed. + This can be configured using the ``static_primary=<name>`` config value, which enables a few optimizations to ensure that a cluster configured with a static primary will not unnecessarily demote the cluster primary. This is useful for cases where a cluster is running as a single-node cluster. When this value is set, replicas will refuse to boot until the config value is removed from DCS config. 
Version 2.1.3 ------------- diff --git a/features/environment.py b/features/environment.py index 955e2a39d..f84026037 100644 --- a/features/environment.py +++ b/features/environment.py @@ -666,6 +666,13 @@ def start(self, name, max_wait_limit=40, custom_config=None): self._output_dir, custom_config) self._processes[name].start(max_wait_limit) + def start_with_expected_failure(self, name, max_wait_limit=40, custom_config=None): + try: + self.start(name, max_wait_limit, custom_config) + assert False, 'expected startup to fail' + except: + pass + def __getattr__(self, func): if func not in ['stop', 'query', 'write_label', 'read_label', 'check_role_has_changed_to', 'add_tag_to_config', 'get_watchdog', 'patroni_hang', 'backup']: diff --git a/features/static_primary.feature b/features/static_primary.feature new file mode 100644 index 000000000..9dc02971b --- /dev/null +++ b/features/static_primary.feature @@ -0,0 +1,20 @@ +Feature: static primary + We should check that static primary behavior is safe + + Scenario: check static primary config in dcs blocks replica from starting + Given I start postgres0 as static primary + Then postgres0 is a leader after 10 seconds + And there is a non empty initialize key in DCS after 15 seconds + When I issue a PATCH request to http://127.0.0.1:8008/config with {"ttl": 20, "loop_wait": 2, "synchronous_mode": true} + Then I receive a response code 200 + When I start postgres1 with a configured static primary will not boot after 20 seconds + And I start postgres2 with a configured static primary will not boot after 20 seconds + And "sync" key not in DCS after waiting 20 seconds + And "members/postgres1" key not in DCS after waiting 10 seconds + And "members/postgres2" key not in DCS after waiting 10 seconds + + Scenario: check removing static primary config from dcs allows replica startup + Given I issue a PATCH request to http://127.0.0.1:8008/config with {"static_primary": null} + Then "sync" key in DCS has leader=postgres0 after 20 
seconds + And "members/postgres1" key in DCS has state=running after 10 seconds + And "members/postgres2" key in DCS has state=running after 10 seconds diff --git a/features/steps/static_primary.py b/features/steps/static_primary.py new file mode 100644 index 000000000..b2aab9e65 --- /dev/null +++ b/features/steps/static_primary.py @@ -0,0 +1,25 @@ +import json +import patroni.psycopg as pg + +from behave import step, then +from time import sleep, time + + +@step('I start {name:w} as static primary') +def start_patroni_with_static_primary(context, name): + return context.pctl.start(name, custom_config={'bootstrap': {'dcs': {'static_primary': name}}}) + + +@step('I start {name:w} with a configured static primary will not boot after {time_limit:d} seconds') +def start_patroni_as_replica_with_static_primary(context, name, time_limit): + return context.pctl.start_with_expected_failure(name, max_wait_limit=time_limit) + + +@step('"{name}" key not in DCS after waiting {time_limit:d} seconds') +def check_member_not_present(context, name, time_limit): + sleep(time_limit) + try: + json.loads(context.dcs_ctl.query(name)) + assert False, "found value under DCS key {} after {} seconds".format(name, time_limit) + except Exception: + return diff --git a/patroni/config.py b/patroni/config.py index 453e609ce..1aad082bf 100644 --- a/patroni/config.py +++ b/patroni/config.py @@ -248,7 +248,7 @@ def _build_environment_configuration(): def _popenv(name): return os.environ.pop(PATRONI_ENV_PREFIX + name.upper(), None) - for param in ('name', 'namespace', 'scope', 'static_primary'): + for param in ('name', 'namespace', 'scope'): value = _popenv(param) if value: ret[param] = value @@ -429,10 +429,6 @@ def _build_effective_configuration(self, dynamic_configuration, local_configurat if 'name' not in config and 'name' in pg_config: config['name'] = pg_config['name'] - # if 'static_primary' not in config and 'static_primary' in local_configuration - if 'static_primary' in local_configuration: 
- config['static_primary'] = local_configuration['static_primary'] - updated_fields = ( 'name', 'scope', diff --git a/patroni/validator.py b/patroni/validator.py index 796e36b8d..ab36515af 100644 --- a/patroni/validator.py +++ b/patroni/validator.py @@ -313,7 +313,6 @@ def assert_(condition, message="Wrong value"): schema = Schema({ "name": str, - Optional("static_primary"): str, "scope": str, "restapi": { "listen": validate_host_port_listen, @@ -324,8 +323,9 @@ def assert_(condition, message="Wrong value"): Optional("ttl"): int, Optional("loop_wait"): int, Optional("retry_timeout"): int, - Optional("maximum_lag_on_failover"): int - }, + Optional("maximum_lag_on_failover"): int, + Optional("static_primary"): str + }, "pg_hba": [str], "initdb": [Or(str, dict)] }, diff --git a/tests/test_config.py b/tests/test_config.py index 973702d5d..df318f95c 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -31,7 +31,6 @@ def test_reload_local_configuration(self): 'PATRONI_LOGLEVEL': 'ERROR', 'PATRONI_LOG_LOGGERS': 'patroni.postmaster: WARNING, urllib3: DEBUG', 'PATRONI_LOG_FILE_NUM': '5', - 'PATRONI_STATIC_PRIMARY': 'postgres0', 'PATRONI_RESTAPI_USERNAME': 'username', 'PATRONI_RESTAPI_PASSWORD': 'password', 'PATRONI_RESTAPI_LISTEN': '0.0.0.0:8008', @@ -82,7 +81,7 @@ def test_reload_local_configuration(self): @patch('shutil.move', Mock(return_value=None)) @patch('json.dump', Mock()) def test_save_cache(self): - self.config.set_dynamic_configuration({'ttl': 30, 'postgresql': {'foo': 'bar'}}) + self.config.set_dynamic_configuration({'ttl': 30, 'static_primary': 'baz', 'postgresql': {'foo': 'bar'}}) with patch('os.fdopen', Mock(side_effect=IOError)): self.config.save_cache() with patch('os.fdopen', MagicMock()): @@ -100,6 +99,13 @@ def test_standby_cluster_parameters(self): for name, value in dynamic_configuration['standby_cluster'].items(): self.assertEqual(self.config['standby_cluster'][name], value) + def test_static_primary_parameter(self): + dynamic_configuration 
= { + 'static_primary': 'foobar' + } + self.config.set_dynamic_configuration(dynamic_configuration) + self.assertEqual(self.config['static_primary'], 'foobar') + @patch('os.path.exists', Mock(return_value=True)) @patch('os.path.isfile', Mock(side_effect=lambda fname: fname != 'postgres0')) @patch('os.path.isdir', Mock(return_value=True)) From bc9c1dfcc12cf684d78a6c8f860a1378caf65632 Mon Sep 17 00:00:00 2001 From: Anthony Dodd Date: Mon, 16 May 2022 12:51:16 -0500 Subject: [PATCH 3/4] Shutdown replica if cluster dynamically configured w/ static primary --- patroni/ha.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/patroni/ha.py b/patroni/ha.py index a9fd4a527..143a93f23 100644 --- a/patroni/ha.py +++ b/patroni/ha.py @@ -1371,8 +1371,10 @@ def _run_cycle(self): # If the cluster has been configured with a static primary, # and we are not that primary, then do not proceed. if self.is_static_primary_configured() and not self.is_static_primary(): + self.shutdown() return 'patroni cluster is configured with a static primary, \ - and this node is not the primary, refusing to start' + and this node is not the primary, shutting down and \ + refusing to start' if self.is_paused(): self.watchdog.disable() From 89c6013d34a3775ceb1557574a10c1c156e4b9f4 Mon Sep 17 00:00:00 2001 From: Anthony Dodd Date: Wed, 18 May 2022 14:48:59 -0500 Subject: [PATCH 4/4] Formatting / style updates per review (will squash) --- patroni/ha.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/patroni/ha.py b/patroni/ha.py index 143a93f23..27189def9 100644 --- a/patroni/ha.py +++ b/patroni/ha.py @@ -702,9 +702,10 @@ def _is_healthiest_node(self, members, check_replication_lag=True): def is_failover_possible(self, members, check_synchronous=True, cluster_lsn=None): ret = False cluster_timeline = self.cluster.timeline - is_static_primary = self.is_static_primary() - members = [m for m in members if m.name != self.state_handler.name - and not m.nofailover 
and m.api_url and not is_static_primary] + if self.is_static_primary(): + logger.warning('manual failover: not possible when instance is static primary') + return ret + members = [m for m in members if m.name != self.state_handler.name and not m.nofailover and m.api_url] if check_synchronous and self.is_synchronous_mode(): members = [m for m in members if self.cluster.sync.matches(m.name)] if members: