Skip to content

Commit

Permalink
[DPE-6484] Add scope to promote to primary (#850)
Browse files Browse the repository at this point in the history
* Promote unit action

* Tweaks for failing REST calls

* VM parity
  • Loading branch information
dragomirp authored Feb 19, 2025
1 parent 842aa47 commit c9f6554
Show file tree
Hide file tree
Showing 9 changed files with 114 additions and 18 deletions.
7 changes: 6 additions & 1 deletion actions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,13 @@ list-backups:
pre-upgrade-check:
description: Run necessary pre-upgrade checks and preparations before executing a charm refresh.
promote-to-primary:
description: Promotes the cluster of choice to a primary cluster. Must be ran against the leader unit.
description: Promotes the cluster of choice to a primary cluster. Must be ran against the leader unit when promoting a cluster
or against the unit to be promoted within the cluster.
params:
scope:
type: string
default: cluster
description: Whether to promote a unit or a cluster. Must be set to either unit or cluster.
force:
type: boolean
description: Force the promotion of a cluster when there is already a primary cluster.
Expand Down
23 changes: 22 additions & 1 deletion src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@
WORKLOAD_OS_GROUP,
WORKLOAD_OS_USER,
)
from patroni import NotReadyError, Patroni, SwitchoverFailedError
from patroni import NotReadyError, Patroni, SwitchoverFailedError, SwitchoverNotSyncError
from relations.async_replication import (
REPLICATION_CONSUMER_RELATION,
REPLICATION_OFFER_RELATION,
Expand Down Expand Up @@ -211,6 +211,7 @@ def __init__(self, *args):
self.framework.observe(self.on.stop, self._on_stop)
self.framework.observe(self.on.get_password_action, self._on_get_password)
self.framework.observe(self.on.set_password_action, self._on_set_password)
self.framework.observe(self.on.promote_to_primary_action, self._on_promote_to_primary)
self.framework.observe(self.on.get_primary_action, self._on_get_primary)
self.framework.observe(self.on.update_status, self._on_update_status)
self._storage_path = self.meta.storages["pgdata"].location
Expand Down Expand Up @@ -1305,6 +1306,26 @@ def _on_set_password(self, event: ActionEvent) -> None:

event.set_results({"password": password})

def _on_promote_to_primary(self, event: ActionEvent) -> None:
if event.params.get("scope") == "cluster":
return self.async_replication.promote_to_primary(event)
elif event.params.get("scope") == "unit":
return self.promote_primary_unit(event)
else:
event.fail("Scope should be either cluster or unit")

def promote_primary_unit(self, event: ActionEvent) -> None:
"""Handles promote to primary for unit scope."""
if event.params.get("force"):
event.fail("Suprerfluous force flag with unit scope")
else:
try:
self._patroni.switchover(self.unit.name, wait=False)
except SwitchoverNotSyncError:
event.fail("Unit is not sync standby")
except SwitchoverFailedError:
event.fail("Switchover failed or timed out, check the logs for details")

def _on_get_primary(self, event: ActionEvent) -> None:
"""Get primary instance."""
try:
Expand Down
16 changes: 15 additions & 1 deletion src/patroni.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,10 @@ class SwitchoverFailedError(Exception):
"""Raised when a switchover failed for some reason."""


class SwitchoverNotSyncError(SwitchoverFailedError):
"""Raised when a switchover failed because node is not sync."""


class UpdateSyncNodeCountError(Exception):
"""Raised when updating synchronous_node_count failed for some reason."""

Expand Down Expand Up @@ -612,7 +616,7 @@ def restart_postgresql(self) -> None:
timeout=PATRONI_TIMEOUT,
)

def switchover(self, candidate: str | None = None) -> None:
def switchover(self, candidate: str | None = None, wait: bool = True) -> None:
"""Trigger a switchover."""
# Try to trigger the switchover.
if candidate is not None:
Expand All @@ -631,8 +635,18 @@ def switchover(self, candidate: str | None = None) -> None:

# Check whether the switchover was unsuccessful.
if r.status_code != 200:
if (
r.status_code == 412
and r.text == "candidate name does not match with sync_standby"
):
logger.debug("Unit is not sync standby")
raise SwitchoverNotSyncError()
logger.warning(f"Switchover call failed with code {r.status_code} {r.text}")
raise SwitchoverFailedError(f"received {r.status_code}")

if not wait:
return

for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3), reraise=True):
with attempt:
new_primary = self.get_primary()
Expand Down
5 changes: 1 addition & 4 deletions src/relations/async_replication.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,6 @@ def __init__(self, charm):
self.framework.observe(
self.charm.on.create_replication_action, self._on_create_replication
)
self.framework.observe(
self.charm.on.promote_to_primary_action, self._on_promote_to_primary
)

self.framework.observe(self.charm.on.secret_changed, self._on_secret_changed)

Expand Down Expand Up @@ -575,7 +572,7 @@ def _on_create_replication(self, event: ActionEvent) -> None:
# Set the status.
self.charm.unit.status = MaintenanceStatus("Creating replication...")

def _on_promote_to_primary(self, event: ActionEvent) -> None:
def promote_to_primary(self, event: ActionEvent) -> None:
"""Promote this cluster to the primary cluster."""
if (
self.charm.app.status.message != READ_ONLY_MODE_BLOCKING_MESSAGE
Expand Down
4 changes: 2 additions & 2 deletions tests/integration/ha_tests/test_async_replication.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ async def test_switchover(
leader_unit = await get_leader_unit(ops_test, DATABASE_APP_NAME, model=second_model)
assert leader_unit is not None, "No leader unit found"
logger.info("promoting the second cluster")
run_action = await leader_unit.run_action("promote-to-primary", **{"force": True})
run_action = await leader_unit.run_action("promote-to-primary", force=True, scope="cluster")
await run_action.wait()
assert (run_action.results.get("return-code", None) == 0) or (
run_action.results.get("Code", None) == "0"
Expand Down Expand Up @@ -295,7 +295,7 @@ async def test_promote_standby(
leader_unit = await get_leader_unit(ops_test, DATABASE_APP_NAME)
assert leader_unit is not None, "No leader unit found"
logger.info("promoting the first cluster")
run_action = await leader_unit.run_action("promote-to-primary")
run_action = await leader_unit.run_action("promote-to-primary", scope="cluster")
await run_action.wait()
assert (run_action.results.get("return-code", None) == 0) or (
run_action.results.get("Code", None) == "0"
Expand Down
16 changes: 16 additions & 0 deletions tests/integration/test_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from pytest_operator.plugin import OpsTest
from tenacity import Retrying, stop_after_delay, wait_fixed

from .ha_tests.helpers import get_cluster_roles
from .helpers import (
CHARM_BASE,
METADATA,
Expand Down Expand Up @@ -253,6 +254,21 @@ async def test_scale_down_and_up(ops_test: OpsTest):
await scale_application(ops_test, APP_NAME, initial_scale)


async def test_switchover_sync_standby(ops_test: OpsTest):
original_roles = await get_cluster_roles(
ops_test, ops_test.model.applications[APP_NAME].units[0].name
)
run_action = await ops_test.model.units[original_roles["sync_standbys"][0]].run_action(
"promote-to-primary", scope="unit"
)
await run_action.wait()
await ops_test.model.wait_for_idle(status="active", timeout=200)
new_roles = await get_cluster_roles(
ops_test, ops_test.model.applications[APP_NAME].units[0].name
)
assert new_roles["primaries"][0] == original_roles["sync_standbys"][0]


async def test_persist_data_through_graceful_restart(ops_test: OpsTest):
"""Test data persists through a graceful restart."""
primary = await get_primary(ops_test)
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/test_async_replication.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,7 @@ def test_promote_to_primary(harness, relation_name):
)
harness.update_relation_data(rel_id, "standby/0", {"unit-address": "10.2.2.10"})

harness.run_action("promote-to-primary")
harness.run_action("promote-to-primary", {"scope": "cluster"})

assert (
harness.get_relation_data(rel_id, harness.charm.app.name).get(
Expand Down
41 changes: 40 additions & 1 deletion tests/unit/test_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@

from charm import EXTENSION_OBJECT_MESSAGE, PostgresqlOperatorCharm
from constants import PEER, SECRET_INTERNAL_LABEL
from patroni import NotReadyError
from patroni import NotReadyError, SwitchoverFailedError, SwitchoverNotSyncError
from tests.unit.helpers import _FakeApiError

POSTGRESQL_CONTAINER = "postgresql"
Expand Down Expand Up @@ -1847,3 +1847,42 @@ def test_get_plugins(harness):
"insert_username",
"moddatetime",
]


def test_on_promote_to_primary(harness):
with (
patch("charm.PostgreSQLAsyncReplication.promote_to_primary") as _promote_to_primary,
patch("charm.Patroni.switchover") as _switchover,
):
event = Mock()
event.params = {"scope": "cluster"}

# Cluster
harness.charm._on_promote_to_primary(event)
_promote_to_primary.assert_called_once_with(event)

# Unit, no force, regular promotion
event.params = {"scope": "unit"}

harness.charm._on_promote_to_primary(event)

_switchover.assert_called_once_with("postgresql-k8s/0", wait=False)

# Unit, no force, switchover failed
event.params = {"scope": "unit"}
_switchover.side_effect = SwitchoverFailedError

harness.charm._on_promote_to_primary(event)

event.fail.assert_called_once_with(
"Switchover failed or timed out, check the logs for details"
)
event.fail.reset_mock()

# Unit, no force, not sync
event.params = {"scope": "unit"}
_switchover.side_effect = SwitchoverNotSyncError

harness.charm._on_promote_to_primary(event)

event.fail.assert_called_once_with("Unit is not sync standby")
18 changes: 11 additions & 7 deletions tests/unit/test_patroni.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

from charm import PostgresqlOperatorCharm
from constants import REWIND_USER
from patroni import PATRONI_TIMEOUT, Patroni, SwitchoverFailedError
from patroni import PATRONI_TIMEOUT, Patroni, SwitchoverFailedError, SwitchoverNotSyncError
from tests.helpers import STORAGE_PATH


Expand Down Expand Up @@ -331,11 +331,9 @@ def test_switchover(harness, patroni):
# Test failed switchovers.
_post.reset_mock()
_get_primary.side_effect = ["postgresql-k8s-0", "postgresql-k8s-1"]
try:
with pytest.raises(SwitchoverFailedError):
patroni.switchover("postgresql-k8s/2")
assert False
except SwitchoverFailedError:
pass
_post.assert_called_once_with(
"http://postgresql-k8s-0:8008/switchover",
json={"leader": "postgresql-k8s-0", "candidate": "postgresql-k8s-2"},
Expand All @@ -347,11 +345,9 @@ def test_switchover(harness, patroni):
_post.reset_mock()
_get_primary.side_effect = ["postgresql-k8s-0", "postgresql-k8s-2"]
response.status_code = 400
try:
with pytest.raises(SwitchoverFailedError):
patroni.switchover("postgresql-k8s/2")
assert False
except SwitchoverFailedError:
pass
_post.assert_called_once_with(
"http://postgresql-k8s-0:8008/switchover",
json={"leader": "postgresql-k8s-0", "candidate": "postgresql-k8s-2"},
Expand All @@ -360,6 +356,14 @@ def test_switchover(harness, patroni):
timeout=PATRONI_TIMEOUT,
)

# Test candidate, not sync
response = _post.return_value
response.status_code = 412
response.text = "candidate name does not match with sync_standby"
with pytest.raises(SwitchoverNotSyncError):
patroni.switchover("candidate")
assert False


def test_member_replication_lag(harness, patroni):
with (
Expand Down

0 comments on commit c9f6554

Please sign in to comment.