Skip to content

Commit

Permalink
[MISC] Add degraded status to primary message (#874)
Browse files Browse the repository at this point in the history
* Add degraded status to the primary message

* Use const running tests
  • Loading branch information
dragomirp authored Feb 26, 2025
1 parent 3b38f9d commit e92a015
Show file tree
Hide file tree
Showing 4 changed files with 62 additions and 53 deletions.
14 changes: 10 additions & 4 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -1032,10 +1032,16 @@ def _set_active_status(self):
self.app_peer_data["s3-initialization-block-message"]
)
return
if self._patroni.get_primary(unit_name_pattern=True) == self.unit.name:
self.unit.status = ActiveStatus("Primary")
elif self.is_standby_leader:
self.unit.status = ActiveStatus("Standby")
if (
self._patroni.get_primary(unit_name_pattern=True) == self.unit.name
or self.is_standby_leader
):
danger_state = ""
if len(self._patroni.get_running_cluster_members()) < self.app.planned_units():
danger_state = " (degraded)"
self.unit.status = ActiveStatus(
f"{'Standby' if self.is_standby_leader else 'Primary'}{danger_state}"
)
elif self._patroni.member_started:
self.unit.status = ActiveStatus()
except (RetryError, RequestsConnectionError) as e:
Expand Down
86 changes: 40 additions & 46 deletions src/patroni.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,19 @@ def update_synchronous_node_count(self) -> None:
if r.status_code != 200:
raise UpdateSyncNodeCountError(f"received {r.status_code}")

def get_cluster(
    self, attempt: AttemptManager, alternative_endpoints: list[str] | None = None
) -> dict[str, str | int]:
    """Call the cluster endpoint.

    Args:
        attempt: current retry attempt, used to select which Patroni
            endpoint to query for this try.
        alternative_endpoints: optional endpoints to query instead of the
            default ones.

    Returns:
        The parsed JSON payload of Patroni's ``/cluster`` endpoint; callers
        read the ``"members"`` list from it.
        NOTE(review): the annotated return type ``dict[str, str | int]`` is
        narrower than the actual payload ("members" is a list of dicts) —
        consider ``dict[str, Any]``; confirm against Patroni's REST API.
    """
    # Endpoint for this attempt is chosen by the helper (varies per attempt).
    url = self._get_alternative_patroni_url(attempt, alternative_endpoints)
    r = requests.get(
        f"{url}/cluster",
        verify=self._verify,
        auth=self._patroni_auth,
        timeout=PATRONI_TIMEOUT,
    )
    # NOTE(review): no status-code check here — a non-200 response surfaces
    # as a JSON decode error or bad payload to the caller; verify intended.
    return r.json()

def get_primary(
self, unit_name_pattern=False, alternative_endpoints: list[str] | None = None
) -> str:
Expand All @@ -180,11 +193,7 @@ def get_primary(
# Request info from cluster endpoint (which returns all members of the cluster).
for attempt in Retrying(stop=stop_after_attempt(len(self._endpoints) + 1)):
with attempt:
url = self._get_alternative_patroni_url(attempt, alternative_endpoints)
r = requests.get(
f"{url}/cluster", verify=self._verify, timeout=5, auth=self._patroni_auth
)
for member in r.json()["members"]:
for member in self.get_cluster(attempt, alternative_endpoints)["members"]:
if member["role"] == "leader":
primary = member["name"]
if unit_name_pattern:
Expand All @@ -209,14 +218,7 @@ def get_standby_leader(
# Request info from cluster endpoint (which returns all members of the cluster).
for attempt in Retrying(stop=stop_after_attempt(len(self._endpoints) + 1)):
with attempt:
url = self._get_alternative_patroni_url(attempt)
r = requests.get(
f"{url}/cluster",
verify=self._verify,
auth=self._patroni_auth,
timeout=PATRONI_TIMEOUT,
)
for member in r.json()["members"]:
for member in self.get_cluster(attempt)["members"]:
if member["role"] == "standby_leader":
if check_whether_is_running and member["state"] not in RUNNING_STATES:
logger.warning(f"standby leader {member['name']} is not running")
Expand All @@ -234,30 +236,33 @@ def get_sync_standby_names(self) -> list[str]:
# Request info from cluster endpoint (which returns all members of the cluster).
for attempt in Retrying(stop=stop_after_attempt(len(self._endpoints) + 1)):
with attempt:
url = self._get_alternative_patroni_url(attempt)
r = requests.get(
f"{url}/cluster",
verify=self._verify,
auth=self._patroni_auth,
timeout=PATRONI_TIMEOUT,
)
for member in r.json()["members"]:
for member in self.get_cluster(attempt)["members"]:
if member["role"] == "sync_standby":
sync_standbys.append("/".join(member["name"].rsplit("-", 1)))
return sync_standbys

@property
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
def cluster_members(self) -> set:
"""Get the current cluster members."""
# Request info from cluster endpoint (which returns all members of the cluster).
r = requests.get(
f"{self._patroni_url}/cluster",
verify=self._verify,
auth=self._patroni_auth,
timeout=PATRONI_TIMEOUT,
)
return {member["name"] for member in r.json()["members"]}
for attempt in Retrying(
stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10)
):
with attempt:
return {member["name"] for member in self.get_cluster(attempt)["members"]}

def get_running_cluster_members(self) -> list[str]:
    """List running patroni members.

    Returns:
        Names of cluster members whose ``state`` is in ``RUNNING_STATES``,
        or an empty list if the cluster endpoint cannot be queried.
    """
    try:
        # Single-attempt Retrying is used only to obtain the AttemptManager
        # that get_cluster() requires for endpoint selection — no real retry.
        for attempt in Retrying(stop=stop_after_attempt(1)):
            with attempt:
                return [
                    member["name"]
                    for member in self.get_cluster(attempt)["members"]
                    if member["state"] in RUNNING_STATES
                ]
    except Exception:
        # Best effort: any failure (connection, JSON, retry) is reported as
        # "no running members". NOTE(review): broad catch — presumably to
        # keep status reporting non-fatal; confirm narrower exceptions
        # (RetryError, RequestsConnectionError) wouldn't suffice.
        return []

def are_all_members_ready(self) -> bool:
"""Check if all members are correctly running Patroni and PostgreSQL.
Expand All @@ -271,17 +276,13 @@ def are_all_members_ready(self) -> bool:
try:
for attempt in Retrying(stop=stop_after_delay(10), wait=wait_fixed(3)):
with attempt:
r = requests.get(
f"{self._patroni_url}/cluster",
verify=self._verify,
auth=self._patroni_auth,
timeout=PATRONI_TIMEOUT,
return all(
member["state"] in RUNNING_STATES
for member in self.get_cluster(attempt)["members"]
)
except RetryError:
return False

return all(member["state"] in RUNNING_STATES for member in r.json()["members"])

@property
def is_creating_backup(self) -> bool:
"""Returns whether a backup is being created."""
Expand All @@ -291,20 +292,13 @@ def is_creating_backup(self) -> bool:
try:
for attempt in Retrying(stop=stop_after_delay(10), wait=wait_fixed(3)):
with attempt:
r = requests.get(
f"{self._patroni_url}/cluster",
verify=self._verify,
auth=self._patroni_auth,
timeout=PATRONI_TIMEOUT,
return any(
"tags" in member and member["tags"].get("is_creating_backup")
for member in self.get_cluster(attempt)["members"]
)
except RetryError:
return False

return any(
"tags" in member and member["tags"].get("is_creating_backup")
for member in r.json()["members"]
)

@property
def is_replication_healthy(self) -> bool:
"""Return whether the replication is healthy."""
Expand Down
11 changes: 10 additions & 1 deletion tests/unit/test_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -479,6 +479,12 @@ def test_on_update_status(harness):
patch("ops.model.Container.pebble") as _pebble,
patch("ops.model.Container.restart") as _restart,
patch("upgrade.PostgreSQLUpgrade.idle", return_value="idle"),
patch(
"charm.PostgresqlOperatorCharm.is_standby_leader",
new_callable=PropertyMock,
return_value=False,
),
patch("charm.Patroni.get_running_cluster_members", return_value=["test"]),
):
# Early exit on can connect.
harness.set_can_connect(POSTGRESQL_CONTAINER, False)
Expand Down Expand Up @@ -1740,6 +1746,7 @@ def test_handle_postgresql_restart_need(harness):
def test_set_active_status(harness):
with (
patch("charm.Patroni.member_started", new_callable=PropertyMock) as _member_started,
patch("charm.Patroni.get_running_cluster_members", return_value=["test"]),
patch(
"charm.PostgresqlOperatorCharm.is_standby_leader", new_callable=PropertyMock
) as _is_standby_leader,
Expand Down Expand Up @@ -1772,7 +1779,9 @@ def test_set_active_status(harness):
assert isinstance(harness.charm.unit.status, MaintenanceStatus)
else:
_is_standby_leader.side_effect = None
_is_standby_leader.return_value = values[1]
_is_standby_leader.return_value = (
values[0] != harness.charm.unit.name and values[1]
)
harness.charm._set_active_status()
assert isinstance(
harness.charm.unit.status,
Expand Down
4 changes: 2 additions & 2 deletions tests/unit/test_patroni.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def test_get_primary(harness, patroni):
_get.assert_called_once_with(
"http://postgresql-k8s-0:8008/cluster",
verify=True,
timeout=5,
timeout=10,
auth=patroni._patroni_auth,
)

Expand All @@ -99,7 +99,7 @@ def test_get_primary(harness, patroni):
_get.assert_called_once_with(
"http://postgresql-k8s-0:8008/cluster",
verify=True,
timeout=5,
timeout=10,
auth=patroni._patroni_auth,
)

Expand Down

0 comments on commit e92a015

Please sign in to comment.