diff --git a/engine/apps/alerts/models/alert_receive_channel.py b/engine/apps/alerts/models/alert_receive_channel.py index 7a351d2aad..57021c1156 100644 --- a/engine/apps/alerts/models/alert_receive_channel.py +++ b/engine/apps/alerts/models/alert_receive_channel.py @@ -29,7 +29,7 @@ metrics_remove_deleted_integration_from_cache, metrics_update_integration_cache, ) -from apps.slack.constants import SLACK_RATE_LIMIT_DELAY, SLACK_RATE_LIMIT_TIMEOUT +from apps.slack.constants import SLACK_RATE_LIMIT_TIMEOUT from apps.slack.tasks import post_slack_rate_limit_message from apps.slack.utils import post_message_to_channel from common.api_helpers.utils import create_engine_url @@ -43,7 +43,7 @@ from apps.alerts.models import AlertGroup, ChannelFilter from apps.labels.models import AlertReceiveChannelAssociatedLabel - from apps.user_management.models import Organization, Team + from apps.user_management.models import Organization, Team, User logger = logging.getLogger(__name__) @@ -391,7 +391,7 @@ def save(self, *args, **kwargs): return super().save(*args, **kwargs) - def change_team(self, team_id, user): + def change_team(self, team_id: int, user: "User") -> None: if team_id == self.team_id: raise TeamCanNotBeChangedError("Integration is already in this team") @@ -409,26 +409,26 @@ def grafana_alerting_sync_manager(self): return GrafanaAlertingSyncManager(self) @property - def is_alerting_integration(self): + def is_alerting_integration(self) -> bool: return self.integration in { AlertReceiveChannel.INTEGRATION_GRAFANA_ALERTING, AlertReceiveChannel.INTEGRATION_LEGACY_GRAFANA_ALERTING, } @cached_property - def team_name(self): + def team_name(self) -> str: return self.team.name if self.team else "No team" @cached_property - def team_id_or_no_team(self): + def team_id_or_no_team(self) -> str: return self.team_id if self.team else "no_team" @cached_property - def emojized_verbal_name(self): + def emojized_verbal_name(self) -> str: return emoji.emojize(self.verbal_name, language="alias") @property - def new_incidents_web_link(self): + def new_incidents_web_link(self) -> str: from apps.alerts.models import AlertGroup return UIURLBuilder(self.organization).alert_groups( @@ -436,25 +436,27 @@ def new_incidents_web_link(self): ) @property - def is_rate_limited_in_slack(self): + def is_rate_limited_in_slack(self) -> bool: return ( self.rate_limited_in_slack_at is not None and self.rate_limited_in_slack_at + SLACK_RATE_LIMIT_TIMEOUT > timezone.now() ) - def start_send_rate_limit_message_task(self, delay=SLACK_RATE_LIMIT_DELAY): + def start_send_rate_limit_message_task(self, error_message_verb: str, delay: int) -> None: task_id = celery_uuid() + self.rate_limit_message_task_id = task_id self.rate_limited_in_slack_at = timezone.now() self.save(update_fields=["rate_limit_message_task_id", "rate_limited_in_slack_at"]) - post_slack_rate_limit_message.apply_async((self.pk,), countdown=delay, task_id=task_id) + + post_slack_rate_limit_message.apply_async((self.pk, error_message_verb), countdown=delay, task_id=task_id) @property - def alert_groups_count(self): + def alert_groups_count(self) -> int: return self.alert_groups.count() @property - def alerts_count(self): + def alerts_count(self) -> int: from apps.alerts.models import Alert return Alert.objects.filter(group__channel=self).count() @@ -548,14 +550,14 @@ def integration_url(self) -> str | None: return create_engine_url(f"integrations/v1/{slug}/{self.token}/") @property - def inbound_email(self): + def inbound_email(self) -> typing.Optional[str]: if self.integration != AlertReceiveChannel.INTEGRATION_INBOUND_EMAIL: return None return f"{self.token}@{live_settings.INBOUND_EMAIL_DOMAIN}" @property - def default_channel_filter(self): + def default_channel_filter(self) -> typing.Optional["ChannelFilter"]: return self.channel_filters.filter(is_default=True).first() # Templating @@ -590,7 +592,7 @@ def templates(self): } @property - def is_available_for_custom_templates(self): + def is_available_for_custom_templates(self) -> bool: return True # Maintenance diff --git a/engine/apps/slack/alert_group_slack_service.py b/engine/apps/slack/alert_group_slack_service.py index ed614305f8..4ac6b69dd3 100644 --- a/engine/apps/slack/alert_group_slack_service.py +++ b/engine/apps/slack/alert_group_slack_service.py @@ -1,6 +1,8 @@ import logging import typing +from django.core.cache import cache + from apps.slack.client import SlackClient from apps.slack.errors import ( SlackAPICantUpdateMessageError, @@ -23,6 +25,11 @@ class AlertGroupSlackService: _slack_client: SlackClient + UPDATE_ALERT_GROUP_DEBOUNCE_INTERVAL_SECONDS = 30 + """ + Time in seconds to wait before allowing the next update to the Alert Group slack message + """ + def __init__( self, slack_team_identity: "SlackTeamIdentity", @@ -35,7 +42,15 @@ def __init__( self._slack_client = SlackClient(slack_team_identity) def update_alert_group_slack_message(self, alert_group: "AlertGroup") -> None: - logger.info(f"Update message for alert_group {alert_group.pk}") + alert_group_pk = alert_group.pk + debounce_alert_group_update_cache_key = f"debounce_update_alert_group_slack_message_{alert_group_pk}" + + logger.info(f"Update message for alert_group {alert_group_pk}") + + # Check if the method has been called recently for this alert_group, if so skip to avoid approaching rate limits + if cache.get(debounce_alert_group_update_cache_key): + logger.info(f"Skipping update for alert_group {alert_group_pk} due to debounce interval") + return try: self._slack_client.chat_update( @@ -44,13 +59,14 @@ def update_alert_group_slack_message(self, alert_group: "AlertGroup") -> None: attachments=alert_group.render_slack_attachments(), blocks=alert_group.render_slack_blocks(), ) - logger.info(f"Message has been updated for alert_group {alert_group.pk}") + + logger.info(f"Message has been updated for alert_group {alert_group_pk}") except SlackAPIRatelimitError as e: if not alert_group.channel.is_maintenace_integration: if not alert_group.channel.is_rate_limited_in_slack: - alert_group.channel.start_send_rate_limit_message_task(e.retry_after) + alert_group.channel.start_send_rate_limit_message_task("Updating", e.retry_after) logger.info( - f"Message has not been updated for alert_group {alert_group.pk} due to slack rate limit." + f"Message has not been updated for alert_group {alert_group_pk} due to slack rate limit." ) else: raise @@ -62,6 +78,9 @@ def update_alert_group_slack_message(self, alert_group: "AlertGroup") -> None: SlackAPIChannelNotFoundError, ): pass + finally: + # Set the cache key to enforce debounce interval + cache.set(debounce_alert_group_update_cache_key, True, self.UPDATE_ALERT_GROUP_DEBOUNCE_INTERVAL_SECONDS) def publish_message_to_alert_group_thread( self, alert_group: "AlertGroup", attachments=None, mrkdwn=True, unfurl_links=True, text=None diff --git a/engine/apps/slack/scenarios/distribute_alerts.py b/engine/apps/slack/scenarios/distribute_alerts.py index 3d3c1a60a8..fe7f9ac87c 100644 --- a/engine/apps/slack/scenarios/distribute_alerts.py +++ b/engine/apps/slack/scenarios/distribute_alerts.py @@ -159,7 +159,7 @@ def _post_alert_group_to_slack( if not alert_group.channel.is_maintenace_integration: alert_group.reason_to_skip_escalation = AlertGroup.RATE_LIMITED alert_group.save(update_fields=["reason_to_skip_escalation"]) - alert_group.channel.start_send_rate_limit_message_task(e.retry_after) + alert_group.channel.start_send_rate_limit_message_task("Delivering", e.retry_after) logger.info("Not delivering alert due to slack rate limit.") else: raise e diff --git a/engine/apps/slack/tasks.py b/engine/apps/slack/tasks.py index 84e5ca7653..b68b589064 100644 --- a/engine/apps/slack/tasks.py +++ b/engine/apps/slack/tasks.py @@ -287,7 +287,7 @@ def populate_slack_user_identities(organization_pk): @shared_dedicated_queue_retry_task( autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None ) -def post_slack_rate_limit_message(integration_id): +def post_slack_rate_limit_message(integration_id: int, error_message_verb: str) -> None: from apps.alerts.models import AlertReceiveChannel try: @@ -306,10 +306,9 @@ def post_slack_rate_limit_message(integration_id): default_route = integration.channel_filters.get(is_default=True) if (slack_channel_id := default_route.slack_channel_id_or_org_default_id) is not None: text = ( - f"Delivering and updating alert groups of integration {integration.verbal_name} in Slack is " - f"temporarily stopped due to rate limit. You could find new alert groups at " - f"<{integration.new_incidents_web_link}|web page " - '"Alert Groups">' + f"{error_message_verb} Alert Groups in Slack, for integration {integration.verbal_name}, is " + f"temporarily rate-limited (due to a Slack rate-limit). Meanwhile, you can still find new Alert Groups " + f'in the <{integration.new_incidents_web_link}|"Alert Groups" web page>' ) post_message_to_channel(integration.organization, slack_channel_id, text) diff --git a/engine/apps/slack/tests/test_alert_group_slack_service.py b/engine/apps/slack/tests/test_alert_group_slack_service.py new file mode 100644 index 0000000000..5f1dd4db14 --- /dev/null +++ b/engine/apps/slack/tests/test_alert_group_slack_service.py @@ -0,0 +1,311 @@ +from unittest.mock import patch, ANY + +import pytest + +from apps.slack.alert_group_slack_service import AlertGroupSlackService +from apps.slack.errors import ( + SlackAPICantUpdateMessageError, + SlackAPIChannelInactiveError, + SlackAPIChannelNotFoundError, + SlackAPIMessageNotFoundError, + SlackAPIRatelimitError, + SlackAPITokenError, +) + + +class MockSlackResponse: + headers = {"Retry-After": 123} + + +class TestAlertGroupSlackService: + + @patch("apps.slack.alert_group_slack_service.cache") + @patch("apps.slack.alert_group_slack_service.SlackClient.chat_update") + @pytest.mark.django_db + def test_update_alert_group_slack_message_successful( + self, + mock_slack_client_chat_update, + mock_cache, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_alert_group, + make_alert, + make_slack_message, + ): + """ + Test that the Slack message is updated successfully when not debounced. + """ + slack_message_channel_id = "C12345" + slack_message_slack_id = "1234567890.123456" + + organization, slack_team_identity = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + make_alert(alert_group=alert_group, raw_request_data={}) + make_slack_message( + alert_group=alert_group, + channel_id=slack_message_channel_id, + slack_id=slack_message_slack_id, + ) + + cache_key = f"debounce_update_alert_group_slack_message_{alert_group.pk}" + + # Cache does not have the key + mock_cache.get.return_value = None + + # Call the method + service = AlertGroupSlackService(slack_team_identity=slack_team_identity) + service.update_alert_group_slack_message(alert_group) + + # Assert that the cache was checked + mock_cache.get.assert_called_once_with(cache_key) + + # Assert that Slack client's chat_update was called with correct parameters + mock_slack_client_chat_update.assert_called_once_with( + channel=slack_message_channel_id, + ts=slack_message_slack_id, + attachments=ANY, + blocks=ANY, + ) + + # Assert that the cache key was set + mock_cache.set.assert_called_once_with(cache_key, True, 30) + + @patch("apps.slack.alert_group_slack_service.cache") + @patch("apps.slack.alert_group_slack_service.SlackClient.chat_update") + @pytest.mark.django_db + def test_update_alert_group_slack_message_debounced( + self, + mock_slack_client_chat_update, + mock_cache, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_alert_group, + ): + """ + Test that the update is skipped due to debounce interval. + """ + organization, slack_team_identity = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + cache_key = f"debounce_update_alert_group_slack_message_{alert_group.pk}" + + # Cache has the key (debounced) + mock_cache.get.return_value = True + + # Call the method + service = AlertGroupSlackService(slack_team_identity=slack_team_identity) + service.update_alert_group_slack_message(alert_group) + + # Assert that the cache was checked + mock_cache.get.assert_called_with(cache_key) + + # Assert that Slack client's chat_update was not called + mock_slack_client_chat_update.assert_not_called() + + # Assert that the cache set method was not called since the method returns early + mock_cache.set.assert_not_called() + + @patch("apps.slack.alert_group_slack_service.cache") + @patch("apps.slack.alert_group_slack_service.SlackClient.chat_update") + @patch("apps.alerts.models.AlertReceiveChannel.start_send_rate_limit_message_task") + @pytest.mark.django_db + def test_update_alert_group_slack_message_ratelimit_error_not_maintenance( + self, + mock_start_send_rate_limit_message_task, + mock_slack_client_chat_update, + mock_cache, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_alert_group, + make_alert, + make_slack_message, + ): + """ + Test handling of SlackAPIRatelimitError when not a maintenance integration. + """ + slack_message_channel_id = "C12345" + slack_message_slack_id = "1234567890.123456" + + organization, slack_team_identity = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization) + + # Ensure channel is not a maintenance integration and not already rate-limited + assert alert_receive_channel.is_maintenace_integration is False + assert alert_receive_channel.is_rate_limited_in_slack is False + + alert_group = make_alert_group(alert_receive_channel) + make_alert(alert_group=alert_group, raw_request_data={}) + make_slack_message( + alert_group=alert_group, + channel_id=slack_message_channel_id, + slack_id=slack_message_slack_id, + ) + + cache_key = f"debounce_update_alert_group_slack_message_{alert_group.pk}" + + # Cache does not have the key + mock_cache.get.return_value = None + + # Slack client raises SlackAPIRatelimitError + slack_api_ratelimit_error = SlackAPIRatelimitError(MockSlackResponse()) + mock_slack_client_chat_update.side_effect = slack_api_ratelimit_error + + # Call the method + service = AlertGroupSlackService(slack_team_identity=slack_team_identity) + service.update_alert_group_slack_message(alert_group) + + # Assert that start_send_rate_limit_message_task was called + mock_start_send_rate_limit_message_task.assert_called_with( + "Updating", + slack_api_ratelimit_error.retry_after + ) + + # Assert that cache key was set + mock_cache.set.assert_called_with(cache_key, True, 30) + + @patch("apps.slack.alert_group_slack_service.cache") + @patch("apps.slack.alert_group_slack_service.SlackClient.chat_update") + @patch("apps.alerts.models.AlertReceiveChannel.start_send_rate_limit_message_task") + @pytest.mark.django_db + def test_update_alert_group_slack_message_ratelimit_error_is_maintenance( + self, + mock_start_send_rate_limit_message_task, + mock_slack_client_chat_update, + mock_cache, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_alert_group, + make_alert, + make_slack_message, + ): + """ + Test that SlackAPIRatelimitError is re-raised when it is a maintenance integration. + """ + organization, slack_team_identity = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization, integration="maintenance") + + # Ensure channel is a maintenance integration and not already rate-limited + assert alert_receive_channel.is_maintenace_integration is True + assert alert_receive_channel.is_rate_limited_in_slack is False + + alert_group = make_alert_group(alert_receive_channel) + make_alert(alert_group=alert_group, raw_request_data={}) + make_slack_message(alert_group=alert_group) + + cache_key = f"debounce_update_alert_group_slack_message_{alert_group.pk}" + + # Cache does not have the key + mock_cache.get.return_value = None + + # Slack client raises SlackAPIRatelimitError + slack_api_ratelimit_error = SlackAPIRatelimitError(MockSlackResponse()) + mock_slack_client_chat_update.side_effect = slack_api_ratelimit_error + + # Call the method and expect exception to be raised + with pytest.raises(SlackAPIRatelimitError): + service = AlertGroupSlackService(slack_team_identity=slack_team_identity) + service.update_alert_group_slack_message(alert_group) + + # Assert that start_send_rate_limit_message_task was not called + mock_start_send_rate_limit_message_task.assert_not_called() + + # Assert that cache key was set even when exception occurred + mock_cache.set.assert_called_with(cache_key, True, 30) + + @patch("apps.slack.alert_group_slack_service.cache") + @patch("apps.slack.alert_group_slack_service.SlackClient.chat_update") + @patch("apps.alerts.models.AlertReceiveChannel.start_send_rate_limit_message_task") + @pytest.mark.parametrize("ExceptionClass", [ + SlackAPIMessageNotFoundError, + SlackAPICantUpdateMessageError, + SlackAPIChannelInactiveError, + SlackAPITokenError, + SlackAPIChannelNotFoundError, + ]) + @pytest.mark.django_db + def test_update_alert_group_slack_message_other_exceptions( + self, + mock_start_send_rate_limit_message_task, + mock_slack_client_chat_update, + mock_cache, + ExceptionClass, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_alert_group, + make_alert, + make_slack_message, + ): + """ + Test that other Slack API exceptions are handled silently. + """ + organization, slack_team_identity = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + make_alert(alert_group=alert_group, raw_request_data={}) + make_slack_message(alert_group=alert_group) + + cache_key = f"debounce_update_alert_group_slack_message_{alert_group.pk}" + + # Cache does not have the key + mock_cache.get.return_value = None + + # Slack client raises the exception class + mock_slack_client_chat_update.side_effect = ExceptionClass("foo bar") + + try: + # Call the method + service = AlertGroupSlackService(slack_team_identity=slack_team_identity) + service.update_alert_group_slack_message(alert_group) + except: + # Assert that no exception was raised + pytest.fail() + + # Assert that start_send_rate_limit_message_task was not called + mock_start_send_rate_limit_message_task.assert_not_called() + + # Assert that cache key was set + mock_cache.set.assert_called_with(cache_key, True, 30) + + @patch("apps.slack.alert_group_slack_service.cache") + @patch("apps.slack.alert_group_slack_service.SlackClient.chat_update") + @patch("apps.alerts.models.AlertReceiveChannel.start_send_rate_limit_message_task") + @pytest.mark.django_db + def test_update_alert_group_slack_message_cache_key_set_on_exception( + self, + mock_start_send_rate_limit_message_task, + mock_slack_client_chat_update, + mock_cache, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_alert_group, + make_alert, + make_slack_message, + ): + """ + Test that the cache key is set even when an unexpected exception occurs. + """ + organization, slack_team_identity = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + make_alert(alert_group=alert_group, raw_request_data={}) + make_slack_message(alert_group=alert_group) + + cache_key = f"debounce_update_alert_group_slack_message_{alert_group.pk}" + + # Cache does not have the key + mock_cache.get.return_value = None + + # Slack client raises a generic exception + mock_slack_client_chat_update.side_effect = Exception("Unexpected error") + + # Call the method and expect the exception to propagate + with pytest.raises(Exception): + service = AlertGroupSlackService(slack_team_identity=slack_team_identity) + service.update_alert_group_slack_message(alert_group) + + # Assert that start_send_rate_limit_message_task was not called + mock_start_send_rate_limit_message_task.assert_not_called() + + # Assert that cache key was set even when exception occurred + mock_cache.set.assert_called_with(cache_key, True, 30)