From 4ace52eb7489eba6f1c151e960d6f67149f7f1f9 Mon Sep 17 00:00:00 2001 From: Duncan Watson Date: Wed, 15 Jan 2025 12:01:40 +0000 Subject: [PATCH] EES-5689 - added various failure alerts --- .../application/public-api/publicApiApp.bicep | 3 + .../public-api/publicApiAppInsights.bicep | 10 +++ .../public-api/publicApiDataProcessor.bicep | 1 + .../application/shared/appGateway.bicep | 2 + .../shared/postgreSqlFlexibleServer.bicep | 2 + .../alerts/dynamicAlertConfig.bicep | 14 ++++ .../components/alerts/resourceMetrics.bicep | 47 +++++++++++-- .../public-api/components/appGateway.bicep | 49 ++++++++++++- .../public-api/components/appInsights.bicep | 68 +++++++++++++++++++ .../public-api/components/containerApp.bicep | 54 +++++++++++++++ .../public-api/components/functionApp.bicep | 46 ++++++++++++- .../components/postgresqlDatabase.bicep | 44 +++++++++++- .../templates/public-api/main.bicep | 1 + 13 files changed, 331 insertions(+), 10 deletions(-) diff --git a/infrastructure/templates/public-api/application/public-api/publicApiApp.bicep b/infrastructure/templates/public-api/application/public-api/publicApiApp.bicep index 2b4b0b76df3..6aaf6b51d11 100644 --- a/infrastructure/templates/public-api/application/public-api/publicApiApp.bicep +++ b/infrastructure/templates/public-api/application/public-api/publicApiApp.bicep @@ -166,6 +166,9 @@ module apiContainerAppModule '../../components/containerApp.bicep' = { responseTime: true cpuPercentage: true memoryPercentage: true + connectionTimeouts: true + requestRetries: true + requestTimeouts: true alertsGroupName: resourceNames.existingResources.alertsGroup } : null tagValues: tagValues diff --git a/infrastructure/templates/public-api/application/public-api/publicApiAppInsights.bicep b/infrastructure/templates/public-api/application/public-api/publicApiAppInsights.bicep index 844b8f8d41c..59b38134136 100644 --- a/infrastructure/templates/public-api/application/public-api/publicApiAppInsights.bicep +++ b/infrastructure/templates/public-api/application/public-api/publicApiAppInsights.bicep @@ -6,11 +6,21 @@ param resourceNames ResourceNames @description('Specifies the location for all resources.') param location string +@description('Specifies a set of tags with which to tag the resource in Azure.') +param tagValues object + module applicationInsightsModule '../../components/appInsights.bicep' = { name: 'appInsightsDeploy' params: { location: location appInsightsName: resourceNames.publicApi.appInsights + alerts: { + exceptionCount: true + exceptionServerCount: true + failedRequests: true + alertsGroupName: resourceNames.existingResources.alertsGroup + } + tagValues: tagValues } } diff --git a/infrastructure/templates/public-api/application/public-api/publicApiDataProcessor.bicep b/infrastructure/templates/public-api/application/public-api/publicApiDataProcessor.bicep index 0a94666e728..813531368c6 100644 --- a/infrastructure/templates/public-api/application/public-api/publicApiDataProcessor.bicep +++ b/infrastructure/templates/public-api/application/public-api/publicApiDataProcessor.bicep @@ -121,6 +121,7 @@ module dataProcessorFunctionAppModule '../../components/functionApp.bicep' = { fileServiceAvailability: true fileServiceLatency: false fileServiceCapacity: true + httpErrors: true alertsGroupName: resourceNames.existingResources.alertsGroup } : null tagValues: tagValues diff --git a/infrastructure/templates/public-api/application/shared/appGateway.bicep b/infrastructure/templates/public-api/application/shared/appGateway.bicep index 73fc3fed4a7..8f9d31a66ac 100644 --- a/infrastructure/templates/public-api/application/shared/appGateway.bicep +++ b/infrastructure/templates/public-api/application/shared/appGateway.bicep @@ -54,6 +54,8 @@ module appGatewayModule '../../components/appGateway.bicep' = { alerts: deployAlerts ? { health: true responseTime: true + failedRequests: true + responseStatuses: true alertsGroupName: resourceNames.existingResources.alertsGroup } : null tagValues: tagValues diff --git a/infrastructure/templates/public-api/application/shared/postgreSqlFlexibleServer.bicep b/infrastructure/templates/public-api/application/shared/postgreSqlFlexibleServer.bicep index 55a2bb283df..f71557ad772 100644 --- a/infrastructure/templates/public-api/application/shared/postgreSqlFlexibleServer.bicep +++ b/infrastructure/templates/public-api/application/shared/postgreSqlFlexibleServer.bicep @@ -72,6 +72,8 @@ module postgreSqlServerModule '../../components/postgresqlDatabase.bicep' = { diskIops: true memoryPercentage: true capacity: true + failedConnections: true + deadlocks: true alertsGroupName: resourceNames.existingResources.alertsGroup } : null tagValues: tagValues diff --git a/infrastructure/templates/public-api/components/alerts/dynamicAlertConfig.bicep b/infrastructure/templates/public-api/components/alerts/dynamicAlertConfig.bicep index 72b67d33a2f..729eba2c5ae 100644 --- a/infrastructure/templates/public-api/components/alerts/dynamicAlertConfig.bicep +++ b/infrastructure/templates/public-api/components/alerts/dynamicAlertConfig.bicep @@ -75,6 +75,20 @@ var dynamicMaxGreaterThan = { operator: 'GreaterThan' } +@export() +var dynamicTotalGreaterThan = { + ...defaultDynamicAlertConfig + aggregation: 'Total' + operator: 'GreaterThan' +} + +@export() +var dynamicCountGreaterThan = { + ...defaultDynamicAlertConfig + aggregation: 'Count' + operator: 'GreaterThan' +} + @export() var cpuPercentageConfig = { ...dynamicAverageGreaterThan diff --git a/infrastructure/templates/public-api/components/alerts/resourceMetrics.bicep b/infrastructure/templates/public-api/components/alerts/resourceMetrics.bicep index 4198993190a..b1cb12e39c8 100644 --- a/infrastructure/templates/public-api/components/alerts/resourceMetrics.bicep +++ b/infrastructure/templates/public-api/components/alerts/resourceMetrics.bicep @@ -4,7 +4,24 @@ type AppGatewayMetric = { resourceType: 'Microsoft.Network/applicationGateways' metric: | 'ApplicationGatewayTotalTime' + | 'FailedRequests' | 'UnhealthyHostCount' + | 'ResponseStatus' + dimensions: { + name: + | 'BackendSettingsPool' + | 'HttpStatusGroup' + operator: DimensionOperator? + values: string[] + }[]? +} + +type AppInsightsMetric = { + resourceType: 'Microsoft.Insights/components' + metric: + | 'exceptions/count' + | 'exceptions/server' + | 'requests/failed' } type AppServicePlanMetric = { @@ -19,29 +36,42 @@ type ContainerAppMetric = { metric: | 'CpuPercentage' | 'MemoryPercentage' + | 'ResiliencyConnectTimeouts' + | 'ResiliencyRequestRetries' + | 'ResiliencyRequestTimeouts' | 'ResponseTime' | 'RestartCount' } type FileServiceMetric = { resourceType: 'Microsoft.Storage/storageAccounts/fileServices' - dimensions: { - name: 'FileShare' | 'Tier' - operator: DimensionOperator? - values: string[] - }[]? metric: | 'availability' | 'FileCapacity' | 'SuccessE2ELatency' + dimensions: { + name: + | 'FileShare' + | 'Tier' + operator: DimensionOperator? + values: string[] + }[]? } type PostgreSqlMetric = { resourceType: 'Microsoft.DBforPostgreSQL/flexibleServers' + dimensions: { + name: + | 'DatabaseName' + operator: DimensionOperator? + values: string[] + }[]? metric: | 'backup_storage_used' | 'client_connections_waiting' + | 'connections_failed' | 'cpu_percent' + | 'deadlocks' | 'disk_bandwidth_consumed_percentage' | 'disk_iops_consumed_percentage' | 'is_db_alive' @@ -55,6 +85,10 @@ type SiteMetric = { resourceType: 'Microsoft.Web/sites' metric: | 'HealthCheckStatus' + | 'Http401' + | 'Http403' + | 'Http4xx' + | 'Http5xx' } type StorageAccountMetric = { @@ -68,8 +102,9 @@ type StorageAccountMetric = { @export() @discriminator('resourceType') type ResourceMetric = -| AppServicePlanMetric | AppGatewayMetric +| AppInsightsMetric +| AppServicePlanMetric | ContainerAppMetric | ContainerAppMetric | FileServiceMetric diff --git a/infrastructure/templates/public-api/components/appGateway.bicep b/infrastructure/templates/public-api/components/appGateway.bicep index f327ebe8889..4845f7938fc 100644 --- a/infrastructure/templates/public-api/components/appGateway.bicep +++ b/infrastructure/templates/public-api/components/appGateway.bicep @@ -1,4 +1,4 @@ -import { responseTimeConfig } from 'alerts/dynamicAlertConfig.bicep' +import { responseTimeConfig, dynamicTotalGreaterThan } from 'alerts/dynamicAlertConfig.bicep' import { staticAverageGreaterThanZero } from 'alerts/staticAlertConfig.bicep' import { @@ -46,6 +46,8 @@ param availabilityZones ('1' | '2' | '3') [] = [ param alerts { health: bool responseTime: bool + failedRequests: bool + responseStatuses: bool alertsGroupName: string }? @@ -304,7 +306,6 @@ module backendPoolsHealthAlert 'alerts/staticMetricAlert.bicep' = if (alerts != } } - module responseTimeAlert 'alerts/dynamicMetricAlert.bicep' = if (alerts != null && alerts!.responseTime) { name: '${appGatewayName}ResponseTimeDeploy' params: { @@ -318,3 +319,47 @@ module responseTimeAlert 'alerts/dynamicMetricAlert.bicep' = if (alerts != null tagValues: tagValues } } + +module failedRequestsAlert 'alerts/dynamicMetricAlert.bicep' = if (alerts != null && alerts!.failedRequests) { + name: '${appGatewayName}FailedRequestsDeploy' + params: { + resourceName: appGatewayName + resourceMetric: { + resourceType: 'Microsoft.Network/applicationGateways' + metric: 'FailedRequests' + dimensions: [{ + name: 'BackendSettingsPool' + values: map(backends, backend => backend.name) + }] + } + config: { + ...dynamicTotalGreaterThan + nameSuffix: 'failed-requests' + windowSize: 'PT30M' + } + alertsGroupName: alerts!.alertsGroupName + tagValues: tagValues + } +} + +module responseStatusAlert 'alerts/dynamicMetricAlert.bicep' = if (alerts != null && alerts!.responseStatuses) { + name: '${appGatewayName}ResponseStatusDeploy' + params: { + resourceName: appGatewayName + resourceMetric: { + resourceType: 'Microsoft.Network/applicationGateways' + metric: 'ResponseStatus' + dimensions: [{ + name: 'HttpStatusGroup' + values: ['4xx', '5xx'] + }] + } + config: { + ...dynamicTotalGreaterThan + nameSuffix: 'http-4xx-5xx' + windowSize: 'PT30M' + } + alertsGroupName: alerts!.alertsGroupName + tagValues: tagValues + } +} diff --git a/infrastructure/templates/public-api/components/appInsights.bicep b/infrastructure/templates/public-api/components/appInsights.bicep index 73f5301aa67..088083e9df0 100644 --- a/infrastructure/templates/public-api/components/appInsights.bicep +++ b/infrastructure/templates/public-api/components/appInsights.bicep @@ -1,9 +1,22 @@ +import { dynamicCountGreaterThan } from 'alerts/dynamicAlertConfig.bicep' + @description('Specifies the location for all resources.') param location string @description('Specifies the Application Insights name') param appInsightsName string +@description('Whether to create or update Azure Monitor alerts during this deploy') +param alerts { + exceptionCount: bool + exceptionServerCount: bool + failedRequests: bool + alertsGroupName: string +}? + +@description('Tags for the resources') +param tagValues object + var kind = 'web' resource applicationInsights 'Microsoft.Insights/components@2020-02-02' = { @@ -15,6 +28,61 @@ resource applicationInsights 'Microsoft.Insights/components@2020-02-02' = { publicNetworkAccessForIngestion: 'Enabled' publicNetworkAccessForQuery: 'Enabled' } + tags: tagValues +} + +module exceptionCountAlert 'alerts/dynamicMetricAlert.bicep' = if (alerts != null && alerts!.exceptionCount) { + name: '${appInsightsName}ExceptionCountDeploy' + params: { + resourceName: appInsightsName + resourceMetric: { + resourceType: 'Microsoft.Insights/components' + metric: 'exceptions/count' + } + config: { + ...dynamicCountGreaterThan + nameSuffix: 'exception-count' + windowSize: 'PT30M' + } + alertsGroupName: alerts!.alertsGroupName + tagValues: tagValues + } +} + +module exceptionServerCountAlert 'alerts/dynamicMetricAlert.bicep' = if (alerts != null && alerts!.exceptionServerCount) { + name: '${appInsightsName}ExceptionServerCountDeploy' + params: { + resourceName: appInsightsName + resourceMetric: { + resourceType: 'Microsoft.Insights/components' + metric: 'exceptions/server' + } + config: { + ...dynamicCountGreaterThan + nameSuffix: 'server-exception-count' + windowSize: 'PT30M' + } + alertsGroupName: alerts!.alertsGroupName + tagValues: tagValues + } +} + +module failedRequestsAlert 'alerts/dynamicMetricAlert.bicep' = if (alerts != null && alerts!.failedRequests) { + name: '${appInsightsName}FailedRequestsDeploy' + params: { + resourceName: appInsightsName + resourceMetric: { + resourceType: 'Microsoft.Insights/components' + metric: 'requests/failed' + } + config: { + ...dynamicCountGreaterThan + nameSuffix: 'failed-requests' + windowSize: 'PT30M' + } + alertsGroupName: alerts!.alertsGroupName + tagValues: tagValues + } } output applicationInsightsKey string = applicationInsights.properties.InstrumentationKey diff --git a/infrastructure/templates/public-api/components/containerApp.bicep b/infrastructure/templates/public-api/components/containerApp.bicep index 1b9e69ef11d..76ec8a77d9e 100644 --- a/infrastructure/templates/public-api/components/containerApp.bicep +++ b/infrastructure/templates/public-api/components/containerApp.bicep @@ -111,6 +111,9 @@ param alerts { responseTime: bool cpuPercentage: bool memoryPercentage: bool + connectionTimeouts: bool + requestRetries: bool + requestTimeouts: bool alertsGroupName: string }? @@ -275,6 +278,57 @@ module memoryPercentageAlert 'alerts/dynamicMetricAlert.bicep' = if (alerts != n } } +module connectionTimeoutsAlert 'alerts/staticMetricAlert.bicep' = if (alerts != null && alerts!.connectionTimeouts) { + name: '${containerAppName}ConnectionTimeoutsAlertModule' + params: { + resourceName: containerAppName + resourceMetric: { + resourceType: 'Microsoft.App/containerApps' + metric: 'ResiliencyConnectTimeouts' + } + config: { + ...staticTotalGreaterThanZero + nameSuffix: 'connection-timeouts' + } + alertsGroupName: alerts!.alertsGroupName + tagValues: tagValues + } +} + +module requestRetriesAlert 'alerts/staticMetricAlert.bicep' = if (alerts != null && alerts!.requestRetries) { + name: '${containerAppName}RequestRetriesAlertModule' + params: { + resourceName: containerAppName + resourceMetric: { + resourceType: 'Microsoft.App/containerApps' + metric: 'ResiliencyRequestRetries' + } + config: { + ...staticTotalGreaterThanZero + nameSuffix: 'request-retries' + } + alertsGroupName: alerts!.alertsGroupName + tagValues: tagValues + } +} + +module requestTimeoutsAlert 'alerts/staticMetricAlert.bicep' = if (alerts != null && alerts!.requestTimeouts) { + name: '${containerAppName}RequestTimeoutsAlertModule' + params: { + resourceName: containerAppName + resourceMetric: { + resourceType: 'Microsoft.App/containerApps' + metric: 'ResiliencyRequestTimeouts' + } + config: { + ...staticTotalGreaterThanZero + nameSuffix: 'request-timeouts' + } + alertsGroupName: alerts!.alertsGroupName + tagValues: tagValues + } +} + output containerAppFqdn string = containerAppFqdn output containerImage string = containerImageName output containerAppName string = containerApp.name diff --git a/infrastructure/templates/public-api/components/functionApp.bicep b/infrastructure/templates/public-api/components/functionApp.bicep index 0d18a56b76a..752abde60dc 100644 --- a/infrastructure/templates/public-api/components/functionApp.bicep +++ b/infrastructure/templates/public-api/components/functionApp.bicep @@ -1,5 +1,6 @@ import { FirewallRule, IpRange, AzureFileShareMount, EntraIdAuthentication } from '../types.bicep' -import { staticAverageLessThanHundred } from 'alerts/staticAlertConfig.bicep' +import { staticAverageLessThanHundred, staticMinGreaterThanZero } from 'alerts/staticAlertConfig.bicep' +import { dynamicAverageGreaterThan } from 'alerts/dynamicAlertConfig.bicep' import { abbreviations } from '../abbreviations.bicep' @description('Specifies the location for all resources.') @@ -75,6 +76,7 @@ param storageFirewallRules IpRange[] = [] @description('Whether to create or update Azure Monitor alerts during this deploy') param alerts { functionAppHealth: bool + httpErrors: bool cpuPercentage: bool memoryPercentage: bool storageAccountAvailability: bool @@ -468,6 +470,48 @@ module healthAlert 'alerts/staticMetricAlert.bicep' = if (alerts != null && aler } } +var unexpectedHttpStatusCodeMetrics = ['Http401', 'Http5xx'] + +module unexpectedHttpStatusCodeAlerts 'alerts/staticMetricAlert.bicep' = [ + for httpStatusCode in unexpectedHttpStatusCodeMetrics: if (alerts != null && alerts!.httpErrors) { + name: '${functionAppName}${httpStatusCode}Module' + params: { + resourceName: functionAppName + resourceMetric: { + resourceType: 'Microsoft.Web/sites' + metric: httpStatusCode + } + config: { + ...staticMinGreaterThanZero + nameSuffix: toLower(httpStatusCode) + } + alertsGroupName: alerts!.alertsGroupName + tagValues: tagValues + } + } +] + +var expectedHttpStatusCodeMetrics = ['Http403', 'Http4xx'] + +module expectedHttpStatusCodeAlerts 'alerts/dynamicMetricAlert.bicep' = [ + for httpStatusCode in expectedHttpStatusCodeMetrics: if (alerts != null && alerts!.httpErrors) { + name: '${functionAppName}${httpStatusCode}Module' + params: { + resourceName: functionAppName + resourceMetric: { + resourceType: 'Microsoft.Web/sites' + metric: httpStatusCode + } + config: { + ...dynamicAverageGreaterThan + nameSuffix: toLower(httpStatusCode) + } + alertsGroupName: alerts!.alertsGroupName + tagValues: tagValues + } + } +] + output functionAppName string = functionApp.name output url string = 'https://${functionApp.name}.azurewebsites.net' output stagingUrl string = 'https://${functionApp.name}-staging.azurewebsites.net' diff --git a/infrastructure/templates/public-api/components/postgresqlDatabase.bicep b/infrastructure/templates/public-api/components/postgresqlDatabase.bicep index 4c349139ae8..8087b87d8fc 100644 --- a/infrastructure/templates/public-api/components/postgresqlDatabase.bicep +++ b/infrastructure/templates/public-api/components/postgresqlDatabase.bicep @@ -3,9 +3,10 @@ import { memoryPercentageConfig dynamicMaxGreaterThan dynamicAverageGreaterThan + dynamicTotalGreaterThan } from 'alerts/dynamicAlertConfig.bicep' -import { staticAverageLessThanHundred, capacity } from 'alerts/staticAlertConfig.bicep' +import { staticAverageLessThanHundred, capacity, staticTotalGreaterThanZero } from 'alerts/staticAlertConfig.bicep' import { IpRange, PrincipalNameAndId } from '../types.bicep' @@ -65,6 +66,8 @@ param alerts { diskIops: bool memoryPercentage: bool capacity: bool + failedConnections: bool + deadlocks: bool alertsGroupName: string }? @@ -331,6 +334,45 @@ module capacityAlerts 'alerts/staticMetricAlert.bicep' = [for capacityThreshold } }] +module failedConnectionsAlert 'alerts/dynamicMetricAlert.bicep' = if (alerts != null && alerts!.failedConnections) { + name: '${databaseServerName}FailedConnectionsDeploy' + params: { + resourceName: databaseServerName + resourceMetric: { + resourceType: 'Microsoft.DBforPostgreSQL/flexibleServers' + metric: 'connections_failed' + } + config: { + ...dynamicTotalGreaterThan + nameSuffix: 'failed-connections' + } + alertsGroupName: alerts!.alertsGroupName + tagValues: tagValues + } +} + +module deadlocksAlert 'alerts/staticMetricAlert.bicep' = if (alerts != null && alerts!.deadlocks) { + name: '${databaseServerName}DeadlocksDeploy' + params: { + resourceName: databaseServerName + resourceMetric: { + resourceType: 'Microsoft.DBforPostgreSQL/flexibleServers' + metric: 'deadlocks' + dimensions: [{ + name: 'DatabaseName' + values: databaseNames + }] + } + config: { + ...staticTotalGreaterThanZero + nameSuffix: 'deadlocks' + windowSize: 'PT5M' + } + alertsGroupName: alerts!.alertsGroupName + tagValues: tagValues + } +} + @description('The fully qualified Azure resource ID of the Database Server.') output databaseRef string = resourceId('Microsoft.DBforPostgreSQL/flexibleServers', databaseServerName) diff --git a/infrastructure/templates/public-api/main.bicep b/infrastructure/templates/public-api/main.bicep index ea9cb98dfe6..897fee104fb 100644 --- a/infrastructure/templates/public-api/main.bicep +++ b/infrastructure/templates/public-api/main.bicep @@ -243,6 +243,7 @@ module appInsightsModule 'application/public-api/publicApiAppInsights.bicep' = { params: { location: location resourceNames: resourceNames + tagValues: tagValues } }