Skip to content

Commit

Permalink
EES-5689 - added various failure alerts
Browse files Browse the repository at this point in the history
  • Loading branch information
duncan-at-hiveit committed Jan 15, 2025
1 parent 101e0df commit 4ace52e
Show file tree
Hide file tree
Showing 13 changed files with 331 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,9 @@ module apiContainerAppModule '../../components/containerApp.bicep' = {
responseTime: true
cpuPercentage: true
memoryPercentage: true
connectionTimeouts: true
requestRetries: true
requestTimeouts: true
alertsGroupName: resourceNames.existingResources.alertsGroup
} : null
tagValues: tagValues
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,21 @@ param resourceNames ResourceNames
@description('Specifies the location for all resources.')
param location string

@description('Specifies a set of tags with which to tag the resource in Azure.')
param tagValues object

// Deploys the Public API Application Insights component through the shared
// appInsights component template, enabling all three of its alert toggles.
// NOTE(review): other modules in this commit pass `deployAlerts ? {...} : null`
// for `alerts`; here the alerts object is passed unconditionally - confirm
// that is intended.
module applicationInsightsModule '../../components/appInsights.bicep' = {
  name: 'appInsightsDeploy'
  params: {
    appInsightsName: resourceNames.publicApi.appInsights
    location: location
    tagValues: tagValues
    alerts: {
      alertsGroupName: resourceNames.existingResources.alertsGroup
      exceptionCount: true
      exceptionServerCount: true
      failedRequests: true
    }
  }
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ module dataProcessorFunctionAppModule '../../components/functionApp.bicep' = {
fileServiceAvailability: true
fileServiceLatency: false
fileServiceCapacity: true
httpErrors: true
alertsGroupName: resourceNames.existingResources.alertsGroup
} : null
tagValues: tagValues
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ module appGatewayModule '../../components/appGateway.bicep' = {
alerts: deployAlerts ? {
health: true
responseTime: true
failedRequests: true
responseStatuses: true
alertsGroupName: resourceNames.existingResources.alertsGroup
} : null
tagValues: tagValues
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ module postgreSqlServerModule '../../components/postgresqlDatabase.bicep' = {
diskIops: true
memoryPercentage: true
capacity: true
failedConnections: true
deadlocks: true
alertsGroupName: resourceNames.existingResources.alertsGroup
} : null
tagValues: tagValues
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,20 @@ var dynamicMaxGreaterThan = {
operator: 'GreaterThan'
}

// Exported alert config: starts from defaultDynamicAlertConfig and sets the
// aggregation to 'Total' with a 'GreaterThan' operator.
@export()
var dynamicTotalGreaterThan = {
  // The spread stays first so the explicit properties below follow the base config.
  ...defaultDynamicAlertConfig
  operator: 'GreaterThan'
  aggregation: 'Total'
}

// Exported alert config: starts from defaultDynamicAlertConfig and sets the
// aggregation to 'Count' with a 'GreaterThan' operator.
@export()
var dynamicCountGreaterThan = {
  // The spread stays first so the explicit properties below follow the base config.
  ...defaultDynamicAlertConfig
  operator: 'GreaterThan'
  aggregation: 'Count'
}

@export()
var cpuPercentageConfig = {
...dynamicAverageGreaterThan
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,24 @@ type AppGatewayMetric = {
resourceType: 'Microsoft.Network/applicationGateways'
metric:
| 'ApplicationGatewayTotalTime'
| 'FailedRequests'
| 'UnhealthyHostCount'
| 'ResponseStatus'
dimensions: {
name:
| 'BackendSettingsPool'
| 'HttpStatusGroup'
operator: DimensionOperator?
values: string[]
}[]?
}

// Metric selector for Application Insights components: `resourceType` is fixed
// to 'Microsoft.Insights/components' and `metric` is limited to the three
// metric names listed below.
type AppInsightsMetric = {
  resourceType: 'Microsoft.Insights/components'
  metric: 'exceptions/count' | 'exceptions/server' | 'requests/failed'
}

type AppServicePlanMetric = {
Expand All @@ -19,29 +36,42 @@ type ContainerAppMetric = {
metric:
| 'CpuPercentage'
| 'MemoryPercentage'
| 'ResiliencyConnectTimeouts'
| 'ResiliencyRequestRetries'
| 'ResiliencyRequestTimeouts'
| 'ResponseTime'
| 'RestartCount'
}

// Metric selector for a storage account file service.
// - resourceType: fixed to 'Microsoft.Storage/storageAccounts/fileServices'.
// - metric: one of the file service metric names listed below.
// - dimensions: optional list of dimension filters; each entry names the
//   dimension ('FileShare' or 'Tier'), an optional operator and the values
//   to match.
// `dimensions` is declared exactly once: a Bicep object type cannot declare
// the same property twice.
type FileServiceMetric = {
  resourceType: 'Microsoft.Storage/storageAccounts/fileServices'
  metric:
    | 'availability'
    | 'FileCapacity'
    | 'SuccessE2ELatency'
  dimensions: {
    name:
      | 'FileShare'
      | 'Tier'
    operator: DimensionOperator?
    values: string[]
  }[]?
}

type PostgreSqlMetric = {
resourceType: 'Microsoft.DBforPostgreSQL/flexibleServers'
dimensions: {
name:
| 'DatabaseName'
operator: DimensionOperator?
values: string[]
}[]?
metric:
| 'backup_storage_used'
| 'client_connections_waiting'
| 'connections_failed'
| 'cpu_percent'
| 'deadlocks'
| 'disk_bandwidth_consumed_percentage'
| 'disk_iops_consumed_percentage'
| 'is_db_alive'
Expand All @@ -55,6 +85,10 @@ type SiteMetric = {
resourceType: 'Microsoft.Web/sites'
metric:
| 'HealthCheckStatus'
| 'Http401'
| 'Http403'
| 'Http4xx'
| 'Http5xx'
}

type StorageAccountMetric = {
Expand All @@ -68,8 +102,9 @@ type StorageAccountMetric = {
@export()
@discriminator('resourceType')
type ResourceMetric =
| AppServicePlanMetric
| AppGatewayMetric
| AppInsightsMetric
| AppServicePlanMetric
| ContainerAppMetric
| ContainerAppMetric
| FileServiceMetric
Expand Down
49 changes: 47 additions & 2 deletions infrastructure/templates/public-api/components/appGateway.bicep
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { responseTimeConfig } from 'alerts/dynamicAlertConfig.bicep'
import { responseTimeConfig, dynamicTotalGreaterThan } from 'alerts/dynamicAlertConfig.bicep'
import { staticAverageGreaterThanZero } from 'alerts/staticAlertConfig.bicep'

import {
Expand Down Expand Up @@ -46,6 +46,8 @@ param availabilityZones ('1' | '2' | '3') [] = [
param alerts {
// Alert toggles for the Application Gateway. The whole object is optional
// (nullable); the alert modules in this file check `alerts != null` first.
// presumably gates the backend pools health alert - TODO confirm against that module's condition
health: bool
// Gates the responseTimeAlert module.
responseTime: bool
// Gates the failedRequestsAlert module.
failedRequests: bool
// Gates the responseStatusAlert module.
responseStatuses: bool
// Passed to each alert module as its `alertsGroupName` parameter.
alertsGroupName: string
}?

Expand Down Expand Up @@ -304,7 +306,6 @@ module backendPoolsHealthAlert 'alerts/staticMetricAlert.bicep' = if (alerts !=
}
}


module responseTimeAlert 'alerts/dynamicMetricAlert.bicep' = if (alerts != null && alerts!.responseTime) {
name: '${appGatewayName}ResponseTimeDeploy'
params: {
Expand All @@ -318,3 +319,47 @@ module responseTimeAlert 'alerts/dynamicMetricAlert.bicep' = if (alerts != null
tagValues: tagValues
}
}

// Alert on the Application Gateway 'FailedRequests' metric.
// Deployed only when `alerts` is supplied and its `failedRequests` flag is true.
module failedRequestsAlert 'alerts/dynamicMetricAlert.bicep' = if (alerts != null && alerts!.failedRequests) {
  name: '${appGatewayName}FailedRequestsDeploy'
  params: {
    alertsGroupName: alerts!.alertsGroupName
    tagValues: tagValues
    resourceName: appGatewayName
    resourceMetric: {
      metric: 'FailedRequests'
      resourceType: 'Microsoft.Network/applicationGateways'
      // Filter the metric by the names of the configured backends.
      // NOTE(review): confirm the backend names match the values Azure emits
      // for the 'BackendSettingsPool' dimension.
      dimensions: [
        {
          name: 'BackendSettingsPool'
          values: map(backends, b => b.name)
        }
      ]
    }
    config: {
      // The spread stays first so the explicit properties below follow the base config.
      ...dynamicTotalGreaterThan
      windowSize: 'PT30M' // ISO 8601 duration: 30 minutes
      nameSuffix: 'failed-requests'
    }
  }
}

// Alert on the Application Gateway 'ResponseStatus' metric, restricted to the
// '4xx' and '5xx' HTTP status groups.
// Deployed only when `alerts` is supplied and its `responseStatuses` flag is true.
module responseStatusAlert 'alerts/dynamicMetricAlert.bicep' = if (alerts != null && alerts!.responseStatuses) {
  name: '${appGatewayName}ResponseStatusDeploy'
  params: {
    alertsGroupName: alerts!.alertsGroupName
    tagValues: tagValues
    resourceName: appGatewayName
    resourceMetric: {
      metric: 'ResponseStatus'
      resourceType: 'Microsoft.Network/applicationGateways'
      dimensions: [
        {
          name: 'HttpStatusGroup'
          values: [
            '4xx'
            '5xx'
          ]
        }
      ]
    }
    config: {
      // The spread stays first so the explicit properties below follow the base config.
      ...dynamicTotalGreaterThan
      windowSize: 'PT30M' // ISO 8601 duration: 30 minutes
      nameSuffix: 'http-4xx-5xx'
    }
  }
}
68 changes: 68 additions & 0 deletions infrastructure/templates/public-api/components/appInsights.bicep
Original file line number Diff line number Diff line change
@@ -1,9 +1,22 @@
import { dynamicCountGreaterThan } from 'alerts/dynamicAlertConfig.bicep'

// Region passed to this template by the caller.
@description('Specifies the location for all resources.')
param location string

// Name used for the Application Insights component; also used as the
// `resourceName` and deployment-name prefix of the alert modules below.
@description('Specifies the Application Insights name')
param appInsightsName string

// Optional alert toggles. When null, none of the alert modules in this file
// are deployed (each module condition checks `alerts != null` first).
@description('Whether to create or update Azure Monitor alerts during this deploy')
param alerts {
// Gates the exceptionCountAlert module.
exceptionCount: bool
// Gates the exceptionServerCountAlert module.
exceptionServerCount: bool
// Gates the failedRequestsAlert module.
failedRequests: bool
// Passed to each alert module as its `alertsGroupName` parameter.
alertsGroupName: string
}?

// Tags passed to the alert modules and set on the Application Insights resource.
@description('Tags for the resources')
param tagValues object

// presumably the Application Insights `kind` used by the resource below - TODO confirm
var kind = 'web'

resource applicationInsights 'Microsoft.Insights/components@2020-02-02' = {
Expand All @@ -15,6 +28,61 @@ resource applicationInsights 'Microsoft.Insights/components@2020-02-02' = {
publicNetworkAccessForIngestion: 'Enabled'
publicNetworkAccessForQuery: 'Enabled'
}
tags: tagValues
}

// Alert on the Application Insights 'exceptions/count' metric.
// Deployed only when `alerts` is supplied and its `exceptionCount` flag is true.
module exceptionCountAlert 'alerts/dynamicMetricAlert.bicep' = if (alerts != null && alerts!.exceptionCount) {
  name: '${appInsightsName}ExceptionCountDeploy'
  params: {
    alertsGroupName: alerts!.alertsGroupName
    tagValues: tagValues
    resourceName: appInsightsName
    resourceMetric: {
      metric: 'exceptions/count'
      resourceType: 'Microsoft.Insights/components'
    }
    config: {
      // The spread stays first so the explicit properties below follow the base config.
      ...dynamicCountGreaterThan
      windowSize: 'PT30M' // ISO 8601 duration: 30 minutes
      nameSuffix: 'exception-count'
    }
  }
}

// Alert on the Application Insights 'exceptions/server' metric.
// Deployed only when `alerts` is supplied and its `exceptionServerCount` flag is true.
module exceptionServerCountAlert 'alerts/dynamicMetricAlert.bicep' = if (alerts != null && alerts!.exceptionServerCount) {
  name: '${appInsightsName}ExceptionServerCountDeploy'
  params: {
    alertsGroupName: alerts!.alertsGroupName
    tagValues: tagValues
    resourceName: appInsightsName
    resourceMetric: {
      metric: 'exceptions/server'
      resourceType: 'Microsoft.Insights/components'
    }
    config: {
      // The spread stays first so the explicit properties below follow the base config.
      ...dynamicCountGreaterThan
      windowSize: 'PT30M' // ISO 8601 duration: 30 minutes
      nameSuffix: 'server-exception-count'
    }
  }
}

// Alert on the Application Insights 'requests/failed' metric.
// Deployed only when `alerts` is supplied and its `failedRequests` flag is true.
module failedRequestsAlert 'alerts/dynamicMetricAlert.bicep' = if (alerts != null && alerts!.failedRequests) {
  name: '${appInsightsName}FailedRequestsDeploy'
  params: {
    alertsGroupName: alerts!.alertsGroupName
    tagValues: tagValues
    resourceName: appInsightsName
    resourceMetric: {
      metric: 'requests/failed'
      resourceType: 'Microsoft.Insights/components'
    }
    config: {
      // The spread stays first so the explicit properties below follow the base config.
      ...dynamicCountGreaterThan
      windowSize: 'PT30M' // ISO 8601 duration: 30 minutes
      nameSuffix: 'failed-requests'
    }
  }
}

// Exposes the InstrumentationKey property of the Application Insights resource to callers of this template.
output applicationInsightsKey string = applicationInsights.properties.InstrumentationKey
Expand Down
54 changes: 54 additions & 0 deletions infrastructure/templates/public-api/components/containerApp.bicep
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,9 @@ param alerts {
responseTime: bool
cpuPercentage: bool
memoryPercentage: bool
connectionTimeouts: bool
requestRetries: bool
requestTimeouts: bool
alertsGroupName: string
}?

Expand Down Expand Up @@ -275,6 +278,57 @@ module memoryPercentageAlert 'alerts/dynamicMetricAlert.bicep' = if (alerts != n
}
}

// Static-threshold alert on the Container App 'ResiliencyConnectTimeouts' metric.
// Deployed only when `alerts` is supplied and its `connectionTimeouts` flag is true.
module connectionTimeoutsAlert 'alerts/staticMetricAlert.bicep' = if (alerts != null && alerts!.connectionTimeouts) {
  name: '${containerAppName}ConnectionTimeoutsAlertModule'
  params: {
    alertsGroupName: alerts!.alertsGroupName
    tagValues: tagValues
    resourceName: containerAppName
    resourceMetric: {
      metric: 'ResiliencyConnectTimeouts'
      resourceType: 'Microsoft.App/containerApps'
    }
    config: {
      // The spread stays first so the explicit suffix below follows the base config.
      ...staticTotalGreaterThanZero
      nameSuffix: 'connection-timeouts'
    }
  }
}

// Static-threshold alert on the Container App 'ResiliencyRequestRetries' metric.
// Deployed only when `alerts` is supplied and its `requestRetries` flag is true.
module requestRetriesAlert 'alerts/staticMetricAlert.bicep' = if (alerts != null && alerts!.requestRetries) {
  name: '${containerAppName}RequestRetriesAlertModule'
  params: {
    alertsGroupName: alerts!.alertsGroupName
    tagValues: tagValues
    resourceName: containerAppName
    resourceMetric: {
      metric: 'ResiliencyRequestRetries'
      resourceType: 'Microsoft.App/containerApps'
    }
    config: {
      // The spread stays first so the explicit suffix below follows the base config.
      ...staticTotalGreaterThanZero
      nameSuffix: 'request-retries'
    }
  }
}

// Static-threshold alert on the Container App 'ResiliencyRequestTimeouts' metric.
// Deployed only when `alerts` is supplied and its `requestTimeouts` flag is true.
module requestTimeoutsAlert 'alerts/staticMetricAlert.bicep' = if (alerts != null && alerts!.requestTimeouts) {
  name: '${containerAppName}RequestTimeoutsAlertModule'
  params: {
    alertsGroupName: alerts!.alertsGroupName
    tagValues: tagValues
    resourceName: containerAppName
    resourceMetric: {
      metric: 'ResiliencyRequestTimeouts'
      resourceType: 'Microsoft.App/containerApps'
    }
    config: {
      // The spread stays first so the explicit suffix below follows the base config.
      ...staticTotalGreaterThanZero
      nameSuffix: 'request-timeouts'
    }
  }
}

// Values exposed to callers of this template.
// `containerAppFqdn` and `containerImageName` are defined earlier in the file (outside this view).
output containerAppFqdn string = containerAppFqdn
output containerImage string = containerImageName
// Name of the deployed containerApp resource.
output containerAppName string = containerApp.name
Loading

0 comments on commit 4ace52e

Please sign in to comment.