Skip to content

Commit

Permalink
Add rollback alarms for all variants of the ECS-Fargate stack
Browse files Browse the repository at this point in the history
  • Loading branch information
clareliguori committed Jan 9, 2021
1 parent 0ab5366 commit ad43a8f
Show file tree
Hide file tree
Showing 5 changed files with 84 additions and 15 deletions.
6 changes: 2 additions & 4 deletions alarms/template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,8 @@ Resources:
- !Sub "arn:${AWS::Partition}:sns:${AWS::Region}:${AWS::AccountId}:${NotificationsTopic}"
AlarmRule: !Sub
- |-
ALARM("TriviaBackend${Stage}-Unhealthy-Hosts-Blue") OR
ALARM("TriviaBackend${Stage}-Http-500-Blue") OR
ALARM("TriviaBackend${Stage}-Unhealthy-Hosts-Green") OR
ALARM("TriviaBackend${Stage}-Http-500-Green") OR
ALARM("TriviaBackend${Stage}-Unhealthy-Hosts") OR
ALARM("TriviaBackend${Stage}-Http-500") OR
ALARM("TriviaGameChatBot${Stage}-BotLatestVersionErrors") OR
ALARM("TriviaGameChatBot${Stage}-BotAliasErrors") OR
ALARM("Synthetics-Alarm-trivia-game-${StageLowerCase}")
Expand Down
21 changes: 21 additions & 0 deletions trivia-backend/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,27 @@ cdk deploy --app ecs-service.js TriviaBackendTest
cdk deploy --app ecs-service.js TriviaBackendProd
```

Follow the instructions in the [canaries](../canaries) folder to deploy synthetic traffic canaries and their associated alarms. Lastly, configure rollback alarms on the CloudFormation stacks for the backend services.
```
AWS_ACCOUNT_ID=`aws sts get-caller-identity --query Account --output text`
aws cloudformation update-stack \
--region us-east-1 \
--stack-name TriviaBackendTest \
--use-previous-template \
--parameters ParameterKey=CertArnParameterParameter,UsePreviousValue=true \
--capabilities CAPABILITY_IAM \
--rollback-configuration "RollbackTriggers=[{Arn=arn:aws:cloudwatch:us-east-1:$AWS_ACCOUNT_ID:alarm:TriviaBackendTest-Unhealthy-Hosts,Type=AWS::CloudWatch::Alarm},{Arn=arn:aws:cloudwatch:us-east-1:$AWS_ACCOUNT_ID:alarm:TriviaBackendTest-Http-500,Type=AWS::CloudWatch::Alarm},{Arn=arn:aws:cloudwatch:us-east-1:$AWS_ACCOUNT_ID:alarm:Synthetics-Alarm-trivia-game-test,Type=AWS::CloudWatch::Alarm}]"
aws cloudformation update-stack \
--region us-east-1 \
--stack-name TriviaBackendProd \
--use-previous-template \
--parameters ParameterKey=CertArnParameterParameter,UsePreviousValue=true \
--capabilities CAPABILITY_IAM \
--rollback-configuration "RollbackTriggers=[{Arn=arn:aws:cloudwatch:us-east-1:$AWS_ACCOUNT_ID:alarm:TriviaBackendProd-Unhealthy-Hosts,Type=AWS::CloudWatch::Alarm},{Arn=arn:aws:cloudwatch:us-east-1:$AWS_ACCOUNT_ID:alarm:TriviaBackendProd-Http-500,Type=AWS::CloudWatch::Alarm},{Arn=arn:aws:cloudwatch:us-east-1:$AWS_ACCOUNT_ID:alarm:Synthetics-Alarm-trivia-game-prod,Type=AWS::CloudWatch::Alarm}]"
```

### ECS on Fargate (task set deployments)

The [cdk](infra/cdk/) folder contains the example '[ecs-task-sets](infra/cdk/ecs-service.ts)' for how to model this service with the [AWS Cloud Development Kit (AWS CDK)](https://github.com/awslabs/aws-cdk) and deploy the service with CloudFormation, using [ECS task set deployments](https://docs.aws.amazon.com/AmazonECS/latest/developerguide/deployment-type-external.html). Note that this repo does not currently include a continuous deployment pipeline for this example.
Expand Down
24 changes: 19 additions & 5 deletions trivia-backend/infra/cdk/ecs-service-blue-green.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env node
import { Alarm, Metric } from '@aws-cdk/aws-cloudwatch';
import { Alarm, AlarmRule, AlarmState, CompositeAlarm, Metric } from '@aws-cdk/aws-cloudwatch';
import { Port, SecurityGroup, SubnetType, Vpc } from '@aws-cdk/aws-ec2';
import { Repository } from '@aws-cdk/aws-ecr';
import { AwsLogDriver, CfnPrimaryTaskSet, CfnService, CfnTaskDefinition, CfnTaskSet, Cluster, ContainerImage, DeploymentControllerType, FargateTaskDefinition, LaunchType, PropagatedTagSource } from '@aws-cdk/aws-ecs';
Expand Down Expand Up @@ -228,7 +228,7 @@ class TriviaBackendStack extends cdk.Stack {
// In order to have stack updates automatically rollback based on these alarms,
// the alarms need to manually be configured as rollback triggers on the stack
// after the stack is created.
new Alarm(this, 'TargetGroupBlueUnhealthyHosts', {
const tg1UnhealthyHosts = new Alarm(this, 'TargetGroupBlueUnhealthyHosts', {
alarmName: this.stackName + '-Unhealthy-Hosts-Blue',
metric: new Metric({
namespace: 'AWS/ApplicationELB',
Expand All @@ -243,7 +243,7 @@ class TriviaBackendStack extends cdk.Stack {
evaluationPeriods: 2,
});

new Alarm(this, 'TargetGroupBlue5xx', {
const tg1ApiFailure = new Alarm(this, 'TargetGroupBlue5xx', {
alarmName: this.stackName + '-Http-500-Blue',
metric: new Metric({
namespace: 'AWS/ApplicationELB',
Expand All @@ -259,7 +259,7 @@ class TriviaBackendStack extends cdk.Stack {
period: cdk.Duration.minutes(1)
});

new Alarm(this, 'TargetGroupGreenUnhealthyHosts', {
const tg2UnhealthyHosts = new Alarm(this, 'TargetGroupGreenUnhealthyHosts', {
alarmName: this.stackName + '-Unhealthy-Hosts-Green',
metric: new Metric({
namespace: 'AWS/ApplicationELB',
Expand All @@ -274,7 +274,7 @@ class TriviaBackendStack extends cdk.Stack {
evaluationPeriods: 2,
});

new Alarm(this, 'TargetGroupGreen5xx', {
const tg2ApiFailure = new Alarm(this, 'TargetGroupGreen5xx', {
alarmName: this.stackName + '-Http-500-Green',
metric: new Metric({
namespace: 'AWS/ApplicationELB',
Expand All @@ -289,6 +289,20 @@ class TriviaBackendStack extends cdk.Stack {
evaluationPeriods: 1,
period: cdk.Duration.minutes(1)
});

new CompositeAlarm(this, 'CompositeUnhealthyHosts', {
compositeAlarmName: this.stackName + '-Unhealthy-Hosts',
alarmRule: AlarmRule.anyOf(
AlarmRule.fromAlarm(tg1UnhealthyHosts, AlarmState.ALARM),
AlarmRule.fromAlarm(tg2UnhealthyHosts, AlarmState.ALARM))
});

new CompositeAlarm(this, 'Composite5xx', {
compositeAlarmName: this.stackName + '-Http-500',
alarmRule: AlarmRule.anyOf(
AlarmRule.fromAlarm(tg1ApiFailure, AlarmState.ALARM),
AlarmRule.fromAlarm(tg2ApiFailure, AlarmState.ALARM))
});
}
}

Expand Down
20 changes: 19 additions & 1 deletion trivia-backend/infra/cdk/ecs-service.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
#!/usr/bin/env node
import { Certificate } from '@aws-cdk/aws-certificatemanager';
import { Alarm } from '@aws-cdk/aws-cloudwatch';
import { Vpc } from '@aws-cdk/aws-ec2';
import { Repository } from '@aws-cdk/aws-ecr';
import { Cluster, ContainerImage, PropagatedTagSource } from '@aws-cdk/aws-ecs';
import { ApplicationLoadBalancedFargateService } from '@aws-cdk/aws-ecs-patterns';
import { HttpCodeTarget } from '@aws-cdk/aws-elasticloadbalancingv2';
import { HostedZone } from '@aws-cdk/aws-route53';
import { StringParameter } from '@aws-cdk/aws-ssm';
import cdk = require('@aws-cdk/core');
Expand Down Expand Up @@ -38,7 +40,7 @@ class TriviaBackendStack extends cdk.Stack {
const certificate = Certificate.fromCertificateArn(this, 'Cert', certificateArn);

// Fargate service + load balancer
new ApplicationLoadBalancedFargateService(this, 'Service', {
const service = new ApplicationLoadBalancedFargateService(this, 'Service', {
cluster,
taskImageOptions: { image },
desiredCount: 3,
Expand All @@ -47,6 +49,22 @@ class TriviaBackendStack extends cdk.Stack {
certificate,
propagateTags: PropagatedTagSource.SERVICE,
});

// Alarms: monitor 500s and unhealthy hosts on target groups
new Alarm(this, 'TargetGroupUnhealthyHosts', {
alarmName: this.stackName + '-Unhealthy-Hosts',
metric: service.targetGroup.metricUnhealthyHostCount(),
threshold: 1,
evaluationPeriods: 2,
});

new Alarm(this, 'TargetGroup5xx', {
alarmName: this.stackName + '-Http-500',
metric: service.targetGroup.metricHttpCodeTarget(HttpCodeTarget.TARGET_5XX_COUNT),
threshold: 1,
evaluationPeriods: 1,
period: cdk.Duration.minutes(1)
});
}
}

Expand Down
28 changes: 23 additions & 5 deletions trivia-backend/infra/codedeploy-blue-green/infra-setup.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env node
import { Alarm } from '@aws-cdk/aws-cloudwatch';
import { Alarm, AlarmRule, AlarmState, CompositeAlarm } from '@aws-cdk/aws-cloudwatch';
import { Port, SecurityGroup, Vpc } from '@aws-cdk/aws-ec2';
import { ApplicationLoadBalancer, ApplicationProtocol, ApplicationTargetGroup, HttpCodeTarget, IApplicationLoadBalancerTarget, LoadBalancerTargetProps, TargetType } from '@aws-cdk/aws-elasticloadbalancingv2';
import { RecordTarget, ARecord, HostedZone } from '@aws-cdk/aws-route53';
Expand Down Expand Up @@ -96,32 +96,50 @@ class TriviaBackendStack extends cdk.Stack {
});

// Alarms: monitor 500s and unhealthy hosts on target groups
new Alarm(this, 'TargetGroupUnhealthyHosts', {
const tg1UnhealthyHosts = new Alarm(this, 'TargetGroupUnhealthyHosts', {
alarmName: this.stackName + '-Unhealthy-Hosts-Blue',
metric: tg1.metricUnhealthyHostCount(),
threshold: 1,
evaluationPeriods: 2,
});

new Alarm(this, 'TargetGroup5xx', {
const tg1ApiFailure = new Alarm(this, 'TargetGroup5xx', {
alarmName: this.stackName + '-Http-500-Blue',
metric: tg1.metricHttpCodeTarget(HttpCodeTarget.TARGET_5XX_COUNT),
threshold: 1,
evaluationPeriods: 1,
period: cdk.Duration.minutes(1)
});

new Alarm(this, 'TargetGroup2UnhealthyHosts', {
const tg2UnhealthyHosts = new Alarm(this, 'TargetGroup2UnhealthyHosts', {
alarmName: this.stackName + '-Unhealthy-Hosts-Green',
metric: tg2.metricUnhealthyHostCount(),
threshold: 1,
evaluationPeriods: 2,
});

new Alarm(this, 'TargetGroup25xx', {
const tg2ApiFailure = new Alarm(this, 'TargetGroup25xx', {
alarmName: this.stackName + '-Http-500-Green',
metric: tg2.metricHttpCodeTarget(HttpCodeTarget.TARGET_5XX_COUNT),
threshold: 1,
evaluationPeriods: 1,
period: cdk.Duration.minutes(1)
});

new CompositeAlarm(this, 'CompositeUnhealthyHosts', {
compositeAlarmName: this.stackName + '-Unhealthy-Hosts',
alarmRule: AlarmRule.anyOf(
AlarmRule.fromAlarm(tg1UnhealthyHosts, AlarmState.ALARM),
AlarmRule.fromAlarm(tg2UnhealthyHosts, AlarmState.ALARM))
});

new CompositeAlarm(this, 'Composite5xx', {
compositeAlarmName: this.stackName + '-Http-500',
alarmRule: AlarmRule.anyOf(
AlarmRule.fromAlarm(tg1ApiFailure, AlarmState.ALARM),
AlarmRule.fromAlarm(tg2ApiFailure, AlarmState.ALARM))
});

// Roles
new Role(this, 'ServiceTaskDefExecutionRole', {
assumedBy: new ServicePrincipal('ecs-tasks.amazonaws.com'),
Expand Down

0 comments on commit ad43a8f

Please sign in to comment.