From 4ac66299694e88e565f055bceb5d3f104fd12c15 Mon Sep 17 00:00:00 2001 From: stefanprodan Date: Sat, 13 Apr 2019 15:33:03 +0300 Subject: [PATCH 1/6] Exclude docs branches from CI --- .circleci/config.yml | 2 +- .travis.yml | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 6c8124518..4a44d24e4 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -17,6 +17,6 @@ workflows: filters: branches: ignore: - - gh-pages + - /gh-pages.*/ - /docs-.*/ - /release-.*/ diff --git a/.travis.yml b/.travis.yml index b9dc35a3c..c1535e23a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,11 @@ sudo: required language: go +branches: + except: + - /gh-pages.*/ + - /docs-.*/ + go: - 1.12.x @@ -12,13 +17,7 @@ addons: packages: - docker-ce -#before_script: -# - go get -u sigs.k8s.io/kind -# - curl https://raw.githubusercontent.com/kubernetes/helm/master/scripts/get | bash -# - curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl && chmod +x kubectl && sudo mv kubectl /usr/local/bin/ - script: - - set -e - make test-fmt - make test-codegen - go test -race -coverprofile=coverage.txt -covermode=atomic $(go list ./pkg/...) 
From e0fc5ecb39f0304a2fc8114c4421dcc024307297 Mon Sep 17 00:00:00 2001 From: stefanprodan Date: Sat, 13 Apr 2019 15:37:41 +0300 Subject: [PATCH 2/6] Add hook type to CRD - pre-rollout execute webhook before routing traffic to canary - rollout execute webhook during the canary analysis on each iteration - post-rollout execute webhook after the canary has been promoted or rolled back Add canary phase to webhook payload --- pkg/apis/flagger/v1alpha3/types.go | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/pkg/apis/flagger/v1alpha3/types.go b/pkg/apis/flagger/v1alpha3/types.go index 92bc55164..4b0b9d129 100755 --- a/pkg/apis/flagger/v1alpha3/types.go +++ b/pkg/apis/flagger/v1alpha3/types.go @@ -148,11 +148,24 @@ type CanaryMetric struct { Query string `json:"query,omitempty"` } +// HookType can be pre, post or during rollout +type HookType string + +const ( + // RolloutHook execute webhook during the canary analysis + RolloutHook HookType = "rollout" + // PreRolloutHook execute webhook before routing traffic to canary + PreRolloutHook HookType = "pre-rollout" + // PostRolloutHook execute webhook after the canary analysis + PostRolloutHook HookType = "post-rollout" +) + // CanaryWebhook holds the reference to external checks used for canary analysis type CanaryWebhook struct { - Name string `json:"name"` - URL string `json:"url"` - Timeout string `json:"timeout"` + Type HookType `json:"type"` + Name string `json:"name"` + URL string `json:"url"` + Timeout string `json:"timeout"` // +optional Metadata *map[string]string `json:"metadata,omitempty"` } @@ -161,6 +174,7 @@ type CanaryWebhook struct { type CanaryWebhookPayload struct { Name string `json:"name"` Namespace string `json:"namespace"` + Phase CanaryPhase `json:"phase"` Metadata map[string]string `json:"metadata,omitempty"` } From edcff9cd1552dc161b680f63c17ac80dfb78e280 Mon Sep 17 00:00:00 2001 From: stefanprodan Date: Sat, 13 Apr 2019 15:43:23 +0300 Subject: [PATCH 3/6] 
Execute pre/post rollout webhooks - halt the canary advancement if pre-rollout hooks are failing - include the canary status (Succeeded/Failed) in the post-rollout webhook payload - ignore post-rollout webhook failures - log pre/post rollout webhook response result --- pkg/controller/scheduler.go | 55 ++++++++++++++++++++++++++++++---- pkg/controller/webhook.go | 3 +- pkg/controller/webhook_test.go | 4 +-- 3 files changed, 54 insertions(+), 8 deletions(-) diff --git a/pkg/controller/scheduler.go b/pkg/controller/scheduler.go index 02c7206b4..9af0b3544 100644 --- a/pkg/controller/scheduler.go +++ b/pkg/controller/scheduler.go @@ -240,6 +240,7 @@ func (c *Controller) advanceCanary(name string, namespace string, skipLivenessCh } c.recorder.SetStatus(cd, flaggerv1.CanaryFailed) + c.runPostRolloutHooks(cd, flaggerv1.CanaryFailed) return } @@ -247,6 +248,15 @@ func (c *Controller) advanceCanary(name string, namespace string, skipLivenessCh // skip check if no traffic is routed to canary if canaryWeight == 0 { c.recordEventInfof(cd, "Starting canary analysis for %s.%s", cd.Spec.TargetRef.Name, cd.Namespace) + + // run pre-rollout web hooks + if ok := c.runPreRolloutHooks(cd); !ok { + if err := c.deployer.SetStatusFailedChecks(cd, cd.Status.FailedChecks+1); err != nil { + c.recordEventWarningf(cd, "%v", err) + return + } + return + } } else { if ok := c.analyseCanary(cd); !ok { if err := c.deployer.SetStatusFailedChecks(cd, cd.Status.FailedChecks+1); err != nil { @@ -314,6 +324,7 @@ func (c *Controller) advanceCanary(name string, namespace string, skipLivenessCh return } c.recorder.SetStatus(cd, flaggerv1.CanarySucceeded) + c.runPostRolloutHooks(cd, flaggerv1.CanarySucceeded) c.sendNotification(cd, "Canary analysis completed successfully, promotion finished.", false, false) return @@ -380,6 +391,7 @@ func (c *Controller) advanceCanary(name string, namespace string, skipLivenessCh return } c.recorder.SetStatus(cd, flaggerv1.CanarySucceeded) + c.runPostRolloutHooks(cd, 
flaggerv1.CanarySucceeded) c.sendNotification(cd, "Canary analysis completed successfully, promotion finished.", false, false) } @@ -477,14 +489,47 @@ func (c *Controller) hasCanaryRevisionChanged(cd *flaggerv1.Canary) bool { return false } +func (c *Controller) runPreRolloutHooks(canary *flaggerv1.Canary) bool { + for _, webhook := range canary.Spec.CanaryAnalysis.Webhooks { + if webhook.Type == flaggerv1.PreRolloutHook { + err := CallWebhook(canary.Name, canary.Namespace, flaggerv1.CanaryProgressing, webhook) + if err != nil { + c.recordEventWarningf(canary, "Halt %s.%s advancement pre-rollout check %s failed %v", + canary.Name, canary.Namespace, webhook.Name, err) + return false + } else { + c.recordEventInfof(canary, "Pre-rollout check %s passed", webhook.Name) + } + } + } + return true +} + +func (c *Controller) runPostRolloutHooks(canary *flaggerv1.Canary, phase flaggerv1.CanaryPhase) bool { + for _, webhook := range canary.Spec.CanaryAnalysis.Webhooks { + if webhook.Type == flaggerv1.PostRolloutHook { + err := CallWebhook(canary.Name, canary.Namespace, phase, webhook) + if err != nil { + c.recordEventWarningf(canary, "Post-rollout hook %s failed %v", webhook.Name, err) + return false + } else { + c.recordEventInfof(canary, "Post-rollout check %s passed", webhook.Name) + } + } + } + return true +} + func (c *Controller) analyseCanary(r *flaggerv1.Canary) bool { // run external checks for _, webhook := range r.Spec.CanaryAnalysis.Webhooks { - err := CallWebhook(r.Name, r.Namespace, webhook) - if err != nil { - c.recordEventWarningf(r, "Halt %s.%s advancement external check %s failed %v", - r.Name, r.Namespace, webhook.Name, err) - return false + if webhook.Type == "" || webhook.Type == flaggerv1.RolloutHook { + err := CallWebhook(r.Name, r.Namespace, flaggerv1.CanaryProgressing, webhook) + if err != nil { + c.recordEventWarningf(r, "Halt %s.%s advancement external check %s failed %v", + r.Name, r.Namespace, webhook.Name, err) + return false + } } } diff --git 
a/pkg/controller/webhook.go b/pkg/controller/webhook.go index 4bb1d7f0b..550921fca 100644 --- a/pkg/controller/webhook.go +++ b/pkg/controller/webhook.go @@ -15,10 +15,11 @@ import ( // CallWebhook does a HTTP POST to an external service and // returns an error if the response status code is non-2xx -func CallWebhook(name string, namespace string, w flaggerv1.CanaryWebhook) error { +func CallWebhook(name string, namespace string, phase flaggerv1.CanaryPhase, w flaggerv1.CanaryWebhook) error { payload := flaggerv1.CanaryWebhookPayload{ Name: name, Namespace: namespace, + Phase: phase, } if w.Metadata != nil { diff --git a/pkg/controller/webhook_test.go b/pkg/controller/webhook_test.go index d9ddb9134..528dc5164 100644 --- a/pkg/controller/webhook_test.go +++ b/pkg/controller/webhook_test.go @@ -19,7 +19,7 @@ func TestCallWebhook(t *testing.T) { Metadata: &map[string]string{"key1": "val1"}, } - err := CallWebhook("podinfo", "default", hook) + err := CallWebhook("podinfo", "default", flaggerv1.CanaryProgressing, hook) if err != nil { t.Fatal(err.Error()) } @@ -35,7 +35,7 @@ func TestCallWebhook_StatusCode(t *testing.T) { URL: ts.URL, } - err := CallWebhook("podinfo", "default", hook) + err := CallWebhook("podinfo", "default", flaggerv1.CanaryProgressing, hook) if err == nil { t.Errorf("Got no error wanted %v", http.StatusInternalServerError) } From 19e625d38ed0f6a736b38d56da4d73865e80a5b0 Mon Sep 17 00:00:00 2001 From: stefanprodan Date: Sat, 13 Apr 2019 20:30:19 +0300 Subject: [PATCH 4/6] Add pre/post rollout webhooks to docs --- docs/gitbook/how-it-works.md | 54 ++++++++++++++++++++++++++---------- 1 file changed, 39 insertions(+), 15 deletions(-) diff --git a/docs/gitbook/how-it-works.md b/docs/gitbook/how-it-works.md index 7940d0d50..598e299aa 100644 --- a/docs/gitbook/how-it-works.md +++ b/docs/gitbook/how-it-works.md @@ -2,7 +2,7 @@ [Flagger](https://github.com/weaveworks/flagger) takes a Kubernetes deployment and optionally a horizontal pod autoscaler \(HPA\) 
and creates a series of objects -\(Kubernetes deployments, ClusterIP services and Istio virtual services\) to drive the canary analysis and promotion. +\(Kubernetes deployments, ClusterIP services and Istio or App Mesh virtual services\) to drive the canary analysis and promotion. ![Flagger Canary Process](https://raw.githubusercontent.com/weaveworks/flagger/master/docs/diagrams/flagger-canary-hpa.png) @@ -268,16 +268,22 @@ Gated canary promotion stages: * check primary and canary deployments status * halt advancement if a rolling update is underway * halt advancement if pods are unhealthy +* call pre-rollout webhooks and check results + * halt advancement if any hook returned a non HTTP 2xx result + * increment the failed checks counter * increase canary traffic weight percentage from 0% to 5% (step weight) -* call webhooks and check results +* call rollout webhooks and check results * check canary HTTP request success rate and latency * halt advancement if any metric is under the specified threshold * increment the failed checks counter * check if the number of failed checks reached the threshold * route all traffic to primary * scale to zero the canary deployment and mark it as failed + * call post-rollout webhooks + * post the analysis result to Slack * wait for the canary deployment to be updated and start over * increase canary traffic weight by 5% (step weight) till it reaches 50% (max weight) + * halt advancement if any webhook call fails * halt advancement while canary request success rate is under the threshold * halt advancement while canary request duration P99 is over the threshold * halt advancement if the primary or canary deployment becomes unhealthy @@ -290,6 +296,8 @@ Gated canary promotion stages: * route all traffic to primary * scale to zero the canary deployment * mark rollout as finished +* call post-rollout webhooks +* post the analysis result to Slack * wait for the canary deployment to be updated and start over ### Canary Analysis @@ 
-524,39 +532,55 @@ rate reaches the 5% threshold, then the canary fails. When specifying a query, Flagger will run the promql query and convert the result to float64. Then it compares the query result value with the metric threshold value. - ### Webhooks -The canary analysis can be extended with webhooks. -Flagger will call each webhook URL and determine from the response status code (HTTP 2xx) if the canary is failing or not. +The canary analysis can be extended with webhooks. Flagger will call each webhook URL and +determine from the response status code (HTTP 2xx) if the canary is failing or not. + +There are three types of hooks: +* Pre-rollout hooks are executed before routing traffic to canary. +The canary advancement is paused if a pre-rollout hook fails and if the number of failures reaches the +threshold the canary will be rolled back. +* Rollout hooks are executed during the analysis on each iteration before the metric checks. +If a rollout hook call fails the canary advancement is paused and eventually rolled back. +* Post-rollout hooks are executed after the canary has been promoted or rolled back. +If a post rollout hook fails the error is logged. Spec: ```yaml canaryAnalysis: webhooks: - - name: integration-test - url: http://int-runner.test:8080/ - timeout: 30s - metadata: - test: "all" - token: "16688eb5e9f289f1991c" - - name: db-test + - name: "smoke test" + type: pre-rollout url: http://migration-check.db/query timeout: 30s metadata: key1: "val1" key2: "val2" + - name: "load test" + type: rollout + url: http://flagger-loadtester.test/ + timeout: 15s + metadata: + cmd: "hey -z 1m -q 5 -c 2 http://podinfo-canary.test:9898/" + - name: "notify" + type: post-rollout + url: http://telegram.bot:8080/ + timeout: 5s + metadata: + some: "message" ``` -> **Note** that the sum of all webhooks timeouts should be lower than the control loop interval. +> **Note** that the sum of all rollout webhooks timeouts should be lower than the analysis interval. 
Webhook payload (HTTP POST): ```json { "name": "podinfo", - "namespace": "test", + "namespace": "test", + "phase": "Progressing", "metadata": { "test": "all", "token": "16688eb5e9f289f1991c" @@ -676,4 +700,4 @@ webhooks: ``` When the canary analysis starts, the load tester will initiate a [clone_and_start request](https://github.com/naver/ngrinder/wiki/REST-API-PerfTest) to the nGrinder server and start a new performance test. the load tester will periodically poll the nGrinder server -for the status of the test, and prevent duplicate requests from being sent in subsequent analysis loops. \ No newline at end of file +for the status of the test, and prevent duplicate requests from being sent in subsequent analysis loops. From 663fa08cc16b4ab42a56983fa3fa63ca199042f3 Mon Sep 17 00:00:00 2001 From: stefanprodan Date: Sat, 13 Apr 2019 21:21:51 +0300 Subject: [PATCH 5/6] Add hook type and status to CRD schema validation --- artifacts/flagger/crd.yaml | 35 ++++++++++++++++++++++++++++--- charts/flagger/templates/crd.yaml | 27 ++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 3 deletions(-) diff --git a/artifacts/flagger/crd.yaml b/artifacts/flagger/crd.yaml index c8817b80f..134020e63 100644 --- a/artifacts/flagger/crd.yaml +++ b/artifacts/flagger/crd.yaml @@ -2,6 +2,8 @@ apiVersion: apiextensions.k8s.io/v1beta1 kind: CustomResourceDefinition metadata: name: canaries.flagger.app + annotations: + helm.sh/resource-policy: keep spec: group: flagger.app version: v1alpha3 @@ -39,9 +41,9 @@ spec: properties: spec: required: - - targetRef - - service - - canaryAnalysis + - targetRef + - service + - canaryAnalysis properties: progressDeadlineSeconds: type: number @@ -119,9 +121,36 @@ spec: properties: name: type: string + type: + type: string + enum: + - "" + - pre-rollout + - rollout + - post-rollout url: type: string format: url timeout: type: string pattern: "^[0-9]+(m|s)" + status: + properties: + phase: + type: string + enum: + - "" + - Initialized + - Progressing 
+ - Succeeded + - Failed + canaryWeight: + type: number + failedChecks: + type: number + iterations: + type: number + lastAppliedSpec: + type: string + lastTransitionTime: + type: string diff --git a/charts/flagger/templates/crd.yaml b/charts/flagger/templates/crd.yaml index 1302acee3..aad996b89 100644 --- a/charts/flagger/templates/crd.yaml +++ b/charts/flagger/templates/crd.yaml @@ -122,10 +122,37 @@ spec: properties: name: type: string + type: + type: string + enum: + - "" + - pre-rollout + - rollout + - post-rollout url: type: string format: url timeout: type: string pattern: "^[0-9]+(m|s)" + status: + properties: + phase: + type: string + enum: + - "" + - Initialized + - Progressing + - Succeeded + - Failed + canaryWeight: + type: number + failedChecks: + type: number + iterations: + type: number + lastAppliedSpec: + type: string + lastTransitionTime: + type: string {{- end }} From f46882c778f32baf10da81fa544ed468c8701977 Mon Sep 17 00:00:00 2001 From: stefanprodan Date: Sun, 14 Apr 2019 12:24:35 +0300 Subject: [PATCH 6/6] Update cert-manager to v0.7 in GKE docs --- .../gitbook/install/flagger-install-on-google-cloud.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/gitbook/install/flagger-install-on-google-cloud.md b/docs/gitbook/install/flagger-install-on-google-cloud.md index 94a2f7b7b..25dfa3730 100644 --- a/docs/gitbook/install/flagger-install-on-google-cloud.md +++ b/docs/gitbook/install/flagger-install-on-google-cloud.md @@ -186,7 +186,7 @@ Install cert-manager's CRDs: ```bash CERT_REPO=https://raw.githubusercontent.com/jetstack/cert-manager -kubectl apply -f ${CERT_REPO}/release-0.6/deploy/manifests/00-crds.yaml +kubectl apply -f ${CERT_REPO}/release-0.7/deploy/manifests/00-crds.yaml ``` Create the cert-manager namespace and disable resource validation: @@ -200,10 +200,12 @@ kubectl label namespace cert-manager certmanager.k8s.io/disable-validation=true Install cert-manager with Helm: ```bash -helm repo update && helm 
upgrade -i cert-manager \ +helm repo add jetstack https://charts.jetstack.io && \ +helm repo update && \ +helm upgrade -i cert-manager \ --namespace cert-manager \ ---version v0.6.0 \ -stable/cert-manager +--version v0.7.0 \ +jetstack/cert-manager ``` ### Istio Gateway TLS setup