Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

"strict" health-checking (#427) #428

Merged
merged 1 commit into from
Feb 19, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -142,4 +142,5 @@ type Consul struct {
CheckScheme string
CheckTLSSkipVerify bool
CheckDeregisterCriticalServiceAfter string
ChecksRequired string
}
1 change: 1 addition & 0 deletions config/default.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ var defaultConfig = &Config{
CheckTimeout: 3 * time.Second,
CheckScheme: "http",
CheckDeregisterCriticalServiceAfter: "90m",
ChecksRequired: "one",
},
Timeout: 10 * time.Second,
Retry: 500 * time.Millisecond,
Expand Down
1 change: 1 addition & 0 deletions config/load.go
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@ func load(cmdline, environ, envprefix []string, props *properties.Properties) (c
f.DurationVar(&cfg.Registry.Consul.CheckTimeout, "registry.consul.register.checkTimeout", defaultConfig.Registry.Consul.CheckTimeout, "service check timeout")
f.BoolVar(&cfg.Registry.Consul.CheckTLSSkipVerify, "registry.consul.register.checkTLSSkipVerify", defaultConfig.Registry.Consul.CheckTLSSkipVerify, "service check TLS verification")
f.StringVar(&cfg.Registry.Consul.CheckDeregisterCriticalServiceAfter, "registry.consul.register.checkDeregisterCriticalServiceAfter", defaultConfig.Registry.Consul.CheckDeregisterCriticalServiceAfter, "critical service deregistration timeout")
f.StringVar(&cfg.Registry.Consul.ChecksRequired, "registry.consul.checksRequired", defaultConfig.Registry.Consul.ChecksRequired, "number of checks which must pass: one or all")
f.IntVar(&cfg.Runtime.GOGC, "runtime.gogc", defaultConfig.Runtime.GOGC, "sets runtime.GOGC")
f.IntVar(&cfg.Runtime.GOMAXPROCS, "runtime.gomaxprocs", defaultConfig.Runtime.GOMAXPROCS, "sets runtime.GOMAXPROCS")
f.StringVar(&cfg.UI.Access, "ui.access", defaultConfig.UI.Access, "access mode, one of [ro, rw]")
Expand Down
15 changes: 15 additions & 0 deletions docs/content/ref/registry.consul.checksRequired.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
---
title: "registry.consul.checksRequired"
---

`registry.consul.checksRequired` configures how many health checks
must pass in order for fabio to consider a service available.

Possible values are:

* `one`: at least one health check must pass
* `all`: all health checks must pass

The default is

registry.consul.checksRequired = one
12 changes: 12 additions & 0 deletions fabio.properties
Original file line number Diff line number Diff line change
Expand Up @@ -731,6 +731,18 @@
# registry.consul.register.checkDeregisterCriticalServiceAfter = 90m


# registry.consul.checksRequired configures how many health checks
# must pass in order for fabio to consider a service available.
#
# Possible values are:
# one: at least one health check must pass
# all: all health checks must pass
#
# The default is
#
# registry.consul.checksRequired = one


# metrics.target configures the backend the metrics values are
# sent to.
#
Expand Down
2 changes: 1 addition & 1 deletion registry/consul/backend.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ func (b *be) WatchServices() chan string {
log.Printf("[INFO] consul: Using tag prefix %q", b.cfg.TagPrefix)

svc := make(chan string)
go watchServices(b.c, b.cfg.TagPrefix, b.cfg.ServiceStatus, svc)
go watchServices(b.c, b.cfg, svc)
return svc
}

Expand Down
110 changes: 77 additions & 33 deletions registry/consul/passing.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,53 +7,97 @@ import (
"github.com/hashicorp/consul/api"
)

// passingServices filters out health checks for services which have
// passing health checks and where the neither the service instance itself
// nor the node is in maintenance mode.
func passingServices(checks []*api.HealthCheck, status []string) []*api.HealthCheck {
func passingServices(checks []*api.HealthCheck, status []string, strict bool) []*api.HealthCheck {
var p []*api.HealthCheck
for _, svc := range checks {
// first filter out non-service checks
if svc.ServiceID == "" || svc.CheckID == "serfHealth" || svc.CheckID == "_node_maintenance" || strings.HasPrefix("_service_maintenance:", svc.CheckID) {
if !isServiceCheck(svc) {
continue
}

// then make sure the service health check is passing
if !contains(status, svc.Status) {
total, passing := countChecks(svc, checks, status)
if passing == 0 {
continue
}
if strict && total != passing {
continue
}
if isAgentCritical(svc, checks) {
continue
}
if isNodeInMaintenance(svc, checks) {
continue
}
if isServiceInMaintenance(svc, checks) {
continue
}
p = append(p, svc)
}

// then check whether the agent is still alive and both the
// node and the service are not in maintenance mode.
for _, c := range checks {
if c.Node != svc.Node {
continue
}
if c.CheckID == "serfHealth" && c.Status == "critical" {
log.Printf("[DEBUG] consul: Skipping service %q since agent on node %q is down: %s", svc.ServiceID, svc.Node, c.Output)
goto skip
}
if c.CheckID == "_node_maintenance" {
log.Printf("[DEBUG] consul: Skipping service %q since node %q is in maintenance mode: %s", svc.ServiceID, svc.Node, c.Output)
goto skip
}
if c.CheckID == "_service_maintenance:"+svc.ServiceID && c.Status == "critical" {
log.Printf("[DEBUG] consul: Skipping service %q since it is in maintenance mode: %s", svc.ServiceID, c.Output)
goto skip
}
return p
}

// isServiceCheck returns true if the health check is a valid service check.
func isServiceCheck(c *api.HealthCheck) bool {
return c.ServiceID != "" &&
c.CheckID != "serfHealth" &&
c.CheckID != "_node_maintenance" &&
!strings.HasPrefix("_service_maintenance:", c.CheckID)
}

// isAgentCritical returns true if the agent on the node on which the service
// runs is critical.
func isAgentCritical(svc *api.HealthCheck, checks []*api.HealthCheck) bool {
for _, c := range checks {
if svc.Node == c.Node && c.CheckID == "serfHealth" && c.Status == "critical" {
log.Printf("[DEBUG] consul: Skipping service %q since agent on node %q is down: %s", c.ServiceID, c.Node, c.Output)
return true
}
}
return false
}

p = append(p, svc)
// isNodeInMaintenance returns true if the node on which the service runs is in
// maintenance mode.
func isNodeInMaintenance(svc *api.HealthCheck, checks []*api.HealthCheck) bool {
for _, c := range checks {
if svc.Node == c.Node && c.CheckID == "_node_maintenance" {
log.Printf("[DEBUG] consul: Skipping service %q since node %q is in maintenance mode: %s", c.ServiceID, c.Node, c.Output)
return true
}
}
return false
}

skip:
// isServiceInMaintenance returns true if the service instance is in
// maintenance mode.
func isServiceInMaintenance(svc *api.HealthCheck, checks []*api.HealthCheck) bool {
for _, c := range checks {
if svc.Node == c.Node && c.CheckID == "_service_maintenance:"+svc.ServiceID && c.Status == "critical" {
log.Printf("[DEBUG] consul: Skipping service %q since it is in maintenance mode: %s", svc.ServiceID, c.Output)
return true
}
}
return false
}

return p
// countChecks counts the number of service checks exist for a given service
// and how many of them are passing.
func countChecks(svc *api.HealthCheck, checks []*api.HealthCheck, status []string) (total int, passing int) {
for _, c := range checks {
if svc.Node == c.Node && svc.ServiceID == c.ServiceID {
total++
if hasStatus(c, status) {
passing++
}
}
}
return
}

func contains(slice []string, item string) bool {
for _, a := range slice {
if a == item {
// hasStatus returns true if the health check status is one of the given
// values.
func hasStatus(c *api.HealthCheck, status []string) bool {
for _, s := range status {
if c.Status == s {
return true
}
}
Expand Down
119 changes: 97 additions & 22 deletions registry/consul/passing_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ func TestPassingServices(t *testing.T) {
serfPass = &api.HealthCheck{Node: "node", CheckID: "serfHealth", Status: "passing"}
serfFail = &api.HealthCheck{Node: "node", CheckID: "serfHealth", Status: "critical"}
svc1Pass = &api.HealthCheck{Node: "node", CheckID: "service:abc", Status: "passing", ServiceName: "abc", ServiceID: "abc-1"}
svc1Chk2Warn = &api.HealthCheck{Node: "node", CheckID: "service:abc", Status: "warning", ServiceName: "abc", ServiceID: "abc-1"}
svc1Node2Pass = &api.HealthCheck{Node: "node2", CheckID: "service:abc", Status: "passing", ServiceName: "abc", ServiceID: "abc-1"}
svc1Warn = &api.HealthCheck{Node: "node", CheckID: "service:abc", Status: "warning", ServiceName: "abc", ServiceID: "abc-2"}
svc1Crit = &api.HealthCheck{Node: "node", CheckID: "service:abc", Status: "critical", ServiceName: "abc", ServiceID: "abc-3"}
Expand All @@ -22,32 +23,106 @@ func TestPassingServices(t *testing.T) {
)

tests := []struct {
name string
strict bool
status []string
in, out []*api.HealthCheck
}{
{[]string{"passing"}, nil, nil},
{[]string{"passing"}, []*api.HealthCheck{}, nil},
{[]string{"passing"}, []*api.HealthCheck{svc1Pass}, []*api.HealthCheck{svc1Pass}},
{[]string{"passing"}, []*api.HealthCheck{svc1Pass, svc2Pass}, []*api.HealthCheck{svc1Pass, svc2Pass}},
{[]string{"passing"}, []*api.HealthCheck{serfPass, svc1Pass}, []*api.HealthCheck{svc1Pass}},
{[]string{"passing"}, []*api.HealthCheck{serfFail, svc1Pass}, nil},
{[]string{"passing"}, []*api.HealthCheck{nodeMaint, svc1Pass}, nil},
{[]string{"passing"}, []*api.HealthCheck{svc1Maint, svc1Pass}, nil},
{[]string{"passing"}, []*api.HealthCheck{nodeMaint, svc1Maint, svc1Pass}, nil},
{[]string{"passing"}, []*api.HealthCheck{serfFail, nodeMaint, svc1Maint, svc1Pass}, nil},
{[]string{"passing"}, []*api.HealthCheck{svc1ID2Maint, svc1Pass}, []*api.HealthCheck{svc1Pass}},
{[]string{"passing"}, []*api.HealthCheck{svc1Maint, svc1Pass, svc2Pass}, []*api.HealthCheck{svc2Pass}},
{[]string{"passing"}, []*api.HealthCheck{svc1Crit, svc1Node2Pass}, []*api.HealthCheck{svc1Node2Pass}},
{[]string{"passing"}, []*api.HealthCheck{svc1Maint, svc1Node2Pass}, []*api.HealthCheck{svc1Node2Pass}},
{[]string{"passing", "warning"}, []*api.HealthCheck{serfPass, svc1Pass, svc1Crit}, []*api.HealthCheck{svc1Pass}},
{[]string{"passing", "warning"}, []*api.HealthCheck{serfPass, svc1Warn, svc1Crit}, []*api.HealthCheck{svc1Warn}},
{[]string{"passing", "warning"}, []*api.HealthCheck{serfPass, svc1Pass, svc1Warn}, []*api.HealthCheck{svc1Pass, svc1Warn}},
{[]string{"passing", "warning"}, []*api.HealthCheck{serfPass, svc1Warn, svc1Crit, svc1Pass}, []*api.HealthCheck{svc1Warn, svc1Pass}},
{
"expect no passing checks if checks array is nil",
false, []string{"passing"}, nil, nil,
},
{
"expect no passing checks if checks array is empty",
false, []string{"passing"}, []*api.HealthCheck{}, nil,
},
{
"expect check to pass if it has a matching status",
false, []string{"passing"}, []*api.HealthCheck{svc1Pass}, []*api.HealthCheck{svc1Pass},
},
{
"expect all checks to pass if they have a matching status",
false, []string{"passing"}, []*api.HealthCheck{svc1Pass, svc2Pass}, []*api.HealthCheck{svc1Pass, svc2Pass},
},
{
"expect that internal consul checks are filtered out",
false, []string{"passing"}, []*api.HealthCheck{serfPass, svc1Pass}, []*api.HealthCheck{svc1Pass},
},
{
"expect no passing checks if consul agent is unhealthy",
false, []string{"passing"}, []*api.HealthCheck{serfFail, svc1Pass}, nil,
},
{
"expect no passing checks if node is in maintenance mode",
false, []string{"passing"}, []*api.HealthCheck{nodeMaint, svc1Pass}, nil,
},
{
"expect no passing check if corresponding service is in maintenance mode",
false, []string{"passing"}, []*api.HealthCheck{svc1Maint, svc1Pass}, nil,
},
{
"expect no passing check if node and service are in maintenance mode",
false, []string{"passing"}, []*api.HealthCheck{nodeMaint, svc1Maint, svc1Pass}, nil,
},
{
"expect no passing check if agent is unhealthy or node and service are in maintenance mode",
false, []string{"passing"}, []*api.HealthCheck{serfFail, nodeMaint, svc1Maint, svc1Pass}, nil,
},
{
"expect check of service which is not in maintenance mode to pass if another instance of same service is in maintenance mode",
false, []string{"passing"}, []*api.HealthCheck{svc1ID2Maint, svc1Pass}, []*api.HealthCheck{svc1Pass},
},
{
"expect that no checks of a service which is in maintenance mode are returned even if it has a passing check",
false, []string{"passing"}, []*api.HealthCheck{svc1Maint, svc1Pass, svc2Pass}, []*api.HealthCheck{svc2Pass},
},
{
"expect that a service's failing check does not affect a healthy instance of same service running on different node",
false, []string{"passing"}, []*api.HealthCheck{svc1Crit, svc1Node2Pass}, []*api.HealthCheck{svc1Node2Pass},
},
{
"service in maintenance mode does not affect healthy service running on different node",
false, []string{"passing"}, []*api.HealthCheck{svc1Maint, svc1Node2Pass}, []*api.HealthCheck{svc1Node2Pass},
},
{
"expect that internal consul check and failing check are not returned",
false, []string{"passing", "warning"}, []*api.HealthCheck{serfPass, svc1Pass, svc1Crit}, []*api.HealthCheck{svc1Pass},
},
{
"expect that internal consul check is filtered out and check with warning is passing",
false, []string{"passing", "warning"}, []*api.HealthCheck{serfPass, svc1Warn, svc1Crit}, []*api.HealthCheck{svc1Warn},
},
{
"expect that warning and passing non-internal checks are returned",
false, []string{"passing", "warning"}, []*api.HealthCheck{serfPass, svc1Pass, svc1Warn}, []*api.HealthCheck{svc1Pass, svc1Warn},
},
{
"expect that warning und passing non-internal checks are returned",
false, []string{"passing", "warning"}, []*api.HealthCheck{serfPass, svc1Warn, svc1Crit, svc1Pass}, []*api.HealthCheck{svc1Warn, svc1Pass},
},
{
"in non-strict mode, expect that checks which belong to same service are passing, if at least one of them is passing",
false, []string{"passing"}, []*api.HealthCheck{svc1Pass, svc1Chk2Warn}, []*api.HealthCheck{svc1Pass, svc1Chk2Warn},
},
{
"in strict mode, expect that no checks which belong to same service are passing, if not all of them are passing",
true, []string{"passing"}, []*api.HealthCheck{svc1Pass, svc1Chk2Warn}, nil,
},
{
"in strict mode, expect that a failing check of one service does not affect a different service's passing check",
true, []string{"passing"}, []*api.HealthCheck{svc1Pass, svc1Chk2Warn, svc2Pass}, []*api.HealthCheck{svc2Pass},
},
{
"in strict mode, expect a check to pass if all of the other checks that belong to the same service are passing",
true, []string{"passing", "warning"}, []*api.HealthCheck{svc1Pass, svc1Chk2Warn}, []*api.HealthCheck{svc1Pass, svc1Chk2Warn},
},
}

for i, tt := range tests {
if got, want := passingServices(tt.in, tt.status), tt.out; !reflect.DeepEqual(got, want) {
t.Errorf("%d: got %v want %v", i, got, want)
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if got, want := passingServices(tt.in, tt.status, tt.strict), tt.out; !reflect.DeepEqual(got, want) {
t.Errorf("got %v want %v", got, want)
}
})
}
}
6 changes: 4 additions & 2 deletions registry/consul/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,15 @@ import (
"strings"
"time"

"github.com/fabiolb/fabio/config"
"github.com/hashicorp/consul/api"
)

// watchServices monitors the consul health checks and creates a new configuration
// on every change.
func watchServices(client *api.Client, tagPrefix string, status []string, config chan string) {
func watchServices(client *api.Client, config *config.Consul, svcConfig chan string) {
var lastIndex uint64
var strict bool = strings.EqualFold("all", config.ChecksRequired)

for {
q := &api.QueryOptions{RequireConsistent: true, WaitIndex: lastIndex}
Expand All @@ -28,7 +30,7 @@ func watchServices(client *api.Client, tagPrefix string, status []string, config
}

log.Printf("[DEBUG] consul: Health changed to #%d", meta.LastIndex)
config <- servicesConfig(client, passingServices(checks, status), tagPrefix)
svcConfig <- servicesConfig(client, passingServices(checks, config.ServiceStatus, strict), config.TagPrefix)
lastIndex = meta.LastIndex
}
}
Expand Down