diff --git a/command/agent/command.go b/command/agent/command.go index ecae14d81a7..2d1b0a1cd07 100644 --- a/command/agent/command.go +++ b/command/agent/command.go @@ -42,6 +42,7 @@ type Command struct { httpServer *HTTPServer logFilter *logutils.LevelFilter logOutput io.Writer + retryJoinErrCh chan struct{} scadaProvider *scada.Provider scadaHttp *HTTPServer @@ -71,6 +72,11 @@ func (c *Command) readConfig() *Config { // Server-only options flags.IntVar(&cmdConfig.Server.BootstrapExpect, "bootstrap-expect", 0, "") + flags.BoolVar(&cmdConfig.Server.RejoinAfterLeave, "rejoin", false, "") + flags.Var((*sliceflag.StringFlag)(&cmdConfig.Server.StartJoin), "join", "") + flags.Var((*sliceflag.StringFlag)(&cmdConfig.Server.RetryJoin), "retry-join", "") + flags.IntVar(&cmdConfig.Server.RetryMaxAttempts, "retry-max", 0, "") + flags.StringVar(&cmdConfig.Server.RetryInterval, "retry-interval", "", "") // Client-only options flags.StringVar(&cmdConfig.Client.StateDir, "state-dir", "", "") @@ -100,6 +106,15 @@ func (c *Command) readConfig() *Config { return nil } + if cmdConfig.Server.RetryInterval != "" { + dur, err := time.ParseDuration(cmdConfig.Server.RetryInterval) + if err != nil { + c.Ui.Error(fmt.Sprintf("Error parsing retry interval: %s", err)) + return nil + } + cmdConfig.Server.retryInterval = dur + } + // Split the servers. 
if servers != "" { cmdConfig.Client.Servers = strings.Split(servers, ",") @@ -358,6 +373,12 @@ func (c *Command) Run(args []string) int { } }() + // Join startup nodes if specified + if err := c.startupJoin(config); err != nil { + c.Ui.Error(err.Error()) + return 1 + } + // Compile agent information for output later info := make(map[string]string) info["client"] = strconv.FormatBool(config.Client.Enabled) @@ -396,6 +417,10 @@ func (c *Command) Run(args []string) int { // Enable log streaming logGate.Flush() + // Start retry join process + c.retryJoinErrCh = make(chan struct{}) + go c.retryJoin(config) + // Wait for exit return c.handleSignals(config) } @@ -413,6 +438,8 @@ WAIT: sig = s case <-c.ShutdownCh: sig = os.Interrupt + case <-c.retryJoinErrCh: + return 1 } c.Ui.Output(fmt.Sprintf("Caught signal: %v", sig)) @@ -559,6 +586,52 @@ func (c *Command) setupSCADA(config *Config) error { return nil } +func (c *Command) startupJoin(config *Config) error { + if len(config.Server.StartJoin) == 0 || !config.Server.Enabled { + return nil + } + + c.Ui.Output("Joining cluster...") + n, err := c.agent.server.Join(config.Server.StartJoin) + if err != nil { + return err + } + + c.Ui.Info(fmt.Sprintf("Join completed. Synced with %d initial agents", n)) + return nil +} + +// retryJoin is used to handle retrying a join until it succeeds or all retries +// are exhausted. NOTE(review): the delay between attempts is config.Server.retryInterval, which the readConfig hunk in this change only parses from the -retry-interval flag; confirm it is also populated for config-file and default ("30s") values, otherwise the loop retries with a zero sleep. +func (c *Command) retryJoin(config *Config) { + if len(config.Server.RetryJoin) == 0 || !config.Server.Enabled { + return + } + + logger := c.agent.logger + logger.Printf("[INFO] agent: Joining cluster...") + + attempt := 0 + for { + n, err := c.agent.server.Join(config.Server.RetryJoin) + if err == nil { + logger.Printf("[INFO] agent: Join completed. 
Synced with %d initial agents", n) + return + } + + attempt++ + if config.Server.RetryMaxAttempts > 0 && attempt > config.Server.RetryMaxAttempts { + logger.Printf("[ERROR] agent: max join retry exhausted, exiting") + close(c.retryJoinErrCh) + return + } + + logger.Printf("[WARN] agent: Join failed: %v, retrying in %v", err, + config.Server.RetryInterval) + time.Sleep(config.Server.retryInterval) + } +} + func (c *Command) Synopsis() string { return "Runs a Nomad agent" } @@ -632,6 +705,24 @@ Server Options: bootstrapping the cluster. Once servers have joined eachother, Nomad initiates the bootstrap process. + -join=<address>
+ Address of an agent to join at start time. Can be specified + multiple times. + + -retry-join=<address>
+ Address of an agent to join at start time with retries enabled. + Can be specified multiple times. + + -retry-max=<num> + Maximum number of join attempts. Defaults to 0, which will retry + indefinitely. + + -retry-interval=<dur> + Time to wait between join attempts. + + -rejoin + Ignore a previous leave and attempt to rejoin the cluster. + Client Options: -client diff --git a/command/agent/command_test.go b/command/agent/command_test.go index c68da5a5a8a..a4226185334 100644 --- a/command/agent/command_test.go +++ b/command/agent/command_test.go @@ -1,11 +1,14 @@ package agent import ( + "fmt" "io/ioutil" + "log" "os" "strings" "testing" + "github.com/hashicorp/nomad/testutil" "github.com/mitchellh/cli" ) @@ -69,3 +72,60 @@ func TestCommand_Args(t *testing.T) { } } } + +// TestRetryJoin runs a second agent through Command.Run with -retry-join aimed at +// the first agent's Serf address and waits for the first agent to report two members. +func TestRetryJoin(t *testing.T) { + dir, agent := makeAgent(t, nil) + defer os.RemoveAll(dir) + defer agent.Shutdown() + + tmpDir, err := ioutil.TempDir("", "nomad") + if err != nil { + t.Fatalf("err: %s", err) + } + defer os.RemoveAll(tmpDir) + + doneCh := make(chan struct{}) + shutdownCh := make(chan struct{}) + + defer func() { + close(shutdownCh) + <-doneCh + }() + + cmd := &Command{ + ShutdownCh: shutdownCh, + Ui: new(cli.MockUi), + } + + serfAddr := fmt.Sprintf( + "%s:%d", + agent.config.BindAddr, + agent.config.Ports.Serf) + + args := []string{ + "-server", + "-data-dir", tmpDir, + "-node", fmt.Sprintf(`"Node %d"`, getPort()), + "-retry-join", serfAddr, + "-retry-interval", "1s", + } + + go func() { + if code := cmd.Run(args); code != 0 { + log.Printf("bad: %d", code) + } + close(doneCh) + }() + + testutil.WaitForResult(func() (bool, error) { + mem := agent.server.Members() + if len(mem) != 2 { + return false, fmt.Errorf("bad :%#v", mem) + } + return true, nil + }, func(err error) { + t.Fatalf("err: %s", err) + }) +} diff --git a/command/agent/config.go b/command/agent/config.go index 5f0bfdf7850..368bfa76bfa 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -9,6 +9,7 @@ import ( 
"path/filepath" "runtime" "strings" + "time" "github.com/hashicorp/hcl" client "github.com/hashicorp/nomad/client/config" @@ -181,6 +182,31 @@ type ServerConfig struct { // NodeGCThreshold contros how "old" a node must be to be collected by GC. NodeGCThreshold string `hcl:"node_gc_threshold"` + + // StartJoin is a list of addresses to attempt to join when the + // agent starts. If Serf is unable to communicate with any of these + // addresses, then the agent will error and exit. + StartJoin []string `hcl:"start_join"` + + // RetryJoin is a list of addresses to join with retry enabled. + RetryJoin []string `hcl:"retry_join"` + + // RetryMaxAttempts specifies the maximum number of times to retry joining a + // host on startup. This is useful for cases where we know the node will be + // online eventually. + RetryMaxAttempts int `hcl:"retry_max"` + + // RetryInterval specifies the amount of time to wait in between join + // attempts on agent start. The default is 30s. NOTE(review): this comment + // also claimed a 1 second minimum; no such check is visible here - confirm. + RetryInterval string `hcl:"retry_interval"` + retryInterval time.Duration `hcl:"-"` // parsed form of RetryInterval + + // RejoinAfterLeave controls our interaction with the cluster after leave. + // When set to false (default), a leave causes Nomad to not rejoin + // the cluster until an explicit join is received. If this is set to + // true, we ignore the leave, and rejoin the cluster on start. 
+ RejoinAfterLeave bool `hcl:"rejoin_after_leave"` } // Telemetry is the telemetry configuration for the server @@ -252,7 +278,11 @@ func DefaultConfig() *Config { NetworkSpeed: 100, }, Server: &ServerConfig{ - Enabled: false, + Enabled: false, + StartJoin: []string{}, + RetryJoin: []string{}, + RetryInterval: "30s", + RetryMaxAttempts: 0, }, } } @@ -391,10 +421,30 @@ func (a *ServerConfig) Merge(b *ServerConfig) *ServerConfig { if b.NodeGCThreshold != "" { result.NodeGCThreshold = b.NodeGCThreshold } + if b.RetryMaxAttempts != 0 { + result.RetryMaxAttempts = b.RetryMaxAttempts + } + if b.RetryInterval != "" { + result.RetryInterval = b.RetryInterval + result.retryInterval = b.retryInterval + } + if b.RejoinAfterLeave { + result.RejoinAfterLeave = true + } // Add the schedulers result.EnabledSchedulers = append(result.EnabledSchedulers, b.EnabledSchedulers...) + // Copy the start join addresses + result.StartJoin = make([]string, 0, len(a.StartJoin)+len(b.StartJoin)) + result.StartJoin = append(result.StartJoin, a.StartJoin...) + result.StartJoin = append(result.StartJoin, b.StartJoin...) + + // Copy the retry join addresses + result.RetryJoin = make([]string, 0, len(a.RetryJoin)+len(b.RetryJoin)) + result.RetryJoin = append(result.RetryJoin, a.RetryJoin...) + result.RetryJoin = append(result.RetryJoin, b.RetryJoin...) 
+ return &result } diff --git a/command/agent/config_test.go b/command/agent/config_test.go index c13ff91f367..90f2b620c2e 100644 --- a/command/agent/config_test.go +++ b/command/agent/config_test.go @@ -6,6 +6,7 @@ import ( "path/filepath" "reflect" "testing" + "time" "github.com/hashicorp/nomad/nomad/structs" ) @@ -114,6 +115,11 @@ func TestConfig_Merge(t *testing.T) { NumSchedulers: 2, EnabledSchedulers: []string{structs.JobTypeBatch}, NodeGCThreshold: "12h", + RejoinAfterLeave: true, + StartJoin: []string{"1.1.1.1"}, + RetryJoin: []string{"1.1.1.1"}, + RetryInterval: "10s", + retryInterval: time.Second * 10, }, Ports: &Ports{ HTTP: 20000, @@ -384,6 +390,11 @@ func TestConfig_LoadConfigString(t *testing.T) { NumSchedulers: 2, EnabledSchedulers: []string{"test"}, NodeGCThreshold: "12h", + RetryJoin: []string{"1.1.1.1", "2.2.2.2"}, + StartJoin: []string{"1.1.1.1", "2.2.2.2"}, + RetryInterval: "15s", + RejoinAfterLeave: true, + RetryMaxAttempts: 3, }, Telemetry: &Telemetry{ StatsiteAddr: "127.0.0.1:1234", @@ -457,6 +468,11 @@ server { num_schedulers = 2 enabled_schedulers = ["test"] node_gc_threshold = "12h" + retry_join = [ "1.1.1.1", "2.2.2.2" ] + start_join = [ "1.1.1.1", "2.2.2.2" ] + retry_max = 3 + retry_interval = "15s" + rejoin_after_leave = true } telemetry { statsite_address = "127.0.0.1:1234" diff --git a/website/source/docs/agent/config.html.md b/website/source/docs/agent/config.html.md index c4569903476..f931dbac1d3 100644 --- a/website/source/docs/agent/config.html.md +++ b/website/source/docs/agent/config.html.md @@ -177,6 +177,21 @@ configured on client nodes. "1.5h" or "25m". Valid time units are "ns", "us" (or "µs"), "ms", "s", "m", "h". Controls how long a node must be in a terminal state before it is garbage collected and purged from the system. + * `rejoin_after_leave` When provided, Nomad will ignore a previous leave and + attempt to rejoin the cluster when starting. 
By default, Nomad treats leave + as a permanent intent and does not attempt to join the cluster again when + starting. This flag allows the previous state to be used to rejoin the + cluster. + * `retry_join` Similar to [`start_join`](#start_join) but allows retrying a join + if the first attempt fails. This is useful for cases where we know the + address will become available eventually. + * `retry_interval` The time to wait between join attempts. Defaults to 30s. + * `retry_max` The maximum number of join attempts to be made before exiting + with a return code of 1. By default, this is set to 0 which is interpreted + as infinite retries. + * `start_join` An array of strings specifying addresses of nodes to join upon startup. + If Nomad is unable to join with any of the specified addresses, agent startup will + fail. By default, the agent won't join any nodes when it starts up. ## Client-specific Options @@ -301,6 +316,8 @@ via CLI arguments. The `agent` command accepts the following arguments: * `-dev`: Start the agent in development mode. This enables a pre-configured dual-role agent (client + server) which is useful for developing or testing Nomad. No other configuration is required to start the agent in this mode. +* `-join=<address>
`: Address of another agent to join upon starting up. This can + be specified multiple times to specify multiple agents to join. * `-log-level=`: Equivalent to the [log_level](#log_level) config option. * `-meta=`: Equivalent to the Client [meta](#meta) config option. * `-network-interface`: Equivalent to the Client @@ -312,6 +329,10 @@ via CLI arguments. The `agent` command accepts the following arguments: config option. * `-node-id=`: Equivalent to the Client [node_id](#node_id) config option. * `-region=`: Equivalent to the [region](#region) config option. +* `-rejoin`: Equivalent to the [rejoin_after_leave](#rejoin_after_leave) config option. +* `-retry-interval`: Equivalent to the [retry_interval](#retry_interval) config option. +* `-retry-join`: Similar to `-join` but allows retrying a join if the first attempt fails. +* `-retry-max`: Equivalent to the [retry_max](#retry_max) config option. * `-server`: Enable server mode on the local agent. * `-servers=`: Equivalent to the Client [servers](#servers) config option.