Skip to content

Commit

Permalink
revad: Allow to specify a shutdown timeout
Browse files Browse the repository at this point in the history
When setting 'graceful_shutdown_timeout' revad will try to shut down in a
graceful manner when receiving an INT or TERM signal (similar to how it already
behaves on SIGQUIT). This allows ongoing operations to complete before exiting.

If the shutdown didn't finish before 'graceful_shutdown_timeout' seconds the
process will exit with an error code (1).
  • Loading branch information
rhafer committed Jul 18, 2023
1 parent c43313a commit 7843cbf
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 53 deletions.
10 changes: 10 additions & 0 deletions changelog/unreleased/graceful_shutdown_timeout.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
Enhancement: Allow to specify a shutdown timeout

When setting `graceful_shutdown_timeout` revad will try to shutdown in a
graceful manner when receiving an INT or TERM signal (similar to how it already
behaves on SIGQUIT). This allows ongoing operations to complete before exiting.

If the shutdown didn't finish before `graceful_shutdown_timeout` seconds the
process will exit with an error code (1).

https://github.com/cs3org/reva/pull/4072
115 changes: 70 additions & 45 deletions cmd/revad/internal/grace/grace.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,14 @@ import (
// Watcher watches a process for a graceful restart
// preserving open network sockets to avoid packet loss.
type Watcher struct {
	// log receives the progress and error messages emitted during shutdown.
	log zerolog.Logger
	// NOTE(review): graceful and ppid are not used in the visible code;
	// presumably they track a graceful restart and the parent PID - confirm.
	graceful bool
	ppid int
	// lns holds network listeners by string key.
	lns map[string]net.Listener
	// ss holds the servers that are stopped when a shutdown signal arrives.
	ss map[string]Server
	// pidFile is presumably the path set through WithPIDFile - confirm.
	pidFile string
	childPIDs []int
	// gracefulShutdownTimeout is the graceful-shutdown deadline in seconds,
	// set through WithGracefuleShutdownTimeout. When it is 0, SIGINT and
	// SIGTERM trigger a hard shutdown instead of a graceful one.
	gracefulShutdownTimeout int
}

// Option represent an option.
Expand All @@ -62,6 +63,12 @@ func WithPIDFile(fn string) Option {
}
}

// WithGracefuleShutdownTimeout returns an Option that stores the given number
// of seconds as the Watcher's graceful shutdown timeout.
func WithGracefuleShutdownTimeout(seconds int) Option {
	applyTimeout := func(watcher *Watcher) {
		watcher.gracefulShutdownTimeout = seconds
	}
	return applyTimeout
}

// NewWatcher creates a Watcher.
func NewWatcher(opts ...Option) *Watcher {
w := &Watcher{
Expand Down Expand Up @@ -279,49 +286,67 @@ func (w *Watcher) TrapSignals() {
}

case syscall.SIGQUIT:
w.log.Info().Msg("preparing for a graceful shutdown with deadline of 10 seconds")
go func() {
count := 10
ticker := time.NewTicker(time.Second)
for ; true; <-ticker.C {
w.log.Info().Msgf("shutting down in %d seconds", count-1)
count--
if count <= 0 {
w.log.Info().Msg("deadline reached before draining active conns, hard stopping ...")
for _, s := range w.ss {
err := s.Stop()
if err != nil {
w.log.Error().Err(err).Msg("error stopping server")
}
w.log.Info().Msgf("fd to %s:%s abruptly closed", s.Network(), s.Address())
}
w.Exit(1)
}
}
}()
for _, s := range w.ss {
w.log.Info().Msgf("fd to %s:%s gracefully closed ", s.Network(), s.Address())
err := s.GracefulStop()
if err != nil {
w.log.Error().Err(err).Msg("error stopping server")
w.log.Info().Msg("exit with error code 1")
w.Exit(1)
}
}
w.log.Info().Msg("exit with error code 0")
w.Exit(0)
gracefulShutdown(w)
case syscall.SIGINT, syscall.SIGTERM:
w.log.Info().Msg("preparing for hard shutdown, aborting all conns")
for _, s := range w.ss {
w.log.Info().Msgf("fd to %s:%s abruptly closed", s.Network(), s.Address())
err := s.Stop()
if err != nil {
w.log.Error().Err(err).Msg("error stopping server")
if w.gracefulShutdownTimeout == 0 {
hardShutdown(w)
}
gracefulShutdown(w)
}
}
}

// gracefulShutdown asks every server registered on w to stop gracefully and
// then calls w.Exit(0), or w.Exit(1) as soon as a server fails to stop.
//
// A background goroutine enforces the deadline: it logs a countdown once per
// second and, once the timeout has fully elapsed, hard-stops every server and
// calls w.Exit(1). A non-positive w.gracefulShutdownTimeout falls back to 10
// seconds, so that SIGQUIT keeps a usable deadline when no timeout is
// configured.
//
// TODO: Ideally this would return an error instead of calling exit(). The
// exit() is problematic (i.e. racy), especially when orchestrating multiple
// reva services from some external runtime (like in the "ocis server" case).
func gracefulShutdown(w *Watcher) {
	// Deadline in seconds used when no positive timeout is configured.
	const defaultTimeout = 10

	timeout := w.gracefulShutdownTimeout
	if timeout <= 0 {
		timeout = defaultTimeout
	}

	w.log.Info().Int("Timeout", timeout).Msg("preparing for a graceful shutdown with deadline")
	go func() {
		ticker := time.NewTicker(time.Second)
		defer ticker.Stop()

		// Wait one tick per remaining second so the servers get the full
		// timeout before being hard-stopped.
		for remaining := timeout; remaining > 0; remaining-- {
			w.log.Info().Msgf("shutting down in %d seconds", remaining)
			<-ticker.C
		}

		w.log.Info().Msg("deadline reached before draining active conns, hard stopping ...")
		for _, s := range w.ss {
			if err := s.Stop(); err != nil {
				w.log.Error().Err(err).Msg("error stopping server")
			}
			w.log.Info().Msgf("fd to %s:%s abruptly closed", s.Network(), s.Address())
		}
		w.Exit(1)
	}()

	for _, s := range w.ss {
		w.log.Info().Msgf("fd to %s:%s gracefully closed ", s.Network(), s.Address())
		if err := s.GracefulStop(); err != nil {
			w.log.Error().Err(err).Msg("error stopping server")
			w.log.Info().Msg("exit with error code 1")
			w.Exit(1)
		}
	}
	w.log.Info().Msg("exit with error code 0")
	w.Exit(0)
}

// hardShutdown stops every server registered on w without waiting for a
// graceful stop, logging any stop error, and then calls w.Exit(0).
//
// TODO: Ideally this would return an error instead of calling exit(). The
// exit() is problematic (i.e. racy), especially when orchestrating multiple
// reva services from some external runtime (like in the "ocis server" case).
func hardShutdown(w *Watcher) {
	w.log.Info().Msg("preparing for hard shutdown, aborting all conns")
	for _, srv := range w.ss {
		w.log.Info().Msgf("fd to %s:%s abruptly closed", srv.Network(), srv.Address())
		if stopErr := srv.Stop(); stopErr != nil {
			w.log.Error().Err(stopErr).Msg("error stopping server")
		}
	}
	w.Exit(0)
}

func getListenerFile(ln net.Listener) (*os.File, error) {
Expand Down
18 changes: 10 additions & 8 deletions cmd/revad/runtime/runtime.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ type coreConf struct {

// TracingService specifies the service. i.e OpenCensus, OpenTelemetry, OpenTracing...
TracingService string `mapstructure:"tracing_service"`

GracefulShutdownTimeout int `mapstructure:"graceful_shutdown_timeout"`
}

func run(
Expand All @@ -92,7 +94,7 @@ func run(
initCPUCount(coreConf, logger)

servers := initServers(mainConf, logger, tp)
watcher, err := initWatcher(logger, filename)
watcher, err := initWatcher(logger, filename, coreConf.GracefulShutdownTimeout)
if err != nil {
log.Panic(err)
}
Expand All @@ -110,8 +112,8 @@ func initListeners(watcher *grace.Watcher, servers map[string]grace.Server, log
return listeners
}

func initWatcher(log *zerolog.Logger, filename string) (*grace.Watcher, error) {
watcher, err := handlePIDFlag(log, filename)
func initWatcher(log *zerolog.Logger, filename string, gracefulShutdownTimeout int) (*grace.Watcher, error) {
watcher, err := handlePIDFlag(log, filename, gracefulShutdownTimeout)
// TODO(labkode): maybe pidfile can be created later on? like once a server is going to be created?
if err != nil {
log.Error().Err(err).Msg("error creating grace watcher")
Expand Down Expand Up @@ -187,11 +189,11 @@ func initLogger(conf *logConf) *zerolog.Logger {
return log
}

func handlePIDFlag(l *zerolog.Logger, pidFile string) (*grace.Watcher, error) {
var opts []grace.Option
opts = append(opts, grace.WithPIDFile(pidFile))
opts = append(opts, grace.WithLogger(l.With().Str("pkg", "grace").Logger()))
w := grace.NewWatcher(opts...)
func handlePIDFlag(l *zerolog.Logger, pidFile string, gracefulShutdownTimeout int) (*grace.Watcher, error) {
w := grace.NewWatcher(grace.WithPIDFile(pidFile),
grace.WithLogger(l.With().Str("pkg", "grace").Logger()),
grace.WithGracefuleShutdownTimeout(gracefulShutdownTimeout),
)
err := w.WritePID()
if err != nil {
return nil, err
Expand Down

0 comments on commit 7843cbf

Please sign in to comment.