Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

revad: Allow to specify a shutdown timeout #4072

Merged
merged 1 commit into from
Jul 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions changelog/unreleased/graceful_shutdown_timeout.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
Enhancement: Allow specifying a shutdown timeout

When `graceful_shutdown_timeout` is set, revad will try to shut down
gracefully when it receives an INT or TERM signal (similar to how it already
behaves on SIGQUIT). This allows ongoing operations to complete before exiting.

If the shutdown does not finish within `graceful_shutdown_timeout` seconds, the
process will exit with an error code (1).

https://github.com/cs3org/reva/pull/4072
115 changes: 70 additions & 45 deletions cmd/revad/internal/grace/grace.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,14 @@ import (
// Watcher watches a process for a graceful restart
// preserving open network sockets to avoid packet loss.
type Watcher struct {
log zerolog.Logger
graceful bool
ppid int
lns map[string]net.Listener
ss map[string]Server
pidFile string
childPIDs []int
log zerolog.Logger
graceful bool
ppid int
lns map[string]net.Listener
ss map[string]Server
pidFile string
childPIDs []int
gracefulShutdownTimeout int
kobergj marked this conversation as resolved.
Show resolved Hide resolved
}

// Option represents an option.
Expand All @@ -62,6 +63,12 @@ func WithPIDFile(fn string) Option {
}
}

// WithGracefuleShutdownTimeout returns an Option that stores the given number
// of seconds in the Watcher's gracefulShutdownTimeout field.
func WithGracefuleShutdownTimeout(seconds int) Option {
	return func(watcher *Watcher) {
		watcher.gracefulShutdownTimeout = seconds
	}
}

// NewWatcher creates a Watcher.
func NewWatcher(opts ...Option) *Watcher {
w := &Watcher{
Expand Down Expand Up @@ -279,49 +286,67 @@ func (w *Watcher) TrapSignals() {
}

case syscall.SIGQUIT:
w.log.Info().Msg("preparing for a graceful shutdown with deadline of 10 seconds")
go func() {
count := 10
ticker := time.NewTicker(time.Second)
for ; true; <-ticker.C {
w.log.Info().Msgf("shutting down in %d seconds", count-1)
count--
if count <= 0 {
w.log.Info().Msg("deadline reached before draining active conns, hard stopping ...")
for _, s := range w.ss {
err := s.Stop()
if err != nil {
w.log.Error().Err(err).Msg("error stopping server")
}
w.log.Info().Msgf("fd to %s:%s abruptly closed", s.Network(), s.Address())
}
w.Exit(1)
}
}
}()
for _, s := range w.ss {
w.log.Info().Msgf("fd to %s:%s gracefully closed ", s.Network(), s.Address())
err := s.GracefulStop()
if err != nil {
w.log.Error().Err(err).Msg("error stopping server")
w.log.Info().Msg("exit with error code 1")
w.Exit(1)
}
}
w.log.Info().Msg("exit with error code 0")
w.Exit(0)
gracefulShutdown(w)
case syscall.SIGINT, syscall.SIGTERM:
w.log.Info().Msg("preparing for hard shutdown, aborting all conns")
for _, s := range w.ss {
w.log.Info().Msgf("fd to %s:%s abruptly closed", s.Network(), s.Address())
err := s.Stop()
if err != nil {
w.log.Error().Err(err).Msg("error stopping server")
if w.gracefulShutdownTimeout == 0 {
hardShutdown(w)
}
gracefulShutdown(w)
}
}
}

// gracefulShutdown calls GracefulStop on every server registered in w.ss and
// then calls w.Exit(0). If any GracefulStop call returns an error, it logs the
// error and calls w.Exit(1) instead.
//
// A background goroutine enforces the deadline taken from
// w.gracefulShutdownTimeout (in seconds): it logs a countdown once per second
// and, once the counter reaches zero, calls Stop on every server and then
// w.Exit(1).
//
// TODO: Ideally this would not call exit() but properly return an error. The
// exit() is problematic (i.e. racy), especially when orchestrating multiple
// reva services from some external runtime (like in the "ocis server" case).
func gracefulShutdown(w *Watcher) {
	w.log.Info().Int("Timeout", w.gracefulShutdownTimeout).Msg("preparing for a graceful shutdown with deadline")
	go func() {
		remaining := w.gracefulShutdownTimeout
		ticker := time.NewTicker(time.Second)
		defer ticker.Stop()
		// The loop body runs once immediately and then once per tick; the
		// goroutine must only leave through the deadline branch below, never
		// with a success exit code.
		// NOTE(review): because the body runs before the first tick, the hard
		// stop triggers after roughly timeout-1 seconds (immediately when the
		// timeout is <= 1) — confirm this is intended.
		for ; true; <-ticker.C {
			w.log.Info().Msgf("shutting down in %d seconds", remaining-1)
			remaining--
			if remaining <= 0 {
				w.log.Info().Msg("deadline reached before draining active conns, hard stopping ...")
				for _, s := range w.ss {
					err := s.Stop()
					if err != nil {
						w.log.Error().Err(err).Msg("error stopping server")
					}
					w.log.Info().Msgf("fd to %s:%s abruptly closed", s.Network(), s.Address())
				}
				w.Exit(1)
			}
		}
	}()
	for _, s := range w.ss {
		w.log.Info().Msgf("fd to %s:%s gracefully closed ", s.Network(), s.Address())
		err := s.GracefulStop()
		if err != nil {
			w.log.Error().Err(err).Msg("error stopping server")
			w.log.Info().Msg("exit with error code 1")

			w.Exit(1)
		}
	}
	w.log.Info().Msg("exit with error code 0")
	w.Exit(0)
}

// hardShutdown calls Stop on every server registered in w.ss, logging any
// error a server returns, and then calls w.Exit(0).
//
// TODO: Ideally this would not call exit() but properly return an error. The
// exit() is problematic (i.e. racy), especially when orchestrating multiple
// reva services from some external runtime (like in the "ocis server" case).
func hardShutdown(w *Watcher) {
	w.log.Info().Msg("preparing for hard shutdown, aborting all conns")
	for _, srv := range w.ss {
		w.log.Info().Msgf("fd to %s:%s abruptly closed", srv.Network(), srv.Address())
		if stopErr := srv.Stop(); stopErr != nil {
			w.log.Error().Err(stopErr).Msg("error stopping server")
		}
	}
	w.Exit(0)
}

func getListenerFile(ln net.Listener) (*os.File, error) {
Expand Down
18 changes: 10 additions & 8 deletions cmd/revad/runtime/runtime.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ type coreConf struct {

// TracingService specifies the service. i.e OpenCensus, OpenTelemetry, OpenTracing...
TracingService string `mapstructure:"tracing_service"`

GracefulShutdownTimeout int `mapstructure:"graceful_shutdown_timeout"`
}

func run(
Expand All @@ -92,7 +94,7 @@ func run(
initCPUCount(coreConf, logger)

servers := initServers(mainConf, logger, tp)
watcher, err := initWatcher(logger, filename)
watcher, err := initWatcher(logger, filename, coreConf.GracefulShutdownTimeout)
if err != nil {
log.Panic(err)
}
Expand All @@ -110,8 +112,8 @@ func initListeners(watcher *grace.Watcher, servers map[string]grace.Server, log
return listeners
}

func initWatcher(log *zerolog.Logger, filename string) (*grace.Watcher, error) {
watcher, err := handlePIDFlag(log, filename)
func initWatcher(log *zerolog.Logger, filename string, gracefulShutdownTimeout int) (*grace.Watcher, error) {
watcher, err := handlePIDFlag(log, filename, gracefulShutdownTimeout)
// TODO(labkode): maybe pidfile can be created later on? like once a server is going to be created?
if err != nil {
log.Error().Err(err).Msg("error creating grace watcher")
Expand Down Expand Up @@ -187,11 +189,11 @@ func initLogger(conf *logConf) *zerolog.Logger {
return log
}

func handlePIDFlag(l *zerolog.Logger, pidFile string) (*grace.Watcher, error) {
var opts []grace.Option
opts = append(opts, grace.WithPIDFile(pidFile))
opts = append(opts, grace.WithLogger(l.With().Str("pkg", "grace").Logger()))
w := grace.NewWatcher(opts...)
func handlePIDFlag(l *zerolog.Logger, pidFile string, gracefulShutdownTimeout int) (*grace.Watcher, error) {
w := grace.NewWatcher(grace.WithPIDFile(pidFile),
grace.WithLogger(l.With().Str("pkg", "grace").Logger()),
grace.WithGracefuleShutdownTimeout(gracefulShutdownTimeout),
)
err := w.WritePID()
if err != nil {
return nil, err
Expand Down