Skip to content

Commit

Permalink
Merge pull request #8802 from owncloud/servers_startup
Browse files Browse the repository at this point in the history
feat: add runners to startup the ocis' services
  • Loading branch information
kobergj authored Apr 29, 2024
2 parents 426df04 + 05f684a commit d8cae78
Show file tree
Hide file tree
Showing 8 changed files with 1,219 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Enhancement: Prepare runners to start the services

The runners will simplify and improve service startup. Their behavior is
more predictable, with clear expectations.

https://github.com/owncloud/ocis/pull/8802
262 changes: 262 additions & 0 deletions ocis-pkg/runner/grouprunner.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,262 @@
package runner

import (
"context"
"sync"
"sync/atomic"
"time"
)

// GroupRunner represents a group of tasks that need to run together.
// The expectation is that all the tasks will run at the same time, and when
// one of them stops, the rest will also stop.
//
// The GroupRunner is intended to be used to run multiple services, which are
// more or less independent from each other, but at the same time it doesn't
// make sense to have any of them stopped while the rest are running.
// Basically, either all of them run, or none of them.
// For example, you can have a GRPC and HTTP servers running, each of them
// providing a piece of functionality, however, if any of them fails, the
// feature provided by them would be incomplete or broken.
//
// The interrupt duration for the group can be set through the
// `WithInterruptDuration` option. If the option isn't supplied, the default
// value (15 secs) will be used.
//
// It's recommended that the timeouts are handled by each runner individually,
// meaning that each runner's timeout should be less than the group runner's
// timeout. This way, we can know which runner timed out.
// If the group timeout is reached, the remaining results will have the
// runner's id as "_unknown_".
//
// Note that, as services, the tasks aren't expected to stop by default.
// This means that, if a task finishes naturally, the rest of the tasks will
// be asked to stop as well.
type GroupRunner struct {
// runners holds the registered *Runner values, keyed by the runner's ID.
runners sync.Map
// runnersCount is the number of runners stored; it is incremented by `Add`
// and determines how many results `Run` and `RunAsync` deliver.
runnersCount int
// isRunning is set once `Run` or `RunAsync` is called; `Add` panics
// afterwards. It is read and written while holding runningMutex.
isRunning bool
// interruptDur is the delay between `Interrupt` notifying the runners and
// the group timeout being signaled through interruptedCh.
interruptDur time.Duration
// interrupted ensures the body of `Interrupt` is executed only once.
interrupted atomic.Bool
// interruptedCh receives interruptDur when the group timeout fires and is
// closed right after, so later receives return immediately.
interruptedCh chan time.Duration
// runningMutex guards isRunning and serializes calls to `Add`.
runningMutex sync.Mutex
}

// NewGroup builds an empty GroupRunner.
//
// The supplied options are applied in order on top of the defaults; when no
// option overrides it, the interrupt duration is
// DefaultGroupInterruptDuration.
func NewGroup(opts ...Option) *GroupRunner {
	cfg := Options{InterruptDuration: DefaultGroupInterruptDuration}
	for _, apply := range opts {
		apply(&cfg)
	}

	// The runner map and the mutex are ready to use as zero values, so only
	// the remaining fields need to be set explicitly.
	group := &GroupRunner{
		interruptDur: cfg.InterruptDuration,
		// capacity 1 lets the timeout callback deliver its value without
		// waiting for a receiver
		interruptedCh: make(chan time.Duration, 1),
	}
	return group
}

// Add registers a runner in the group.
//
// Every runner in the group must have a unique id: adding a runner whose id
// is already registered panics. Adding a runner once the group has been
// started (through `Run` or `RunAsync`) panics as well.
func (gr *GroupRunner) Add(r *Runner) {
	gr.runningMutex.Lock()
	defer gr.runningMutex.Unlock()

	if gr.isRunning {
		panic("Adding a new runner after the group starts is forbidden")
	}

	// LoadOrStore only stores the runner when the id is not taken yet;
	// a "loaded" outcome means another runner already owns this id.
	_, duplicated := gr.runners.LoadOrStore(r.ID, r)
	if duplicated {
		panic("Trying to add a runner with an existing Id in the group")
	}

	// Reaching this point means the runner has been stored, so count it.
	gr.runnersCount++
}

// Run starts every task in the group and blocks until a result has been
// collected for each of them.
//
// The returned slice holds one result per runner, each carrying the runner's
// id so the failing one can be identified. The order of the results is not
// guaranteed: the first element might or might not be the first result that
// was obtained.
//
// The group is interrupted (the stoppers of the runners are called) as soon
// as one of these happens:
//   - the context is marked as done,
//   - any task finishes on its own, or
//   - the group timeout is signaled.
//
// After the interruption the tasks might still take a while to complete. If
// the group timeout is reached while waiting, every missing result is
// replaced with a result whose runner id is "_unknown_" and whose error is a
// group timeout error.
//
// Note that the stopper of a task that finished naturally is NOT expected to
// be called.
func (gr *GroupRunner) Run(ctx context.Context) []*Result {
	// Flip the flag while holding the mutex so `Add` can't observe a stale
	// value and sneak a new runner in while this method is executing.
	// Calling `Run` or `RunAsync` more than once will make the underlying
	// runners panic.
	gr.runningMutex.Lock()
	gr.isRunning = true
	gr.runningMutex.Unlock()

	total := gr.runnersCount
	collected := make([]*Result, 0, total)

	// One buffer slot per runner: delivering a result never blocks a task.
	resultsCh := make(chan *Result, total)
	gr.runners.Range(func(_, item any) bool {
		item.(*Runner).RunAsync(resultsCh)
		return true
	})

	var timeout time.Duration
	// timedOut builds the placeholder result used when the group timeout is
	// reached; we don't track which runners are still pending.
	timedOut := func() *Result {
		return &Result{
			RunnerID:    "_unknown_",
			RunnerError: NewGroupTimeoutError(timeout),
		}
	}

	// Block until the first event: a result, the group timeout or the
	// context being done.
	select {
	case res := <-resultsCh:
		collected = append(collected, res)
	case timeout = <-gr.interruptedCh:
		collected = append(collected, timedOut())
	case <-ctx.Done():
		// nothing to collect yet
	}

	// Ask the remaining runners to stop.
	gr.Interrupt()

	// The runners have been notified, but the rest of the results still
	// need to be gathered. Each iteration adds exactly one result.
	for len(collected) < total {
		select {
		case res := <-resultsCh:
			collected = append(collected, res)
		case value, open := <-gr.interruptedCh:
			// once the channel is closed we keep the last known duration
			if open {
				timeout = value
			}
			collected = append(collected, timedOut())
		}
	}

	// If the group timeout made us bail out early, tasks may still be
	// running and deliver their result later. The buffered channel
	// guarantees they won't block, and the channel is deliberately left
	// open so it can be garbage-collected together with its contents.
	return collected
}

// RunAsync starts every task in the group without waiting for them.
//
// Each result is forwarded to the provided channel as soon as it is
// available. The method returns once all the tasks have been started; the
// forwarding happens in a separate goroutine.
// When the first result (or the group timeout) arrives, the group is
// interrupted so the rest of the tasks are asked to stop.
func (gr *GroupRunner) RunAsync(ch chan<- *Result) {
	// Flip the flag while holding the mutex so `Add` can't observe a stale
	// value and sneak a new runner in while this method is executing.
	// Calling `Run` or `RunAsync` more than once will make the underlying
	// runners panic.
	gr.runningMutex.Lock()
	gr.isRunning = true
	gr.runningMutex.Unlock()

	// The runners report to an internal channel, which lets us see the first
	// result and interrupt the remaining tasks before forwarding anything.
	internalCh := make(chan *Result, gr.runnersCount)
	gr.runners.Range(func(_, item any) bool {
		item.(*Runner).RunAsync(internalCh)
		return true
	})

	go func() {
		var timeout time.Duration
		// We aren't tracking which runners have finished and which are
		// still running, so a timeout is reported with the "_unknown_" id.
		timedOut := func() *Result {
			return &Result{
				RunnerID:    "_unknown_",
				RunnerError: NewGroupTimeoutError(timeout),
			}
		}

		var first *Result
		select {
		case first = <-internalCh:
			// a task delivered its result
		case timeout = <-gr.interruptedCh:
			first = timedOut()
		}
		gr.Interrupt()
		ch <- first

		// One result has been forwarded already; forward the others.
		for pending := gr.runnersCount - 1; pending > 0; pending-- {
			var next *Result
			select {
			case next = <-internalCh:
				// a task delivered its result
			case value, open := <-gr.interruptedCh:
				// "open" tells a real value apart from the zero value
				// obtained from the closed channel
				if open {
					timeout = value
				}
				next = timedOut()
			}
			ch <- next
		}
	}()
}

// Interrupt calls the stopper of ALL the tasks in the group so they can
// finish. It isn't possible to stop just one task, and only the first call
// has any effect.
//
// The stoppers are called immediately but one after another: the second
// stopper isn't called until the first one has returned. This usually isn't
// a problem because the service `Stop` methods either return quickly or run
// asynchronously in another goroutine.
// The stopper of a task that has already finished isn't called.
//
// The group's interrupt timeout starts counting once all the runners have
// been notified. A stopper that takes long to return delays the start of the
// timeout, so stoppers should either return fast or run asynchronously.
func (gr *GroupRunner) Interrupt() {
	// only the first caller gets to notify the runners
	if !gr.interrupted.CompareAndSwap(false, true) {
		return
	}

	gr.runners.Range(func(_, item any) bool {
		runner := item.(*Runner)
		select {
		case <-runner.Finished():
			// Nothing is ever sent through this channel, so receiving
			// means it is closed: the task has finished and there is
			// nothing to interrupt.
		default:
			runner.Interrupt()
		}
		return true
	})

	_ = time.AfterFunc(gr.interruptDur, func() {
		// Timeout reached: publish the duration so the waiting runner can
		// abort, then close the channel so later receives don't block.
		gr.interruptedCh <- gr.interruptDur
		close(gr.interruptedCh)
	})
}
Loading

0 comments on commit d8cae78

Please sign in to comment.