Skip to content

Commit

Permalink
Gracefully exit the program when the lease expired (#2655)
Browse files Browse the repository at this point in the history
This PR lets Trillian proactively listen on the "LeaseKeepAliveResponse" channel returned by KeepAlive in the etcd client. When an interruption of the automatic lease renewal is detected, the program exits gracefully by canceling the context.

Fixes #2654,#2249

Co-authored-by: Simba Peng <[email protected]>
Co-authored-by: Martin Hutchinson <[email protected]>
  • Loading branch information
px3303 and mhutchinson authored Jan 19, 2022
1 parent 472938b commit 241801b
Show file tree
Hide file tree
Showing 3 changed files with 114 additions and 21 deletions.
123 changes: 106 additions & 17 deletions cmd/internal/serverutil/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ package serverutil

import (
"context"
"errors"
"fmt"
"net"
"net/http"
Expand All @@ -28,10 +29,10 @@ import (
"github.com/google/trillian/monitoring"
"github.com/google/trillian/server/admin"
"github.com/google/trillian/server/interceptor"
"github.com/google/trillian/util"
"github.com/google/trillian/util/clock"
"github.com/prometheus/client_golang/prometheus/promhttp"
"go.etcd.io/etcd/client/v3/naming/endpoints"
"golang.org/x/sync/errgroup"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials"
"google.golang.org/grpc/reflection"
Expand Down Expand Up @@ -126,57 +127,101 @@ func (m *Main) Run(ctx context.Context) error {
trillian.RegisterTrillianAdminServer(srv, admin.New(m.Registry, m.AllowedTreeTypes))
reflection.Register(srv)

g, ctx := errgroup.WithContext(ctx)

if endpoint := m.HTTPEndpoint; endpoint != "" {
http.Handle("/metrics", promhttp.Handler())
http.HandleFunc("/healthz", m.healthz)

go func() {
s := &http.Server{
Addr: endpoint,
}

run := func() error {
glog.Infof("HTTP server starting on %v", endpoint)

var err error
// Let http.ListenAndServeTLS handle the error case when only one of the flags is set.
if m.TLSCertFile != "" || m.TLSKeyFile != "" {
err = http.ListenAndServeTLS(endpoint, m.TLSCertFile, m.TLSKeyFile, nil)
err = s.ListenAndServeTLS(m.TLSCertFile, m.TLSKeyFile)
} else {
err = http.ListenAndServe(endpoint, nil)
err = s.ListenAndServe()
}

if err != nil {
glog.Errorf("HTTP server stopped: %v", err)
if errors.Is(err, http.ErrServerClosed) {
return nil
}

err = fmt.Errorf("HTTP server stopped: %v", err)
}

return err
}

shutdown := func() {
glog.Infof("Stopping HTTP server...")
glog.Flush()

// 15 second exit time limit
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
defer cancel()

if err := s.Shutdown(ctx); err != nil {
glog.Errorf("Failed to http server shutdown: %v", err)
}
}()
}

g.Go(func() error {
return srvRun(ctx, run, shutdown)
})
}

glog.Infof("RPC server starting on %v", m.RPCEndpoint)
lis, err := net.Listen("tcp", m.RPCEndpoint)
if err != nil {
return err
}
go util.AwaitSignal(ctx, srv.Stop)

if m.TreeGCEnabled {
go func() {
g.Go(func() error {
glog.Info("Deleted tree GC started")
gc := admin.NewDeletedTreeGC(
m.Registry.AdminStorage,
m.TreeDeleteThreshold,
m.TreeDeleteMinInterval,
m.Registry.MetricFactory)
gc.Run(ctx)
}()
return nil
})
}

if err := srv.Serve(lis); err != nil {
glog.Errorf("RPC server terminated: %v", err)
run := func() error {
if err := srv.Serve(lis); err != nil {
return fmt.Errorf("RPC server terminated: %v", err)
}

return nil
}

glog.Infof("Stopping server, about to exit")
glog.Flush()
shutdown := func() {
glog.Infof("Stopping RPC server...")
glog.Flush()

srv.GracefulStop()
}

g.Go(func() error {
return srvRun(ctx, run, shutdown)
})

// wait for all jobs to exit gracefully
err = g.Wait()

// Give things a few seconds to tidy up
time.Sleep(time.Second * 5)

return nil
return err
}

// newGRPCServer starts a new Trillian gRPC server.
Expand Down Expand Up @@ -207,10 +252,11 @@ func (m *Main) newGRPCServer() (*grpc.Server, error) {
return s, nil
}

// AnnounceSelf announces this binary's presence to etcd. Returns a function that
// AnnounceSelf announces this binary's presence to etcd. This calls the cancel
// function if the keepalive lease with etcd expires. Returns a function that
// should be called on process exit.
// AnnounceSelf does nothing if client is nil.
func AnnounceSelf(ctx context.Context, client *clientv3.Client, etcdService, endpoint string) func() {
func AnnounceSelf(ctx context.Context, client *clientv3.Client, etcdService, endpoint string, cancel func()) func() {
if client == nil {
return func() {}
}
Expand All @@ -220,7 +266,12 @@ func AnnounceSelf(ctx context.Context, client *clientv3.Client, etcdService, end
if err != nil {
glog.Exitf("Failed to get lease from etcd: %v", err)
}
client.KeepAlive(ctx, leaseRsp.ID)

keepAliveRspCh, err := client.KeepAlive(ctx, leaseRsp.ID)
if err != nil {
glog.Exitf("Failed to keep lease alive from etcd: %v", err)
}
go listenKeepAliveRsp(ctx, keepAliveRspCh, cancel)

em, err := endpoints.NewManager(client, etcdService)
if err != nil {
Expand All @@ -238,3 +289,41 @@ func AnnounceSelf(ctx context.Context, client *clientv3.Client, etcdService, end
client.Revoke(ctx, leaseRsp.ID)
}
}

// listenKeepAliveRsp consumes responses from keepAliveRspCh until either ctx
// is done or the channel is closed. Received responses are discarded. A closed
// channel is reported as an unexpected lease expiry and triggers cancel; a done
// ctx simply ends the loop without calling cancel.
func listenKeepAliveRsp(ctx context.Context, keepAliveRspCh <-chan *clientv3.LeaseKeepAliveResponse, cancel func()) {
	for {
		select {
		case _, open := <-keepAliveRspCh:
			if open {
				// Normal keep-alive response: nothing to do, keep listening.
				continue
			}
			glog.Errorf("listenKeepAliveRsp canceled: unexpected lease expired")
			cancel()
			return
		case <-ctx.Done():
			glog.Infof("listenKeepAliveRsp canceled: %v", ctx.Err())
			return
		}
	}
}

// srvRun executes run on its own goroutine and blocks until run has returned.
// If ctx is cancelled before run finishes, shutdown is invoked and srvRun then
// waits for run to return. In both cases the error produced by run is returned.
func srvRun(ctx context.Context, run func() error, shutdown func()) error {
	// Buffered so the goroutine can always deliver its result and exit.
	result := make(chan error, 1)
	go func() {
		result <- run()
	}()

	select {
	case runErr := <-result:
		return runErr
	case <-ctx.Done():
	}

	shutdown()
	// Wait for run to return after the shutdown request.
	return <-result
}
10 changes: 7 additions & 3 deletions cmd/trillian_log_server/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ import (
"github.com/google/trillian/quota/etcd/quotapb"
"github.com/google/trillian/server"
"github.com/google/trillian/storage"
"github.com/google/trillian/util"
"github.com/google/trillian/util/clock"
clientv3 "go.etcd.io/etcd/client/v3"
"google.golang.org/grpc"
Expand Down Expand Up @@ -91,7 +92,9 @@ func main() {
}
}

ctx := context.Background()
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
go util.AwaitSignal(ctx, cancel)

var options []grpc.ServerOption
mf := prometheus.MetricFactory{}
Expand Down Expand Up @@ -124,10 +127,11 @@ func main() {
}

// Announce our endpoints to etcd if so configured.
unannounce := serverutil.AnnounceSelf(ctx, client, *etcdService, *rpcEndpoint)
unannounce := serverutil.AnnounceSelf(ctx, client, *etcdService, *rpcEndpoint, cancel)
defer unannounce()

if *httpEndpoint != "" {
unannounceHTTP := serverutil.AnnounceSelf(ctx, client, *etcdHTTPService, *httpEndpoint)
unannounceHTTP := serverutil.AnnounceSelf(ctx, client, *etcdHTTPService, *httpEndpoint, cancel)
defer unannounceHTTP()
}

Expand Down
2 changes: 1 addition & 1 deletion cmd/trillian_log_signer/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ func main() {
// Start HTTP server (optional)
if *httpEndpoint != "" {
// Announce our endpoint to etcd if so configured.
unannounceHTTP := serverutil.AnnounceSelf(ctx, client, *etcdHTTPService, *httpEndpoint)
unannounceHTTP := serverutil.AnnounceSelf(ctx, client, *etcdHTTPService, *httpEndpoint, cancel)
defer unannounceHTTP()
}

Expand Down

0 comments on commit 241801b

Please sign in to comment.